package Feed::Role::DeDupe;
use Moose::Role;
use 5.012;
use namespace::autoclean -also => ['_maybe_build_schema'];
use DBI;
use Try::Tiny;
use Encode;
use Digest::SHA1 'sha1_base64';
requires 'get_feed','process','process_entry';
# Required constructor argument: the DBI data source name that _build_dbh
# hands to DBI->connect to open the store of already-seen entries.
has dupe_dsn => (
    required => 1,
    isa      => 'Str',
    is       => 'ro',
);
# Database handle for the de-duplication store. It is created on first
# access by _build_dbh (lazy_build wires up that builder name).
has 'dbh' => (
    lazy_build => 1,
    is         => 'ro',
);
# Prepared statement handles, each built on first use by its builder
# (_build__insert_sth and _build__update_sth respectively).
has [qw(_insert_sth _update_sth)] => (
    lazy_build => 1,
    is         => 'ro',
);
# When true (the default), _entry_id appends a SHA-1 digest of the entry
# body to the de-duplication key.
has 'dedupe_use_body' => (
    default => 1,
    isa     => 'Bool',
    is      => 'ro',
);
# When true (the default), _entry_id appends the entry's modified (or,
# failing that, issued) date to the de-duplication key.
has 'dedupe_use_date' => (
    default => 1,
    isa     => 'Bool',
    is      => 'ro',
);
# Builder for the lazy "dbh" attribute.
#
# Connects to the data source named by dupe_dsn (no username or password)
# with RaiseError on, PrintError off and AutoCommit off, passes the new
# handle to _maybe_build_schema, and returns the handle.
sub _build_dbh {
    my $self = shift;

    my %connect_attrs = (
        RaiseError => 1,
        PrintError => 0,
        AutoCommit => 0,
    );
    my $handle = DBI->connect($self->dupe_dsn, undef, undef, {%connect_attrs});

    _maybe_build_schema($handle);

    return $handle;
}
# Make sure the seen_items table exists on the given database handle.
#
# Parameters:
#   $dbh - a connected DBI handle (_build_dbh passes one opened with
#          AutoCommit off and RaiseError on).
# Returns: nothing.
#
# The table is created with CREATE TABLE IF NOT EXISTS instead of probing
# with a SELECT and treating every failure as "table missing". A catch-all
# probe also fires on unrelated errors and then hides the real cause behind
# a failing CREATE TABLE.
sub _maybe_build_schema {
    my ($dbh) = @_;

    $dbh->do(<<'SQL');
CREATE TABLE IF NOT EXISTS seen_items (
id VARCHAR(255) PRIMARY KEY,
last_seen TIMESTAMP
)
SQL

    # The statement above ran inside a transaction when AutoCommit is off,
    # so make it permanent (this also closes the transaction when the table
    # already existed).
    $dbh->commit;

    return;
}
# Builder for the lazy "_insert_sth" attribute.
#
# Prepares, on this object's dbh, the statement that records a new key in
# seen_items with last_seen set to the current time. Returns whatever
# prepare returns.
sub _build__insert_sth {
    my $self = shift;

    my $sql = <<'SQL';
INSERT INTO seen_items(id,last_seen)
VALUES (?,datetime('now'))
SQL

    return $self->dbh->prepare($sql);
}
# Builder for the lazy "_update_sth" attribute.
#
# Prepares, on this object's dbh, the statement that sets last_seen to the
# current time for the row whose id matches the bound key. Returns whatever
# prepare returns.
sub _build__update_sth {
    my $self = shift;

    my $sql = <<'SQL';
UPDATE seen_items
SET last_seen = datetime('now')
WHERE id = ?
SQL

    return $self->dbh->prepare($sql);
}
# Once the consumer's process() has finished, commit the open transaction
# on dbh so the seen_items changes made during the run are persisted.
after 'process' => sub {
    my $self = shift;

    $self->log->trace('after process');
    $self->dbh->commit;
};
# Wrap the consumer's process_entry so that an entry is only handed to the
# original method when seen_already reports it as new. After the original
# returns, the entry is recorded with mark_seen. The wrapped method's
# return value is discarded; this modifier always returns nothing.
around 'process_entry' => sub {
    my ($next, $self, $entry) = @_;

    $self->log->trace('around process_entry - begin');

    if ($self->seen_already($entry)) {
        return;
    }

    $self->log->trace('around process_entry - call original');
    $self->$next($entry);

    $self->mark_seen($entry);
    $self->log->trace('around process_entry - end');

    return;
};
# Report whether an entry has been recorded before, refreshing its
# last_seen timestamp in the same step.
#
# Parameters:
#   $entry - the feed entry; its key is obtained from _entry_id.
# Returns: the numeric result of executing the UPDATE statement for that
#   key (0 when no row matched, since DBI's "0E0" becomes 0 when numified).
sub seen_already {
    my ($self, $entry) = @_;

    $self->log->trace('seen_already - begin');

    my $key     = $self->_entry_id($entry);
    my $result  = $self->_update_sth->execute($key);
    my $matched = $result + 0;

    $self->log->trace("seen_already - end ($matched)");

    return $matched;
}
# Record an entry as seen by executing the INSERT statement with the
# entry's key (as computed by _entry_id).
#
# Parameters:
#   $entry - the feed entry to record.
# Returns: the value of the final trace call, as the sub's last statement.
sub mark_seen {
    my ($self, $entry) = @_;

    $self->log->trace('mark_seen - begin');

    my $key = $self->_entry_id($entry);
    $self->_insert_sth->execute($key);

    return $self->log->trace('mark_seen - end');
}
# Compute the de-duplication key for a feed entry.
#
# Parameters:
#   $e - the entry object. The methods used here are id, link, modified,
#        issued and content (whose body method is then called).
# Returns: the key as a UTF-8 encoded byte string.
#
# The key starts with the entry's id (or its link when id is undefined).
# When dedupe_use_date is true and the entry has a modified or issued date,
# '-' plus that date's iso8601 form is appended. When dedupe_use_body is
# true and the entry has a non-empty body, '-' plus the base64 SHA-1 digest
# of the UTF-8 encoded body is appended.
sub _entry_id {
    my ($self, $e) = @_;

    # An entry with neither id nor link starts from the empty string, so
    # the trace message below never interpolates an undefined value.
    my $id = $e->id // $e->link // '';
    $self->log->trace("_entry_id: $id");

    if ($self->dedupe_use_date) {
        my $date = $e->modified // $e->issued;
        if ($date) {
            $id .= '-' . $date->iso8601;
            $self->log->trace("_entry_id: (with date) $id");
        }
    }

    if ($self->dedupe_use_body) {
        my $body = $e->content->body;

        # Test for defined-and-non-empty rather than truth: a body that is
        # exactly "0" is false in Perl but is still content that must be
        # part of the key.
        if (defined $body && length $body) {
            my $content_digest = sha1_base64(encode('utf-8', $body));
            $id .= '-' . $content_digest;
            $self->log->trace("_entry_id: (with content) $id");
        }
    }

    return encode('utf-8', $id);
}
1;