summaryrefslogtreecommitdiff
path: root/lib/Feed/Role/DeDupe.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Feed/Role/DeDupe.pm')
-rw-r--r--lib/Feed/Role/DeDupe.pm141
1 files changed, 141 insertions, 0 deletions
diff --git a/lib/Feed/Role/DeDupe.pm b/lib/Feed/Role/DeDupe.pm
new file mode 100644
index 0000000..f1bfc71
--- /dev/null
+++ b/lib/Feed/Role/DeDupe.pm
@@ -0,0 +1,141 @@
+package Feed::Role::DeDupe;
+use Moose::Role;
+use 5.016;
+use namespace::autoclean -also => ['_maybe_build_schema'];
+use DBI;
+use Try::Tiny;
+use Encode;
+use Digest::SHA1 'sha1_base64';
+
+# Methods the consuming class has to supply; `process` and `process_entry`
+# are wrapped by the method modifiers further down in this role.
+# NOTE(review): the role also calls $self->log, which is not listed here --
+# presumably another role or the consumer provides it; confirm.
+requires qw(get_feed process process_entry);
+
+# DSN handed to DBI->connect for the database that records seen entries.
+has dupe_dsn => (
+    is       => 'ro',
+    isa      => 'Str',
+    required => 1,
+);
+
+# Database handle, connected on first use by _build_dbh.
+has dbh => (
+    is         => 'ro',
+    lazy_build => 1,
+);
+
+# Prepared statements for the lookup and the insert, each built on first use
+# by _build__find_sth / _build__insert_sth.
+has [qw(_find_sth _insert_sth)] => (
+    is         => 'ro',
+    lazy_build => 1,
+);
+
+# Builder for the lazy `dbh` attribute.
+#
+# Connects to the data source named by `dupe_dsn` (no username or password
+# is passed) with RaiseError on, PrintError off and AutoCommit off, so
+# changes stay pending until an explicit commit. Once connected, the handle
+# is passed to _maybe_build_schema so the seen_items table exists.
+#
+# Returns the connected DBI database handle.
+sub _build_dbh {
+    my $self = shift;
+
+    my %connect_attrs = (
+        RaiseError => 1,
+        PrintError => 0,
+        AutoCommit => 0,
+    );
+    my $handle = DBI->connect($self->dupe_dsn, undef, undef, \%connect_attrs);
+
+    _maybe_build_schema($handle);
+
+    return $handle;
+}
+
+# Makes sure the seen_items table exists on the given database handle.
+#
+# The table is probed with a throwaway SELECT. If the probe succeeds, the
+# transaction it opened is rolled back and nothing else happens. If the
+# probe throws, the table is assumed to be missing and is created and
+# committed.
+#
+# $dbh - DBI handle; the code relies on it raising exceptions on error and
+#        running with AutoCommit off, as _build_dbh configures it.
+#
+# Returns nothing. Errors raised while creating the table propagate to the
+# caller.
+sub _maybe_build_schema {
+    my ($dbh) = @_;
+
+    try {
+        $dbh->selectrow_array(q{SELECT * FROM seen_items LIMIT 1});
+        $dbh->rollback;
+    }
+    catch {
+        # Clear the transaction the failed probe left behind first: some
+        # engines (PostgreSQL for one) reject every further statement in a
+        # transaction once one statement in it has failed, so the CREATE
+        # TABLE below would fail as well without this.
+        $dbh->rollback;
+
+        $dbh->do(<<'SQL');
+CREATE TABLE seen_items (
+ id VARCHAR(255) PRIMARY KEY
+)
+SQL
+        $dbh->commit;
+    };
+
+    return;
+}
+
+# Builder for the lazy `_find_sth` attribute.
+#
+# Returns a statement handle, prepared on this object's dbh, that counts the
+# seen_items rows whose id equals the single bind parameter.
+sub _build__find_sth {
+    my $self = shift;
+
+    my $lookup_sql = <<'SQL';
+SELECT COUNT(*)
+FROM seen_items
+WHERE id=?
+SQL
+
+    return $self->dbh->prepare($lookup_sql);
+}
+
+# Builder for the lazy `_insert_sth` attribute.
+#
+# Returns a statement handle, prepared on this object's dbh, that inserts
+# one seen_items row whose id is the single bind parameter.
+sub _build__insert_sth {
+    my $self = shift;
+
+    my $insert_sql = <<'SQL';
+INSERT INTO seen_items(id)
+VALUES (?)
+SQL
+
+    return $self->dbh->prepare($insert_sql);
+}
+
+# Runs after the consumer's process(): commits whatever is pending on the
+# database handle (for example rows inserted through mark_seen).
+after process => sub {
+    my $self = shift;
+
+    $self->log->trace('after process');
+    $self->dbh->commit;
+};
+
+# Wraps the consumer's process_entry() with the duplicate check.
+#
+# An entry that seen_already() reports as known is skipped without calling
+# the wrapped method. Any other entry is handed to the wrapped method and,
+# once that has returned, recorded through mark_seen(). If the wrapped
+# method throws, the entry is therefore not recorded.
+#
+# $entry - the feed entry being processed.
+#
+# Always returns nothing; the wrapped method's return value is discarded.
+around process_entry => sub {
+    my $orig    = shift;
+    my $self    = shift;
+    my ($entry) = @_;
+
+    $self->log->trace('around process_entry - begin');
+
+    if ($self->seen_already($entry)) {
+        return;
+    }
+
+    $self->log->trace('around process_entry - call original');
+    $self->$orig($entry);
+    $self->mark_seen($entry);
+    $self->log->trace('around process_entry - end');
+
+    return;
+};
+
+# Reports whether an entry has been recorded in seen_items before.
+#
+# $e - the feed entry; its dedupe key is derived with _entry_id().
+#
+# Returns the number of seen_items rows stored under that key, so the value
+# is true when the entry is already known and 0 otherwise.
+sub seen_already {
+    my ($self,$e) = @_;
+
+    $self->log->trace('seen_already - begin');
+
+    my $id = $self->_entry_id($e);
+
+    my $sth = $self->_find_sth;
+    $sth->execute($id);
+    my ($count) = $sth->fetchrow_array;
+
+    # Only one row is ever fetched, so the driver is never told the result
+    # set is exhausted. The handle is cached on the object and would stay
+    # active across the later commit; finish releases it explicitly.
+    $sth->finish;
+
+    $self->log->trace("seen_already - end ($count)");
+
+    return $count;
+}
+
+# Records an entry in seen_items so later runs treat it as a duplicate.
+#
+# $e - the feed entry; its dedupe key is derived with _entry_id().
+#
+# The insert is executed on the shared handle but not committed here. There
+# is no explicit return, so the value of the final trace call is what the
+# caller receives.
+sub mark_seen {
+    my $self  = shift;
+    my $entry = shift;
+
+    $self->log->trace('mark_seen - begin');
+
+    my $key = $self->_entry_id($entry);
+    my $sth = $self->_insert_sth;
+    $sth->execute($key);
+
+    $self->log->trace('mark_seen - end');
+}
+
+# Derives the dedupe key under which an entry is stored in seen_items.
+#
+# The key is "<entry id>-<modified, ISO 8601>-<base64 SHA-1 of the UTF-8
+# encoded body>", so an entry counts as new again when its modification
+# time or its content changes.
+#
+# $e - the feed entry; must provide id, modified and content->body.
+#      NOTE(review): assumes modified returns either undef or an object
+#      with an iso8601 method -- confirm against the entry class in use.
+#
+# Returns the key as a UTF-8 encoded byte string.
+# NOTE(review): the column is declared VARCHAR(255); nothing here bounds
+# the key length, so very long entry ids may not fit -- confirm.
+sub _entry_id {
+    my ($self,$e) = @_;
+
+    # An entry without a body hashes like an empty one (same digest the
+    # undef produced before, minus the uninitialized-value warnings).
+    my $body = $e->content->body() // '';
+    my $content_digest = sha1_base64(encode('utf-8',$body));
+
+    # An entry with no modification time used to abort the run with a
+    # method call on undef; use an empty timestamp slot for it instead.
+    my $modified = $e->modified;
+    my $stamp    = defined $modified ? $modified->iso8601 : '';
+
+    my $id = join '-',$e->id,$stamp,$content_digest;
+    return encode('utf-8',$id);
+}
+
+1;