package Feed::Role::DeDupe;

# Moose role that suppresses re-processing of feed entries. Every entry is
# reduced to an identifier string (see _entry_id) which is remembered in a
# "seen_items" table reached through the DSN given in "dupe_dsn".
#
# The consuming class must supply get_feed, process and process_entry.
# NOTE(review): the code below also calls $self->log, which is not listed in
# "requires" -- presumably supplied by the consumer or another role; confirm.

use Moose::Role;
use 5.012;
use namespace::autoclean -also => [qw(_maybe_build_schema)];
use DBI;
use Try::Tiny;
use Encode;
use Digest::SHA1 qw(sha1_base64);

requires qw(get_feed process process_entry);

# DSN handed verbatim to DBI->connect (no username / password is passed).
has 'dupe_dsn' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# Database handle, connected lazily on first use by _build_dbh.
has dbh => (
    is         => 'ro',
    lazy_build => 1,
);

# Prepared statements, built lazily by _build__insert_sth / _build__update_sth.
has [qw(_insert_sth _update_sth)] => (
    is         => 'ro',
    lazy_build => 1,
);

# When true, a digest of the entry body becomes part of the entry identifier.
has dedupe_use_body => (
    is      => 'ro',
    isa     => 'Bool',
    default => 1,
);

# When true, the entry date becomes part of the entry identifier.
has dedupe_use_date => (
    is      => 'ro',
    isa     => 'Bool',
    default => 1,
);

# Builder for "dbh": connects with RaiseError on and AutoCommit off, then
# makes sure the seen_items table is present before handing the handle back.
sub _build_dbh {
    my $self = shift;

    my $handle = DBI->connect(
        $self->dupe_dsn,
        undef,
        undef,
        {
            RaiseError => 1,
            PrintError => 0,
            AutoCommit => 0,
        },
    );

    _maybe_build_schema($handle);

    return $handle;
}

# Plain function (not a method; cleaned from the namespace via -also above).
# Probes the seen_items table; if the probe throws for any reason, tries to
# create the table and commits. A successful probe is rolled back so that no
# transaction is left open by it.
sub _maybe_build_schema {
    my ($handle) = @_;

    my $create_sql = <<'SQL';
CREATE TABLE seen_items (
 id VARCHAR(255) PRIMARY KEY,
 last_seen TIMESTAMP
)
SQL

    try {
        $handle->selectrow_array(q{SELECT * FROM seen_items LIMIT 1});
        $handle->rollback;
    }
    catch {
        $handle->do($create_sql);
        $handle->commit;
    };

    return;
}

# Builder for "_insert_sth": statement that records a new identifier.
# NOTE(review): datetime('now') looks SQLite-specific -- confirm which
# backends dupe_dsn is expected to point at.
sub _build__insert_sth {
    my $self = shift;

    my $insert_sql = <<'SQL';
INSERT INTO seen_items(id,last_seen)
VALUES (?,datetime('now'))
SQL

    return $self->dbh->prepare($insert_sql);
}

# Builder for "_update_sth": statement that refreshes last_seen for an
# identifier that is already stored.
sub _build__update_sth {
    my $self = shift;

    my $update_sql = <<'SQL';
UPDATE seen_items
SET last_seen = datetime('now')
WHERE id = ?
SQL

    return $self->dbh->prepare($update_sql);
}

# Once the consumer's process() has returned, commit whatever was written
# to seen_items during the run.
after process => sub {
    my $self = shift;

    $self->log->trace('after process');
    $self->dbh->commit;
};

# Wraps the consumer's process_entry: entries already recorded are skipped;
# anything else is passed to the original method and then recorded. The
# wrapped method's return value is not propagated.
around process_entry => sub {
    my $orig = shift;
    my $self = shift;
    my ($entry) = @_;

    $self->log->trace('around process_entry - begin');

    if ($self->seen_already($entry)) {
        return;
    }

    $self->log->trace('around process_entry - call original');
    $self->$orig($entry);

    # Recorded only after the original returned normally.
    $self->mark_seen($entry);

    $self->log->trace('around process_entry - end');
    return;
};

# Returns the number of seen_items rows matching the entry (0 when unknown).
# The check is done with the UPDATE statement, so a hit also refreshes the
# row's last_seen timestamp.
sub seen_already {
    my ($self, $entry) = @_;

    $self->log->trace('seen_already - begin');

    my $key      = $self->_entry_id($entry);
    my $affected = $self->_update_sth->execute($key);

    # DBI reports "no rows affected" as the true-but-zero string "0E0";
    # adding zero turns the result into a plain number.
    my $count = $affected + 0;

    $self->log->trace("seen_already - end ($count)");
    return $count;
}

# Inserts the entry's identifier into seen_items. There is no explicit
# return; the value of the final trace call falls through to the caller.
sub mark_seen {
    my ($self, $entry) = @_;

    $self->log->trace('mark_seen - begin');

    my $key = $self->_entry_id($entry);
    $self->_insert_sth->execute($key);

    $self->log->trace('mark_seen - end');
}

# Builds the identifier used as the seen_items key: the entry's unified_id,
# optionally followed by "-<iso8601 date>" and "-<base64 SHA-1 of the UTF-8
# encoded body>", depending on dedupe_use_date / dedupe_use_body and on the
# entry actually having a true date / body. The result is UTF-8 encoded.
sub _entry_id {
    my ($self, $entry) = @_;

    my $key = $entry->unified_id;
    $self->log->trace("_entry_id: $key");

    if ($self->dedupe_use_date) {
        if (my $stamp = $entry->unified_date) {
            $key .= q{-} . $stamp->iso8601;
            $self->log->trace("_entry_id: (with date) $key");
        }
    }

    if ($self->dedupe_use_body) {
        if (my $text = $entry->unified_content->body) {
            my $body_digest = sha1_base64(encode('utf-8', $text));
            $key .= q{-} . $body_digest;
            $self->log->trace("_entry_id: (with content) $key");
        }
    }

    return encode('utf-8', $key);
}

1;