diff options
Diffstat (limited to 'lib/Feed')
-rw-r--r-- | lib/Feed/DeDupe.pm | 181 | ||||
-rw-r--r-- | lib/Feed/FixDateTime.pm | 4 | ||||
-rw-r--r-- | lib/Feed/Printer.pm | 5 |
3 files changed, 190 insertions, 0 deletions
diff --git a/lib/Feed/DeDupe.pm b/lib/Feed/DeDupe.pm
new file mode 100644
index 0000000..f617154
--- /dev/null
+++ b/lib/Feed/DeDupe.pm
@@ -0,0 +1,181 @@
+package Feed::DeDupe;
+use Moose::Role;
+use 5.016;
+use namespace::autoclean -also => ['_maybe_build_schema'];
+use DBI;
+use Try::Tiny;
+use Encode;
+use Digest::SHA1 'sha1_base64';
+
+# Role that skips feed entries which were already processed.
+#
+# process_entry is wrapped: an identifier is derived from each entry (see
+# _entry_id) and looked up in the seen_items table of the database named
+# by dupe_dsn.  Known entries are skipped; all others go to the original
+# process_entry and are recorded afterwards.  The recorded identifiers
+# are committed after process has run.
+#
+# NOTE(review): the code below calls $self->log, which is not listed in
+# 'requires' -- presumably the consuming class provides it; confirm.
+
+requires 'get_feed','process','process_entry';
+
+# DSN of the database that holds the seen_items table.
+has 'dupe_dsn' => (
+    is => 'ro',
+    isa => 'Str',
+    required => 1,
+);
+
+# Database handle; connected on first use by _build_dbh.
+has dbh => (
+    is => 'ro',
+    lazy_build => 1,
+);
+
+# Prepared statements; created on first use by the _build_* subs below.
+has ['_find_sth','_insert_sth'] => (
+    is => 'ro',
+    lazy_build => 1,
+);
+
+# Connects to dupe_dsn with RaiseError on and AutoCommit off, makes sure
+# the seen_items table exists, and returns the handle.
+sub _build_dbh {
+    my ($self) = @_;
+
+    my $dbh = DBI->connect($self->dupe_dsn,undef,undef,{
+        RaiseError => 1,
+        PrintError => 0,
+        AutoCommit => 0,
+    });
+
+    _maybe_build_schema($dbh);
+
+    return $dbh;
+}
+
+# Creates the seen_items table unless it can already be queried.
+# Plain function, not a method: takes the database handle.
+sub _maybe_build_schema {
+    my ($dbh) = @_;
+
+    try {
+        $dbh->selectrow_array(q{SELECT * FROM seen_items LIMIT 1});
+        $dbh->rollback;
+    } catch {
+        # AutoCommit is off, so the failed probe may have left a
+        # transaction open in an error state; some database engines
+        # refuse further statements until it has been rolled back.
+        $dbh->rollback;
+
+        $dbh->do(<<'SQL');
+CREATE TABLE seen_items (
+    id VARCHAR(255) PRIMARY KEY
+)
+SQL
+        $dbh->commit;
+    };
+
+    return;
+}
+
+# Prepares the lookup that counts rows with a given id.
+sub _build__find_sth {
+    my ($self) = @_;
+
+    return $self->dbh->prepare(<<'SQL');
+SELECT COUNT(*)
+FROM seen_items
+WHERE id=?
+SQL
+}
+
+# Prepares the insert that records a new id.
+sub _build__insert_sth {
+    my ($self) = @_;
+
+    return $self->dbh->prepare(<<'SQL');
+INSERT INTO seen_items(id)
+VALUES (?)
+SQL
+}
+
+# Commit the ids recorded by mark_seen once the whole feed is processed.
+after process => sub {
+    my ($self) = @_;
+
+    $self->log->trace('after process');
+
+    $self->dbh->commit;
+};
+
+# Skip entries that were seen before; record the others after the
+# original process_entry has run.  Always returns nothing.
+around process_entry => sub {
+    my ($orig,$self,$entry) = @_;
+
+    $self->log->trace('around process_entry - begin');
+
+    return if $self->seen_already($entry);
+
+    $self->log->trace('around process_entry - call original');
+
+    $self->$orig($entry);
+
+    $self->mark_seen($entry);
+
+    $self->log->trace('around process_entry - end');
+
+    return;
+};
+
+# Returns the number of seen_items rows matching the id of entry $e
+# (true when the entry was recorded before).
+sub seen_already {
+    my ($self,$e) = @_;
+
+    $self->log->trace('seen_already - begin');
+
+    my $id = $self->_entry_id($e);
+
+    my $sth = $self->_find_sth;
+    $sth->execute($id);
+    my ($count) = $sth->fetchrow_array;
+
+    # Only this one row is wanted; release the cursor so the cached
+    # handle is not still active when the transaction is committed.
+    $sth->finish;
+
+    $self->log->trace("seen_already - end ($count)");
+
+    return $count;
+}
+
+# Records the id of entry $e in seen_items (committed in 'after process').
+sub mark_seen {
+    my ($self,$e) = @_;
+
+    $self->log->trace('mark_seen - begin');
+
+    my $id = $self->_entry_id($e);
+
+    $self->_insert_sth->execute($id);
+
+    $self->log->trace('mark_seen - end');
+}
+
+# Builds the identifier stored in seen_items for entry $e: the entry id,
+# its modified->iso8601 value and a SHA-1 digest of the body, joined
+# with '-' and encoded as UTF-8.
+sub _entry_id {
+    my ($self,$e) = @_;
+
+    # An entry without a body is digested as the empty string.
+    my $body = $e->content->body // '';
+    my $content_digest = sha1_base64(encode('utf-8',$body));
+    my $id = join '-',$e->id,$e->modified->iso8601,$content_digest;
+    return encode('utf-8',$id);
+}
+
+1;
diff --git a/lib/Feed/FixDateTime.pm b/lib/Feed/FixDateTime.pm
index f9ebe73..6034931 100644
--- a/lib/Feed/FixDateTime.pm
+++ b/lib/Feed/FixDateTime.pm
@@ -9,6 +9,8 @@ requires 'process_entry';
 before process_entry => sub {
     my ($self,$entry) = @_;
 
+    $self->log->trace('before process_entry - begin');
+
     for my $f ('issued','modified') {
         my $date = $entry->$f;
         if (!$date) {
@@ -20,6 +22,8 @@ before process_entry => sub {
 
         $entry->$f($date);
     }
+
+    $self->log->trace('before process_entry - end');
 };
 
 1;
diff --git a/lib/Feed/Printer.pm b/lib/Feed/Printer.pm
index 5743883..7fd2f65 100644
--- a/lib/Feed/Printer.pm
+++ b/lib/Feed/Printer.pm
@@ -14,10 +14,15 @@ before process => sub {
 sub process_entry {
     my ($self,$entry) = @_;
 
+    $self->log->trace('process_entry - begin');
+
     for my $f (qw(id title link issued modified)) {
         say " $f:",$entry->$f//'<undef>';
     }
+    say $entry->content->body;
     say '';
+
+    $self->log->trace('process_entry - end');
 }
 
 1;