From 52a43ca1010cdff1d4801d517115df7cdb8a56e5 Mon Sep 17 00:00:00 2001 From: Gianni Ceccarelli Date: Thu, 13 Dec 2012 11:21:42 +0000 Subject: dedupe: save last seen timestamp also, allow ignoring date and/or body --- lib/Feed/Role/DeDupe.pm | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/Feed/Role/DeDupe.pm b/lib/Feed/Role/DeDupe.pm index 641c898..164e3ec 100644 --- a/lib/Feed/Role/DeDupe.pm +++ b/lib/Feed/Role/DeDupe.pm @@ -20,11 +20,23 @@ has dbh => ( lazy_build => 1, ); -has ['_find_sth','_insert_sth'] => ( +has ['_insert_sth','_update_sth'] => ( is => 'ro', lazy_build => 1, ); +has dedupe_use_body => ( + is => 'ro', + isa => 'Bool', + default => 1, +); + +has dedupe_use_date => ( + is => 'ro', + isa => 'Bool', + default => 1, +); + sub _build_dbh { my ($self) = @_; @@ -57,22 +69,22 @@ $dbh->commit; return; } -sub _build__find_sth { +sub _build__insert_sth { my ($self) = @_; return $self->dbh->prepare(<<'SQL'); -SELECT COUNT(*) -FROM seen_items -WHERE id=? +INSERT INTO seen_items(id,last_seen) +VALUES (?,datetime('now')) SQL } -sub _build__insert_sth { +sub _build__update_sth { my ($self) = @_; return $self->dbh->prepare(<<'SQL'); -INSERT INTO seen_items(id) -VALUES (?) +UPDATE seen_items +SET last_seen = datetime('now') +WHERE id = ? SQL } @@ -109,8 +121,7 @@ sub seen_already { my $id = $self->_entry_id($e); - $self->_find_sth->execute($id); - my ($count) = $self->_find_sth->fetchrow_array; + my $count = $self->_update_sth->execute($id); $self->log->trace("seen_already - end ($count)"); @@ -134,12 +145,16 @@ sub _entry_id { my $id = $e->id; - my $date = $e->modified // $e->issued; - if ($date) { $id .= '-' . $date->iso8601 } + if ($self->dedupe_use_date) { + my $date = $e->modified // $e->issued; + if ($date) { $id .= '-' . $date->iso8601 } + } - my $body = $e->content->body; - my $content_digest = sha1_base64(encode('utf-8',$body)); - $id .= '-' . 
$content_digest; + if ($self->dedupe_use_body) { + my $body = $e->content->body; + my $content_digest = sha1_base64(encode('utf-8',$body)); + $id .= '-' . $content_digest; + } return encode('utf-8',$id); } -- cgit v1.2.3