summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
author: Gianni Ceccarelli <gianni.ceccarelli@net-a-porter.com> 2012-12-13 11:21:42 +0000
committer: Gianni Ceccarelli <gianni.ceccarelli@net-a-porter.com> 2012-12-13 11:21:42 +0000
commit: 52a43ca1010cdff1d4801d517115df7cdb8a56e5 (patch)
tree: 201311bde20776a536b2601d0d02d53a92dd4c13 /lib
parent: fix mail header encoding (diff)
download: feeder-52a43ca1010cdff1d4801d517115df7cdb8a56e5.tar.gz
feeder-52a43ca1010cdff1d4801d517115df7cdb8a56e5.tar.bz2
feeder-52a43ca1010cdff1d4801d517115df7cdb8a56e5.zip
dedupe: save last seen timestamp
also, allow ignoring date and/or body
Diffstat (limited to 'lib')
-rw-r--r-- lib/Feed/Role/DeDupe.pm 45
1 files changed, 30 insertions, 15 deletions
diff --git a/lib/Feed/Role/DeDupe.pm b/lib/Feed/Role/DeDupe.pm
index 641c898..164e3ec 100644
--- a/lib/Feed/Role/DeDupe.pm
+++ b/lib/Feed/Role/DeDupe.pm
@@ -20,11 +20,23 @@ has dbh => (
lazy_build => 1,
);
-has ['_find_sth','_insert_sth'] => (
+has ['_insert_sth','_update_sth'] => (
is => 'ro',
lazy_build => 1,
);
+has dedupe_use_body => (
+ is => 'ro',
+ isa => 'Bool',
+ default => 1,
+);
+
+has dedupe_use_date => (
+ is => 'ro',
+ isa => 'Bool',
+ default => 1,
+);
+
sub _build_dbh {
my ($self) = @_;
@@ -57,22 +69,22 @@ $dbh->commit;
return;
}
-sub _build__find_sth {
+sub _build__insert_sth {
my ($self) = @_;
return $self->dbh->prepare(<<'SQL');
-SELECT COUNT(*)
-FROM seen_items
-WHERE id=?
+INSERT INTO seen_items(id,last_seen)
+VALUES (?,datetime('now'))
SQL
}
-sub _build__insert_sth {
+sub _build__update_sth {
my ($self) = @_;
return $self->dbh->prepare(<<'SQL');
-INSERT INTO seen_items(id)
-VALUES (?)
+UPDATE seen_items
+SET last_seen = datetime('now')
+WHERE id = ?
SQL
}
@@ -109,8 +121,7 @@ sub seen_already {
my $id = $self->_entry_id($e);
- $self->_find_sth->execute($id);
- my ($count) = $self->_find_sth->fetchrow_array;
+ my $count = $self->_update_sth->execute($id);
$self->log->trace("seen_already - end ($count)");
@@ -134,12 +145,16 @@ sub _entry_id {
my $id = $e->id;
- my $date = $e->modified // $e->issued;
- if ($date) { $id .= '-' . $date->iso8601 }
+ if ($self->dedupe_use_date) {
+ my $date = $e->modified // $e->issued;
+ if ($date) { $id .= '-' . $date->iso8601 }
+ }
- my $body = $e->content->body;
- my $content_digest = sha1_base64(encode('utf-8',$body));
- $id .= '-' . $content_digest;
+ if ($self->dedupe_use_body) {
+ my $body = $e->content->body;
+ my $content_digest = sha1_base64(encode('utf-8',$body));
+ $id .= '-' . $content_digest;
+ }
return encode('utf-8',$id);
}