summaryrefslogtreecommitdiff
path: root/lib/Feed
diff options
context:
space:
mode:
authordakkar <dakkar@thenautilus.net>2012-12-09 13:45:17 +0000
committerdakkar <dakkar@thenautilus.net>2012-12-09 13:45:17 +0000
commitf4c31024fc3dd0210ce8821e8b63d9f411828c3f (patch)
treee24f34fdedf53ee7ede6c2e97326a1ce396fc0f0 /lib/Feed
parentfix datetime (diff)
downloadfeeder-f4c31024fc3dd0210ce8821e8b63d9f411828c3f.tar.gz
feeder-f4c31024fc3dd0210ce8821e8b63d9f411828c3f.tar.bz2
feeder-f4c31024fc3dd0210ce8821e8b63d9f411828c3f.zip
de-duping
Diffstat (limited to 'lib/Feed')
-rw-r--r--lib/Feed/DeDupe.pm141
-rw-r--r--lib/Feed/FixDateTime.pm4
-rw-r--r--lib/Feed/Printer.pm5
3 files changed, 150 insertions, 0 deletions
diff --git a/lib/Feed/DeDupe.pm b/lib/Feed/DeDupe.pm
new file mode 100644
index 0000000..f617154
--- /dev/null
+++ b/lib/Feed/DeDupe.pm
@@ -0,0 +1,141 @@
+package Feed::DeDupe;
+use Moose::Role;
+use 5.016;
+use namespace::autoclean -also => ['_maybe_build_schema'];
+use DBI;
+use Try::Tiny;
+use Encode;
+use Digest::SHA1 'sha1_base64';
+
+requires 'get_feed','process','process_entry';
+
+# DSN of the database that records which entries have already been
+# seen; the consumer must supply it at construction time.
+has dupe_dsn => (
+    is       => 'ro',
+    isa      => 'Str',
+    required => 1,
+);
+
+# Database handle, created on first use by _build_dbh.
+has dbh => (
+    is         => 'ro',
+    lazy_build => 1,
+);
+
+# Prepared statements used to look up and to record entry ids, created
+# on first use by _build__find_sth and _build__insert_sth.
+has [qw(_find_sth _insert_sth)] => (
+    is         => 'ro',
+    lazy_build => 1,
+);
+
+# Builder for the dbh attribute: connects to dupe_dsn with errors raised
+# as exceptions and AutoCommit switched off, then makes sure the
+# seen_items table exists before handing the handle back.
+sub _build_dbh {
+    my $self = shift;
+
+    my %connect_attrs = (
+        RaiseError => 1,
+        PrintError => 0,
+        AutoCommit => 0,
+    );
+    my $handle = DBI->connect($self->dupe_dsn, undef, undef, \%connect_attrs);
+
+    _maybe_build_schema($handle);
+
+    return $handle;
+}
+
+# Make sure the seen_items table exists on $dbh, creating it if needed.
+#
+# The table is probed with a cheap SELECT; any failure of that probe is
+# taken to mean the table is missing, in which case it is created and
+# the creation committed. Relies on the handle raising exceptions and
+# running with AutoCommit off, which is how _build_dbh connects.
+# Returns nothing.
+sub _maybe_build_schema {
+    my ($dbh) = @_;
+
+    try {
+        $dbh->selectrow_array(q{SELECT * FROM seen_items LIMIT 1});
+        # the probe opened a transaction; close it again
+        $dbh->rollback;
+    } catch {
+        # Some engines (PostgreSQL, for one) reject every further
+        # statement in a transaction once one has failed, so discard the
+        # failed probe's transaction before issuing the CREATE TABLE.
+        $dbh->rollback;
+        $dbh->do(<<'SQL');
+CREATE TABLE seen_items (
+ id VARCHAR(255) PRIMARY KEY
+)
+SQL
+        $dbh->commit;
+    };
+
+    return;
+}
+
+# Builder for _find_sth: prepares the query that counts the rows of
+# seen_items carrying a given id.
+sub _build__find_sth {
+    my $self = shift;
+
+    my $sql = <<'SQL';
+SELECT COUNT(*)
+FROM seen_items
+WHERE id=?
+SQL
+
+    return $self->dbh->prepare($sql);
+}
+
+# Builder for _insert_sth: prepares the statement that adds one id to
+# seen_items.
+sub _build__insert_sth {
+    my $self = shift;
+
+    my $sql = <<'SQL';
+INSERT INTO seen_items(id)
+VALUES (?)
+SQL
+
+    return $self->dbh->prepare($sql);
+}
+
+# Once the consumer's process() has run, commit the open transaction so
+# the ids inserted by mark_seen are kept (the handle runs with
+# AutoCommit off).
+after process => sub {
+    my $self = shift;
+
+    $self->log->trace('after process');
+    $self->dbh->commit;
+};
+
+# Wrap process_entry: an entry that seen_already reports as known is
+# skipped; any other entry is passed to the original method and then
+# recorded with mark_seen. The wrapper itself returns nothing.
+around process_entry => sub {
+    my $orig  = shift;
+    my $self  = shift;
+    my $entry = shift;
+
+    $self->log->trace('around process_entry - begin');
+
+    if ($self->seen_already($entry)) {
+        return;
+    }
+
+    $self->log->trace('around process_entry - call original');
+    $self->$orig($entry);
+
+    # only reached when the original method did not die
+    $self->mark_seen($entry);
+    $self->log->trace('around process_entry - end');
+
+    return;
+};
+
+# Tell whether entry $e has been recorded before.
+#
+# Computes the entry's key with _entry_id and returns the number of
+# seen_items rows carrying that key: 0 (false) when the entry is new,
+# non-zero (true) when it is already known.
+sub seen_already {
+    my ($self,$e) = @_;
+
+    $self->log->trace('seen_already - begin');
+
+    my $id = $self->_entry_id($e);
+
+    my $sth = $self->_find_sth;
+    $sth->execute($id);
+    my ($count) = $sth->fetchrow_array;
+
+    # Only the single COUNT(*) row is fetched, so the driver never sees
+    # the end of the result set and the cached handle would stay active
+    # (possibly through the later commit); release it explicitly.
+    $sth->finish;
+
+    $self->log->trace("seen_already - end ($count)");
+
+    return $count;
+}
+
+# Record entry $entry as seen by inserting its key (see _entry_id) into
+# seen_items. The insert is not committed here.
+sub mark_seen {
+    my ($self, $entry) = @_;
+
+    $self->log->trace('mark_seen - begin');
+
+    my $key    = $self->_entry_id($entry);
+    my $insert = $self->_insert_sth;
+    $insert->execute($key);
+
+    $self->log->trace('mark_seen - end');
+}
+
+# Build the de-duplication key for $entry: its id, the result of
+# modified->iso8601 and a base64 SHA-1 digest of its UTF-8 encoded
+# content body, joined with '-'. The key is returned UTF-8 encoded.
+sub _entry_id {
+    my ($self, $entry) = @_;
+
+    # digest the body so that a content change yields a new key even
+    # when id and modification time stay the same
+    my $text   = $entry->content->body;
+    my $digest = sha1_base64(encode('utf-8', $text));
+
+    my @parts = ($entry->id, $entry->modified->iso8601, $digest);
+
+    return encode('utf-8', join('-', @parts));
+}
+
+1;
diff --git a/lib/Feed/FixDateTime.pm b/lib/Feed/FixDateTime.pm
index f9ebe73..6034931 100644
--- a/lib/Feed/FixDateTime.pm
+++ b/lib/Feed/FixDateTime.pm
@@ -9,6 +9,8 @@ requires 'process_entry';
before process_entry => sub {
my ($self,$entry) = @_;
+ $self->log->trace('before process_entry - begin');
+
for my $f ('issued','modified') {
my $date = $entry->$f;
if (!$date) {
@@ -20,6 +22,8 @@ before process_entry => sub {
$entry->$f($date);
}
+
+ $self->log->trace('before process_entry - end');
};
1;
diff --git a/lib/Feed/Printer.pm b/lib/Feed/Printer.pm
index 5743883..7fd2f65 100644
--- a/lib/Feed/Printer.pm
+++ b/lib/Feed/Printer.pm
@@ -14,10 +14,15 @@ before process => sub {
# Print one entry: the id, title, link, issued and modified fields (with
# '<undef>' standing in for any that are undefined), then the content
# body, then an empty line. Entry and exit are traced through the log.
sub process_entry {
my ($self,$entry) = @_;
+ $self->log->trace('process_entry - begin');
+
for my $f (qw(id title link issued modified)) {
say " $f:",$entry->$f//'<undef>';
}
+ say $entry->content->body;
say '';
+
+ $self->log->trace('process_entry - end');
}
1;