Add page fetcher and content extractor

author: Max Maischein <corion@corion.net> 2013-07-09 20:06:47 +0200
committer: dakkar <dakkar@thenautilus.net> 2013-07-09 19:30:34 +0100
commit: ceb6750cc5934cc19226243af9d78b0ddecda920 (patch)
tree: 97a1b6f199183fa04af7d409793463d91852e493
parent: readme + licence (diff)
download: feeder-ceb6750cc5934cc19226243af9d78b0ddecda920.tar.gz
feeder-ceb6750cc5934cc19226243af9d78b0ddecda920.tar.bz2
feeder-ceb6750cc5934cc19226243af9d78b0ddecda920.zip
2 files changed, 105 insertions, 0 deletions
diff --git a/lib/Feed/Role/ContentOnly.pm b/lib/Feed/Role/ContentOnly.pm
new file mode 100644
index 0000000..1bc524f
--- /dev/null
+++ b/lib/Feed/Role/ContentOnly.pm
@@ -0,0 +1,52 @@
+package Feed::Role::ContentOnly;
+use Moose::Role;
+use 5.012;
+use namespace::autoclean;
+use HTML::ExtractMain 'extract_main_html';
+use XML::Feed::Content;
+
+=head1 NAME
+
+Feed::Role::ContentOnly - extract and sanitize HTML content from a
+feed body
+
+=head1 SYNOPSIS
+
+    set_feed_class(Feed->with_traits(
+        'Mail',
+        'LinkedPage',
+        'ContentOnly',
+    ));
+
+If your entries carry whole HTML pages (for example after applying the
+C<LinkedPage> trait), this role strips out the sidebars and sanitizes
+the HTML to look sane, keeping only the main content as extracted by
+L<HTML::ExtractMain>.
+
+=cut
+
+# Wrap extract_entries: replace each entry's HTML body with the main content
+# found by extract_main_html; entries without a usable result stay untouched.
+around extract_entries => sub {
+    my ($orig, $self) = @_;
+    $self->log->trace('around extract_entries - begin');
+
+    my $entries = $self->$orig();
+    for my $entry (@{ $entries }) {
+        my $content = $entry->content;
+        my $html    = $content->body;
+        $self->log->trace('finding content');
+        next unless $html;
+        # Keep the original content when nothing could be extracted, rather
+        # than overwriting it with an empty body.
+        my $main = extract_main_html( $html );
+        next unless defined $main and length $main;
+        $entry->content( XML::Feed::Content->wrap({
+            type => 'text/html', body => $main, base => $content->base,
+        }) );
+    }
+    $self->log->trace('around extract_entries - end');
+    return $entries;
+};
+
+1;
diff --git a/lib/Feed/Role/LinkedPage.pm b/lib/Feed/Role/LinkedPage.pm
new file mode 100644
index 0000000..694e4c1
--- /dev/null
+++ b/lib/Feed/Role/LinkedPage.pm
@@ -0,0 +1,53 @@
+package Feed::Role::LinkedPage;
+use Moose::Role;
+use 5.012;
+use namespace::autoclean;
+use XML::Feed::Content;
+
+=head1 NAME
+
+Feed::Role::LinkedPage - fetch the linked resource instead of using
+the RSS summary
+
+=head1 SYNOPSIS
+
+    set_feed_class(Feed->with_traits(
+        'Mail',
+        'LinkedPage',
+        'ContentOnly',
+    ));
+
+If your feed only has summaries or only the first paragraph, maybe you
+want to fetch the complete HTML page for processing.
+
+=cut
+
+# Wrap extract_entries: replace each entry's content with the page its link
+# points to. Entries without a link or a fetchable page keep the feed content.
+around extract_entries => sub {
+    my ($orig, $self) = @_;
+    $self->log->trace('around extract_entries - begin');
+
+    my $entries = $self->$orig();
+    for my $entry (@{ $entries }) {
+        my $url = $entry->link;
+        next unless defined $url and length $url;
+
+        $self->log->trace('around extract_entries - fetching ' . $url);
+        my $res = $self->user_agent->get( $url );
+        my $body = $res->is_success ? $res->decoded_content : undef;
+        next unless $body;
+
+        $entry->content( XML::Feed::Content->wrap({
+            # scalar(): a missing header would otherwise yield an empty list
+            # and shift the remaining key/value pairs
+            type => scalar $res->header( 'Content-Type' ),
+            body => $body,
+            base => $url,
+        }) );
+    }
+    $self->log->trace('around extract_entries - end');
+    return $entries;
+};
+
+1;
author	Max Maischein <corion@corion.net>	2013-07-09 20:06:47 +0200
committer	dakkar <dakkar@thenautilus.net>	2013-07-09 19:30:34 +0100
commit	ceb6750cc5934cc19226243af9d78b0ddecda920 (patch)
tree	97a1b6f199183fa04af7d409793463d91852e493
parent	readme + licence (diff)
download	feeder-ceb6750cc5934cc19226243af9d78b0ddecda920.tar.gz feeder-ceb6750cc5934cc19226243af9d78b0ddecda920.tar.bz2 feeder-ceb6750cc5934cc19226243af9d78b0ddecda920.zip