From ceb6750cc5934cc19226243af9d78b0ddecda920 Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Tue, 9 Jul 2013 20:06:47 +0200 Subject: Add page fetcher and content extractor --- lib/Feed/Role/ContentOnly.pm | 52 +++++++++++++++++++++++++++++++++++++++++++ lib/Feed/Role/LinkedPage.pm | 53 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 lib/Feed/Role/ContentOnly.pm create mode 100644 lib/Feed/Role/LinkedPage.pm diff --git a/lib/Feed/Role/ContentOnly.pm b/lib/Feed/Role/ContentOnly.pm new file mode 100644 index 0000000..1bc524f --- /dev/null +++ b/lib/Feed/Role/ContentOnly.pm @@ -0,0 +1,52 @@ +package Feed::Role::ContentOnly; +use Moose::Role; +use 5.012; +use namespace::autoclean; +use HTML::ExtractMain 'extract_main_html'; +use XML::Feed::Content; + +=head1 NAME + +Feed::Role::ContentOnly - extract and sanitize HTML content from a +feed body + +=head1 SYNOPSIS + + set_feed_class(Feed->with_traits( + 'Mail', + 'LinkedPage', + 'ContentOnly', + )); + +If your feed only has summaries or only the first paragraph, maybe you +want to fetch the complete HTML page for processing. This strips out +the sidebars and sanitizes the HTML to look sane by using +L<HTML::ExtractMain>.
+
+=cut
+
+# Wrap extract_entries: reduce each entry's HTML body to its main content.
+#
+# Arguments: any arguments are forwarded unchanged to the wrapped method.
+# Returns:   the array reference produced by the wrapped method; its entries
+#            are modified in place.
+#
+# Entries with an empty body, and entries for which extract_main_html()
+# yields nothing, keep their original content.
+around extract_entries => sub {
+    my ($orig, $self, @args) = @_;
+
+    $self->log->trace('around extract_entries - begin');
+
+    my $entries = $self->$orig(@args);
+
+    for my $entry (@{ $entries }) {
+        my $html = $entry->content->body;
+        $self->log->trace('finding content');
+        next unless $html;
+
+        # Assign to a scalar first: called directly inside the hash
+        # constructor the function would run in list context, where an empty
+        # return value would shift the remaining key/value pairs.
+        my $main_html = extract_main_html( $html );
+        next unless defined $main_html and length $main_html;
+
+        $entry->content( XML::Feed::Content->wrap({
+            type => 'text/html',
+            body => $main_html,
+            base => scalar $entry->content->base,
+        }));
+    }
+
+    $self->log->trace('around extract_entries - end');
+    return $entries;
+};
+
+1;
diff --git a/lib/Feed/Role/LinkedPage.pm b/lib/Feed/Role/LinkedPage.pm
new file mode 100644
index 0000000..694e4c1
--- /dev/null
+++ b/lib/Feed/Role/LinkedPage.pm
@@ -0,0 +1,71 @@
+package Feed::Role::LinkedPage;
+use Moose::Role;
+use 5.012;
+use namespace::autoclean;
+use XML::Feed::Content;
+
+=head1 NAME
+
+Feed::Role::LinkedPage - fetch the linked resource instead of using
+the RSS summary
+
+=head1 SYNOPSIS
+
+  set_feed_class(Feed->with_traits(
+    'Mail',
+    'LinkedPage',
+    'ContentOnly',
+  ));
+
+If your feed only has summaries or only the first paragraph, maybe you
+want to fetch the complete HTML page for processing.
+
+=cut
+
+# Wrap extract_entries: replace each entry's content with the page its link
+# points to.
+#
+# Arguments: any arguments are forwarded unchanged to the wrapped method.
+# Returns:   the array reference produced by the wrapped method; its entries
+#            are modified in place.
+#
+# Entries without a link, failed requests and empty response bodies leave
+# the entry's original feed content untouched. Every entry is processed;
+# the result list is not truncated.
+around extract_entries => sub {
+    my ($orig, $self, @args) = @_;
+
+    $self->log->trace('around extract_entries - begin');
+
+    # Fetch the linked HTML page from the feed instead of
+    # using the content of the feed itself
+    my $entries = $self->$orig(@args);
+
+    for my $entry (@{ $entries }) {
+        # Read the link once, in scalar context, so it can be logged, fetched
+        # and reused as the base URL below.
+        my $link = $entry->link;
+        next unless defined $link and length $link;
+
+        $self->log->trace('around extract_entries - fetching ' . $link);
+        my $res = $self->user_agent->get( $link );
+        next unless $res->is_success;
+
+        # Decode the body only once; it is needed for the check and the value.
+        my $body = $res->decoded_content;
+        next unless $body;
+
+        $entry->content( XML::Feed::Content->wrap({
+            # header() is forced into scalar context so that a missing or
+            # repeated header cannot add or remove elements in this hash.
+            type => scalar $res->header( 'Content-Type' ),
+            body => $body,
+            base => $link,
+        }));
+    }
+
+    $self->log->trace('around extract_entries - end');
+    return $entries;
+};
+
+1;
-- 
cgit v1.2.3