diff options
author | Max Maischein <corion@corion.net> | 2013-07-09 20:06:47 +0200 |
---|---|---|
committer | dakkar <dakkar@thenautilus.net> | 2013-07-09 19:30:34 +0100 |
commit | ceb6750cc5934cc19226243af9d78b0ddecda920 (patch) | |
tree | 97a1b6f199183fa04af7d409793463d91852e493 /lib/Feed/Role/ContentOnly.pm | |
parent | readme + licence (diff) | |
download | feeder-ceb6750cc5934cc19226243af9d78b0ddecda920.tar.gz feeder-ceb6750cc5934cc19226243af9d78b0ddecda920.tar.bz2 feeder-ceb6750cc5934cc19226243af9d78b0ddecda920.zip |
Add page fetcher and content extractor
Diffstat (limited to 'lib/Feed/Role/ContentOnly.pm')
-rw-r--r-- | lib/Feed/Role/ContentOnly.pm | 52 |
1 files changed, 52 insertions, 0 deletions
# File: lib/Feed/Role/ContentOnly.pm (added as a new file by this commit)
package Feed::Role::ContentOnly;
use Moose::Role;
use 5.012;
use namespace::autoclean;
use HTML::ExtractMain 'extract_main_html';
use XML::Feed::Content;

=head1 NAME

Feed::Role::ContentOnly - extract and sanitize HTML content from a
feed body

=head1 SYNOPSIS

    set_feed_class(Feed->with_traits(
        'Mail',
        'LinkedPage',
        'ContentOnly',
    ));

=head1 DESCRIPTION

If your feed only has summaries or only the first paragraph, maybe you
want to fetch the complete HTML page for processing. This strips out
the sidebars and sanitizes the HTML to look sane by using
L<HTML::ExtractMain>.

=head1 METHOD MODIFIERS

=head2 extract_entries

Wraps the C<extract_entries> method of the consuming class. The wrapped
method is called first (any arguments are passed through unchanged) and
is expected to return an array reference of entries. For every entry
whose content body is true, the body is run through
C<extract_main_html> and the entry's content is replaced by a new
L<XML::Feed::Content> of type C<text/html> that keeps the original
C<base>. Entries with an empty body, and entries for which no main
content could be extracted, keep their original content.

Returns the same array reference that the wrapped method returned.

The consuming class must also provide a C<log> method whose return
value accepts C<trace>.

=cut

around extract_entries => sub {
    my ($orig, $self, @args) = @_;

    $self->log->trace('around extract_entries - begin');

    # Forward any extra arguments so this wrapper stays transparent to
    # callers of the wrapped method.
    my $entries = $self->$orig(@args);

    ENTRY:
    for my $entry (@{ $entries }) {
        my $content = $entry->content;
        my $html    = $content->body;
        $self->log->trace('finding content');

        # Nothing to extract from an empty body.
        next ENTRY unless $html;

        # Assign to scalars before building the hash below: inside the
        # hash constructor these calls would run in list context, and a
        # call that returns an empty list there would silently shift
        # the remaining key/value pairs.
        my $main_html = extract_main_html($html);
        my $base      = $content->base;

        # Keep the original content rather than replacing it with an
        # empty body when no main content was found.
        if (!defined $main_html || !length $main_html) {
            $self->log->trace('no main content found - keeping original');
            next ENTRY;
        }

        $entry->content(
            XML::Feed::Content->wrap({
                type => 'text/html',
                body => $main_html,
                base => $base,
            })
        );
    }

    $self->log->trace('around extract_entries - end');
    return $entries;
};

1;