diff options
Diffstat (limited to 'lib/Feed/Role/ContentOnly.pm')
-rw-r--r-- | lib/Feed/Role/ContentOnly.pm | 52 |
1 files changed, 52 insertions, 0 deletions
package Feed::Role::ContentOnly;
use Moose::Role;
use 5.012;
use namespace::autoclean;
use HTML::ExtractMain 'extract_main_html';
use XML::Feed::Content;

=head1 NAME

Feed::Role::ContentOnly - extract and sanitize HTML content from a
feed body

=head1 SYNOPSIS

  set_feed_class(Feed->with_traits(
      'Mail',
      'LinkedPage',
      'ContentOnly',
  ));

=head1 DESCRIPTION

If your feed only has summaries or only the first paragraph, maybe you
want to fetch the complete HTML page for processing. This strips out
the sidebars and sanitizes the HTML to look sane by using
L<HTML::ExtractMain>.

=head1 METHOD MODIFIERS

=head2 extract_entries

Wraps the C<extract_entries> method of the consuming class. All
arguments are passed through to the wrapped method unchanged, and the
entries it returns are handed back to the caller.

For every returned entry whose content body is a true value, the content
is replaced by a new L<XML::Feed::Content> of type C<text/html> whose
body is the result of C<extract_main_html> and whose base is taken from
the original content. Entries with an empty body, and entries for which
C<extract_main_html> yields nothing, are left untouched.

=cut

around extract_entries => sub {
    my ($orig, $self, @args) = @_;

    $self->log->trace('around extract_entries - begin');

    # Forward the caller's arguments; calling $orig without them would
    # silently drop anything passed to extract_entries.
    my $entries = $self->$orig(@args);

    for my $entry (@{ $entries }) {
        my $content = $entry->content;
        my $html    = $content->body;
        $self->log->trace('finding content');
        next unless $html;

        # Capture the result in a scalar first. Per the HTML::ExtractMain
        # documentation the function can come back empty when it finds no
        # main content block; called directly inside the hash constructor
        # below (list context) an empty return would shift the key/value
        # pairs and corrupt the hash.
        my $main_html = extract_main_html($html);
        unless (defined $main_html) {
            # Keep the original content rather than wiping the body.
            $self->log->trace('no main content found - keeping original content');
            next;
        }

        $entry->content(
            XML::Feed::Content->wrap({
                type => 'text/html',
                body => $main_html,
                base => $content->base,
            })
        );
    }

    $self->log->trace('around extract_entries - end');
    return $entries;
};

1;