package Feed::Role::ContentOnly;
use Moose::Role;
use 5.012;
use namespace::autoclean;
use HTML::ExtractMain 'extract_main_html';
use XML::Feed::Content;

=head1 NAME

Feed::Role::ContentOnly - extract and sanitize HTML content from a
feed body

=head1 SYNOPSIS

    set_feed_class(Feed->with_traits(
        'Mail',
        'LinkedPage',
        'ContentOnly',
    ));

If your feed only has summaries or only the first paragraph, you may
want to fetch the complete HTML page for processing. This role strips
out the sidebars and reduces the HTML of each entry body to its main
content by using L<HTML::ExtractMain>.

=cut

# Method modifier around extract_entries.
#
# Runs the wrapped extract_entries, then walks the returned entries and,
# for every entry whose content body is non-empty, replaces the entry's
# content with a new text/html XML::Feed::Content whose body is the main
# content found by HTML::ExtractMain. The base of the previous content
# is carried over.
#
# Entries with an empty body, and entries for which no main content
# could be extracted, are left untouched.
#
# Returns whatever the wrapped method returned (dereferenced here as an
# array reference of entries); the entries are modified in place.
around extract_entries => sub {
    my ($orig, $self) = @_;

    $self->log->trace('around extract_entries - begin');

    my $entries = $self->$orig();

    ENTRY:
    for my $entry (@{ $entries }) {
        my $content = $entry->content;
        my $html    = $content->body;
        $self->log->trace('finding content');
        next ENTRY if !$html;

        # Deliberately called in scalar context, outside the hash
        # constructor below: a call that yields an empty list there
        # would shift the following key/value pairs and corrupt the
        # arguments passed to wrap().
        my $main_html = extract_main_html($html);

        # Keep the original content rather than replacing it with an
        # undefined body when the extractor found nothing.
        # NOTE(review): HTML::ExtractMain documents an undef result for
        # "no main content found" -- confirm against the installed version.
        next ENTRY if !defined $main_html;

        # Same reasoning as above: pin the accessor to scalar context.
        my $base = $content->base;

        $entry->content(
            XML::Feed::Content->wrap({
                type => 'text/html',
                body => $main_html,
                base => $base,
            })
        );
    }

    $self->log->trace('around extract_entries - end');
    return $entries;
};

1;