lib/Feed/Role/ContentOnly.pm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52

package Feed::Role::ContentOnly; 
use Moose::Role;
use 5.012;
use namespace::autoclean;
use HTML::ExtractMain 'extract_main_html';
use XML::Feed::Content;
 
=head1 NAME
 
Feed::Role::ContentOnly - extract and sanitize HTML content from a
feed body
 
=head1 SYNOPSIS
 
    set_feed_class(Feed->with_traits(
        'Mail',
        'LinkedPage',
        'ContentOnly',
    ));
 
If your feed only contains summaries or the first paragraph of each
entry, you may want to fetch the complete HTML page for processing.
This role strips out sidebars and other surrounding markup from the
entry body and cleans up the remaining HTML using L<HTML::ExtractMain>.
 
=cut
 
# Method modifier: post-process the entries produced by the wrapped
# extract_entries, replacing each entry's body with the main content
# found by HTML::ExtractMain.
#
# Any arguments given to extract_entries are passed through unchanged to
# the wrapped method. Returns the value the wrapped method returned (it is
# dereferenced here as an array of entry objects); entries whose body could
# be reduced to a main-content block are updated in place.
around extract_entries => sub {
    my ($orig, $self, @args) = @_;

    $self->log->trace('around extract_entries - begin');

    my $entries = $self->$orig(@args);

    for my $entry (@{ $entries }) {
        my $content = $entry->content;
        my $html    = $content->body;
        $self->log->trace('finding content');
        next unless $html;

        # Call in scalar context, outside the hash constructor below: an
        # empty-list return there would shift the key/value pairs.
        my $main_html = extract_main_html($html);

        # Per the HTML::ExtractMain documentation the extractor may yield
        # undef when no main block is found; keep the original body in that
        # case instead of replacing it with nothing.
        next unless defined $main_html;

        $entry->content(
            XML::Feed::Content->wrap({
                type => 'text/html',
                body => $main_html,
                base => $content->base,
            })
        );
    }

    $self->log->trace('around extract_entries - end');
    return $entries;
};
 
1;