1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
|
package MooseX::UserAgent;
use Moose::Role;
our $VERSION = '0.2.0';
use Encode;
use URI;
use HTTP::Request;
use HTTP::Response;
use LWPx::ParanoidAgent;
use HTML::Encoding 'encoding_from_http_message';
use Compress::Zlib;
# Lazily-built LWPx::ParanoidAgent, configured from $self->useragent_conf.
# Recognized conf keys: name (UA string), mail (From header),
# max_size (response cap, default 3_000_000 octets), timeout (default 30s).
has 'agent' => (
    is      => 'rw',
    isa     => 'Object',
    lazy    => 1,
    default => sub {
        my $self  = shift;
        my $conf  = $self->useragent_conf;
        my $agent = LWPx::ParanoidAgent->new;
        if ( my $name = $conf->{name} ) { $agent->agent($name) }
        if ( my $mail = $conf->{mail} ) { $agent->from($mail) }
        $agent->max_size( $conf->{max_size} || 3_000_000 );
        $agent->timeout( $conf->{timeout} || 30 );
        return $agent;
    },
);
# Fetch a URL with a (possibly conditional) GET and return the
# HTTP::Response. When the useragent cache is enabled, previously stored
# validators (Last-Modified / ETag) are sent so the server may answer
# 304 Not Modified, and fresh validators are stored on success.
sub fetch {
    my ( $self, $url ) = @_;

    my $req = HTTP::Request->new( GET => URI->new($url) );
    $req->header( 'Accept-Encoding', 'gzip' );    # get_content() inflates gzip bodies

    my $use_cache = $self->context->{useragent}->{use_cache};

    if ($use_cache) {
        my $ref = $self->cache->get($url);
        if ( defined $ref ) {
            # Send both validators when available; the original stored the
            # ETag but never used it, so If-None-Match was never issued.
            $req->header( 'If-Modified-Since' => $ref->{LastModified} )
                if defined $ref->{LastModified} && $ref->{LastModified} ne '';
            $req->header( 'If-None-Match' => $ref->{ETag} )
                if defined $ref->{ETag} && $ref->{ETag} ne '';
        }
    }

    my $res = $self->agent->request($req);

    # Only refresh the cache on a successful (2xx) response: a 304 or an
    # error response usually carries no validators and would overwrite the
    # stored ones with empty strings, disabling conditional GETs afterwards.
    if ( $use_cache && $res->is_success ) {
        $self->cache->set(
            $url,
            {   ETag         => $res->header('Etag')          || '',
                LastModified => $res->header('Last-Modified') || '',
            }
        );
    }

    return $res;
}
# Return the response body as decoded text (UTF-8 semantics): inflate a
# gzip Content-Encoding, then re-decode any charset that is not already
# UTF-8, falling back to Encode::decode if LWP's decoding fails.
sub get_content {
    my ( $self, $res ) = @_;

    my $enc     = encoding_from_http_message($res);
    my $content = $res->content;

    if ( $res->content_encoding && $res->content_encoding eq 'gzip' ) {
        my $inflated = Compress::Zlib::memGunzip($content);
        # memGunzip returns undef on corrupt input; keep the raw bytes then
        # rather than silently replacing the body with undef.
        $content = $inflated if defined $inflated;
    }

    if ( $enc && $enc !~ /utf-8/i ) {
        # decoded_content(raise_error => 1) *dies* on failure, so it must
        # run inside eval. The original called it bare and then tested $@,
        # which meant the Encode::decode fallback was unreachable and any
        # charset problem was fatal to the caller.
        my $decoded = eval { $res->decoded_content( raise_error => 1 ) };
        if ( $@ || !defined $decoded ) {
            $content = Encode::decode( $enc, $content );
        }
        else {
            $content = $decoded;
        }
    }

    return $content;
}
1;
__END__
=head1 NAME
MooseX::UserAgent - Fetch a URL using LWP as the HTTP library
=head1 SYNOPSIS
package Foo;
use Moose;
with qw/MooseX::UserAgent/;
has useragent_conf => (
isa => 'HashRef',
default => sub {
{ name => 'myownbot', };
}
);
my $res = $self->fetch($url);
...
my $content = $self->get_content($res);
--- yaml configuration
name: 'Mozilla/5.0 (compatible; RTGI; http://rtgi.fr/)'
mail: 'bot@rtgi.fr'
max_size: 3000000
timeout: 30
--- kwalify schema
"use_cache":
name: use_cache
desc: use cache
required: true
type: int
"name":
name: name
desc: useragent string
required: true
type: str
"mail":
name: mail
desc: mail for the useragent
required: true
type: str
"timeout":
name: timeout
desc: timeout
required: true
type: int
"max_size":
name: max_size
desc: max size
required: true
type: int
=head1 DESCRIPTION
=head2 METHODS
=over 4
=item B<agent>
The default useragent is an LWPx::ParanoidAgent object. The name and
mail of the useragent may be set in the configuration. By default a
fetched page cannot exceed 3,000,000 octets and the timeout is set to
30 seconds.
=item B<fetch>
This method will fetch a given URL. It handles only the HTTP
protocol.
If caching is enabled, the validators (Last-Modified, ETag) stored for
the URL are sent with the request, so the server may answer with a
304 Not Modified response instead of resending the full content.
In the case of scraping search engines, a delay may be given, so we will
not hammer the server.
=item B<get_content>
This method will return a content in utf8.
=back
=head1 BUGS AND LIMITATIONS
=head1 AUTHOR
franck cuny C<< <franck.cuny@rtgi.fr> >>
=head1 LICENCE AND COPYRIGHT
Copyright (c) 2009, RTGI
All rights reserved.
|