1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
|
package MooseX::UserAgent;
use Moose::Role;
our $VERSION = '0.2.0';
use Encode;
use URI;
use HTTP::Request;
use HTTP::Response;
use LWPx::ParanoidAgent;
use HTML::Encoding 'encoding_from_http_message';
use Compress::Zlib;
# Lazily-built LWPx::ParanoidAgent, configured from $self->useragent_conf.
# Recognized conf keys: name (UA string), mail (From header),
# max_size (response cap, default 3_000_000 octets), timeout (default 30s).
has 'agent' => (
    is      => 'rw',
    isa     => 'Object',
    lazy    => 1,
    default => sub {
        my $self  = shift;
        my $conf  = $self->useragent_conf;
        my $agent = LWPx::ParanoidAgent->new;
        if ( my $name = $conf->{name} ) { $agent->agent($name) }
        if ( my $mail = $conf->{mail} ) { $agent->from($mail) }
        $agent->max_size( $conf->{max_size} || 3_000_000 );
        $agent->timeout( $conf->{timeout} || 30 );
        return $agent;
    },
);
# Fetch a URL with a (possibly conditional) GET and return the
# HTTP::Response. When the useragent cache is enabled, previously stored
# validators (Last-Modified / ETag) are sent so the server may answer
# 304 Not Modified, and fresh validators are stored on success.
sub fetch {
    my ( $self, $url ) = @_;

    my $req = HTTP::Request->new( GET => URI->new($url) );
    $req->header( 'Accept-Encoding', 'gzip' );    # get_content() inflates gzip bodies

    my $use_cache = $self->context->{useragent}->{use_cache};

    if ($use_cache) {
        my $ref = $self->cache->get($url);
        if ( defined $ref ) {
            # Send both validators when available; the original stored the
            # ETag but never used it, so If-None-Match was never issued.
            $req->header( 'If-Modified-Since' => $ref->{LastModified} )
                if defined $ref->{LastModified} && $ref->{LastModified} ne '';
            $req->header( 'If-None-Match' => $ref->{ETag} )
                if defined $ref->{ETag} && $ref->{ETag} ne '';
        }
    }

    my $res = $self->agent->request($req);

    # Only refresh the cache on a successful (2xx) response: a 304 or an
    # error response usually carries no validators and would overwrite the
    # stored ones with empty strings, disabling conditional GETs afterwards.
    if ( $use_cache && $res->is_success ) {
        $self->cache->set(
            $url,
            {   ETag         => $res->header('Etag')          || '',
                LastModified => $res->header('Last-Modified') || '',
            }
        );
    }

    return $res;
}
# Return the response body as decoded text (UTF-8 semantics): inflate a
# gzip Content-Encoding, then re-decode any charset that is not already
# UTF-8, falling back to Encode::decode if LWP's decoding fails.
sub get_content {
    my ( $self, $res ) = @_;

    my $enc     = encoding_from_http_message($res);
    my $content = $res->content;

    if ( $res->content_encoding && $res->content_encoding eq 'gzip' ) {
        my $inflated = Compress::Zlib::memGunzip($content);
        # memGunzip returns undef on corrupt input; keep the raw bytes then
        # rather than silently replacing the body with undef.
        $content = $inflated if defined $inflated;
    }

    if ( $enc && $enc !~ /utf-8/i ) {
        # decoded_content(raise_error => 1) *dies* on failure, so it must
        # run inside eval. The original called it bare and then tested $@,
        # which meant the Encode::decode fallback was unreachable and any
        # charset problem was fatal to the caller.
        my $decoded = eval { $res->decoded_content( raise_error => 1 ) };
        if ( $@ || !defined $decoded ) {
            $content = Encode::decode( $enc, $content );
        }
        else {
            $content = $decoded;
        }
    }

    return $content;
}
1;
__END__
=head1 NAME
MooseX::UserAgent - Fetch a URL using LWP as the HTTP library
=head1 SYNOPSIS
package Foo;
use Moose;
with qw/MooseX::UserAgent/;
has useragent_conf => (
isa => 'HashRef',
default => sub {
{ name => 'myownbot', };
}
);
my $res = $self->fetch($url);
...
my $content = $self->get_content($res);
--- yaml configuration
name: 'Mozilla/5.0 (compatible; RTGI; http://rtgi.fr/)'
mail: 'bot@rtgi.fr'
max_size: 3000000
timeout: 30
--- kwalify schema
"use_cache":
name: use_cache
desc: use cache
required: true
type: int
"name":
name: name
desc: useragent string
required: true
type: str
"mail":
name: mail
desc: mail for the useragent
required: true
type: str
"timeout":
name: timeout
desc: timeout
required: true
type: int
"max_size":
name: max_size
desc: max size
required: true
type: int
=head1 DESCRIPTION
=head2 METHODS
=over 4
=item B<agent>
The default useragent is an LWPx::ParanoidAgent object. The name and
mail of the useragent may be set in the configuration. By default a
fetched page cannot exceed 3,000,000 octets and the timeout is set to
30 seconds.
=item B<fetch>
This method will fetch a given URL. It handles only the HTTP
protocol.
If caching is enabled, the validators (Last-Modified, ETag) stored for
the URL are sent with the request, so the server may answer with a
304 Not Modified response instead of resending the full content.
In the case of scraping search engines, a delay may be given, so we will
not hammer the server.
=item B<get_content>
This method will return a content in utf8.
=back
=head1 BUGS AND LIMITATIONS
=head1 AUTHOR
franck cuny C<< <franck.cuny@rtgi.fr> >>
=head1 LICENCE AND COPYRIGHT
Copyright (c) 2009, RTGI
All rights reserved.
|