summaryrefslogtreecommitdiff
path: root/lib/MooseX/UserAgent.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/MooseX/UserAgent.pm')
-rw-r--r--lib/MooseX/UserAgent.pm167
1 files changed, 155 insertions, 12 deletions
diff --git a/lib/MooseX/UserAgent.pm b/lib/MooseX/UserAgent.pm
index 30686cf..72c2fa1 100644
--- a/lib/MooseX/UserAgent.pm
+++ b/lib/MooseX/UserAgent.pm
@@ -1,32 +1,175 @@
package MooseX::UserAgent;
-use Moose;
-our $VERSION = '0.01';
+use Moose::Role;
+
+our $VERSION = '0.2.0';
+
+use Encode;
+use HTTP::Response;
+use LWPx::ParanoidAgent;
+use HTML::Encoding 'encoding_from_http_message';
+use Compress::Zlib;
+
+has 'agent' => (
+ isa => 'Object',
+ is => 'rw',
+ lazy => 1,
+ default => sub {
+ my $self = shift;
+ my $ua = LWPx::ParanoidAgent->new;
+
+ my $conf = $self->useragent_conf;
+ $ua->agent( $conf->{name} ) if $conf->{name};
+ $ua->from( $conf->{mail} ) if $conf->{mail};
+ $ua->max_size( $conf->{max_size} || 3000000 );
+ $ua->timeout( $conf->{timeout} || 30 );
+ $ua;
+ }
+);
+
+sub fetch {
+ my ( $self, $url ) = @_;
+
+ my $req = HTTP::Request->new( GET => URI->new( $url ) );
+
+ $req->header('Accept-Encoding', 'gzip');
+
+ if ( $self->context->{ useragent }->{ use_cache } ) {
+ my $ref = $self->cache->get( $url );
+ if ( defined $ref && $ref->{ LastModified } ne '' ) {
+ $req->header( 'If-Modified-Since' => $ref->{ LastModified } );
+ }
+ }
+
+ my $res = $self->agent->request( $req );
+
+ if ( $self->context->{ useragent }->{ use_cache } ) {
+ $self->cache->set(
+ $url,
+ { ETag => $res->header( 'Etag' ) || '',
+ LastModified => $res->header( 'Last-Modified' ) || ''
+ }
+ );
+ }
+
+ $res;
+}
+
+sub get_content {
+ my ( $self, $res ) = @_;
+
+ my $enc = encoding_from_http_message($res);
+
+ my $content = $res->content;
+ if ( $res->content_encoding && $res->content_encoding eq 'gzip' ) {
+ $content = Compress::Zlib::memGunzip($content);
+ }
+
+ if ( $enc && $enc !~ /utf-8/i ) {
+ $content = $res->decoded_content( raise_error => 1 );
+ if ($@) {
+ $content = Encode::decode( $enc, $content );
+ }
+ }
+ $content;
+}
1;
+
__END__
=head1 NAME
-MooseX::UserAgent -
+RTGI::Role::UserAgent - Fetch an url using LWP as the HTTP library
=head1 SYNOPSIS
- use MooseX::UserAgent;
+ package Foo;
+
+ use Moose;
+ with qw/MooseX::UserAgent/;
+
+ has useragent_conf => (
+ isa => 'HashRef',
+ default => sub {
+ { name => 'myownbot', };
+ }
+ );
+
+ my $res = $self->fetch($url, $cache);
+ ...
+ my $content = $self->get_content($res);
+
+ --- yaml configuration
+ name: 'Mozilla/5.0 (compatible; RTGI; http://rtgi.fr/)'
+ mail: 'bot@rtgi.fr'
+ max_size: 3000000
+ timeout: 30
+
+ --- kwalify schema
+ "use_cache":
+ name: use_cache
+ desc: use cache
+ required: true
+ type: int
+ "name":
+ name: name
+ desc: useragent string
+ required: true
+ type: str
+ "mail":
+ name: mail
+ desc: mail for the useragent
+ required: true
+ type: str
+ "timeout":
+ name: timeout
+ desc: timeout
+ required: true
+ type: int
+ "max_size":
+ name: max_size
+ desc: max size
+ required: true
+ type: int
=head1 DESCRIPTION
-MooseX::UserAgent is
+=head2 METHODS
-=head1 AUTHOR
+=over 4
+
+=item B<agent>
+
+The default useragent is a LWPx::ParanoidAgent object. In the
+configuration, the name, mail of the useragent have to be defined. The
+default size of a page manipulated can't excess 3 000 000 octets and the
+timeout is set to 30 seconds.
-franck cuny E<lt>franck.cuny {at} rtgi.frE<gt>
+=item B<fetch>
-=head1 SEE ALSO
+This method will fetch a given URL. This method handle only the http
+protocol.
+
+If there is a cache configuration, the url will be checked in the cache,
+and if there is a match, the content will be returned.
+
+In the case of scraping search engines, a delay may be given, so we will
+not hammer the server.
+
+=item B<get_content>
+
+This method will return a content in utf8.
+
+=back
+
+=head1 BUGS AND LIMITATIONS
+
+=head1 AUTHOR
-=head1 LICENSE
+franck cuny C<< <franck.cuny@rtgi.fr> >>
-This library is free software; you can redistribute it and/or modify
-it under the same terms as Perl itself.
+=head1 LICENCE AND COPYRIGHT
-=cut
+Copyright (c) 2009, RTGI
+All rights reserved.