summaryrefslogblamecommitdiff
path: root/lib/MooseX/UserAgent.pm
blob: 72c2fa10c5617ae6005a0019245fda1bea3edaec (plain) (tree)
1
2

                          








































































                                                                          

  
 



           
                                                                  


               















































                                                           


                  
              
 







                                                                        
 
              
 

















                                                                        
 
                                         
 
                            
 

                        
package MooseX::UserAgent;

use Moose::Role;

our $VERSION = '0.2.0';

use Encode;
use HTTP::Response;
use LWPx::ParanoidAgent;
use HTML::Encoding 'encoding_from_http_message';
use Compress::Zlib;

has 'agent' => (
    isa     => 'Object',
    is      => 'rw',
    lazy    => 1,
    default => sub {
        my $self = shift;
        my $ua   = LWPx::ParanoidAgent->new;

        my $conf = $self->useragent_conf;
        $ua->agent( $conf->{name} ) if $conf->{name};
        $ua->from( $conf->{mail} )  if $conf->{mail};
        $ua->max_size( $conf->{max_size} || 3000000 );
        $ua->timeout( $conf->{timeout}   || 30 );
        $ua;
    }
);

sub fetch {
    my ( $self, $url ) = @_;

    my $req = HTTP::Request->new( GET => URI->new( $url ) );

    $req->header('Accept-Encoding', 'gzip');

    if ( $self->context->{ useragent }->{ use_cache } ) {
        my $ref = $self->cache->get( $url );
        if ( defined $ref && $ref->{ LastModified } ne '' ) {
            $req->header( 'If-Modified-Since' => $ref->{ LastModified } );
        }
    }

    my $res = $self->agent->request( $req );

    if ( $self->context->{ useragent }->{ use_cache } ) {
        $self->cache->set(
            $url,
            {   ETag         => $res->header( 'Etag' )          || '',
                LastModified => $res->header( 'Last-Modified' ) || ''
            }
        );
    }

    $res;
}

sub get_content {
    my ( $self, $res ) = @_;

    my $enc = encoding_from_http_message($res);

    my $content = $res->content;
    if ( $res->content_encoding && $res->content_encoding eq 'gzip' ) {
        $content = Compress::Zlib::memGunzip($content);
    }

    if ( $enc && $enc !~ /utf-8/i ) {
        $content = $res->decoded_content( raise_error => 1 );
        if ($@) {
            $content = Encode::decode( $enc, $content );
        }
    }
    $content;
}

1;

__END__

=head1 NAME

RTGI::Role::UserAgent - Fetch an url using LWP as the HTTP library

=head1 SYNOPSIS

    package Foo;

    use Moose;
    with qw/MooseX::UserAgent/;

    has useragent_conf => (
        isa     => 'HashRef',
        default => sub {
            { name => 'myownbot', };
        }
    );

    my $res = $self->fetch($url, $cache);
    ...
    my $content = $self->get_content($res);

    --- yaml configuration
    name: 'Mozilla/5.0 (compatible; RTGI; http://rtgi.fr/)'
    mail: 'bot@rtgi.fr'
    max_size: 3000000
    timeout: 30

    --- kwalify schema
    "use_cache":
      name: use_cache
      desc: use cache
      required: true
      type: int
    "name":
      name: name
      desc: useragent string
      required: true
      type: str
    "mail":
      name: mail
      desc: mail for the useragent
      required: true
      type: str
    "timeout":
      name: timeout
      desc: timeout
      required: true
      type: int
    "max_size":
      name: max_size
      desc: max size
      required: true
      type: int

=head1 DESCRIPTION

=head2 METHODS

=over 4

=item B<agent>

The default useragent is a LWPx::ParanoidAgent object. In the
configuration, the name, mail of the useragent have to be defined. The
default size of a page manipulated can't excess 3 000 000 octets and the
timeout is set to 30 seconds.

=item B<fetch>

This method will fetch a given URL. This method handle only the http
protocol.

If there is a cache configuration, the url will be checked in the cache,
and if there is a match, the content will be returned.

In the case of scraping search engines, a delay may be given, so we will
not hammer the server.

=item B<get_content>

This method will return a content in utf8.

=back

=head1 BUGS AND LIMITATIONS

=head1 AUTHOR

franck cuny  C<< <franck.cuny@rtgi.fr> >>

=head1 LICENCE AND COPYRIGHT

Copyright (c) 2009, RTGI
All rights reserved.