diff options
| author | franck cuny <franck@lumberjaph.net> | 2011-06-13 16:33:23 +0200 |
|---|---|---|
| committer | franck cuny <franck@lumberjaph.net> | 2011-06-13 16:33:23 +0200 |
| commit | 871336c030209b46ae6b124a702677363487f9a8 (patch) | |
| tree | 86f234d42c68b26a7aeb9cc373667127ad661e19 /lib/GitHub/Collector/Role | |
| parent | use template_toolkit and add infos about colors (diff) | |
| download | stargit-871336c030209b46ae6b124a702677363487f9a8.tar.gz | |
import github::collector
Signed-off-by: franck cuny <franck@lumberjaph.net>
Diffstat (limited to '')
| -rw-r--r-- | lib/GitHub/Collector/Role/Context.pm | 13 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/Graph/Edges.pm | 13 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/Graph/Gexf.pm | 47 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/Graph/Neighbors.pm | 128 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/Graph/Nodes.pm | 13 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/Graph/Query.pm | 45 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/Graph/Search.pm | 19 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/Languages.pm | 25 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/Logger.pm | 27 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/MongoDB.pm | 41 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/Pause.pm | 18 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/Profile.pm | 84 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/Relation.pm | 70 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/Repository.pm | 173 | ||||
| -rw-r--r-- | lib/GitHub/Collector/Role/SPORE.pm | 27 |
15 files changed, 743 insertions, 0 deletions
# GitHub::Collector roles.
#
# Each banner below names the file (taken from the diff header) whose content
# follows it.  Every section is a self-contained Moose role and ends with
# its own "1;".

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Context.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Context;

use YAML;
use Moose::Role;
with qw(MooseX::ConfigFromFile);

# Load the configuration from the YAML file $file.
#
# Returns whatever YAML::LoadFile yields in scalar context.
sub get_config_from_file {
    my ( $self, $file ) = @_;

    # Scalar assignment on purpose: keeps LoadFile in scalar context no
    # matter how this method itself is called.
    my $conf = YAML::LoadFile($file);
    return $conf;
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Graph/Edges.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Graph::Edges;

use Moose::Role;

# Edges of the graph being built, keyed by edge id (see
# GitHub::Collector::Role::Graph::Neighbors, which fills it).
has edges => (
    is         => 'rw',
    isa        => 'HashRef',
    lazy       => 1,
    default    => sub { {} },
    auto_deref => 1,
);

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Graph/Gexf.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Graph::Gexf;

use Moose::Role;
use Graph::GEXF;

has output => (
    is        => 'ro',
    isa       => 'Int',
    predicate => 'should_export',
);

# Build a Graph::GEXF document from $self->nodes and $self->edges and print
# its XML on the currently selected output handle.
#
# Uses $self->nodes and $self->edges, which are not defined in this role.
sub export {
    my ($self) = @_;

    my $gexf = Graph::GEXF->new();
    $gexf->add_node_attribute( name     => 'string' );
    $gexf->add_node_attribute( lang     => 'string' );
    $gexf->add_node_attribute( size     => 'int' );
    $gexf->add_node_attribute( country  => 'string' );
    $gexf->add_node_attribute( indegree => 'int' );

    # GEXF node object for each node name, needed to attach the edges below.
    my %gexf_node_for;
    foreach my $name ( keys %{ $self->nodes } ) {
        my $info = $self->nodes->{$name};
        my $n    = $gexf->add_node( $info->{id} );

        $n->label($name);
        $n->attribute( name     => $name );
        $n->attribute( size     => $info->{size} );
        $n->attribute( lang     => $info->{language} || '' );
        $n->attribute( country  => $info->{country} || '' );
        $n->attribute( indegree => $info->{indegree} );

        $gexf_node_for{$name} = $n;
    }

    foreach my $edge_id ( keys %{ $self->edges } ) {
        my $edge = $self->edges->{$edge_id};
        $gexf_node_for{ $edge->{sourceId} }->link_to(
            {
                target => $edge->{targetId},
                weight => $edge->{weight}
            }
        );
    }

    my $xml = $gexf->to_xml();
    print $xml;
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Graph/Neighbors.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Graph::Neighbors;

use Moose::Role;
with qw(GitHub::Collector::Role::Languages);

# This role uses $self->nodes, $self->edges, $self->db_edges and
# $self->db_profiles, none of which is defined here.

# Add $name and its direct edges to the graph.  When $with_connections is
# true, additionally expand every edge currently in the graph (see
# _connections).
sub neighbors {
    my ( $self, $name, $with_connections ) = @_;

    $self->_neighbors($name);

    if ($with_connections) {

        # keys() is evaluated once, so edges added while expanding are not
        # themselves expanded in this pass.
        foreach my $id ( keys %{ $self->edges } ) {
            $self->_connections( $id, $name );
        }
    }
}

# Make sure a node exists for $name, then load its outgoing and incoming
# edges.
sub _neighbors {
    my ( $self, $name ) = @_;

    if ( !defined $self->nodes->{$name} ) {
        $self->_create_node($name);
    }

    $self->_fetch_edges_from($name);
    $self->_fetch_edges_to($name);
}

# Expand the endpoint of edge $id that is not $name: the target when $name
# is the edge's source, the source in every other case.
sub _connections {
    my ( $self, $id, $name ) = @_;

    if ( $self->edges->{$id}->{sourceId} eq $name ) {
        $self->_neighbors( $self->edges->{$id}->{targetId} );
    }
    else {
        $self->_neighbors( $self->edges->{$id}->{sourceId} );
    }
}

# Drop every node whose size is below 2, then every edge that lost one of
# its endpoints.
sub remove_leaves {
    my $self = shift;

    foreach my $id ( keys %{ $self->nodes } ) {
        if ( $self->nodes->{$id}->{size} < 2 ) {
            delete $self->nodes->{$id};
        }
    }

    foreach my $id ( keys %{ $self->edges } ) {
        unless ( $self->nodes->{ $self->edges->{$id}->{sourceId} }
            && $self->nodes->{ $self->edges->{$id}->{targetId} } )
        {
            delete $self->edges->{$id};
        }
    }
}

# Load the edges whose source is $name from the edges collection.
sub _fetch_edges_from {
    my ( $self, $name ) = @_;

    my $connections = $self->db_edges->find( { source => $name } );

    while ( my $edge = $connections->next ) {
        $self->_register_edge(
            {
                id       => $edge->{source} . $edge->{target},
                targetId => $edge->{target},
                sourceId => $name,
                weight   => $edge->{weight},
            },
            $name,
            $edge->{target},
        );
    }
}

# Load the edges whose target is $name from the edges collection.
sub _fetch_edges_to {
    my ( $self, $name ) = @_;

    my $connections = $self->db_edges->find( { target => $name } );

    while ( my $edge = $connections->next ) {
        $self->_register_edge(
            {
                id       => $edge->{source} . $edge->{target},
                targetId => $name,
                sourceId => $edge->{source},
                weight   => $edge->{weight},
            },
            $name,
            $edge->{source},
        );
    }
}

# Store the edge record $record unless an edge with the same id is already
# known, and bump the size of both endpoints: $name (the node being
# expanded) and $other (the opposite endpoint, created with size 1 when it
# does not exist yet).
#
# NOTE(review): edge ids are the plain concatenation of source and target,
# so two different pairs could in principle produce the same id -- confirm
# whether a separator is wanted.
sub _register_edge {
    my ( $self, $record, $name, $other ) = @_;

    return if defined $self->edges->{ $record->{id} };

    $self->edges->{ $record->{id} } = $record;

    $self->nodes->{$name}->{size}++;

    if ( defined $self->nodes->{$other} ) {
        $self->nodes->{$other}->{size}++;
    }
    else {
        $self->_create_node( $other, 1 );
    }
}

# Create the node for $login with initial size $size (0 when omitted),
# filling language, country and indegree from the stored profile.
sub _create_node {
    my ( $self, $login, $size ) = @_;

    $size ||= 0;

    # find_one may find nothing; fall back to an empty profile so the
    # lookups below are plain hash accesses.
    my $info = $self->db_profiles->find_one( { login => $login } ) || {};

    $self->nodes->{$login} = {
        id       => $login,
        label    => $login,
        size     => $size,
        language => $self->map_languages( $info->{language} ),
        country  => $info->{country} || '',
        indegree => $info->{indegree},
    };
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Graph/Nodes.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Graph::Nodes;

use Moose::Role;

# Nodes of the graph being built, keyed by login (see
# GitHub::Collector::Role::Graph::Neighbors, which fills it).
has nodes => (
    is         => 'rw',
    isa        => 'HashRef',
    lazy       => 1,
    default    => sub { {} },
    auto_deref => 1,
);

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Graph/Query.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Graph::Query;

use Moose::Role;

has language => (
    is        => 'rw',
    isa       => 'Str',
    predicate => 'has_language',
);

has location => (
    is        => 'rw',
    isa       => 'Str',
    predicate => 'has_location',
);

has company => (
    is        => 'ro',
    isa       => 'Str',
    predicate => 'has_company',
);

has country => (
    is        => 'ro',
    isa       => 'Str',
    predicate => 'has_country',
);

# Build a search hash from the criteria that were actually set.
#
# Returns a hashref holding one key per attribute among language, location,
# company and country whose predicate is true.
sub build_query {
    my $self = shift;

    my $search = {};

    foreach my $attr (qw/language location company country/) {
        my $predicate = "has_$attr";
        if ( $self->$predicate ) {
            $search->{$attr} = $self->$attr;
        }
    }

    return $search;
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Graph/Search.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Graph::Search;

use Moose::Role;

# Build the graph for every profile matching $search (defaults to
# $self->build_query), then prune it with remove_leaves.
#
# Uses build_query, db_profiles, neighbors and remove_leaves, none of which
# is defined in this role.
sub build_from_query {
    my ( $self, $search ) = @_;

    $search ||= $self->build_query();

    my $profiles = $self->db_profiles->find($search);

    while ( my $profile = $profiles->next ) {
        $self->neighbors( $profile->{login}, 0 );
    }

    $self->remove_leaves();
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Languages.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Languages;

use Moose::Role;

has mapping => (
    is  => 'ro',
    isa => 'HashRef',
);

# Translate $language through $self->mapping->{languages}.
#
# Returns the mapped name, or "Other" when $language is undefined, when no
# mapping is configured, or when the mapping has no defined entry for it.
sub map_languages {
    my ( $self, $language ) = @_;

    return "Other" if !defined $language;

    # "mapping" is not a required attribute, so it may be undef here.
    my $mapping       = $self->mapping    || {};
    my $languages_map = $mapping->{languages} || {};

    if ( defined $languages_map->{$language} ) {
        return $languages_map->{$language};
    }
    else {
        return "Other";
    }
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Logger.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Logger;

use Moose::Role;
use Log::Dispatchouli;

# Logger shared by the collector roles; log/debug/fatal/error are delegated
# to it.
has logger => (
    is      => 'rw',
    isa     => 'Log::Dispatchouli',
    lazy    => 1,
    default => sub {
        return Log::Dispatchouli->new(
            {
                ident     => 'GitHub::Collector',
                facility  => 'user',
                to_stdout => 1,
            }
        );
    },
    handles => {
        log   => 'log',
        debug => 'log_debug',
        fatal => 'log_fatal',
        error => 'log_error',
    },
);

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/MongoDB.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::MongoDB;

use Moose::Role;
use MongoDB;

# Handle on the "github" database.  Built on first use; the indexes are
# ensured at that point.  The db_* methods return the matching collections.
has mongodb => (
    is      => 'ro',
    isa     => 'Object',
    lazy    => 1,
    default => sub {
        my $self = shift;
        my $conn =
          MongoDB::Connection->new( timeout => 60000, query_timeout => 60000 );
        my $db = $conn->github;
        $self->_create_indexes($db);
        return $db;
    },
    handles => {
        db_profiles     => 'profiles',
        db_repositories => 'repositories',
        db_relations    => 'relations',
        db_contributors => 'contributors',
        db_edges        => 'edges',
    }
);

# Ensure the indexes used by the collector exist on database $db.
sub _create_indexes {
    my ( $self, $db ) = @_;

    $db->profiles->ensure_index( { login => 1 }, { unique => 1 } );
    $db->repositories->ensure_index( { uniq_name => 1 }, { unique => 1 } );
    $db->contributors->ensure_index( { project => 1 } );
    $db->contributors->ensure_index( { owner   => 1 } );
    $db->relations->ensure_index( { source => 1 } );
    $db->relations->ensure_index( { target => 1 } );
    $db->relations->ensure_index( { login  => 1 } );
    $db->edges->ensure_index( { source => 1 } );
    $db->edges->ensure_index( { target => 1 } );
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Pause.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Pause;

use Moose::Role;
use Scalar::Util ();

# Number of seconds to sleep before retrying after an HTTP 403.
has pause_on_error => (
    is      => 'ro',
    isa     => 'Int',
    default => 10,
);

# Throttle between requests: sleeps one second when int(rand(10)) is 1 and
# zero seconds otherwise.
#
# This method performs the sleep itself; callers must not sleep on its
# return value (which is simply what sleep() returned).
sub pause {
    my $self  = shift;
    my $rand  = int( rand(10) );
    my $pause = $rand == 1 ? $rand : 0;
    sleep($pause);
}

# Return the status carried by the exception $error, or 0 when $error is
# not an object with a "status" method (e.g. a plain die string).
sub _error_status {
    my ( $self, $error ) = @_;

    return 0
      unless Scalar::Util::blessed($error) && $error->can('status');

    return $error->status || 0;
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Profile.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Profile;

use Try::Tiny;
use Moose::Role;
use boolean;

with qw/
  GitHub::Collector::Role::Pause
  GitHub::Collector::Role::Relation
  /;

# Fetch the profile of user $profile through the SPORE client.
#
# Returns the response body on success.  On HTTP 403 it sleeps for
# pause_on_error seconds and retries (with no retry limit); on 404 it
# deletes the profile and returns nothing; on any other error it logs a
# debug message and returns nothing.
sub fetch_profile {
    my ( $self, $profile ) = @_;

    my ( $res, $error );

    try {
        $res = $self->spore_client->get_info(
            format   => 'json',
            username => $profile,
        );
    }
    catch {
        $error = $_;
    };

    if ($error) {
        my $status = $self->_error_status($error);

        if ( $status == 403 ) {
            $self->debug( [ "need to pause (while working on %s)", $profile ] );

            sleep( $self->pause_on_error );
            return $self->fetch_profile($profile);
        }
        elsif ( $status == 404 ) {
            $self->debug("profile $profile doesn't exists anymore");
            $self->delete_profile($profile);
            return;
        }
        else {
            $self->debug("can't fetch information for $profile: $error");
            return;
        }
    }

    $self->pause;
    return $res->body;
}

# Store the fetched profile $profile_info (a hash with a "user" entry) on
# the profile document with the same login, stamping crawled_at and
# resetting repositories_done.  The user's "id" field is removed first.
sub save_profile {
    my ( $self, $profile_info ) = @_;

    my $id   = delete $profile_info->{user}->{id};
    my $time = time();

    $self->db_profiles->update(
        { login => $profile_info->{user}->{login} },
        {
            '$set' => {
                crawled_at        => $time,
                repositories_done => false,
                %{ $profile_info->{user} }
            },
        }
    );

    $self->log( "profile " . $profile_info->{user}->{login} . " saved" );
}

# Remove the profile $profile and every relation in which it is the source
# or the target.
sub delete_profile {
    my ( $self, $profile ) = @_;

    $self->db_profiles->remove( { login => $profile } );
    foreach my $type (qw/target source/) {
        $self->db_relations->remove( { $type => $profile } );
    }
    $self->log("all informations regarding $profile have been deleted");
}

# Flag the profile $login as done.
sub profile_is_done {
    my ( $self, $login ) = @_;
    $self->db_profiles->update(
        { login  => $login },
        { '$set' => { done => true } },
    );
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Relation.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Relation;

use Try::Tiny;
use Moose::Role;

with qw/GitHub::Collector::Role::Pause/;

# Kinds of relations to fetch; each one maps to a "list_<type>" client call.
has types => (
    is         => 'ro',
    isa        => 'ArrayRef',
    auto_deref => 1,
    default    => sub { [qw/followers following/] }
);

# Fetch every relation type for $login, bootstrap each related user and
# record the relation: follower -> $login for "followers", $login -> user
# for any other type.
#
# Uses _bootstrap_profile, which is not defined in this role.
sub add_relations {
    my ( $self, $login ) = @_;

    foreach my $type ( $self->types ) {
        my $users = $self->_grab_relations( $login, $type );

        # _grab_relations returns undef when the fetch failed.
        foreach my $user ( @{ $users || [] } ) {
            $self->_bootstrap_profile($user);
            if ( $type eq 'followers' ) {
                $self->_add_relation( $user, $login );
            }
            else {
                $self->_add_relation( $login, $user );
            }
        }
    }
}

# Fetch the users related to $login for relation $type.
#
# Returns the "users" entry of the response body, or undef when the request
# failed.  On HTTP 403 it sleeps for pause_on_error seconds and returns the
# result of a retry (with no retry limit).
sub _grab_relations {
    my ( $self, $login, $type ) = @_;

    $self->log( [ "fetching %s informations for %s", $type, $login ] );

    my $method = 'list_' . $type;
    my ( $users, $error );

    try {
        $users = $self->spore_client->$method(
            format => 'json',
            user   => $login,
        )->body->{users};
    }
    catch {
        $error = $_;
    };

    # The error is handled outside the catch block so that the retry's
    # result can be returned from this sub.
    if ($error) {
        if ( $self->_error_status($error) == 403 ) {
            $self->debug(
                [ "need to pause (while grabbing relations for %s)", $login ] );
            sleep( $self->pause_on_error );
            return $self->_grab_relations( $login, $type );
        }

        $self->debug(
            [ "can't fetch %s relation for %s: %s", $type, $login, $error ] );
    }

    $self->pause;
    return $users;
}

# Insert the relation $source -> $target unless it is already stored.
sub _add_relation {
    my ( $self, $source, $target ) = @_;
    my $search = { source => $source, target => $target };
    my $exists = $self->db_relations->find_one($search);
    $self->db_relations->insert($search) if !$exists;
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Repository.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Repository;

use Moose::Role;
use Try::Tiny;
use boolean;

with qw/GitHub::Collector::Role::Pause/;

# Fetch the repositories of $profile (a hash with a "login" entry), collect
# language and contributor data for each one that is not a fork and has
# forks, save them, then record the profile's main language.
#
# Returns 1 on success and nothing when the repository list could not be
# fetched.  On HTTP 403 it sleeps for pause_on_error seconds and retries
# (with no retry limit).
sub fetch_repositories {
    my ( $self, $profile ) = @_;

    my ( $repositories, $error );

    # Accumulator filled by _get_lang; it must be a real hashref here,
    # otherwise _get_lang would only fill its own private copy.
    my $languages = {};

    try {
        $repositories = $self->spore_client->list_repos(
            user   => $profile->{login},
            format => 'json',
        )->body->{repositories};
    }
    catch {
        $error = $_;
    };

    if ($error) {
        if ( $self->_error_status($error) == 403 ) {
            $self->debug(
                [ "need to pause (while working on %s)", $profile->{login} ] );
            sleep( $self->pause_on_error );
            return $self->fetch_repositories($profile);
        }
        else {
            $self->debug( "can't fetch repositories for "
                  . $profile->{login}
                  . ": $error" );
            return;
        }
    }

    foreach my $repo ( @{ $repositories || [] } ) {

        next if $repo->{fork};
        next unless $repo->{forks};

        $self->_get_lang( $profile, $repo, $languages );
        $self->_get_contributors( $profile, $repo );
        $self->_save_repository( $profile, $repo );
    }

    $self->pause;
    $self->_update_profile( $profile->{login}, $languages );
    return 1;
}

# Mark the repositories of $login as done and store the main language
# computed from the accumulated $languages counts.
sub _update_profile {
    my ( $self, $login, $languages ) = @_;

    my $lang = $self->_main_lang($languages);

    $self->db_profiles->update(
        { login  => $login },
        { '$set' => { repositories_done => true, language => $lang } },
    );
}

# Save $repo and its contributors, but only when it has more than one
# contributor.  The "contributors" entry is removed from $repo either way.
sub _save_repository {
    my ( $self, $profile, $repo ) = @_;

    my $contributors = delete $repo->{contributors} || [];

    if ( scalar @$contributors > 1 ) {
        my $project_name = $profile->{login} . '/' . $repo->{name};

        $repo->{uniq_name} = $project_name;
        $self->db_repositories->insert($repo);

        $self->_save_contributors( $profile->{login}, $project_name,
            $contributors );

        $self->log(
            [
                'Add repository %s owned by %s', $repo->{name},
                $profile->{login}
            ]
        );
    }
}

# Insert one contributor document per entry of $contributors for project
# $project_name, skipping the owner.
sub _save_contributors {
    my ( $self, $owner, $project_name, $contributors ) = @_;

    $self->log(
        [ 'Add %s contributor(s) to %s', scalar @$contributors, $project_name ]
    );

    foreach my $contrib (@$contributors) {
        next if $owner eq $contrib->{login};
        $self->db_contributors->insert(
            {
                project       => $project_name,
                owner         => $owner,
                contributor   => $contrib->{login},
                contributions => $contrib->{contributions},
            }
        );
    }
}

# Fetch the languages of $repo, add their counts to the accumulator
# $languages and store the repository's main language in $repo->{lang}.
#
# On HTTP 403 it sleeps for pause_on_error seconds and returns the result
# of a retry (with no retry limit).  On any other error the accumulator is
# left untouched and $repo->{lang} is set to undef.
sub _get_lang {
    my ( $self, $profile, $repo, $languages ) = @_;

    my ( $pr_languages, $error );

    try {
        $pr_languages = $self->spore_client->list_languages(
            user   => $profile->{login},
            repo   => $repo->{name},
            format => 'json'
        )->body->{languages};
    }
    catch {
        $error = $_;
    };

    # Handled outside the catch block so that this call stops here and does
    # not overwrite what the retry stored in $repo->{lang}.
    if ( $error && $self->_error_status($error) == 403 ) {
        $self->debug(
            [ "need to pause (while getting lang for %s)", $repo->{name} ] );
        sleep( $self->pause_on_error );
        return $self->_get_lang( $profile, $repo, $languages );
    }

    foreach my $name ( keys %{ $pr_languages || {} } ) {
        $languages->{$name} += $pr_languages->{$name};
    }

    $repo->{lang} = $self->_main_lang($pr_languages);
}

# Return the key of $languages with the highest numeric value, or undef
# when $languages is undefined or empty.
sub _main_lang {
    my ( $self, $languages ) = @_;

    $languages ||= {};

    my $lang = (
        sort { $languages->{$b} <=> $languages->{$a} }
          keys %$languages
    )[0];
    return $lang;
}

# Fetch the contributors of $repo and store them in $repo->{contributors}:
# the fetched list when it has more than one entry, an empty list otherwise
# (including when the request failed).
#
# On HTTP 403 it sleeps for pause_on_error seconds and retries (with no
# retry limit).
sub _get_contributors {
    my ( $self, $profile, $repo ) = @_;

    my ( $contributors, $error );

    try {
        $contributors = $self->spore_client->list_contributors(
            user   => $profile->{login},
            repo   => $repo->{name},
            format => 'json'
        )->body->{contributors};
    }
    catch {
        $error = $_;
    };

    if ($error) {
        if ( $self->_error_status($error) == 403 ) {
            $self->debug(
                [
                    "need to pause (while getting contributors for %s)",
                    $repo->{name}
                ]
            );
            sleep( $self->pause_on_error );
            return $self->_get_contributors( $profile, $repo );
        }

        $repo->{contributors} = [];
        return;
    }

    $repo->{contributors} =
      ( $contributors && scalar @$contributors > 1 ) ? $contributors : [];
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/SPORE.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::SPORE;

use Moose::Role;
use Net::HTTP::Spore;

has spore_configuration => (
    is            => 'ro',
    isa           => 'HashRef',
    required      => 1,
    documentation => 'SPORE configuration',
);

# SPORE client built from the specification named in
# spore_configuration->{github}->{description}, with the Format::JSON
# middleware enabled.
has spore_client => (
    is      => 'rw',
    isa     => 'Object',
    lazy    => 1,
    default => sub {
        my $self   = shift;
        my $client = Net::HTTP::Spore->new_from_spec(
            $self->spore_configuration->{github}->{description},
        );
        $client->enable('Format::JSON');
        $client;
    }
);

1;
