diff options
Diffstat (limited to 'lib/GitHub')
24 files changed, 1262 insertions, 0 deletions
# ---------------------------------------------------------------------------
# lib/GitHub/Collector.pm
# ---------------------------------------------------------------------------
package GitHub::Collector;

# Application class: a MooseX::App::Cmd application.  The command classes
# below all extend MooseX::App::Cmd::Command.
use Moose;
extends qw(MooseX::App::Cmd);

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Command/country.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Command::country;

use Moose;
use boolean;

extends qw(MooseX::App::Cmd::Command);

# Must be declared before the roles are applied: Net::HTTP::Spore::Role is
# configured below to read its "geo" client configuration from this attribute.
has geo_conf => (
    is            => 'rw',
    isa           => 'HashRef',
    required      => 1,
    documentation => 'SPORE configuration for Geo API',
);

with
  'GitHub::Collector::Role::Logger',
  'GitHub::Collector::Role::Context',
  'GitHub::Collector::Role::MongoDB',
  'Net::HTTP::Spore::Role' =>
  { spore_clients => [ { name => 'geo', config => 'geo_conf' } ] };

# Tag every profile whose country_done flag is false with a country.
sub execute {
    my $self = shift;

    $self->log("start to tag user using country");

    my $profiles = $self->db_profiles->find( { country_done => false } );

    while ( my $profile = $profiles->next ) {
        $self->_tag_profile_by_country($profile);
    }

    $self->log("done tagging users");
}

# Look up the country for one profile from its free-text location.
#
# Profiles without a location, or whose location yields no result, are stored
# with country => false so they are not looked up again.
# Dies with "no more requests" when the geo service answers with status
# value 19.
sub _tag_profile_by_country {
    my ( $self, $profile ) = @_;

    my $login = $profile->{login};

    if ( !defined $profile->{location} ) {
        $self->_update_country( $login, false );
        return;
    }

    $self->log(
        "searching for " . $login . " based in " . $profile->{location} );

    my $res = $self->geo->search(
        q        => $profile->{location},
        username => $self->geo_conf->{api_username},
    )->body;

    die "no more requests"
      if $res->{status} && $res->{status}->{value} == 19;

    # Only the first match returned by the service is considered.
    my $country = $res->{geonames}->[0]->{countryName};
    $self->_update_country( $login, $country ? $country : false );
    return;
}

# Store the country for $login and flag the profile as processed.
sub _update_country {
    my ( $self, $login, $country ) = @_;

    $self->db_profiles->update( { login => $login },
        { '$set' => { country => $country, country_done => true } } );
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Command/edges.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Command::edges;

use Moose;
use boolean;

extends qw(MooseX::App::Cmd::Command);

with qw(
  GitHub::Collector::Role::Context
  GitHub::Collector::Role::Logger
  GitHub::Collector::Role::MongoDB
);

# Build weighted edges (contributor -> repository owner) for every profile
# matching edges_done => false that has no outgoing edge yet.
sub execute {
    my $self = shift;

    $self->log("start to merge contributions");

    my $profiles = $self->db_profiles->find( { edges_done => false } );

    while ( my $profile = $profiles->next ) {
        next if $self->_is_done( $profile->{login} );
        $self->log( "merge contributions for " . $profile->{login} );
        $self->_contributions( $profile->{login} );
    }

    $self->log("done merging contributions");
}

# Returns the number of edges already stored with $login as source
# (true when at least one exists).
sub _is_done {
    my ( $self, $login ) = @_;
    return $self->db_edges->find( { source => $login } )->count;
}

# Aggregate the contributions of $login per repository owner, insert one edge
# per owner and flag the profile with edges_done => true.
#
# The weight of one contribution is the integer percentage
# contributions / project size, with a minimum of 1.
sub _contributions {
    my ( $self, $login ) = @_;

    my $contributions =
      $self->db_contributors->find( { contributor => $login } );

    my %weight_for;

    while ( my $contrib = $contributions->next ) {
        my $project = $self->db_repositories->find_one(
            { uniq_name => $contrib->{project} } );

        # Skip contributions whose repository is unknown or has no usable
        # size (this also avoids a division by zero).
        next unless $project && $project->{size};

        my $total =
          int( ( $contrib->{contributions} / $project->{size} ) * 100 );
        $total ||= 1;
        $weight_for{ $contrib->{owner} } += $total;
    }

    foreach my $owner ( keys %weight_for ) {
        $self->db_edges->insert(
            {
                source => $login,
                target => $owner,
                weight => $weight_for{$owner},
            }
        );
    }

    $self->db_profiles->update(
        { login  => $login },
        { '$set' => { edges_done => true } },
    );
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Command/graph.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Command::graph;

use Moose;
use YAML::Syck;

extends qw(MooseX::App::Cmd::Command);

with qw(
  GitHub::Collector::Role::Context
  GitHub::Collector::Role::Logger
  GitHub::Collector::Role::MongoDB
  GitHub::Collector::Role::Graph::Query
  GitHub::Collector::Role::Graph::Nodes
  GitHub::Collector::Role::Graph::Edges
  GitHub::Collector::Role::Graph::Neighbors
  GitHub::Collector::Role::Graph::Search
  GitHub::Collector::Role::Graph::Gexf
);

# Login of a single profile to build the graph around.
has profile => (
    is        => 'ro',
    isa       => 'Str',
    predicate => 'has_profile',
);

# Only profiles with an indegree strictly greater than this are selected.
has indegree => (
    is        => 'ro',
    isa       => 'Int',
    predicate => 'has_indegree',
);

# Build the graph from (in order of precedence) a single profile, an indegree
# threshold, or the query attributes, then export it when requested.
sub execute {
    my $self = shift;

    if ( $self->has_profile ) {
        $self->neighbors( $self->profile, 1 );
        $self->remove_leaves();
    }
    elsif ( $self->has_indegree ) {
        $self->build_from_query(
            { indegree => { '$gt' => $self->indegree } } );
    }
    else {
        $self->build_from_query();
    }

    $self->export() if $self->should_export;
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Command/indegree.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Command::indegree;

use Moose;

extends qw(MooseX::App::Cmd::Command);

with qw(
  GitHub::Collector::Role::Context
  GitHub::Collector::Role::Logger
  GitHub::Collector::Role::MongoDB
);

# Sum the weight of all incoming edges per target and store the result as
# the "indegree" of the matching profile.  Profiles that are never the
# target of an edge are not touched.
sub execute {
    my $self = shift;

    my $edges = $self->db_edges->find();

    my %indegree_of;
    while ( my $edge = $edges->next ) {
        $indegree_of{ $edge->{target} } += $edge->{weight};
    }

    foreach my $login ( keys %indegree_of ) {
        $self->db_profiles->update( { login => $login },
            { '$set' => { indegree => $indegree_of{$login} } } );
    }
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Command/lang.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Command::lang;

use Moose;
use boolean;

extends qw(MooseX::App::Cmd::Command);

with qw(
  GitHub::Collector::Role::Logger
  GitHub::Collector::Role::Context
  GitHub::Collector::Role::MongoDB
);

# Tag every profile matching language => undef with its most frequent
# language.
sub execute {
    my $self = shift;

    $self->log("start to tag user using langs");

    my $profiles = $self->db_profiles->find( { language => undef } );

    while ( my $profile = $profiles->next ) {
        $self->_tag_profile_by_lang($profile);
    }

    $self->log("done tagging users");
}

# Count languages over the repositories owned by the profile and over the
# repositories it contributed to, then store the most frequent one.
sub _tag_profile_by_lang {
    my ( $self, $profile ) = @_;

    my $languages = {};

    $self->_repos( $languages, $profile->{login} );
    $self->_contribs( $languages, $profile->{login} );

    my $lang = (
        sort { $languages->{$b} <=> $languages->{$a} }
          keys %$languages
    )[0];

    # NOTE(review): the stored placeholder has a trailing space ("none ").
    # It looks accidental, but it is kept as is because already-stored data
    # may rely on it -- confirm before normalising.
    $lang = "none " if !$lang;

    $self->log( "pour " . $profile->{login} . " on a " . $lang );
    $self->db_profiles->update(
        { login  => $profile->{login}, },
        { '$set' => { language => $lang } }
    );
}

# Increment $languages for each repository owned by $login that has a lang.
sub _repos {
    my ( $self, $languages, $login ) = @_;

    my $repositories = $self->db_repositories->find( { owner => $login } );

    while ( my $repo = $repositories->next ) {
        $languages->{ $repo->{lang} }++ if $repo->{lang};
    }
}

# Increment $languages for each repository $login contributed to.
sub _contribs {
    my ( $self, $languages, $login ) = @_;

    my $contribs = $self->db_contributors->find( { contributor => $login } );

    while ( my $contrib = $contribs->next ) {
        my $repo = $self->db_repositories->find_one(
            { uniq_name => $contrib->{project} } );

        # The repository may not be stored; skip it instead of relying on
        # autovivification of an undefined value.
        next unless $repo && $repo->{lang};
        $languages->{ $repo->{lang} }++;
    }
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Command/profile.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Command::profile;

use YAML;
use Try::Tiny;
use Moose;
use boolean;

extends qw(MooseX::App::Cmd::Command);

with qw(
  GitHub::Collector::Role::Context
  GitHub::Collector::Role::Logger
  GitHub::Collector::Role::SPORE
  GitHub::Collector::Role::Profile
  GitHub::Collector::Role::MongoDB
  GitHub::Collector::Role::Pause
);

# Logins the crawl starts from; defaults to the "seed" entry of the context.
has seed => (
    isa           => 'ArrayRef',
    is            => 'ro',
    required      => 1,
    auto_deref    => 1,
    documentation => 'seed to crawl',
    lazy          => 1,
    default       => sub {
        my $self = shift;
        return $self->context->{seed};
    }
);

# Insert the seed profiles, then crawl until no profile is left to process.
sub execute {
    my $self = shift;

    $self->log("start to crawl profiles");

    foreach my $profile ( $self->seed ) {
        $self->_bootstrap_profile($profile);
    }

    $self->log("finish to boostrap the seed");
    $self->_crawl(0);
    $self->log("crawl completed");
}

# Fetch, store and link one profile, then flag it as done.
#
# Returns 1 when the profile was processed, nothing when its information
# could not be fetched (the profile is then left untouched by this method).
sub get_profile {
    my ( $self, $profile ) = @_;

    my $login = $profile->{login};

    my $profile_info = $self->fetch_profile($login);

    return unless $profile_info;

    $self->save_profile($profile_info);
    $self->add_relations($login);
    $self->profile_is_done($login);
    return 1;
}

# Process the profiles matching done => false, pass after pass, since
# handling a profile may insert new ones.
#
# This is a loop rather than a recursive call so the stack does not grow with
# the number of passes.  It stops when nothing is pending, or when a whole
# pass completed no profile: the remaining ones keep failing and another
# pass would loop forever.
sub _crawl {
    my $self = shift;

    while (1) {
        my $pending = $self->db_profiles->find( { done => false } );

        my $completed = 0;
        while ( my $profile = $pending->next ) {
            $completed++ if $self->get_profile($profile);
        }

        my $remaining = $self->db_profiles->find( { done => false } )->count;
        last if $remaining == 0;

        if ( $completed == 0 ) {
            $self->log(
                [ "%s profile(s) could not be crawled, giving up", $remaining ]
            );
            last;
        }
    }

    return;
}

# Insert $profile (a login) as a profile still to crawl, unless one with
# that login already exists.
sub _bootstrap_profile {
    my ( $self, $profile ) = @_;

    my $has_profile = $self->db_profiles->find( { login => $profile } );
    return if $has_profile->count > 0;

    $self->debug("insert $profile into profiles");
    $self->db_profiles->insert(
        { login => $profile, done => false, repositories_done => false } );
}

1;

=head1 NAME

GitHub::Collector::Command::profile - foo

=cut

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Command/repository.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Command::repository;

use Moose;
use boolean;

extends qw(MooseX::App::Cmd::Command);

with qw(
  GitHub::Collector::Role::Context
  GitHub::Collector::Role::Logger
  GitHub::Collector::Role::SPORE
  GitHub::Collector::Role::MongoDB
  GitHub::Collector::Role::Repository
);

# Crawl the repositories of every profile not processed yet.
sub execute {
    my $self = shift;

    $self->log("start to crawl repositories");
    $self->_crawl();
    $self->log("crawl completed");
}

# Fetch the repositories of one profile, logging before and after.
sub get_repositories {
    my ( $self, $profile ) = @_;

    my $login = $profile->{login};

    $self->log("fetch repositories for $login");
    $self->fetch_repositories($profile);
    $self->log("finished to work on $login");
}

# Single pass over the profiles matching repositories_done => false.
sub _crawl {
    my $self = shift;

    my $profiles = $self->db_profiles->find( { repositories_done => false } );

    while ( my $profile = $profiles->next ) {
        $self->get_repositories($profile);
    }
}

1;
# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Command/stats.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Command::stats;

use 5.010;
use Moose;
use boolean;
use JSON;
use DateTime;

extends qw(MooseX::App::Cmd::Command);

with qw(
  GitHub::Collector::Role::Context
  GitHub::Collector::Role::Logger
  GitHub::Collector::Role::MongoDB
  GitHub::Collector::Role::Languages
);

# Count profile creations per month, globally and per mapped language, and
# write one JSON file per series (see _create_flot).
#
# Profiles without a created_at value containing a "YYYY-MM" or "YYYY/MM"
# date are ignored.
sub execute {
    my ( $self, ) = @_;

    my $profiles = $self->db_profiles->find();

    my $languages = {};
    my $country   = {};
    my $company   = {};
    my $created   = {};

    while ( my $profile = $profiles->next ) {
        my $date = $profile->{created_at};

        next if !defined $date;

        my ( $year, $month ) = $date =~ /(\d{4})(?:-|\/)(\d{2})/;
        next if ( !defined $year || !defined $month );

        my $lang = $self->map_languages( $profile->{language} );
        $languages->{$lang}++ if $lang ne 'Other';

        $country->{ $profile->{country} }++ if $profile->{country};
        $company->{ $profile->{company} }++ if defined $profile->{company};

        my $period = $year . '/' . $month;
        $created->{global}->{$period}->{total}++;
        $created->{languages}->{$lang}->{$period}->{total}++;
    }

    # The textual summaries are currently disabled:
    #    $self->_sort_and_display($languages);
    #    $self->_sort_and_display($country, 10);
    #    $self->_sort_and_display($company, 100);

    $self->_create_flot( $created->{global}, 'global' );
    foreach my $lang ( keys %{ $created->{languages} } ) {
        $self->_create_flot( $created->{languages}->{$lang}, $lang );
    }
}

# Print the entries of $data (name => count), highest count first, each with
# its share of the total as an integer percentage.
#
# $iter is the index of the last entry to print; all entries are printed when
# it is omitted (or 0).  Nothing is printed for empty data or a zero total.
sub _sort_and_display {
    my ( $self, $data, $iter ) = @_;

    my @sorted = sort { $data->{$b} <=> $data->{$a} } keys %$data;
    return unless @sorted;

    my $total = 0;
    $total += $data->{$_} for @sorted;
    return unless $total;    # avoid a division by zero below

    $iter ||= $#sorted;
    $iter = $#sorted if $iter > $#sorted;    # never index past the end

    for my $name ( @sorted[ 0 .. $iter ] ) {
        my $pct = int( ( $data->{$name} / $total ) * 100 );
        say " # " . $name . ":" . $data->{$name} . " ($pct%)";
    }

    return;
}

# Write "$label.json" in the current directory: a structure with the series
# label and a list of [ epoch in milliseconds, total ] pairs, one per month.
#
# $data maps "YYYY/MM" to { total => N }.  Dies when the file cannot be
# written.
sub _create_flot {
    my ( $self, $data, $label ) = @_;

    # "data" is always present, even when no month is left to plot.
    my $graph = { label => $label, data => [] };

    my @sorted = sort { $a cmp $b } keys %$data;

    # remove the first and last value since they're not really worthy
    shift @sorted;
    pop @sorted;

    foreach my $month (@sorted) {
        my ( $y, $m ) = $month =~ /(\d{4})\/(\d{2})/;
        my $epoch =
          DateTime->new( year => $y, month => $m, day => 1 )->epoch * 1000;
        push @{ $graph->{data} }, [ $epoch, $data->{$month}->{total} ];
    }

    my $file = $label . '.json';
    open my $fh, '>', $file or die "can't open $file for writing: $!";
    print {$fh} JSON::encode_json($graph);
    close $fh or die "can't write $file: $!";

    return;
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Context.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Context;

use YAML;
use Moose::Role;
with qw(MooseX::ConfigFromFile);

# Hook required by MooseX::ConfigFromFile: load the YAML file and return its
# content.
sub get_config_from_file {
    my ( $self, $file ) = @_;
    return YAML::LoadFile($file);
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Graph/Edges.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Graph::Edges;

use Moose::Role;

# In-memory edges of the graph being built, keyed by edge id.
has edges => (
    is         => 'rw',
    isa        => 'HashRef',
    lazy       => 1,
    default    => sub { {} },
    auto_deref => 1,
);

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Graph/Gexf.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Graph::Gexf;

use Moose::Role;
use Graph::GEXF;

# The graph is exported only when this attribute has been set.
has output => (
    is        => 'ro',
    isa       => 'Int',
    predicate => 'should_export',
);

# Print the in-memory graph (nodes and edges) as GEXF XML on standard output.
sub export {
    my ( $self, ) = @_;

    my $gexf = Graph::GEXF->new();
    $gexf->add_node_attribute( name     => 'string' );
    $gexf->add_node_attribute( lang     => 'string' );
    $gexf->add_node_attribute( size     => 'int' );
    $gexf->add_node_attribute( country  => 'string' );
    $gexf->add_node_attribute( indegree => 'int' );

    my %gexf_node_for;
    foreach my $name ( keys %{ $self->nodes } ) {
        my $info = $self->nodes->{$name};

        my $n = $gexf->add_node( $info->{id} );
        $n->label($name);
        $n->attribute( name     => $name );
        $n->attribute( size     => $info->{size} );
        $n->attribute( lang     => $info->{language} || '' );
        $n->attribute( country  => $info->{country} || '' );
        $n->attribute( indegree => $info->{indegree} );
        $gexf_node_for{$name} = $n;
    }

    foreach my $edge ( values %{ $self->edges } ) {

        # An edge whose source node is not part of the graph cannot be
        # attached to anything; skip it rather than call a method on undef.
        my $source = $gexf_node_for{ $edge->{sourceId} };
        next unless $source;

        $source->link_to(
            {
                target => $edge->{targetId},
                weight => $edge->{weight},
            }
        );
    }

    my $xml = $gexf->to_xml();
    print $xml;
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Graph/Neighbors.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Graph::Neighbors;

use Moose::Role;
with qw(GitHub::Collector::Role::Languages);

# Add $name and its direct neighbours to the graph.  When $with_connections
# is true, the neighbours of every node reached that way are added as well.
sub neighbors {
    my ( $self, $name, $with_connections ) = @_;

    $self->_neighbors($name);

    if ($with_connections) {

        # keys() is evaluated once, so edges added while iterating are not
        # visited: the expansion stops after one extra level.
        foreach my $id ( keys %{ $self->edges } ) {
            $self->_connections( $id, $name );
        }
    }
}

# Make sure $name has a node, then load its outgoing and incoming edges.
sub _neighbors {
    my ( $self, $name ) = @_;

    if ( !defined $self->nodes->{$name} ) {
        $self->_create_node($name);
    }

    $self->_fetch_edges_from($name);
    $self->_fetch_edges_to($name);
}

# Load the neighbours of the end of edge $id that is not $name.
sub _connections {
    my ( $self, $id, $name ) = @_;

    my $edge = $self->edges->{$id};

    if ( $edge->{sourceId} eq $name ) {
        $self->_neighbors( $edge->{targetId} );
    }
    else {
        $self->_neighbors( $edge->{sourceId} );
    }
}

# Drop the nodes with fewer than two edges, then the edges left without one
# of their ends.
sub remove_leaves {
    my $self = shift;

    foreach my $id ( keys %{ $self->nodes } ) {
        if ( $self->nodes->{$id}->{size} < 2 ) {
            delete $self->nodes->{$id};
        }
    }

    foreach my $id ( keys %{ $self->edges } ) {
        my $edge = $self->edges->{$id};
        unless ( $self->nodes->{ $edge->{sourceId} }
            && $self->nodes->{ $edge->{targetId} } )
        {
            delete $self->edges->{$id};
        }
    }
}

# Load the edges going out of $name.
sub _fetch_edges_from {
    my ( $self, $name ) = @_;

    my $connections = $self->db_edges->find( { source => $name } );

    while ( my $edge = $connections->next ) {
        $self->_register_edge( $edge, $name, $edge->{target} );
    }
}

# Load the edges coming into $name.
sub _fetch_edges_to {
    my ( $self, $name ) = @_;

    my $connections = $self->db_edges->find( { target => $name } );

    while ( my $edge = $connections->next ) {
        $self->_register_edge( $edge, $name, $edge->{source} );
    }
}

# Identifier of the edge $source -> $target.
#
# A separator is required: plain concatenation gives the same id to distinct
# pairs ("ab" . "c" and "a" . "bc").
# NOTE(review): assumes logins never contain '>' -- confirm.
sub _edge_id {
    my ( $self, $source, $target ) = @_;
    return join '->', $source, $target;
}

# Record $edge (a stored edge document) unless it is already known, and
# account for it in the size of both ends: $name, whose node exists already,
# and $other, whose node is created on demand.
sub _register_edge {
    my ( $self, $edge, $name, $other ) = @_;

    my $edge_id = $self->_edge_id( $edge->{source}, $edge->{target} );
    return if defined $self->edges->{$edge_id};

    $self->edges->{$edge_id} = {
        id       => $edge_id,
        targetId => $edge->{target},
        sourceId => $edge->{source},
        weight   => $edge->{weight},
    };

    $self->nodes->{$name}->{size}++;

    if ( defined $self->nodes->{$other} ) {
        $self->nodes->{$other}->{size}++;
    }
    else {
        $self->_create_node( $other, 1 );
    }

    return;
}

# Create the node for $login with an initial size (default 0), filled from
# the stored profile when there is one.
sub _create_node {
    my ( $self, $login, $size ) = @_;

    $size ||= 0;

    # The profile may be missing; fall back to an empty record.
    my $info = $self->db_profiles->find_one( { login => $login } ) || {};

    $self->nodes->{$login} = {
        id       => $login,
        label    => $login,
        size     => $size,
        language => $self->map_languages( $info->{language} ),
        country  => $info->{country} || '',
        indegree => $info->{indegree},
    };
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Graph/Nodes.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Graph::Nodes;

use Moose::Role;

# In-memory nodes of the graph being built, keyed by login.
has nodes => (
    is         => 'rw',
    isa        => 'HashRef',
    lazy       => 1,
    default    => sub { {} },
    auto_deref => 1,
);

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Graph/Query.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Graph::Query;

use Moose::Role;

# Optional search criteria; each one is used by build_query only when set.
has language => (
    is        => 'rw',
    isa       => 'Str',
    predicate => 'has_language',
);

has location => (
    is        => 'rw',
    isa       => 'Str',
    predicate => 'has_location',
);

has company => (
    is        => 'ro',
    isa       => 'Str',
    predicate => 'has_company',
);

has country => (
    is        => 'ro',
    isa       => 'Str',
    predicate => 'has_country',
);

# Returns a hash reference holding one exact-match criterion per attribute
# that has been set (empty when none is).
sub build_query {
    my $self = shift;

    my $search = {};

    foreach my $attr (qw/language location company country/) {
        my $predicate = "has_$attr";
        if ( $self->$predicate ) {
            $search->{$attr} = $self->$attr;
        }
    }

    return $search;
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Graph/Search.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Graph::Search;

use Moose::Role;

# Build the graph from the profiles matching $search (defaults to the result
# of build_query), then remove the leaves.
sub build_from_query {
    my ( $self, $search ) = @_;

    $search ||= $self->build_query();

    my $profiles = $self->db_profiles->find($search);

    while ( my $profile = $profiles->next ) {
        $self->neighbors( $profile->{login}, 0 );
    }

    $self->remove_leaves();
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Languages.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Languages;

use Moose::Role;

# Configuration whose "languages" entry maps a raw language name to the name
# to report.  Optional.
has mapping => (
    is  => 'ro',
    isa => 'HashRef',
);

# Returns the mapped name of $language, or "Other" when $language is
# undefined or has no entry in the mapping (including when no mapping is
# configured at all).
sub map_languages {
    my ( $self, $language ) = @_;

    return "Other" if !defined $language;

    # "mapping" is not a required attribute: tolerate its absence.
    my $mapping       = $self->mapping || {};
    my $languages_map = $mapping->{languages} || {};

    return defined $languages_map->{$language}
      ? $languages_map->{$language}
      : "Other";
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Logger.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Logger;

use Moose::Role;
use Log::Dispatchouli;

# Logger writing to standard output; log/debug/fatal/error are delegated.
# NOTE(review): "error" delegates to log_error -- confirm that
# Log::Dispatchouli provides that method before relying on it.
has logger => (
    is      => 'rw',
    isa     => 'Log::Dispatchouli',
    lazy    => 1,
    default => sub {
        return Log::Dispatchouli->new(
            {
                ident     => 'GitHub::Collector',
                facility  => 'user',
                to_stdout => 1,
            }
        );
    },
    handles => {
        log   => 'log',
        debug => 'log_debug',
        fatal => 'log_fatal',
        error => 'log_error',
    },
);

1;
# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/MongoDB.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::MongoDB;

use Moose::Role;
use MongoDB;

# Handle on the "github" database.  The db_* methods are delegated to the
# database accessors of the same collection names.
has mongodb => (
    is      => 'ro',
    isa     => 'Object',
    lazy    => 1,
    default => sub {
        my $self = shift;
        my $conn =
          MongoDB::Connection->new( timeout => 60000, query_timeout => 60000 );
        my $db = $conn->github;
        $self->_create_indexes($db);
        return $db;
    },
    handles => {
        db_profiles     => 'profiles',
        db_repositories => 'repositories',
        db_relations    => 'relations',
        db_contributors => 'contributors',
        db_edges        => 'edges',
    }
);

# Make sure the indexes used by the collector exist.
sub _create_indexes {
    my ( $self, $db ) = @_;

    $db->profiles->ensure_index( { login => 1 }, { unique => 1 } );
    $db->repositories->ensure_index( { uniq_name => 1 }, { unique => 1 } );
    $db->contributors->ensure_index( { project => 1 } );
    $db->contributors->ensure_index( { owner   => 1 } );

    # The edges and lang commands look contributors up by "contributor".
    $db->contributors->ensure_index( { contributor => 1 } );
    $db->relations->ensure_index( { source => 1 } );
    $db->relations->ensure_index( { target => 1 } );
    $db->relations->ensure_index( { login  => 1 } );
    $db->edges->ensure_index( { source => 1 } );
    $db->edges->ensure_index( { target => 1 } );
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Pause.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Pause;

use Moose::Role;
use Scalar::Util ();

# Seconds to wait before retrying after a 403 answer.
has pause_on_error => (
    is      => 'ro',
    isa     => 'Int',
    default => 10,
);

# Sleep one second in roughly one call out of ten, otherwise not at all.
# Returns the value of sleep().  The sleep happens here: callers must not
# sleep again on the returned value.
sub pause {
    my $self  = shift;
    my $rand  = int( rand(10) );
    my $pause = $rand == 1 ? $rand : 0;
    return sleep($pause);
}

# Returns the status carried by $error, or 0 when $error is not an object
# with a "status" method (e.g. a plain string exception).
sub error_status {
    my ( $self, $error ) = @_;

    return 0
      unless Scalar::Util::blessed($error) && $error->can('status');

    my $status = $error->status;
    return defined $status ? $status : 0;
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Profile.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Profile;

use Try::Tiny;
use Moose::Role;
use boolean;

with qw/
  GitHub::Collector::Role::Pause
  GitHub::Collector::Role::Relation
  /;

# Fetch the information of the profile $profile (a login).
#
# Returns the body of the answer, or nothing when it cannot be fetched:
#   - 403: wait pause_on_error seconds and try again, without limit;
#   - 404: the profile and its relations are deleted;
#   - anything else: the error is logged.  The profile is left as it is, so
#     callers looping until no profile is pending must cope with profiles
#     that keep failing.
sub fetch_profile {
    my ( $self, $profile ) = @_;

    while (1) {
        my ( $res, $error );

        try {
            $res = $self->spore_client->get_info(
                format   => 'json',
                username => $profile,
            );
        }
        catch {
            $error = $_;
        };

        if ( !$error ) {
            $self->pause;
            return $res->body;
        }

        my $status = $self->error_status($error);

        if ( $status == 403 ) {
            $self->debug(
                [ "need to pause (while working on %s)", $profile ] );
            sleep( $self->pause_on_error );
            next;
        }

        if ( $status == 404 ) {
            $self->debug("profile $profile doesn't exists anymore");
            $self->delete_profile($profile);
            return;
        }

        $self->log("can't fetch information for $profile: $error");
        return;
    }
}

# Store the fields of $profile_info->{user} (minus its id) on the profile
# with the same login, stamp it with the current time and flag its
# repositories as not crawled yet.
sub save_profile {
    my ( $self, $profile_info ) = @_;

    # The id is dropped so it is not copied onto the stored profile.
    delete $profile_info->{user}->{id};
    my $time = time();

    $self->db_profiles->update(
        { login => $profile_info->{user}->{login} },
        {
            '$set' => {
                crawled_at        => $time,
                repositories_done => false,
                %{ $profile_info->{user} }
            },
        }
    );

    $self->log( "profile " . $profile_info->{user}->{login} . " saved" );
}

# Remove the profile $profile (a login) and every relation it is part of.
sub delete_profile {
    my ( $self, $profile ) = @_;

    $self->db_profiles->remove( { login => $profile } );
    foreach my $type (qw/target source/) {
        $self->db_relations->remove( { $type => $profile } );
    }
    $self->log("all informations regarding $profile have been deleted");
}

# Flag the profile $login as done.
sub profile_is_done {
    my ( $self, $login ) = @_;
    $self->db_profiles->update(
        { login  => $login },
        { '$set' => { done => true } },
    );
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Relation.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Relation;

use Try::Tiny;
use Moose::Role;

with qw/GitHub::Collector::Role::Pause/;

# Kinds of relation fetched for each profile.
has types => (
    is         => 'ro',
    isa        => 'ArrayRef',
    auto_deref => 1,
    default    => sub { [qw/followers following/] }
);

# Fetch the followers and followings of $login, insert the users found as
# profiles to crawl and store the relations (source follows target).
sub add_relations {
    my ( $self, $login ) = @_;

    foreach my $type ( $self->types ) {
        my $users = $self->_grab_relations( $login, $type );

        # Nothing is returned when the list could not be fetched.
        foreach my $user ( @{ $users || [] } ) {
            $self->_bootstrap_profile($user);
            if ( $type eq 'followers' ) {
                $self->_add_relation( $user, $login );
            }
            else {
                $self->_add_relation( $login, $user );
            }
        }
    }
}

# Returns the "users" entry of the answer to list_<$type> for $login, or
# undef when it cannot be fetched.  On a 403 the call is retried, without
# limit, after pause_on_error seconds.
#
# The retry is a loop outside the catch block: a retry made from inside the
# catch block could not hand its result back to this sub.
sub _grab_relations {
    my ( $self, $login, $type ) = @_;

    $self->log( [ "fetching %s informations for %s", $type, $login ] );

    my $method = 'list_' . $type;
    my $users;

    while (1) {
        my $error;

        try {
            $users = $self->spore_client->$method(
                format => 'json',
                user   => $login,
            )->body->{users};
        }
        catch {
            $error = $_;
        };

        last if !$error;

        if ( $self->error_status($error) == 403 ) {
            $self->debug(
                [ "need to pause (while grabbing relations for %s)", $login ]
            );
            sleep( $self->pause_on_error );
            next;
        }

        $self->debug(
            [ "can't fetch %s relation for %s: %s", $type, $login, $error ] );
        last;
    }

    $self->pause;
    return $users;
}

# Store the relation $source -> $target unless it exists already.
sub _add_relation {
    my ( $self, $source, $target ) = @_;
    my $search = { source => $source, target => $target };
    my $exists = $self->db_relations->find_one($search);
    $self->db_relations->insert($search) if !$exists;
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/Repository.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::Repository;

use Moose::Role;
use Try::Tiny;
use boolean;

with qw/GitHub::Collector::Role::Pause/;

# Fetch the repositories of $profile, store the ones worth keeping (see
# _save_repository) and flag the profile with repositories_done => true and
# its main language.
#
# Forks, and repositories whose "forks" value is false, are ignored.
# Returns 1 on success, nothing when the list cannot be fetched.  On a 403
# the call is retried, without limit, after pause_on_error seconds.
sub fetch_repositories {
    my ( $self, $profile ) = @_;

    my $repositories;

    while (1) {
        my $error;

        try {
            $repositories = $self->spore_client->list_repos(
                user   => $profile->{login},
                format => 'json',
            )->body->{repositories};
        }
        catch {
            $error = $_;
        };

        last if !$error;

        if ( $self->error_status($error) == 403 ) {
            $self->debug(
                [ "need to pause (while working on %s)", $profile->{login} ]
            );
            sleep( $self->pause_on_error );
            next;
        }

        $self->debug(
            "can't fetch repositories for " . $profile->{login} . ": $error" );
        return;
    }

    # Must be a real hash reference: _get_lang adds to it in place, which
    # would be lost if an undefined value were passed instead.
    my $languages = {};

    foreach my $repo ( @{ $repositories || [] } ) {

        next if $repo->{fork};
        next unless $repo->{forks};

        $self->_get_lang( $profile, $repo, $languages );
        $self->_get_contributors( $profile, $repo );
        $self->_save_repository( $profile, $repo );
    }

    $self->pause;
    $self->_update_profile( $profile->{login}, $languages );
    return 1;
}

# Flag the profile $login as processed and store its main language (undef
# when $languages is empty).
sub _update_profile {
    my ( $self, $login, $languages ) = @_;

    my $lang = $self->_main_lang($languages);

    $self->db_profiles->update(
        { login  => $login },
        { '$set' => { repositories_done => true, language => $lang } },
    );
}

# Store $repo and its contributors, but only when it has more than one
# contributor.  The "contributors" entry is removed from $repo either way.
sub _save_repository {
    my ( $self, $profile, $repo ) = @_;

    my $contributors = delete( $repo->{contributors} ) || [];

    return unless scalar @$contributors > 1;

    my $project_name = $profile->{login} . '/' . $repo->{name};

    $repo->{uniq_name} = $project_name;
    $self->db_repositories->insert($repo);

    $self->_save_contributors( $profile->{login}, $project_name,
        $contributors );

    $self->log(
        [
            'Add repository %s owned by %s', $repo->{name},
            $profile->{login}
        ]
    );
}

# Store one document per contributor of $project_name, owner excluded.
sub _save_contributors {
    my ( $self, $owner, $project_name, $contributors ) = @_;

    $self->log(
        [ 'Add %s contributor(s) to %s', scalar @$contributors, $project_name ]
    );

    foreach my $contrib (@$contributors) {
        next if $owner eq $contrib->{login};
        $self->db_contributors->insert(
            {
                project       => $project_name,
                owner         => $owner,
                contributor   => $contrib->{login},
                contributions => $contrib->{contributions},
            }
        );
    }
}

# Fetch the languages of $repo, add their values to $languages (modified in
# place) and set $repo->{lang} to the main one (undef when none is known).
#
# On a 403 the call is retried, without limit, after pause_on_error seconds;
# any other error leaves the repository without language.
sub _get_lang {
    my ( $self, $profile, $repo, $languages ) = @_;

    my $pr_languages;

    while (1) {
        my $error;

        try {
            $pr_languages = $self->spore_client->list_languages(
                user   => $profile->{login},
                repo   => $repo->{name},
                format => 'json'
            )->body->{languages};
        }
        catch {
            $error = $_;
        };

        last if !$error;

        if ( $self->error_status($error) == 403 ) {
            $self->debug(
                [ "need to pause (while getting lang for %s)", $repo->{name} ]
            );
            sleep( $self->pause_on_error );
            next;
        }

        last;    # best effort: other errors are ignored
    }

    $pr_languages ||= {};

    foreach my $name ( keys %$pr_languages ) {
        $languages->{$name} += $pr_languages->{$name};
    }

    $repo->{lang} = $self->_main_lang($pr_languages);
}

# Returns the key of $languages with the highest value, or undef when there
# is none.
sub _main_lang {
    my ( $self, $languages ) = @_;

    $languages ||= {};

    my $lang = (
        sort { $languages->{$b} <=> $languages->{$a} }
          keys %$languages
    )[0];
    return $lang;
}

# Set $repo->{contributors} to the list of contributors of $repo when there
# is more than one, to an empty list otherwise (including on error).
#
# On a 403 the call is retried, without limit, after pause_on_error seconds.
sub _get_contributors {
    my ( $self, $profile, $repo ) = @_;

    my $contributors;

    while (1) {
        my $error;

        try {
            $contributors = $self->spore_client->list_contributors(
                user   => $profile->{login},
                repo   => $repo->{name},
                format => 'json'
            )->body->{contributors};
        }
        catch {
            $error = $_;
        };

        last if !$error;

        if ( $self->error_status($error) == 403 ) {
            $self->debug(
                [
                    "need to pause (while getting contributors for %s)",
                    $repo->{name}
                ]
            );
            sleep( $self->pause_on_error );
            next;
        }

        $contributors = undef;
        last;
    }

    $repo->{contributors} =
      ( $contributors && scalar @$contributors > 1 ) ? $contributors : [];
    return;
}

1;

# ---------------------------------------------------------------------------
# lib/GitHub/Collector/Role/SPORE.pm
# ---------------------------------------------------------------------------
package GitHub::Collector::Role::SPORE;

use Moose::Role;
use Net::HTTP::Spore;

has spore_configuration => (
    is            => 'ro',
    isa           => 'HashRef',
    required      => 1,
    documentation => 'SPORE configuration',
);

# Client built from the specification named by the "github" -> "description"
# entry of spore_configuration, with the Format::JSON middleware enabled.
has spore_client => (
    is      => 'rw',
    isa     => 'Object',
    lazy    => 1,
    default => sub {
        my $self   = shift;
        my $client = Net::HTTP::Spore->new_from_spec(
            $self->spore_configuration->{github}->{description},
        );
        $client->enable('Format::JSON');
        $client;
    }
);

1;
