summaryrefslogtreecommitdiff
path: root/lib/GitHub/Collector/Command
diff options
context:
space:
mode:
Diffstat (limited to 'lib/GitHub/Collector/Command')
-rw-r--r--lib/GitHub/Collector/Command/country.pm70
-rw-r--r--lib/GitHub/Collector/Command/edges.pm67
-rw-r--r--lib/GitHub/Collector/Command/graph.pm47
-rw-r--r--lib/GitHub/Collector/Command/indegree.pm29
-rw-r--r--lib/GitHub/Collector/Command/lang.pm72
-rw-r--r--lib/GitHub/Collector/Command/profile.pm90
-rw-r--r--lib/GitHub/Collector/Command/repository.pm44
-rw-r--r--lib/GitHub/Collector/Command/stats.pm94
8 files changed, 513 insertions, 0 deletions
diff --git a/lib/GitHub/Collector/Command/country.pm b/lib/GitHub/Collector/Command/country.pm
new file mode 100644
index 0000000..f7f5107
--- /dev/null
+++ b/lib/GitHub/Collector/Command/country.pm
@@ -0,0 +1,70 @@
+package GitHub::Collector::Command::country;
+
+use Moose;
+use boolean;
+
+extends qw(MooseX::App::Cmd::Command);
+
+# Required hash of settings for the Geo API. Its attribute name is handed to
+# the Net::HTTP::Spore role below as the config source of the `geo` client,
+# and its `api_username` entry is sent with every search request.
+has geo_conf => (
+ is => 'rw',
+ isa => 'HashRef',
+ required => 1,
+ documentation => 'SPORE configuration for Geo API',
+);
+
+with
+ 'GitHub::Collector::Role::Logger',
+ 'GitHub::Collector::Role::Context',
+ 'GitHub::Collector::Role::MongoDB',
+ 'Net::HTTP::Spore::Role' =>
+ { spore_clients => [ { name => 'geo', config => 'geo_conf' } ] };
+
+# Entry point of the `country` command: visit every profile whose
+# country_done flag is still false and try to attach a country to it.
+sub execute {
+    my ($self) = @_;
+
+    $self->log("start to tag user using country");
+
+    my $pending = $self->db_profiles->find( { country_done => false } );
+    while ( my $candidate = $pending->next ) {
+        $self->_tag_profile_by_country($candidate);
+    }
+
+    $self->log("done tagging users");
+}
+
+# Look up the country of one profile through the Geo API and store it.
+#
+# $profile is a profile document; only its `login` and `location` fields
+# are read. A profile without a location, or one for which the API returns
+# no country name, is stored with a false country so it is not retried.
+# Dies with "no more requests" when the API reports status value 19.
+sub _tag_profile_by_country {
+    my ( $self, $profile ) = @_;
+
+    my $login    = $profile->{login};
+    my $location = $profile->{location};
+
+    unless ( defined $location ) {
+        $self->_update_country( $login, false );
+        return;
+    }
+
+    $self->log( "searching for " . $login . " based in " . $location );
+
+    my $response = $self->geo->search(
+        q        => $location,
+        username => $self->geo_conf->{api_username},
+    );
+    my $res = $response->body;
+
+    my $status = $res->{status};
+    if ( $status && $status->{value} == 19 ) {
+        die "no more requests";
+    }
+
+    # Only the first result returned by the API is considered.
+    my $country = $res->{geonames}->[0]->{countryName};
+    $self->_update_country( $login, $country ? $country : false );
+}
+
+# Store $country on the profile identified by $login and flag the profile
+# as processed (country_done => true) in the same update.
+sub _update_country {
+    my ( $self, $login, $country ) = @_;
+
+    my $selector = { login => $login };
+    my $changes  = { country => $country, country_done => true };
+
+    return $self->db_profiles->update( $selector, { '$set' => $changes } );
+}
+
+1;
diff --git a/lib/GitHub/Collector/Command/edges.pm b/lib/GitHub/Collector/Command/edges.pm
new file mode 100644
index 0000000..4ffe0c8
--- /dev/null
+++ b/lib/GitHub/Collector/Command/edges.pm
@@ -0,0 +1,67 @@
+package GitHub::Collector::Command::edges;
+
+use Moose;
+use boolean;
+
+extends qw(MooseX::App::Cmd::Command);
+
+with qw(
+ GitHub::Collector::Role::Context
+ GitHub::Collector::Role::Logger
+ GitHub::Collector::Role::MongoDB
+);
+
+# Entry point of the `edges` command: for every profile still flagged
+# edges_done => false, build its outgoing edges unless some already exist.
+sub execute {
+    my ($self) = @_;
+
+    $self->log("start to merge contributions");
+
+    my $pending = $self->db_profiles->find( { edges_done => false } );
+
+    while ( my $profile = $pending->next ) {
+        my $login = $profile->{login};
+
+        unless ( $self->_is_done($login) ) {
+            $self->log( "merge contributions for " . $login );
+            $self->_contributions($login);
+        }
+    }
+
+    $self->log("done merging contributions");
+}
+
+# Return the number of edges already stored with $login as their source.
+# execute() uses the result as a boolean "already processed" check.
+sub _is_done {
+    my ( $self, $login ) = @_;
+
+    my $existing = $self->db_edges->find( { source => $login } );
+    return $existing->count;
+}
+
+# Turn the contributions of $login into weighted edges.
+#
+# For each contribution record of $login, the share of the project it
+# represents is computed as int(contributions / project size * 100), with
+# a minimum of 1, and accumulated per project owner. One edge document
+# { source => $login, target => owner, weight => sum } is inserted per
+# owner, then the profile is flagged edges_done => true.
+#
+# Contributions to projects that cannot be found, or whose size is missing
+# or zero, are skipped.
+sub _contributions {
+    my ( $self, $login ) = @_;
+
+    my $contributions =
+      $self->db_contributors->find( { contributor => $login } );
+
+    my %weight_for;
+
+    while ( my $contrib = $contributions->next ) {
+        my $project = $self->db_repositories->find_one(
+            { uniq_name => $contrib->{project} } );
+
+        # find_one may yield nothing for a repository that was never
+        # collected; skip it explicitly instead of relying on
+        # autovivification and an "uninitialized value" comparison.
+        next unless $project;
+
+        my $size = $project->{size};
+        next if !defined $size || $size == 0;
+
+        my $count =
+          defined $contrib->{contributions} ? $contrib->{contributions} : 0;
+
+        # Any recorded contribution counts for at least 1, even when it
+        # rounds down to 0% of the project.
+        my $share = int( ( $count / $size ) * 100 ) || 1;
+
+        $weight_for{ $contrib->{owner} } += $share;
+    }
+
+    foreach my $owner ( keys %weight_for ) {
+        $self->db_edges->insert({
+            source => $login,
+            target => $owner,
+            weight => $weight_for{$owner},
+        });
+    }
+
+    $self->db_profiles->update(
+        { login => $login },
+        { '$set' => { edges_done => true } },
+    );
+}
+
+1;
diff --git a/lib/GitHub/Collector/Command/graph.pm b/lib/GitHub/Collector/Command/graph.pm
new file mode 100644
index 0000000..c7766a8
--- /dev/null
+++ b/lib/GitHub/Collector/Command/graph.pm
@@ -0,0 +1,47 @@
+package GitHub::Collector::Command::graph;
+
+use Moose;
+use YAML::Syck;
+
+extends qw(MooseX::App::Cmd::Command);
+
+with qw(
+ GitHub::Collector::Role::Context
+ GitHub::Collector::Role::Logger
+ GitHub::Collector::Role::MongoDB
+ GitHub::Collector::Role::Graph::Query
+ GitHub::Collector::Role::Graph::Nodes
+ GitHub::Collector::Role::Graph::Edges
+ GitHub::Collector::Role::Graph::Neighbors
+ GitHub::Collector::Role::Graph::Search
+ GitHub::Collector::Role::Graph::Gexf
+);
+
+# Optional login. When set, execute() builds the graph from the neighbors
+# of this profile instead of running a query.
+has profile => (
+ is => 'ro',
+ isa => 'Str',
+ predicate => 'has_profile',
+);
+
+# Optional threshold, only consulted when no profile is given. execute()
+# then passes { indegree => { '$gt' => <value> } } to build_from_query.
+has indegree => (
+ is => 'ro',
+ isa => 'Int',
+ predicate => 'has_indegree',
+);
+
+# Entry point of the `graph` command. Builds the graph in one of three
+# ways, in order of precedence:
+#   - from the neighbors of `profile` (followed by remove_leaves),
+#   - from a query restricted to indegree > `indegree`,
+#   - from an unrestricted query.
+# The graph is exported afterwards when should_export is true.
+sub execute {
+    my ($self) = @_;
+
+    if ( $self->has_profile ) {
+        $self->neighbors( $self->profile, 1 );
+        $self->remove_leaves();
+    }
+    else {
+        # An empty argument list means "no restriction".
+        my @query =
+          $self->has_indegree
+          ? ( { indegree => { '$gt' => $self->indegree } } )
+          : ();
+        $self->build_from_query(@query);
+    }
+
+    $self->should_export and $self->export();
+}
+
+1;
diff --git a/lib/GitHub/Collector/Command/indegree.pm b/lib/GitHub/Collector/Command/indegree.pm
new file mode 100644
index 0000000..fe9bf78
--- /dev/null
+++ b/lib/GitHub/Collector/Command/indegree.pm
@@ -0,0 +1,29 @@
+package GitHub::Collector::Command::indegree;
+
+use Moose;
+
+extends qw(MooseX::App::Cmd::Command);
+
+with qw(
+ GitHub::Collector::Role::Context
+ GitHub::Collector::Role::Logger
+ GitHub::Collector::Role::MongoDB
+);
+
+# Entry point of the `indegree` command: sum the weight of every edge per
+# target login, then store each sum as the `indegree` of that profile.
+sub execute {
+    my ($self) = @_;
+
+    my %indegree_of;
+
+    my $cursor = $self->db_edges->find();
+    while ( my $edge = $cursor->next ) {
+        $indegree_of{ $edge->{target} } += $edge->{weight};
+    }
+
+    for my $login ( keys %indegree_of ) {
+        my $changes = { indegree => $indegree_of{$login} };
+        $self->db_profiles->update( { login => $login },
+            { '$set' => $changes } );
+    }
+}
+
+1;
diff --git a/lib/GitHub/Collector/Command/lang.pm b/lib/GitHub/Collector/Command/lang.pm
new file mode 100644
index 0000000..8ab6c20
--- /dev/null
+++ b/lib/GitHub/Collector/Command/lang.pm
@@ -0,0 +1,72 @@
+package GitHub::Collector::Command::lang;
+
+use Moose;
+use boolean;
+
+extends qw(MooseX::App::Cmd::Command);
+
+with qw(
+ GitHub::Collector::Role::Logger
+ GitHub::Collector::Role::Context
+ GitHub::Collector::Role::MongoDB
+);
+
+# Entry point of the `lang` command: tag every profile matched by the
+# { language => undef } query with its most frequent language.
+sub execute {
+    my ($self) = @_;
+
+    $self->log("start to tag user using langs");
+
+    my $untagged = $self->db_profiles->find( { language => undef } );
+    while ( my $candidate = $untagged->next ) {
+        $self->_tag_profile_by_lang($candidate);
+    }
+
+    $self->log("done tagging users");
+}
+
+# Pick the language seen most often across the repositories a profile
+# owns and those it contributes to, and store it as the profile's
+# `language`. Ties are broken by whatever order sort leaves them in.
+sub _tag_profile_by_lang {
+    my ( $self, $profile ) = @_;
+
+    my $login = $profile->{login};
+
+    # Both helpers increment counters in this shared hash.
+    my $tally = {};
+    $self->_repos( $tally, $login );
+    $self->_contribs( $tally, $login );
+
+    my ($top) = sort { $tally->{$b} <=> $tally->{$a} } keys %$tally;
+
+    # NOTE(review): the fallback value carries a trailing space ("none ").
+    # It looks accidental, but stored data may already rely on it - confirm
+    # before normalizing.
+    unless ($top) {
+        $top = "none ";
+    }
+
+    $self->log( join '', "pour ", $login, " on a ", $top );
+
+    my $selector = { login => $login, };
+    $self->db_profiles->update( $selector,
+        { '$set' => { language => $top } } );
+}
+
+# Count, into the $languages hash ref, the language of every repository
+# owned by $login. Repositories without a `lang` value are ignored.
+sub _repos {
+    my ( $self, $languages, $login ) = @_;
+
+    my $owned = $self->db_repositories->find( { owner => $login } );
+
+    while ( my $repo = $owned->next ) {
+        my $lang = $repo->{lang} or next;
+        $languages->{$lang}++;
+    }
+}
+
+# Count, into the $languages hash ref, the language of every repository
+# $login has contributed to. Each contribution record is resolved to its
+# repository through the `uniq_name` field; repositories without a `lang`
+# value are ignored.
+sub _contribs {
+    my ( $self, $languages, $login ) = @_;
+
+    my $records = $self->db_contributors->find( { contributor => $login } );
+
+    while ( my $record = $records->next ) {
+        my $repo = $self->db_repositories->find_one(
+            { uniq_name => $record->{project} } );
+
+        my $lang = $repo->{lang} or next;
+        $languages->{$lang}++;
+    }
+}
+
+1;
diff --git a/lib/GitHub/Collector/Command/profile.pm b/lib/GitHub/Collector/Command/profile.pm
new file mode 100644
index 0000000..872bf28
--- /dev/null
+++ b/lib/GitHub/Collector/Command/profile.pm
@@ -0,0 +1,90 @@
+package GitHub::Collector::Command::profile;
+
+use YAML;
+use Try::Tiny;
+use Moose;
+use boolean;
+
+extends qw(MooseX::App::Cmd::Command);
+
+with qw(
+ GitHub::Collector::Role::Context
+ GitHub::Collector::Role::Logger
+ GitHub::Collector::Role::SPORE
+ GitHub::Collector::Role::Profile
+ GitHub::Collector::Role::MongoDB
+ GitHub::Collector::Role::Pause
+);
+
+# Logins used to bootstrap the crawl: execute() hands each entry to
+# _bootstrap_profile. When not supplied, the list is read lazily from
+# $self->context->{seed}. auto_deref makes the reader return a plain list
+# in list context.
+has seed => (
+ isa => 'ArrayRef',
+ is => 'ro',
+ required => 1,
+ auto_deref => 1,
+ documentation => 'seed to crawl',
+ lazy => 1,
+ default => sub {
+ my $self = shift;
+ return $self->context->{seed};
+ }
+);
+
+# Entry point of the `profile` command: register every seed login as a
+# profile to fetch, then crawl until no pending profile is left.
+sub execute {
+    my ($self) = @_;
+
+    $self->log("start to crawl profiles");
+
+    my @seed_logins = $self->seed;
+    for my $login (@seed_logins) {
+        $self->_bootstrap_profile($login);
+    }
+
+    $self->log("finish to boostrap the seed");
+    $self->_crawl(0);
+    $self->log("crawl completed");
+}
+
+# Fetch one profile and record it.
+#
+# $profile is a profile document; only its `login` is read. When
+# fetch_profile yields nothing the method returns early and nothing is
+# saved or marked. Otherwise the fetched data is saved, the relations of
+# the login are added, and the login is passed to profile_is_done.
+sub get_profile {
+    my ( $self, $profile ) = @_;
+
+    my $login = $profile->{login};
+
+    my $profile_info = $self->fetch_profile($login)
+      or return;
+
+    $self->save_profile($profile_info);
+    $self->add_relations($login);
+    $self->profile_is_done($login);
+}
+
+# Process pending profiles (done => false) until none is left.
+#
+# Each pass walks the profiles that are pending at that moment; further
+# passes run for as long as pending profiles remain after a pass. Passes
+# are driven by a loop rather than by recursion, so a long crawl does not
+# grow the call stack by one frame per pass.
+#
+# Any arguments are ignored (execute() passes a 0).
+#
+# NOTE(review): get_profile returns early, without calling
+# profile_is_done, when fetch_profile yields nothing. If that is what
+# leaves a profile at done => false, a permanently unfetchable profile
+# keeps this loop running forever - confirm and decide how such profiles
+# should be retired.
+sub _crawl {
+    my $self = shift;
+
+    my $remaining;
+    do {
+        my $profiles_to_crawl = $self->db_profiles->find( { done => false } );
+
+        while ( my $profile = $profiles_to_crawl->next ) {
+            $self->get_profile($profile);
+        }
+
+        $remaining = $self->db_profiles->find( { done => false } )->count;
+    } while ( $remaining > 0 );
+
+    return;
+}
+
+# Register the login $profile as a profile to crawl, unless a document
+# with that login already exists. New documents start with both `done`
+# and `repositories_done` set to false.
+sub _bootstrap_profile {
+    my ( $self, $profile ) = @_;
+
+    my $known = $self->db_profiles->find( { login => $profile } )->count;
+    return if $known > 0;
+
+    $self->debug("insert $profile into profiles");
+
+    my $document =
+      { login => $profile, done => false, repositories_done => false };
+    my $res = $self->db_profiles->insert($document);
+    return $res;
+}
+
+1;
+
+=head1 NAME
+
+GitHub::Collector::Command::profile - crawl GitHub profiles starting from a seed list of logins
+
+=cut
diff --git a/lib/GitHub/Collector/Command/repository.pm b/lib/GitHub/Collector/Command/repository.pm
new file mode 100644
index 0000000..3e7fd57
--- /dev/null
+++ b/lib/GitHub/Collector/Command/repository.pm
@@ -0,0 +1,44 @@
+package GitHub::Collector::Command::repository;
+
+use Moose;
+use boolean;
+
+extends qw(MooseX::App::Cmd::Command);
+
+with qw(
+ GitHub::Collector::Role::Context
+ GitHub::Collector::Role::Logger
+ GitHub::Collector::Role::SPORE
+ GitHub::Collector::Role::MongoDB
+ GitHub::Collector::Role::Repository
+);
+
+# Entry point of the `repository` command: log the start, run the crawl,
+# log the end.
+sub execute {
+    my ($self) = @_;
+
+    $self->log("start to crawl repositories");
+
+    $self->_crawl();
+
+    $self->log("crawl completed");
+}
+
+# Fetch the repositories of one profile, logging before and after.
+# $profile is a profile document; it is passed whole to
+# fetch_repositories and its `login` is used for the log messages.
+sub get_repositories {
+    my $self    = shift;
+    my $profile = shift;
+
+    my $login = $profile->{login};
+
+    $self->log("fetch repositories for $login");
+    $self->fetch_repositories($profile);
+    $self->log("finished to work on $login");
+}
+
+# Walk, in a single pass, every profile whose repositories_done flag is
+# false and fetch its repositories.
+sub _crawl {
+    my ($self) = @_;
+
+    my $pending =
+      $self->db_profiles->find( { repositories_done => false } );
+
+    while ( my $candidate = $pending->next ) {
+        $self->get_repositories($candidate);
+    }
+}
+
+1;
diff --git a/lib/GitHub/Collector/Command/stats.pm b/lib/GitHub/Collector/Command/stats.pm
new file mode 100644
index 0000000..fc71d10
--- /dev/null
+++ b/lib/GitHub/Collector/Command/stats.pm
@@ -0,0 +1,94 @@
+package GitHub::Collector::Command::stats;
+
+use 5.010;
+use Moose;
+use boolean;
+use JSON;
+use DateTime;
+
+extends qw(MooseX::App::Cmd::Command);
+
+with qw(
+ GitHub::Collector::Role::Context
+ GitHub::Collector::Role::Logger
+ GitHub::Collector::Role::MongoDB
+ GitHub::Collector::Role::Languages
+);
+
+# Entry point of the `stats` command.
+#
+# Walks every profile and counts account creations per "YYYY/MM" period,
+# both globally and per mapped language, then writes one flot JSON file
+# for the global series and one per language. Profiles without a
+# created_at value, or whose value does not contain a year and a month,
+# are skipped entirely.
+#
+# Language, country and company tallies are also collected; they only
+# feed the display calls that are currently commented out.
+sub execute {
+    my ($self) = @_;
+
+    my $cursor = $self->db_profiles->find();
+
+    my $languages = {};
+    my $country   = {};
+    my $company   = {};
+    my $created   = {};
+
+    while ( my $profile = $cursor->next ) {
+        my $date = $profile->{created_at};
+        next unless defined $date;
+
+        my ( $year, $month ) = $date =~ /(\d{4})(?:-|\/)(\d{2})/;
+        next unless defined $year && defined $month;
+
+        my $period = $year . '/' . $month;
+
+        my $lang = $self->map_languages( $profile->{language} );
+        $languages->{$lang}++ if $lang ne 'Other';
+
+        if ( my $where = $profile->{country} ) {
+            $country->{$where}++;
+        }
+        if ( defined( my $employer = $profile->{company} ) ) {
+            $company->{$employer}++;
+        }
+
+        $created->{global}->{$period}->{total}++;
+        $created->{languages}->{$lang}->{$period}->{total}++;
+    }
+
+#    $self->_sort_and_display($languages);
+#    $self->_sort_and_display($country, 10);
+#    $self->_sort_and_display($company, 100);
+
+    $self->_create_flot( $created->{global}, 'global' );
+
+    my $per_language = $created->{languages};
+    for my $lang ( keys %{ $created->{languages} } ) {
+        $self->_create_flot( $created->{languages}->{$lang}, $lang );
+    }
+}
+
+# Print the entries of $data (hash ref of name => count), highest count
+# first, each with its count and its integer percentage of the total.
+#
+# $iter is the highest rank index to print, so passing N prints N + 1
+# lines. When it is omitted or false, every entry is printed. An $iter
+# beyond the end of the data is clamped to the last entry, so a short or
+# empty data set no longer reads undefined entries or divides by zero.
+sub _sort_and_display {
+    my ( $self, $data, $iter ) = @_;
+
+    my @sorted = sort { $data->{$b} <=> $data->{$a} } keys %$data;
+    return unless @sorted;
+
+    my $total = 0;
+    $total += $data->{$_} for @sorted;
+
+    my $last = $#sorted;
+    $last = $iter if $iter && $iter < $last;
+
+    for my $rank ( 0 .. $last ) {
+        my $name  = $sorted[$rank];
+        my $count = $data->{$name};
+
+        # A total of zero (all counts are 0) is reported as 0%.
+        my $pct = $total ? int( ( $count / $total ) * 100 ) : 0;
+
+        say " # " . $name . ":" . $count . " ($pct%)";
+    }
+
+    return;
+}
+
+# Write one flot series to "<label>.json" in the current directory.
+#
+# $data is a hash ref keyed by "YYYY/MM" whose values hold a `total`
+# count; $label names both the series and the output file. The JSON
+# document has the shape { label => ..., data => [ [ epoch_ms, total ],
+# ... ] }, with points in chronological order. `data` is always present,
+# and is an empty list when there are not enough months to plot.
+#
+# Returns true when the file was written and closed successfully, false
+# otherwise. I/O failures are reported with warn rather than die, so one
+# unwritable label does not abort the remaining series.
+sub _create_flot {
+    my ( $self, $data, $label ) = @_;
+
+    my $graph = { label => $label, data => [] };
+
+    # "YYYY/MM" keys sort chronologically as plain strings.
+    my @months = sort keys %{ $data || {} };
+
+    # Drop the first and the last month: presumably both are only partly
+    # covered by the collected data and would distort the curve.
+    shift @months;
+    pop @months;
+
+    foreach my $month (@months) {
+        my ( $y, $m ) = $month =~ /(\d{4})\/(\d{2})/;
+
+        # Keys are expected to be "YYYY/MM"; skip anything else instead
+        # of letting DateTime->new die on undefined values.
+        next unless defined $y && defined $m;
+
+        # Timestamps are emitted in milliseconds.
+        my $epoch =
+          DateTime->new( year => $y, month => $m, day => 1 )->epoch * 1000;
+        push @{ $graph->{data} }, [ $epoch, $data->{$month}->{total} ];
+    }
+
+    my $file = $label . '.json';
+
+    my $fh;
+    unless ( open $fh, '>', $file ) {
+        warn "cannot open $file for writing: $!\n";
+        return;
+    }
+
+    print {$fh} JSON::encode_json($graph);
+
+    # Buffered write errors only surface when the handle is closed.
+    my $closed = close $fh;
+    warn "cannot write $file: $!\n" unless $closed;
+
+    return $closed;
+}
+
+1;