diff options
| -rw-r--r-- | clean-country.pl | 34 | ||||
| -rw-r--r-- | crawl.pl | 2 | ||||
| -rw-r--r-- | lib/githubexplorer.pm | 29 | ||||
| -rw-r--r-- | lib/githubexplorer/Gexf.pm | 362 | ||||
| -rw-r--r-- | lib/githubexplorer/Schema/Result/Profiles.pm | 2 |
5 files changed, 303 insertions, 126 deletions
diff --git a/clean-country.pl b/clean-country.pl
new file mode 100644
index 0000000..9fb0b5c
--- /dev/null
+++ b/clean-country.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use lib ('lib');
+use 5.010;
+use Geo::GeoNames;
+use githubexplorer::Schema;
+use YAML::Syck;
+# Usage: clean-country.pl CONF.yaml -- geocode each profile's free-text location into city/country.
+my $conf = LoadFile($ARGV[0] // die "usage: $0 CONF.yaml\n");
+
+my $schema = githubexplorer::Schema->connect(@{$conf->{connect_info}});
+# Two 'location' keys in one hash collapse to the last one, so both tests are spelled out under -and.
+my $profiles = $schema->resultset('Profiles')->search({id => {'>' => 55781},
+        -and => [{location => {'!=' => undef}}, {location => {'!=' => ''}}]});
+
+my $geo = Geo::GeoNames->new();
+# Skip URL-like and "earth" locations and profiles that already have a country; otherwise use the first hit.
+while (my $pr = $profiles->next) {
+    next if $pr->location =~ /^http/;
+    next if $pr->country;
+    next if $pr->location =~ /earth/i;
+    say "-> process ".$pr->login." with ".$pr->location;
+    my $result = $geo->search( q => $pr->location, maxRows => 1 );
+    my $res = shift @$result;
+    if ($res) {
+        eval {
+            $pr->update({city => $res->{name}, country => $res->{countryName}});
+            say "** fix with ".$pr->city . " in ".$pr->country;
+            1;
+        } or warn "!! could not update ".$pr->login.": $@";
+    }
+    sleep(1);    # reached after every lookup, failed updates included, so requests stay throttled
+}
\ No newline at end of file
diff --git a/crawl.pl b/crawl.pl
--- a/crawl.pl
+++ b/crawl.pl
@@ -12,6 +12,7 @@ GetOptions(
     'repo' => \my $repo,
     'graph' => \my $graph,
     'network' => \my $network,
+    'seed' => \my $seed,    # --seed: run gen_seed
     'conf=s' => \my $conf,
 );
 
@@ -30,3 +31,4 @@ $gh->harvest_profiles if $profiles;
 $gh->harvest_repo if $repo;
 $gh->graph_repo if $network;
 $gh->gen_graph if $graph;
+$gh->gen_seed if $seed;
diff --git a/lib/githubexplorer.pm b/lib/githubexplorer.pm
index 4260842..5744e08 100644
--- a/lib/githubexplorer.pm
+++ b/lib/githubexplorer.pm
@@ -25,8 +25,8 @@ has seed => (
         return \@seeds;
     }
 );
-has api_login => ( isa => 'Str', is => 'ro', required => 1 );
-has api_token => ( isa => 'Str', is => 'ro', required => 1 );
+has api_login => ( isa => 'Str|Undef', is => 'ro', required => 1 );    # must be passed, may be undef
+has api_token => ( isa => 'Str|Undef', is => 'ro', required => 1 );    # must be passed, may be undef
 has connect_info => ( isa => 'ArrayRef', is => 'ro', required => 1 );
 has with_repo => ( isa => 'Bool', is => 'ro', default => sub {0} );
 has schema => (
@@ -69,8 +69,7 @@ sub gen_graph {
     my $self = shift;
     $self->_connect unless $self->has_schema;
     my $graph = githubexplorer::Gexf->new( schema => $self->schema );
-    my $xml = $graph->gen_gexf;
-    $xml > io('crawl.gexf');
+    $graph->gen_gexf;    # gen_gexf writes its own .gexf files and returns nothing useful
 }
 
 sub graph_repo {
@@ -82,14 +81,32 @@ sub graph_repo {
     }
 }
 
-sub extract_seed {
+sub gen_seed {    # write seed.csv: one line per profile that has a blog
     my $self = shift;
     $self->_connect unless $self->has_schema;
     my $profiles = $self->schema->resultset('Profiles')
         ->search( { blog => { '!=' => undef }, blog => { '!=' => '' } } );
+    # Line format: blog;;;github;main language;other languages joined with '|';country
+    open my $fh, '>', 'seed.csv' or die "cannot write seed.csv: $!";
     while ( my $pr = $profiles->next ) {
+        my %languages;    # language name => summed RepoLang size over this profile's Fork rows
+        my $forks = $self->schema->resultset('Fork')->search({profile =>
+                $pr->id});
+        while (my $fork = $forks->next) {
+            my $languages =
+                $self->schema->resultset('RepoLang')->search({repository =>
+                    $fork->repos->id});
+            while (my $lang = $languages->next) {
+                $languages{$lang->language->name}+=$lang->size;
+            }
+        }
+        my @sorted_lang = sort {$languages{$b} <=> $languages{$a}} keys %languages;
+        my $main_lang = shift(@sorted_lang) // '';    # no language recorded -> empty field
+        my $other_lang = join('|', @sorted_lang);
+        my $str = $pr->blog.";;;github;".$main_lang.";".$other_lang.";".($pr->country // '')."\n";
+        print $fh $str;
     }
+    close $fh or die "cannot finish writing seed.csv: $!";
 }
-
 
 1;
diff --git a/lib/githubexplorer/Gexf.pm b/lib/githubexplorer/Gexf.pm
index f7e38cb..58281d4 100644
--- a/lib/githubexplorer/Gexf.pm
+++ b/lib/githubexplorer/Gexf.pm
@@ -9,113 +9,114 @@ has id_edges => (is => 'rw', isa => 'Num', traits => ['Counter'], default => 0,
     handles => {inc_edges => 'inc'});
 
 has graph => (
-    is => 'rw',
-    isa => 'HashRef',
-    default => sub {
-        my $graph = {
-            gexf => {
-                version => "1.1",
-                meta => { creator => ['linkfluence'] },
-                graph => {
-                    type => 'static',
-                    attributes => {
-                        class => 'node',
-                        type => 'static',
-                        attribute => [
-                            {
-                                id => 0,
-                                type => 'float',
-                                title => 'name'
-                            },
-                            {
-                                id => 1,
-                                type => 'string',
-                                title => 'type',
-                            },
-                            {
-                                id => 2,
-                                type => 'float',
-                                title => 'followers_count'
-                            },
-                            {
-                                id => 3,
-                                type => 'float',
-                                title => 'following_count'
-                            },
-                            {
-                                id => 4,
-                                type => 'float',
-                                title => 'forks',
-                            },
-                            {
-                                id => 5,
-                                type => 'string',
-                                title => 'location',
-                            },
-                            {
-                                id => 6,
-                                type => 'float',
-                                title => 'public_gist_count',
-                            },
-                            {
-                                id => 7,
-                                type => 'float',
-                                title => 'public_repo_count',
-                            },
-                            {
-                                id => 8,
-                                type => 'string',
-                                title => 'language',
-                            },
-                            {
-                                id => 9,
-                                type => 'string',
-                                title => 'description',
-                            },
-                            {
-                                id => 10,
-                                type => 'float',
-                                title => 'watchers',
-                            }
-                        ]
-                    }
+is => 'rw',
+isa => 'HashRef',
+default => sub {
+    my $graph = {
+        gexf => {
+            version => "1.1",
+            meta => { creator => ['linkfluence'] },
+            graph => {
+                type => 'static',
+                attributes => {
+                    class => 'node',
+                    type => 'static',
+                    attribute => [
+                        {
+                            id => 0,
+                            type => 'string',    # holds profile/repository names, not numbers
+                            title => 'name'
+                        },
+                        {
+                            id => 1,
+                            type => 'string',
+                            title => 'type',
+                        },
+                        {
+                            id => 2,
+                            type => 'float',
+                            title => 'followers_count'
+                        },
+                        {
+                            id => 3,
+                            type => 'float',
+                            title => 'following_count'
+                        },
+                        {
+                            id => 4,
+                            type => 'float',
+                            title => 'forks',
+                        },
+                        {
+                            id => 5,
+                            type => 'string',
+                            title => 'location',
+                        },
+                        {
+                            id => 6,
+                            type => 'float',
+                            title => 'public_gist_count',
+                        },
+                        {
+                            id => 7,
+                            type => 'float',
+                            title => 'public_repo_count',
+                        },
+                        {
+                            id => 8,
+                            type => 'string',
+                            title => 'language',
+                        },
+                        {
+                            id => 9,
+                            type => 'string',
+                            title => 'description',
+                        },
+                        {
+                            id => 10,
+                            type => 'float',
+                            title => 'watchers',
+                        }
+                    ]
                 }
             }
         }
-        };
-    }
+    }    # NOTE(review): this hunk shows one more closing brace than opening braces - confirm balance
+    };
+}
 );
 
 sub gen_gexf {
     my $self = shift;
-    $self->profiles;
-    #$self->repositories;
-    say "total nodes : ".scalar (@{ $self->graph->{gexf}->{graph}->{nodes}->{node} });
-    say "total edges : ".scalar (@{ $self->graph->{gexf}->{graph}->{edges}->{edge} });
+    # Build each view in turn; dump_gexf serialises it and clears the node/edge lists for the next one.
+    $self->basic_profiles;
+    my $basic_profiles = $self->dump_gexf;
+    $basic_profiles > io('basic_profiles.gexf');
+
+    $self->profiles_from_repositories;
+    my $profiles_from_repositories = $self->dump_gexf;
+    $profiles_from_repositories > io ('profiles_from_repositories.gexf');
+
+    $self->repositories_from_profiles;
+    my $repositories_from_profiles = $self->dump_gexf;
+    $repositories_from_profiles > io ('repositories_from_profiles.gexf');
+}
+# Serialise the current graph with XMLout, then reset its nodes and edges; returns the XML string.
+sub dump_gexf {
+    my $self = shift;
     my $xml_out = XMLout( $self->graph, AttrIndent => 1, keepRoot => 1 );
+    $self->graph->{gexf}->{graph}->{nodes} = undef;
+    $self->graph->{gexf}->{graph}->{edges} = undef;
     return $xml_out;
 }
 
-sub profiles {
+sub basic_profiles {    # view 1: one node per Profiles row
     my $self = shift;
-    say "start profiles ...";
+    $self->id_edges(0);    # edge ids restart for every generated file
+    say "start basic_profiles ...";
     my $profiles = $self->schema->resultset('Profiles')->search();
     while ( my $profile = $profiles->next ) {
-        my $node = {
-            id => $profile->id,
-            label => $profile->login,
-            attvalues => {
-                attvalue => [
-                    { for => 0, value => $profile->name},
-                    { for => 1, value => "profile"},
-                    { for => 2, value => $profile->followers_count},
-                    { for => 3, value => $profile->following_count},
-                    { for => 5, value => $profile->location},
-                    { for => 6, value => $profile->public_gist_count},
-                    { for => 7, value => $profile->public_repo_count},
-                ]
-            },
-        };
+        my $node = $self->_get_node_for_profile($profile);
         push @{ $self->graph->{gexf}->{graph}->{nodes}->{node} }, $node;
     }
 
@@ -129,26 +130,57 @@ sub profiles {
         };
         push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
     }
-    say " done";
+    say "basic_profiles done";
 }
 
-sub repositories {
+sub profiles_from_repositories {    # view 2: profiles linked when they share a repository's Fork rows
     my $self = shift;
+    $self->id_edges(0);
+    say "start profiles_from_repositories ...";
 
-    say "start repositories ...";
-    my $repositories = $self->schema->resultset('Repositories')->search({fork => 0});
+    # Every profile becomes a node, whether or not it ends up with an edge.
+    my $profiles = $self->schema->resultset('Profiles')->search();
+    while (my $profile = $profiles->next) {
+        my $node = $self->_get_node_for_profile($profile);
+        push @{ $self->graph->{gexf}->{graph}->{nodes}->{node} }, $node;
+    }
+    my $repositories = $self->schema->resultset('Repositories')->search();
     while (my $repos = $repositories->next) {
+        my $forks = $self->schema->resultset('Fork')->search({repos => $repos->id});
+        my @profiles;
+        while (my $fork = $forks->next) {
+            push @profiles, $fork->profile->id;
+        }
+        foreach my $p (@profiles) {
+            foreach my $other (@profiles) {
+                last if $other eq $p;    # stop at $p itself, so a pair is not emitted again in reverse
+                my $e = {
+                    source => $p,
+                    target => $other,
+                    id => $self->inc_edges,
+                };
+                push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
+            }
+        }
+    }
+    say "profiles_from_repositories done";
+}
+# View 3: one node per distinct repository name, one edge per Fork row (profile id -> repository name).
+sub repositories_from_profiles {
+    my $self = shift;
+    $self->id_edges(0);
+    say "start repositories_from_profiles ...";
 
-        next if $repos->name =~ /dotfiles/i;
-        # available in forks ?
-        my $check_fork = $self->schema->resultset('Fork')->search({repos => $repos->id});
-        next if $check_fork->count < 1;
+    my $nodes = {};    # repository name => node hash; starts empty so the final push is safe
+    my $repositories = $self->schema->resultset('Repositories')->search();
+    while (my $repos = $repositories->next) {
+        next if $repos->name =~ /dotfiles/;
 
-        if (!grep {$_->{id} eq "repos_".$repos->name} @{$self->graph->{gexf}->{graph}->{nodes}->{node}}) {
+        if (!exists $nodes->{$repos->name}) {
             my $language = $self->schema->resultset('RepoLang')->search({repository => $repos->id}, {order_by => 'size'})->first;
             my $lang = $language ? $language->language->name : 'none';
-            my $node = {
-                id => "repos_".$repos->name,
+            $nodes->{$repos->name} = {
+                id => $repos->name,
                 label => $repos->name,
                 attvalues => {
                     attvalue => [
@@ -161,28 +193,118 @@ sub repositories {
                     ],
                 },
             };
-            push @{ $self->graph->{gexf}->{graph}->{nodes}->{node} }, $node;
         }
-        my $e = {
-            source => $repos->id_profile->id,
-            target => "repos_".$repos->name,
-            id => $self->inc_edges,
-        };
-        push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
+        my $forks = $self->schema->resultset('Fork')->search({repos => $repos->id});
+        while (my $fork = $forks->next) {
+            my $e = {
+                source => $fork->profile->id,
+                target => $fork->repos->name,
+                id => $self->inc_edges,
+            };
+            push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
+        }
     }
+    push @{ $self->graph->{gexf}->{graph}->{nodes}->{node} }, values %$nodes;
+    say "repositories_from_profiles done";
+}
+# Placeholder: no statistics are computed yet.
+sub stats_languages_by_country {
+    my $self = shift;
+}
+# Build the GEXF node hash for one Profiles row; attribute 8 is the language with the largest summed size.
+sub _get_node_for_profile {
+    my ($self, $profile) = @_;
+    my (undef, $ordered_languages) = $self->_get_languages_for_profile($profile);
+    my $main_lang = shift(@$ordered_languages) // 'none';    # same fallback the repository nodes use
+    my $node = {
+        id => $profile->id,
+        label => $profile->login,
+        attvalues => {
+            attvalue => [
+                { for => 0, value => $profile->name},
+                { for => 1, value => "profile"},
+                { for => 2, value => $profile->followers_count},
+                { for => 3, value => $profile->following_count},
+                { for => 5, value => $profile->country},
+                { for => 6, value => $profile->public_gist_count},
+                { for => 7, value => $profile->public_repo_count},
+                { for => 8, value => $main_lang},
+            ]
+        },
+    };
+    return $node;
+}
 
-    my $forks = $self->schema->resultset('Fork')->search();
+sub _get_languages_for_profile {    # returns (\%size_by_language, \@language_names_by_descending_size)
+    my ($self, $profile) = @_;
+    my $forks = $self->schema->resultset('Fork')->search({profile =>
+            $profile->id});
+
+    my %languages;
 
     while (my $fork = $forks->next) {
-        next if $fork->repos->name =~ /dotfiles/i;
-        my $e = {
-            source => $fork->profile->id,
-            target => "repos_".$fork->repos->name,
-            id => $self->inc_edges,
-        };
-        push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
+        my $languages =
+            $self->schema->resultset('RepoLang')->search({repository =>
+                $fork->repos->id});
+        while (my $lang = $languages->next) {
+            $languages{$lang->language->name}+=$lang->size;
+        }
     }
-    say " done";
+    my @sorted_lang = sort {$languages{$b} <=> $languages{$a}} keys %languages;
+    return (\%languages, \@sorted_lang);
 }
 
+#sub repositories {
+#    my $self = shift;
+#
+#    say "start repositories ...";
+#    my $repositories = $self->schema->resultset('Repositories')->search({fork => 0});
+#    while (my $repos = $repositories->next) {
+#
+#        next if $repos->name =~ /dotfiles/i;
+#        # available in forks ?
+#        my $check_fork = $self->schema->resultset('Fork')->search({repos => $repos->id});
+#        next if $check_fork->count < 1;
+#
+#        if (!grep {$_->{id} eq "repos_".$repos->name} @{$self->graph->{gexf}->{graph}->{nodes}->{node}}) {
+#            my $language = $self->schema->resultset('RepoLang')->search({repository => $repos->id}, {order_by => 'size'})->first;
+#            my $lang = $language ? $language->language->name : 'none';
+#            my $node = {
+#                id => "repos_".$repos->name,
+#                label => $repos->name,
+#                attvalues => {
+#                    attvalue => [
+#                        { for => 0, value => $repos->name},
+#                        { for => 1, value => "repository"},
+#                        { for => 4, value => $repos->forks},
+#                        { for => 9, value => $repos->description},
+#                        { for => 10, value => $repos->watchers},
+#                        { for => 8, value => $lang},
+#                    ],
+#                },
+#            };
+#            push @{ $self->graph->{gexf}->{graph}->{nodes}->{node} }, $node;
+#        }
+#        my $e = {
+#            source => $repos->id_profile->id,
+#            target => "repos_".$repos->name,
+#            id => $self->inc_edges,
+#        };
+#        push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
+#    }
+#
+#    my $forks = $self->schema->resultset('Fork')->search();
+#
+#    while (my $fork = $forks->next) {
+#        next if $fork->repos->name =~ /dotfiles/i;
+#        my $e = {
+#            source => $fork->profile->id,
+#            target => "repos_".$fork->repos->name,
+#            id => $self->inc_edges,
+#        };
+#        push @{ $self->graph->{gexf}->{graph}->{edges}->{edge} }, $e;
+#    }
+#    say " done";
+#}
+
 1;
diff --git a/lib/githubexplorer/Schema/Result/Profiles.pm b/lib/githubexplorer/Schema/Result/Profiles.pm
index e0349d7..b43211e 100644
--- a/lib/githubexplorer/Schema/Result/Profiles.pm
+++ b/lib/githubexplorer/Schema/Result/Profiles.pm
@@ -15,6 +15,8 @@ __PACKAGE__->add_columns(
     following_count => { data_type => 'int' },
     gravatar_id => { data_type => 'varchar', is_nullable => 1 },
     location => { data_type => 'varchar', is_nullable => 1 },
+    country => { data_type => 'varchar', is_nullable => 1 },    # written by clean-country.pl
+    city => { data_type => 'varchar', is_nullable => 1 },    # written by clean-country.pl
     name => { data_type => 'varchar', is_nullable => 1 },
     public_gist_count => { data_type => 'int' },
     public_repo_count => { data_type => 'int' },
