Ruby GitHub 爬虫
Posted
tags:
篇首语：本文由小常识网（cha138.com）小编为大家整理，主要介绍了 Ruby GitHub 爬虫相关的知识，希望对你有一定的参考价值。
# Endless polling loop that pulls the next pending Task and performs it.
#
# NOTE(review): `retryable_debug`, `sleep_debug` and `debug` are not defined
# in this file; they are presumably provided by a helper library that wraps
# `retryable`/`sleep` with logging -- confirm against the project setup.
class Worker
  # Builds a worker and runs it; does not return under normal operation.
  def self.start
    new.run
  end

  # Performs tasks one after another, forever.
  def run
    loop { perform_task }
  end

  # Takes the next task and performs it, or calls `sleep_debug(1)` when no
  # task is available.
  #
  # When an Octokit or Faraday client error reaches this method, the current
  # task is paused instead of letting the error end the loop.
  def perform_task
    task = Task.next
    retryable_debug do
      task ? task.debug.perform : sleep_debug(1)
    end
  rescue Octokit::Error, Faraday::Error::ClientError
    # `task` is nil on the idle path; calling `pause` on it would replace the
    # rescued error with a NoMethodError that escapes `run`.
    task.pause if task
  end
end
require 'delegate'
require 'forwardable'
# Base class for crawl tasks. A task wraps one document (`@doc`) taken from
# a collection and records its progress in one field of that document, the
# "indicator" (`:state` unless a subclass overrides it).
#
# NOTE(review): `mongodb`, `Github` and `debug` are not defined in this file.
# `mongodb` presumably returns a database handle whose methods are collection
# names, and `without`/`modify`/`upsert` look like project extensions on top
# of the Mongo driver -- confirm against the project setup.
class Task
  extend Forwardable

  # Instances use the collection handle and update builder of their class.
  def_delegators 'self.class', :collection, :indication

  class << self
    # Task classes in the order in which `next` polls them.
    def queues
      [OrgRepos, Members, Orgs, Projects, Users, Stars, Contributions]
    end

    # Returns the first task any queue can supply, or nil when every queue
    # is empty. Queues after the first hit are not polled.
    def next
      queues.each do |queue|
        task = queue.fetch
        return task if task
      end
      nil
    end

    # Marks one new document as 'process' and wraps it in a task instance.
    # Returns nil when `modify` yields no document.
    def fetch
      doc = new_tasks.modify(indication(:process))
      new(doc) if doc
    end

    # Documents selected by `without indicator` -- presumably those whose
    # indicator field is not set yet; verify against the `without` helper.
    def new_tasks
      collection.without(indicator)
    end

    # Collection handle obtained by sending `collection_name` to `mongodb`.
    def collection
      mongodb.send(collection_name)
    end

    # With an argument: sets the collection name for this class.
    # Without: returns it, defaulting to the downcased last segment of the
    # class name (e.g. Task::Projects -> "projects").
    def collection_name(value = nil)
      if value
        @collection_name = value
      else
        @collection_name ||= name.split('::').last.downcase
      end
    end

    # With an argument: sets the progress field for this class.
    # Without: returns it, defaulting to :state.
    def indicator(value = nil)
      if value
        @indicator = value
      else
        @indicator ||= :state
      end
    end

    # Builds a `$set` update that writes `value` (as a string) into the
    # indicator field, together with any `additional` fields.
    def indication(value, additional = {})
      { '$set' => { indicator => value.to_s }.merge(additional) }
    end
  end

  # @param doc the document this task works on; only `doc['_id']` and, in
  #   Contributions, `doc['state']` / `doc['fork']` are read here.
  def initialize(doc)
    @doc = doc
  end

  # The `_id` of the wrapped document; passed to the Github calls below.
  def target
    @doc['_id']
  end

  # Update that marks the task 'ready' and stores `info` alongside.
  def finished(info = {})
    indication(:ready, info)
  end

  # Normalizes a response to an Array: arrays pass through, nil becomes an
  # empty array, anything else is wrapped in a one-element array.
  #
  # nil is mapped to [] because every caller indexes the elements with a
  # string key, which would raise NoMethodError on a nil element.
  def wrap_array(object)
    return [] if object.nil?

    object.is_a?(Array) ? object : [object]
  end

  # Query for this task's own document.
  def scope
    collection.find(_id: target)
  end

  # Marks this task's document as 'pause'.
  def pause
    scope.update(indication(:pause))
  end

  # Calls `Github.<type>(target)` and upserts the `full_name` of every
  # returned entry into the projects collection. Returns the entry list.
  def extract_repos(type = :repos)
    repos = wrap_array(Github.send(type, target))
    repos.each do |repo|
      mongodb.projects.upsert(repo['full_name'])
    end
  end

  # Stores repository info and queues the parent repository, if any.
  class Projects < Task
    def perform
      info = Github.repo(target)
      scope.update(finished(info))
      parent = info['parent']
      collection.upsert(parent['full_name']) if parent
    end
  end

  # Queues the user's repositories, then stores the user info.
  class Users < Task
    def perform
      extract_repos
      info = Github.user(target)
      scope.update(finished(info))
    end
  end

  # Queues the repositories returned by `Github.starred` for a user.
  class Stars < Task
    collection_name :users
    indicator :star_state

    def perform
      extract_repos(:starred)
      scope.update(finished)
    end
  end

  # Records, on each contributor's user document, how many contributions
  # they made to this project.
  class Contributions < Task
    collection_name :projects
    indicator :contribute_state

    def perform
      # Only projects whose main `state` is 'ready' are processed; others
      # are marked 'wait', and documents with a truthy `fork` are marked
      # 'fork'.
      return scope.update(indication(:wait)) unless @doc['state'] == 'ready'
      return scope.update(indication(:fork)) if @doc['fork']

      contributors = wrap_array(Github.contributors(target))
      debug "count: #{contributors.size}"
      contributors.each do |contributor|
        login = contributor['login']
        next unless login
        # Skip users that already carry a commits entry for this project.
        next if mongodb.users.find(_id: login, 'commits.to' => target).one

        commits = { commits: { to: target, count: contributor['contributions'] } }
        mongodb.users.upsert(login, '$push' => commits)
      end
      scope.update(finished)
    end
  end

  # Stores organization info together with its member list and queues
  # every member as a user.
  class Orgs < Task
    def perform
      info = Github.org(target)
      members = extract_members
      scope.update(finished(info.merge!(members: members)))
    end

    private

    # Upserts each member's login into the users collection and returns the
    # member list as received.
    def extract_members
      members = wrap_array(Github.org_members(target))
      members.each do |member|
        mongodb.users.upsert(member['login'])
      end
    end
  end

  # Queues every organization returned by `Github.orgs` for a user.
  class Members < Task
    collection_name :users
    indicator :member_state

    def perform
      orgs = wrap_array(Github.orgs(target))
      orgs.each do |org|
        mongodb.orgs.upsert(org['login'])
      end
      scope.update(finished)
    end
  end

  # Queues the repositories returned by `Github.org_repos` for an
  # organization.
  class OrgRepos < Task
    collection_name :orgs
    indicator :repo_state

    def perform
      extract_repos(:org_repos)
      scope.update(finished)
    end
  end
end
以上是关于 Ruby GitHub 爬虫的主要内容，如果未能解决你的问题，请参考以下文章：