ruby github爬虫

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了 Ruby GitHub 爬虫相关的知识,希望对你有一定的参考价值。

# Polling loop that pulls crawl tasks from Task and runs them one at a time.
#
# NOTE(review): retryable_debug, sleep_debug and the `debug` method called on
# the task are not defined in this file — presumably retry/logging wrappers
# supplied elsewhere in the project; confirm before relying on their semantics.
class Worker
  # Builds a worker and enters its polling loop.
  def self.start
    new.run
  end

  # Calls perform_task over and over; nothing in this method ends the loop.
  def run
    loop { perform_task }
  end

  # Runs the next task handed out by Task.next, or calls sleep_debug(1) when
  # there is none.
  #
  # Octokit::Error and Faraday::Error::ClientError raised anywhere in the
  # method are swallowed and the current task (if any) is paused. Any other
  # exception propagates to the caller.
  def perform_task
    task = Task.next
    retryable_debug do
      task ? task.debug.perform : sleep_debug(1)
    end
  rescue Octokit::Error, Faraday::Error::ClientError
    # The rescue also covers the idle branch, where there is no task to pause;
    # calling pause on nil/false here would raise out of the handler.
    task.pause if task
  end
end
require 'delegate'
require 'forwardable'

# Base class for crawl tasks.
#
# Every subclass acts as a queue bound to one collection (collection_name) and
# one state field inside that collection's documents (indicator). A task
# instance wraps a single document and knows how to process it (#perform is
# supplied by each subclass).
#
# NOTE(review): `mongodb`, `Github` and `debug` are not defined in this file;
# the collection methods used here (without, modify, find, one, update,
# upsert) belong to whatever `mongodb` returns. Their exact semantics should
# be confirmed against that helper.
class Task
  extend Forwardable
  # Instances reuse the class-level collection and indication builders.
  def_delegators 'self.class', :collection, :indication

  class << self
    # Queue classes in the order Task.next polls them.
    def queues
      [OrgRepos, Members, Orgs, Projects, Users, Stars, Contributions]
    end

    # Polls the queues in order and returns the first task one of them hands
    # out. Queues after the first hit are not polled. Returns nil when no
    # queue has work.
    def next
      queues.each do |queue|
        task = queue.fetch
        return task if task
      end
      nil
    end

    # Marks one pending document as "process" via `modify` and wraps the
    # result in a task. Returns nil when `modify` yields nothing truthy.
    def fetch
      doc = new_tasks.modify(indication(:process))
      new(doc) if doc
    end

    # Query for pending documents.
    # Presumably `without` selects documents that lack the indicator field —
    # TODO confirm against the mongodb helper.
    def new_tasks
      collection.without(indicator)
    end

    # The collection this queue works on, looked up by name on `mongodb`.
    def collection
      mongodb.send(collection_name)
    end

    # Class-level macro: with an argument it sets the collection name,
    # without one it reads it. Defaults to the downcased last segment of the
    # class name (Task::Projects -> "projects").
    def collection_name(value = nil)
      if value
        @collection_name = value
      else
        @collection_name ||= name.split('::').last.downcase
      end
    end

    # Class-level macro: with an argument it sets the state field name,
    # without one it reads it. Defaults to :state.
    def indicator(value = nil)
      if value
        @indicator = value
      else
        @indicator ||= :state
      end
    end

    # Builds a `$set` update that writes `value` (as a string) to the
    # indicator field, merged with any `additional` fields.
    def indication(value, additional = {})
      { '$set' => { indicator => value.to_s }.merge!(additional) }
    end
  end

  # @param doc the document this task operates on; read with string keys.
  def initialize(doc)
    @doc = doc
  end

  # The `_id` of the wrapped document; used as the argument to Github calls.
  def target
    @doc['_id']
  end

  # Update that marks the task "ready", optionally storing `info` alongside.
  def finished(info = {})
    indication(:ready, info)
  end

  # Normalizes an API result to an array: arrays pass through, nil becomes an
  # empty array (so callers iterate zero times instead of hitting nil), and
  # any other object is wrapped in a one-element array.
  def wrap_array(object)
    return [] if object.nil?

    object.is_a?(Array) ? object : [object]
  end

  # Query selecting just this task's document.
  def scope
    collection.find(_id: target)
  end

  # Marks the task "pause".
  def pause
    scope.update(indication(:pause))
  end

  # Fetches repositories for the target through the Github method named by
  # `type` and upserts each one's full_name into the projects collection.
  # Entries without a full_name are skipped so no nil id gets upserted.
  # Returns the array of fetched entries.
  def extract_repos(type = :repos)
    repos = wrap_array(Github.send(type, target))
    repos.each do |repo|
      full_name = repo['full_name']
      next unless full_name

      mongodb.projects.upsert(full_name)
    end
  end

  # Loads repository info; also queues the parent repository when present.
  class Projects < Task
    def perform
      info = Github.repo(target)
      scope.update(finished(info))

      parent = info['parent']
      collection.upsert(parent['full_name']) if parent
    end
  end

  # Queues the user's repositories, then stores the user's profile info.
  class Users < Task
    def perform
      extract_repos
      info = Github.user(target)
      scope.update(finished(info))
    end
  end

  # Queues the repositories a user has starred.
  class Stars < Task
    collection_name :users
    indicator :star_state

    def perform
      extract_repos(:starred)
      scope.update(finished)
    end
  end

  # Records, on each contributor's user document, how many contributions they
  # made to this project.
  class Contributions < Task
    collection_name :projects
    indicator :contribute_state

    def perform
      # Only projects whose main `state` is "ready" and that are not forks
      # are processed; the others are flagged "wait" / "fork".
      return scope.update(indication(:wait)) unless @doc['state'] == 'ready'
      return scope.update(indication(:fork)) if @doc['fork']

      contributors = wrap_array(Github.contributors(target))
      debug "count: #{contributors.size}"
      contributors.each do |contributor|
        login = contributor['login']
        next unless login
        # Skip users that already carry a commits entry for this project.
        next if mongodb.users.find(_id: login, 'commits.to' => target).one

        commits = { commits: { to: target, count: contributor['contributions'] } }
        mongodb.users.upsert(login, '$push' => commits)
      end
      scope.update(finished)
    end
  end

  # Stores organization info together with its member list and queues each
  # member as a user.
  class Orgs < Task
    def perform
      info = Github.org(target)
      members = extract_members
      scope.update(finished(info.merge!(members: members)))
    end

    private

    # Upserts every member login into the users collection (entries without
    # a login are skipped) and returns the array of fetched member entries.
    def extract_members
      members = wrap_array(Github.org_members(target))
      members.each do |member|
        login = member['login']
        next unless login

        mongodb.users.upsert(login)
      end
    end
  end

  # Queues the organizations a user belongs to.
  class Members < Task
    collection_name :users
    indicator :member_state

    def perform
      orgs = wrap_array(Github.orgs(target))
      orgs.each do |org|
        login = org['login']
        next unless login

        mongodb.orgs.upsert(login)
      end
      scope.update(finished)
    end
  end

  # Queues the repositories owned by an organization.
  class OrgRepos < Task
    collection_name :orgs
    indicator :repo_state

    def perform
      extract_repos(:org_repos)
      scope.update(finished)
    end
  end
end

以上是关于 Ruby GitHub 爬虫的主要内容,如果未能解决你的问题,请参考以下文章:

Ruby和python哪个更易懂、灵活?

Ruby用百度搜索爬虫

ruby 轻量级并行web图形爬虫

ruby 针对复杂网络的twitter爬虫

ruby 基于EM的爬虫

ruby 使用Mechanize的Hacky爬虫