ruby psy_crawler.rb

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了 ruby psy_crawler.rb 相关的知识,希望对你有一定的参考价值。

require 'fileutils'
require 'json'
require 'open-uri'

require 'nokogiri'


# Crawler for the artist catalogue at psydb.net.
#
# Scrapes the per-letter artist index pages and individual artist pages and
# stores what it finds as JSON files under "artists/<index>/", relative to the
# current working directory.
module PsyCrawler

  DOMAIN = "http://www.psydb.net".freeze

  # Lower-cased field labels that get_data does not copy into the artist hash.
  IGNORED_DATA = %w(name active tracks remixes updated).freeze

  # Matches an artist anchor inside a serialized table cell. Capture 1 is the
  # site-relative link, capture 2 is the anchor's inner markup.
  ARTIST_LINK_PATTERN = %r{<a href="(/artists/[0a-z]/.+\.php)">(.+)</a>}
  private_constant :ARTIST_LINK_PATTERN

  # Crawls the artist index page for '0' and for every letter 'a'..'z'.
  #
  # For each index, every matching artist link is collected as
  # { name: ..., link: ... } (link is absolute, prefixed with DOMAIN) and the
  # list is written to "artists/<index>/<index>_<count>.json".
  #
  # Returns the list of crawled indexes.
  def self.get_artists
    ['0', *('a'..'z')].each do |i|
      page = fetch_page("#{DOMAIN}/artists/#{i}/")
      cells = page.xpath('//*[@id="mainDiv"]/table/tr[2]/td[2]/table[2]//tr//td[3]')

      artists = cells.each_with_object([]) do |td, found|
        ARTIST_LINK_PATTERN.match(td.to_s) do |m|
          found << { name: m[2], link: "#{DOMAIN}#{m[1]}" }
        end
      end

      write_json("artists/#{i}", "#{i}_#{artists.size}", artists)
    end
  end

  # Fetches the page at artist[:link] and copies its labelled table rows into
  # +artist+, then writes the hash to "artists/<index>/<name>.json".
  #
  # index  - directory name under "artists/" to write into.
  # artist - Hash with at least :name and :link. It is modified in place: each
  #          row label (lower-cased, as a Symbol) becomes a key unless it is
  #          listed in IGNORED_DATA. A row without a value cell stores nil.
  #
  # Returns the number of bytes written.
  def self.get_data(index, artist)
    page = fetch_page(artist[:link])

    page.xpath('//*[@id="mainDiv"]/table/tr[2]/td[2]/table[2]/tr[2]/td/table//tr').each do |tr|
      label = tr.at_css("td.TBB")
      next if label.nil?

      key = label.content.downcase
      next if IGNORED_DATA.include?(key)

      artist[key.to_sym] = tr.at_css("td.TB")&.content
    end

    # Whitespace and "/" both become "_": a slash left in the name would be
    # treated as a path separator and point into a directory that does not exist.
    file_name = artist[:name].gsub(%r{[\s/]}, '_').downcase
    write_json("artists/#{index}", file_name, artist)
  end

  # Downloads +url+ and parses it with Nokogiri.
  #
  # URI.open is used because Kernel#open no longer accepts URLs on Ruby 3.0+.
  # The block form closes the download handle once parsing has finished.
  def self.fetch_page(url)
    URI.open(url) { |io| Nokogiri::HTML(io) }
  end
  private_class_method :fetch_page

  # Serializes +payload+ as JSON into "<dir_name>/<file_name>.json".
  #
  # mkdir_p creates missing parent directories (e.g. "artists" itself) and is a
  # no-op when the directory already exists. Returns the number of bytes written.
  def self.write_json(dir_name, file_name, payload)
    FileUtils.mkdir_p(dir_name)
    puts "writing: #{file_name}"
    File.write(File.join(dir_name, "#{file_name}.json"), payload.to_json)
  end
  private_class_method :write_json

end

# Script entry point: crawl every artist index page as soon as this file is run.
PsyCrawler.get_artists

以上是关于 ruby psy_crawler.rb 的主要内容,如果未能解决你的问题,请参考以下文章:

Ruby运算符

Ruby 25 岁了!Ruby 之父说 Ruby 3 有望 3 倍提速

如何学习ruby?Ruby学习技巧分享

ruby Ruby脚本,看看是否用openssl编译了ruby

什么是ruby?

ruby和ruby ee