ruby psy_crawler.rb
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了ruby psy_crawler.rb相关的知识,希望对你有一定的参考价值。
require 'fileutils'
require 'json'
require 'open-uri'

require 'nokogiri'
# Crawler for the artist directory served under DOMAIN.
#
# Pages are fetched over HTTP, parsed with Nokogiri, and whatever is extracted
# is written as JSON below "artists/<index>/", relative to the current working
# directory.
module PsyCrawler
  # Base URL that every crawled path is appended to.
  DOMAIN = "http://www.psydb.net".freeze

  # Identifiers of the artist index pages: "0" followed by "a" through "z".
  INDEXES = ['0', *('a'..'z')].freeze

  # Downcased detail-table labels that get_data does not copy into the artist
  # hash.
  IGNORED_DATA = %w[name active tracks remixes updated].freeze

  # Matches an artist link inside a serialized table cell. Capture 1 is the
  # site-relative path, capture 2 is the link text.
  ARTIST_LINK = %r{<a href="(/artists/[0a-z]/.+\.php)">(.+)</a>}

  # Fetches every artist index page and writes one JSON listing per page.
  #
  # For each index +i+ the listing is an array of {name:, link:} hashes (link
  # is absolute) saved to "artists/<i>/<i>_<count>.json", where <count> is the
  # number of artists found on that page.
  #
  # @return [Array<String>] the index identifiers that were crawled
  def self.get_artists
    INDEXES.each do |i|
      # URI.open instead of Kernel#open: open-uri no longer patches
      # Kernel#open for URLs since Ruby 3.0.
      page = Nokogiri::HTML(URI.open("#{DOMAIN}/artists/#{i}/"))
      cells = page.xpath('//*[@id="mainDiv"]/table/tr[2]/td[2]/table[2]//tr//td[3]')

      artists = cells.each_with_object([]) do |td, found|
        link = ARTIST_LINK.match(td.to_s)
        found << { name: link[2], link: DOMAIN + link[1] } if link
      end

      write_json("artists/#{i}", "#{i}_#{artists.size}", artists)
    end
  end

  # Fetches one artist page, merges its detail table into +artist+ and saves
  # the result as JSON.
  #
  # Every table row that has a "td.TBB" cell contributes one entry: the
  # downcased text of that cell becomes the (symbol) key and the text of the
  # row's "td.TB" cell the value (nil when the row has no such cell). Labels
  # listed in IGNORED_DATA are skipped.
  #
  # @param index [String] sub-directory of "artists/" the file is written to
  # @param artist [Hash] must provide :name and :link under symbol keys; it is
  #   mutated in place
  # @return [Integer] number of bytes written
  def self.get_data(index, artist)
    page = Nokogiri::HTML(URI.open(artist[:link]))
    rows = page.xpath('//*[@id="mainDiv"]/table/tr[2]/td[2]/table[2]/tr[2]/td/table//tr')

    rows.each do |tr|
      label = tr.at_css("td.TBB")
      next if label.nil?

      key = label.content.downcase
      next if IGNORED_DATA.include?(key)

      artist[key.to_sym] = tr.at_css("td.TB")&.content
    end

    # "/" is replaced along with whitespace: a path separator inside the file
    # name would point into a directory that does not exist.
    file_name = artist[:name].gsub(%r{[\s/]}, '_').downcase
    write_json("artists/#{index}", file_name, artist)
  end

  # Serializes +payload+ to "<dir_name>/<file_name>.json", creating the
  # directory (and any missing parents) first.
  def self.write_json(dir_name, file_name, payload)
    FileUtils.mkdir_p(dir_name)
    puts "writing: #{file_name}"
    File.write("#{dir_name}/#{file_name}.json", payload.to_json)
  end
  private_class_method :write_json
end
# Script entry point: loading this file immediately crawls every artist index
# page and writes the per-index JSON listings.
PsyCrawler.get_artists
以上是关于ruby psy_crawler.rb的主要内容,如果未能解决你的问题,请参考以下文章
Ruby运算符
Ruby 25 岁了!Ruby 之父说 Ruby 3 有望 3 倍提速
如何学习ruby?Ruby学习技巧分享
ruby Ruby脚本,看看是否用openssl编译了ruby
什么是ruby?
ruby和ruby ee