ruby爬虫模板
Posted by znsongshu
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了 Ruby 爬虫模板相关的知识,希望对你有一定的参考价值。
require 'base64'
require 'fileutils'
require 'json'
require 'logger'
require 'open-uri'
require 'yaml'

require 'nokogiri'
require 'open_uri_redirections'
require 'restclient'
# Upper bound on fetch attempts per page in get_doc.
MAX_RETRY_TIMES = 5

# Directory that download_image writes into.
ROOT_DIR = '/home/zn/work/small-tools-master/zlk/tu/'

# Page URL prefix; process appends a numeric id to it.
BASE_URL = 'https://newceshiao.com/mnkc/tiku/?id='

# Cookies sent with every RestClient request in get_doc.
# NOTE(review): this embeds what looks like a live user token -- consider
# loading it from the environment instead of committing it.
COOKIE = {
  :VerificationCodeNum => '1',
  :QZ_KSUser => 'UserID=15357507&UserName=ppkao1520606811&UserToken=cw05IVsvRbyxuPoQeQIU4%252bZNshdiFE%252fN6LGCVScB%252bnQLBUYAu7SA7A%253d%253d'
}.freeze

# Header values for the alternative request styles that are commented out in
# get_doc; no live code visible in this file reads them.
@cookie = 'VerificationCodeNum=1; PPKAO=PPKAOSTID%3D987%26PPKAOCEID%3D%26PPKAOSJID%3D%26UserName%3D%26EDays%3D'
@agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (Khtml, like Gecko) Ubuntu Chromium/63.0.3239.84 Chrome/63.0.3239.84 Safari/537.36"
@content_type = "application/x-www-form-urlencoded"

# One log file per failure category.
@download_error = Logger.new('download_error.log') # every failed fetch attempt
@no_doc = Logger.new('nodoc_error.log')            # pages that never yielded a document
@parse_error = Logger.new('parse_error.log')       # pages whose expected nodes were missing

# mkdir_p is a no-op when the directory already exists, so no existence check
# is needed (File.exists? was removed in Ruby 3.2).
FileUtils.mkdir_p(ROOT_DIR)
# Downloads the image at +image+ into ROOT_DIR under a generated file name.
#
# The name is the current Unix timestamp followed by a zero-padded random
# number below 10000; the extension is taken from the path component of the
# URL, so a query string never ends up in the file name.
#
# @param image [String] URL of the image to fetch
# @return [String, nil] the generated file name (relative to ROOT_DIR), or nil
#   when the URL is unusable or the download/write failed
def download_image(image)
  uri = URI.parse(image)
  name = Time.now.to_i.to_s + format('%04d', rand(10_000))
  img = name + File.extname(uri.path.to_s)

  # Only URI classes that open-uri supports expose a public #open, so input
  # such as "|cmd" or a bare path is rejected here instead of reaching
  # Kernel#open (which would run a command or read a local file).
  data = uri.open(&:read)

  # Fetch before opening the target so a failed download leaves no empty file;
  # 'wb' keeps the image bytes untouched on every platform.
  File.open(File.join(ROOT_DIR, img), 'wb') { |f| f.write(data) }
  img
rescue StandardError => e
  puts e.message
  nil
end
# Reads an image and returns its contents Base64-encoded.
#
# @param image_src [String] a URL in a scheme open-uri supports, or a local
#   file path
# @return [String] Base64.encode64 output (includes line breaks)
# @raise [StandardError] whatever the underlying read raises (missing file,
#   network failure, unsupported URL scheme)
def img_base64(image_src)
  source = image_src.to_s
  data =
    if source =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
      # Block form closes the connection/tempfile as soon as it has been read.
      URI.parse(source).open(&:read)
    else
      # Plain path: read in binary mode; never goes through Kernel#open, so a
      # leading "|" cannot spawn a command.
      File.binread(source)
    end
  Base64.encode64(data)
end
# Fetches +search_link+ and parses the response body as HTML.
#
# Up to MAX_RETRY_TIMES attempts are made; every failed attempt prints the
# error message and is recorded in the @download_error log.
#
# Alternative using open-uri with explicit headers (kept for reference):
#   doc = Nokogiri::HTML(open(search_link,
#     "Cookie" => @cookie,
#     "User-Agent" => @agent,
#     "Referer" => "https://study.chinaedu.com/megrez/synchronous/list.do?gradeCode=0201&specialtyCode=02",
#     "Host" => "study.chinaedu.com",
#     :allow_redirections => :all
#   ))
#
# @param search_link [String] URL of the page to fetch
# @return [Nokogiri::HTML::Document, nil] the parsed page, or nil once all
#   attempts have failed
def get_doc(search_link)
  retry_times = 0
  begin
    # No block is passed on purpose: in block form the original called
    # follow_redirection unconditionally, which fails on an ordinary
    # non-redirect response. Without a block RestClient follows GET redirects
    # itself and raises on error statuses, which the rescue below handles.
    response = RestClient.get(search_link, :cookies => COOKIE)
    Nokogiri::HTML(response.body)
  rescue StandardError => e
    puts e.message
    retry_times += 1
    @download_error.error "download error: #{search_link}"
    retry if retry_times < MAX_RETRY_TIMES
    nil
  end
end
# Crawls BASE_URL + id for every id in +pages+ and appends the extracted
# records to result.yaml.
#
# Records are flushed after every tenth page and again after the final page,
# so nothing is lost when the last pages fail. Pages that cannot be fetched
# are logged to @no_doc, pages that cannot be parsed to @parse_error. A random
# 4-10 second pause separates consecutive requests.
#
# @param pages [Enumerable<Integer>] page ids to crawl
# @return [void]
def process(pages = 18_283..18_583)
  ids = pages.to_a
  result = []
  ids.each_with_index do |id, index|
    link = BASE_URL + id.to_s
    puts link

    record = scrape_page(link)
    result << record if record

    last_page = index == ids.size - 1
    if ((index + 1) % 10).zero? || last_page
      flush_results(result)
      result = []
    end
    # Pause after failures too; skip the pointless wait after the last page.
    sleep rand(4..10) unless last_page
  end
end

# Fetches one page and extracts its record; returns nil (after logging) when
# the page is unavailable or lacks the expected nodes.
def scrape_page(link)
  doc = get_doc(link)
  if doc.nil?
    @no_doc.error link
    return nil
  end

  begin
    {
      'ctg_one' => doc.css('.ttop h3 a')[0].text,
      # at_css returns the first matching node; indexing the NodeSet itself
      # with 'src' (as the original did) is not an attribute lookup.
      'ctg_two' => doc.at_css('img')['src']
    }
  rescue StandardError
    @parse_error.error link
    nil
  end
end

# Appends the batch to result.yaml as one YAML document; no-op for an empty batch.
def flush_results(records)
  return if records.empty?

  File.open("result.yaml", 'a+') { |f| YAML.dump(records, f) }
end
# Start the crawl only when this file is run as a script, so requiring it
# (e.g. to reuse the helpers) does not trigger network traffic.
process if __FILE__ == $PROGRAM_NAME
以上是关于 Ruby 爬虫模板的主要内容,如果未能解决你的问题,请参考以下文章: