ruby HDtracks Info Scraper
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了ruby HDtracks Info Scraper相关的知识,希望对你有一定的参考价值。
# Let's Scrape Some Info! (from hdtracks.com)
# User defines scope:
# -> scrape a specific label (by ID)
# -> crawl through a range of label IDs
# -> scrape a list of plaintext album names
# -> scrape the "new" section
# -> scrape a custom hdtracks URL
# --------------------------------------------
# ------------
# requirements
# ------------
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'mechanize' # don't technically need mechanize for its methods; it's just faster at downloading an html page than nokogiri
require 'google_drive'
# Running album counter. It starts at 1, is incremented once per scraped album,
# and is decremented once after the scraping loop before the total is printed.
$sum_total_of_albums = 1
# -------
# methods
# -------
# Timestamp used when naming the uploaded results file.
#
# @return [String] the current local time formatted as "MMDDYY@HH:MM"
def what_date_is_it
  Time.now.strftime("%m%d%y@%H:%M")
end
# Maps the menu choice stored in the global $user_option to the short name
# of the scrape mode.
#
# @return [String, nil] the mode name, or nil when $user_option is not 1-5
def scrape_type
  case $user_option
  when 1 then "label"
  when 2 then "new"
  when 3 then "albumList"
  when 4 then "labelList"
  when 5 then "customURL"
  end
end
# Picks one value out of a list of [format_label, url] pairs.
#
# The bounds check is made against the list that was actually passed in, not
# against the global $clean_format_array, so a shorter (or empty) list yields
# nil instead of raising NoMethodError.
#
# @param smart_format_array [Array<Array>] pairs of [format_label, url]
# @param metadata [String] "FORMAT_A", "URL_A", "FORMAT_B", "URL_B",
#   "FORMAT_C" or "URL_C"; the letter selects the pair (A = first) and the
#   prefix selects the label or the URL inside that pair
# @return [Object, nil] the requested value, or nil when the pair does not
#   exist or metadata is not one of the six known keys
def reformatter(smart_format_array, metadata)
  kind, slot = metadata.to_s.split("_", 2)
  column = { "FORMAT" => 0, "URL" => 1 }[kind]
  row = { "A" => 0, "B" => 1, "C" => 2 }[slot]
  return nil if column.nil? || row.nil?

  pair = Array(smart_format_array)[row]
  pair.nil? ? nil : pair[column]
end
# Extracts data from an already-parsed album page without further requests.
#
# @param page_source [#css] parsed document (a Nokogiri document in this script)
# @param url [String] URL of the page being scraped; substituted for any
#   bitrate option whose value is "#"
# @param metadata [String] "TITLE" or "FORMAT"
# @return [Array, nil]
#   "TITLE"  -> the text of the album-main-details div, split on runs of
#               three or more whitespace characters
#   "FORMAT" -> array of [format_label, url] pairs built from the
#               bitrateselect options
#   anything else -> nil
#
# Side effect: "FORMAT" also stores the digit-leading tokens of the
# product-available-format div (commas removed) in $clean_format_array.
def dumb_scrape(page_source, url, metadata)
  case metadata
  when "TITLE"
    page_source.css("div[class='album-main-details']").text.split(/\s\s\s+/)
  when "FORMAT"
    tokens = page_source.css("div[class='product-available-format']").text.split(/\s/)
    $clean_format_array = tokens.select { |token| token.match(/^\d/) }
                                .map { |token| token.gsub(",", "") }

    # Query the option list once instead of twice per iteration.
    options = page_source.css("select[name='bitrateselect'] option")
    format_hash = {}
    $clean_format_array.each_index do |index|
      option = options[index]
      # Fewer <option>s than format tokens: skip instead of crashing on nil.
      next if option.nil?

      label = "#{option.text.gsub('/', 'kHz/')}bit"
      link = option['value']
      format_hash[label] = (link == "#" ? url : link)
    end
    format_hash.to_a
  end
end
# Pulls a single field out of a parsed format-specific product page.
#
# @param page_source [#css] parsed document
# @param metadata [String] "PRICE", "LENGTH", "ID" or "UPC"
# @return [String, nil] the field as a string, or nil for an unknown key
def smart_scrape_field(page_source, metadata)
  case metadata
  when "PRICE"
    page_source.css("span[class='price']")[0].text.to_s
  when "LENGTH"
    page_source.css("table[id='album-table'] td")[1].text.to_s
  when "ID"
    page_source.css("input[type='hidden']")[0]["value"].to_s
  when "UPC"
    # The UPC is what remains of the cover-image URL once the bucket prefix
    # and the "_185.jpg" suffix are removed.
    image_src = page_source.css("p[class='product-image'] img").attribute('src').to_s
    image_src.gsub("http://s3.amazonaws.com/hdtrack_img/", "").gsub("_185.jpg", "")
  end
end

# Downloads +url+ and extracts the requested metadata from it.
#
# @param url [String, nil] page to fetch; nil short-circuits to nil
# @param metadata [String] "PRICE", "LENGTH", "ID", "UPC", or "ALL" for the
#   four fields joined by tabs in that order
# @return [String, nil] the scraped value; nil when url is nil or metadata is
#   not a known key
# @raise whatever Mechanize raises for failures other than an HTTP error
#   status (the original rescued Exception and then failed on e.page)
def smart_scrape(url, metadata)
  return nil if url.nil?

  begin
    page = Mechanize.new.get(url)
  rescue Mechanize::ResponseCodeError => e
    # As before, scrape the page attached to an HTTP error. Only this error
    # class carries a page, so nothing broader is rescued.
    page = e.page
  end
  page_source = Nokogiri::HTML(page.body)

  if metadata == "ALL"
    return %w[PRICE LENGTH ID UPC].map { |field| smart_scrape_field(page_source, field) }.join("\t")
  end

  smart_scrape_field(page_source, metadata)
end
# --------------------------------------------------------------
# populate user-defined list of titles in array: titles_for_url
# --------------------------------------------------------------
# Ask which scrape mode to run and keep asking until the answer is 1-5.
# The original pre-check `$user_option != (1..5)` compared an Integer with a
# Range and was therefore always true; the loop condition alone is enough.
menu = "1==label_scraper; 2==new_scraper; 3==album_names; 4==label_crawler; 5==custom_url"
$user_option = nil
until (1..5).include?($user_option)
  puts "Typo! Try again." unless $user_option.nil?
  puts menu
  answer = gets
  # gets returns nil at end of input; stop instead of failing on nil.chomp.
  abort "No input received; exiting." if answer.nil?
  $user_option = answer.chomp.to_i
end
# Reads +path+ and returns its lines with surrounding whitespace (including
# the trailing newline) removed; blank lines are skipped.
def read_scrape_list(path)
  File.readlines(path).map(&:strip).reject(&:empty?)
end

# Turns a plaintext album name into an hdtracks.com URL: every run of
# characters other than ASCII letters and digits becomes one dash, leading and
# trailing dashes are dropped, and the result is lowercased. Unlike the
# original `chop`, this does not eat the final character of a name that has no
# trailing newline.
def album_name_to_url(album_name)
  slug = album_name.gsub(/[^0-9a-z]+/i, ' ').strip.tr(' ', '-').downcase
  "http://www.hdtracks.com/#{slug}"
end

titles_for_url = []
case $user_option
when 1
  load './label_scraper.rb'
  titles_for_url = read_scrape_list('label_urls.txt')
when 2
  load './new_scraper.rb'
  titles_for_url = read_scrape_list('new_urls.txt')
when 3
  titles_for_url = read_scrape_list('album_names.txt').map { |name| album_name_to_url(name) }
when 4
  load './label_crawler.rb'
  titles_for_url = read_scrape_list('label_urls.txt')
when 5
  load './custom_url_scraper.rb'
  titles_for_url = read_scrape_list('custom_urls.txt')
end
# --------------------------------
# scrape HDtracks.com for metadata
# --------------------------------
# Progress bar header: one blank cell per album, closed by "|"; the second
# line is indented to the width of the "scraping album pages:" label.
print "scraping album pages:#{' ' * titles_for_url.length}|\n#{' ' * 21}"
agent = Mechanize.new
failed_urls = []
File.open('results.txt', 'w+') do |results|
  titles_for_url.each do |url|
    begin
      page = agent.get(url)
    rescue Mechanize::ResponseCodeError => e
      # One bad URL should not abort the whole run: remember it, mark it in
      # the progress bar and move on to the next album.
      failed_urls << "#{url.to_s.strip} (HTTP #{e.response_code})"
      print "x"
      next
    end
    page_source = Nokogiri::HTML(page.body)
    title_array = dumb_scrape(page_source, url, "TITLE")
    complete_format_array = dumb_scrape(page_source, url, "FORMAT").reverse # reverse so formats are listed higher-res to lower-res

    # When "Release Date:" sits at index 8 the fields after the artist are
    # one position earlier (presumably the page has no genre entry - confirm
    # against the metadata legend at the bottom of the file).
    third_index, fourth_index = title_array[8] == "Release Date:" ? [7, 9] : [8, 10]
    album_columns = [title_array[2], title_array[4], title_array[third_index], title_array[fourth_index]]

    slots = %w[A B C]
    format_urls = slots.map { |slot| reformatter(complete_format_array, "URL_#{slot}") }
    slots.zip(format_urls).each do |slot, format_url|
      # Row A is always written; rows B and C only when that format exists.
      next if format_url.nil? && slot != "A"

      row = album_columns + [
        reformatter(complete_format_array, "FORMAT_#{slot}"),
        smart_scrape(format_url, "ALL"),
        format_url
      ]
      results.puts row.join("\t")
    end
    $sum_total_of_albums += 1
    print "`"
  end
end
$sum_total_of_albums -= 1 # so this variable stays semantically correct
failed_urls.each { |failure| warn "\nSkipped #{failure}" }
# ----------------------
# output results to user
# ----------------------
puts "\nTotal number of albums: #{$sum_total_of_albums}"
# ------------------------------------------------------
# upload and convert results.txt to google spreadsheets
# ------------------------------------------------------
puts "Uploading results to Google Sheets..."
# Credentials can be supplied through the environment so they need not live in
# the source; the literals remain only as a backward-compatible fallback.
# NOTE(review): rotate this password and drop the fallback once callers set
# HDTRACKS_GOOGLE_USER / HDTRACKS_GOOGLE_PASSWORD.
google_user = ENV.fetch("HDTRACKS_GOOGLE_USER", "hdtrackstest123@gmail.com")
google_password = ENV.fetch("HDTRACKS_GOOGLE_PASSWORD", "hidef123")
session = GoogleDrive.login(google_user, google_password)
upload_title = "#{what_date_is_it()}_#{scrape_type()}"
file = session.upload_from_file("results.txt", upload_title, :content_type => "text/tab-separated-values")
folder = session.collection_by_title("HDtracks-Scrapes")
if folder
  folder.add(file)
else
  # collection_by_title yields nil when no such folder exists; the upload has
  # already succeeded, so report it rather than crash on nil.add.
  warn "Folder 'HDtracks-Scrapes' not found; the uploaded file was not moved into it."
end
puts "Done!"
=begin
METADATA LEGEND
----------------
Title == #{title_array[2]}
Artist == #{title_array[4]}
Label == #{title_array[8]} (or 7 if no genre)
Format A == #{reformatter(complete_format_array, "FORMAT_A")}
Format B == #{reformatter(complete_format_array, "FORMAT_B")}
URL A == #{url_a}
URL B == #{url_b}
-
ALL A (below) == #{smart_scrape(url_a, "ALL")}
ALL B (below) == #{smart_scrape(url_b, "ALL")}
Price A == #{smart_scrape(url_a, "PRICE")}
Price B == #{smart_scrape(url_b, "PRICE")}
UPC A == #{smart_scrape(url_a, "UPC")}
UPC B == #{smart_scrape(url_b, "UPC")}
ProdID A == #{smart_scrape(url_a, "ID")}
ProdID B == #{smart_scrape(url_b, "ID")}
Album Length A == #{smart_scrape(url_a, "LENGTH")}
Album Length B == #{smart_scrape(url_b, "LENGTH")}
=end
以上是关于ruby HDtracks Info Scraper的主要内容,如果未能解决你的问题,请参考以下文章
使用 ruby mp3info 从外部站点读取 mp3 ID3(不加载整个文件)
ruby 最简单的OmniAuth开发人员策略实现(OmniAuth :: Strategies :: Developer)。 http://www.rubydoc.info/github/intri