ruby HDtracks Info Scraper

Posted 2021-05-15
tags:
篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了ruby HDtracks Info Scraper相关的知识，希望对你有一定的参考价值。
# Let's Scrape Some Info! (from hdtracks.com)
# User defines scope:
# -> scrape a specific label (by ID)
# -> crawl through a range of label IDs
# -> scrape a list of plaintext album names 
# -> scrape the "new" section
# -> scrape a custom hdtracks URL
# --------------------------------------------


# ------------
# requirements
# ------------
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'mechanize' # don't technically need mechanize for its methods; it's just faster at downloading an html page than nokogiri
require 'google_drive'

# Running album counter reported to the user at the end of the run.
# It starts at 1 rather than 0: the scrape loop further down adds one per
# album and then subtracts one after the loop to compensate.
$sum_total_of_albums = 1

# -------
# methods
# -------

# Builds the timestamp that prefixes the name of the uploaded results file.
#
# Returns the current local time as a String shaped like "MMDDYY@HH:MM".
def what_date_is_it
  Time.now.strftime("%m%d%y@%H:%M")
end

# Maps the menu choice stored in $user_option to the short tag that is
# appended to the uploaded file's name.
#
# Returns one of "label", "new", "albumList", "labelList", "customURL",
# or nil when $user_option is not 1-5.
def scrape_type
  case $user_option
  when 1 then "label"
  when 2 then "new"
  when 3 then "albumList"
  when 4 then "labelList"
  when 5 then "customURL"
  end
end

# Picks one cell out of the [format_name, url] pairs produced by
# dumb_scrape(..., "FORMAT").
#
# smart_format_array - Array of [format_name, url] pairs.
# metadata           - "FORMAT_A", "URL_A", "FORMAT_B", "URL_B", "FORMAT_C"
#                      or "URL_C"; the letter selects the pair (A = first),
#                      the prefix selects the name or the url.
#
# Returns the requested String, or nil when the selector is unknown or the
# requested pair does not exist.
#
# The lookup is bounded by the array that was passed in rather than by the
# $clean_format_array global, so the method no longer raises when it is called
# before a FORMAT scrape, with an empty array, or with fewer pairs than the
# global claims.
def reformatter(smart_format_array, metadata)
  position = {
    "FORMAT_A" => [0, 0], "URL_A" => [0, 1],
    "FORMAT_B" => [1, 0], "URL_B" => [1, 1],
    "FORMAT_C" => [2, 0], "URL_C" => [2, 1]
  }[metadata]
  return nil if position.nil? || smart_format_array.nil?

  row, column = position
  pair = smart_format_array[row]
  pair && pair[column]
end

# Extracts data from an album page that has already been downloaded and parsed.
#
# page_source - parsed page; must respond to #css(selector).
# url         - URL the page was fetched from; substituted for any format
#               option whose value is "#".
# metadata    - "TITLE" or "FORMAT".
#
# Returns
#   "TITLE"  -> Array of Strings: the text of the album-main-details div split
#               on runs of three or more whitespace characters.
#   "FORMAT" -> Array of [format_name, url] pairs, one per bitrate <option>,
#               e.g. ["96kHz/24bit", "http://..."].
#   anything else -> nil.
#
# Side effect: a "FORMAT" scrape overwrites the $clean_format_array global with
# the digit-led tokens (commas removed) found in the product-available-format
# div.
def dumb_scrape(page_source, url, metadata)
  case metadata
  when "TITLE" # scrape for title, artist, label, alt_text
    page_source.css("div[class='album-main-details']").text.split(/\s\s\s+/)
  when "FORMAT" # scrape for formats a,b,c and urls a,b,c
    tokens = page_source.css("div[class='product-available-format']").text.split(/\s/)
    $clean_format_array = tokens.select { |token| token.match(/^\d/) }
                                .map { |token| token.gsub(",", "") }

    # Query the <option> nodes once instead of twice per loop iteration.
    options = page_source.css("select[name='bitrateselect'] option")
    format_hash = {}
    $clean_format_array.length.times do |index|
      option = options[index]
      # Fewer <option> nodes than advertised formats: skip instead of raising on nil.
      next if option.nil?

      format_name = "#{option.text.gsub("/", "kHz/")}bit"
      link = option['value']
      # A "#" value stands for the page that was scraped, so use its URL.
      format_hash[format_name] = link == "#" ? url : link
    end
    format_hash.to_a
  else
    nil
  end
end

# Downloads a format-specific product page and extracts details from it.
#
# url      - page to download; nil short-circuits to a nil result.
# metadata - "PRICE", "LENGTH", "ID", "UPC", or "ALL" (those four values
#            joined with tabs, in that order).
#
# Returns a String for the known selectors (an empty String stands in for a
# value whose element is missing from the page), or nil for a nil url or an
# unknown selector.
#
# Raises whatever Mechanize raises for failures other than an HTTP error
# status (the previous blanket `rescue Exception` turned those into a
# NoMethodError on `e.page`).
def smart_scrape(url, metadata)
  return nil if url.nil?

  begin
    page = Mechanize.new.get(url)
  rescue Mechanize::ResponseCodeError => e
    # An HTTP error status still carries a response page; parse that page,
    # as the original code did.
    page = e.page
  end
  page_source = Nokogiri::HTML(page.body)

  case metadata
  when "PRICE", "LENGTH", "ID", "UPC"
    smart_scrape_field(page_source, metadata)
  when "ALL"
    %w[PRICE LENGTH ID UPC].map { |field| smart_scrape_field(page_source, field) }.join("\t")
  else
    nil
  end
end

# Pulls a single value ("PRICE", "LENGTH", "ID" or "UPC") out of a parsed
# product page, returning "" when the element it lives in is absent.
def smart_scrape_field(page_source, field)
  case field
  when "PRICE"
    node = page_source.css("span[class='price']")[0]
    node ? node.text.to_s : ""
  when "LENGTH"
    # the value is read from the second cell of the album table
    node = page_source.css("table[id='album-table'] td")[1]
    node ? node.text.to_s : ""
  when "ID"
    node = page_source.css("input[type='hidden']")[0]
    node ? node["value"].to_s : ""
  when "UPC"
    # the value is whatever remains of the cover-image URL once the bucket
    # prefix and the "_185.jpg" suffix are removed
    (page_source.css("p[class='product-image'] img").attribute('src')).to_s.gsub("http://s3.amazonaws.com/hdtrack_img/", "").gsub("_185.jpg", "")
  end
end


# --------------------------------------------------------------
# populate user-defined list of titles in array: titles_for_url
# --------------------------------------------------------------

# Ask which scrape mode to run and keep asking until the answer is 1-5.
# The choice is stored in $user_option for the rest of the script.
menu_prompt = "1==label_scraper; 2==new_scraper; 3==album_names; 4==label_crawler; 5==custom_url"
puts menu_prompt
loop do
  answer = gets
  # gets returns nil once input is exhausted; bail out instead of raising on
  # nil.chomp.
  abort "No input received; exiting." if answer.nil?

  # non-numeric input becomes 0 via to_i and is rejected by the range check
  $user_option = answer.chomp.to_i
  break if (1..5).include?($user_option)

  puts "Typo! Try again."
  puts menu_prompt
end

# Reads one entry per line from the file at +path+.
# Surrounding whitespace (including the line terminator) is removed and blank
# lines are dropped, so every returned entry can be used as-is.
def read_scrape_list(path)
  File.readlines(path).map { |line| line.strip }.reject { |line| line.empty? }
end

# Converts a plaintext album name into the hdtracks.com URL to try for it:
# every run of characters other than ASCII letters/digits becomes a single
# hyphen, leading/trailing hyphens are removed, and the result is downcased.
def album_title_to_url(title)
  slug = title.gsub(/[^0-9a-z]+/i, '-').gsub(/\A-+|-+\z/, '').downcase
  "http://www.hdtracks.com/#{slug}"
end

# Build the list of page URLs to scrape for the chosen mode. Modes 1, 2, 4 and
# 5 first run a helper script via `load` and then read a URL list file; mode 3
# derives URLs from album_names.txt.
titles_for_url = []
case $user_option
when 1
  load './label_scraper.rb'
  titles_for_url = read_scrape_list('label_urls.txt')
when 2
  load './new_scraper.rb'
  titles_for_url = read_scrape_list('new_urls.txt')
when 3
  titles_for_url = read_scrape_list('album_names.txt').map { |title| album_title_to_url(title) }
when 4
  load './label_crawler.rb'
  titles_for_url = read_scrape_list('label_urls.txt')
when 5
  load './custom_url_scraper.rb'
  titles_for_url = read_scrape_list('custom_urls.txt')
end

# --------------------------------
# scrape HDtracks.com for metadata
# --------------------------------
# Progress header: one blank per album between the caption and the closing
# "|", then 21 blanks (the caption's width) so the ticks printed below line up
# under the bar.
puts "scraping album pages:#{' ' * titles_for_url.length}|"
print ' ' * 21

agent = Mechanize.new
File.open('results.txt', 'w+') do |results|
  titles_for_url.each do |listed_url|
    # Entries read from the list files may still carry their line terminator;
    # without this it would leak into the URL column of the output.
    url = listed_url.strip

    begin
      page = agent.get(url)
    rescue Mechanize::ResponseCodeError => e
      # One bad page should not abort the whole batch.
      warn "\nskipping #{url}: #{e.message}"
      next
    end
    page_source = Nokogiri::HTML(page.body)

    title_array = dumb_scrape(page_source, url, "TITLE")
    complete_format_array = dumb_scrape(page_source, url, "FORMAT").reverse # reverse so formats are listed higher-res to lower-res

    # Two page layouts are handled: when "Release Date:" sits at index 8 the
    # third and fourth columns come from indices 7 and 9, otherwise from 8 and
    # 10. (Per the legend at the bottom of this file the third column is the
    # label; the fourth is presumably the release date.)
    third_index, fourth_index = title_array[8] == "Release Date:" ? [7, 9] : [8, 10]
    album_columns = [title_array[2], title_array[4], title_array[third_index], title_array[fourth_index]]

    %w[A B C].each do |slot|
      format_url = reformatter(complete_format_array, "URL_#{slot}")
      # The first format row is always written; B and C only when they exist.
      next if format_url.nil? && slot != "A"

      row = album_columns + [
        reformatter(complete_format_array, "FORMAT_#{slot}"),
        smart_scrape(format_url, "ALL"),
        format_url
      ]
      results.puts row.join("\t")
    end

    $sum_total_of_albums += 1
    print "`"
  end
end
$sum_total_of_albums -= 1 # so this variable stays semantically correct


# ----------------------
# output results to user
# ----------------------
puts "\nTotal number of albums: #{$sum_total_of_albums}"


# ------------------------------------------------------
# upload and convert results.txt to google spreadsheets 
# ------------------------------------------------------
# FIXME: account credentials are committed in this file. They can now be
# overridden with the HDTRACKS_GOOGLE_USER / HDTRACKS_GOOGLE_PASSWORD
# environment variables; the literals remain only as a backward-compatible
# fallback and should be removed (and the password rotated).
# NOTE(review): newer google_drive releases appear to have dropped
# GoogleDrive.login in favour of GoogleDrive::Session -- confirm against the
# installed gem version.
puts "Uploading results to Google Sheets..."
google_user = ENV.fetch("HDTRACKS_GOOGLE_USER", "hdtrackstest123@gmail.com")
google_password = ENV.fetch("HDTRACKS_GOOGLE_PASSWORD", "hidef123")
session = GoogleDrive.login(google_user, google_password)
# The uploaded file is named "<timestamp>_<scrape type>" and sent as TSV.
upload_title = "#{what_date_is_it()}_#{scrape_type()}"
file = session.upload_from_file("results.txt", upload_title, :content_type => "text/tab-separated-values")
folder = session.collection_by_title("HDtracks-Scrapes")
folder.add(file)
puts "Done!"





=begin
METADATA LEGEND
----------------
Title == #{title_array[2]}
Artist == #{title_array[4]}
Label == #{title_array[8]} (or 7 if no genre)
Format A == #{reformatter(complete_format_array, "FORMAT_A")}
Format B == #{reformatter(complete_format_array, "FORMAT_B")}
URL A == #{url_a}
URL B == #{url_b}
-
ALL A (below) == #{smart_scrape(url_a, "ALL")}
ALL B (below) == #{smart_scrape(url_b, "ALL")} 
Price A == #{smart_scrape(url_a, "PRICE")}
Price B == #{smart_scrape(url_b, "PRICE")}
UPC A == #{smart_scrape(url_a, "UPC")}
UPC B == #{smart_scrape(url_b, "UPC")}
ProdID A == #{smart_scrape(url_a, "ID")}
ProdID B == #{smart_scrape(url_b, "ID")}
Album Length A == #{smart_scrape(url_a, "LENGTH")}
Album Length B == #{smart_scrape(url_b, "LENGTH")}
=end
以上是关于ruby HDtracks Info Scraper的主要内容，如果未能解决你的问题，请参考以下文章