亚马逊美国Lightning_Deals爬虫
Posted qiushi9
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了亚马逊美国Lightning_Deals爬虫相关的知识,希望对你有一定的参考价值。
包含秒杀进度、距离结束时间、当前时间、商品标题、翻译后的标题、品牌、品牌是否有先关的备案注册信息、ASIN、Date first listed on Amazon、star、review、rank
删除了较多注释,复制后能否直接运行请自行验证,代码仅供参考。
# -*- coding: utf-8 -*-
"""Amazon US Lightning Deals scraper.

For every deal on the goldbox listing page it records: claim progress,
time remaining, current time, title, translated title, brand, USPTO
trademark registration status, ASIN, "Date first listed on Amazon",
star rating, review count, sales rank and price, and appends one CSV
row per product to a per-page file.
"""

import csv
import json  # kept from the original file; not used in the visible code
import time

import redis
import requests
from bs4 import BeautifulSoup
from googletrans import Translator
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Local chromedriver binary (Windows install path).
CHROMEDRIVER_PATH = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
# Placeholder -- set the real Redis password before running.
REDIS_PASSWORD = 'XXXXX'

# Boilerplate CSS that Amazon embeds inside the sales-rank cell; it must be
# stripped from the extracted text before any whitespace collapsing.
_RANK_CSS = ('.zg_hrsr { margin: 0; padding: 0; list-style-type: none; }'
             '.zg_hrsr_item { margin: 0 0 0 10px; }'
             '.zg_hrsr_rank { display: inline-block; width: 80px; text-align: right; }')


def trademark(goods_brand):
    """Return the USPTO registration status for *goods_brand*.

    Statuses are cached in Redis; brands not cached as 'Registered' are
    (re-)checked by driving tmsearch.uspto.gov with headless Chrome.
    Returns '未知' (unknown), 'Registered' or 'unregistered'.
    """
    if goods_brand is None or goods_brand == 'null':
        return '未知'
    r = redis.Redis(host='127.0.0.1', port=6379, db=0,
                    decode_responses=True, password=REDIS_PASSWORD)
    redis_brand = r.get(goods_brand)
    if redis_brand == 'Registered':
        return redis_brand
    if redis_brand is None or redis_brand == 'unregistered':
        ff_option = Options()
        ff_option.add_argument('-headless')
        browser = webdriver.Chrome(CHROMEDRIVER_PATH, options=ff_option)
        try:
            browser.get('http://tmsearch.uspto.gov')
            browser.find_element_by_xpath(
                '/html/body/center/table[1]/tbody/tr[2]/td/font/font/a').click()
            # Type the brand into the query box and submit the search form.
            browser.find_element_by_name('p_s_PARA2').send_keys(goods_brand)
            browser.find_element_by_xpath(
                "//input[@onclick='changeCurlyQuote();']").click()
            # A short result page is the "TESS -- Error" page, i.e. no hits;
            # anything longer is taken as a registered mark.
            brand_register = ('Registered' if len(browser.page_source) > 920
                              else 'unregistered')
            r.set(goods_brand, brand_register)
        finally:
            # Always release the browser, even if USPTO errors out mid-query.
            browser.quit()
        return brand_register
    return redis_brand


def send_request(url, headers, proxies, session):
    """GET *url* through *session*, retrying until a response is obtained.

    verify=False because rotating proxies commonly break TLS verification.
    """
    while True:
        try:
            return session.get(url, headers=headers, proxies=proxies, verify=False)
        except Exception as exc:
            print(exc)
            print('失败,正在重新尝试。')


def _clean_rank_text(raw):
    """Strip Amazon's embedded CSS and rank labels from raw rank text.

    The label/CSS removal runs BEFORE whitespace collapsing -- the original
    stripped all spaces first, which made the subsequent replacements of
    space-containing strings no-ops.
    """
    text = raw.replace(_RANK_CSS, '')
    for label in ('Amazon Best Sellers Rank:', 'Amazon Bestsellers Rank: ',
                  'Best Sellers Rank'):
        text = text.replace(label, '')
    # Collapse the cell's heavy whitespace, keep a space before each '#N in …'.
    return ' '.join(text.split()).strip().replace('#', ' #').strip()


def rank(goods_soup):
    """Return the Best Sellers Rank text, or None when the page has none."""
    goods_rank_li = goods_soup.find('li', id='SalesRank')
    if goods_rank_li:
        return _clean_rank_text(goods_rank_li.text)
    table = goods_soup.find('table', id='productDetails_detailBullets_sections1')
    if table:
        for tr in table.find_all('tr'):
            if tr.find('th').text.strip().replace(' ', '') == 'BestSellersRank':
                return _clean_rank_text(tr.find('td').text)
    return None


def title(goods_soup):
    """Return the product title with NBSPs and commas removed, or 'null'."""
    span = goods_soup.find('span', id='productTitle')
    if span:
        return span.text.strip().replace('\xa0', '').replace(',', '')
    return 'null'


def brand(goods_soup):
    """Return the brand name from the byline or brand link, or 'null'."""
    byline = goods_soup.find('a', id='bylineInfo')
    if byline:
        return byline.text.strip().replace('\xa0', '')
    brand_a = goods_soup.find('a', id='brand')
    if brand_a:
        return brand_a.text.strip().replace('\xa0', '')
    return 'null'


def star(goods_soup):
    """Return the average star rating (e.g. '4.5'), or 'null'."""
    popover = goods_soup.find('span', id='acrPopover')
    if popover:
        return popover.find('i').text.split(' out of 5 stars')[0]
    return 'null'


def review(goods_soup):
    """Return the customer review count, or 'null'."""
    count_span = goods_soup.find('span', id='acrCustomerReviewText')
    if count_span:
        return count_span.text.split(' customer reviews')[0]
    return 'null'


def price(goods_soup):
    """Return the first available price without the '$' sign, or 'null'.

    Checked in the original's priority order:
    deal price -> buy-box price -> sale price -> list price.
    """
    for span_id in ('priceblock_dealprice', 'newBuyBoxPrice',
                    'priceblock_saleprice', 'priceblock_ourprice'):
        span = goods_soup.find('span', id=span_id)
        if span:
            return span.text.replace('$', '')
    return 'null'


def date(goods_soup):
    """Return the 'Date first listed on Amazon' value, or None if absent."""
    bullets = goods_soup.find('div', id='detailBullets_feature_div')
    if bullets:
        for li in bullets.find_all('li'):
            label = li.find('span', class_='a-text-bold')
            if label and label.text.strip() == 'Date first listed on Amazon:':
                return li.text.strip().replace(
                    'Date first listed on Amazon:', '').strip()
        return None
    table = goods_soup.find('table', id='productDetails_detailBullets_sections1')
    if table:
        for tr in table.find_all('tr'):
            if tr.find('th').text.strip().replace(' ', '') == 'DatefirstlistedonAmazon':
                return tr.find('td').text.replace(' ', '').strip()
    return None


def translator(title):
    """Translate *title* to Simplified Chinese via the google.cn mirror."""
    # NOTE(review): src='de' looks wrong for English Amazon US titles -- confirm.
    trans = Translator(service_urls=['translate.google.cn'])
    result = trans.translate('%s' % title, src='de', dest='zh-CN')
    # Use the Translated object's .text directly instead of parsing its repr().
    return result.text


def run(page):
    """Scrape one Lightning-Deals listing page and append rows to its CSV."""
    print('当前页码为:%s' % page)
    base_url = 'https://www.amazon.com/dp/'
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,image/apng,*/*;q=0.8',
        'cache-control': 'max-age=0',
        'authority': 'www.amazon.com',
    }
    ff_option = Options()
    ff_option.add_argument('-headless')
    # 'options=' replaces the deprecated 'chrome_options=' keyword.
    browser = webdriver.Chrome(CHROMEDRIVER_PATH, options=ff_option)
    try:
        browser.get(
            'https://www.amazon.com/gp/goldbox/ref=gbps_ftr_s-4_d724_page_'
            + str(page)
            + '?gb_f_deals1=dealStates:AVAILABLE%252CWAITLIST%252CWAITLISTFULL'
              '%252CEXPIRED%252CSOLDOUT%252CUPCOMING,page:' + str(page)
            + ',dealTypes:LIGHTNING_DEAL,dealsPerPage:48')
        # Give the JS-rendered deal grid time to load before reading it.
        time.sleep(10)
        page_source = browser.page_source
    finally:
        # Listing captured; release the browser before scraping the items.
        browser.quit()
    page_soup = BeautifulSoup(page_source, 'lxml')
    all_goods_div = page_soup.find('div', id='widgetContent').find_all(
        'div', class_='a-section dealContainer')
    print(len(all_goods_div))
    for goods_div in all_goods_div:
        session = requests.session()
        goods_info_list = []
        # Claim-progress bar, e.g. '44% Claimed' -> '44%'.
        schedule_div = goods_div.find(
            'div', 'a-column a-span5 a-text-left unitLineHeight')
        if schedule_div:
            schedule = (schedule_div.find('div', 'a-row unitLineHeight')
                        .text.strip().replace(' Claimed', '')
                        .replace('\xa0', '').replace('\xae', '')
                        .replace('\u2122', ''))
        else:
            schedule = 'null'
        # Countdown until the deal ends.
        timer = goods_div.find('span', role='timer')
        if timer:
            end_time = (timer.text.strip().replace('\xa0', '')
                        .replace('\xae', '').replace('\u2122', ''))
        else:
            end_time = 'null'
        now_time = time.strftime('%H:%M:%S', time.localtime(time.time()))
        dealtitle = goods_div.find('a', id='dealImage')
        if dealtitle is None:
            # No product link in this tile -- skip it instead of crashing.
            continue
        try:
            goods_asin = dealtitle['href'].split('dp/')[1].split('/')[0]
        except IndexError:
            # Link without a /dp/ segment is not a normal product deal.
            continue
        goods_url = base_url + goods_asin
        goods_html = session.get(goods_url, headers=headers)
        goods_soup = BeautifulSoup(goods_html.text, 'lxml')
        print('商品链接为:' + goods_url)
        goods_asin = goods_url.split('dp/')[1]
        goods_title = title(goods_soup)
        after_title = translator(goods_title)
        goods_brand = brand(goods_soup)
        goods_star = star(goods_soup)
        goods_review = review(goods_soup)
        goods_price = price(goods_soup)
        goods_rank = rank(goods_soup)
        goods_date = date(goods_soup)
        brand_register = trademark(goods_brand)
        if goods_date is None:
            goods_date = 'null'
        print('schedule:' + schedule)
        print('goods_title:' + goods_title)
        print('after_title:' + after_title)
        print('goods_asin:' + goods_asin)
        print('goods_brand:' + goods_brand)
        print('brand_register:' + brand_register)
        print('goods_date:' + str(goods_date))
        print('goods_star:' + goods_star)
        print('end_time:' + end_time)
        print('now_time:' + now_time)
        print('goods_review:' + goods_review)
        print('goods_rank:' + str(goods_rank))
        print('goods_price:' + goods_price)
        goods_info_list.extend([schedule, end_time, now_time, goods_title,
                                after_title, goods_brand, brand_register,
                                goods_asin, goods_date, goods_star,
                                goods_review, goods_rank, goods_price])
        print('=========================================')
        # newline='' keeps csv from inserting blank rows on Windows;
        # 'with' guarantees the file handle is closed even on error.
        with open('./Lightning_Deals_US/Lightning_Deals_US_%s.csv' % str(page),
                  'a', newline='', encoding='gb18030') as csv_file:
            csv.writer(csv_file).writerow(goods_info_list)


if __name__ == '__main__':
    # The goldbox listing spans pages 1..21.
    for i in range(1, 22):
        run(i)
以上是关于亚马逊美国Lightning_Deals爬虫的主要内容,如果未能解决你的问题,请参考以下文章