亚马逊美国Lightning_Deals爬虫

Posted qiushi9

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了亚马逊美国Lightning_Deals爬虫相关的知识,希望对你有一定的参考价值。

包含秒杀进度、距离结束时间、当前时间、商品标题、翻译后的标题、品牌、品牌是否有先关的备案注册信息、ASIN、Date first listed on Amazon、star、review、rank

删除了较多注释, 复制后能不能用随缘

import csv
import json
import time
import redis
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from googletrans import Translator


# import requests.packages.urllib3.util.ssl_
# requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'

def trademark(goods_brand):
    """Look up USPTO trademark registration status for a brand, caching in Redis.

    Args:
        goods_brand: Brand name string, or None / 'null' when unknown.

    Returns:
        '未知' for unknown brands, otherwise 'Registered' or 'unregistered'.
    """
    # Unknown brands short-circuit without touching Redis or Selenium.
    if goods_brand is None or goods_brand == 'null':
        return '未知'
    # NOTE(review): replace the placeholder password with the real Redis
    # password (the original had a bare undefined name XXXXX here).
    r = redis.Redis(host='127.0.0.1', port=6379, db=0,
                    decode_responses=True, password='XXXXX')
    redis_brand = r.get(goods_brand)
    if redis_brand == 'Registered':
        return redis_brand
    if redis_brand is None or redis_brand == 'unregistered':
        ff_option = Options()
        ff_option.add_argument('-headless')
        browser = webdriver.Chrome(
            'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe',
            options=ff_option)
        try:
            browser.get('http://tmsearch.uspto.gov')
            browser.find_element_by_xpath(
                '/html/body/center/table[1]/tbody/tr[2]/td/font/font/a').click()
            # Type the brand into the search box.
            browser.find_element_by_name("p_s_PARA2").send_keys(goods_brand)
            # Click the search button.
            browser.find_element_by_xpath(
                "//input[@onclick='changeCurlyQuote();']").click()
            # Heuristic: the "TESS -- Error" (no result) page is short, so a
            # long result page is treated as a registered trademark.
            register_html = browser.page_source
            if len(register_html) > 920:
                brand_register = 'Registered'
            else:
                brand_register = 'unregistered'
        finally:
            # Bug fix: always quit the browser, even if a lookup step raises.
            browser.quit()
        r.set(goods_brand, brand_register)
        return brand_register
    # Any other cached value is returned as-is (original fell off the end).
    return redis_brand



def send_request(url, headers, proxies, session):
    """GET *url* through *session*, retrying forever until a request succeeds.

    Args:
        url: Target URL.
        headers: HTTP header dict.
        proxies: Proxy mapping passed through to requests.
        session: A requests.Session (or compatible) object.

    Returns:
        The response object from the first successful request.
    """
    # Bug fix: the original had an unreachable `flag = True` after `continue`;
    # a plain retry loop expresses the intent directly.
    while True:
        try:
            # verify=False was a deliberate choice by the original author
            # (TLS verification disabled, presumably for proxy compatibility).
            return session.get(url, headers=headers, proxies=proxies, verify=False)
        except Exception as exc:
            print(exc)
            print('失败,正在重新尝试。')


def rank(goods_soup):
    """Extract the Best Sellers Rank text from a product detail page.

    Args:
        goods_soup: BeautifulSoup of the product page.

    Returns:
        Cleaned rank string with each "#N in ..." entry on its own line,
        or None when no rank section is found.
    """
    # Inline CSS Amazon embeds in the rank <li>; stripped from the text.
    css_noise = (
        ".zg_hrsr { margin: 0; padding: 0; list-style-type: none; }"
        ".zg_hrsr_item { margin: 0 0 0 10px; }"
        ".zg_hrsr_rank { display: inline-block; width: 80px; text-align: right; }"
    )

    def _clean(raw):
        # Drop newlines/CSS/labels, then break each "#" entry onto a new line.
        return (raw.replace("\n", '')
                   .replace(css_noise, "")
                   .replace("Amazon Best Sellers Rank:", '')
                   .replace("Amazon Bestsellers Rank: ", "")
                   .replace("Best Sellers Rank", "")
                   .strip()
                   .replace("#", '\n#'))

    goods_rank_li = goods_soup.find('li', id='SalesRank')
    if goods_rank_li:
        return _clean(goods_rank_li.text.strip())
    goods_rank_table = goods_soup.find('table', id='productDetails_detailBullets_sections1')
    if goods_rank_table:
        for tr in goods_rank_table.find_all('tr'):
            if tr.find('th').text.strip().replace(" ", '') == 'BestSellersRank':
                return _clean(tr.find('td').text)
    return None


def title(goods_soup):
    """Return the product title from #productTitle, or 'null' when missing.

    Strips non-breaking spaces and commas (commas would break the CSV row).
    """
    goods_title_span = goods_soup.find('span', id='productTitle')
    if goods_title_span:
        # Bug fix: "\xa0" had lost its backslash in the original ("xa0").
        return goods_title_span.text.strip().replace("\xa0", '').replace(",", '')
    return 'null'


def brand(goods_soup):
    """Return the brand name from #bylineInfo, falling back to #brand.

    Returns:
        Brand string with non-breaking spaces removed, or 'null' when
        neither element is present.
    """
    goods_brand_a = goods_soup.find('a', id='bylineInfo')
    if goods_brand_a:
        # Bug fix: "\xa0" had lost its backslash in the original ("xa0").
        return goods_brand_a.text.strip().replace("\xa0", '')
    try:
        return goods_soup.find('a', id='brand').text.strip().replace("\xa0", '')
    except AttributeError:
        # find() returned None — no brand element on the page.
        return 'null'


def star(goods_soup):
    """Return the star rating (e.g. '4.5') from #acrPopover, or 'null'."""
    goods_star_span = goods_soup.find('span', id='acrPopover')
    if goods_star_span:
        # The <i> text reads like "4.5 out of 5 stars"; keep only the number.
        return goods_star_span.find('i').text.split(" out of 5 stars")[0]
    return 'null'


def review(goods_soup):
    """Return the review count from #acrCustomerReviewText, or 'null'."""
    # (Typo fix: original local was named goods_review_sapn.)
    goods_review_span = goods_soup.find('span', id='acrCustomerReviewText')
    if goods_review_span:
        # Text reads like "123 customer reviews"; keep only the number part.
        return goods_review_span.text.split(" customer reviews")[0]
    return 'null'


def price(goods_soup):
    """Return the product price (without the '$' sign), or 'null'.

    Tries the price elements in the same priority order as the original
    nested if/else chain: deal price, buy-box price, sale price, list price.
    """
    for span_id in ('priceblock_dealprice', 'newBuyBoxPrice',
                    'priceblock_saleprice', 'priceblock_ourprice'):
        goods_price_span = goods_soup.find('span', id=span_id)
        if goods_price_span:
            return goods_price_span.text.replace("$", '')
    return 'null'


def date(goods_soup):
    """Return the 'Date first listed on Amazon' value from the detail section.

    Checks the bullet-list layout first, then the table layout (mirroring the
    original: when the bullet div exists but has no match, the table is NOT
    consulted).

    Returns:
        The date string, or None when not found (caller maps None to 'null').
    """
    goods_date_div = goods_soup.find('div', id='detailBullets_feature_div')
    if goods_date_div:
        for li in goods_date_div.find_all('li'):
            li_title_span = li.find('span', class_='a-text-bold')
            if li_title_span and li_title_span.text.strip() == 'Date first listed on Amazon:':
                return li.text.strip().replace("Date first listed on Amazon:", '').strip()
        return None
    goods_date_table = goods_soup.find('table', id='productDetails_detailBullets_sections1')
    if goods_date_table:
        for tr in goods_date_table.find_all('tr'):
            if tr.find('th').text.strip().replace(" ", '') == 'DatefirstlistedonAmazon':
                # Bug fix: "\n" had lost its backslash in the original.
                return tr.find('td').text.replace("\n", '').strip()
    return None


def translator(title):
    """Translate *title* to Simplified Chinese via Google Translate.

    Args:
        title: Source title string.

    Returns:
        The translated text.
    """
    # NOTE(review): src='de' (German) looks odd for amazon.com titles —
    # likely should be 'en'; kept as the original author wrote it.
    engine = Translator(service_urls=['translate.google.cn'])
    result = engine.translate('%s' % title, src='de', dest="zh-CN")
    # Bug fix: use the result's .text attribute instead of parsing the repr
    # with split("text=") — the repr format is not a stable interface.
    return result.text


def run(page):
    """Scrape one page of Amazon US Lightning Deals and append rows to a CSV.

    For each deal on the page: loads the product detail page, extracts
    progress/timer/title/brand/price/rank/date/etc., checks the brand's
    trademark status, and appends one row to
    ./Lightning_Deals_US/Lightning_Deals_US_<page>.csv (gb18030 encoded).

    Args:
        page: 1-based deals page number.
    """
    print("当前页码为:%s" % page)
    base_url = 'https://www.amazon.com/dp/'
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'upgrade-insecure-requests': '1',
        'user-agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'cache-control': 'max-age=0',
        'authority': 'www.amazon.com',
    }

    ff_option = Options()
    ff_option.add_argument('-headless')
    # Consistency fix: use 'options=' (the deprecated 'chrome_options=' was
    # used here while trademark() already used 'options='); path restored
    # with backslashes.
    browser = webdriver.Chrome(
        'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe',
        options=ff_option)
    browser.get(
        'https://www.amazon.com/gp/goldbox/ref=gbps_ftr_s-4_d724_page_' + str(page)
        + '?gb_f_deals1=dealStates:AVAILABLE%252CWAITLIST%252CWAITLISTFULL%252CEXPIRED%252CSOLDOUT%252CUPCOMING,page:'
        + str(page) + ',dealTypes:LIGHTNING_DEAL,dealsPerPage:48')
    time.sleep(10)  # wait for the JS-rendered deal grid to finish loading
    page_soup = BeautifulSoup(browser.page_source, 'lxml')
    # Bug fix: the original never quit the browser, leaking one Chrome
    # process per page.
    browser.quit()
    all_goods_div = page_soup.find('div', id='widgetContent').find_all(
        'div', class_='a-section dealContainer')
    print(len(all_goods_div))
    for goods_div in all_goods_div:
        # Claimed-percentage progress bar ("\xa0"/"\xae"/"\u2122" escapes
        # restored — the originals had lost their backslashes).
        schedule_div = goods_div.find('div', 'a-column a-span5 a-text-left unitLineHeight')
        if schedule_div:
            schedule = (schedule_div.find('div', 'a-row unitLineHeight').text.strip()
                        .replace(" Claimed", '').replace("\xa0", '')
                        .replace("\xae", '').replace("\u2122", ''))
        else:
            schedule = 'null'

        # Time remaining until the deal ends.
        timer = goods_div.find('span', role='timer')
        if timer:
            end_time = (timer.text.strip().replace("\xa0", '')
                        .replace("\xae", '').replace("\u2122", ''))
        else:
            end_time = 'null'
        now_time = time.strftime('%H:%M:%S', time.localtime(time.time()))

        dealtitle = goods_div.find('a', id='dealImage')
        # Extract the ASIN from the deal link; skip deals without a /dp/ URL.
        try:
            goods_asin = dealtitle['href'].split("dp/")[1].split('/')[0]
        except IndexError:
            continue
        goods_url = base_url + goods_asin
        goods_html = requests.get(goods_url, headers=headers)
        goods_soup = BeautifulSoup(goods_html.text, 'lxml')
        print("商品链接为:" + goods_url)
        goods_title = title(goods_soup)
        after_title = translator(goods_title)
        goods_brand = brand(goods_soup)
        goods_star = star(goods_soup)
        goods_review = review(goods_soup)
        goods_price = price(goods_soup)
        goods_rank = rank(goods_soup)
        goods_date = date(goods_soup)
        brand_register = trademark(goods_brand)
        if goods_date is None:
            goods_date = 'null'
        print("schedule:" + schedule)
        print("goods_title:" + goods_title)
        print("after_title:" + after_title)
        print("goods_asin:" + goods_asin)
        print("goods_brand:" + goods_brand)
        print("brand_register:" + brand_register)
        print("goods_date:" + str(goods_date))
        print("goods_star:" + goods_star)
        print("end_time:" + end_time)
        print("now_time:" + now_time)
        print("goods_review:" + goods_review)
        print("goods_rank:" + str(goods_rank))
        print("goods_price:" + goods_price)
        goods_info_list = [
            schedule, end_time, now_time, goods_title, after_title,
            goods_brand, brand_register, goods_asin, goods_date,
            goods_star, goods_review, goods_rank, goods_price,
        ]
        print('=========================================')
        # newline='' prevents blank lines between rows on Windows; `with`
        # guarantees the file is closed even if writerow raises.
        with open('./Lightning_Deals_US/Lightning_Deals_US_%s.csv' % str(page),
                  'a', newline='', encoding='gb18030') as csv_file:
            csv.writer(csv_file).writerow(goods_info_list)


if __name__ == '__main__':
    # The Lightning Deals listing spans pages 1 through 21.
    for page_number in range(1, 22):
        run(page_number)

技术分享图片

以上是关于亚马逊美国Lightning_Deals爬虫的主要内容,如果未能解决你的问题,请参考以下文章

亚马逊与美国国家安全委员会携手开展业界首创合作项目,以解决美国常见的工伤问题

深挖亚马逊卖家都好奇的爬虫技术!

爬虫实战带你一步步破解亚马逊 淘宝 京东的反爬虫机制

亚马逊成为美国第一大可再生能源企业买家

美国亚马逊图片打不开

亚马逊美国的运营中心,藏着这些小秘密