python3爬虫-通过selenium获取TB商品

Posted by zhuchunyu

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了“python3爬虫-通过selenium获取TB商品”相关的知识,希望对你有一定的参考价值。

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
from selenium.webdriver import ActionChains
import time, json,datetime
from lxml import etree

# Taobao member login page; the crawler opens this URL right after the browser starts.
TB_LOGIN_URL = "https://login.taobao.com/member/login.jhtml"


class MyException(Exception):
    """Crawler-specific error carrying a numeric status code and a message.

    Attributes:
        status: Numeric code identifying the failure; callers branch on it.
        msg: Human-readable description of the failure.
    """

    def __init__(self, status, msg):
        # Pass both values to the base class so ``args``, pickling and
        # tracebacks keep working the way they do for built-in exceptions.
        super().__init__(status, msg)
        self.status = status
        self.msg = msg

    def __str__(self):
        """Return ``"[status] msg"`` so logged errors are readable."""
        return "[{}] {}".format(self.status, self.msg)


class TBClass:
    """Selenium-driven Taobao crawler.

    Logs in through the Chrome browser, searches for goods and appends every
    parsed search result as one JSON object per line to ``tb-<goods>.json``.
    Progress and error messages are appended to ``error.log``.
    """

    # Seconds to pause after turning a result page before reading the next one.
    PAGE_DELAY_SECONDS = 20

    def __init__(self):
        self.browser = None
        self.wait = None
        self.ac = None
        # Output file of the goods currently being crawled (opened lazily).
        self.file = None
        self.log_file = open("error.log", "a", encoding="utf-8")

    def __log(self, message):
        """Append one line to the log file and flush it immediately."""
        self.log_file.write(message + "\n")
        self.log_file.flush()

    def __init_browser(self):
        """Start Chrome, open the login page and create the wait/action helpers."""
        options = Options()
        # Drops Chrome's "enable-automation" switch -- presumably to make the
        # automated browser less conspicuous to the site.
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        self.browser = webdriver.Chrome(options=options)
        self.browser.implicitly_wait(3)
        self.browser.maximize_window()
        self.browser.get(TB_LOGIN_URL)
        self.wait = WebDriverWait(self.browser, 10)
        self.ac = ActionChains(self.browser)

    def __switch_to_pwd_mode(self):
        """Switch the login form to username/password mode if it is not shown yet."""
        if not self.browser.find_element(By.ID, "J_Static2Quick").is_displayed():
            self.browser.find_element(By.ID, "J_Quick2Static").click()

    def __input_user(self, username):
        """Clear the username field and type ``username`` into it."""
        input_user_tag = self.browser.find_element(By.ID, "TPL_username_1")
        input_user_tag.clear()
        input_user_tag.send_keys(username)

    def __input_pwd(self, password):
        """Clear the password field and type ``password`` into it."""
        input_pwd_tag = self.browser.find_element(By.ID, "TPL_password_1")
        input_pwd_tag.clear()
        input_pwd_tag.send_keys(password)

    def __is_tag_exist_css(self, selector):
        """Return True if an element matching the CSS ``selector`` exists."""
        try:
            self.browser.find_element(By.CSS_SELECTOR, selector)
            return True
        except NoSuchElementException:
            return False

    def __is_tag_exist_xpath(self, xpath):
        """Return True if an element matching ``xpath`` exists."""
        try:
            self.browser.find_element(By.XPATH, xpath)
            return True
        except NoSuchElementException:
            return False

    def __lock_exist(self):
        """Return True if the slider-verification widget exists and is visible."""
        return (self.__is_tag_exist_css("#nc_1_wrapper")
                and self.browser.find_element(By.ID, "nc_1_wrapper").is_displayed())

    def __slide_tag(self):
        """Drag the verification slider to the right.

        Raises:
            MyException: status 10000 when the page shows a slide error.
        """
        bar_element = self.browser.find_element(By.ID, "nc_1_n1z")
        ActionChains(self.browser).drag_and_drop_by_offset(bar_element, 350, 0).perform()
        time.sleep(0.5)
        # Keep a screenshot of the state right after the slide for debugging.
        self.browser.get_screenshot_as_file("error.png")
        if self.__is_tag_exist_css(".errloading > span"):
            error_message = self.browser.find_element(By.CSS_SELECTOR, ".errloading > span").text
            # Reset the slider widget before reporting the failure.
            self.browser.execute_script("noCaptcha.reset(1)")
            raise MyException(10000, "滑动验证失败, message = " + error_message)

    def __submit(self):
        """Click the login button.

        Raises:
            MyException: status 10001 when the page shows a login error message.
        """
        self.browser.find_element(By.ID, "J_SubmitStatic").click()
        time.sleep(0.5)
        if self.__is_tag_exist_css("#J_Message"):
            error_message = self.browser.find_element(By.CSS_SELECTOR, "#J_Message > p").text
            raise MyException(10001, "登录出错, message = " + error_message)

    def login_tb(self, username, password):
        """Log in to a Taobao account.

        :param username: account name
        :param password: account password
        :raises MyException: when the slider verification or the login fails
        """
        self.__init_browser()
        self.__switch_to_pwd_mode()
        time.sleep(0.5)
        self.__input_user(username)
        time.sleep(0.5)
        self.__input_pwd(password)
        if self.__lock_exist():
            self.__slide_tag()
        self.__submit()

    def goto_Tb_Home(self):
        """Wait for the site navigation bar and click through to the Taobao home page."""
        self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "site-nav-menu-hd")))
        self.browser.find_element(By.LINK_TEXT, "淘宝网首页").click()

    def __search_goods(self, goods):
        """Type ``goods`` into the search box and submit it with ENTER.

        :param goods: search keyword
        """
        self.wait.until(EC.presence_of_element_located((By.ID, "q")))
        search_tag = self.browser.find_element(By.ID, "q")
        search_tag.clear()
        search_tag.send_keys(goods, Keys.ENTER)

    def __page_slide(self):
        """Scroll down by the body's client height and return the page source.

        Scrolling is done before reading the HTML so that more of the page
        content is available to the parser.
        """
        height = self.browser.execute_script("return document.body.clientHeight;")
        self.browser.execute_script("window.scrollTo(0, {});".format(height))
        return self.browser.page_source

    def __page_turing(self):
        """Click the pager link that leads to the next result page.

        The second element matching the pager selector is preferred; when
        only one matches, that one is clicked.

        NOTE(review): when a single link matches it may be the "previous"
        link on the last page -- confirm against the live page.

        Raises:
            MyException: status 10001 when no pager link exists.
        """
        pager_links = self.browser.find_elements(By.CSS_SELECTOR, ".J_Ajax.num.icon-tag")
        if not pager_links:
            raise MyException(10001, "商品访问完毕")
        next_tag = pager_links[1] if len(pager_links) > 1 else pager_links[0]
        next_tag.click()

    def __parse_html(self, html):
        """Yield one dict per search result parsed from ``html``.

        Each dict has the keys ``goods_name``, ``buy_people_num``,
        ``goods_price``, ``shop_name`` and ``shop_addr``.

        NOTE(review): the five columns are extracted independently and then
        zipped, so a result missing one field would shift the later rows --
        confirm the page always provides all five fields.
        """
        html_selector = etree.HTML(html)
        goods_name = html_selector.xpath("//div[@class='pic']//img/@alt")
        buy_people_num = html_selector.xpath("//div[@class='deal-cnt']/text()")
        goods_price = html_selector.xpath(
            "//div[contains(@class,'price') and contains(@class,'g_price') "
            "and contains(@class,'g_price-highlight')]/strong/text()")
        shop_name = html_selector.xpath("//div[@class='shop']/a/span[2]/text()")
        shop_addr = html_selector.xpath("//div[@class='location']/text()")

        for name, buyers, price, shop, addr in zip(
                goods_name, buy_people_num, goods_price, shop_name, shop_addr):
            yield {
                "goods_name": name,
                "buy_people_num": buyers,
                "goods_price": price,
                "shop_name": shop,
                "shop_addr": addr,
            }

    def __write_to_json(self, dic: dict):
        """Append ``dic`` as one JSON line to the current output file."""
        data_json = json.dumps(dic, ensure_ascii=False)
        self.file.write(data_json + "\n")

    def __request_and_submit_sms_code(self):
        """Request an SMS code, read it from stdin and submit it."""
        self.browser.find_element(By.ID, "J_GetCode").click()
        print("请打开手机获取验证码,并输入....")
        # Remove all whitespace from the typed code and send it as one string.
        code_verify = "".join(input("验证码>>:").split())
        self.browser.find_element(By.CSS_SELECTOR, ".ui-input-checkcode-new").send_keys(
            code_verify, Keys.ENTER)

    def login_verification(self):
        """Handle the SMS verification step that may follow a login.

        :return: True if a verification form was found and a code was
            submitted, False if no verification was requested.
        """
        time.sleep(5)
        iframes = self.browser.find_elements(By.TAG_NAME, "iframe")
        if not iframes:
            # No frame at all means there is no verification form to fill in.
            return False
        self.browser.switch_to.frame(iframes[0])
        try:
            if self.__is_tag_exist_xpath("//a[@class='ui-form-other']"):
                self.browser.find_element(By.CLASS_NAME, "ui-form-other").click()
                method_selector = ".ui-button-text.ui-button.ui-button-morange"
                self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, method_selector)))
                # Two buttons match; the second one selects SMS verification.
                self.browser.find_elements(By.CSS_SELECTOR, method_selector)[1].click()
                self.wait.until(EC.presence_of_element_located((By.ID, "J_GetCode")))
            elif not self.__is_tag_exist_xpath("//*[@id='J_GetCode']"):
                return False
            self.__request_and_submit_sms_code()
            return True
        finally:
            # Later steps look up elements of the top-level page, so leave the frame.
            self.browser.switch_to.default_content()

    def __crawl_goods(self, goods):
        """Crawl every result page of one keyword into ``tb-<goods>.json``.

        Returns when the pager has no further page or when turning the page
        fails; either way a line is written to the log file and the output
        file is closed.
        """
        self.__search_goods(goods)
        self.file = open("tb-{}.json".format(goods), "a", encoding="utf-8")
        try:
            page = 1
            while True:
                print("获取商品<{}>--第{}页数据".format(goods, page))
                for dic in self.__parse_html(self.__page_slide()):
                    self.__write_to_json(dic)
                try:
                    self.__page_turing()
                except Exception as error:
                    if isinstance(error, MyException) and error.status == 10001:
                        message = "{}----{}信息访问完毕".format(datetime.datetime.now(), goods)
                    else:
                        message = "{}----访问商品{}第{}页报错,错误信息为:{}".format(
                            datetime.datetime.now(), goods, page, str(error))
                    self.__log(message)
                    return
                time.sleep(self.PAGE_DELAY_SECONDS)
                page += 1
        finally:
            self.file.close()

    def search_goods(self, goods, pending_goods=None):
        """Crawl ``goods`` and then every keyword left in the pending queue.

        :param goods: first keyword to crawl
        :param pending_goods: list of further keywords, consumed from the
            front; defaults to the module-level ``goods_list`` if one exists
        """
        if pending_goods is None:
            # Backward compatibility: the script keeps its queue in a
            # module-level ``goods_list``.
            pending_goods = globals().get("goods_list", [])
        current = goods
        # Iterate rather than recurse, so a finished keyword never resumes a
        # page loop whose output file has already been closed.
        while True:
            self.__crawl_goods(current)
            try:
                current = pending_goods.pop(0)
            except IndexError:
                self.__log("{}----goods_list列表为空,爬取完毕".format(datetime.datetime.now()))
                return

    def close_browser(self):
        """Shut down the browser session; does nothing if none is running."""
        browser = getattr(self, "browser", None)
        if browser is None:
            return
        try:
            # quit() ends the whole driver session, not just the current window.
            browser.quit()
        finally:
            self.browser = None

    def __del__(self):
        """Release the output file, the log file and the browser, best effort."""
        for attr in ("file", "log_file"):
            handle = getattr(self, attr, None)
            if handle is not None:
                try:
                    handle.close()
                except Exception:
                    # A finalizer must not raise; nothing useful can be done here.
                    pass
        try:
            self.close_browser()
        except Exception:
            # The session may already be gone; ignore errors during teardown.
            pass


if __name__ == "__main__":

    # Queue of search keywords. It must stay a module-level name:
    # TBClass.search_goods pops its follow-up keywords from ``goods_list``.
    goods_list = ["水杯", "床上用品", "运动短裤"]
    username = ""     # Taobao account name
    password = ""     # Taobao account password
    try:
        goods = goods_list.pop(0)
    except IndexError:
        raise MyException(10005, "goods_list不能为空") from None
    tb = TBClass()

    try:
        try:
            tb.login_tb(username, password)
        except MyException as e:
            # One retry with a fresh browser; a second failure propagates.
            print(e.msg)
            print("自动重启浏览器中....")
            tb.close_browser()
            tb.login_tb(username, password)

        # With no verification step the script must navigate to the home page itself.
        if not tb.login_verification():
            tb.goto_Tb_Home()
        tb.search_goods(goods)
    finally:
        # Dropping the last reference triggers TBClass.__del__, which releases resources.
        del tb

以上是关于“python3爬虫-通过selenium获取TB商品”的主要内容,如果未能解决你的问题,请参考以下文章:

[Python3网络爬虫开发实战] 1.2.2-Selenium的安装

python爬虫之requests+selenium+BeautifulSoup

通过Python3+selenium自动测试网页

第一个爬虫练习 微信公众号。

小5聊Python3 使用selenium模块实现简单爬虫系列一

Python爬虫实战(3)selenium完成瀑布流数据爬取