[Python爬虫] 之十:Selenium +phantomjs抓取活动行中会议活动(多线程抓取)
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了[Python爬虫] 之十:Selenium +phantomjs抓取活动行中会议活动(多线程抓取)相关的知识,希望对你有一定的参考价值。
延续上个抓取活动行中会议活动的问题,上次使用是单线程的抓取,效率较低,现在使用多线程的抓取。
数据的抓取分为两个过程:首先获取每个关键字搜索结果对应的url和页数,保存在列表里面,这个过程用一个线程来实现(类似生产者);然后根据列表中已获取的url和页数,用多个线程抓取对应的数据(类似消费者)。
这样整个抓取过程共用了144.366188 秒,采用单线程来进行抓取要用大概184秒,这样大概节省了40秒
具体代码如下:
# coding=utf-8
import os
import re
from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
import IniFile
from selenium.webdriver.common.keys import Keys
import LogFile
from threading import Thread
from Queue import Queue
# Work list shared between the producer (GetUrl_Thread) and the consumer
# threads (ScrapyData_Thread); each entry is "<result url>_<page count>_<keyword>".
# NOTE: the original had a module-level `global` statement here, which is a
# no-op at module scope and has been dropped.
url_pageCount_keyword_list = []
# Producer thread: for each keyword, drives IE to run the site search and
# appends "<result url>_<page count>_<keyword>" to the shared work list.
class GetUrl_Thread(Thread):
    def __init__(self, IEDriverServer, keywordList, webSearchUrl, pageCountLable):
        '''
        :param IEDriverServer: path to IEDriverServer.exe,
                               e.g. C:\Program Files\Internet Explorer\IEDriverServer.exe
        :param keywordList: list of search keywords
        :param webSearchUrl: URL of the site's search page
        :param pageCountLable: XPath of the element holding the result count
        '''
        Thread.__init__(self)
        # keywords to search for
        self.keywordList = keywordList
        self.pageCountLable = pageCountLable
        self.urldriver = webdriver.Ie(IEDriverServer)
        self.wait = ui.WebDriverWait(self.urldriver, 20)
        self.urldriver.maximize_window()
        self.urldriver.get(webSearchUrl)

    def run(self):
        # let the search page finish loading
        time.sleep(3)
        try:
            for keyword in self.keywordList:
                if len(keyword) > 0:
                    # fill the search box via JS, then submit with RETURN
                    js = "var obj = document.getElementById('mainSearchTextbox');obj.value='" + keyword + "';"
                    self.urldriver.execute_script(js)
                    ss_elements = self.urldriver.find_element_by_id("mainSearchTextbox")
                    ss_elements.send_keys(Keys.RETURN)
                    time.sleep(5)
                    # strip the page index so consumers can append their own
                    current_url = self.urldriver.current_url.replace('pi=1', 'pi=')
                    try:
                        elements = self.urldriver.find_elements_by_xpath(self.pageCountLable)
                        # number of result pages to crawl: ceil(count / 10)
                        strCount = elements[0].text.encode('utf8')
                        pageCount = int(strCount) // 10
                        if int(strCount) % 10 > 0:
                            pageCount = pageCount + 1
                        url_pageCount_keyword_list.append(
                            current_url.encode('utf8') + '_' + str(pageCount) + '_' + keyword)
                    except Exception as e:
                        print(e)
        finally:
            # Defect fix: the original leaked the IE session whenever an
            # exception escaped the loop; always release the driver.
            self.urldriver.close()
            self.urldriver.quit()
# Consumer thread: scrapes one "<url>_<pageCount>_<keyword>" work item with
# PhantomJS and prints records dated after today whose title contains the keyword.
class ScrapyData_Thread(Thread):
    def __init__(self, url_pageCount_keyword, htmlLable, OriginalUrlLabel):
        '''
        :param url_pageCount_keyword: "<result url>_<page count>_<keyword>"
        :param htmlLable: XPath of each result record
        :param OriginalUrlLabel: XPath of the link inside each record
        '''
        Thread.__init__(self)
        parts = url_pageCount_keyword.split('_')
        # URL of the keyword's search results (page index appended later)
        self.current_url = parts[0]
        # number of result pages
        self.pageCount = int(parts[1])
        # the keyword itself
        self.keyword = parts[2]
        self.htmlLable = htmlLable
        self.OriginalUrlLabel = OriginalUrlLabel
        self.currentDate = time.strftime('%Y-%m-%d')
        self.datePattern = re.compile(r'\d{4}-\d{2}-\d{2}')
        self.driver = webdriver.PhantomJS()
        self.wait = ui.WebDriverWait(self.driver, 20)
        self.driver.maximize_window()

    def compareDate(self, dateLeft, dateRight):
        '''
        Compare two dates formatted as YYYY-MM-DD, e.g. 2017-03-04.
        :return: 1 if left > right, 0 if equal, -1 if left < right
        '''
        dls = dateLeft.split('-')
        drs = dateRight.split('-')
        if len(dls) > len(drs):
            return 1
        if int(dls[0]) == int(drs[0]) and int(dls[1]) == int(drs[1]) and int(dls[2]) == int(drs[2]):
            return 0
        if int(dls[0]) > int(drs[0]):
            return 1
        elif int(dls[0]) == int(drs[0]) and int(dls[1]) > int(drs[1]):
            return 1
        elif int(dls[0]) == int(drs[0]) and int(dls[1]) == int(drs[1]) and int(dls[2]) > int(drs[2]):
            return 1
        return -1

    def run(self):
        try:
            print('')
            print('关键字:%s ' % self.keyword)
            pageCount = self.pageCount
            recordCount = 0
            if pageCount > 0:
                pageIndex = 0
                while pageCount > 0:
                    url = self.current_url + str(pageIndex)
                    self.driver.get(url)
                    # give the page time to render
                    time.sleep(3)
                    pageCount = pageCount - 1
                    self.wait.until(lambda driver: self.driver.find_elements_by_xpath(self.htmlLable))
                    Elements = self.driver.find_elements_by_xpath(self.htmlLable)
                    # collect each record's original link
                    urlList = []
                    self.wait.until(lambda driver: self.driver.find_elements_by_xpath(self.OriginalUrlLabel))
                    hrefElements = self.driver.find_elements_by_xpath(self.OriginalUrlLabel)
                    for hrefe in hrefElements:
                        urlList.append(hrefe.get_attribute('href').encode('utf8'))
                    index = 0
                    strsplit = '\n------------------------------------------------------------------------------------\n'
                    # useful records found on this page
                    usefulCount = 0
                    for element in Elements:
                        txt = element.text.encode('utf8')
                        txts = txt.split('\n')
                        strDate = re.findall(self.datePattern, txt)
                        # keep records dated after today whose title contains the keyword
                        if len(strDate) > 0 and self.compareDate(strDate[0], self.currentDate) == 1 and \
                                txts[0].find(self.keyword) > -1:
                            print(' ')
                            print(txt)
                            print('活动链接:' + urlList[index])
                            print(strsplit)
                            strMessage = txt + "\n"
                            strMessage += '活动链接:' + urlList[index] + "\n"
                            strMessage += strsplit
                            strMessage = unicode(strMessage, 'utf8')  # NOTE: Python 2 only
                            # log.WriteLog(strMessage)
                            usefulCount = usefulCount + 1
                            recordCount = recordCount + 1
                        index = index + 1
                    pageIndex = pageIndex + 1
                    # stop paging as soon as a page yields nothing useful
                    if usefulCount == 0:
                        break
            print("共浏览了: %d 页数据" % self.pageCount)
            print("共抓取了: %d 个符合条件的活动记录" % recordCount)
        except Exception as e:
            print(e)
        finally:
            # Defect fix: the original leaked the PhantomJS process whenever an
            # exception was raised; always release the driver.
            self.driver.close()
            self.driver.quit()
if __name__ == '__main__':
    # Read scraping parameters from the config file next to the script.
    configfile = os.path.join(os.getcwd(), 'MeetingConfig.conf')
    cf = IniFile.ConfigFile(configfile)
    IEDriverServer = cf.GetValue("section", "IEDriverServer")
    os.environ["webdriver.ie.driver"] = IEDriverServer
    keyword = cf.GetValue("section", "keywords")
    keywordList = keyword.split(';')
    webSearchUrl = cf.GetValue("section", "webSearchUrl")
    pageCountLable = cf.GetValue("section", "pageCountLable")
    htmlLable = cf.GetValue("section", "htmlLable")
    OriginalUrlLabel = cf.GetValue("section", "OriginalUrlLabel")

    start = time.clock()  # NOTE: time.clock() is Python-2 era; removed in Python 3.8
    # Producer: fills url_pageCount_keyword_list with work items.
    turl = GetUrl_Thread(IEDriverServer, keywordList, webSearchUrl, pageCountLable)
    turl.start()

    # Consumers: one scraper thread per work item. Defect fix: the original
    # called t.join() immediately after t.start() inside the loop, which
    # serialized the scraping and defeated the multithreading; here every
    # scraper runs concurrently and all are joined at the end.
    scrapers = []
    while True:
        if len(url_pageCount_keyword_list) > 0:
            url_pageCount_keyword = url_pageCount_keyword_list.pop(0)
            t = ScrapyData_Thread(url_pageCount_keyword, htmlLable, OriginalUrlLabel)
            t.setDaemon(True)
            t.start()
            scrapers.append(t)
        elif turl.isAlive():
            # producer still searching; wait for more work
            time.sleep(1)
        else:
            break
    for t in scrapers:
        t.join()

    end = time.clock()
    print("整个过程用时间: %f 秒" % (end - start))
以上是关于[Python爬虫] 之十:Selenium +phantomjs抓取活动行中会议活动(多线程抓取)的主要内容,如果未能解决你的问题,请参考以下文章