[Python爬虫] 之十:Selenium +phantomjs抓取活动行中会议活动(多线程抓取)
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了[Python爬虫] 之十:Selenium +phantomjs抓取活动行中会议活动(多线程抓取)相关的知识,希望对你有一定的参考价值。
延续上个抓取活动行中会议活动的问题,上次使用是单线程的抓取,效率较低,现在使用多线程的抓取。
数据的抓取分为两个过程:首先获取每个关键字搜索结果对应的url和页数,保存在列表里面,这个过程用一个线程来实现(类似生产者);然后根据列表中已获取的url和页数,用多个线程抓取对应的数据(类似消费者)。
这样整个抓取过程共用了144.366188 秒,采用单线程来进行抓取要用大概184秒,这样大概节省了40秒
具体代码如下:
# coding=utf-8
import os
import re
from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
import IniFile
from selenium.webdriver.common.keys import Keys
import LogFile
from threading import Thread
from Queue import Queue
# Work list shared between the producer (GetUrl_Thread) and the consumer
# threads (ScrapyData_Thread); each entry is "<result url>_<page count>_<keyword>".
# NOTE: the original had a module-level `global` statement here, which is a
# no-op at module scope and has been dropped.
url_pageCount_keyword_list = []
# Producer thread: for each keyword, drives IE to run the site search and
# appends "<result url>_<page count>_<keyword>" to the shared work list.
class GetUrl_Thread(Thread):
    def __init__(self, IEDriverServer, keywordList, webSearchUrl, pageCountLable):
        '''
        :param IEDriverServer: path to IEDriverServer.exe,
                               e.g. C:\Program Files\Internet Explorer\IEDriverServer.exe
        :param keywordList: list of search keywords
        :param webSearchUrl: URL of the site's search page
        :param pageCountLable: XPath of the element holding the result count
        '''
        Thread.__init__(self)
        # keywords to search for
        self.keywordList = keywordList
        self.pageCountLable = pageCountLable
        self.urldriver = webdriver.Ie(IEDriverServer)
        self.wait = ui.WebDriverWait(self.urldriver, 20)
        self.urldriver.maximize_window()
        self.urldriver.get(webSearchUrl)

    def run(self):
        # let the search page finish loading
        time.sleep(3)
        try:
            for keyword in self.keywordList:
                if len(keyword) > 0:
                    # fill the search box via JS, then submit with RETURN
                    js = "var obj = document.getElementById('mainSearchTextbox');obj.value='" + keyword + "';"
                    self.urldriver.execute_script(js)
                    ss_elements = self.urldriver.find_element_by_id("mainSearchTextbox")
                    ss_elements.send_keys(Keys.RETURN)
                    time.sleep(5)
                    # strip the page index so consumers can append their own
                    current_url = self.urldriver.current_url.replace('pi=1', 'pi=')
                    try:
                        elements = self.urldriver.find_elements_by_xpath(self.pageCountLable)
                        # number of result pages to crawl: ceil(count / 10)
                        strCount = elements[0].text.encode('utf8')
                        pageCount = int(strCount) // 10
                        if int(strCount) % 10 > 0:
                            pageCount = pageCount + 1
                        url_pageCount_keyword_list.append(
                            current_url.encode('utf8') + '_' + str(pageCount) + '_' + keyword)
                    except Exception as e:
                        print(e)
        finally:
            # Defect fix: the original leaked the IE session whenever an
            # exception escaped the loop; always release the driver.
            self.urldriver.close()
            self.urldriver.quit()
# Consumer thread: scrapes one "<url>_<pageCount>_<keyword>" work item with
# PhantomJS and prints records dated after today whose title contains the keyword.
class ScrapyData_Thread(Thread):
    def __init__(self, url_pageCount_keyword, htmlLable, OriginalUrlLabel):
        '''
        :param url_pageCount_keyword: "<result url>_<page count>_<keyword>"
        :param htmlLable: XPath of each result record
        :param OriginalUrlLabel: XPath of the link inside each record
        '''
        Thread.__init__(self)
        parts = url_pageCount_keyword.split('_')
        # URL of the keyword's search results (page index appended later)
        self.current_url = parts[0]
        # number of result pages
        self.pageCount = int(parts[1])
        # the keyword itself
        self.keyword = parts[2]
        self.htmlLable = htmlLable
        self.OriginalUrlLabel = OriginalUrlLabel
        self.currentDate = time.strftime('%Y-%m-%d')
        self.datePattern = re.compile(r'\d{4}-\d{2}-\d{2}')
        self.driver = webdriver.PhantomJS()
        self.wait = ui.WebDriverWait(self.driver, 20)
        self.driver.maximize_window()

    def compareDate(self, dateLeft, dateRight):
        '''
        Compare two dates formatted as YYYY-MM-DD, e.g. 2017-03-04.
        :return: 1 if left > right, 0 if equal, -1 if left < right
        '''
        dls = dateLeft.split('-')
        drs = dateRight.split('-')
        if len(dls) > len(drs):
            return 1
        if int(dls[0]) == int(drs[0]) and int(dls[1]) == int(drs[1]) and int(dls[2]) == int(drs[2]):
            return 0
        if int(dls[0]) > int(drs[0]):
            return 1
        elif int(dls[0]) == int(drs[0]) and int(dls[1]) > int(drs[1]):
            return 1
        elif int(dls[0]) == int(drs[0]) and int(dls[1]) == int(drs[1]) and int(dls[2]) > int(drs[2]):
            return 1
        return -1

    def run(self):
        try:
            print('')
            print('关键字:%s ' % self.keyword)
            pageCount = self.pageCount
            recordCount = 0
            if pageCount > 0:
                pageIndex = 0
                while pageCount > 0:
                    url = self.current_url + str(pageIndex)
                    self.driver.get(url)
                    # give the page time to render
                    time.sleep(3)
                    pageCount = pageCount - 1
                    self.wait.until(lambda driver: self.driver.find_elements_by_xpath(self.htmlLable))
                    Elements = self.driver.find_elements_by_xpath(self.htmlLable)
                    # collect each record's original link
                    urlList = []
                    self.wait.until(lambda driver: self.driver.find_elements_by_xpath(self.OriginalUrlLabel))
                    hrefElements = self.driver.find_elements_by_xpath(self.OriginalUrlLabel)
                    for hrefe in hrefElements:
                        urlList.append(hrefe.get_attribute('href').encode('utf8'))
                    index = 0
                    strsplit = '\n------------------------------------------------------------------------------------\n'
                    # useful records found on this page
                    usefulCount = 0
                    for element in Elements:
                        txt = element.text.encode('utf8')
                        txts = txt.split('\n')
                        strDate = re.findall(self.datePattern, txt)
                        # keep records dated after today whose title contains the keyword
                        if len(strDate) > 0 and self.compareDate(strDate[0], self.currentDate) == 1 and \
                                txts[0].find(self.keyword) > -1:
                            print(' ')
                            print(txt)
                            print('活动链接:' + urlList[index])
                            print(strsplit)
                            strMessage = txt + "\n"
                            strMessage += '活动链接:' + urlList[index] + "\n"
                            strMessage += strsplit
                            strMessage = unicode(strMessage, 'utf8')  # NOTE: Python 2 only
                            # log.WriteLog(strMessage)
                            usefulCount = usefulCount + 1
                            recordCount = recordCount + 1
                        index = index + 1
                    pageIndex = pageIndex + 1
                    # stop paging as soon as a page yields nothing useful
                    if usefulCount == 0:
                        break
            print("共浏览了: %d 页数据" % self.pageCount)
            print("共抓取了: %d 个符合条件的活动记录" % recordCount)
        except Exception as e:
            print(e)
        finally:
            # Defect fix: the original leaked the PhantomJS process whenever an
            # exception was raised; always release the driver.
            self.driver.close()
            self.driver.quit()
if __name__ == '__main__':
    # Read scraping parameters from the config file next to the script.
    configfile = os.path.join(os.getcwd(), 'MeetingConfig.conf')
    cf = IniFile.ConfigFile(configfile)
    IEDriverServer = cf.GetValue("section", "IEDriverServer")
    os.environ["webdriver.ie.driver"] = IEDriverServer
    keyword = cf.GetValue("section", "keywords")
    keywordList = keyword.split(';')
    webSearchUrl = cf.GetValue("section", "webSearchUrl")
    pageCountLable = cf.GetValue("section", "pageCountLable")
    htmlLable = cf.GetValue("section", "htmlLable")
    OriginalUrlLabel = cf.GetValue("section", "OriginalUrlLabel")

    start = time.clock()  # NOTE: time.clock() is Python-2 era; removed in Python 3.8
    # Producer: fills url_pageCount_keyword_list with work items.
    turl = GetUrl_Thread(IEDriverServer, keywordList, webSearchUrl, pageCountLable)
    turl.start()

    # Consumers: one scraper thread per work item. Defect fix: the original
    # called t.join() immediately after t.start() inside the loop, which
    # serialized the scraping and defeated the multithreading; here every
    # scraper runs concurrently and all are joined at the end.
    scrapers = []
    while True:
        if len(url_pageCount_keyword_list) > 0:
            url_pageCount_keyword = url_pageCount_keyword_list.pop(0)
            t = ScrapyData_Thread(url_pageCount_keyword, htmlLable, OriginalUrlLabel)
            t.setDaemon(True)
            t.start()
            scrapers.append(t)
        elif turl.isAlive():
            # producer still searching; wait for more work
            time.sleep(1)
        else:
            break
    for t in scrapers:
        t.join()

    end = time.clock()
    print("整个过程用时间: %f 秒" % (end - start))
以上是关于[Python爬虫] 之十:Selenium +phantomjs抓取活动行中会议活动(多线程抓取)的主要内容,如果未能解决你的问题,请参考以下文章