Question about how to make multiprocessing with python for crawling website pages

Posted: 2021-10-18 03:30:23

I have a question about how to set up my Python crawling code for multiprocessing. The figure below shows the flow I have in mind. The problem is that the spawned worker processes don't seem to take the URL list. Please tell me what you think the best solution is.
[image: diagram of the intended multiprocessing crawling flow]
```python
import csv
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from multiprocessing import Pool

start_time = time.time()

driver = webdriver.Chrome(executable_path='chromedriver')

# Login
driver.get('https://quasarzone.com/login?nextUrl=https://quasarzone.com/')
driver.find_element_by_name("login_id").send_keys("ID")
driver.find_element_by_name("password").send_keys("PW")
driver.find_element_by_xpath('//*[@id="frm"]/div/div[1]/p/a').click()
time.sleep(0.1)

all_urls = []
for i in range(1, 201):
    all_urls.append('https://quasarzone.com/bbs/qf_cmr?page={}'.format(i))

result = []

def next_page(urls):
    driver.get(urls)
    res = driver.page_source
    soup = BeautifulSoup(res, "html.parser", from_encoding='utf-8')

    data_name = soup.select('td:nth-child(4) > div > div')
    data_date = soup.select('td:nth-child(6) > span')
    data_title = soup.select('td:nth-child(3) > p > a')
    data_view = soup.select('td:nth-child(5) > span')

    for name, date, title, view in zip(data_name, data_date, data_title, data_view):
        result.append([name.get_text(), date.get_text(), title.get_text(), view.get_text()])

# Problem point!!
if __name__ == '__main__':
    with Pool(processes=4) as pool:
        pool.map(next_page, all_urls)
        pool.join()

f = open('crawling_review_quasarzone.csv', 'w', newline='', encoding='utf-8')
csv_writer = csv.writer(f)

header = ['name', 'date', 'title', 'view']
csv_writer.writerow(header)

for i in result:
    csv_writer.writerow(i)
f.close()

end_time = time.time()
spend_time = end_time - start_time
t = open('spending_time.txt', 'w')
t.write('total spending time: {} sec'.format(spend_time))
t.close()

driver.quit()
```
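From what I can tell, part of the trouble is that `result` and `driver` are globals in the parent process: each worker process gets its own copy, so rows appended in a child never reach the parent. A stripped-down sketch of the usual pattern (the `scrape` body here is a stand-in for my real parsing, not working code for the site):

```python
from multiprocessing import Pool

def scrape(url):
    # Stand-in for the real fetch + BeautifulSoup parsing of one page.
    return [[url, 'name', 'date', 'title', 'view']]

if __name__ == '__main__':
    urls = ['https://quasarzone.com/bbs/qf_cmr?page={}'.format(i) for i in range(1, 5)]
    with Pool(processes=4) as pool:
        pages = pool.map(scrape, urls)  # workers *return* rows; map collects them
    result = [row for page in pages for row in page]
    print(len(result))  # all rows gathered in the parent, per-page lists flattened
```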
Comments:
Does this answer your question? Python execute script using multiple browsers Selenium

Answer 1:

I solved it myself, though I don't think this is the best way to do it; maybe multithreading could be used together with multiprocessing. In any case, here is the code I came up with.
```python
import csv
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from concurrent.futures import ProcessPoolExecutor

board_name = 'cmr'

start_time = time.time()

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument("disable-gpu")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")

driver = webdriver.Chrome(executable_path='chromedriver', options=options)

driver.get('https://quasarzone.com/login?nextUrl=https://quasarzone.com/')
driver.find_element_by_name("login_id").send_keys("id")
driver.find_element_by_name("password").send_keys("pw")
driver.find_element_by_xpath('//*[@id="frm"]/div/div[1]/p/a').click()
time.sleep(0.1)

def next_page(pages):
    result = []
    for i in pages:
        driver.get('https://quasarzone.com/bbs/qf_{}?page={}'.format(board_name, i))
        time.sleep(5)
        res = driver.page_source
        soup = BeautifulSoup(res, "html.parser")

        data_name = soup.select('td:nth-child(4) > div > div')
        data_date = soup.select('td:nth-child(6) > span')
        data_title = soup.select('td:nth-child(3) > p > a')
        data_view = soup.select('td:nth-child(5) > span')

        for name, date, title, view in zip(data_name, data_date, data_title, data_view):
            result.append([name.get_text(), date.get_text(), title.get_text(), view.get_text()])

    f = open('quasarzone_{}.csv'.format(board_name), 'w', newline='', encoding='utf-8')
    csv_writer = csv.writer(f)

    header = ['name', 'date', 'title', 'view']
    csv_writer.writerow(header)

    for i in result:
        csv_writer.writerow(i)
    f.close()

def multiProcessing():
    page_threshold = 100
    number_process = 4
    pool = ProcessPoolExecutor(max_workers=number_process)

    process = []
    for i in range(number_process + 1):
        p = range(page_threshold * i, page_threshold * (i + 1))
        process.append(p)
    pool.map(next_page, process)

if __name__ == '__main__':
    multiProcessing()

    end_time = time.time()
    spend_time = end_time - start_time
    t = open('spending_time_{}.txt'.format(board_name), 'w')
    t.write('total spending time of {}: {:.2f} sec'.format(board_name, spend_time))
    t.close()
```
Comments:

This is not the best approach. First of all, this is something better suited to multithreading. And although you have created just one reusable selenium session per pool process here, which is good, there is no mechanism for quitting those sessions once all the pages have been processed; I suspect you will end up with chromium processes running in the background that never terminate. See my "duplicate" comment on your question.

It appears that the function `multiProcessing` rewrites the same output csv file over and over, overwriting the previous data with new data, and that it does so in parallel. Neither of those things seems right. Am I missing something?

Answer 2:
Here is how I would use a thread pool whose drivers get "quit" once all the pages have been processed. You could create a larger thread pool in which each thread handles a smaller range of pages, for greater concurrency.

What I don't understand is that your function `next_page` seems to rewrite the same csv file over and over, clobbering the previous contents, and that you do this in parallel with the other processes, which is bound to produce incorrect results. If you switch to threads, you will either need to write separate files, or serialize the writes by using a `threading.Lock` and opening the file in append mode, where only the main thread writes out the header row. Alternatively, have each submitted task return its rows to the main thread for writing.
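A minimal sketch of that locked, append-mode variant (the names `csv_lock`, `write_header`, and `append_rows` are illustrative, not taken from the code below):

```python
import csv
import threading

# One lock shared by all worker threads; it keeps interleaved writes out of the file.
csv_lock = threading.Lock()
CSV_PATH = 'quasarzone_cmr.csv'

def write_header():
    # Called once from the main thread, before any workers run.
    with open(CSV_PATH, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(['name', 'date', 'title', 'view'])

def append_rows(rows):
    # Called from worker threads: append under the lock, never rewrite.
    with csv_lock:
        with open(CSV_PATH, 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(rows)
```

Having tasks return their rows instead keeps all the file I/O in the main thread and needs no lock at all.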
I have also made other changes to the source to conform more closely to the PEP 8 Style Guide, and I renamed some variables and functions so that they better reflect what they represent. I also corrected what I believe were some logic errors, so look carefully at all the lines of code to make sure I haven't "over-corrected" something. Finally, I fixed a few minor English usage errors. Note that I could not run the code, since I do not have a user id and password.
```python
import csv
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import UnexpectedAlertPresentException
from concurrent.futures import ThreadPoolExecutor
import threading

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument("disable-gpu")
        options.add_argument("disable-infobars")
        options.add_argument("--disable-extensions")
        self.driver = webdriver.Chrome(executable_path='chromedriver', options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up

thread_local = threading.local()

def create_driver():
    the_driver = getattr(thread_local, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(thread_local, 'the_driver', the_driver)
        # Special initialization to login:
        driver = the_driver.driver
        driver.get('https://quasarzone.com/login?nextUrl=https://quasarzone.com/')
        driver.find_element_by_name("login_id").send_keys("id")
        driver.find_element_by_name("password").send_keys("pw")
        driver.find_element_by_xpath('//*[@id="frm"]/div/div[1]/p/a').click()
        # The following should be replaced by driver.implicitly_wait(3)
        # followed by a find for some element on the "successfully logged in" page:
        #time.sleep(0.1)
        try:
            driver.implicitly_wait(3)
            driver.find_elements_by_class_name('banner-area')
        except UnexpectedAlertPresentException:
            s = 'Invalid login credentials.'
            print(s)
            raise Exception(s)
    return the_driver.driver

board_name = 'cmr'

def next_pages(pages):
    driver = create_driver()
    result = []
    for page in pages:
        driver.get('https://quasarzone.com/bbs/qf_{}?page={}'.format(board_name, page))
        # What does the following accomplish?
        #time.sleep(5)
        res = driver.page_source
        soup = BeautifulSoup(res, "html.parser")

        data_name = soup.select('td:nth-child(4) > div > div')
        data_date = soup.select('td:nth-child(6) > span')
        data_title = soup.select('td:nth-child(3) > p > a')
        data_view = soup.select('td:nth-child(5) > span')

        for name, date, title, view in zip(data_name, data_date, data_title, data_view):
            result.append([name.get_text(), date.get_text(), title.get_text(), view.get_text()])

    # The following is questionable:
    with open('quasarzone_{}.csv'.format(board_name), 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        header = ['name', 'date', 'title', 'view']
        csv_writer.writerow(header)
        for row in result:
            csv_writer.writerow(row)

def process_pages():
    start_time = time.time()

    page_threshold = 100
    number_threads = 4
    # or, for example, page_threshold = 50; number_threads = 8
    pages_list = [range(page_threshold * i, page_threshold * (i + 1)) for i in range(number_threads)]

    with ThreadPoolExecutor(max_workers=number_threads) as pool:
        pool.map(next_pages, pages_list)
    # Using the previous "with" context handler results in an implicit call to pool.shutdown(True)
    # at this point to wait for all the submitted tasks to complete. Alternatively,
    # the return value from `pool.map` could be iterated to ensure completion of
    # all submitted tasks, e.g. list(pool.map(...))

    end_time = time.time()
    elapsed_time = end_time - start_time
    with open('elapsed_time_{}.txt'.format(board_name), 'w') as t:
        t.write('Total elapsed time of {}: {:.2f} sec'.format(board_name, elapsed_time))

process_pages()
# Quit the selenium drivers:
del thread_local
import gc
gc.collect()  # a little extra insurance
```