Question about how to make multiprocessing with Python for crawling website pages

Posted: 2021-10-18 03:30:23

【Problem description】:

I have a question about how to write crawling code in Python using multiprocessing. The image below shows the flow I have in mind, but the problem is that the worker processes cannot take the URL list. Please tell me what you think the best solution is.

[image: diagram of the multiprocessing flow the author has in mind]

import csv
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from multiprocessing import Pool

start_time = time.time()

driver = webdriver.Chrome(executable_path='chromedriver')

# Login
driver.get('https://quasarzone.com/login?nextUrl=https://quasarzone.com/')
driver.find_element_by_name("login_id").send_keys("ID")
driver.find_element_by_name("password").send_keys("PW")
driver.find_element_by_xpath('//*[@id="frm"]/div/div[1]/p/a').click()
time.sleep(0.1)


all_urls = []
for i in range(1, 201):
    all_urls.append('https://quasarzone.com/bbs/qf_cmr?page={}'.format(i))


result = []


def next_page(urls):
    driver.get(urls)
    res = driver.page_source
    soup = BeautifulSoup(res, "html.parser", from_encoding='utf-8')
    data_name = soup.select('td:nth-child(4) > div > div')
    data_date = soup.select('td:nth-child(6) > span')
    data_title = soup.select('td:nth-child(3) > p > a')
    data_view = soup.select('td:nth-child(5) > span')

    for name, date, title, view in zip(data_name, data_date, data_title, data_view):
        result.append([name.get_text(), date.get_text(), title.get_text(), view.get_text()])


# Problem point!!
if __name__ == '__main__':
    with Pool(processes=4) as pool:
        pool.map(next_page, all_urls)
        pool.join()


f = open('crawling_review_quasarzone.csv', 'w', newline='', encoding='utf-8')
csv_writer = csv.writer(f)

header = ['name', 'date', 'title', 'view']
csv_writer.writerow(header)

for i in result:
    csv_writer.writerow(i)
f.close()

end_time = time.time()
spend_time = end_time - start_time
t = open('spending_time.txt', 'w')
t.write('total spending time: {} sec'.format(spend_time))
t.close()

driver.quit()

【Question comments】:

Does this answer your question? Python execute script using multiple browsers Selenium

【Answer 1】:

I solved it myself, but I don't think this is the best approach. Maybe I could use multithreading together with multiprocessing. In any case, here is the code I ended up with.

import csv
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from concurrent.futures import ProcessPoolExecutor


board_name = 'cmr'

start_time = time.time()

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument("disable-gpu")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(executable_path='chromedriver', options=options)


driver.get('https://quasarzone.com/login?nextUrl=https://quasarzone.com/')
driver.find_element_by_name("login_id").send_keys("id")
driver.find_element_by_name("password").send_keys("pw")
driver.find_element_by_xpath('//*[@id="frm"]/div/div[1]/p/a').click()
time.sleep(0.1)


def next_page(pages):
    result = []
    for i in pages:
        driver.get('https://quasarzone.com/bbs/qf_{}?page={}'.format(board_name, i))
        time.sleep(5)

        res = driver.page_source
        soup = BeautifulSoup(res, "html.parser")
        data_name = soup.select('td:nth-child(4) > div > div')
        data_date = soup.select('td:nth-child(6) > span')
        data_title = soup.select('td:nth-child(3) > p > a')
        data_view = soup.select('td:nth-child(5) > span')

        for name, date, title, view in zip(data_name, data_date, data_title, data_view):
            result.append([name.get_text(), date.get_text(), title.get_text(), view.get_text()])

    f = open('quasarzone_{}.csv'.format(board_name), 'w', newline='', encoding='utf-8')
    csv_writer = csv.writer(f)

    header = ['name', 'date', 'title', 'view']
    csv_writer.writerow(header)

    for i in result:
        csv_writer.writerow(i)
    f.close()


def multiProcessing():
    page_threshold = 100
    number_process = 4
    pool = ProcessPoolExecutor(max_workers=number_process)

    process = []
    for i in range(number_process+1):
        p = range(page_threshold * i, page_threshold * (i+1))
        process.append(p)
    pool.map(next_page, process)


if __name__ == '__main__':
    multiProcessing()


end_time = time.time()
spend_time = end_time - start_time

t = open('spending_time_{}.txt'.format(board_name), 'w')
t.write('total spending time of {}: {:.2f} sec'.format(board_name, spend_time))
t.close()

【Discussion】:

Not the best approach. First of all, this is something better suited to multithreading. Although here you have created only one reusable Selenium session per pool process, which is fine, there is no mechanism for quitting those sessions once all the pages have been processed. I suspect you will end up with some Chrome background processes that never terminate. See my "duplicate" comment on your question.

It looks like the function multiProcessing is rewriting the same output CSV file over and over, overwriting the previous data with new data, and it is doing so in parallel. Neither of those things seems right. Am I missing something?

【Answer 2】:

Here is how I would use a thread pool of drivers that are "quit" once all the pages have been processed. You could create a larger thread pool in which each thread handles a smaller range of pages for greater concurrency.

What I don't understand is that your function next_page seems to rewrite the same CSV file over and over, destroying the previous contents, and you are doing this in parallel with other processes, which is bound to produce incorrect results. Switching to threading, you either need to write separate files, or you need to serialize access by using a threading.Lock and opening the file in append mode, where only the main thread writes out the header row. Alternatively, have each submitted task return its rows to the main thread, which does all the writing (a minimal sketch of that last variant appears after the code below).

I have also made other changes to the source code to conform more closely to the PEP 8 Style Guide and renamed some variables and functions so that they better reflect what they represent. I also corrected what I believe are some logic errors, so look carefully at all the lines of code to make sure I haven't "over-corrected" something. Finally, I corrected some minor English usage errors. Note that I could not run the code since I do not have a user ID and password.

import csv
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import UnexpectedAlertPresentException

from concurrent.futures import ThreadPoolExecutor
import threading

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument("disable-gpu")
        options.add_argument("disable-infobars")
        options.add_argument("--disable-extensions")
        self.driver = webdriver.Chrome(executable_path='chromedriver', options=options)

    def __del__(self):
        self.driver.quit() # clean up driver when we are cleaned up

thread_local = threading.local()

def create_driver():
    the_driver = getattr(thread_local, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(thread_local, 'the_driver', the_driver)
        # Special Initialization to login:
        driver = the_driver.driver
        driver.get('https://quasarzone.com/login?nextUrl=https://quasarzone.com/')
        driver.find_element_by_name("login_id").send_keys("id")
        driver.find_element_by_name("password").send_keys("pw")
        driver.find_element_by_xpath('//*[@id="frm"]/div/div[1]/p/a').click()
        # The following should be replaced by driver.implicitly_wait(3)
        # followed by a find for some element on the "successfully logged-in" page:
        #time.sleep(0.1)
        try:
            driver.implicitly_wait(3)
            driver.find_elements_by_class_name('banner-area')
        except UnexpectedAlertPresentException:
            s = 'Invalid login credentials.'
            print(s)
            raise Exception(s)
    return the_driver.driver

board_name = 'cmr'

def next_pages(pages):
    driver = create_driver()
    result = []
    for page in pages:
        driver.get('https://quasarzone.com/bbs/qf_{}?page={}'.format(board_name, page))
        # What does the following accomplish?
        #time.sleep(5)

        res = driver.page_source
        soup = BeautifulSoup(res, "html.parser")
        data_name = soup.select('td:nth-child(4) > div > div')
        data_date = soup.select('td:nth-child(6) > span')
        data_title = soup.select('td:nth-child(3) > p > a')
        data_view = soup.select('td:nth-child(5) > span')

        for name, date, title, view in zip(data_name, data_date, data_title, data_view):
            result.append([name.get_text(), date.get_text(), title.get_text(), view.get_text()])

    # The following is questionable:
    with open('quasarzone_{}.csv'.format(board_name), 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)

        header = ['name', 'date', 'title', 'view']
        csv_writer.writerow(header)

        for row in result:
            csv_writer.writerow(row)

def process_pages():
    start_time = time.time()

    page_threshold = 100
    number_threads = 4
    # or, for example, page_threshold = 50; number_threads = 8
    pages_list = [range(page_threshold * i, page_threshold * (i+1)) for i in range(number_threads)]
    with ThreadPoolExecutor(max_workers=number_threads) as pool:
        pool.map(next_pages, pages_list)
    # Using the previous "with" context handler results in an implicit call to pool.shutdown(True)
    # at this point to wait for all the submitted tasks to complete. Alternatively,
    # the return value from `pool.map` could be iterated to ensure completion of
    # all submitted tasks, e.g. list(pool.map(...))

    end_time = time.time()
    elapsed_time = end_time - start_time

    with open('elapsed_time_{}.txt'.format(board_name), 'w') as t:
        t.write('Total elapsed time of {}: {:.2f} sec'.format(board_name, elapsed_time))

process_pages()

# Quit the selenium drivers:
del thread_local
import gc
gc.collect() # a little extra insurance
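As a supplement to the CSV point above, here is a minimal sketch of the "return rows to the main thread" variant. The `scrape_pages` function is a hypothetical stand-in for a version of `next_pages` that returns its rows instead of writing a file itself (the real version would drive Selenium/BeautifulSoup exactly as in the code above); the output file name simply mirrors the naming used in the answer. Because only the main thread touches the file, no lock or append mode is needed.

import csv
from concurrent.futures import ThreadPoolExecutor

def scrape_pages(pages):
    # Hypothetical stand-in: in the real code this would fetch and parse
    # each page with Selenium/BeautifulSoup and append one row per post.
    rows = []
    for page in pages:
        rows.append(['name', 'date', 'title', str(page)])  # placeholder row
    return rows

def process_pages():
    page_threshold = 100
    number_threads = 4
    pages_list = [range(page_threshold * i, page_threshold * (i + 1))
                  for i in range(number_threads)]

    with ThreadPoolExecutor(max_workers=number_threads) as pool:
        # pool.map yields each task's list of rows in submission order.
        results = pool.map(scrape_pages, pages_list)

        # Only the main thread writes the CSV, so the header is written once
        # and no threading.Lock is required.
        with open('quasarzone_cmr.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['name', 'date', 'title', 'view'])
            for rows in results:
                writer.writerows(rows)

if __name__ == '__main__':
    process_pages()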

【Discussion】:

以上是关于关于如何使用python进行多处理来爬取网站页面的问题的主要内容,如果未能解决你的问题,请参考以下文章

如何使用Python获取股票分时成交数据?

通过python的urllib.request库来爬取一只猫

如何通过python调用新浪微博的API来爬取数据

scrapy中使用selenium来爬取页面

scrapy框架来爬取壁纸网站并将图片下载到本地文件中

Python骚操作!利用Python来爬取IP代理!偷偷给文章刷阅读量!