如何使Python多处理池工作以写入相同的日志文件

Question

我正在尝试在运行Python多处理池时将日志写入相同的日志文件。但我发现我的主要方法不起作用。任何人都可以检查我的代码？谢谢。

以下主要方法verify_html_file_content未在多处理池中执行。我正在尝试将日志写入同一个日志文件，日志文件是传递给main方法verify_html_file_content的参数：

import codecs
import datetime
import os

import multiprocessing

import requests
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

css_xpath = "//link"
js_xpath = "//script"
img_xpath = "//img"
i = 1


def is_404_error(html_url, attr_url, file_path, log_file):
    try:
        r = requests.head(attr_url, headers={'Connection': 'close'})
        if r.status_code == 404:
            log_file.write(file_path + " : " + html_url + " attr: " + attr_url + " return 404!
")
            log_file.flush()
        else:
            log_file.write(file_path + " : " + html_url + " attr: " + attr_url + " correct!
")
            log_file.flush()
    except requests.ConnectionError:
        print("ConnectionError: " + attr_url)


def verify_html_file_link(driver, html_url, file_path, log_file):
    driver.get(html_url)
    global i
    print("html_file No.: ", i)
    driver.implicitly_wait(15)
    xpath_list = [css_xpath, js_xpath, img_xpath]
    for xpath in xpath_list:
        try:
            des = driver.find_elements(By.XPATH, xpath)

            if des != None:
                for node in des:
                    if node.get_attribute("href") != None:
                        is_404_error(html_url, node.get_attribute("href"), file_path, log_file)
                    else:
                        if node.get_attribute("href") != None:
                            is_404_error(html_url, node.get_attribute("src"), file_path, log_file)

        except Exception:
            print("Something wrong with " + html_url)
            log_file.write("Something wrong with file: " + file_path + " : " + html_url)
        finally:
            i += 1


def verify_html_file_content(url, path, log_file):
    chromedriver = os.environ["chromedriver"]

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('window-size=1920x1080')
    driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chrome_options)
    try:
        verify_html_file_link(driver, url, path, log_file)

    except:
        raise

    finally:
        driver.close()
        driver.quit()


if __name__ == '__main__':

    start_time = datetime.datetime.now()

    total_html_files_map = {
        "1": "https://www.agoda.com/zh-cn/country/china.html?cid=-38",
        "2": "https://www.booking.com/searchresults.html?aid=397647;label=bai408jc-index-XX-XX-XX-unspec-cn-com-L%3Axu-O%3AwindowsS10-B%3Achrome-N%3AXX-S%3Abo-U%3Ac-H%3As;sid=e066c0dc5a7db02a31ae39a76587ff80;city=-246227&;ilp=1;d_dcp=1",
        "3": "https://secure.booking.com/myreferrals.html?aid=397647;label=bai408jc-index-XX-XX-XX-unspec-cn-com-L%3Axu-O%3AwindowsS10-B%3Achrome-N%3AXX-S%3Abo-U%3Ac-H%3As;sid=e066c0dc5a7db02a31ae39a76587ff80;rafftl=1;source=34;",
        "4": "https://www.booking.com/hotel/jp/oakwood-premier-tokyo.html?aid=397647;label=bai408jc-index-XX-XX-XX-unspec-cn-com-L%3Axu-O%3AwindowsS10-B%3Achrome-N%3AXX-S%3Abo-U%3Ac-H%3As;sid=e066c0dc5a7db02a31ae39a76587ff80;dest_id=-246227;dest_type=city;dist=0;hapos=5;hpos=5;room1=A%2CA;sb_price_type=total;srepoch=1513924549;srfid=5eb7da030c321979554842f24c64bf54e3e3fcbfX5;srpvid=90612e624185032f;type=total;ucfs=1&#hotelTmpl",
        "5": "https://www.booking.com/hotel/jp/apa-hotel-shinjuku-gyoenmae.html?aid=397647;label=bai408jc-index-XX-XX-XX-unspec-cn-com-L%3Axu-O%3AwindowsS10-B%3Achrome-N%3AXX-S%3Abo-U%3Ac-H%3As;sid=e066c0dc5a7db02a31ae39a76587ff80;dest_id=-246227;dest_type=city;dist=0;hapos=6;hpos=6;room1=A%2CA;sb_price_type=total;srepoch=1513924549;srfid=5eb7da030c321979554842f24c64bf54e3e3fcbfX6;srpvid=90612e624185032f;type=total;ucfs=1&#hotelTmpl",
        "6": "https://www.booking.com/hotel/jp/hotel-unizo-ginza-itchome.html?aid=397647;label=bai408jc-index-XX-XX-XX-unspec-cn-com-L%3Axu-O%3AwindowsS10-B%3Achrome-N%3AXX-S%3Abo-U%3Ac-H%3As;sid=e066c0dc5a7db02a31ae39a76587ff80;dest_id=-246227;dest_type=city;dist=0;hapos=7;hpos=7;room1=A%2CA;sb_price_type=total;srepoch=1513924549;srfid=5eb7da030c321979554842f24c64bf54e3e3fcbfX7;srpvid=90612e624185032f;type=total;ucfs=1&#hotelTmpl",
        "7": "https://www.booking.com/hotel/jp/apa-sugamo-ekimae.html?aid=397647;label=bai408jc-index-XX-XX-XX-unspec-cn-com-L%3Axu-O%3AwindowsS10-B%3Achrome-N%3AXX-S%3Abo-U%3Ac-H%3As;sid=e066c0dc5a7db02a31ae39a76587ff80;dest_id=-246227;dest_type=city;dist=0;hapos=8;hpos=8;room1=A%2CA;sb_price_type=total;srepoch=1513924549;srfid=5eb7da030c321979554842f24c64bf54e3e3fcbfX8;srpvid=90612e624185032f;type=total;ucfs=1&#hotelTmpl",
    }

    thread_pool = multiprocessing.Pool(5)
    log_file = codecs.open("html_files_verification.log", 'w', 'utf-8')
    try:

        for key in total_html_files_map.keys():
            thread_pool.apply_async(verify_html_file_content, args=(total_html_files_map[key], key, log_file,))
        print('Waiting for all subprocesses done...')
        thread_pool.close()
        thread_pool.join()
    except Exception as e:
        print(e)
    finally:
        log_file.close()

    end_time = datetime.datetime.now()
    print("Duration: ", time.strftime('%H:%M:%S', time.gmtime((end_time - start_time).total_seconds())))

Answer 1

另一答案

Answer 2

另一答案