多线程爬虫

Posted 2022-03-10 kend

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了多线程爬虫相关的知识，希望对你有一定的参考价值。

threading模块

import threading
import time

def coding():
    """Print a "writing code" line three times, pausing one second after each.

    Every line includes the thread object the function is currently running on.
    """
    remaining = 3
    while remaining > 0:
        worker = threading.current_thread()
        print("正在写代码%s" % worker)
        time.sleep(1)
        remaining -= 1

def drawing():
    """Print a "drawing" line three times, pausing one second after each.

    Every line includes the thread object the function is currently running on.
    """
    for _ in (0, 1, 2):
        message = "正在画画%s" % threading.current_thread()
        print(message)
        time.sleep(1)

def main():
    """Start one thread running ``coding`` and then one running ``drawing``.

    The threads are started but not joined here.
    """
    # Build and launch each child thread in turn, coding first.
    for job in (coding, drawing):
        threading.Thread(target=job).start()


# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()

condition的生产者消费者模式

# threading.Condition wraps a lock (an RLock by default) and adds wait()/notify(); it does not inherit from threading.Lock
import threading
import random
import time

gMoney = 1000  # shared balance: producers add to it, consumers subtract from it
gCondition = threading.Condition()  # guards the shared state; consumers wait on it, producers notify it
gTimes = 0  # number of productions performed so far (across all producers)
gTotalTimes = 10  # producers stop once gTimes reaches this value

class Producer(threading.Thread):
    """Thread that repeatedly adds a random amount to the shared balance.

    Each round draws an amount between 100 and 1000 (inclusive), adds it to
    ``gMoney`` while holding ``gCondition``, wakes every waiting consumer and
    then sleeps for one second. The thread stops as soon as ``gTimes`` has
    reached ``gTotalTimes``.
    """

    def run(self):
        """Produce until the shared production counter hits its limit."""
        global gMoney
        global gTimes
        while True:
            money = random.randint(100, 1000)
            # ``with`` releases the underlying lock on every exit path,
            # including ``break`` and exceptions.
            with gCondition:
                if gTimes >= gTotalTimes:
                    break

                gMoney += money
                gTimes += 1
                print("%s生产了%d的钱,现在总共有%d" % (threading.current_thread(), money, gMoney))
                # Wake every consumer blocked in wait() so it can re-check the balance.
                gCondition.notify_all()
            # Sleep outside the lock so other threads can run in the meantime.
            time.sleep(1)


class Consumer(threading.Thread):
    """Thread that repeatedly withdraws a random amount from the shared balance.

    Each round draws an amount between 100 and 1000 (inclusive). While the
    balance is too small the thread waits on ``gCondition`` for a producer's
    notification; if production has already finished (``gTimes`` has reached
    ``gTotalTimes``) it exits instead of waiting for money that will never
    arrive. After a successful withdrawal it sleeps for one second.
    """

    def run(self):
        """Consume until the balance is insufficient and production is over."""
        global gMoney
        while True:
            money = random.randint(100, 1000)
            # ``with`` releases the underlying lock on every exit path,
            # including the early ``return`` below.
            with gCondition:
                while gMoney < money:
                    # gTimes never exceeds gTotalTimes, so the check must be
                    # ``>=``; with ``>`` the consumer would wait forever.
                    if gTimes >= gTotalTimes:
                        return
                    print("%s,准备消费%d,剩余金额%d,不足!!!" % (threading.current_thread(), money, gMoney))
                    # Releases the lock while blocked and re-acquires it on wake-up.
                    gCondition.wait()

                gMoney -= money
                print("消费者%s,消费了%d,剩余金额%d" % (threading.current_thread(), money, gMoney))
            # Sleep outside the lock so other threads can run in the meantime.
            time.sleep(1)

def main():
    """Start two producer threads, then three consumer threads.

    The threads are named by their index and are not joined here.
    """
    for x in range(2):
        t = Producer(name="生产者%d" % x)
        t.start()

    for x in range(3):
        t = Consumer(name="消费者%d" % x)
        t.start()


# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()

lock版的生产者消费者模式

import threading
import random
import time

gMoney = 1000  # shared balance: producers add to it, consumers subtract from it
gLock = threading.Lock()  # held by producers and consumers around every access to the shared state
gTimes = 0  # number of productions performed so far (across all producers)
gTotalTimes = 10  # producers stop once gTimes reaches this value

class Producer(threading.Thread):
    """Thread that repeatedly adds a random amount to the shared balance.

    Each round draws an amount between 100 and 1000 (inclusive) and adds it
    to ``gMoney`` while holding ``gLock``, then sleeps for one second. Once
    ``gTimes`` has reached ``gTotalTimes`` the thread prints a stop message
    and exits.
    """

    def run(self):
        """Produce until the shared production counter hits its limit."""
        global gMoney
        global gTimes
        while True:
            money = random.randint(100, 1000)
            # ``with`` releases the lock on every exit path, including
            # ``break`` and exceptions.
            with gLock:
                if gTimes >= gTotalTimes:
                    # Report the configured limit rather than a hard-coded 10.
                    print("已经生产了%d次, 停止生产" % gTotalTimes)
                    break

                gMoney += money
                gTimes += 1
                print("%s生产了%d的钱,现在总共有%d" % (threading.current_thread(), money, gMoney))
            # Sleep outside the lock so other threads can run in the meantime.
            time.sleep(1)



class Consumer(threading.Thread):
    """Thread that repeatedly tries to withdraw a random amount from the balance.

    Each round draws an amount between 100 and 1000 (inclusive). If the
    balance covers it, the amount is subtracted; otherwise the thread either
    exits (when ``gTimes`` has reached ``gTotalTimes``) or reports the
    shortfall. It sleeps for one second between rounds.
    """

    def run(self):
        """Consume until the balance is insufficient and production is over."""
        global gMoney
        while True:
            money = random.randint(100, 1000)
            # ``with`` releases the lock on every exit path, including
            # ``break`` and exceptions.
            with gLock:
                if gMoney >= money:
                    gMoney -= money
                    print("消费者%s,消费了%d,还剩有%d" % (threading.current_thread(), money, gMoney))
                elif gTimes >= gTotalTimes:
                    # No more money will be produced, so stop trying.
                    break
                else:
                    print("余额不足,当前金额是%d, 需要消费的金额是%d" % (gMoney, money))
            # Sleep outside the lock so other threads can run in the meantime.
            time.sleep(1)

def main():
    """Start two producer threads, then three consumer threads.

    The threads are named by their index and are not joined here.
    """
    for x in range(2):
        t = Producer(name="生产者%d" % x)
        t.start()

    for x in range(3):
        t = Consumer(name="消费者%d" % x)
        t.start()


# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()

queue的线程安全

from queue import Queue
import time
import threading


# q.put(2)
# q.put(1)
# q.put(3)
#
# print(q.qsize())
# print(q.full())
# print(q.empty())
# print(q.get())

def set_value(q):
    """Put 0, 1, 2, ... onto ``q`` forever, pausing three seconds after each put.

    Args:
        q: queue-like object; only its ``put`` method is used.
    """
    counter = 0
    while True:
        q.put(counter)
        counter = counter + 1
        time.sleep(3)

def get_value(q):
    """Take items from ``q`` forever and print each one as it arrives.

    Args:
        q: queue-like object; only its ``get`` method is used.
    """
    while True:
        item = q.get()
        print(item)

def main():
    """Share one bounded queue between a ``set_value`` and a ``get_value`` thread.

    The queue holds at most four items. Both threads are created first and
    then started, setter before getter; neither is joined here.
    """
    shared = Queue(4)
    workers = [
        threading.Thread(target=set_value, args=[shared]),
        threading.Thread(target=get_value, args=[shared]),
    ]
    for worker in workers:
        worker.start()

# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()

threading类实现多线程

import threading
import time

class CodingThread(threading.Thread):
    """Thread whose body prints a "writing code" line three times."""

    def run(self):
        """Print the line with the current thread, then sleep one second; repeat three times."""
        remaining = 3
        while remaining > 0:
            worker = threading.current_thread()
            print("正在写代码%s" % worker)
            time.sleep(1)
            remaining -= 1

class DrawingThread(threading.Thread):
    """Thread whose body prints a "drawing" line three times."""

    def run(self):
        """Print the line with the current thread, then sleep one second; repeat three times."""
        for _ in (0, 1, 2):
            message = "正在画画%s" % threading.current_thread()
            print(message)
            time.sleep(1)


def main():
    """Start a ``CodingThread`` and then a ``DrawingThread``.

    The threads are started but not joined here.
    """
    # Instantiate and launch each child thread in turn, coding first.
    for worker_cls in (CodingThread, DrawingThread):
        worker_cls().start()


# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()

selenium关闭页面和浏览器

from selenium import webdriver
import time

# Demo: open Baidu, type into the search box, then close the page.
# NOTE(review): `executable_path=` and `find_element_by_id` belong to the older
# Selenium API and are reportedly removed in later Selenium 4 releases —
# confirm against the installed version.
driver_path = r"G:\Crawler and Data\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com/')

# Locate the search input by its id and type a query into it.
inputTag = driver.find_element_by_id('kw')
inputTag.send_keys('python')
time.sleep(3)


driver.close()  # closes the current page (window)
# driver.quit()  # alternative: closes the whole browser

selenium页面等待

from selenium import webdriver
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Demo: explicit wait for an element to appear on the page.
# NOTE(review): `executable_path=` belongs to the older Selenium API and is
# reportedly removed in later Selenium 4 releases — confirm against the
# installed version.
driver_path = r"G:\Crawler and Data\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com/')

# Wait up to 10 seconds for an element with this id to be present. The id is
# deliberately wrong, so the wait runs out and raises an error after 10 seconds.
# With a correct id the call returns as soon as the element is found, without
# waiting the full 10 seconds.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'shjdkah'))
)

selenium打开多个页面和页面间的切换

from selenium import webdriver
import time

# Demo: open a second page and switch between the two windows.
# NOTE(review): `executable_path=` belongs to the older Selenium API and is
# reportedly removed in later Selenium 4 releases — confirm against the
# installed version.
driver_path = r"G:\Crawler and Data\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com/')

# Open Douban in a new window via JavaScript.
driver.execute_script('window.open("https://www.douban.com/")')
# The driver is still focused on the Baidu page at this point.
print(driver.current_url)
print(driver.window_handles)  # window handles: shows which windows are currently open
driver.switch_to.window(driver.window_handles[1])  # switch to the second window
print(driver.current_url)
driver.close()  # close the current page
driver.switch_to.window(driver.window_handles[0])  # switch back to the first window

多线程共享全局变量

import threading

VALUE = 0  # shared counter incremented by every worker thread
gLock = threading.Lock()  # lock held by add_value while it updates VALUE

def add_value():
    """Add one million to the shared ``VALUE`` counter and print the result.

    The whole loop runs while holding ``gLock``, so concurrent callers
    perform their increments one after another instead of interleaving.
    """
    global VALUE
    # ``with`` releases the lock even if the body raises.
    with gLock:
        for _ in range(1000000):
            VALUE += 1
        # Print while still holding the lock so the reported total is this
        # call's own result, not a value another thread is already changing.
        print("value,%d" % VALUE)

def main():
    """Start two threads that each run ``add_value``; they are not joined here."""
    pending = 2
    while pending:
        worker = threading.Thread(target=add_value)
        worker.start()
        pending -= 1

# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()

selenium设置代理ip

from selenium import webdriver


# Demo: route Chrome through an HTTP proxy and show the visible IP address.
options = webdriver.ChromeOptions()
# Configure the proxy. The switch needs the form `--proxy-server=<url>`;
# without the `=` Chrome does not recognise it and no proxy is applied.
options.add_argument("--proxy-server=http://1.197.203.158:9999")

# NOTE(review): `executable_path=` belongs to the older Selenium API and is
# reportedly removed in later Selenium 4 releases — confirm against the
# installed version. `options=` replaces the deprecated `chrome_options=`.
driver_path = r"G:\Crawler and Data\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path, options=options)
driver.get('http://httpbin.org/ip')

以上是关于多线程爬虫的主要内容，如果未能解决你的问题，请参考以下文章

爬虫.多线程爬虫与多进程爬虫

Python多线程和多进程爬虫

Python爬虫编程思想（135）：多线程和多进程爬虫--Python与线程

爬虫学习之第四章爬虫进阶之多线程爬虫

Python 多线程爬虫