Qiushibaike Crawler
Posted by dreamhai
The crawler below fetches the first 13 recommendation pages of qiushibaike.com, extracts each item's image URL, text, detail link, and vote/comment counts with lxml XPath, and appends the results as JSON to a text file. The work is split into a three-stage pipeline (fetch, parse, save) connected by queues, with four fetcher threads, four parser threads, and one saver thread.

# coding:utf-8
import requests
import json
from lxml import etree
import threading
from queue import Queue


class QiushiSpide(object):
    def __init__(self):
        self.url_tmp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                                     "AppleWebKit/537.36 (KHTML, like Gecko) "
                                     "Chrome/72.0.3626.96 Safari/537.36"}
        self.pre_url = "https://www.qiushibaike.com"
        self.url_queue = Queue()      # page URLs waiting to be fetched
        self.html_queue = Queue()     # raw HTML waiting to be parsed
        self.content_queue = Queue()  # parsed items waiting to be saved

    def get_url_list(self):
        for i in range(1, 14):
            self.url_queue.put(self.url_tmp.format(i))
        print(self.url_queue.qsize())
        # return [self.url_tmp.format(i) for i in range(1, 14)]

    def parse_url(self):
        while True:
            url = self.url_queue.get()
            print(url)
            # Pass the header dict as the headers= keyword argument;
            # requests.get(url, self.header) would send it as query params instead.
            response = requests.get(url, headers=self.header)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()
            print("url_queue finished one")
            # return response.content.decode()

    def get_content_list(self):
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            li_list = html.xpath("//li[contains(@class,'item typs_')]")
            content_list = []
            for li in li_list:
                item = {}
                img_list = li.xpath(".//a[contains(@class,'recmd-left')]")
                for img in img_list:
                    item["img_url"] = "https:" + img.xpath("./img/@src")[0] if len(img.xpath("./img/@src")) > 0 else None
                div_list = li.xpath(".//div[@class='recmd-right']")
                for div in div_list:
                    item["text"] = div.xpath("./a/text()")[0] if len(div.xpath("./a/text()")) > 0 else None
                    item["a_href"] = self.pre_url + div.xpath("./a/@href")[0] if len(div.xpath("./a/@href")) > 0 else None
                    item["smile_num"] = div.xpath(".//div[@class='recmd-num']/span[1]/text()")[0] if len(div.xpath(".//div[@class='recmd-num']/span[1]")) > 0 else None
                    item["comment_num"] = div.xpath(".//div[@class='recmd-num']/span[4]/text()")[0] if len(div.xpath(".//div[@class='recmd-num']/span[4]")) > 0 else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()
            print("html_queue finished one")
            # return content_list

    def save_content(self):
        while True:
            content = self.content_queue.get()
            with open("糗百多线程.txt", 'a', encoding='utf-8') as f:
                f.write(json.dumps(content, ensure_ascii=False, indent=2))
                f.write("\n")
            self.content_queue.task_done()

    def run(self):
        # Single-threaded version, kept for reference:
        # url_list = self.get_url_list()
        # for url in url_list:
        #     print(url)
        #     html_str = self.parse_url(url)
        #     content = self.get_content_list(html_str)
        #     self.save_content(content)
        t_list = []
        self.get_url_list()
        for i in range(4):
            p = threading.Thread(target=self.parse_url)
            t_list.append(p)
        print("parse_url threads added")
        for i in range(4):
            g = threading.Thread(target=self.get_content_list)
            t_list.append(g)
        print("get_content_list threads added")
        s = threading.Thread(target=self.save_content)
        t_list.append(s)
        for t in t_list:
            t.daemon = True  # daemon thread: when the main thread ends, child threads end too
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()  # block the main thread until every task in this queue is done
        print("main thread end")


if __name__ == "__main__":
    q = QiushiSpide()
    q.run()
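The load-bearing idea here is the Queue-based producer/consumer pipeline: each worker loops forever, calls get(), and calls task_done() when it finishes an item, so the main thread can wait on the queues themselves with q.join() instead of on the never-exiting daemon threads. Here is a minimal sketch of that pattern in isolation; the squaring step stands in for "fetch and parse a page", and the names and workload are illustrative, not taken from the crawler above:

import threading
from queue import Queue

task_queue = Queue()
result_queue = Queue()

def worker():
    # Daemon worker: loops forever, torn down when the main thread exits.
    while True:
        n = task_queue.get()
        result_queue.put(n * n)   # stand-in for fetching/parsing a page
        task_queue.task_done()    # each get() balanced by one task_done()

def saver():
    while True:
        print("saved:", result_queue.get())
        result_queue.task_done()

for n in range(10):
    task_queue.put(n)

for target in (worker, worker, saver):
    threading.Thread(target=target, daemon=True).start()

# Join the queues in pipeline order: each join() returns once every
# put() item has been matched by a task_done().
for q in (task_queue, result_queue):
    q.join()
print("all tasks done")

Joining the queues in pipeline order matters: task_queue.join() only returns after every result has been put into result_queue, so by the time both joins return, all downstream work is finished and the daemon workers can be safely discarded.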