requests+mongodb爬取今日头条，多进程

Posted 2021-01-09 zhong_sp

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了requests+mongodb爬取今日头条，多进程相关的知识，希望对你有一定的参考价值。

  1 import json
  2 import os
  3 from urllib.parse import urlencode
  4 import pymongo
  5 import requests
  6 from bs4 import BeautifulSoup
  7 from requests.exceptions import ConnectionError
  8 import re
  9 from multiprocessing import Pool
 10 from hashlib import md5
 11 from json.decoder import JSONDecodeError
 12 from config import *
 13 
 14 client = pymongo.MongoClient(MONGO_URL, connect=False)
 15 db = client[MONGO_DB]
 16 
 17 
 18 def get_page_index(offset, keyword):
 19     data = {
 20         ‘autoload‘: ‘true‘,
 21         ‘count‘: 20,
 22         ‘cur_tab‘: 3,
 23         ‘format‘: ‘json‘,
 24         ‘keyword‘: keyword,
 25         ‘offset‘: offset,
 26     }
 27     params = urlencode(data)
 28     base = ‘http://www.toutiao.com/search_content/‘
 29     url = base + ‘?‘ + params
 30     try:
 31         response = requests.get(url)
 32         if response.status_code == 200:
 33             return response.text
 34         return None
 35     except ConnectionError:
 36         print(‘Error occurred‘)
 37         return None
 38 
 39 
 40 def download_image(url):
 41     print(‘Downloading‘, url)
 42     try:
 43         response = requests.get(url)
 44         if response.status_code == 200:
 45             save_image(response.content)
 46         return None
 47     except ConnectionError:
 48         return None
 49 
 50 
 51 def save_image(content):
 52     file_path = ‘{0}/{1}.{2}‘.format(os.getcwd(), md5(content).hexdigest(), ‘jpg‘)
 53     print(file_path)
 54     if not os.path.exists(file_path):
 55         with open(file_path, ‘wb‘) as f:
 56             f.write(content)
 57             f.close()
 58 
 59 
 60 def parse_page_index(text):
 61     try:
 62         data = json.loads(text)
 63         if data and ‘data‘ in data.keys():
 64             for item in data.get(‘data‘):
 65                 yield item.get(‘article_url‘)
 66     except JSONDecodeError:
 67         pass
 68 
 69 
 70 def get_page_detail(url):
 71     try:
 72         response = requests.get(url)
 73         if response.status_code == 200:
 74             return response.text
 75         return None
 76     except ConnectionError:
 77         print(‘Error occurred‘)
 78         return None
 79 
 80 
 81 def parse_page_detail(html, url):
 82     soup = BeautifulSoup(html, ‘lxml‘)
 83     result = soup.select(‘title‘)
 84     title = result[0].get_text() if result else ‘‘
 85     images_pattern = re.compile(‘gallery: JSON.parse("(.*)")‘, re.S)
 86     result = re.search(images_pattern, html)
 87     if result:
 88         data = json.loads(result.group(1).replace(‘\‘, ‘‘))
 89         if data and ‘sub_images‘ in data.keys():
 90             sub_images = data.get(‘sub_images‘)
 91             images = [item.get(‘url‘) for item in sub_images]
 92             for image in images: download_image(image)
 93             return {
 94                 ‘title‘: title,
 95                 ‘url‘: url,
 96                 ‘images‘: images
 97             }
 98 
 99 
def save_to_mongo(result):
    """Insert one parsed article record into the MONGO_TABLE collection.

    Args:
        result: Document (dict) to store.

    Returns:
        ``True`` when the insert produced a document id, ``False`` otherwise.
    """
    # insert_one replaces Collection.insert, which is deprecated in
    # PyMongo 3 and removed in PyMongo 4.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('Successfully Saved to Mongo', result)
        return True
    return False
105 
106 
def main(offset):
    """Crawl one page of search results and store every gallery found.

    Args:
        offset: Search-result offset of the page to process.
    """
    text = get_page_index(offset, KEYWORD)
    if not text:
        # The index request failed; there is nothing to parse.
        return
    for url in parse_page_index(text):
        if not url:
            continue
        html = get_page_detail(url)
        if not html:
            # Skip articles whose page could not be fetched.
            continue
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)
114 
115 
if __name__ == '__main__':
    # One task per result page: offsets advance in steps of 20, matching the
    # page size requested by get_page_index.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        # Let the workers exit cleanly even when a task raised.
        pool.close()
        pool.join()

以上是关于requests+mongodb爬取今日头条，多进程的主要内容，如果未能解决你的问题，请参考以下文章：

Python requests爬取今日头条，为啥获取不了网页内容