requests+mongodb爬取今日头条,多进程

Posted by zhong_sp

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了如何使用 requests 和 MongoDB 以多进程方式爬取今日头条的相关知识,希望对你有一定的参考价值。

  1 import json
  2 import os
  3 from urllib.parse import urlencode
  4 import pymongo
  5 import requests
  6 from bs4 import BeautifulSoup
  7 from requests.exceptions import ConnectionError
  8 import re
  9 from multiprocessing import Pool
 10 from hashlib import md5
 11 from json.decoder import JSONDecodeError
 12 from config import *
 13 
# Module-level MongoDB handles shared by the functions below.
# MONGO_URL and MONGO_DB are not defined in this file; presumably they come
# from the star import of the project's config module -- confirm there.
# NOTE(review): connect=False is passed so the client is created without
# connecting eagerly; presumably this lets each worker process open its own
# connection after the pool starts -- confirm against the PyMongo docs.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]
 16 
 17 
 18 def get_page_index(offset, keyword):
 19     data = {
 20         autoload: true,
 21         count: 20,
 22         cur_tab: 3,
 23         format: json,
 24         keyword: keyword,
 25         offset: offset,
 26     }
 27     params = urlencode(data)
 28     base = http://www.toutiao.com/search_content/
 29     url = base + ? + params
 30     try:
 31         response = requests.get(url)
 32         if response.status_code == 200:
 33             return response.text
 34         return None
 35     except ConnectionError:
 36         print(Error occurred)
 37         return None
 38 
 39 
 40 def download_image(url):
 41     print(Downloading, url)
 42     try:
 43         response = requests.get(url)
 44         if response.status_code == 200:
 45             save_image(response.content)
 46         return None
 47     except ConnectionError:
 48         return None
 49 
 50 
 51 def save_image(content):
 52     file_path = {0}/{1}.{2}.format(os.getcwd(), md5(content).hexdigest(), jpg)
 53     print(file_path)
 54     if not os.path.exists(file_path):
 55         with open(file_path, wb) as f:
 56             f.write(content)
 57             f.close()
 58 
 59 
 60 def parse_page_index(text):
 61     try:
 62         data = json.loads(text)
 63         if data and data in data.keys():
 64             for item in data.get(data):
 65                 yield item.get(article_url)
 66     except JSONDecodeError:
 67         pass
 68 
 69 
 70 def get_page_detail(url):
 71     try:
 72         response = requests.get(url)
 73         if response.status_code == 200:
 74             return response.text
 75         return None
 76     except ConnectionError:
 77         print(Error occurred)
 78         return None
 79 
 80 
 81 def parse_page_detail(html, url):
 82     soup = BeautifulSoup(html, lxml)
 83     result = soup.select(title)
 84     title = result[0].get_text() if result else ‘‘
 85     images_pattern = re.compile(gallery: JSON.parse("(.*)"), re.S)
 86     result = re.search(images_pattern, html)
 87     if result:
 88         data = json.loads(result.group(1).replace(\, ‘‘))
 89         if data and sub_images in data.keys():
 90             sub_images = data.get(sub_images)
 91             images = [item.get(url) for item in sub_images]
 92             for image in images: download_image(image)
 93             return {
 94                 title: title,
 95                 url: url,
 96                 images: images
 97             }
 98 
 99 
def save_to_mongo(result):
    """Insert one parsed article document into the ``MONGO_TABLE`` collection.

    Args:
        result: Document (dict) to store.

    Returns:
        ``True`` when the insert produced a document id, ``False`` otherwise.
    """
    # Collection.insert() is deprecated since PyMongo 3 and removed in
    # PyMongo 4; insert_one() is the supported single-document equivalent.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('Successfully Saved to Mongo', result)
        return True
    return False
105 
106 
def main(offset):
    """Crawl one search-result page: fetch, parse and store every article.

    Args:
        offset: Search-result offset identifying the page to crawl.
    """
    text = get_page_index(offset, KEYWORD)
    if text is None:
        # The index fetch failed; there are no article URLs to follow.
        return
    for url in parse_page_index(text):
        html = get_page_detail(url)
        if html is None:
            # Skip articles whose page could not be fetched.
            continue
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)
114 
115 
if __name__ == '__main__':
    # One task per result page; offsets advance in steps of 20, matching the
    # page size ('count': 20) requested by get_page_index.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()

 

以上是关于“requests+mongodb爬取今日头条,多进程”的主要内容。如果未能解决你的问题,请参考以下文章:

Python requests爬取今日头条,为啥获取不了网页内容

用Ajax爬取今日头条图片

用Ajax爬取今日头条图片集

Python爬取今日头条段子

分析Ajax爬取今日头条街拍美图

用接口爬取今日头条图片