requests + MongoDB 爬取今日头条(多进程)
Posted by zhong_sp
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了requests+mongodb爬取今日头条,多进程相关的知识,希望对你有一定的参考价值。
"""Crawl Toutiao search results with a process pool.

For every search-result offset the script fetches the JSON index page,
follows each article URL, extracts the page title and the gallery image
URLs, downloads the images into the current working directory and stores
one record per article in MongoDB.

Settings are star-imported from the local ``config`` module; the names read
here are MONGO_URL, MONGO_DB, MONGO_TABLE, KEYWORD, GROUP_START and
GROUP_END.
"""
import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
# ConnectionError is kept for backward compatibility with the original
# import list; the handlers below catch its base class RequestException.
from requests.exceptions import ConnectionError, RequestException

from config import *

# connect=False defers the connection until first use, so each worker
# process ends up opening its own connection.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

# Seconds to wait for any single HTTP request before giving up; without a
# timeout a stalled server would block a pool worker forever.
_REQUEST_TIMEOUT = 10

# The gallery data is passed to JSON.parse("...") in the page source.
# re.S lets '.' span newlines. The capture is greedy, as in the original.
_GALLERY_PATTERN = re.compile(r'gallery:\s*JSON\.parse\("(.*)"\)', re.S)


def _fetch(url):
    """Return the response of a GET on *url*, or None on any failure.

    A failure is either a requests exception (connection error, timeout,
    malformed URL, ...) or a status code other than 200.
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except RequestException as exc:
        print('Error occurred', exc)
        return None
    if response.status_code != 200:
        return None
    return response


def get_page_index(offset, keyword):
    """Fetch one page of search results.

    Args:
        offset: result offset sent as the ``offset`` query parameter.
        keyword: search term sent as the ``keyword`` query parameter.

    Returns:
        The response body as text, or None if the request failed.
    """
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    response = _fetch(url)
    # Compare against None explicitly: a Response object's truthiness
    # reflects its status, not its presence.
    return response.text if response is not None else None


def download_image(url):
    """Download the image at *url* and save it to disk.

    Always returns None; failed downloads are reported and skipped.
    """
    print('Downloading', url)
    response = _fetch(url)
    if response is not None:
        save_image(response.content)
    return None


def save_image(content):
    """Write *content* (bytes) to ``<cwd>/<md5 of content>.jpg``.

    The file name is derived from the content hash, so identical images
    map to the same path and an existing file is left untouched.
    """
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(os.getcwd(), file_name)
    print(file_path)
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() needed.
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Yield the article URLs contained in a search-result JSON document.

    Yields nothing when *text* is empty/None, is not valid JSON, or has no
    ``data`` list. Entries without an ``article_url`` are skipped.
    """
    if not text:
        return
    try:
        data = json.loads(text)
    except JSONDecodeError:
        return
    if not isinstance(data, dict):
        return
    for item in data.get('data') or []:
        article_url = item.get('article_url') if isinstance(item, dict) else None
        if article_url:
            yield article_url


def get_page_detail(url):
    """Fetch an article page and return its HTML text, or None on failure."""
    response = _fetch(url)
    return response.text if response is not None else None


def parse_page_detail(html, url):
    """Extract title and gallery images from an article page.

    Downloads every gallery image as a side effect.

    Args:
        html: page source; may be None or empty when the fetch failed.
        url: address of the page, copied into the result.

    Returns:
        ``{'title': ..., 'url': ..., 'images': [...]}`` when the page
        contains gallery data with a ``sub_images`` key, otherwise None.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''

    match = _GALLERY_PATTERN.search(html)
    if not match:
        return None
    try:
        # Dropping every backslash un-escapes the quoted JSON text. It is
        # crude: backslashes that belong to the data are removed as well.
        data = json.loads(match.group(1).replace('\\', ''))
    except JSONDecodeError:
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    sub_images = data.get('sub_images') or []
    images = [
        item.get('url')
        for item in sub_images
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }


def save_to_mongo(result):
    """Insert *result* into the MONGO_TABLE collection.

    Returns:
        True if the document was stored, False otherwise.
    """
    try:
        # insert_one replaces Collection.insert, which newer PyMongo
        # releases no longer provide.
        outcome = db[MONGO_TABLE].insert_one(result)
    except pymongo.errors.PyMongoError as exc:
        print('Error occurred', exc)
        return False
    if outcome.inserted_id is not None:
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    """Crawl one page of search results starting at *offset*."""
    text = get_page_index(offset, KEYWORD)
    for url in parse_page_index(text):
        html = get_page_detail(url)
        if not html:
            continue
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        # Release the workers even if one of the tasks raised.
        pool.close()
        pool.join()
以上是关于requests+mongodb爬取今日头条,多进程的主要内容,如果未能解决你的问题,请参考以下文章: