分析Ajax抓取今日头条街拍美图

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了“分析Ajax抓取今日头条街拍美图”相关的知识,希望对你有一定的参考价值。

spider.py

技术分享
  1 # -*- coding:utf-8 -*-
  2 from urllib import urlencode
  3 import requests
  4 from requests.exceptions import RequestException
  5 import json
  6 import re
  7 import os
  8 from hashlib import md5
  9 from bs4 import BeautifulSoup
 10 import pymongo
 11 from multiprocessing import Pool
 12 from json.decoder import JSONDecoder
 13 from config import *
 14 
 15 client = pymongo.MongoClient(MONGO_URL, connect=False)
 16 db = client[MONGO_DB]
 17 
 18 def get_page_index(offset,keyword):
 19     data = {
 20         offset: offset,
 21         format: json,
 22         keyword: keyword,
 23         autoload: true,
 24         count: 20,
 25         cur_tab: 3
 26     }
 27     url = http://www.toutiao.com/search_content/? + urlencode(data)
 28     try:
 29         response = requests.get(url)
 30         if response.status_code == 200:
 31             return response.text
 32         return None
 33     except RequestException:
 34         print u请求索引页失败, url
 35         return None
 36 
 37 def parse_page_index(html):
 38     data = json.loads(html)
 39     if data and data in data.keys():
 40         for item in data.get(data):
 41             yield item.get(article_url)
 42 
 43 def get_page_detail(url):
 44     try:
 45         response = requests.get(url)
 46         if response.status_code == 200:
 47             return response.text
 48         return None
 49     except RequestException:
 50         print u请求详情页失败, url
 51         return None
 52 
 53 def parse_page_detail(html, url):
 54     soup = BeautifulSoup(html, lxml)
 55     title = soup.select(title)[0].get_text()
 56     print(title)
 57     images_pattern = re.compile(gallery: (.*?),\n, re.S)
 58     result = re.search(images_pattern, html)
 59     if result:
 60         data = json.loads(result.group(1))
 61         if data and sub_images in data.keys():
 62             sub_images = data.get(sub_images)
 63             images = [item.get(url) for item in sub_images]
 64             for image in images: download_image(image)
 65             return {
 66                 title: title,
 67                 url: url,
 68                 images: images
 69             }
 70 
 71 def save_to_mongo(result):
 72     if db[MONGO_TABLE].insert(result):
 73         print u存储到MongoDB成功, result
 74         return True
 75     return False
 76 
 77 def download_image(url):
 78     print u正在下载, url
 79     try:
 80         response = requests.get(url)
 81         if response.status_code == 200:
 82             save_image(response.content)
 83         return None
 84     except RequestException:
 85         print u请求图片失败, url
 86         return None
 87 
 88 def save_image(content):
 89     file_path = {0}/{1}.{2}.format(os.getcwd(), md5(content).hexdigest(), jpg)
 90     if not os.path.exists(file_path):
 91         with open(file_path, wb) as f:
 92             f.write(content)
 93             f.close()
 94 
 95 def main(offset):
 96     html = get_page_index(offset, KEYWORD)
 97     for url in parse_page_index(html):
 98         html = get_page_detail(url)
 99         if html:
100             result = parse_page_detail(html, url)
101             if result: save_to_mongo(result)
102 
if __name__ == '__main__':
    # One task per index page: offsets 0, 20, 40, ... in steps of 20.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    # Release the worker processes once all pages have been handled.
    pool.close()
    pool.join()
View Code

config.py

技术分享
# -*- coding:utf-8 -*-
# MongoDB connection settings read by spider.py.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Inclusive range of page groups; spider.py multiplies each by 20 to get
# the request offset.
GROUP_START = 0
GROUP_END = 20

# Search keyword sent to the index endpoint.
KEYWORD = '街拍'
View Code

 

以上是关于“分析Ajax抓取今日头条街拍美图”的主要内容。如果未能解决你的问题,请参考以下文章:

ajax分析-今日头条街拍美图抓取

分析Ajax请求并抓取今日头条街拍美图

分析Ajax抓取今日头条街拍美图

爬虫:分析Ajax请求抓取今日头条街拍美图

芝麻HTTP:分析Ajax爬取今日头条街拍美图

Python3网络爬虫开发实战 分析Ajax爬取今日头条街拍美图