python知乎内容抓取（redis存储）

Posted 2020-12-27 水墨的心

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了python知乎内容抓取（redis存储）相关的知识，希望对你有一定的参考价值。

　　因为平时喜欢上知乎，发现其话题是一个有向无环图（自己介绍说得），一级一级往上最后到根话题，所以我就想尝试从根话题一级一级往下将其全部内容爬取。最后实践过程中发现自己想多了..有以下三个问题：

　　1.数据量巨大，单台电脑能力肯定不够。我这里只抓取了话题结构和话题对应的单个页面（不翻页）的一些呈现信息，没有抓取具体问题回答的内容。

　　2.如果直接连续抓取，没多少次就会被知乎检测到异常，这里使用代理ip可以解决。

　　3.最麻烦的是要查看所有话题结构必须要登录（如下图示例显示的子话题是可以抓到的，但是点击查看全部话题结构就需要登录），一旦登录就不可能连续抓取，除非能搞到很多账号，而且还需要破解验证等问题。我这块目前只是从根话题抓取直接显示出来的子话题，所以抓取到的话题数量和实际差的很远。后面计划使用一个其他方法抓取（看这里）

　　存储由于key-value型很适合这里，所以我使用的是redis数据库，而且很快。一共有四张key-value表，zhTopicName保存话题名称（key）-子话题id（value，后同），zhTopicTree保存话题的结构有向无环图（邻接链表法，id-[子话题id...]），zhTopicMessage保存话题id-（关注人数，问题数），zhTopicQuestions保存话题id-[（问题名称，作者，评论数，点赞数，链接）...]

参考代码如下：

  1 # -*- coding: UTF-8 -*-
  2 
  3 from bs4 import BeautifulSoup
  4 import requests
  5 import random
  6 import redis
  7 import time
  8 import json
  9 
 10 # 子话题目录id获取接口url，当前初始根话题id为19776749
 11 CHILDREN_URL = \'https://www.zhihu.com/api/v3/topics/%u/children\'
 12 
 13 # 知乎精华url
 14 TOP_ANSWERS_URL = \'https://www.zhihu.com/topic/%u/top-answers\'
 15 
 16 # 请求头
 17 HEADERS = {\'User-Agent\': \'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 \\
 18  16 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36\'}
 19 
 20 # 代理ip，这里示例放了一个，自己可以在网上找些补充下
 21 PROXIES_LIST = [
 22     {"http": "218.14.227.198:3124"},
 23 ]
 24 
 25 pool = redis.ConnectionPool(host=\'localhost\', port=6379, db=0)
 26 
 27 r = redis.Redis(connection_pool=pool)
 28 
 29 s = requests.session()
 30 
 31 
 32 def crawl_message(uid):
 33     try:
 34         req = s.get(url=TOP_ANSWERS_URL % int(uid), headers=HEADERS, proxies=random.choice(PROXIES_LIST)).text
 35     except Exception as e:
 36         raise e
 37     bf = BeautifulSoup(req, \'html.parser\')
 38     strong = bf.find_all(\'strong\', class_=\'NumberBoard-itemValue\')
 39     # 0为关注人数，1为问题数
 40     try:
 41         r.hset(\'zhTopicMessage\', uid, (strong[0].get(\'title\'), strong[1].get(\'title\')))
 42     except Exception as ex:
 43         print(uid, ex)
 44         return
 45     items = bf.find_all(\'div\', class_=\'ContentItem AnswerItem\')
 46     tqs = []
 47     for t in items:
 48         # 问题、作者、评论数、点赞数、链接
 49         tqs.append((eval(t.get(\'data-zop\'))[\'title\'], eval(t.get(\'data-zop\'))[\'authorName\'],
 50                     t.find(itemprop=\'commentCount\').get(\'content\'),
 51                     t.find(\'button\', class_="Button VoteButton VoteButton--up").contents[-1],
 52                     t.find(\'a\').get(\'href\')))
 53     r.hset(\'zhTopicQuestions\', uid, tqs)
 54     time.sleep(0.1)
 55 
 56 
 57 # 获取话题的那个有向无环图以及其名称与id的对应关系并入库
 58 def crawl_topic():
 59     while len(crawl_topic.zhTopic) > 0:
 60         try:
 61             data = \\
 62                 json.loads(
 63                     s.get(url=CHILDREN_URL % int(crawl_topic.zhTopic[0]), headers=HEADERS,
 64                           proxies=random.choice(PROXIES_LIST)).text)[
 65                     \'data\']
 66         except Exception as ex:
 67             raise ex
 68         if not data:  # 已经到叶子话题返回
 69             crawl_topic.zhTopic.pop(0)
 70             continue
 71         ids = []
 72         for m in data:
 73             nid = m[\'id\']
 74             ids.append(nid)
 75             crawl_topic.zhTopic.append(nid)
 76             r.hset(\'zhTopicName\', m[\'name\'], nid)
 77         r.hset(\'zhTopicTree\', nid, ids)
 78         crawl_topic.zhTopic.pop(0)
 79 
 80 
 81 def start_crawl():
 82     # 19776749为当前知乎根话题id
 83     # 这里构造一个id队列避免使用递归
 84     crawl_topic.zhTopic = [19776749]
 85     r.hset(\'zhTopicName\', \'「根话题」\', 19776749)
 86     crawl_topic()
 87     ids = r.hvals(\'zhTopicName\')
 88     for tid in ids:
 89         crawl_message(tid)
 90 
 91 
 92 def result_display():
 93     try:
 94         for tnm in r.hkeys(\'zhTopicName\'):
 95             tid = r.hget(\'zhTopicName\', tnm)
 96             print(\'话题:\', tnm.decode(), \'    关注人数:\', tid[1], \'    问题数:\', tid[0])
 97             for e in eval(r.hget(\'zhTopicQuestions\', tid).decode()):
 98                 print(\'    \', \'问题:\', e[0], \'作者:\', e[1], \'评论数\', e[2], \'点赞数:\', e[3], \'链接:\', e[4])
 99     except Exception as ex:
100         print(ex)
101 
102 
103 if __name__ == \'__main__\':
104     start_crawl()
105     result_display()

　　在我本机测试一共运行了5个小时左右获取了1万3千多个话题（如上问题3所述，远少于实际话题数，估计不到十分之一），每个话题不到5条精华问题（页面直接展示几条就抓几条，没有翻页）。下面展示下运行结果：

　　最后按网上流行那种话题关注人数多少对应字体大小画张图展示下（我也是现学现卖，仅供参考:)）

代码如下：

 1 import matplotlib.pyplot as plt
 2 from matplotlib import colors as mcolors
 3 import numpy as np
 4 import random
 5 import redis
 6 
 7 pool = redis.ConnectionPool(host=\'localhost\', port=6379, db=0)
 8 
 9 r = redis.Redis(connection_pool=pool)
10 
11 followers = []
12 for tnm in r.hkeys(\'zhTopicName\'):
13     tid = r.hget(\'zhTopicName\', tnm)
14     msg = r.hget(\'zhTopicMessage\', tid)
15     if msg:
16         followers.append((tnm.decode(), int(eval(msg)[0])))
17 # 按关注人数取前300的话题
18 hotTopics = sorted(followers, key=lambda follower: follower[1], reverse=True)[0:300]
19 
20 plt.figure(figsize=(10, 6.18))
21 plt.setp(plt.gca(), frame_on=False, xticks=(), yticks=())
22 plt.axis([0, 100, 0, 100])
23 plt.rcParams[\'font.sans-serif\'] = [\'SimHei\']
24 
25 randC = np.linspace(-20, 100, 300)
26 
27 colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
28 by_hsv = sorted((tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(color)[:3])), name)
29                 for name, color in colors.items())
30 sorted_names = [name for hsv, name in by_hsv]
31 random.shuffle(sorted_names)
32 
33 random.shuffle(randC)
34 
35 rankings = 0
36 for topic in hotTopics:
37     plt.text(randC[rankings], randC[299 - rankings], topic[0], fontsize=(300 - rankings) / 10 + 10,
38              rotation=random.randint(-30, 30), style=\'italic\', alpha=0.75,
39              color=sorted_names[rankings % len(sorted_names)])
40     rankings = rankings + 1
41 
42 plt.show()

运行结果：

以上是关于python知乎内容抓取（redis存储）的主要内容，如果未能解决你的问题，请参考以下文章