Scraping Zhihu User Information with Scrapy

Posted by wanglinjie

This post walks through crawling Zhihu user information with Scrapy: starting from a seed user, the spider follows each user's followees and followers and stores every profile in MongoDB.

Create the project
scrapy startproject zhihuuser

scrapy genspider zhihu zhihu.com
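
These two commands scaffold the project and generate a spider stub named zhihu restricted to zhihu.com. The resulting layout looks roughly like this (Scrapy's default template; exact files vary slightly across versions):

zhihuuser/
    scrapy.cfg
    zhihuuser/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            zhihu.py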

 

items.py

from scrapy import Item, Field

class UserItem(Item):
    # Fields of a single Zhihu user profile
    id = Field()
    name = Field()
    avatar_url = Field()
    headline = Field()
    description = Field()
    url = Field()
    url_token = Field()
    gender = Field()
    cover_url = Field()
    type = Field()
    badge = Field()

    answer_count = Field()
    articles_count = Field()
    commercial_question_count = Field()
    favorite_count = Field()
    favorited_count = Field()
    follower_count = Field()
    following_columns_count = Field()
    following_count = Field()
    pins_count = Field()
    question_count = Field()
    thank_from_count = Field()
    thank_to_count = Field()
    thanked_count = Field()
    vote_from_count = Field()
    vote_to_count = Field()
    voteup_count = Field()
    following_favlists_count = Field()
    following_question_count = Field()
    following_topic_count = Field()
    marked_answers_count = Field()
    mutual_followees_count = Field()
    hosted_live_count = Field()
    participated_live_count = Field()

    locations = Field()
    educations = Field()
    employments = Field()
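
A Scrapy Item behaves like a dict, except that only declared fields may be set, which catches typos early. A quick sketch (hypothetical values):

item = UserItem()
item['url_token'] = 'excited-vczh'  # fine: the field is declared above
item['name'] = 'vczh'
# item['nickname'] = 'x'            # raises KeyError: UserItem does not support field: nickname
print(dict(item))                   # {'url_token': 'excited-vczh', 'name': 'vczh'}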

zhihu.py

import json

from scrapy import Spider, Request
from zhihuuser.items import UserItem


class ZhihuSpider(Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    # Zhihu API endpoints for a user's profile, followees, and followers
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    # Seed account from which the crawl starts
    start_user = 'excited-vczh'
    # include= parameters selecting which fields each endpoint returns
    user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        # Seed the crawl with the start user's profile, followees, and followers
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)
        yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, limit=20, offset=0),
                      self.parse_follows)
        yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, limit=20, offset=0),
                      self.parse_followers)

    def parse_user(self, response):
        result = json.loads(response.text)
        item = UserItem()

        # Copy every declared item field that appears in the API response
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item

        # Queue this user's followees and followers for further crawling
        yield Request(
            self.follows_url.format(user=result.get('url_token'), include=self.follows_query, limit=20, offset=0),
            self.parse_follows)

        yield Request(
            self.followers_url.format(user=result.get('url_token'), include=self.followers_query, limit=20, offset=0),
            self.parse_followers)

    def parse_follows(self, response):
        results = json.loads(response.text)

        # Each followee in this page becomes a new profile request
        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.parse_user)

        # Follow the pagination link until the API reports the last page
        if 'paging' in results.keys() and results.get('paging').get('is_end') is False:
            next_page = results.get('paging').get('next')
            yield Request(next_page,
                          self.parse_follows)

    def parse_followers(self, response):
        results = json.loads(response.text)

        # Each follower in this page becomes a new profile request
        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.parse_user)

        if 'paging' in results.keys() and results.get('paging').get('is_end') is False:
            next_page = results.get('paging').get('next')
            yield Request(next_page,
                          self.parse_followers)
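
For reference, the followees and followers endpoints return JSON shaped roughly like the sketch below. Only the keys the spider reads are shown, and the values are illustrative; real responses carry many more fields:

# Trimmed sketch of one followees page; values are made up.
response_sketch = {
    'data': [
        {'url_token': 'some-user', 'name': '...', 'answer_count': 42},
        # ... up to `limit` user records per page
    ],
    'paging': {
        'is_end': False,  # True on the last page, which stops the pagination requests
        'next': 'https://www.zhihu.com/api/v4/members/excited-vczh/followees?offset=20&limit=20',
    },
}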

pipelines.py

import pymongo


class ZhihuPipeline(object):
    def process_item(self, item, spider):
        return item


class MongoPipeline(object):
    collection_name = 'users'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the MongoDB connection settings from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert keyed on url_token so each user is stored only once,
        # even when the same profile is reached via different users
        self.db[self.collection_name].update_one(
            {'url_token': item['url_token']}, {'$set': dict(item)}, upsert=True)
        return item

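After the crawl has run for a while, a few lines of pymongo (3.7+ for count_documents) can sanity-check what landed in the database, assuming the localhost settings shown below:

import pymongo

client = pymongo.MongoClient('localhost')
db = client['zhihu']
print(db['users'].count_documents({}))  # number of unique users stored so far
for user in db['users'].find().limit(3):
    print(user.get('url_token'), user.get('name'))
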
settings.py
BOT_NAME = 'zhihuuser'

SPIDER_MODULES = ['zhihuuser.spiders']
NEWSPIDER_MODULE = 'zhihuuser.spiders'

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}

ITEM_PIPELINES = {
    'zhihuuser.pipelines.MongoPipeline': 300,
    # 'scrapy_redis.pipelines.RedisPipeline': 301,
}

MONGO_URI = 'localhost'
MONGO_DATABASE = 'zhihu'
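
With the pipeline enabled and MongoDB running locally, start the crawl from the project root:

scrapy crawl zhihu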

 

