爬豆瓣阅读遇到的问题
Posted xuezhihao
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬豆瓣阅读遇到的问题相关的知识,希望对你有一定的参考价值。
1. 需要先后发送 GET 和 POST 请求才能获取 response 信息;要把这两个 response 分开处理,最终只返回 POST 请求的 response 信息。
class DoubanSpider(scrapy.Spider):
    """Spider that queries the Douban Read category listing endpoint.

    ``start_requests`` is overridden, so the request issued on start-up is the
    JSON POST to ``/j/kind/`` built below, rather than requests generated from
    ``start_urls``.
    """

    name = 'douban'
    allowed_domains = ['read.douban.com']
    page_num = 0
    # NOTE(review): "?kind/100" looks like a typo for "?kind=100" (compare the
    # Referer header in start_requests) -- confirm before relying on url1.
    url1 = 'https://read.douban.com/category/?kind/100&page='
    start_urls = (
        url1 + str(page_num),
    )

    def start_requests(self):
        """Yield the single POST request that fetches the works listing.

        The payload is serialized to JSON and sent as the request body; the
        response is handled by ``self.parse``.
        """
        url = 'https://read.douban.com/j/kind/'
        headers = {
            "Content-Type": "application/json",
            "Referer": "https://read.douban.com/category/?kind=100&page=0&sort=hot",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/77.0.3865.90 Safari/537.36",
        }
        # GraphQL query sent verbatim to the endpoint. Adjacent literals are
        # concatenated by the parser, so the resulting text is one string.
        query = (
            " query getFilterWorksList($works_ids: [ID!]) { "
            "worksList(worksIds: $works_ids) { "
            "title cover url isBundle url title "
            "author { name url } origAuthor { name url } translator { name url } "
            "abstract editorHighlight isOrigin "
            "kinds { name @skip(if: true) shortName @include(if: true) id } "
            "... on WorksBase @include(if: true) { wordCount wordCountUnit } "
            "... on WorksBase @include(if: true) { isEssay ... on EssayWorks { favorCount } "
            "isNew averageRating ratingCount url } "
            "... on WorksBase @include(if: false) { isColumn isEssay onSaleTime "
            "... on ColumnWorks { updateTime } } "
            "... on WorksBase @include(if: true) { isColumn ... on ColumnWorks { isFinished } } "
            "... on EssayWorks { essayActivityData { title uri tag { name color background "
            "icon2x icon3x iconSize { height } iconPosition { x y } } } } "
            "highlightTags { name } "
            "... on WorksBase @include(if: false) { fixedPrice salesPrice isRebate } "
            "... on EbookWorks { fixedPrice salesPrice isRebate } "
            "... on WorksBase @include(if: true) { ... on EbookWorks { id isPurchased isInWishlist } } "
            "id isOrigin } } "
        )
        payload = {
            "sort": "hot",
            "page": 1,
            "kind": 100,
            "query": query,
            "variables": {},
        }
        # scrapy.Request defaults to GET; the method must be set explicitly so
        # the JSON body is actually delivered as a POST.
        yield scrapy.Request(
            url,
            method="POST",
            headers=headers,
            body=json.dumps(payload),
            callback=self.parse,
        )
2. 分开之后,在 parse 中处理 POST 请求返回的 response 信息。
def parse(self, response):
    """Convert the JSON listing response into one item per work.

    Reads the ``"list"`` array from the JSON body and yields a separate
    ``DoubanspiderItem`` for each entry.

    Raises:
        ValueError: if the response body is not valid JSON.
        KeyError: if the body has no ``"list"`` key, or an entry lacks
            ``title``, ``wordCount`` or ``abstract``.
    """
    self.logger.debug(response.text)
    works = json.loads(response.text)["list"]
    for work in works:
        self.logger.debug(work["title"])

        # A fresh item per work: reusing one instance would hand the same
        # mutable object to every downstream consumer.
        item = DoubanspiderItem()
        item["book"] = work["title"]

        # NOTE(review): the original indexed origAuthor as a dict; the API
        # may return a list of {name, url} objects instead -- accept both.
        orig_author = work.get("origAuthor")
        if isinstance(orig_author, dict):
            orig_author = [orig_author]
        item["author"] = ", ".join(
            entry["name"] for entry in (orig_author or []) if entry.get("name")
        )

        # salesPrice / averageRating are the fields requested by the query;
        # .get() because price is only requested for ebook works.
        item["price"] = work.get("salesPrice")
        item["number"] = work["wordCount"]
        item["grade"] = work.get("averageRating")
        item["info"] = work["abstract"]
        yield item
以上是关于爬豆瓣阅读遇到的问题的主要内容,如果未能解决你的问题,请参考以下文章