Scraping Douban with requests
Posted by groundcontrol
Douban TV series spider
# coding=utf-8
import requests
import json


class DoubanSpider():
    def __init__(self):
        self.url_temp_list = [
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/tv_american/items?start={}&count=18&loc_id=108288",
             "country": "US"},
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/tv_domestic/items?start={}&count=18&loc_id=108288",
             "country": "CN"},
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/tv_korean/items?start={}&count=18&loc_id=108288",
             "country": "KR"},
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/tv_japanese/items?start={}&count=18&loc_id=108288",
             "country": "JP"},
        ]
        self.headers = {
            "Referer": "https://m.douban.com/tv/american",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        }

    def parse_url(self, url):  # send the request, get the response
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, json_str):  # extract the data
        dict_ret = json.loads(json_str)
        content_list = dict_ret["subject_collection_items"]
        total = dict_ret["total"]
        return content_list, total

    def save_content_list(self, content_list, country):  # save
        with open("douban.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                content["country"] = country
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")  # newline so each item sits on its own line
        print("saved successfully")

    def run(self):  # main logic
        for url_temp in self.url_temp_list:
            num = 0
            total = 100  # assume a first page exists
            while num < total + 18:
                # 1. build the start url
                url = url_temp["url_temp"].format(num)
                # 2. send the request, get the response
                json_str = self.parse_url(url)
                # 3. extract the data
                content_list, total = self.get_content_list(json_str)
                # 4. save
                self.save_content_list(content_list, url_temp["country"])
                # if len(content_list) < 18:
                #     break
                # 5. build the next page's url and loop again
                num += 18


if __name__ == "__main__":
    douban_spider = DoubanSpider()
    douban_spider.run()
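Note how run seeds total with a guess of 100 so the loop body executes at least once, then overwrites it with the real total returned by the API; the commented-out len(content_list) < 18 check is an alternative stop condition.

Because save_content_list appends one JSON object per line, the output file is easy to load back for analysis. A minimal sketch (reading douban.txt back and tallying by country is an illustration, not part of the original post):

import json
from collections import Counter

# Load the items written by save_content_list above,
# assuming douban.txt holds one JSON object per line.
with open("douban.txt", encoding="utf-8") as f:
    items = [json.loads(line) for line in f if line.strip()]

# Tally how many items carry each "country" label the spider attached.
print(Counter(item["country"] for item in items))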
Douban book spider
import requests
import json


class DoubanBook_Spider():
    def __init__(self):
        # note: start must be a {} placeholder so format(num) can page through results
        self.url_temp_list = [
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/book_fiction/items?start={}&count=18&loc_id=0",
             "book": "fiction"},
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/book_nonfiction/items?start={}&count=18&loc_id=0",
             "book": "nonfiction"},
            {"url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/book_classic/items?start={}&count=18&loc_id=0",
             "book": "classic"},
        ]
        self.headers = {
            "Referer": "https://m.douban.com/book/classic",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        }

    def parse_url(self, url):  # send the request, get the response
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, json_str):  # extract the data
        dict_ret = json.loads(json_str)
        content_list = dict_ret["subject_collection_items"]
        total = dict_ret["total"]
        return content_list, total

    def save_content_list(self, content_list, book):  # save
        with open("book_list.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                content["book"] = book
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")  # newline so each item sits on its own line
        print("saved successfully")

    def run(self):  # main logic
        for url_temp in self.url_temp_list:
            num = 0
            total = 100  # assume a first page exists
            while num < total + 18:
                # 1. build the start url
                url = url_temp["url_temp"].format(num)
                # 2. send the request, get the response
                json_str = self.parse_url(url)
                # 3. extract the data
                content_list, total = self.get_content_list(json_str)
                # 4. save
                self.save_content_list(content_list, url_temp["book"])
                # if len(content_list) < 18:
                #     break
                # 5. build the next page's url and loop again
                num += 18


if __name__ == "__main__":
    douban_spider = DoubanBook_Spider()
    douban_spider.run()
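Neither spider sets a timeout or checks the HTTP status code, so a blocked or rate-limited request will either hang or hand an error page to json.loads. A hedged variant of parse_url that could be dropped into either class; the timeout, retry count, and backoff values are assumptions, not from the original code:

import time
import requests

def parse_url(self, url, retries=3):  # hypothetical hardened drop-in
    # timeout, retry count, and backoff are assumed values
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()  # fail on 4xx/5xx instead of decoding an error page
            return response.content.decode()
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)  # simple exponential backoff before retrying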