项目练习 —— 贴吧爬虫
Posted by sure-feng
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了项目练习 —— 贴吧爬虫相关的知识,希望对你有一定的参考价值。
# -*- coding:utf-8 -*-
# Author:Sure Feng

import requests


class TiebaSpider(object):
    """Crawl the first 10 listing pages of a Baidu Tieba forum and save each page to a file."""

    def __init__(self, tieba_name):
        # Forum name; reused as the prefix of the saved file names.
        self.tieba_name = tieba_name
        # Listing-page URL template; pn is the 0-based post offset (50 posts per page).
        self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        # Desktop-browser User-Agent so the server returns the regular HTML page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}

    def get_url_list(self):
        """Build and return the URLs of the first 10 listing pages (offset step 50)."""
        return [self.url_temp.format(i * 50) for i in range(10)]

    def parse_url(self, url):
        """Send a GET request for *url* and return the response body decoded as UTF-8."""
        # timeout keeps the crawler from hanging forever on a stalled connection.
        respond = requests.get(url, headers=self.headers, timeout=10)
        return respond.content.decode()

    def save_html(self, html_str, page_number):
        """Write one page's HTML to a file named "<forum> 第<n>页"."""
        file_name = "{} 第{}页".format(self.tieba_name, page_number)
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(html_str)

    def run(self):
        """Entry point: build the URL list, fetch every page, and save each one."""
        url_list = self.get_url_list()
        print(url_list)
        # enumerate replaces the original url_list.index(url) lookup, which cost
        # O(n) per iteration and would repeat page numbers on duplicate URLs.
        for page_number, url in enumerate(url_list, start=1):
            html_str = self.parse_url(url)
            self.save_html(html_str, page_number)


if __name__ == "__main__":
    tieba_spider = TiebaSpider("棋魂")
    tieba_spider.run()
以上是关于项目练习 —— 贴吧爬虫的主要内容,如果未能解决你的问题,请参考以下文章