Winter Break Study: A Crawler for 首都之窗 (Capital Window) Citizen Letters (Complete Version), 2020.2.6, Python
Posted by zlc364624
Date: 2020.2.6
Today I finished the 首都之窗 citizen-letter crawler that was left half-done yesterday.
The source code is as follows:
import requests
import io
from bs4 import BeautifulSoup
# 信1705-1 赵路仓

kv = {'user-agent': 'Mozilla/5.0'}   # minimal request header so the site does not reject the crawler
id = 'AH20010700179'                 # sample letter ID

def read():
    # Read one letter ID per line from list.txt; each ID is tried against all
    # three detail-page types, and only the matching type writes a record.
    f = open('E://list.txt', 'r')
    for line in f:
        id = line.rstrip('\n')  # strip the trailing newline, otherwise the URL is broken
        print(id)
        url1 = "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=" + id   # 咨询 (consultation)
        url2 = "http://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId=" + id    # 建议 (suggestion)
        url3 = "http://www.beijing.gov.cn/hudong/hdjl/com.web.complain.complainDetail.flow?originalId=" + id  # 投诉 (complaint)
        parser(url1)
        parser2(url2)
        parser3(url3)
    f.close()

def write(contents):
    # Append one field to the result file.
    f = open('E://result.txt', 'a+')
    f.write(contents)
    print(contents, '写入成功!')
    f.close()

def parser(url):
    # Parse a consultation (咨询) detail page and write its 11 fields, separated by "||".
    try:
        r = requests.get(url, headers=kv)
        print(r.status_code)
        demo = r.text
        soup = BeautifulSoup(demo, "html.parser")
        # print(soup.prettify())
        '''print("标题:", soup.find("strong").get_text().lstrip().rstrip())
        print("来信人:", soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})[0].get_text().lstrip('来信人:').lstrip().rstrip())
        print("时间:", soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip('时间:'))
        print("网友同问:", soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip().rstrip().lstrip("网友同问:").lstrip().rstrip())
        print("问题:", soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})[0].get_text().lstrip().rstrip())
        print("官方:", soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})[0].get_text())
        print("回答时间:", soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})[0].get_text().lstrip('答复时间:'))
        print("回答:", soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})[0].get_text().lstrip().rstrip())'''
        # Only a consultation page contains the "网友同问" counter.
        if soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().find('网友同问') != -1:
            write("咨询" + "||")
            write(soup.find("strong").get_text().lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})[0].get_text().lstrip('来信人:').lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip('时间:') + "||")
            write(soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip().rstrip().lstrip("网友同问:").lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})[0].get_text().lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})[0].get_text().lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})[0].get_text().lstrip('答复时间:') + "||")
            write(soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
            write(soup.find_all("a", {"class": "dex_yes font12"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
            write(soup.find_all("a", {"class": "dex_no font12"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
            write('\n')
    except:
        print("咨询爬取失败!")

def parser2(url):
    # Parse a suggestion (建议) detail page; same structure, but the counter text is "网友支持".
    try:
        r = requests.get(url, headers=kv)
        print(r.status_code)
        demo = r.text
        soup = BeautifulSoup(demo, "html.parser")
        # print(soup.prettify())
        '''print("标题:", soup.find("strong").get_text().lstrip().rstrip())
        print("来信人:", soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})[0].get_text().lstrip('来信人:').lstrip().rstrip())
        print("时间:", soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip('时间:'))
        print("网友同问:", soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip().rstrip().lstrip("网友同问:").lstrip().rstrip())
        print("问题:", soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})[0].get_text().lstrip().rstrip())
        print("官方:", soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})[0].get_text())
        print("回答时间:", soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})[0].get_text().lstrip('答复时间:'))
        print("回答:", soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})[0].get_text().lstrip().rstrip())'''
        if soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().find('网友支持') != -1:
            write("建议" + "||")
            write(soup.find("strong").get_text().lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})[0].get_text().lstrip('来信人:').lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip('时间:') + "||")
            write(soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip().rstrip().lstrip("网友支持:").lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})[0].get_text().lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})[0].get_text().lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})[0].get_text().lstrip('答复时间:') + "||")
            write(soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
            write(soup.find_all("a", {"class": "dex_yes font12"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
            write(soup.find_all("a", {"class": "dex_no font12"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
            write('\n')
    except:
        print("建议爬取失败!")

def parser3(url):
    # Parse a complaint (投诉) detail page; the counter text is "网友评价".
    try:
        r = requests.get(url, headers=kv)
        print(r.status_code)
        demo = r.text
        soup = BeautifulSoup(demo, "html.parser")
        # print(soup.prettify())
        if soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().find('网友评价') != -1:
            write("投诉" + "||")
            write(soup.find("strong").get_text().lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})[0].get_text().lstrip('来信人:').lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip('时间:') + "||")
            write(soup.find_all("div", {"class": "col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip().rstrip().lstrip("网友评价数:").lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})[0].get_text().lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})[0].get_text().lstrip().rstrip() + "||")
            write(soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})[0].get_text().lstrip('答复时间:') + "||")
            write(soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
            write(soup.find_all("a", {"class": "dex_yes font12"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
            write(soup.find_all("a", {"class": "dex_no font12"})[0].get_text().lstrip().rstrip().replace(" ", "") + "||")
            write('\n')
    except:
        print("投诉爬取失败!")

if __name__ == "__main__":
    read()
Problems encountered:
I overlooked that read() and readline() keep the trailing newline ('\n') of each line, which caused repeated errors (see the short illustration below). At the beginning I also missed the complaint-type and suggestion-type letters, and added them later.
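A minimal illustration of the newline issue, assuming list.txt holds one ID per line: every line read from the file still ends with '\n', and that newline must be stripped before the ID is appended to a URL.

# sketch: file iteration yields lines that still carry the trailing newline
with open('E://list.txt', 'r') as f:
    for line in f:
        letter_id = line.rstrip('\n')   # e.g. 'AH20010700179\n' -> 'AH20010700179'
        print(repr(line), '->', repr(letter_id))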
The scraped text uses || as the field separator. Each letter is stored as 11 attributes: letter type, title, sender, question time, netizen feedback count (同问/支持/评价), question content, responding department, reply time, reply content, likes, and dislikes, written to E://result.txt.
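As a sketch of how that file could be read back, assuming one letter per line in the field order listed above (the English field names are my own labels, not part of the original script):

# parse result.txt back into dictionaries; the empty field left by the final "||" is ignored
FIELDS = ['type', 'title', 'sender', 'ask_time', 'feedback', 'question',
          'responder', 'reply_time', 'reply', 'likes', 'dislikes']

records = []
with open('E://result.txt', 'r') as f:
    for line in f:
        parts = line.rstrip('\n').split('||')
        if len(parts) >= len(FIELDS):
            records.append(dict(zip(FIELDS, parts)))
print(len(records), 'letters loaded')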
The page IDs (txt format) are available on Baidu Netdisk: https://pan.baidu.com/s/1GvF8Kllvv-vqBblgWnA-LQ
For installing and using BeautifulSoup, see my blog post: https://www.cnblogs.com/zlc364624/p/12264070.html
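For quick reference, a minimal install-and-parse sketch (the HTML string here is only a placeholder for illustration):

# pip install requests beautifulsoup4
from bs4 import BeautifulSoup

html = "<html><body><strong>示例标题</strong></body></html>"   # placeholder HTML
soup = BeautifulSoup(html, "html.parser")
print(soup.find("strong").get_text())   # -> 示例标题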