Python Web Scraping: bs4 Parsing in Practice
Posted by 知我几分
1. Common Methods
from bs4 import BeautifulSoup

html = """
<table class="tablelist" cellpadding="0" cellspacing="0">
    <tr class="h">
        <td class="l" width="374">职位名称</td>
        <td>职位类别</td>
        <td>人数</td>
        <td>地点</td>
        <td>发布时间</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=45021&keywords=python&tid=0&lid=0">22989-腾讯云计费PHP高级开发工程师</a></td>
        <td>技术类</td>
        <td>2</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=45005&keywords=python&tid=0&lid=0">25663-腾讯云高级后台开发(互联网业务)(北京)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>北京</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=45007&keywords=python&tid=0&lid=0">TEG06-云计算架构师(深圳)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44980&keywords=python&tid=0&lid=0">PCG04-PCG研发部数据科学家(深圳/北京)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44981&keywords=python&tid=0&lid=0">PCG04-PCG研发部业务运维工程师(深圳)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44971&keywords=python&tid=0&lid=0">23674-腾讯新闻大数据分析工程师(北京)</a></td>
        <td>技术类</td>
        <td>2</td>
        <td>北京</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44964&keywords=python&tid=0&lid=0">TEG05-高级数据挖掘工程师(深圳)</a></td>
        <td>技术类</td>
        <td>2</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44968&keywords=python&tid=0&lid=0">PCG01-QQ后台推荐算法工程师</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="even">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44969&keywords=python&tid=0&lid=0">PCG01-QQ后台大数据开发工程师</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
    <tr class="odd">
        <td class="l square"><a target="_blank" href="position_detail.php?id=44952&keywords=python&tid=0&lid=0">22989-腾讯云AI产品高级咨询顾问(深圳北京)</a></td>
        <td>技术类</td>
        <td>1</td>
        <td>深圳</td>
        <td>2018-10-23</td>
    </tr>
</table>
"""

soup = BeautifulSoup(html, "lxml")

# 1. Find all tr tags
# trs = soup.find_all("tr")

# 2. Find the second tr tag; limit caps how many results are returned,
#    then index the returned list to get the specific tag
# tr = soup.find_all("tr", limit=2)[1]

# 3. Find all tr tags whose class is "even"; class clashes with the
#    Python keyword, so bs4 uses class_ with a trailing underscore
# trs = soup.find_all("tr", class_="even")

# 4. attrs accepts any number of attributes as key-value pairs
# trs = soup.find_all("tr", attrs={"class": "even"})

# 5. Find all a tags that have a target attribute; several keyword
#    arguments can be combined in one call
# aList = soup.find_all("a", target="_blank")

# 6. Get the href attribute of every a tag
# aList = soup.find_all("a")
# for a in aList:
#     # 1. By subscripting the tag
#     href = a["href"]
#     # 2. Via the attrs property
#     href = a.attrs["href"]

# Collect every job posting, skipping the first (header) row
trs = soup.find_all("tr")[1:]
jobs = []
for tr in trs:
    job = {}
    # tds = tr.find_all("td")
    # title = tds[0].string
    # category = tds[1].string
    # nums = tds[2].string
    # city = tds[3].string
    # pubtime = tds[4].string
    # job["title"] = title
    # job["category"] = category
    # job["nums"] = nums
    # job["city"] = city
    # job["pubtime"] = pubtime
    # jobs.append(job)
    # Get all text in the row with whitespace stripped
    infos = list(tr.stripped_strings)
    job["title"] = infos[0]
    job["category"] = infos[1]
    job["nums"] = infos[2]
    job["city"] = infos[3]
    job["pubtime"] = infos[4]
    jobs.append(job)
print(jobs)
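A small aside the snippet above implies but never states: find() is the single-result counterpart of find_all(), returning the first matching tag directly (or None when nothing matches). A minimal sketch reusing the soup built above:

# find() returns the first match directly instead of a list
first_even = soup.find("tr", class_="even")
# Roughly equivalent to: soup.find_all("tr", class_="even", limit=1)[0]
print(list(first_even.stripped_strings))
# ['22989-腾讯云计费PHP高级开发工程师', '技术类', '2', '深圳', '2018-10-23']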
2. CSS Selector Methods
# 1. Get all tr tags
# trs = soup.select("tr")

# 2. Get the second tr tag
# tr = soup.select("tr")[1]

# 3. Get all tr tags whose class is "even"
# trs = soup.select("tr.even")
# trs = soup.select("tr[class='even']")

# 4. Get the href attribute of every a tag
# aList = soup.select("a")
# for a in aList:
#     print(a["href"])

# 5. Extract all the job information
# trs = soup.select("tr")
# for tr in trs:
#     infos = list(tr.stripped_strings)
#     print(infos)
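On BeautifulSoup 4.7+ (which delegates CSS matching to the soupsieve package), select() also understands combinators and attribute selectors, which can replace several of the find_all() calls from section 1. A sketch reusing the same soup:

# Descendant selector: a tags inside td tags carrying both classes l and square
links = soup.select("td.l.square a")

# Attribute prefix selector: href values starting with "position_detail"
for a in soup.select('a[href^="position_detail"]'):
    print(a.string, a["href"])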
from bs4 import BeautifulSoup

html = """
<div>
    <!--我是div-->
</div>
"""

# Under the hood everything is a Tag: BeautifulSoup builds Tag instances,
# and you call Tag methods on them
soup = BeautifulSoup(html, "lxml")
div = soup.find("div")
print(type(div))  # <class 'bs4.element.Tag'>

# string returns the tag's single direct child string; when the content
# is spread across several children (e.g., across lines) it cannot be
# printed and returns None
print(div.string)

# contents returns all children of the tag as a list
print(div.contents)

# children returns all children of the tag as an iterator
print(div.children)
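Two caveats on the snippet above: printing div.children only shows the iterator object itself, so you have to loop over it to see the nodes, and the HTML comment comes back as bs4's Comment type, a subclass of str. A small sketch continuing from the same div:

from bs4 import Comment

# children is lazy: iterate to see the actual child nodes
for child in div.children:
    print(repr(child))

# The comment node is a bs4.element.Comment (a str subclass), so its
# text can be pulled out with a string filter
comment = div.find(string=lambda s: isinstance(s, Comment))
print(comment)  # 我是div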
3. Scraping the China Weather Site and Visualizing the Results
"""中国天气网爬取并视图显示最低气温城市""" import requests from bs4 import BeautifulSoup from pyecharts import Bar HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36"} ALL_DATA = [] def detail_urls(url): rep = requests.get(url=url, headers=HEADERS) text = rep.content.decode(encoding="utf-8") # 港澳表格标签残缺需要补缺能力强的html5lib补齐表格标签 soup = BeautifulSoup(text, "html5lib") # 找到第一个属性为conMidtab的div标签 commidtab = soup.find("div", class_="conMidtab") # 找到这个div下的所有table tables = commidtab.find_all("table") # 循环每一个table for table in tables: # 排除介绍部分 trs = table.find_all("tr")[2:] # 省份和直辖市两种情况 for index, tr in enumerate(trs): tds = tr.find_all("td") city_td = tds[0] if index == 0: city_td = tds[1] # 获取所有文本并去掉空格 city = list(city_td.stripped_strings)[0] min_temp_td = tds[-2] min_temp = list(min_temp_td.stripped_strings)[0] max_temp_td = tds[-5] max_temp = list(max_temp_td.stripped_strings)[0] ALL_DATA.append({"city": city, "min_temp": int(min_temp), "max_temp": int(max_temp)}) def spider(): base_url = "http://www.weather.com.cn/textFC/{}.shtml" # 页数较少所以直接拿 address = ["hb", "db", "hd", "hz", "hn", "xb", "xn", "gat"] for i in range(len(address)): url = base_url.format(address[i]) # 将生成的传递给页面解析函数 get_detail_urls = detail_urls(url) ALL_DATA.sort(key=lambda data: data["min_temp"]) datas = ALL_DATA[0:10] cities = list(map(lambda x: x["city"], datas)) min_temp = list(map(lambda x: x["min_temp"], datas)) max_temp = list(map(lambda x: x["max_temp"], datas)) bar = Bar("中国最低气温排行榜") bar.add("最低气温", cities, min_temp, mark_line=["average"], mark_point=["max", "min"]) bar.add("最高气温", cities, max_temp, mark_line=["average"], mark_point=["max", "min"]) bar.render("temperature.html") if __name__ == \'__main__\': spider()
4. Summary
"""由于网络的不确定性,要保持一个程序的正常运行就得在代码中处理好 各种可能会发生的异常,以确保程序正常运行""" from urllib.request import urlopen from urllib.error import HTTPError from bs4 import BeautifulSoup def getTitle(url): try: # 请求相关的错误,比如请求不到网页 html = urlopen(url) # 进行捕捉,并返回友好形式 except HTTPError as e: return None try: """print(html) from http.client import HTTPResponse 调用HTTPResponse的read()方法,返回bytes类型数据 print(type(html.read())) pycharmIDE 命令ctrl+b 进入BeautifulSoup源码,查看所需参数, 第一个为请求返回结果,第二个为解析返回数据的解析器,可选择lxml,html5lib等解析器""" htmlTag = BeautifulSoup(html.read(), "html.parser") # 标签选择器,选择h1标签 title = htmlTag.body.h1 except AttributeError as e: # 页面可能没有这个标签属性,进行捕捉,并返回友好形式 return None # 函数运行成功,返回执行结果 return title # 调用执行函数,获得返回结果 title = getTitle("http://jandan.net/") # 判断返回结果的类型,根据结果类型做处理 if title == None: print("Title could not be found") else: # 打印成功执行结果 print(title)
A brief introduction to the map() function: https://www.cnblogs.com/superxuezhazha/p/5714970.html
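For readers who skip the link: map() applies a function to every element of an iterable, which is exactly how spider() pulls the city names and temperatures out of the sorted dictionaries. A short illustration with made-up sample records shaped like the entries in ALL_DATA:

datas = [{"city": "哈尔滨", "min_temp": -20}, {"city": "长春", "min_temp": -18}]
print(list(map(lambda x: x["city"], datas)))  # ['哈尔滨', '长春']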