1.1 BeautifulSoup介绍
1、BeautifulSoup作用
1、BeautifulSoup是一个模块,该模块用于接收一个HTML或XML字符串,然后将其解析(格式化)成可遍历的对象
2、之后便可以使用它提供的方法快速查找指定元素,从而使得在HTML或XML中查找指定元素变得简单
2、安装
pip3 install beautifulsoup4
pip install lxml #lxml是一个高性能的第三方HTML/XML解析库,可作为BeautifulSoup的解析器使用(居然直接用pip就安装成功了)
3、lxml与html.parser比较
1. 两者都是把文本转成对象的方法,lxml是第三方库,但是性能好(生产用这个),html.parser 是python内置模块无需安装
2. soup = BeautifulSoup(response.text,features='lxml') #lxml是第三方库,但是性能好(生产用这个)
3. soup = BeautifulSoup(response.text,features='html.parser') # html.parser 是python内置模块无需安装
4、lxml结合BeautifulSoup举例
from bs4 import BeautifulSoup

# Minimal HTML document used by the lookups below.
html_doc = """
<html><head><title>The Dormouse‘s story</title></head>
<body>
<a class="c1" id="i1" name="ha">i am a</a>
<a class="c1" id="link2" name="ha">i am a</a>
</body>
</html>
"""

# Parse the document with the third-party lxml parser (must be installed).
soup = BeautifulSoup(html_doc, features="lxml")

# 1. find() returns the first <a> tag.
tag1 = soup.find(name='a')
# 2. find_all() returns every <a> tag as a list.
tag2 = soup.find_all(name='a')
# 3. select() takes a CSS selector; '#link2' matches the tag with id="link2".
#    The result is a list even when only one tag matches.
tag3 = soup.select('#link2')

print(tag1)
# <a class="c1" id="i1" name="ha">i am a</a>
print(tag2)
# [<a class="c1" id="i1" name="ha">i am a</a>, <a class="c1" id="link2" name="ha">i am a</a>]
print(tag3)
# [<a class="c1" id="link2" name="ha">i am a</a>]
1.2 BeautifulSoup常用方法
1、name,标签名称(tag.name)
from bs4 import BeautifulSoup

# Minimal HTML document used by the example below.
html_doc = """
<html><head><title>The Dormouse‘s story</title></head>
<body>
<a class="c1" id="i1" name="ha">i am a</a>
<a class="c1" id="link2" name="ha">i am a</a>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")

# Grab the first <a> tag.
tag = soup.find('a')
# tag.name is the element name; for an <a> tag this prints "a".
print(tag.name)
# Assigning to tag.name renames the element in place: <a> becomes <span>.
tag.name = 'span'

# The change is visible in the whole document, not just in `tag`.
print(soup)
# <html><head><title>The Dormouse‘s story</title></head>
# <body>
# <span class="c1" id="i1" name="ha">i am a</span>
# <a class="c1" id="link2" name="ha">i am a</a>
# </body>
# </html>
2、attrs,标签属性(tag.attrs)
from bs4 import BeautifulSoup

# Minimal HTML document used by the example below.
html_doc = """
<html><head><title>The Dormouse‘s story</title></head>
<body>
<a class="c1" id="i1" name="ha">i am a</a>
<a class="c1" id="link2" name="ha">i am a</a>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")
tag = soup.find('a')

# tag.attrs is a dict of all attributes on the tag. The multi-valued
# "class" attribute is exposed as a list.
attrs = tag.attrs
print(attrs)
# e.g. {'class': ['c1'], 'id': 'i1', 'name': 'ha'}

# Rebinding tag.attrs replaces every existing attribute; only ik="123" remains.
tag.attrs = {'ik': 123}
# Item assignment adds (or overwrites) a single attribute on top of that.
tag.attrs['id'] = 'iiiii'

print(soup)
# The first <a> now carries only the two attributes set above
# (rendered in dict insertion order):
# <a ik="123" id="iiiii">i am a</a>
3、children,所有直接子节点(子标签,也包含标签之间的文本节点)
from bs4 import BeautifulSoup

# Minimal HTML document used by the example below.
html_doc = """
<html><head><title>The Dormouse‘s story</title></head>
<body>
<a class="c1" id="i1" name="ha">i am a</a>
<a class="c1" id="link2" name="ha">i am a</a>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")
body = soup.find('body')

# .children iterates over the direct children of <body> only (no grandchildren).
# Besides the two <a> tags it also yields the whitespace text nodes between
# them, which show up as blank lines in the output.
v = body.children
for tag in v:
    print(tag)
# Output (blank lines from whitespace text nodes omitted):
# <a class="c1" id="i1" name="ha">i am a</a>
# <a class="c1" id="link2" name="ha">i am a</a>
4、descendants,所有子子孙孙节点(标签以及其中的文本节点)
from bs4 import BeautifulSoup

# Minimal HTML document used by the example below.
html_doc = """
<html><head><title>The Dormouse‘s story</title></head>
<body>
<a class="c1" id="i1" name="ha">i am a1</a>
<a class="c1" id="link2" name="ha">i am a2</a>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")
body = soup.find('body')

# .descendants walks the whole subtree under <body>: each tag is followed by
# its own children, so the text inside every <a> is yielded as a separate node.
# Whitespace text nodes between the tags are yielded too and print as blank lines.
v = body.descendants
for tag in v:
    print(tag)
# Output (blank lines from whitespace text nodes omitted):
# <a class="c1" id="i1" name="ha">i am a1</a>
# i am a1
# <a class="c1" id="link2" name="ha">i am a2</a>
# i am a2
5、clear,将标签的所有子标签全部清空(保留标签名)
from bs4 import BeautifulSoup

# Minimal HTML document used by the example below.
html_doc = """
<html><head><title>The Dormouse‘s story</title></head>
<body>
<a class="c1" id="i1" name="ha">i am a1</a>
<a class="c1" id="link2" name="ha">i am a2</a>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")
tag = soup.find('body')

# clear() removes everything inside the tag but keeps the tag itself,
# leaving an empty <body></body> in the document.
tag.clear()

print(soup)
# <html><head><title>The Dormouse‘s story</title></head>
# <body></body>
# </html>
6、decompose,递归地删除该标签及其所有子标签(不保留标签本身)
from bs4 import BeautifulSoup

# Minimal HTML document used by the example below.
html_doc = """
<html><head><title>The Dormouse‘s story</title></head>
<body>
<a class="c1" id="i1" name="ha">i am a1</a>
<a class="c1" id="link2" name="ha">i am a2</a>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")
body = soup.find('body')

# decompose() removes the tag together with all of its contents; unlike
# clear(), the <body> element itself is gone from the document afterwards.
body.decompose()

print(soup)
# <html><head><title>The Dormouse‘s story</title></head>
# </html>
7、extract,递归地删除该标签及其所有子标签,并返回被删除的标签
from bs4 import BeautifulSoup

# Minimal HTML document used by the example below.
html_doc = """
<html><head><title>The Dormouse‘s story</title></head>
<body>
<a class="c1" id="i1" name="ha">i am a1</a>
<a class="c1" id="link2" name="ha">i am a2</a>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")
body = soup.find('body')

# extract() removes the tag (with all of its contents) from the document and
# returns the removed tag, so it can still be inspected or reused.
v = body.extract()

# v is the removed <body> element, contents included.
print(v)
# <body>
# <a class="c1" id="i1" name="ha">i am a1</a>
# <a class="c1" id="link2" name="ha">i am a2</a>
# </body>

# soup is what remains after the removal; no <body> tag (not even an empty
# one) is left behind.
print(soup)
# <html><head><title>The Dormouse‘s story</title></head>
# </html>