beautiful模块

Posted sunch

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了beautiful模块相关的知识,希望对你有一定的参考价值。

from bs4 import BeautifulSoup

# Sample document shared by every example in this script. The markup is kept
# exactly as written (including the loose text nodes such as "asdf" and
# "ad<br/>sf") because the examples below navigate and print it.
html_doc = """
<html><head><title>The Dormouse‘s story</title></head>
<body>
asdf
    <div class="title">
        <b>The Dormouse‘s story总共</b>
        <h1>f</h1>
    </div>
<div class="story">Once upon a time there were three little sisters; and their names were
    <a  class="sister0" id="link1">Els<span>f</span>ie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""

# The parser is selected by name, so it has to be passed as a string; a bare
# `lxml` is an undefined name and raises NameError before anything is parsed.
soup = BeautifulSoup(html_doc, features='lxml')
# 前戏
# tag1 = soup.find(‘a‘)
# print(tag1)
# tag2 = soup.find_all(‘a‘)
# for tag in tag2:
#     print(tag.text)
# 找到id=link2的标签
# tag3 = soup.select(‘#link2‘)
# print(tag3)
#
# tag4 = soup.find(‘‘, id=‘link2‘)
# print(tag4)
#
# tag5 = soup.select(‘.title‘)
# print(tag5, type(tag5[0]))

# 1 name
# tag = soup.find(‘a‘)
# print(tag)
# print(tag.name)
#
# tag.name = ‘span‘
# print(soup)

# 2 attr
# tag = soup.find(‘a‘)
# attrs = tag.attrs
# print(attrs)
# print(‘xxxxxx‘, tag.get(‘class‘))
#
# tag.attrs = {‘ik‘: 123}
# tag.attrs[‘id‘] = ‘iiiii‘
# print(tag)

# 3 children    所有子标签
# body = soup.find(‘body‘)
# v = body.children
# child_list = []
# for i in v:
#     print(‘分割线‘.center(120, ‘#‘))
#     print(i)

# 4 descendants 所有子子孙孙
# body = soup.find('body')
# v = body.descendants
# for i in v:
#     print('分割线'.center(120, '#'))
#     print(i)

# 5 decompose 递归删除当前标签及其所有子标签
# body = soup.find('body')
# body.decompose()
# print(soup)

# 6 clear 将标签的所有子标签全部清空(保留标签名)
# body = soup.find(‘body‘)
# body.clear()
# print(soup)

# 7 extract,递归的删除所有的标签,并获取删除的标签
# body = soup.find(‘body‘)
# v = body.extract()
# print(soup)
# print(‘xxxxxxx‘, v)

# 8 decode,转换为字符串(含当前标签);decode_contents(不含当前标签)
# body = soup.find(‘body‘)
# print(‘没转化之前‘, type(body), body)
# print("$$$$$$$$$$$$$$$$$")
# v = body.decode()
# v1 = body.decode_contents()
# print(v, type(v))
# print("$$$$$$$$$$$$$$$$$")
# print(v1, type(v1))

# 9. encode,转换为字节(含当前标签);encode_contents(不含当前标签)
# body = soup.find(‘body‘)
# v = body.encode()
# v1 = body.encode_contents()
# print(v)
# print(‘#‘.center(120, ‘#‘))
# print(v1)

# 10. find,获取匹配的第一个标签
# tag = soup.find(‘a‘)
# tag = soup.find(‘a‘, attrs={‘class‘: ‘sister‘}, recursive=True, text=‘Lacie‘)   # recursive 递归
# tag = soup.find(‘a‘, id=‘link2‘)
# print(tag)

# 11. find_all,获取匹配的所有标签
# tags = soup.find_all(‘a‘)
# tags = soup.find_all(‘a‘, limit=1)
# tags = soup.find_all(‘a‘, attrs={‘class‘: ‘sister‘})
# tags = soup.find_all(‘a‘, attrs={‘class‘: ‘sister‘}, text=‘Lacie‘)
# print(tags)

# 列表
# v = soup.find_all(name=[‘a‘, ‘div‘])
#
# v1 = soup.find_all(name=‘a‘)
# v2 = soup.find_all(name=‘div‘)
# # v = soup.find_all(href=rep)
# print(v)
#
# print("&".center(120, ‘#‘))
# print(v1)
# print("&".center(120, ‘#‘))
# print(v2)

# v = soup.find_all(name=[‘a‘, ‘div‘])  # v1 = soup.find_all(name=‘a‘) + v2 = soup.find_all(name=‘div‘)
# v = soup.find_all(class_=[‘sister0‘, ‘sister‘])
# v = soup.find_all(text=‘Tillie‘)
# v = soup.find_all(id=[‘link1‘, ‘link2‘])
# v = soup.find_all(href=["http://example.com/lacie", "http://example.com/tillie"])
# print(v)

# 正则
import re

# rep = re.compile(‘p‘)
# rep = re.compile(‘^p‘)
# v = soup.find_all(name=rep)
# print(v)

# rep = re.compile(‘sister.*‘)
# v = soup.find_all(class_=rep)
# print(v)

# rep = re.compile(‘http://example.com.*‘)
#
# v = soup.find_all(href=rep)
# print(v)

# 方法筛选


# def func(tag):
#     return tag.has_attr(‘class‘) and tag.has_attr(‘id‘)
#
#
# v = soup.find_all(name=func)
# print(v)

# get 获取属性
# tag = soup.find(‘a‘)
# v = tag.get(‘id‘)
# print(v)

# 12. has_attr,检查标签是否具有该属性
# tag = soup.find(‘a‘)
# v = tag.has_attr(‘id‘)
# print(v)

# 13. get_text,获取标签内部文本内容(第一个参数是各段文本之间的分隔符,不是属性名)
# tag = soup.find('a')
# v = tag.get_text('id')
# print(tag)
# print(v)

# 14. index,检查标签在某标签中的索引位置
# tag = soup.find(‘body‘)
# v = tag.index(tag.find(‘p‘))
# print(tag)
# print(v)

# tag = soup.find("body")
# for i, v in enumerate(tag):
#     print(i, v)

# 15. is_empty_element,是否是空标签(是否可以是空)或者自闭合标签,
# 判断是否是如下标签:‘br‘ , ‘hr‘, ‘input‘, ‘img‘, ‘meta‘,‘spacer‘, ‘link‘, ‘frame‘, ‘base‘
# tag = soup.find(‘br‘)
# v = tag.is_empty_element
# print(tag)
# print(v)

# 16. 当前的关联标签
# div = soup.find('div')
# print(div)
# print(div.next)
# print(div.next_element)
# print(div.next_elements)
# print(div.next_sibling)
# print(div.next_siblings)

# tag = soup.find(‘a‘)
# print(tag)
# print(tag.previous)
# print(tag.previous_element)
# print(tag.previous_elements)
# print(tag.previous_sibling)
# print(tag.previous_siblings)

# print(tag.parent)
# print(tag.parents)

# 17. 查找某标签的关联标签        #  参数同find_all
# tag = soup.find('a')
# print(tag.parent)
# print(tag.find_next())    # 文档顺序中的下一个标签(可以是内嵌的子标签)
# print(tag.find_all_next())
# print(tag.find_next_sibling())    # 下一个兄弟
# print(tag.find_next_siblings())   # 后面的所有兄弟
# print(tag.find_previous())  # 文档顺序中的上一个标签(此处恰好是父标签,但并不总是上一级)
# print(tag.find_all_previous())

# tag1 = soup.find_all(‘a‘)[1]
# # print(tag1)
# # print(tag1.find_previous_sibling())  # 前一个兄弟
# # print(tag1.find_previous_siblings())  # 前面的兄弟们

# print(tag.find_parent())    # tag.parent
# print(tag.find_parents())    # tag.parents

# 18. select,select_one, CSS选择器
# print(soup.select('title'))
# print(soup.select('p:nth-of-type(3)'))
# print(soup.select('body a'))  # 相当于 soup.find('body').find_all('a')
# soup.select("html head title")
# tag = soup.select("div,a")
# tag = soup.select("head > title")     # 注意空格
# tag = soup.select("div > a")    # 注意空格
# tag = soup.select("p > a:nth-of-type(2)")
# tag = soup.select("p > #link1")
# tag = soup.select("body > a")
# tag = soup.select("#link1 ~ .sister") # 同级往下所有
# tag = soup.select("#link1 + .sister") # 同级往下一个
# tag = soup.select(".sister")  # class
# tag = soup.select("[class~=sister]")  # 属性
# tag = soup.select("#link1")   # id
# tag = soup.select("a#link2")  # a标签里的id=link2
# tag = soup.select('a[href]')  # 属性
# tag = soup.select('a[href="http://example.com/lacie"]')     # 完全匹配
# tag = soup.select('a[href^="http://example.com/"]')  # 开头匹配
# tag = soup.select('a[href$="tillie"]')  # 结尾匹配
# tag = soup.select('a[href*=".com/"]')  # 随意包含
# print(tag)

from bs4.element import Tag


# def default_condition_generator(tag):
#     """找出含有href的标签"""
#     for child in tag.descendants:
#         if not isinstance(child, Tag):
#             continue
#         if not child.has_attr(‘href‘):
#             continue
#         yield child


# tags = soup.find(‘body‘).select(‘a‘, _candidate_generator=default_condition_generator)
# tags = soup.find(‘body‘).select(‘a‘, _candidate_generator=default_condition_generator, limit=1)
# print(type(tags), tags)


# 19. 标签的内容
# tag = soup.find(‘span‘)
# print(tag.string)   # 获取
# tag.string = ‘hello world‘  # 设置
# print(soup)

# tag = soup.find(‘body‘)
# print(tag.string)
# tag.string = ‘xxx‘
# print(soup)

# tag = soup.find(‘body‘)
# v = tag.stripped_strings  # 递归内部获取所有标签的文本
# for i in v:
#     print(i)

# tag = soup.find(‘body‘)
# print(tag.text)

# 20 append在当前标签【内部追加】一个标签
# tag = soup.find(‘body‘)
# tag.append(soup.find(‘a‘))  # <a class="sister0" id="link1">Els<span>f</span>ie</a></body>
# print(soup)

# from bs4.element import Tag
# obj = Tag(name=‘i‘,attrs={‘id‘: ‘it‘})
# obj.string = ‘我是一个新来的‘
# tag = soup.find(‘body‘)
# tag.append(obj)
# print(soup)

# 21.insert在当前标签内部指定位置插入一个标签
# from bs4.element import Tag
# obj = Tag(name=‘i‘, attrs={‘id‘: ‘it‘})
# obj.string = ‘我是一个新来的‘
# tag = soup.find(‘body‘)
# tag.insert(2, obj)  # 在索引为2的位置插入
# print(soup)

# 22. insert_after,insert_before 在当前标签后面或前面插入
# from bs4.element import Tag
# obj = Tag(name=‘i‘, attrs={‘id‘: ‘it‘})
# obj.string = ‘我是一个新来的‘
# tag = soup.find(‘body‘)
# tag.insert_before(obj)
# # tag.insert_after(obj)
# print(soup)

# 23. replace_with 将当前标签替换为指定标签
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)

# 24. 创建标签之间的关系
# tag = soup.find(‘div‘)
# a = soup.find(‘a‘)
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)

# 25. wrap,用指定标签把当前标签包裹起来
# from bs4.element import Tag
# obj1 = Tag(name=‘div‘, attrs={‘id‘: ‘it‘})
# obj1.string = ‘我是一个新来的‘
#
# tag = soup.find(‘a‘)
# v = tag.wrap(obj1)
# print(soup)

# tag = soup.find(‘a‘)
# v = tag.wrap(soup.find(‘p‘))
# print(soup)

# 26. unwrap: remove the current tag from the tree but keep its contents in place
# The tag name must be a string; a bare `a` is an undefined name (NameError).
tag = soup.find('a')
# unwrap() returns the tag that was removed (now emptied of its children),
# not the content it used to wrap; the children stay in the document.
v = tag.unwrap()
print(v)
print(soup)

 

以上是关于beautiful模块的主要内容,如果未能解决你的问题,请参考以下文章

Beautiful Soup模块

爬虫-Beautiful模块

Beautiful Soup模块

爬虫-Beautiful Soup模块

如何在 Mac 上安装 Beautiful Soup 模块?

使用python beautiful soup或html模块的电子邮件刮刀