BeautifulSoup 模块
Posted sunch
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了 BeautifulSoup(bs4)模块相关的知识,希望对你有一定的参考价值。
# BeautifulSoup (bs4) cheat sheet.
#
# The script parses a small HTML sample into ``soup`` and then walks through
# the Tag / BeautifulSoup API one numbered section at a time.  Every section
# except the last is commented out; uncomment one section at a time to try it.
# Only section 26 (``unwrap``) is active and prints its result.
#
# NOTE(review): this file was recovered from a copy in which all line breaks
# were collapsed and every ASCII apostrophe had been replaced by a typographic
# quote.  Line breaks (including those inside ``html_doc``) are reconstructed;
# confirm whitespace-sensitive outputs (children, index, ...) against a run.
import re

from bs4 import BeautifulSoup
from bs4.element import Tag

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
    <div class="title">
        <b>The Dormouse's story总共</b>
        <h1>f</h1>
    </div>
<div class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""

# Parse the sample with the lxml parser (requires the ``lxml`` package).
soup = BeautifulSoup(html_doc, features='lxml')

# Warm-up
# tag1 = soup.find('a')
# print(tag1)
# tag2 = soup.find_all('a')
# for tag in tag2:
#     print(tag.text)

# Find the tag with id=link2
# tag3 = soup.select('#link2')
# print(tag3)
#
# tag4 = soup.find('', id='link2')
# print(tag4)
#
# tag5 = soup.select('.title')
# print(tag5, type(tag5[0]))

# 1. name: read or change the tag name
# tag = soup.find('a')
# print(tag)
# print(tag.name)
#
# tag.name = 'span'
# print(soup)

# 2. attrs: read or change the tag attributes
# tag = soup.find('a')
# attrs = tag.attrs
# print(attrs)
# print('xxxxxx', tag.get('class'))
#
# tag.attrs = {'ik': 123}
# tag.attrs['id'] = 'iiiii'
# print(tag)

# 3. children: all direct children
# body = soup.find('body')
# v = body.children
# child_list = []
# for i in v:
#     print('分割线'.center(120, '#'))
#     print(i)

# 4. descendants: all children, grandchildren, and so on
# body = soup.find('body')
# v = body.descendants
# for i in v:
#     print('分割线'.center(120, '#'))
#     print(i)

# 5. decompose: recursively remove the tag and everything inside it
# body = soup.find('body')
# body.decompose()
# print(soup)

# 6. clear: remove all contents of the tag (the tag itself is kept)
# body = soup.find('body')
# body.clear()
# print(soup)

# 7. extract: remove the tag recursively and return the removed tag
# body = soup.find('body')
# v = body.extract()
# print(soup)
# print('xxxxxxx', v)

# 8. decode: convert to str (including the current tag);
#    decode_contents: convert to str (excluding the current tag)
# body = soup.find('body')
# print('没转化之前', type(body), body)
# print("$$$$$$$$$$$$$$$$$")
# v = body.decode()
# v1 = body.decode_contents()
# print(v, type(v))
# print("$$$$$$$$$$$$$$$$$")
# print(v1, type(v1))

# 9. encode: convert to bytes (including the current tag);
#    encode_contents: convert to bytes (excluding the current tag)
# body = soup.find('body')
# v = body.encode()
# v1 = body.encode_contents()
# print(v)
# print('#'.center(120, '#'))
# print(v1)

# 10. find: return the first matching tag
# NOTE(review): newer bs4 releases call the ``text`` argument ``string``;
# confirm which spelling your installed version expects.
# tag = soup.find('a')
# tag = soup.find('a', attrs={'class': 'sister'}, recursive=True, text='Lacie')  # recursive: search all descendants
# tag = soup.find('a', id='link2')
# print(tag)

# 11. find_all: return all matching tags
# tags = soup.find_all('a')
# tags = soup.find_all('a', limit=1)
# tags = soup.find_all('a', attrs={'class': 'sister'})
# tags = soup.find_all('a', attrs={'class': 'sister'}, text='Lacie')
# print(tags)

# Filtering with a list
# v = soup.find_all(name=['a', 'div'])
#
# v1 = soup.find_all(name='a')
# v2 = soup.find_all(name='div')
#
# print(v)
#
# print("&".center(120, '#'))
# print(v1)
# print("&".center(120, '#'))
# print(v2)
# v = soup.find_all(name=['a', 'div'])
# i.e. v holds the tags matched by find_all(name='a') together with those
# matched by find_all(name='div')
# v = soup.find_all(class_=['sister0', 'sister'])
# v = soup.find_all(text='Tillie')
# v = soup.find_all(id=['link1', 'link2'])
# v = soup.find_all(href=["http://example.com/lacie", "http://example.com/tillie"])
# print(v)

# Filtering with a regular expression
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)
# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)
# rep = re.compile('http://example.com.*')
#
# v = soup.find_all(href=rep)
# print(v)

# Filtering with a function
# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')
#
#
# v = soup.find_all(name=func)
# print(v)

# get: read an attribute
# tag = soup.find('a')
# v = tag.get('id')
# print(v)

# 12. has_attr: check whether the tag has the given attribute
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)

# 13. get_text: get the text inside the tag
# NOTE(review): the first positional argument of get_text is the separator,
# so 'id' below is used as the joiner between text pieces, not as an
# attribute name -- confirm that this is what was intended.
# tag = soup.find('a')
# v = tag.get_text('id')
# print(tag)
# print(v)

# 14. index: position of a child inside its parent tag
# tag = soup.find('body')
# v = tag.index(tag.find('p'))
# print(tag)
# print(v)
# tag = soup.find("body")
# for i, v in enumerate(tag):
#     print(i, v)

# 15. is_empty_element: whether the tag is an empty (void / self-closing)
#     element, i.e. one of:
#     'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'
# tag = soup.find('br')
# v = tag.is_empty_element
# print(tag)
# print(v)

# 16. Elements related to the current tag
# div = soup.find('div')
# print(div)
# print(div.next)
# print(div.next_element)
# print(div.next_elements)
# print(div.next_sibling)
# print(div.next_siblings)

# tag = soup.find('a')
# print(tag)
# print(tag.previous)
# print(tag.previous_element)
# print(tag.previous_elements)
# print(tag.previous_sibling)
# print(tag.previous_siblings)
# print(tag.parent)
# print(tag.parents)

# 17. Searching for tags related to a given tag
# These methods take the same arguments as find_all.
# tag = soup.find('a')
# print(tag.parent)
# print(tag.find_next())  # next tag; may be nested inside this one
# print(tag.find_all_next())
# print(tag.find_next_sibling())  # next sibling
# print(tag.find_next_siblings())  # all following siblings
# print(tag.find_previous())  # previous tag; for this sample that is the parent
# print(tag.find_all_previous())
# tag1 = soup.find_all('a')[1]
# print(tag1)
# print(tag1.find_previous_sibling())  # previous sibling
# print(tag1.find_previous_siblings())  # all preceding siblings
# print(tag.find_parent())  # tag.parent
# print(tag.find_parents())  # tag.parents

# 18. select, select_one: CSS selectors
# print(soup.select('title'))
# print(soup.select('p:nth-of-type(3)'))
# print(soup.select('body a'))  # compare with soup.find_all('a')
# soup.select("html head title")
# tag = soup.select("div,a")
# tag = soup.select("head > title")  # mind the spaces
# tag = soup.select("div > a")  # mind the spaces
# tag = soup.select("p > a:nth-of-type(2)")
# tag = soup.select("p > #link1")
# tag = soup.select("body > a")
# tag = soup.select("#link1 ~ .sister")  # all following siblings
# tag = soup.select("#link1 + .sister")  # the immediately following sibling
# tag = soup.select(".sister")  # by class
# tag = soup.select("[class~=sister]")  # by attribute
# tag = soup.select("#link1")  # by id
# tag = soup.select("a#link2")  # <a> tag whose id is link2
# tag = soup.select('a[href]')  # has the attribute
# tag = soup.select('a[href="http://example.com/lacie"]')  # exact match
# tag = soup.select('a[href^="http://example.com/"]')  # prefix match
# tag = soup.select('a[href$="tillie"]')  # suffix match
# tag = soup.select('a[href*=".com/"]')  # substring match
# print(tag)

# NOTE(review): ``_candidate_generator`` is a private select() parameter;
# confirm that the installed bs4 version still accepts it.
# def default_condition_generator(tag):
#     """Yield the descendant tags that have an href attribute."""
#     for child in tag.descendants:
#         if not isinstance(child, Tag):
#             continue
#         if not child.has_attr('href'):
#             continue
#         yield child
# tags = soup.find('body').select('a', _candidate_generator=default_condition_generator)
# tags = soup.find('body').select('a', _candidate_generator=default_condition_generator, limit=1)
# print(type(tags), tags)

# 19. Tag contents
# tag = soup.find('span')
# print(tag.string)  # get
# tag.string = 'hello world'  # set
# print(soup)
# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'
# print(soup)
# tag = soup.find('body')
# v = tag.stripped_strings  # recursively collects the text of all nested tags
# for i in v:
#     print(i)
# tag = soup.find('body')
# print(tag.text)

# 20. append: add a tag at the end of the current tag's contents
# tag = soup.find('body')
# tag.append(soup.find('a'))  # <a class="sister0" id="link1">Els<span>f</span>ie</a></body>
# print(soup)
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)

# 21. insert: insert a tag at a given position inside the current tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.insert(2, obj)  # insert at index 2
# print(soup)

# 22. insert_after, insert_before: insert after / before the current tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.insert_before(obj)
# # tag.insert_after(obj)
# print(soup)

# 23. replace_with: replace the current tag with the given tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)

# 24. setup: set the relationships between tags
# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)

# 25. wrap: wrap the current tag in the given tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一个新来的'
#
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)
# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)

# 26. unwrap: remove the current tag but keep whatever it wrapped
tag = soup.find('a')
v = tag.unwrap()  # v is the wrapper tag that was removed
print(v)
print(soup)
以上是关于 BeautifulSoup 模块的主要内容,如果未能解决你的问题,请参考以下文章: