Python下利用BeautifulSoup解析HTML
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python下利用BeautifulSoup解析HTML相关的知识,希望对你有一定的参考价值。
标签:exclude limit open shel val Once wrap for
摘要:Beautiful Soup 是一个可以从 HTML 或 XML 格式文件中提取数据的 Python 库,它可以将 HTML 或 XML 数据解析为 Python 对象,以方便通过 Python 代码进行处理。
文档环境
- 本文档中代码的测试环境
Beautiful Soup 使用说明
Beautiful Soup 的基本功能就是对HTML的标签进行查找及编辑。
基本概念-对象类型
Beautiful Soup 将复杂 HTML 文档转换成一个复杂的树形结构,每个节点都被转换成一个Python 对象,Beautiful Soup将这些对象定义了4 种类型: Tag、NavigableString、BeautifulSoup、Comment 。
对象类型 | 描述 |
---|---|
BeautifulSoup | 文档的全部内容 |
Tag | HTML的标签 |
NavigableString | 标签包含的文字 |
Comment | 是一种特殊的NavigableString类型,当标签中的NavigableString 被注释时,则定义为该类型 |
安装及引用
# Beautiful Soup
pip install bs4
# 解析器
pip install lxml
pip install html5lib
# 初始化
from bs4 import BeautifulSoup
# 方法一,直接打开文件
soup = BeautifulSoup(open("index.html"))
# 方法二,指定数据
resp = "<html>data</html>"
soup = BeautifulSoup(resp, 'lxml')
# soup 为 BeautifulSoup 类型对象
print(type(soup))
标签搜索及过滤
基本方法
# Beautiful Soup
pip install bs4
# 解析器
pip install lxml
pip install html5lib
# 初始化
from bs4 import BeautifulSoup
# 方法一,直接打开文件
soup = BeautifulSoup(open("index.html"))
# 方法二,指定数据
resp = "<html>data</html>"
soup = BeautifulSoup(resp, 'lxml')
# soup 为 BeautifulSoup 类型对象
print(type(soup))
基本方法
标签搜索有find_all() 和find() 两个基本的搜索方法,find_all() 方法会返回所有匹配关键字的标签列表,find()方法则只返回一个匹配结果。
soup = BeautifulSoup(resp, 'lxml')
# 返回一个标签名为"a"的Tag
soup.find("a")
# 返回所有tag 列表
soup.find_all("a")
## find_all方法可被简写
soup("a")
#找出所有以b开头的标签
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
#找出列表中的所有标签
soup.find_all(["a", "p"])
# 查找标签名为p,class属性为"title"
soup.find_all("p", "title")
# 查找属性id为"link2"
soup.find_all(id="link2")
# 查找存在属性id的
soup.find_all(id=True)
#
soup.find_all(href=re.compile("elsie"), id='link1')
#
soup.find_all(attrs={"data-foo": "value"})
#查找标签文字包含"sisters"
soup.find(string=re.compile("sisters"))
# 获取指定数量的结果
soup.find_all("a", limit=2)
# 自定义匹配方法
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')
soup.find_all(has_class_but_no_id)
# 仅对属性使用自定义匹配方法
def not_lacie(href):
    return href and not re.compile("lacie").search(href)
soup.find_all(href=not_lacie)
# 调用tag的 find_all() 方法时,Beautiful Soup会检索当前tag的所有子孙节点,如果只想搜索tag的直接子节点,可以使用参数 recursive=False
soup.find_all("title", recursive=False)
扩展方法
find_parents() | 所有父辈节点 |
find_parent() | 第一个父辈节点 |
find_next_siblings() | 之后的所有兄弟节点 |
find_next_sibling() | 之后的第一个兄弟节点 |
find_previous_siblings() | 之前的所有兄弟节点 |
find_previous_sibling() | 之前的第一个兄弟节点 |
find_all_next() | 之后的所有元素 |
find_next() | 之后的第一个元素 |
find_all_previous() | 之前的所有元素 |
find_previous() | 之前的第一个元素 |
CSS选择器
Beautiful Soup支持大部分的CSS选择器 http://www.w3.org/TR/CSS2/selector.html, 在 Tag 或 BeautifulSoup 对象的 .select() 方法中传入字符串参数, 即可使用CSS选择器的语法找到tag。
html_doc = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc)
# 所有 a 标签
soup.select("a")
# 逐层查找
soup.select("body a")
soup.select("html head title")
# tag标签下的直接子标签
soup.select("head > title")
soup.select("p > #link1")
# 所有匹配标签之后的兄弟标签
soup.select("#link1 ~ .sister")
# 匹配标签之后的第一个兄弟标签
soup.select("#link1 + .sister")
# 根据class类名
soup.select(".sister")
soup.select("[class~=sister]")
# 根据ID查找
soup.select("#link1")
soup.select("a#link1")
# 根据多个ID查找
soup.select("#link1,#link2")
# 根据属性查找
soup.select('a[href]')
# 根据属性值查找
soup.select('a[href^="http://example.com/"]')
soup.select('a[href$="tillie"]')
soup.select('a[href*=".com/el"]')
# 只获取一个匹配结果
soup.select(".sister", limit=1)
# 只获取一个匹配结果
soup.select_one(".sister")
标签对象方法
标签属性
soup = BeautifulSoup('<p class="body strikeout" id="1">Extremely bold</p><p class="body strikeout" id="2">Extremely bold2</p>')
# 获取所有的 p标签对象
tags = soup.find_all("p")
# 获取第一个p标签对象
tag = soup.p
# 输出标签类型
type(tag)
# 标签名
tag.name
# 标签属性
tag.attrs
# 标签属性class 的值
tag['class']
# 标签包含的文字内容,对象NavigableString 的内容
tag.string
# 返回标签内所有的文字内容
for string in tag.strings:
    print(repr(string))
# 返回标签内所有的文字内容, 并去掉空行
for string in tag.stripped_strings:
    print(repr(string))
# 获取到tag中包含的所有及包括子孙tag中的NavigableString内容,并以Unicode字符串格式输出
tag.get_text()
## 以"|"分隔
tag.get_text("|")
## 以"|"分隔,不输出空字符
tag.get_text("|", strip=True)
获取子节点
tag.contents  # 返回第一层子节点的列表
tag.children  # 返回第一层子节点的listiterator 对象
for child in tag.children:
    print(child)
tag.descendants  # 递归返回所有子节点
for child in tag.descendants:
    print(child)
获取父节点
tag.parent  # 返回第一层父节点标签
tag.parents  # 递归得到元素的所有父辈节点
for parent in tag.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)
获取兄弟节点
# 下一个兄弟元素
tag.next_sibling
# 当前标签之后的所有兄弟元素
tag.next_siblings
for sibling in tag.next_siblings:
    print(repr(sibling))
# 上一个兄弟元素
tag.previous_sibling
# 当前标签之前的所有兄弟元素
tag.previous_siblings
for sibling in tag.previous_siblings:
    print(repr(sibling))
元素的遍历
soup = BeautifulSoup('<p class="body strikeout" id="1">Extremely bold</p><p class="body strikeout" id="2">Extremely bold2</p>')
# 获取所有的 p标签对象
tags = soup.find_all("p")
# 获取第一个p标签对象
tag = soup.p
# 输出标签类型
type(tag)
# 标签名
tag.name
# 标签属性
tag.attrs
# 标签属性class 的值
tag['class']
# 标签包含的文字内容,对象NavigableString 的内容
tag.string
# 返回标签内所有的文字内容
for string in tag.strings:
    print(repr(string))
# 返回标签内所有的文字内容, 并去掉空行
for string in tag.stripped_strings:
    print(repr(string))
# 获取到tag中包含的所有及包括子孙tag中的NavigableString内容,并以Unicode字符串格式输出
tag.get_text()
## 以"|"分隔
tag.get_text("|")
## 以"|"分隔,不输出空字符
tag.get_text("|", strip=True)
tag.contents  # 返回第一层子节点的列表
tag.children  # 返回第一层子节点的listiterator 对象
for child in tag.children:
    print(child)
tag.descendants  # 递归返回所有子节点
for child in tag.descendants:
    print(child)
获取父节点
tag.parent  # 返回第一层父节点标签
tag.parents  # 递归得到元素的所有父辈节点
for parent in tag.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)
获取兄弟节点
# 下一个兄弟元素
tag.next_sibling
# 当前标签之后的所有兄弟元素
tag.next_siblings
for sibling in tag.next_siblings:
    print(repr(sibling))
# 上一个兄弟元素
tag.previous_sibling
# 当前标签之前的所有兄弟元素
tag.previous_siblings
for sibling in tag.previous_siblings:
    print(repr(sibling))
元素的遍历
tag.parent  # 返回第一层父节点标签
tag.parents  # 递归得到元素的所有父辈节点
for parent in tag.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)
# 下一个兄弟元素
tag.next_sibling
# 当前标签之后的所有兄弟元素
tag.next_siblings
for sibling in tag.next_siblings:
    print(repr(sibling))
# 上一个兄弟元素
tag.previous_sibling
# 当前标签之前的所有兄弟元素
tag.previous_siblings
for sibling in tag.previous_siblings:
    print(repr(sibling))
Beautiful Soup中把每个tag定义为一个“element”,每个“element”被自上而下地在HTML中排列,可以通过遍历命令逐个显示标签。
# 当前标签的下一个元素
tag.next_element
# 当前标签之后的所有元素
for element in tag.next_elements:
    print(repr(element))
# 当前标签的前一个元素
tag.previous_element
# 当前标签之前的所有元素
for element in tag.previous_elements:
    print(repr(element))
修改标签属性
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
tag = soup.b
tag.name = "blockquote"
tag['class'] = 'verybold'
tag['id'] = 1
tag.string = "New link text."
print(tag)
修改标签内容(NavigableString)
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
tag = soup.b
tag.string = "New link text."
添加标签内容(NavigableString)
soup = BeautifulSoup("<a>Foo</a>")
tag = soup.a
tag.append("Bar")
tag.contents
# 或者
new_string = NavigableString("Bar")
tag.append(new_string)
print(tag)
添加注释(Comment)
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
tag = soup.b
tag.name = "blockquote"
tag['class'] = 'verybold'
tag['id'] = 1
tag.string = "New link text."
print(tag)
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
tag = soup.b
tag.string = "New link text."
soup = BeautifulSoup("<a>Foo</a>")
tag = soup.a
tag.append("Bar")
tag.contents
# 或者
new_string = NavigableString("Bar")
tag.append(new_string)
print(tag)
注释是一个特殊的NavigableString 对象,所以同样可以通过append() 方法进行添加。
from bs4 import Comment
soup = BeautifulSoup("<a>Foo</a>")
new_comment = soup.new_string("Nice to see you.", Comment)
tag.append(new_comment)
print(tag)
添加标签(Tag)
添加标签方法有两种,一种是在指定标签的内部添加(append方法),另一种是在指定位置添加(insert、insert_before、insert_after方法)
- append方法
```python
soup = BeautifulSoup("<b></b>")
tag = soup.b
new_tag = soup.new_tag("a", href="http://www.example.com")
new_tag.string = "Link text."
tag.append(new_tag)
print(tag)
```
* insert方法,是指在当前标签子节点列表的指定位置插入对象(Tag或NavigableString)
```python
html = '<b><a href="http://example.com/">I linked to <i>example.com</i></a></b>'
soup = BeautifulSoup(html)
tag = soup.a
tag.contents
tag.insert(1, "but did not endorse ")
tag.contents
```
- insert_before() 和 insert_after() 方法则在当前标签之前或之后的兄弟节点添加元素
```python
html = '<b><a href="http://example.com/">I linked to <i>example.com</i></a></b>'
soup = BeautifulSoup(html)
tag = soup.new_tag("i")
tag.string = "Don't"
soup.b.insert_before(tag)
soup.b
```
* wrap() 和 unwrap()可以对指定的tag元素进行包装或解包,并返回包装后的结果。
```python
# 添加包装
soup = BeautifulSoup("<p>I wish I was bold.</p>")
soup.p.string.wrap(soup.new_tag("b"))
#输出 <b>I wish I was bold.</b>
soup.p.wrap(soup.new_tag("div"))
#输出 <div><p><b>I wish I was bold.</b></p></div>
# 拆解包装
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup)
a_tag = soup.a
a_tag.i.unwrap()
a_tag
#输出 <a href="http://example.com/">I linked to example.com</a>
```
删除标签
html = '<b><a href="http://example.com/">I linked to <i>example.com</i></a></b>'
soup = BeautifulSoup(html)
# 清除当前标签的所有子节点
soup.b.clear()
# 将当前标签及所有子节点从soup 中移除,返回当前标签。
b_tag=soup.b.extract()
b_tag
soup
# 将当前标签及所有子节点从soup 中移除,无返回。
soup.b.decompose()
# 将当前标签替换为指定的元素
tag=soup.i
new_tag = soup.new_tag("p")
new_tag.string = "Don't"
tag.replace_with(new_tag)
其他方法
输出
# 格式化输出
tag.prettify()
tag.prettify("latin-1")
- 使用Beautiful Soup解析后,文档都被转换成了Unicode,特殊字符也被转换为Unicode,如果将文档转换成字符串,Unicode编码会被编码成UTF-8,这样就无法正确显示HTML特殊字符了
- 使用Unicode时,Beautiful Soup还会智能地把“引号”转换成HTML或XML中的特殊字符
文档编码
使用Beautiful Soup解析后,文档都被转换成了Unicode,其使用了“编码自动检测”子库来识别当前文档编码并转换成Unicode编码。
soup = BeautifulSoup(html)
soup.original_encoding
# 也可以手动指定文档的编码
soup = BeautifulSoup(html, from_encoding="iso-8859-8")
soup.original_encoding
# 为提高“编码自动检测”的检测效率,也可以预先排除一些编码
soup = BeautifulSoup(markup, exclude_encodings=["ISO-8859-7"])
- 通过Beautiful Soup输出文档时,不管输入文档是什么编码方式,默认输出编码均为UTF-8编码
文档解析器
Beautiful Soup目前支持 “lxml”, “html5lib”, 和 “html.parser”
soup=BeautifulSoup("<a><b /></a>")
soup
#输出: <html><body><a><b></b></a></body></html>
soup=BeautifulSoup("<a></p>", "lxml")
soup
#输出: <html><body><a></a></body></html>
soup=BeautifulSoup("<a></p>", "html5lib")
soup
#输出: <html><head></head><body><a><p></p></a></body></html>
soup=BeautifulSoup("<a></p>", "html.parser")
soup
#输出: <a></a>
参考文档
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh
以上是关于Python下利用BeautifulSoup解析HTML的主要内容,如果未能解决你的问题,请参考以下文章
手撸Spring框架,设计与实现资源加载器,从Spring.xml解析和注册Bean对象