python?????????HTMLParser??????
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python?????????HTMLParser??????相关的知识,希望对你有一定的参考价值。
?????????ini ?????? parser ?????? handle close rar ?????? img
html.parser??????????????????????????????????????????????????????HTMLParser??????
???????????????????????????feed??????????????????HTML????????????????????????????????????goahead???????????????????????????????????????????????????parse_xxxx????????????start_tag, tag, attrs data comment???end_tag????????????????????????????????????????????????????????????????????????????????????????????????????????????HTMLParser?????????????????????????????????
????????????????????????????????????handle_starttag??????????????????(handle_endtag?????????????????????handle_data?????????????????????HTMLParser????????????????????????pass???????????????????????????HTMLParser??????????????????????????????????????????????????????python?????????https://docs.python.org/3/library/html.parser.html?highlight=htmlparser
????????????????????????
l feed(data)????????????????????????html?????????str?????????????????????????????????????????????data?????????????????????instance??????????????????????????????close()???
l handle_starttag(tag, attrs): ??????????????????Parse_starttag?????????tag???attrs?????????????????????????????????????????????????????????????????????????????????
??????????????????start tag???<a>????????????????????????tag??????a????????????)???attrs???start tag <>?????????????????????????????????name, value????????????????????????????????????????????????
???????????????<A HREF="http://www.baidu.com">?????????????????????????????????handle_starttag('a', [('href', 'http://www.baidu.com')]).
l handle_endtag(tag)??????????????????????????????????????????????????????????????????</??????????????????
l handle_data(data)????????????????????????????????????????????????????????????????????????????????????????????????<script>...</script>??????????????????
l handle_comment(data) ??????????????????<!-- -->???????????????
l reset()?????????????????????????????????????????????????????????????????????
??????????????????
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Demo parser that prints one line for every event HTMLParser reports.

    Each handler below overrides the corresponding ``HTMLParser`` hook and
    only prints what it was given; no state is kept between events.
    """

    def handle_starttag(self, tag, attrs):
        """Report an opening tag, like ``<div>``.

        :param tag: name of the tag, as passed in by the parser.
        :param attrs: attributes of the tag, as passed in by the parser;
            not used here.
        :return: None
        """
        print("Encountered a start tag:", tag)

    def handle_endtag(self, tag):
        """Report a closing tag, like ``</div>``.

        :param tag: name of the tag, as passed in by the parser.
        :return: None
        """
        print("Encountered an end tag :", tag)

    def handle_data(self, data):
        """Report a run of text found between tags.

        :param data: the text, as passed in by the parser.
        :return: None
        """
        print("Encountered some data :", data)

    def handle_startendtag(self, tag, attrs):
        """Report a self-closing tag (one with no end tag), like ``<img />``.

        Because this hook is overridden, such a tag produces this single
        line instead of a start-tag line followed by an end-tag line.

        :param tag: name of the tag, as passed in by the parser.
        :param attrs: attributes of the tag, as passed in by the parser;
            not used here.
        :return: None
        """
        print("Encountered startendtag :", tag)

    def handle_comment(self, data):
        """Report an HTML comment, like ``<!-- comment -->``.

        :param data: the comment text, as passed in by the parser.
        :return: None
        """
        print("Encountered comment :", data)
# Run the demo parser over a small document. The three adjacent string
# literals are joined by Python into a single argument to feed().
parser = MyHTMLParser()
parser.feed('<html><head><title>Test</title></head>'
            '<body><h1>Parse me!</h1><img src = "" />'
            '<!-- comment --></body></html>')
???????????????python???????????????????????????????????????????????????html????????????????????????????????????????????????????????????????????????????????????????????????
Encountered a start tag: html
Encountered a start tag: head
Encountered a start tag: title
Encountered some data : Test
Encountered an end tag : title
Encountered an end tag : head
Encountered a start tag: body
Encountered a start tag: h1
Encountered some data : Parse me!
Encountered an end tag : h1
Encountered startendtag : img
Encountered comment : comment
Encountered an end tag : body
Encountered an end tag : html
??????????????????
????????????????????????????????????????????????????????????????????????????????????????????????
?????????html??????
<html> <head> <title>Test</title> </head> <body> <h1>Parse me!</h1> <img src = "" /> <p class="123">A paragraph.</p> <p class = "p_font">A paragraph with class.</p> <!-- comment --> <div> <p>A paragraph in div.</p> </div> </body> </html>
1.????????????p??????????????????????????????????????????handle_data
def handle_data(self, data):
    """Print text only when the most recently opened tag is ``p``.

    Meant to be used as a method of an ``HTMLParser`` subclass.

    :param data: the text, as passed in by the parser.
    :return: None
    """
    # self.lasttag is read from the parser instance; it is compared with
    # the tag name "p" to decide whether this text should be reported.
    if self.lasttag == "p":
        print("Encountered p data :", data)
2.??????css?????????class??????p_font???p?????????????????????????????????1???????????????????????????????????????????????????????????????
def __init__(self):
    """Initialise the base parser and the "inside a matching <p>" flag."""
    HTMLParser.__init__(self)
    self.flag = False


def handle_starttag(self, tag, attrs):
    """Raise the flag when a ``<p class="p_font">`` tag is opened.

    :param tag: name of the tag, as passed in by the parser.
    :param attrs: list of ``(name, value)`` pairs for the tag.
    :return: None
    """
    if tag != "p":
        return
    for name, value in attrs:
        # Compare the attribute name as well as its value, so that an
        # unrelated attribute that happens to hold "p_font" is ignored.
        if name == "class" and value == "p_font":
            self.flag = True
            break


def handle_data(self, data):
    """Print the first text chunk that follows a matching ``<p>`` tag.

    :param data: the text, as passed in by the parser.
    :return: None
    """
    if self.flag:
        print("Encountered p data :", data)
        # Clear the flag once the text has been reported; otherwise every
        # later piece of text in the document would be printed as well.
        self.flag = False
3.??????p?????????????????????
def handle_starttag(self, tag, attrs):
    """Print the attribute list of every ``<p>`` start tag.

    Meant to be used as a method of an ``HTMLParser`` subclass.

    :param tag: name of the tag, as passed in by the parser.
    :param attrs: attributes of the tag, printed as received.
    :return: None
    """
    if tag == "p":
        print("Encountered p attrs :", attrs)
4.??????p?????????class??????
def handle_starttag(self, tag, attrs):
    """Print the value of the ``class`` attribute of every ``<p>`` tag.

    Meant to be used as a method of an ``HTMLParser`` subclass.

    :param tag: name of the tag, as passed in by the parser.
    :param attrs: sequence of attribute entries; index 0 of an entry is
        read as the attribute name and index 1 as its value.
    :return: None
    """
    for attr in attrs:
        if tag == "p" and attr[0] == "class":
            print("Encountered p class :", attr[1])
5.??????div??????p???????????????
def __init__(self):
    """Initialise the base parser and the "a <div> has been opened" flag."""
    HTMLParser.__init__(self)
    self.in_div = False


def handle_starttag(self, tag, attrs):
    """Remember that a ``<div>`` start tag has been seen.

    :param tag: name of the tag, as passed in by the parser.
    :param attrs: attributes of the tag; not used here.
    :return: None
    """
    if tag == "div":
        self.in_div = True


def handle_data(self, data):
    """Print text whose most recently opened tag is ``p``, after a ``<div>``.

    :param data: the text, as passed in by the parser.
    :return: None
    """
    if self.in_div and self.lasttag == "p":
        print("Encountered p data :", data)
        # Reset only after a paragraph was reported, so text sitting
        # between <div> and <p> does not cancel the pending match.
        self.in_div = False
6.??????????????????????????????????????????????????????????????????????????????????????????
???????????????????????????????????????HTMLParser????????????????????????handle_comment???????????????????????????????????????????????????
def __init__(self):
    """Initialise the base parser; no extra state is needed."""
    HTMLParser.__init__(self)


def handle_comment(self, data):
    """Print the text of every HTML comment.

    Meant to be used as a method of an ``HTMLParser`` subclass.

    :param data: the comment text, as passed in by the parser.
    :return: None
    """
    print("Encountered comment:", data)
????????????
?????????????????????????????????HTML???????????????
# Sample markup used by the extraction example: two product entries, each
# with an <h3 class="tb-main-title"> heading and a <p class="tb-subtitle">.
html = '''<h3 class="tb-main-title" data-title="Xiaomi/??????"> ???????????????/??????/????????????Xiaomi/?????? ??????note????????????4G?????? </h3> <p class="tb-subtitle"> ????????????????????????+????????????+????????????+?????????+??????????????????,?????????????????????????????? </p> <div id="J_TEditItem" class="tb-editor-menu"></div> </div> <h3 class="tb-main-title" data-title="MIUI/??????"> ???????????????/?????????MIUI/?????? ????????????2??????2??????????????????4G?????? </h3> <p class="tb-subtitle"> [????????????2?????????????????????,???????????????????????????????????????---????????????] <div id="J_TEditItem" class="tb-editor-menu"></div> </div>'''
???????????????????????????????????????????????????????????????????????????????????????????????????
???????????????feed??????html???HTMLParser??????????????????????????????????????????????????????????????????????????????????????????????????????????????????handle_starttag??????????????????????????????????????????handle_data???print?????????????????????????????????????????????????????????flag??????????????????????????????
# MyParser is an HTMLParser subclass that collects the text of every
# <p class="tb-subtitle"> element.
class MyParser(HTMLParser):
    """Collect the text that follows each ``<p class="tb-subtitle">`` tag.

    Results accumulate in ``self.re``; ``self.flag`` is 1 while the next
    text chunk should be captured and 0 otherwise.
    """

    def __init__(self):
        """Initialise the base parser and per-instance result state."""
        super().__init__()
        # Created per instance: a list defined on the class would be shared
        # by every MyParser object and keep growing across parsers.
        self.re = []
        self.flag = 0

    def handle_starttag(self, tag, attrs):
        """Set the flag when a ``<p class="tb-subtitle">`` tag is opened.

        :param tag: name of the tag, as passed in by the parser.
        :param attrs: sequence of attribute entries; index 0 of an entry
            is read as the attribute name and index 1 as its value.
        :return: None
        """
        if tag == 'p':
            for attr in attrs:
                if attr[0] == 'class' and attr[1] == 'tb-subtitle':
                    self.flag = 1
                    break

    def handle_data(self, data):
        """Store the stripped text if the flag is set, then clear the flag.

        :param data: the text, as passed in by the parser.
        :return: None
        """
        if self.flag == 1:
            self.re.append(data.strip())
            # Capture only the first text chunk after the matching tag.
            self.flag = 0


my = MyParser()
my.feed(html)
# Bare expression: evaluates to the list of collected texts (an interactive
# session displays it; a script would need print() to show it).
my.re
????????????????????????????????????????????????
[???????????????????????????+????????????+????????????+?????????+??????????????????,?????????????????????????????????,
???[????????????2?????????????????????,???????????????????????????????????????---????????????]???]
以上是关于python?????????HTMLParser??????的主要内容,如果未能解决你的问题,请参考以下文章
使用 Python 模块—— HTMLParser 解析 HTML 文档元素