python?????????HTMLParser??????

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python?????????HTMLParser??????相关的知识,希望对你有一定的参考价值。

?????????ini   ??????   parser   ??????   handle   close   rar   ??????   img   

html.parser??????????????????????????????????????????????????????HTMLParser??????

???????????????????????????feed??????????????????HTML????????????????????????????????????goahead???????????????????????????????????????????????????parse_xxxx????????????start_tag, tag, attrs data comment???end_tag????????????????????????????????????????????????????????????????????????????????????????????????????????????HTMLParser?????????????????????????????????

????????????

????????????????????????????????????handle_starttag??????????????????(handle_endtag?????????????????????handle_data?????????????????????HTMLParser????????????????????????pass???????????????????????????HTMLParser??????????????????????????????????????????????????????python?????????https://docs.python.org/3/library/html.parser.html?highlight=htmlparser

????????????????????????

l feed(data)????????????????????????html?????????str?????????????????????????????????????????????data?????????????????????instance??????????????????????????????close()???

l handle_starttag(tag, attrs): ??????????????????Parse_starttag?????????tag???attrs?????????????????????????????????????????????????????????????????????????????????

??????????????????start tag???<a>????????????????????????tag??????a????????????)???attrs???start tag <>?????????????????????????????????name, value????????????????????????????????????????????????

???????????????<A HREF="http://www.baidu.com">?????????????????????????????????handle_starttag('a', [('href', 'http://www.baidu.com')]).

l handle_endtag(tag)??????????????????????????????????????????????????????????????????</??????????????????

l handle_data(data)????????????????????????????????????????????????????????????????????????????????????????????????<script>...</script>??????????????????

l handle_comment(data) ??????????????????<!-- -->???????????????

l reset()?????????????????????????????????????????????????????????????????????

??????????????????

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that prints one line for every parse event it handles."""

    def handle_comment(self, data):
        """
        Report an HTML comment, like <!-- note -->.

        :param data: text of the comment
        :return: None
        """
        label = "Encountered comment :"
        print(label, data)

    def handle_startendtag(self, tag, attrs):
        """
        Report a self-closing tag that has no separate end tag, like <img />.

        :param tag: name of the tag
        :param attrs: attributes of the tag (not used here)
        :return: None
        """
        label = "Encountered startendtag :"
        print(label, tag)

    def handle_data(self, data):
        """
        Report text content found in the document.

        :param data: the text
        :return: None
        """
        label = "Encountered some data  :"
        print(label, data)

    def handle_endtag(self, tag):
        """
        Report a closing tag, like </div>.

        :param tag: name of the tag
        :return: None
        """
        label = "Encountered an end tag :"
        print(label, tag)

    def handle_starttag(self, tag, attrs):
        """
        Report an opening tag, like <div>.

        :param tag: name of the tag
        :param attrs: attributes of the tag (not used here)
        :return: None
        """
        label = "Encountered a start tag:"
        print(label, tag)

# Feed a small document through the parser; each handler prints as its event is parsed.
# Single-quoted literals are used because the markup itself contains double quotes.
parser = MyHTMLParser()
parser.feed('<html><head><title>Test</title></head>'
            '<body><h1>Parse me!</h1><img src = "" />'
            '<!-- comment --></body></html>')

???????????????python???????????????????????????????????????????????????html????????????????????????????????????????????????????????????????????????????????????????????????

Encountered a start tag: html
Encountered a start tag: head
Encountered a start tag: title
Encountered some data  : Test
Encountered an end tag : title
Encountered an end tag : head
Encountered a start tag: body
Encountered a start tag: h1
Encountered some data  : Parse me!
Encountered an end tag : h1
Encountered startendtag : img
Encountered comment :  comment
Encountered an end tag : body
Encountered an end tag : html

??????????????????

????????????????????????????????????????????????????????????????????????????????????????????????

?????????html??????

<html>
    <head>
        <title>Test</title>
    </head>
    <body>
        <h1>Parse me!</h1>
        <img src = "" />
        <p class="123">A paragraph.</p>
        <p class = "p_font">A paragraph with class.</p>
        <!-- comment -->
        <div>
            <p>A paragraph in div.</p>
        </div>
    </body>
</html>

1.????????????p??????????????????????????????????????????handle_data

def handle_data(self, data):
    """
    Print text content when the parser's ``lasttag`` is ``p``.

    :param data: text found in the document
    :return: None
    """
    # ``lasttag`` is maintained by HTMLParser, not by this method.
    # NOTE(review): it does not appear to be cleared when the tag closes, so
    # text that follows </p> may be printed as well -- confirm before relying on it.
    if self.lasttag == 'p':
        print("Encountered p data  :", data)

2.??????css?????????class??????p_font???p?????????????????????????????????1???????????????????????????????????????????????????????????????

def __init__(self):
    """
    Initialise the base parser, then start with the match flag lowered.

    :return: None
    """
    HTMLParser.__init__(self)
    # Nothing has matched yet, so the flag starts out False.
    self.flag = False

def handle_starttag(self, tag, attrs):
    """
    Raise ``self.flag`` when a ``<p>`` start tag carries ``class="p_font"``.

    :param tag: name of the tag
    :param attrs: list of ``(name, value)`` pairs for the tag's attributes
    :return: None
    """
    # Match on the attribute name as well as the value, so that an unrelated
    # attribute such as id="p_font" does not trigger the flag.
    if tag == 'p' and ('class', "p_font") in attrs:
        self.flag = True

def handle_data(self, data):
    """
    Print the text that follows a matching start tag, then lower the flag.

    :param data: text found in the document
    :return: None
    """
    if self.flag:
        print("Encountered p data  :", data)
        # Lower the flag once the text has been consumed; otherwise every later
        # text node in the document would be printed too.
        self.flag = False

3.??????p?????????????????????

def handle_starttag(self, tag, attrs):
    """
    Print the attribute list of every ``<p>`` start tag.

    :param tag: name of the tag
    :param attrs: list of ``(name, value)`` pairs for the tag's attributes
    :return: None
    """
    if tag == 'p':
        print("Encountered p attrs  :", attrs)

4.??????p?????????class??????

def handle_starttag(self, tag, attrs):
    """
    Print the value of the ``class`` attribute of every ``<p>`` start tag.

    :param tag: name of the tag
    :param attrs: list of ``(name, value)`` pairs for the tag's attributes
    :return: None
    """
    # The tag check does not depend on the attribute, so do it once up front.
    if tag != 'p':
        return
    for name, value in attrs:
        if name == 'class':
            print("Encountered p class  :", value)

5.??????div??????p???????????????

def __init__(self):
    """
    Initialise the base parser, then record that no ``<div>`` has been seen.

    :return: None
    """
    HTMLParser.__init__(self)
    # Starts False; handle_starttag in this example raises it on a <div>.
    self.in_div = False

def handle_starttag(self, tag, attrs):
    """
    Raise ``self.in_div`` when a ``<div>`` start tag is seen.

    :param tag: name of the tag
    :param attrs: attributes of the tag (not used here)
    :return: None
    """
    if tag == 'div':
        self.in_div = True

def handle_data(self, data):
    """
    Print the text of a ``<p>`` that follows a ``<div>``, then lower the flag.

    :param data: text found in the document
    :return: None
    """
    if self.in_div and self.lasttag == 'p':
        print("Encountered p data  :", data)
        # Reset only after a paragraph was printed. Resetting on every call
        # would clear the flag on the whitespace between <div> and <p>,
        # before the paragraph text arrives.
        self.in_div = False

6.??????????????????????????????????????????????????????????????????????????????????????????

???????????????????????????????????????HTMLParser????????????????????????handle_comment???????????????????????????????????????????????????

def __init__(self):
    """
    Set up the parser by delegating to ``HTMLParser.__init__``.

    :return: None
    """
    # This example keeps no extra state; only the base initialiser is called.
    HTMLParser.__init__(self)

def handle_comment(self, data):
    """
    Print the text of an HTML comment, like <!-- note -->.

    :param data: text of the comment
    :return: None
    """
    print("Encountered  comment:", data)

????????????

?????????????????????????????????HTML???????????????

# Sample markup used by the example below. Triple single quotes are used because
# the markup contains double quotes. The runs of "?" are non-ASCII text that was
# lost before this file was written; they are kept exactly as found.
html = '''<h3 class="tb-main-title" data-title="Xiaomi/??????">
     ???????????????/??????/????????????Xiaomi/?????? ??????note????????????4G??????
   </h3>
   <p class="tb-subtitle">
 ????????????????????????+????????????+????????????+?????????+??????????????????,??????????????????????????????
 </p>
   <div id="J_TEditItem" class="tb-editor-menu"></div>
 </div>
<h3 class="tb-main-title" data-title="MIUI/??????">
     ???????????????/?????????MIUI/?????? ????????????2??????2??????????????????4G??????
   </h3>
   <p class="tb-subtitle">
 [????????????2?????????????????????,???????????????????????????????????????---????????????]
   <div id="J_TEditItem" class="tb-editor-menu"></div>
 </div>'''

???????????????????????????????????????????????????????????????????????????????????????????????????

???????????????feed??????html???HTMLParser??????????????????????????????????????????????????????????????????????????????????????????????????????????????????handle_starttag??????????????????????????????????????????handle_data???print?????????????????????????????????????????????????????????flag??????????????????????????????

# MyParser, a subclass of HTMLParser.
class MyParser(HTMLParser):
    """
    Collect the text that follows every ``<p class="tb-subtitle">`` start tag.

    After ``feed()``, the collected strings are available in ``self.re``.
    """

    def __init__(self, **kwargs):
        """
        Initialise the base parser and this instance's own result state.

        :param kwargs: keyword arguments passed through to ``HTMLParser.__init__``
        :return: None
        """
        super().__init__(**kwargs)
        # Kept per instance: a list defined on the class would be shared, so
        # results from one parser would leak into every other parser.
        self.re = []   # collected results
        self.flag = 0  # 1 while the next text chunk is one we want, else 0

    def handle_starttag(self, tag, attrs):
        """
        Raise ``self.flag`` when the tag is ``<p class="tb-subtitle">``.

        :param tag: name of the tag
        :param attrs: list of ``(name, value)`` pairs for the tag's attributes
        :return: None
        """
        if tag == 'p' and ('class', 'tb-subtitle') in attrs:
            self.flag = 1

    def handle_data(self, data):
        """
        Store the stripped text if the flag is raised, then lower the flag.

        :param data: text found in the document
        :return: None
        """
        if self.flag == 1:
            self.re.append(data.strip())
            # Lower the flag so that only this one text chunk is captured.
            self.flag = 0


# Run the parser over the sample markup held in `html`.
my=MyParser()
my.feed(html)
# Bare expression: its value is only echoed when run in an interactive session.
my.re

????????????????????????????????????????????????

[???????????????????????????+????????????+????????????+?????????+??????????????????,?????????????????????????????????,

???[????????????2?????????????????????,???????????????????????????????????????---????????????]???]

以上是关于python?????????HTMLParser??????的主要内容,如果未能解决你的问题,请参考以下文章

使用 Python 模块—— HTMLParser 解析 HTML 文档元素

python?????????????????????HTML??????HtmlParser

在 Python 3.2 中使用 HTMLParser

python网络爬虫之LXML与HTMLParser

python中HTMLParser简单理解

Pythons HTMLParser 可以编辑/更改 HTML 元素 innerText 还是只读取它