pyquery解析库

Posted wt7018

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了pyquery解析库相关的知识,希望对你有一定的参考价值。

语法和jQuery几乎一致

安装

conda install pyquery

一、初始化

标准用法

from pyquery import PyQuery as pq
import requests

# Standard usage: download the page with requests, then hand the HTML text
# to PyQuery for parsing.
r = requests.get(url='http://www.baidu.com')

html_doc = pq(r.text)
print(html_doc)
# Calling the parsed document with a CSS selector returns the matching nodes.
print(html_doc('#u1 a'))

1、字符串初始化(最常用)

from pyquery import PyQuery as pq

# Sample HTML fragment used as the parsing input.
html_doc = '''<div>
    <ul id = 'haha'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

# Initialise PyQuery directly from an HTML string.
doc = pq(html_doc)
print(doc)
print(type(doc))

2、url初始化

from pyquery import PyQuery as pq

# Initialise PyQuery from a URL: the page is fetched and parsed in one call.
html_doc = pq(url='http://www.baidu.com')
print(html_doc)
# Select the matching nodes with a CSS selector.
print(html_doc('#u1 a'))

注意:一般通过requests模块或urllib获取网页的html->解析模块去解析

3、文件初始化

from pyquery import PyQuery as pq

# Initialise PyQuery from a local HTML file; the path is passed as a string.
doc = pq(filename='test.html')
print(doc)

二、基本CSS选择器

from pyquery import PyQuery as pq

# Sample HTML fragment used as the parsing input.
html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)
# Descendant selector: every <li> inside the element with id "con" inside a <div>.
li_list = doc('div #con li')
print(li_list)

# Selector cheat sheet:
#   id     -> #name
#   class  -> .name
#   tag    -> tagname

三、查找节点

1、子节点

find() 最常用的方法

from pyquery import PyQuery as pq

# Sample HTML fragment used as the parsing input.
html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)
div = doc('div')
# find() searches all descendants of the selection for the given CSS selector.
li_list = div.find('li.active')
print(li_list)

children() 查找所有子节点,children('selector') 查找指定的子节点

from pyquery import PyQuery as pq

# Sample HTML fragment used as the parsing input.
html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)
div = doc('div')
# Without an argument, children() returns all child nodes.
selector = div.children()
print(selector)
# With a CSS selector, children() narrows the result; the intent here is to
# get the nodes carrying the class "item-0" under #con.
li_item_0 = div.children('#con .item-0')
print(li_item_0)

2、父节点

parent() 父节点,parents() 祖先节点,parents('selector') 匹配指定选择器的祖先节点

from pyquery import PyQuery as pq

# Sample HTML fragment used as the parsing input.
html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)
# All <li> nodes under #con.
li_list = doc('#con li')
# Direct parent of the <li> nodes.
ul = li_list.parent()
# print(ul)
# All ancestors (the direct parent included).
divs = li_list.parents()
# print(divs)
# Only the ancestors matching the selector, here the one with id="con".
div = li_list.parents('#con')
print(div)

3、兄弟节点

siblings() 所有兄弟姊妹节点,siblings('selector') 含有指定css选择器的兄弟节点

from pyquery import PyQuery as pq

# Sample HTML fragment used as the parsing input.
html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)
# The node with class="item-0 active".
li = doc('#con li.item-0.active')
# All sibling nodes (the node itself excluded).
# print(li.siblings())
# Only the siblings that match the given CSS selector.
print(li.siblings('.item-1.active'))

四、遍历

1、单个节点

from pyquery import PyQuery as pq

# Sample HTML fragment used as the parsing input.
html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)
# The selector matches a single node, which can be printed directly.
li = doc('#con li.item-0.active')
print(li)

2、多个节点

from pyquery import PyQuery as pq

# Sample HTML fragment used as the parsing input.
html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)
# Several nodes match: items() returns a generator that yields each node
# as its own PyQuery object.
li_lst = doc('#con li')
for li in li_lst.items():
    print(li, end='')

五、获取信息

1、属性

获取 设置

from pyquery import PyQuery as pq

# Sample HTML fragment used as the parsing input.
html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)
# Read the href attribute of the <a> tag: attr(name).
a = doc('li.item-0.active a')
print(a.attr('href'))
# Set the attribute: attr(name, value).
a.attr('href', 'oj8k')
print(a.attr('href'))

2、文本

text() html()

获取 设置

from pyquery import PyQuery as pq

# Sample HTML fragment used as the parsing input.
html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)
# Read the text content with text().
li = doc('li.item-0.active')
print(li.text())
# Read the inner HTML with html().
print(li.html())

# Replace the text content: text(value).
li.text('Hello World')
print(li.text())
# Replace the inner HTML: html(value).
li.html('<a>打我</a>')
print(li.html())

注意:与 jQuery 的区别:pyquery 的 html() 获取的是节点内部的 html,不包含节点本身

六、操作DOM节点

1、add_class()和remove_class()

from pyquery import PyQuery as pq

# Sample HTML fragment used as the parsing input.
html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)
# Select the node whose class attribute is "item-0 active".
li = doc('li.item-0.active')
print(li)

# remove_class() removes the given class from the selected node.
li.remove_class('active')
print(li)
# add_class() adds the given class to the selected node.
li.add_class('active')
print(li)

2、remove()

作用:删除节点

from pyquery import PyQuery as pq

# Sample HTML fragment used as the parsing input.
html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)
# Select the <li> node.
li = doc('li.item-0.active')
print(li)
# Find the <a> node inside it and remove that node from the tree.
a = li('a')
a.remove()
print(li)

七、伪类选择器

from pyquery import PyQuery as pq

# Sample HTML fragment used as the parsing input.
html_doc = '''<div>
    <ul id = 'con'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
     </div>'''

doc = pq(html_doc)
# :first-child / :last-child select the first / last <li> of the parent.
print(doc('li:first-child'))
print(doc('li:last-child'))
# :nth-child(2) selects the second <li>; :nth-child(2n) selects every even one.
print(doc('li:nth-child(2)'))
print(doc('li:nth-child(2n)'))
# :gt(2) selects the <li> nodes whose zero-based index is greater than 2.
print(doc('li:gt(2)'))
# :contains(second) selects the <li> nodes whose text contains "second".
print(doc('li:contains(second)'))

 

 

以上是关于pyquery解析库的主要内容,如果未能解决你的问题,请参考以下文章

Pyquery解析库的安装和使用

Python3 BeautifulSoup和Pyquery解析库随笔

解析库pyquery使用

爬虫之解析库pyquery

pyquery解析库

解析库之beautifulsoup,pyquery