使用pyquery
Posted liyihua
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了使用pyquery相关的知识,希望对你有一定的参考价值。
-
简单举例
from pyquery import PyQuery as pq

html = '''
<div>
    <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a>
    </ul>
</div>
'''

doc = pq(html)
print(doc)


# 输出:
<div>
    <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a>
    </li></ul>
</div>
from pyquery import PyQuery as pq
import requests

# doc1 与 doc2 功能相同
doc1 = pq(url='https://www.cnblogs.com/liyihua/')
print(doc1('title'))

doc2 = pq(requests.get('https://www.cnblogs.com/liyihua/').text)
print(doc2('title'))


# 输出:
<title>李亦华 - 博客园</title>

<title>李亦华 - 博客园</title>
from pyquery import PyQuery as pq

doc = pq(filename='test.html')
print(doc('li'))


# 输出:
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</li>


# 文件内容:
<div>
    <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a>
    </ul>
</div>
-
基本CSS选择器
1 from pyquery import PyQuery as pq 2 3 html = ‘‘‘ 4 <div id="container"> 5 <ul class="list"> 6 <li class="item-0">first item</li> 7 <li class="item-1"><a href="link2.html">second item</a></li> 8 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 9 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 10 <li class="item-0"><a href="link5.html">fifth item</a></li> 11 </ul> 12 </div> 13 ‘‘‘ 14 15 doc = pq(html) 16 print(doc(‘#container .list li‘)) 17 18 print( 19 type( 20 doc(‘#container .list li‘) 21 ) 22 ) 23 24 25 # 输出: 26 <li class="item-0">first item</li> 27 <li class="item-1"><a href="link2.html">second item</a></li> 28 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 29 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 30 <li class="item-0"><a href="link5.html">fifth item</a></li> 31 32 <class ‘pyquery.pyquery.PyQuery‘>
-
查找节点
-
1 from pyquery import PyQuery 2 3 html = ‘‘‘ 4 <div id="container"> 5 <ul class="list"> 6 <li class="item-0">first item</li> 7 <li class="item-1"><a href="link2.html">second item</a></li> 8 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 9 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 10 <li class="item-0"><a href="link5.html">fifth item</a></li> 11 </ul> 12 </div> 13 ‘‘‘ 14 15 doc = PyQuery(html) 16 items = doc(‘.list‘) 17 18 print( 19 type(items), 20 items, 21 sep=‘\\n‘ 22 ) 23 24 print( 25 type(items.find(‘li‘)), 26 items.find(‘li‘), 27 sep=‘\\n‘ 28 ) 29 30 31 # 输出: 32 <class ‘pyquery.pyquery.PyQuery‘> 33 <ul class="list"> 34 <li class="item-0">first item</li> 35 <li class="item-1"><a href="link2.html">second item</a></li> 36 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 37 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 38 <li class="item-0"><a href="link5.html">fifth item</a></li> 39 </ul> 40 41 <class ‘pyquery.pyquery.PyQuery‘> 42 <li class="item-0">first item</li> 43 <li class="item-1"><a href="link2.html">second item</a></li> 44 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 45 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 46 <li class="item-0"><a href="link5.html">fifth item</a></li> 47
# find()方法查找的是所有子孙节点,如果只查找子节点,可以使用children()方法
1 from pyquery import PyQuery 2 3 html = ‘‘‘ 4 <div id="container"> 5 <ul class="list"> 6 <li class="item-0">first item</li> 7 <li class="item-1"><a href="link2.html">second item</a></li> 8 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 9 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 10 <li class="item-0"><a href="link5.html">fifth item</a></li> 11 </ul> 12 </div> 13 ‘‘‘ 14 15 doc = PyQuery(html) 16 items = doc(‘.list‘) 17 18 print(items, ‘\\n‘) 19 20 print( 21 type(items.parent()), 22 items.parent(), 23 sep=‘\\n‘ 24 ) 25 26 27 # 输出: 28 <ul class="list"> 29 <li class="item-0">first item</li> 30 <li class="item-1"><a href="link2.html">second item</a></li> 31 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 32 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 33 <li class="item-0"><a href="link5.html">fifth item</a></li> 34 </ul> 35 36 37 <class ‘pyquery.pyquery.PyQuery‘> 38 <div id="container"> 39 <ul class="list"> 40 <li class="item-0">first item</li> 41 <li class="item-1"><a href="link2.html">second item</a></li> 42 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 43 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 44 <li class="item-0"><a href="link5.html">fifth item</a></li> 45 </ul> 46 </div>
parents(selector=None)
parent(selector=None)
1 from pyquery import PyQuery 2 3 html = ‘‘‘ 4 <div id="container"> 5 <ul class="list"> 6 <li class="item-0">first item</li> 7 <li class="item-1"><a href="link2.html">second item</a></li> 8 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 9 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 10 <li class="item-0"><a href="link5.html">fifth item</a></li> 11 </ul> 12 </div> 13 ‘‘‘ 14 15 doc = PyQuery(html) 16 17 # 选择class为list的节点内部class为item-0和active的节点 18 items = doc(‘.list .item-0.active‘) 19 20 print( 21 type(items.siblings()), 22 items.siblings(), 23 sep=‘\\n‘ 24 ) 25 26 print("\\n", items.siblings(‘.active‘)) 27 28 29 # 输出: 30 <class ‘pyquery.pyquery.PyQuery‘> 31 <li class="item-1"><a href="link2.html">second item</a></li> 32 <li class="item-0">first item</li> 33 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 34 <li class="item-0"><a href="link5.html">fifth item</a></li> 35 36 37 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 38
-
-
遍历
1 from pyquery import PyQuery 2 3 html = ‘‘‘ 4 <div id="container"> 5 <ul class="list"> 6 <li class="item-0">first item</li> 7 <li class="item-1"><a href="link2.html">second item</a></li> 8 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 9 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 10 <li class="item-0"><a href="link5.html">fifth item</a></li> 11 </ul> 12 </div> 13 ‘‘‘ 14 15 doc = PyQuery(html) 16 lis = doc(‘li‘).items() # 调用items()方法,得到一个生成器 17 18 for li in lis: 19 print( 20 li, 21 type(li) 22 ) 23 24 25 # 输出: 26 <li class="item-0">first item</li> 27 <class ‘pyquery.pyquery.PyQuery‘> 28 <li class="item-1"><a href="link2.html">second item</a></li> 29 <class ‘pyquery.pyquery.PyQuery‘> 30 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 31 <class ‘pyquery.pyquery.PyQuery‘> 32 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 33 <class ‘pyquery.pyquery.PyQuery‘> 34 <li class="item-0"><a href="link5.html">fifth item</a></li> 35 <class ‘pyquery.pyquery.PyQuery‘>
-
获取信息
-
获取属性
attr()方法获取属性
1 from pyquery import PyQuery 2 3 html = ‘‘‘ 4 <div id="container"> 5 <ul class="list"> 6 <li class="item-0">first item</li> 7 <li class="item-1"><a href="link2.html">second item</a></li> 8 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 9 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 10 <li class="item-0"><a href="link5.html">fifth item</a></li> 11 </ul> 12 </div> 13 ‘‘‘ 14 15 doc = PyQuery(html) 16 a = doc(‘.item-0.active a‘) 17 print( 18 a, 19 type(a), 20 a.attr(‘href‘), # 也可以用a.attr.href,两者作用相同 21 sep=‘\\n‘ 22 ) 23 24 25 # 输出: 26 <a href="link3.html"><span class="bold">third item</span></a> 27 <class ‘pyquery.pyquery.PyQuery‘> 28 link3.html
# 当返回结果包含多个节点时,调用attr()方法,只会得到第一个节点的属性。如果想获取所有返回的节点的属性,就要用到遍历了
-
获取文本
from pyquery import PyQuery

html = '''
<div id="container">
    <ul class="list">
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = PyQuery(html)
li = doc('li')

print(
    li.html(),        # 获取节点内部的HTML文本;匹配到多个节点时,只返回第一个节点的内容
    li.text(),        # 获取节点的纯文本内容;匹配到多个节点时,返回所有节点的文本,以空格连接
    type(li.text()),
    sep='\n'
)


# 输出:
first item
first item second item third item fourth item fifth item
<class 'str'>
-
-
节点操作
-
add_class() 和 remove_class() ---- 添加class、移除class
1 from pyquery import PyQuery 2 3 html = ‘‘‘ 4 <div id="container"> 5 <ul class="list"> 6 <li class="item-0">first item</li> 7 <li class="item-1"><a href="link2.html">second item</a></li> 8 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 9 <li class="item-1 active"><a href="link4.html">fourth item</a></li> 10 <li class="item-0"><a href="link5.html">fifth item</a></li> 11 </ul> 12 </div> 13 ‘‘‘ 14 15 doc = PyQuery(html) 16 li = doc(‘.item-0.active‘) 17 18 print(li) 19 print(li.remove_class(‘active‘)) 20 print(li.add_class(‘active‘)) 21 22 23 # 输出: 24 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 25 26 <li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li> 27 28 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> 29
-
attr、text 和 html
# attr(*args, **kwargs) ---- Attributes manipulation
# text(value=no_default, **kwargs) ---- Get or set the text representation of sub nodes.
# html(value=no_default, **kwargs) ---- Get or set the html representation of sub nodes.

from pyquery import PyQuery

html = '''
<div id="container">
    <ul class="list">
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    </ul>
</div>
'''

doc = PyQuery(html)

li = doc('.item-0.active')
print(li)

li.attr('name', 'link')     # 添加属性name,属性值为link
print(li)

li.text('change item')      # 将节点内部的内容改为'change item'
print(li)

li.html('<span>change item</span>')     # 将节点内部的内容改为'<span>change item</span>'
print(li)


# 输出:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0 active" name="link">change item</li>

<li class="item-0 active" name="link"><span>change item</span></li>
-
remove()----删除节点
1 from pyquery import PyQuery 2 3 html = ‘‘‘ 4 <div class="LeeHua"> 5 LiYihua 6 <ul class="201802004731">liyihua</ul> 7 </div> 8 ‘‘‘ 9 10 doc = PyQuery(html) 11 Leehua = doc(‘.LeeHua‘) 12 print("移除节点ul前的输出:\\n"+Leehua.text()) 13 14 Leehua.find(‘ul‘).remove() 15 print("移除节点ul后的输出:\\n"+Leehua.text()) 16 17 18 # 输出: 19 移除节点ul前的输出: 20 LiYihua 21 liyihua 22 移除节点ul后的输出: 23 LiYihua
-
-
伪类选择器
- 示例:
from pyquery import PyQuery

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
</div>
'''

doc = PyQuery(html)

# 选择属于父元素的第一个子元素的每个 <li> 元素。
li = doc('li:first-child')
print(li)

# 选择属于父元素的最后一个子元素的每个 <li> 元素。
li = doc('li:last-child')
print(li)

# 选择属于其父元素的第二个子元素的每个 <li> 元素
li = doc('li:nth-child(2)')
print(li)

# 选择索引大于2(索引从0开始)的 <li> 元素,即第三个之后的所有 <li> 元素;本例中恰好是最后两个
li = doc('li:gt(2)')
print(li)

# 选择属于父元素的第偶数个子元素的每个 <li> 元素。
li = doc('li:nth-child(2n)')
print(li)

# 选择文本中包含'second'的每个 <li> 元素
li = doc('li:contains(second)')
print(li)


# 输出:
<li class="item-0">first item</li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>

<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-1"><a href="link2.html">second item</a></li>
CSS 选择器的用法:http://www.w3school.com.cn/cssref/css_selectors.asp
- 示例:
以上是关于使用pyquery的主要内容,如果未能解决你的问题,请参考以下文章
在使用加载数据流步骤的猪中,使用(使用 PigStorage)和不使用它有啥区别?