爬虫之pyquery库
Posted believepd
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫之pyquery库相关的知识,希望对你有一定的参考价值。
官方文档:https://pyquery.readthedocs.io/en/latest/
PyQuery是一个强大又灵活的网页解析库。如果你觉得正则写起来太麻烦、BeautifulSoup语法太难记,而你熟悉jQuery的语法,那么PyQuery就是你的绝佳选择。
一、开始
字符串初始化:
# Build a PyQuery document directly from an HTML string.
from pyquery import PyQuery as pq

d = pq("<html>哈哈哈</html>")  # d now plays the role of jQuery's $
root = d("html")
print(root)
URL初始化:
# Build a PyQuery document from a URL (the page is fetched over the network).
from pyquery import PyQuery as pq

d = pq(url="https://www.baidu.com")
head = d("head")
print(head)
文件初始化:
# Build a PyQuery document from a local HTML file.
# The original snippet wrapped the file name in typographic quotes, which is a
# SyntaxError in Python; plain ASCII quotes are used here.
from pyquery import PyQuery as pq

d = pq(filename="demo.html")  # filename is the path of the file to parse
print(d("head"))
二、基本CSS选择器
# Select every <li> that sits under .list inside #container, using a
# descendant CSS selector.
from pyquery import PyQuery as pq

html = """ <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> """

d = pq(html)
matched = d("#container .list li")
print(matched)
三、查找元素
子元素
d("css选择器").find("li")  # find() 匹配所有后代元素;只取直接子元素可用 children()
# Use find() on a selection to look up <li> elements underneath it.
from pyquery import PyQuery as pq

html = """ <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> """

d = pq(html)

items = d(".list")
print(type(items))  # <class 'pyquery.pyquery.PyQuery'>

li = items.find("li")
print(type(li))  # <class 'pyquery.pyquery.PyQuery'>
print(li)
# Output shown in the original article (line breaks added for readability):
# <li class="item-0">first item</li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
父元素
d("css选择器").parents(<css选择器(可无)>)  # parents() 返回所有祖先元素;只取直接父元素可用 parent()
# Call parents() without a selector on the .list selection and print the result.
from pyquery import PyQuery as pq

html = """ <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> """

d = pq(html)
items = d(".list")

parents = items.parents()
print(parents)
# Output shown in the original article (line breaks added for readability):
# <div class="wrap">
#   <div id="container">
#     <ul class="list">
#       <li class="item-0">first item</li>
#       <li class="item-1"><a href="link2.html">second item</a></li>
#       <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
#       <li class="item-1 active"><a href="link4.html">fourth item</a></li>
#       <li class="item-0"><a href="link5.html">fifth item</a></li>
#     </ul>
#   </div>
# </div>
# <div id="container">
#   <ul class="list">
#     <li class="item-0">first item</li>
#     <li class="item-1"><a href="link2.html">second item</a></li>
#     <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
#     <li class="item-1 active"><a href="link4.html">fourth item</a></li>
#     <li class="item-0"><a href="link5.html">fifth item</a></li>
#   </ul>
# </div>
# Call parents() with the ".wrap" selector so only matching ancestors are kept.
from pyquery import PyQuery as pq

html = """ <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> """

d = pq(html)
items = d(".list")

parents = items.parents(".wrap")
print(parents)
# Output shown in the original article (line breaks added for readability):
# <div class="wrap">
#   <div id="container">
#     <ul class="list">
#       <li class="item-0">first item</li>
#       <li class="item-1"><a href="link2.html">second item</a></li>
#       <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
#       <li class="item-1 active"><a href="link4.html">fourth item</a></li>
#       <li class="item-0"><a href="link5.html">fifth item</a></li>
#     </ul>
#   </div>
# </div>
兄弟元素
d("css选择器").siblings(<css选择器(可无)>)
# Print the siblings of the <li> that carries both item-0 and active classes.
from pyquery import PyQuery as pq

html = """ <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> """

d = pq(html)
li = d(".list .item-0.active")

others = li.siblings()
print(others)
# Output shown in the original article (line breaks added for readability):
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0">first item</li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# Print only those siblings of the selected <li> that match ".active".
from pyquery import PyQuery as pq

html = """ <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> """

d = pq(html)
li = d(".list .item-0.active")

active_siblings = li.siblings(".active")
print(active_siblings)
# Output shown in the original article:
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
四、遍历
# Iterate over a multi-element selection with items() and print each element.
from pyquery import PyQuery as pq

html = """ <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> """

d = pq(html)

li = d("li").items()
print(type(li))  # <class 'generator'>

for element in li:
    print(element)
# Output shown in the original article:
# <li class="item-0">first item</li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
五、获取信息
获取属性
# Read the href attribute of a selected <a> in two equivalent spellings:
# a call, attr("href"), and attribute-style access, attr.href.
from pyquery import PyQuery as pq

html = """ <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> """

d = pq(html)
a = d(".item-0.active a")

href_by_call = a.attr("href")
print(href_by_call)

href_by_attribute = a.attr.href
print(href_by_attribute)
获取文本
# Print the text content of the selected <a> element.
from pyquery import PyQuery as pq

html = """ <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> """

d = pq(html)
a = d(".item-0.active a")

label = a.text()
print(label)
# Output shown in the original article:
# third item
获取html
# Print a selected <li> itself, then the markup returned by its html() method.
from pyquery import PyQuery as pq

html = """ <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> """

d = pq(html)
li = d(".item-0.active")

print(li)
inner = li.html()
print(inner)
# Output shown in the original article:
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
# <a href="link3.html"><span class="bold">third item</span></a>
六、DOM操作
addClass()、removeClass()
# Remove the "active" class from a selected <li> and add it back, printing the
# element before and after each change.
from pyquery import PyQuery as pq

html = """ <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> """

d = pq(html)
li = d(".item-0.active")
print(li)

li.removeClass("active")
print(li)

li.addClass("active")
print(li)
# Output shown in the original article (one line per print):
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
# <li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
attr()、css()
# Set an attribute with attr(name, value) and an inline style with
# css(property, value), printing the element after each step.
from pyquery import PyQuery as pq

html = """ <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> """

d = pq(html)
li = d(".item-0.active")
print(li)

li.attr("name", "link")
print(li)

li.css("font-size", "14px")
print(li)
# Output shown in the original article (one line per print):
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
# <li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
# <li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>
remove()
# Remove the nested <p> from .wrap and compare the text before and after.
from pyquery import PyQuery as pq

html = """ <div class="wrap"> Hello, World. <p>This is a paragraph.</p> </div> """

d = pq(html)
wrap = d(".wrap")

print(wrap.text())
# Output shown in the original article:
# Hello, World. This is a paragraph.

paragraph = wrap.find("p")
paragraph.remove()
print(wrap.text())  # Hello, World.
其他DOM方法
https://pyquery.readthedocs.io/en/latest/api.html
七、伪类选择器
# Demonstrate pseudo-class selectors on the sample list.
from pyquery import PyQuery as pq

html = """ <div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div> """

d = pq(html)

li = d("li:first-child")
print(li)  # <li class="item-0">first item</li>

li = d("li:last-child")
print(li)  # <li class="item-0"><a href="link5.html">fifth item</a></li>

li = d("li:nth-child(2)")
print(li)  # <li class="item-1"><a href="link2.html">second item</a></li>

li = d("li:gt(2)")  # zero-based index greater than 2
print(li)
# Output shown in the original article:
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>

li = d("li:nth-child(2n)")  # elements at even positions (counting from 1)
print(li)
# Output shown in the original article:
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>

li = d("li:contains(second)")  # match by text: elements whose text contains "second"
print(li)  # <li class="item-1"><a href="link2.html">second item</a></li>
更多选择器:http://www.w3school.com.cn/cssref/css_selectors.asp
以上是关于爬虫之pyquery库的主要内容,如果未能解决你的问题,请参考以下文章