xml--xpath--from lxml import etree
HTML --- used to display data; XML --- used to transmit data.
The lxml library --- etree --- XPath. The XPath Helper browser tool is handy for building and debugging the expressions. from lxml import etree  # version/packaging quirk: a bare `import lxml` sometimes leaves lxml.etree unusable, so import the submodule explicitly
xml = etree.HTML(html)  # convert the HTML string into an HTML DOM
link_list = xml.xpath('//div[@class="threadlist_lz clearfix"]//a[@class="j_th_tit "]/@href')  # list of matching hrefs
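A minimal, self-contained sketch of that pattern (the HTML fragment and the /p/123456 href are made up, not real Tieba markup): parse with etree.HTML and query with xpath(). Two details trip people up: the etree submodule must be imported explicitly, and the trailing space in class="j_th_tit " is part of the attribute value, so the XPath string must match it exactly.

from lxml import etree  # explicit submodule import; `import lxml` alone is not enough

html = '<div class="threadlist_lz clearfix"><a class="j_th_tit " href="/p/123456">demo thread</a></div>'
dom = etree.HTML(html)  # build an HTML DOM from the string
links = dom.xpath('//div[@class="threadlist_lz clearfix"]//a[@class="j_th_tit "]/@href')
print(links)  # ['/p/123456']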
# =======================================================================================================
# 1. Extract the link of each thread on a list page
#    class="threadlist_title pull_left j_th_tit member_thread_title_frs "   -- class used for member (VIP) threads
#    class="threadlist_title pull_left j_th_tit "                           -- class used for ordinary threads
#    class="threadlist_detail clearfix"
#    ===== keep walking up the tree until you find a class both kinds of thread share
#    //div[@class="threadlist_lz clearfix"]//a[@class="j_th_tit "]/@href
#    //  -- selects matching descendants at any depth
# 2. Extract the image links inside each thread
#    //img[@class='BDE_Image']/@src
# =======================================================================================================
import urllib.request
import random
import os
import time
from concurrent.futures import ThreadPoolExecutor
from lxml import etree


def getUrl(url, user_agent):
    '''
    Request the given list page and return the thread links found on it.
    :param url: the list-page url to crawl
    '''
    print(url)
    request = urllib.request.Request(url)
    request.add_header('User-Agent', user_agent)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    my_xml = etree.HTML(html)
    url_list = my_xml.xpath('//div[@class="threadlist_lz clearfix"]//a[@class="j_th_tit "]/@href')
    return url_list


# list pages look like: https://tieba.baidu.com/f?kw=美女&pn=50
def loadPicUrl(obj):
    '''Callback for getUrl: fetch every thread it found, one task per thread.'''
    url_list = obj.result()
    with ThreadPoolExecutor() as e:
        for url in url_list:
            e.submit(getPic, 'https://tieba.baidu.com' + url).add_done_callback(savePic)
        # the with-block already waits on exit, so this shutdown is redundant but harmless
        e.shutdown(wait=True)


def getPic(url):
    '''Request a thread page and return the image urls it contains.'''
    html = urllib.request.urlopen(url).read()
    xml = etree.HTML(html)
    pic_url_list = xml.xpath("//img[@class='BDE_Image']/@src")
    return pic_url_list


def savePic(obj):
    '''
    Callback for getPic: download each image and write it to the current working directory.
    :param obj: the finished Future whose result is the list of image urls
    '''
    pic_url_list = obj.result()
    for pic_url in pic_url_list:
        path = os.path.join(os.getcwd(), pic_url.split('/')[-1])  # file name taken from the url
        content = urllib.request.urlopen(pic_url).read()
        with open(path, 'wb') as f:
            f.write(content)


def tiebaSpider(kw, beginPage, endPage):
    '''Crawl the list pages from beginPage to endPage and hand each one to getUrl.'''
    new_kw = urllib.request.quote(kw)
    executor = ThreadPoolExecutor()
    for page in range(beginPage, endPage + 1):
        # Tieba paginates with pn=0, 50, 100, ... so page 1 maps to pn=0
        new_url = url + '?kw=' + new_kw + '&pn=' + str((page - 1) * 50)
        user_agent = random.choice(ua_list)
        future = executor.submit(getUrl, new_url, user_agent)
        future.add_done_callback(loadPicUrl)
    executor.shutdown(wait=True)
    # html = loadPage(new_url, user_agent)
    # savePage(html, kw, page)


if __name__ == '__main__':
    url = 'http://tieba.baidu.com/f'
    # header values only -- do not include a "User-Agent:" prefix here,
    # because add_header('User-Agent', ...) already supplies the header name
    ua_list = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]
    kw = input('Enter the Tieba forum to crawl: ')
    beginPage = int(input('Enter the start page: '))
    endPage = int(input('Enter the end page: '))
    start_time = time.time()
    tiebaSpider(kw, beginPage, endPage)
    end_time = time.time()
    print(end_time - start_time)  # e.g. 2.0721185207366943
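The concurrency in the script is the submit(...).add_done_callback(...) chain: each stage returns its data, and the callback receives the finished Future and pulls the data out with result(). A minimal sketch of that pattern with hypothetical fetch/report stages (these names and URLs are illustrative, not part of the original script):

from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    # stage 1: do some work and return a value
    return url, len(url)

def report(future):
    # stage 2: runs when stage 1 finishes; the Future carries its return value
    url, size = future.result()
    print(url, '->', size)

with ThreadPoolExecutor(max_workers=4) as pool:
    for u in ['https://example.com/a', 'https://example.com/b']:
        pool.submit(fetch, u).add_done_callback(report)
# the with-block waits for all submitted tasks before exiting

One caveat of this style: an exception raised inside a done-callback is only logged and otherwise ignored by the executor, which is why errors in a stage like savePic can fail silently.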