爬虫学习（十四）——xpath项目实践

Posted 2021-02-09 kuangkuangduangduang

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了爬虫学习（十四）——xpath项目实践相关的知识，希望对你有一定的参考价值。

import os
import time
import urllib.request
import urllib.parse
from lxml import etree


# 构建面向对象的代码方式
class ZhanZhang(object):
    """Scraper that downloads the images linked from a gallery listing page.

    The listing page is fetched with the supplied headers, every thumbnail
    link on it is followed to its detail page, and the download link found
    there is saved to disk under the thumbnail's ``alt`` text.
    """

    def __init__(self, url, headers):
        """Store the listing URL and request headers and build the opener.

        Args:
            url: Address of the gallery listing page.
            headers: Mapping of HTTP headers sent with every request.
        """
        self.headers = headers
        self.url = url
        self.opener = urllib.request.build_opener(urllib.request.HTTPHandler())

    def request(self, url=None):
        """Open a URL with the stored headers and return the response.

        Args:
            url: Address to fetch; defaults to the listing page ``self.url``.

        Returns:
            The response object produced by ``self.opener.open``. The caller
            is responsible for closing it (it works as a context manager).
        """
        target = self.url if url is None else url
        req = urllib.request.Request(target, headers=self.headers)
        return self.opener.open(req)

    def paserhtml(self):
        """Parse the listing page and collect detail links and image names.

        Returns:
            A dict with ``"imgUrl"`` (list of ``href`` values), ``"imgName"``
            (list of ``alt`` values, ``None`` where the attribute is absent)
            and ``"num"`` (number of links found).
        """
        with self.request() as response:
            html = response.read().decode("utf8")
        tree = etree.HTML(html)
        # Guard against a parser result of None so an unparsable page yields
        # an empty listing instead of an AttributeError.
        if tree is None:
            anchors = []
        else:
            anchors = tree.xpath('//div[@id="container"]/div[@class]/div/a')
        img_urls = [anchor.get("href") for anchor in anchors]
        img_names = [anchor.get("alt") for anchor in anchors]
        return {"imgUrl": img_urls, "imgName": img_names, "num": len(img_urls)}

    def paserHtml(self):
        """Alias of :meth:`paserhtml` for callers using the camel-case name."""
        return self.paserhtml()

    @staticmethod
    def _build_filename(name, index):
        """Return a safe ``.png`` file name for image number *index*.

        Path separators in *name* are replaced so a scraped name cannot point
        outside the target directory; a missing or blank name falls back to
        ``image_<index>``.
        """
        base = (name or "").strip()
        for separator in ("/", "\\"):
            base = base.replace(separator, "_")
        if not base:
            base = "image_%d" % index
        return base + ".png"

    def _download_one(self, page_url, name, index, save_dir):
        """Follow one detail link and save the image it points to.

        Raises:
            ValueError: If the listing entry has no link, or the detail page
                has no download link.
            OSError: On network or file-system failure.
        """
        if not page_url:
            raise ValueError("listing entry has no detail link")
        # Resolve relative / protocol-relative links against the listing page.
        detail_url = urllib.parse.urljoin(self.url, page_url)
        with self.request(detail_url) as response:
            detail_html = response.read().decode("utf8")
        detail_tree = etree.HTML(detail_html)
        if detail_tree is None:
            links = []
        else:
            links = detail_tree.xpath("//div[@class='img_text']/span/a")
        if not links or not links[0].get("href"):
            raise ValueError("no download link found on %s" % detail_url)
        image_url = urllib.parse.urljoin(detail_url, links[0].get("href"))
        filepath = os.path.join(save_dir, self._build_filename(name, index))
        with self.request(image_url) as response, open(filepath, "wb") as out:
            out.write(response.read())

    def download(self, save_dir="金毛"):
        """Download every image found on the listing page.

        A failure on one image is reported and the loop moves on to the next
        one, so a single bad entry no longer aborts the whole run.

        Args:
            save_dir: Directory the images are written to; created if missing.
        """
        img_data = self.paserhtml()
        os.makedirs(save_dir, exist_ok=True)
        entries = zip(img_data["imgUrl"], img_data["imgName"])
        for index, (page_url, name) in enumerate(entries, start=1):
            print("正在下载第%d张图片" % index)
            try:
                self._download_one(page_url, name, index, save_dir)
            except (OSError, ValueError, etree.LxmlError) as e:
                print("第%d张图片下载失败: %s" % (index, e))
            # Pause between entries to keep the request rate low.
            time.sleep(1)
        print("数据请求完成")

# 主函数，调用类方法
def main():
    """Create a ZhanZhang scraper for the gallery URL and start the download."""
    url = "http://sc.chinaz.com/tupian/jinmaoquantupian.html"
    # Browser-style User-Agent sent with the requests
    # (presumably so the site treats the scraper like a browser -- confirm).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36',
    }
    zhanzhang_photo = ZhanZhang(url, headers)
    zhanzhang_photo.download()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

以上是关于爬虫学习（十四）——xpath项目实践的主要内容，如果未能解决你的问题，请参考以下文章