Web Crawlers, Part 2

Posted by sima-3

1. Images from xiaohuar.com

The first spider walks the gallery index pages of xiaohuar.com, pulls each entry's image URL and title, and saves the images to a local folder.

from requests_html import HTMLSession
import os


class spider():
    def __init__(self):
        self.session = HTMLSession()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
        }

    def get_index_url(self):
        # The first index page has no number; later pages are index_2.html, index_3.html, ...
        for i in range(1, 4):
            if i == 1:
                yield 'http://www.xiaohuar.com/meinv/index.html'
            else:
                yield 'http://www.xiaohuar.com/meinv/index_%s.html' % i

    def get_img_name(self, index_url):
        r = self.session.get(url=index_url, headers=self.headers)
        elements_list = r.html.find('#images .items')
        for element in elements_list:
            img_url: str = element.find('img', first=True).attrs.get('src')
            if not img_url.startswith('http'):
                img_url = 'http://www.xiaohuar.com' + img_url
            # Strip path separators from the title so it is a safe file name.
            img_name = element.find('.p_title>a', first=True).text.replace('\\', '').replace('/', '') + '.jpg'
            yield img_url, img_name

    def save_img(self, img_url, img_name):
        r = self.session.get(url=img_url)
        os.makedirs('校花图片', exist_ok=True)  # make sure the target folder exists
        img_path = os.path.join('校花图片', img_name)
        with open(img_path, 'wb') as f:
            f.write(r.content)
            print('%s下载完毕' % img_name)  # "%s finished downloading"

    def run(self):
        for index_url in self.get_index_url():
            for img_url, img_name in self.get_img_name(index_url):
                self.save_img(img_url, img_name)


if __name__ == '__main__':
    xiaohua = spider()
    xiaohua.run()
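
HTMLSession is a requests.Session subclass, so standard requests keyword arguments such as timeout pass straight through. As a hedged refinement (the helper name, delay, and timeout below are my own choices, not part of the original script), a small wrapper can add a timeout and a pause between downloads so the crawl stays polite:

import time

# Hypothetical helper: fetch with a timeout, then pause briefly so
# back-to-back requests don't hammer the server. Values are arbitrary.
def polite_get(session, url, headers=None, delay=0.5, timeout=10):
    r = session.get(url=url, headers=headers, timeout=timeout)
    time.sleep(delay)
    return r

Swapping the self.session.get(...) calls above for polite_get(self.session, ...) applies it without other changes.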

2. Douban

The second spider asks for a year range and a sort order, then pages through Douban's movie-search JSON endpoint 20 results at a time.

from requests_html import HTMLSession

# Quick test of the tag page:
# session = HTMLSession()
# url = 'https://movie.douban.com/tag/#/?sort=S&range=0,10&tags=2018'
#
# r = session.get(url=url)
# print(r.text)

# '电影' ("movie") URL-encodes to %E7%94%B5%E5%BD%B1:

# print(str('电影'.encode('utf-8')).strip("'b").replace('\\x', '%').upper())
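
# Equivalent one-liner with the standard library:
# from urllib.parse import quote
# print(quote('电影'))    # %E7%94%B5%E5%BD%B1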


class spider():
    def __init__(self):
        self.api = 'https://movie.douban.com/j/new_search_subjects?'
        self.session = HTMLSession()

    def get_params(self):
        year_range = input('输入年份')  # "enter a year range": two years separated by a comma
        sort = input('输入排序规则(S按评分)')  # "enter the sort rule (S = by rating)"

        self.params = {
            'year_range': year_range,
            'sort': sort,
            'start': 0
        }

    def get_data(self):
        # Page through the results; the endpoint serves 20 items per request.
        for i in range(10):
            self.params['start'] = i * 20
            r = self.session.get(url=self.api, params=self.params)
            print(r.json())

    def run(self):
        self.get_params()
        self.get_data()

if __name__ == '__main__':
    douban = spider()
    douban.run()
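
get_data only dumps the raw JSON. Assuming the endpoint still answers with a body shaped like {"data": [{"title": ..., "rate": ...}, ...]} (an assumption about Douban's undocumented API, which can change without notice), a drop-in replacement for get_data can print just the title and rating:

def get_data(self):
    for i in range(10):
        self.params['start'] = i * 20
        r = self.session.get(url=self.api, params=self.params)
        # Assumed response shape: {"data": [{"title": ..., "rate": ...}, ...]}
        for movie in r.json().get('data', []):
            print(movie.get('title'), movie.get('rate'))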

3. Videos from xiaohuar.com

The third spider follows each video detail page, extracts the HLS playlist URL embedded in an inline script, and downloads the playlist together with every .ts segment it lists.

from requests_html import HTMLSession
import os

class spider():

    def __init__(self):
        self.session = HTMLSession()

    def get_index_page(self):
        for i in range(7):
            url = 'http://www.xiaohuar.com/list-3-%s.html' % i
            yield url

    def parse_index_page(self, index_page):
        r = self.session.get(url=index_page)
        elements_list = r.html.find('#images .items a[class="imglink"]')
        for element in elements_list:
            yield element.attrs.get('href')

    def parse_detail_page(self, detail_page):
        r = self.session.get(url=detail_page)
        r.html.encoding = 'GBK'
        # The playlist URL sits in an inline script: var vHLSurl    = "...";
        result_obj = r.html.search('var vHLSurl    = "{}";')
        if result_obj:
            m3u8_url = result_obj[0]
            m3u8_name = r.html.find('title', first=True).text.replace('\\', '')
            yield m3u8_url, m3u8_name
        else:
            print('匹配失败,无资源')  # "no match: no video on this page"

    def save_m3u8(self, m3u8_url, m3u8_name):
        m3u8_dir = m3u8_name
        if not os.path.exists(m3u8_dir):
            os.mkdir(m3u8_dir)
        print(m3u8_url)
        r = self.session.get(url=m3u8_url)
        m3u8_path = os.path.join(m3u8_dir, 'playlist.m3u8')
        # Write the playlist, then rewind and read it back to find the segments.
        with open(m3u8_path, 'wt+', encoding='utf-8') as f:
            f.write(r.text)
            f.seek(0, 0)
            for line in f:
                line = line.strip()
                if line.endswith('.ts'):
                    # Segment URLs are relative to the playlist's directory.
                    ts_url = os.path.dirname(m3u8_url) + '/%s' % line
                    r = self.session.get(url=ts_url)
                    ts_path = os.path.join(m3u8_dir, line)
                    with open(ts_path, 'wb') as f1:
                        f1.write(r.content)
                        print('%s下载完毕' % line)  # "%s finished downloading"

    def run(self):
        for url in self.get_index_page():
            for detail_page in self.parse_index_page(url):
                for m3u8_url, m3u8_name in self.parse_detail_page(detail_page):
                    self.save_m3u8(m3u8_url, m3u8_name)


if __name__ == '__main__':
    xiaohua = spider()
    xiaohua.run()
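
The segments land on disk as individual .ts files. MPEG-TS is designed so that segments can be joined by plain byte concatenation, so a short helper (the function and output file name are my own; it assumes the playlist.m3u8 layout written by save_m3u8 above) can stitch a finished download back into a single file:

import os

def merge_ts(m3u8_dir, out_name='merged.ts'):
    # Read segment names back from the saved playlist so the order
    # matches the original stream.
    with open(os.path.join(m3u8_dir, 'playlist.m3u8'), encoding='utf-8') as f:
        segments = [line.strip() for line in f if line.strip().endswith('.ts')]
    with open(os.path.join(m3u8_dir, out_name), 'wb') as out:
        for seg in segments:
            with open(os.path.join(m3u8_dir, seg), 'rb') as part:
                out.write(part.read())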

4. Tmall

The fourth spider searches Tmall for a product, reads the total page count from a hidden form field, and prints the title and price of every listing.

from requests_html import HTMLSession

class spider():

    def __init__(self):
        self.session = HTMLSession()
        self.api = 'http://list.tmall.com/search_product.htm?'

    def get_params(self):
        pro = input('输入你要爬取的商品:')  # "enter the product to search for"
        self.params = {
            'q': pro,
            'totalPage': 1,
            'jumpto': 1
        }

    def get_totalPage(self):
        # The real page count sits in a hidden input named "totalPage".
        r = self.session.get(url=self.api, params=self.params)
        totalPage = r.html.find('[name="totalPage"]', first=True).attrs.get('value')
        self.params['totalPage'] = int(totalPage)

    def get_pro_info(self):
        for i in range(1, self.params['totalPage'] + 1):
            self.params['jumpto'] = i  # "jumpto" selects the page number
            r = self.session.get(url=self.api, params=self.params)
            elements_pro_list = r.html.find('.product')
            for element_pro in elements_pro_list:
                title = element_pro.find('.productTitle a', first=True).text
                price = element_pro.find('.productPrice em', first=True).attrs.get('title')
                print(title)
                print(price)
                print('-' * 30)

    def run(self):
        self.get_params()
        self.get_totalPage()
        self.get_pro_info()

if __name__ == '__main__':
    tmall = spider()
    tmall.run()
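
Tmall is quick to serve empty or challenge pages to clients that don't look like browsers, so this spider may print nothing as written. A reasonable first tweak (no guarantee it clears Tmall's current anti-bot checks) is to reuse the browser User-Agent from section 1 and send it with every request:

def __init__(self):
    self.session = HTMLSession()
    self.api = 'http://list.tmall.com/search_product.htm?'
    # Present a browser User-Agent; Tmall tends to reject bare clients.
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }

Each session.get call then gains a headers=self.headers argument.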