scrapy 爬取斗罗大陆漫画

Posted sxqfuture

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了scrapy 爬取斗罗大陆漫画相关的知识,希望对你有一定的参考价值。

# -*- coding: utf-8 -*-
import json
import os
import time
import urllib
import urllib.request

import scrapy
from scrapy.http import Request
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def gen_browser(driver_path):
    """Create a stealth-configured headless Chrome driver.

    Restores the string literals that were stripped from the original
    (every ``add_argument`` call, dict keys, the ``"POST"`` verb and the
    curly quotes inside the injected JavaScript were mangled).

    Args:
        driver_path: filesystem path to the chromedriver executable.

    Returns:
        A ``webdriver.Chrome`` instance that injects anti-bot-detection
        JavaScript before every new document loads.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--ignore-certificate-errors")
    options.add_argument("disable-infobars")
    options.add_argument("--disable-plugins-discovery")
    user_agent = (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (Khtml, like Gecko) "
        "Chrome/67.0.3396.99 Safari/537.36"
    )
    options.add_argument("user-agent={0}".format(user_agent))
    # ############### anti-detection helpers ***************************

    def send(driver, cmd, params=None):
        """Send a raw command to the Chrome DevTools debugger.

        from: https://stackoverflow.com/questions/47297877/to-set-mutationobserver-how-to-inject-javascript-before-page-loading-using-sele/47298910#47298910
        """
        # Avoid a mutable default argument shared across calls.
        if params is None:
            params = {}
        resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
        url = driver.command_executor._url + resource
        body = json.dumps({"cmd": cmd, "params": params})
        response = driver.command_executor._request("POST", url, body)
        # A truthy "status" signals a DevTools-level error.
        if response["status"]:
            raise Exception(response.get("value"))
        return response.get("value")

    def add_script(driver, script):
        """Register *script* to run before every new document loads."""
        send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})

    # Attach add_script as a method named add_script on webdriver.Chrome.
    # NOTE: adjust the target class here if a different driver is used.
    webdriver.Chrome.add_script = add_script
    browser = webdriver.Chrome(
        executable_path=driver_path,
        chrome_options=options
    )
    # ################## debugging aid *********************
    existed = {
        "executor_url": browser.command_executor._url,  # remote-attach URL for this browser
        "session_id": browser.session_id  # browser session id
    }
    print(existed)
    # ********************* debugging aid ##################
    # Hide the usual webdriver fingerprints before any page script runs.
    browser.add_script("""
    Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
    });
    window.navigator.chrome = {
        runtime: {},
    };
    Object.defineProperty(navigator, 'languages', {
        get: () => ['zh-CN', 'zh']
    });
    Object.defineProperty(navigator, 'plugins', {
        get: () => [0, 1, 2],
    });
    """)
    return browser


class XuexingSpider(scrapy.Spider):
    """Spider that walks comic chapters with selenium and saves every page image.

    Fixes over the scraped original: all stripped string literals restored;
    the directory-existence check now tests the actual target path
    ``斗罗大陆/<chapter>`` instead of the bare chapter name; the duplicated
    image-download code is factored into ``_save_image`` (which also unifies
    the inconsistent ``.jpg``/``.png`` extensions to ``.jpg``); files are
    closed via ``with``.
    """

    name = "xuexing"
    allowed_domains = ["www.manhuatai.com"]
    start_urls = ["https://www.mh1234.com/wap/comic/9683/262424.html"]

    def parse(self, response):
        """Open a selenium browser and follow chapter links until exhausted."""
        driver_path = self.settings.get("DRIVER_PATH")
        browser = gen_browser(driver_path)
        next_url = response.url
        chapter_count = 0
        # get_item returns the next chapter's href; an empty/None href ends the walk.
        while next_url:
            chapter_count += 1
            print(str(chapter_count).center(60, "*"))
            next_url = self.get_item(browser, next_url)

    def _save_image(self, browser, van, m):
        """Save the currently displayed page image, if any.

        Args:
            browser: the selenium driver positioned on a comic page.
            van: sanitized chapter name (directory under 斗罗大陆/).
            m: images saved so far for this chapter.

        Returns:
            The updated image counter (incremented when an image was saved).
        """
        img_el = browser.find_element_by_xpath('//*[@id="qTcms_pic"]')
        img_url = img_el.get_attribute("src")
        if img_url is not None:
            m += 1
            data = urllib.request.urlopen(img_url).read()
            with open("斗罗大陆/{0}/{1}.jpg".format(van, m), "wb") as f:
                f.write(data)
        return m

    def get_item(self, browser, url=None):
        """Download every page image of one chapter; return the next chapter URL.

        Args:
            browser: selenium driver (reused across chapters).
            url: chapter URL to open, or None to use the current page.

        Returns:
            The href of the "next chapter" link (may be empty at the end).
        """
        if url is not None:
            browser.get(url)  # open the chapter page
        title_els = browser.find_elements_by_xpath('//a[@class="BarTit"]')
        # Chapter title up to the first "(" ; "/" would break the path, so replace it.
        van = title_els[0].text.split("(")[0].strip()
        if "/" in van:
            van = "-".join(van.split("/"))
        chapter_dir = os.path.join("斗罗大陆", van)
        # BUG fix: the original tested os.path.exists(van) — the bare chapter
        # name relative to cwd — before creating 斗罗大陆/<van>. makedirs
        # creates both directory levels in one call.
        if not os.path.exists(chapter_dir):
            os.makedirs(chapter_dir)
        # First page is already displayed; save it, then click through the rest.
        m = self._save_image(browser, van, 0)
        total_pages = int(browser.find_element_by_xpath('//*[@id="k_total"]').text)
        for _ in range(1, total_pages):
            time.sleep(0.5)  # let the site swap in the next image
            browser.find_element_by_xpath('//*[@id="action"]/ul/li[3]/a').click()
            m = self._save_image(browser, van, m)

        next_href = browser.find_element_by_xpath(
            '//*[@id="action"]/ul/li[4]/a'
        ).get_attribute("href")
        return next_href

 

以上是关于scrapy 爬取斗罗大陆漫画的主要内容,如果未能解决你的问题,请参考以下文章

scrapy按顺序启动多个爬虫代码片段(python3)

Python3 爬虫U28_多线程爬取斗图啦的表情包

斗罗大陆H5手游黄金铁三角怎么过?

scrapy主动退出爬虫的代码片段(python3)

以开发之名|斗罗大陆:创造一个尽情探险的开放式游戏世界

以开发之名|斗罗大陆:创造一个尽情探险的开放式游戏世界