scrapy 爬取斗罗大陆漫画
Posted by sxqfuture
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了scrapy 爬取斗罗大陆漫画相关的知识,希望对你有一定的参考价值。
# -*- coding: utf-8 -*-
"""Scrapy spider that downloads Douluo Dalu comic images chapter by chapter.

Scrapy fetches the first chapter URL; a headless Selenium Chrome instance
(with anti-bot-detection JS patches) then pages through every image of each
chapter and follows the "next chapter" link until it runs out.
"""
import scrapy
import json
import os
import urllib.request  # fix: bare `import urllib` does not expose urllib.request in Python 3
import time
from scrapy.http import Request
from selenium import webdriver
# NOTE(review): the wait/EC/exception imports below are unused in this script;
# kept in case other project code relies on this module's namespace.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    WebDriverException,
    NoSuchElementException,
    StaleElementReferenceException
)


def gen_browser(driver_path):
    """Instantiate a headless Chrome driver patched to evade webdriver detection.

    :param driver_path: filesystem path to the chromedriver executable.
    :return: a ready-to-use ``webdriver.Chrome`` instance.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-gpu')
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('disable-infobars')
    options.add_argument('--disable-plugins-discovery')
    user_agent = ("Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                  "AppleWebKit/537.36 (Khtml, like Gecko) "
                  "Chrome/67.0.3396.99 Safari/537.36")
    options.add_argument('user-agent="{0}"'.format(user_agent))

    # ############### anti-detection patches ***************************
    def send(driver, cmd, params=None):
        """Send a raw command to the Chrome DevTools protocol.

        from: https://stackoverflow.com/questions/47297877/to-set-mutationobserver-how-to-inject-javascript-before-page-loading-using-sele/47298910#47298910
        """
        params = {} if params is None else params  # fix: avoid mutable default
        resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
        url = driver.command_executor._url + resource
        body = json.dumps({'cmd': cmd, 'params': params})
        response = driver.command_executor._request('POST', url, body)
        if response['status']:
            raise Exception(response.get('value'))
        return response.get('value')

    def add_script(driver, script):
        """Register JS that runs before each new document loads."""
        send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})

    # Monkey-patch the method onto webdriver.Chrome; change this target
    # if a different driver class is used.
    webdriver.Chrome.add_script = add_script
    # *************** anti-detection patches ###################

    browser = webdriver.Chrome(
        executable_path=driver_path,
        chrome_options=options
    )

    # ################## debug aid: this session can be re-attached remotely
    existed = {
        'executor_url': browser.command_executor._url,  # remote-attach URL
        'session_id': browser.session_id                # browser session ID
    }
    print(existed)
    # ********************* debug aid ##################

    # Spoof navigator properties that headless Chrome normally exposes.
    browser.add_script("""
        Object.defineProperty(navigator, 'webdriver', {
            get: () => false,
        });
        window.navigator.chrome = {
            runtime: {},
        };
        Object.defineProperty(navigator, 'languages', {
            get: () => ['zh-CN', 'zh']
        });
        Object.defineProperty(navigator, 'plugins', {
            get: () => [0, 1, 2],
        });
    """)
    return browser


class XuexingSpider(scrapy.Spider):
    """Walks chapter pages starting from ``start_urls`` and saves every image."""

    name = 'xuexing'
    allowed_domains = ['www.manhuatai.com']
    start_urls = ['https://www.mh1234.com/wap/comic/9683/262424.html']

    def parse(self, response):
        """Drive the Selenium browser through chapters until no next link remains."""
        driver_path = self.settings.get('DRIVER_PATH')
        browser = gen_browser(driver_path)
        next_url = response.url
        i = 0
        while next_url:
            i += 1
            print(str(i).center(60, '*'))
            next_url = self.get_item(browser, next_url)

    def get_item(self, browser, url=None):
        """Download every image of one chapter; return the next chapter's URL.

        :param browser: the Selenium driver produced by ``gen_browser``.
        :param url: chapter URL to open; ``None`` reuses the current page.
        :return: href of the "next chapter" link (falsy when exhausted).
        """
        if url is not None:
            browser.get(url)

        # Chapter title; drop the "(x/y)" page counter and sanitise '/'.
        van = browser.find_elements_by_xpath('//a[@class="BarTit"]')[0]\
            .text.split('(')[0].strip()
        if '/' in van:
            van = '-'.join(van.split('/'))

        # fix: original tested os.path.exists(van) (cwd-relative) but created
        # 斗罗大陆/{van}, so reruns crashed; makedirs handles both levels.
        chapter_dir = os.path.join('斗罗大陆', van)
        os.makedirs(chapter_dir, exist_ok=True)

        m = 0  # per-chapter image counter used as the filename

        def _save_current_image():
            """Save the currently displayed image, if any, as {m}.jpg."""
            nonlocal m
            img_url = browser.find_element_by_xpath('//*[@id="qTcms_pic"]')\
                .get_attribute('src')
            if img_url is not None:
                m += 1
                data = urllib.request.urlopen(img_url).read()
                # fix: consistent .jpg extension (original mixed .jpg and .png)
                with open(os.path.join(chapter_dir, '{0}.jpg'.format(m)), 'wb') as f:
                    f.write(data)

        _save_current_image()

        # Total page count of this chapter; click "next page" for each remaining one.
        ye = int(browser.find_element_by_xpath('//*[@id="k_total"]').text)
        for _ in range(1, ye):
            time.sleep(0.5)  # let the next image load before reading its src
            browser.find_element_by_xpath('//*[@id="action"]/ul/li[3]/a').click()
            _save_current_image()

        # "Next chapter" link; falsy/None ends the outer loop in parse().
        return browser.find_element_by_xpath('//*[@id="action"]/ul/li[4]/a')\
            .get_attribute('href')
以上是关于scrapy 爬取斗罗大陆漫画的主要内容,如果未能解决你的问题,请参考以下文章