Scraping comics from 哦漫画 (omanhua.com) with Python


    import requests
    from lxml import etree
    from bs4 import BeautifulSoup
    import os
    from selenium import webdriver


    # Parse a single comic page and download its image
    def manhua(url):
        browser.get(url)
        # Page source as rendered by the simulated browser
        html = browser.page_source
        html = etree.HTML(html)
        img_url = html.xpath('//img[@id="mangaFile"]/@src')[0]
        alt = html.xpath('/html/body/div[2]/div[2]/h1/a/text()')[0]
        title = html.xpath('/html/body/div[2]/div[2]/h2/text()')[0]
        print(img_url, alt, title)

        path = './漫画/' + alt + '/' + title + '/'
        if not os.path.exists(path):
            os.makedirs(path)
        fname = img_url.split('/')[-1]
        print(os.path.join(path, fname))

        # Fetch the image itself
        response = requests.get(img_url)
        # Raw binary body of the response
        data = response.content
        # Save it to disk
        with open(path + fname, 'wb') as f:
            f.write(data)

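One caveat with manhua(): browser.page_source is read immediately after browser.get(), so on a slow page the JavaScript that fills in the mangaFile image may not have run yet. A minimal sketch of an explicit wait using Selenium's standard WebDriverWait (the 10-second timeout is an arbitrary choice, not from the original):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def rendered_page_source(browser, url, timeout=10):
        # Block until the comic <img> is actually in the DOM, then return the HTML
        browser.get(url)
        WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located((By.ID, 'mangaFile'))
        )
        return browser.page_source
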
    # Work out a chapter's pagination links
    def manhua_url(url):
        response = requests.get(url)
        response.encoding = response.apparent_encoding
        html = response.text
        html = etree.HTML(html)
        # The span text holds the chapter's page count; strip the surrounding characters
        i = html.xpath('/html/body/div[2]/div[2]/span/text()')[1][1:-1]
        i = int(i)
        # The pages follow a fixed pattern, so build each URL with str.format
        url = url + '/index.html?p={}'
        for n in range(1, i + 1):
            fullurl = url.format(n)
            print(fullurl)
            # time.sleep(2)
            # fullurl is one paginated comic page in this chapter
            manhua(fullurl)

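The [1][1:-1] slice assumes the page count is wrapped in exactly one character on each side. A more tolerant sketch that pulls out the first run of digits instead (the '共20页' example text is an assumption about the label format, not taken from the site):

    import re

    def parse_page_count(span_text):
        # e.g. '共20页' -> 20; fall back to 1 if no digits are found
        match = re.search(r'\d+', span_text)
        return int(match.group()) if match else 1
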
    # Parse a comic's chapter-list page
    def list(lb_url):  # note: shadows the built-in list(), though nothing here uses it
        response = requests.get(lb_url)
        response.encoding = response.apparent_encoding
        html = response.text
        html = BeautifulSoup(html, 'lxml')
        # Match every chapter link
        url_list = html.select('div.subBookList ul li')
        for url in url_list:
            url = url.select('a')[0].get('href').split('/')[-2]
            fullurl = os.path.join(lb_url, url)
            print(fullurl)
            # Chapter link
            manhua_url(fullurl)

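os.path.join happens to work for these URLs because the base URLs end in a slash, but it is a filesystem function and can splice with a backslash on Windows. urllib.parse.urljoin from the standard library is the URL-aware way to do the same splice (the comic/123/ path below is just an illustration):

    from urllib.parse import urljoin

    # Resolves a relative path against a base URL with '/' on every OS
    print(urljoin('http://www.omanhua.com/', 'comic/123/'))
    # http://www.omanhua.com/comic/123/
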
    # Parse the home page
    def shouye():
        # Home page URL
        base_url = 'http://www.omanhua.com/'
        # Send the request
        response = requests.get(base_url)
        # Decode with the detected encoding
        response.encoding = response.apparent_encoding
        # The returned HTML
        html = response.text
        # Parse it
        html = BeautifulSoup(html, 'lxml')
        # Match the links in the "hottest comics" block
        url_list = html.select('ul#cartoon_image_show1 li')
        for url in url_list:
            url = url.select('a')[0].get('href')[1:]
            # Splice the full comic URL together
            fullurl = os.path.join(base_url, url)
            print(fullurl)
            list(fullurl)


    if __name__ == '__main__':
        # Selenium drives a real Chrome browser here because the image link is only
        # filled in after the page's JavaScript runs; plain requests can't see it.
        # The argument is the path to the chromedriver binary.
        browser = webdriver.Chrome(executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
        shouye()


I've only recently started teaching myself web scraping, so the code may be a bit clunky. I hope we can all learn and improve together.




