Crawling images from a 4K wallpaper site (pic.netbian.com)
Posted by rstz
Note: change the save path (the pictures variable in run()) to a directory on your machine before running; a more portable variant is sketched after the script.
import os
import requests
from lxml import etree
from urllib.request import urlopen, Request
import time

class BiAnImage():
    def __init__(self):
        self.base_url = "http://pic.netbian.com"
        self.header = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}

    def get_html(self, url):
        # headers must be passed as a keyword argument; passing the dict
        # positionally would make requests treat it as query params
        response = requests.get(url, headers=self.header)
        if response.status_code == 200:
            response.encoding = response.apparent_encoding
            return response.text
        return None

    def get_url_1_list(self, html_1):
        # Collect every category link and title from the home page
        url_1_items = []
        title_1_items = []
        x_html = etree.HTML(html_1)
        url_list = x_html.xpath('//div[@id="main"]/div[2]/a/@href')
        title_list = x_html.xpath('//div[@id="main"]/div[2]/a/text()')
        for url, title in zip(url_list, title_list):
            url_1_items.append(self.base_url + url)
            title_1_items.append(title)
        return title_1_items, url_1_items

    def get_url_2_list(self, html_2):
        # Collect detail-page links and titles from one category page
        url_2_items = []
        title_2_items = []
        x_html = etree.HTML(html_2)
        url_list = x_html.xpath('//ul[@class="clearfix"]/li/a/@href')
        title_list = x_html.xpath('//ul[@class="clearfix"]/li/a/b/text()')
        # Locate the "下一页" (next page) link, then take the element right
        # before it: that is the last page number of the category
        last_page = x_html.xpath('//a[text()="下一页"]/preceding-sibling::a[1]/text()')
        for url, title in zip(url_list, title_list):
            url_2_items.append(self.base_url + url)
            title_2_items.append(title)
        return url_2_items, title_2_items, last_page

    def get_image_url(self, image_html):
        # Extract the full-size image URL from a detail page
        x_image_html = etree.HTML(image_html)
        image_url = x_image_html.xpath('//a[@id="img"]/img/@src')
        return self.base_url + image_url[0]

    def save_image(self, save_path, image_name, image_url):
        req = Request(url=image_url, headers=self.header)
        content = urlopen(req).read()
        # Strip spaces from the title and reuse the URL's file extension
        img_name = image_name.replace(' ', '') + image_url[-4:]
        with open(save_path + img_name, 'wb') as f:
            f.write(content)
        print(img_name, "download finished...")

    def run(self):
        # Fetch every category title and link
        html = self.get_html(self.base_url)
        title_1_items, url_1_items = self.get_url_1_list(html)
        for title_1, url_1 in zip(title_1_items, url_1_items):
            if title_1 == "4K动漫":
                # if title_1 == "4K风景": TODO: change this check to download another category
                html_2 = self.get_html(url_1)
                url_2_items, title_2_items, last_page = self.get_url_2_list(html_2)

                # Use last_page from the category page to build every page link
                # in this category (+ 1 so range() does not skip the last page)
                for page in range(1, int(last_page[0]) + 1):
                    if page == 1:
                        more_url_1 = url_1  # more_url_1 is the link of one page within the category
                    else:
                        more_url_1 = url_1 + "index_{}.html".format(page)
                    detail_html = self.get_html(more_url_1)
                    url_2_items, title_2_items, last_page = self.get_url_2_list(detail_html)

                    # Walk every image link on the current page
                    for url_2, title_2 in zip(url_2_items, title_2_items):
                        # print(title_1, url_1, last_page[0], more_url_1, title_2, url_2)
                        pictures = "C:/Users/25766/AppData/Local/Programs/Python/Python38/imgs/"

                        time.sleep(2)
                        # Group downloads by category; create the folder if it does not exist yet
                        if os.path.exists(pictures + title_1) is False:
                            os.makedirs(pictures + title_1)
                        save_path = pictures + title_1 + "/"
                        image_html = self.get_html(url_2)
                        img_url = self.get_image_url(image_html)
                        self.save_image(save_path, title_2, img_url)
                        # print(save_path)
                    # end of the loop over image links on one page
                # end of the loop over pages in one category
        # end of the loop over categories

bian = BiAnImage()
bian.run()
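If you would rather not edit the hard-coded Windows path by hand, the download step can be parameterized. Below is a minimal sketch, assuming the same header dict as above; the download_image helper and its save_dir parameter are names invented here for illustration, not part of the original script. It also swaps urlopen for requests and lets os.makedirs create the folder in one call:

import os
import requests

def download_image(image_url, save_dir, image_name, headers, timeout=10):
    # Create the target folder (and any parents) if needed; no explicit exists() check required
    os.makedirs(save_dir, exist_ok=True)
    response = requests.get(image_url, headers=headers, timeout=timeout)
    response.raise_for_status()  # fail loudly on 4xx/5xx instead of writing an error page to disk
    # Keep the original naming scheme: strip spaces, reuse the URL's 4-character extension
    file_name = image_name.replace(" ", "") + image_url[-4:]
    file_path = os.path.join(save_dir, file_name)
    with open(file_path, "wb") as f:
        f.write(response.content)
    return file_path

Inside run(), the last few lines of the inner loop would then collapse to download_image(img_url, os.path.join("imgs", title_1), title_2, self.header), with "imgs" (or any folder you choose) replacing the hard-coded path.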