爬虫入门-4-2.爬取豆瓣读书图片
Posted min-r
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫入门-4-2.爬取豆瓣读书图片相关的知识,希望对你有一定的参考价值。
一.利用lxml解析
import os

import requests
from lxml import etree

# NOTE(review): requests selects a proxy by the lower-case URL scheme
# ("http" / "https"), so this upper-case "HTTPS" key is most likely never
# matched and traffic goes out directly. Kept as published; rename the key to
# "https" (and confirm the proxy is alive) to actually route through it.
PROXY = {
    'HTTPS': '116.209.55.208:9999'
}

# Seconds to wait on any single HTTP request before giving up.
TIMEOUT = 10


def spider():
    """Fetch the Douban "latest books" page and download its cover images.

    Raises:
        requests.RequestException: if the listing page cannot be fetched or
            the server answers with an HTTP error status.
    """
    url = 'https://book.douban.com/latest?icn=index-latestbook-all'
    response = requests.get(url, proxies=PROXY, timeout=TIMEOUT)
    # Fail loudly on an error page instead of parsing it and finding nothing.
    response.raise_for_status()
    html = etree.HTML(response.content)
    # xpath() returns a list holding the src attribute of every matching <img>.
    imgs = html.xpath("//div[@class='article']//ul/li/a/img/@src")
    save_pic(imgs)


def save_pic(images):
    """Download every URL in *images* into the ``pi`` directory.

    Files are named ``1.jpg``, ``2.jpg``, ... following the position of the
    URL in *images*. A URL that cannot be downloaded is reported and skipped,
    so its number is simply left unused.

    Args:
        images: iterable of image URLs.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('pi', exist_ok=True)
    for j, img in enumerate(images, start=1):
        # Download before opening the file so that a failed request does not
        # leave an empty or garbage .jpg behind.
        try:
            reply = requests.get(img, timeout=TIMEOUT)
            reply.raise_for_status()
        except requests.RequestException as err:
            print('skipping %s: %s' % (img, err))
            continue
        with open(os.path.join('pi', str(j) + '.jpg'), 'wb') as fd:
            fd.write(reply.content)


if __name__ == '__main__':
    spider()
二.利用BeautifulSoup解析:
import os
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup

# NOTE(review): requests selects a proxy by the lower-case URL scheme
# ("http" / "https"), so this upper-case "HTTPS" key is most likely never
# matched and traffic goes out directly. Kept as published; rename the key to
# "https" (and confirm the proxy is alive) to actually route through it.
PROXY = {
    'HTTPS': '116.209.55.208:9999'
}

# Seconds to wait on any single HTTP request before giving up.
TIMEOUT = 10


def spider():
    """Download the cover images from the Douban "latest books" page.

    Covers are located with ``find_all``: every ``<img>`` nested in an
    ``<a class="cover">``. Each image is saved under ``pic2s/`` using the last
    path component of its URL as the file name. An image that cannot be
    downloaded is reported and skipped.

    Raises:
        requests.RequestException: if the listing page cannot be fetched or
            the server answers with an HTTP error status.
    """
    url = 'https://book.douban.com/latest?icn=index-latestbook-all'
    response = requests.get(url, proxies=PROXY, timeout=TIMEOUT)
    # Fail loudly on an error page instead of parsing it and finding nothing.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # open() does not create missing directories, so make the target first.
    os.makedirs('pic2s', exist_ok=True)

    # Every <a> tag whose class is "cover".
    a_tags = soup.find_all("a", class_='cover')
    for tag in a_tags:
        # Every <img> tag nested under this <a>.
        imgs = tag.find_all("img")
        for img in imgs:
            # get() returns the src attribute as a str, or None if absent.
            pics = img.get('src')
            if not pics:
                continue
            # Use the URL's final path component rather than a fixed-length
            # slice, which only works for one particular file-name length.
            filename = os.path.basename(urlsplit(pics).path)
            if not filename:
                continue
            print(filename)
            # Download before opening the file so that a failed request does
            # not leave an empty or garbage file behind.
            try:
                reply = requests.get(pics, timeout=TIMEOUT)
                reply.raise_for_status()
            except requests.RequestException as err:
                print('skipping %s: %s' % (pics, err))
                continue
            with open(os.path.join('pic2s', filename), 'wb') as fd:
                fd.write(reply.content)


if __name__ == '__main__':
    spider()
import os
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup

# NOTE(review): requests selects a proxy by the lower-case URL scheme
# ("http" / "https"), so this upper-case "HTTPS" key is most likely never
# matched and traffic goes out directly. Kept as published; rename the key to
# "https" (and confirm the proxy is alive) to actually route through it.
PROXY = {
    'HTTPS': '116.209.55.208:9999'
}

# Seconds to wait on any single HTTP request before giving up.
TIMEOUT = 10


def spider():
    """Download the cover images from the Douban "latest books" page.

    Covers are located with the CSS selector ``.cover img``. Each image is
    saved under ``pic3s/`` using the last path component of its URL as the
    file name. An image that cannot be downloaded is reported and skipped.

    Raises:
        requests.RequestException: if the listing page cannot be fetched or
            the server answers with an HTTP error status.
    """
    url = 'https://book.douban.com/latest?icn=index-latestbook-all'
    response = requests.get(url, proxies=PROXY, timeout=TIMEOUT)
    # Fail loudly on an error page instead of parsing it and finding nothing.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # open() does not create missing directories, so make the target first.
    os.makedirs('pic3s', exist_ok=True)

    imgs = soup.select(".cover img")
    for img in imgs:
        # get() returns the src attribute as a str, or None if absent.
        pics = img.get('src')
        if not pics:
            continue
        # Use the URL's final path component rather than a fixed-length
        # slice, which only works for one particular file-name length.
        filename = os.path.basename(urlsplit(pics).path)
        if not filename:
            continue
        print(filename)
        # Download before opening the file so that a failed request does not
        # leave an empty or garbage file behind.
        try:
            reply = requests.get(pics, timeout=TIMEOUT)
            reply.raise_for_status()
        except requests.RequestException as err:
            print('skipping %s: %s' % (pics, err))
            continue
        with open(os.path.join('pic3s', filename), 'wb') as fd:
            fd.write(reply.content)


if __name__ == '__main__':
    spider()
三.爬取豆瓣图书图片
# Download the cover images from the first six result pages of one Douban
# book tag into downloads/1.jpg, downloads/2.jpg, ...
import os

import requests
from lxml import etree

# Tag listing URL; the percent-encoded path segment is the UTF-8 encoding of
# the tag "神经网络" (neural network).
TAG_URL = 'https://book.douban.com/tag/%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C?'

# Step between the "start" offsets of consecutive pages
# (presumably the number of books Douban lists per page; confirm).
PAGE_SIZE = 20

# Seconds to wait on any single HTTP request before giving up.
TIMEOUT = 10

# Create the target directory once, before any page is fetched.
os.makedirs('downloads', exist_ok=True)

# Running image counter across all pages; it becomes the file name.
j = 0
for i in range(0, 6):
    # The parentheses matter: "%" and "*" share the same precedence, so
    # without them the formatted string would be repeated 20 times instead of
    # the offset being multiplied.
    page = requests.get(
        TAG_URL + 'start=%d&type=T' % (i * PAGE_SIZE), timeout=TIMEOUT)
    # Fail loudly on an error page instead of parsing it and finding nothing.
    page.raise_for_status()
    books = etree.HTML(page.content)
    imgs = books.xpath('//*[@id="subject_list"]/ul/li/div[1]/a/img/@src')
    for img in imgs:
        j = j + 1
        # Download before opening the file so that a failed request does not
        # leave an empty or garbage .jpg behind; a failed image is skipped and
        # its number is left unused.
        try:
            reply = requests.get(img, timeout=TIMEOUT)
            reply.raise_for_status()
        except requests.RequestException as err:
            print('skipping %s: %s' % (img, err))
            continue
        with open(os.path.join('downloads', str(j) + '.jpg'), 'wb') as fd:
            fd.write(reply.content)
以上是关于爬虫入门-4-2.爬取豆瓣读书图片的主要内容,如果未能解决你的问题,请参考以下文章