python爬虫.3.下载网页图片
Posted Protogenoi
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python爬虫.3.下载网页图片相关的知识,希望对你有一定的参考价值。
目标:豆瓣读书,
下载页面书籍图片。
import re
import urllib.request

# Regular expressions are used to pull the cover images out of the page HTML.

# Destination of every downloaded cover; "%s" receives the sanitized title.
SAVE_PATH_TEMPLATE = 'C:\\Users\\74172\\source\\repos\\Python\\spidertest1\\images\\book.douban\\%s.jpg'

# Characters that Windows does not accept inside a file name.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def getJpg(date):
    """Find every ``<img src="http...jpg" ... alt="...">`` fragment in *date*.

    Args:
        date: The HTML text of the page.

    Returns:
        A list of 3-tuples ``(src_attr, between, alt_part)`` where
        ``src_attr`` is ``img src="<url>"``, ``between`` is whatever the lazy
        middle group consumed, and ``alt_part`` ends with ``alt="<title>"``.
    """
    # The dot in ".jpg" is escaped so that e.g. "xjpg" is not taken for an
    # image extension.
    return re.findall(
        r'(img src="http.+?\.jpg")([\s\S]*?)(.+?.alt=".+?.")', date)


def _urlAndTitle(jpginfo):
    """Split one tuple produced by getJpg into ``(url, title)``."""
    srcAttr, _between, altPart = jpginfo
    # srcAttr is exactly 'img src="<url>"': drop the prefix and closing quote.
    imageUrl = srcAttr[len('img src="'):-1]
    # altPart was matched by the same sub-pattern, so this always matches.
    title = re.fullmatch(r'.+?.alt="(.+?.)"', altPart).group(1)
    return imageUrl, title


def downLoad(jpgUrl, sTitle, n):
    """Download *jpgUrl* and save it under the book title.

    Args:
        jpgUrl: Absolute URL of the image.
        sTitle: Book title; used as the file name after characters that are
            illegal in Windows file names are replaced by ``_``.
        n: Sequence number of the image, only used in the progress message.
    """
    target = SAVE_PATH_TEMPLATE % _ILLEGAL_FILENAME_CHARS.sub('_', sTitle)
    try:
        urllib.request.urlretrieve(jpgUrl, target)
    except Exception as e:
        # Deliberately best-effort: one failed image must not abort the
        # remaining downloads, so the error is reported and swallowed.
        print(e)
    finally:
        print('图片%s下载操作完成' % n)


def getTitle(date):
    """Return every ``title="?">`` fragment found in *date*.

    NOTE(review): as written the pattern only matches a title that is exactly
    one character long; this looks unintended, but the result was never used
    by the script, so the pattern is left as is -- confirm the intent.
    """
    return re.findall(r'title=".">', date)


if __name__ == '__main__':
    url = 'https://book.douban.com/'
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as res:
        date = res.read().decode('utf-8')

    for n, jpginfo in enumerate(getJpg(date), start=1):
        jpgUrl, sTitle = _urlAndTitle(jpginfo)
        print(n, '--- url -->', jpgUrl)
        downLoad(jpgUrl, sTitle, n)
又做了点修改,并将书名写入txt文件中
import re
import urllib.parse
import urllib.request

# Regular expressions are used to pull the cover images out of the page HTML.

# Page to scrape. Defined at module level because writeTxt derives its default
# output file name from it.
url = 'https://book.douban.com/'

# Destination of every downloaded cover; "%s" receives the sanitized title.
SAVE_PATH_TEMPLATE = 'C:/Users/74172/source/repos/Python/spidertest1/images/book.douban/%s.jpg'

# One <img src="http...jpg" ... alt="..."> fragment. "url" is the bare image
# URL, "title" is the alt value including its surrounding double quotes.
_IMG_PATTERN = re.compile(
    r'img src="(?P<url>http.+?\.jpg)"[\s\S]*?.+?.alt=(?P<title>".+?.")')

# Characters that Windows does not accept inside a file name.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def getJpg(html):
    """Return the image URLs of all ``<img ... alt="...">`` tags in *html*.

    The list is index-aligned with the one returned by getTitle because both
    are taken from the same matches.
    """
    return [m.group('url') for m in _IMG_PATTERN.finditer(html)]


def downLoad(jpgUrl, sTitle, n):
    """Download *jpgUrl* and save it under the book title.

    Args:
        jpgUrl: Absolute URL of the image.
        sTitle: Book title; used as the file name after characters that are
            illegal in Windows file names are replaced by ``_``.
        n: Sequence number of the image (kept for interface compatibility).

    Any exception raised by the download propagates to the caller; the
    progress message is printed either way.
    """
    target = SAVE_PATH_TEMPLATE % _ILLEGAL_FILENAME_CHARS.sub('_', sTitle)
    try:
        urllib.request.urlretrieve(jpgUrl, target)
    finally:
        print('图片---%s----下载操作完成' % sTitle)


def getTitle(html):
    """Return the alt texts of all ``<img ... alt="...">`` tags in *html*.

    Each entry still carries its surrounding double quotes (``'"Title"'``),
    so callers strip them with ``[1:-1]``.
    """
    return [m.group('title') for m in _IMG_PATTERN.finditer(html)]


def _defaultTxtPath():
    """Build the default title-list file name from the host of ``url``."""
    host = urllib.parse.urlsplit(url).hostname or ''
    # Drop the top-level domain: "book.douban.com" -> "book.douban".
    stem = host.rpartition('.')[0] or host
    return stem + '.txt'


def writeTxt(imageTitle, txtPath=None):
    """Append *imageTitle* as one line to a UTF-8 text file.

    Args:
        imageTitle: The title to record.
        txtPath: File to append to. Defaults to a name derived from the
            module-level ``url`` ("book.douban.txt" for the default URL).
    """
    if txtPath is None:
        txtPath = _defaultTxtPath()
    # "with" closes the file even when the write fails, and does not touch an
    # unbound handle when the open itself fails.
    with open(txtPath, 'a', encoding='utf-8') as f:
        f.write(imageTitle + '\n')


if __name__ == '__main__':
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as res:
        html = res.read().decode('utf-8')

    urlJpgs = getJpg(html)
    imageTitle = getTitle(html)
    for n, (urlJpg, quotedTitle) in enumerate(zip(urlJpgs, imageTitle)):
        title = quotedTitle[1:-1]  # strip the surrounding double quotes
        print(n, '--- url -->', urlJpg)
        downLoad(urlJpg, title, n)
        writeTxt(title)
以上是关于python爬虫.3.下载网页图片的主要内容,如果未能解决你的问题,请参考以下文章
python爬虫 将在线html网页中的图片链接替换成本地链接并将html文件下载到本地