# ---- snippet 1: download all answer images from a Zhihu question page ----
# -*- coding:utf-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import re
import time
# Target Zhihu question page to scrape answer images from.
url = "https://www.zhihu.com/question/22918070"
# Fetch the page; Zhihu serves UTF-8 encoded HTML.
html = request.urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')
# Use Beautiful Soup plus a regex to select the full-size image links:
# <img> tags carrying the lightbox-thumbnail class whose src ends in .jpg.
# (The dot must be escaped, otherwise "xjpg" would also match.)
links = soup.find_all('img', "origin_image zh-lightbox-thumb",
                      src=re.compile(r'\.jpg$'))
print(links)
# Directory to save images to (must already exist), otherwise files would
# land in the program's current working directory.  The r-prefix keeps the
# string raw (no escape processing).
path = r'/home/kong/PycharmProjects/untitled2/image/'
for link in links:
    print(link.attrs['src'])
    # Name each file with the current timestamp to avoid name collisions;
    # request.urlretrieve downloads the remote file straight to disk.
    # NOTE: the original concatenated '\%s.jpg', which injected a literal
    # backslash into a POSIX path; path already ends with '/'.
    request.urlretrieve(link.attrs['src'], path + '%s.jpg' % time.time())
# ---- snippet 2: download all images from a wallpaper page with a progress hook ----
import csv
import requests
import re
import urllib
from collections import namedtuple
from lxml import etree
from bs4 import BeautifulSoup
def schedule(blocknum, blocksize, totalsize):
    """Report-hook for urlretrieve: print the download progress.

    Args:
        blocknum: number of blocks transferred so far.
        blocksize: size of one block in bytes.
        totalsize: total file size in bytes; urlretrieve passes a value
            <= 0 when the server sends no Content-Length header.

    Returns:
        The percentage that was printed, as a float in [0, 100].
    """
    if totalsize <= 0:
        # Unknown total size — avoid ZeroDivisionError; report 0%.
        per = 0.0
    else:
        # Cap at 100: the last block usually overshoots the total.
        per = min(100.0, 100.0 * blocknum * blocksize / totalsize)
    # Python 3: print is a function, not a statement.
    print("current download schedule: %d" % per)
    return per
# Spoof a desktop browser User-Agent: some image hosts reject the default
# python-requests agent string.
user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0'
headers = {'User-Agent': user_agent}
r = requests.get('http://www.win4000.com/wallpaper_detail_118605.html', headers=headers)
# r = requests.get('http://www.ivsky.com/tupian/ziranfengguang/', headers=headers)
html = etree.HTML(r.text)
# u = html.xpath('.//*[@class="imgitem"]')
# Collect the src attribute of every <img> element on the page.
img_urls = html.xpath('.//img/@src')
for i, img_url in enumerate(img_urls):
    # urllib.urlretrieve is the Python 2 location and raises AttributeError
    # on Python 3; use urllib.request (imported at the top of the file),
    # passing schedule as the progress report-hook.
    request.urlretrieve(img_url, 'img' + str(i) + '.jpg', schedule)