爬虫案例
- 爬取汽车之家,指定页面的图片url
1. 爬取汽车之家指定页面的图片 URL,并将图片下载保存到本地
import sys
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
# Article page whose centered images are downloaded.
PAGE_URL = 'https://www.autohome.com.cn/news/201801/912472.html#pvareaid=102624'

# Headers sent with the page request only. They are deliberately not reused
# for the image downloads: 'Host' names the article server, and image URLs
# may point at a different host.
PAGE_HEADERS = {
    'Host': 'www.autohome.com.cn',
    'Referer': "https://www.autohome.com.cn/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
}

# Seconds to wait for a server before giving up. Without an explicit timeout
# a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 10


def build_image_url(page_url, src):
    """Return the absolute URL of an image referenced from ``page_url``.

    ``src`` is the raw ``src`` attribute of an ``<img>`` tag. It may be
    protocol-relative (``//host/a.jpg``), already absolute, or relative to
    the page; all three forms are resolved against ``page_url``.

    Returns ``None`` when ``src`` is missing or blank, so callers can skip
    the tag instead of failing on string concatenation.
    """
    if not src or not src.strip():
        return None
    return urljoin(page_url, src.strip())


def image_file_name(img_url):
    """Return the last path segment of ``img_url`` for use as a file name.

    Only the path component is considered, so a query string or fragment
    never ends up in the file name. The result is an empty string when the
    URL path ends with ``/`` (there is no usable name).
    """
    return urlsplit(img_url).path.rsplit('/', maxsplit=1)[-1]


def main():
    """Download every centered image of the article into the current directory.

    Fetches ``PAGE_URL``, looks inside ``<div id="articleContent">`` for
    ``<p align="center">`` paragraphs, prints the URL of the image found in
    each one and writes the image bytes to a file named after the last path
    segment of that URL.

    Raises:
        requests.HTTPError: if the article page answers with an error status.
        SystemExit: if the page has no ``articleContent`` container.
    """
    # Fetch the page data.
    r1 = requests.get(url=PAGE_URL, headers=PAGE_HEADERS, timeout=REQUEST_TIMEOUT)
    r1.raise_for_status()
    soup = BeautifulSoup(r1.text, "lxml")

    # Locate the article body; without it there is nothing to scan.
    id_articleContent = soup.find(name="div", id="articleContent")
    if id_articleContent is None:
        raise SystemExit('no <div id="articleContent"> found in ' + PAGE_URL)

    # All <p align="center"> paragraphs below the article body.
    pp = id_articleContent.find_all(name="p", attrs={"align": "center"})
    for paragraph in pp:
        img = paragraph.find(name="img")
        # Skip paragraphs that contain no <img> tag.
        if img is None:
            continue

        # Resolve against the final page URL (after any redirect).
        img_url = build_image_url(r1.url, img.get("src"))
        if img_url is None:
            continue
        print(img_url)

        # File name is taken from the URL path.
        file_name = image_file_name(img_url)
        if not file_name:
            print("skipped (no file name in URL):", img_url, file=sys.stderr)
            continue

        # Fetch the image bytes; one failed image must not abort the rest,
        # and an error page must not be saved as if it were an image.
        try:
            img_response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
            img_response.raise_for_status()
        except requests.RequestException as exc:
            print("skipped (download failed):", img_url, exc, file=sys.stderr)
            continue

        with open(file_name, 'wb') as f:
            # Write the image bytes to disk.
            f.write(img_response.content)


if __name__ == "__main__":
    main()