爬虫多个基础实例
Posted 离落想AC
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫多个基础实例相关的知识,希望对你有一定的参考价值。
输入名字爬取百度搜索的网页源码(初级)
# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 17:13
# @Author : 李如旭
# @File :111.py
# @Software: PyCharm
import requests
# Prompt for a keyword, fetch the Baidu search result page for it and print
# the raw HTML.
name = input("请输入要搜索人的名称:")
url = "https://www.baidu.com/s"
# A browser-like User-Agent is sent with the request.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
}
# Pass the keyword through ``params`` so the value the user typed is what is
# actually searched for (and gets URL-encoded), instead of the literal text
# "name" being baked into the query string.
with requests.get(url=url, params={"wd": name}, headers=head) as response:
    # print(response)
    html = response.text
    print(html)
获取百度翻译结果(初级)
# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 19:19
# @Author : 李如旭
# @File :baidufanyi.py
# @Software: PyCharm
import requests
# Prompt for an English word, POST it to Baidu Translate's suggestion endpoint
# and print the decoded JSON reply.
url = "https://fanyi.baidu.com/sug"
word = input("请输入要翻译的英文单词:")
# Form payload: the endpoint receives the word under the "kw" field.
dat = {
    "kw": word,
}
# The ``with`` block closes the connection even if JSON decoding fails.
with requests.post(url, data=dat) as resp:
    print(resp.json())
爬取豆瓣电影排行榜第一页(初级)
# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 19:33
# @Author : 李如旭
# @File :paihang.py
# @Software: PyCharm
import requests
# Request one page of Douban's movie chart JSON endpoint and print the decoded
# result.
url = "https://movie.douban.com/j/chart/top_list"
# Query-string parameters sent with the request; "start" and "limit" are sent
# as "0" and "20".
param = {
    "type": "24",
    "interval_id": "100:90",
    "action": "",
    "start": "0",
    "limit": "20",
}
# A browser-like User-Agent is sent with the request.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
}
# The ``with`` block closes the connection even if JSON decoding fails.
with requests.get(url=url, params=param, headers=head) as resp:
    print(resp.json())
爬取豆瓣电影TOP250的名字、年份、评分、评价人数
# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 21:10
# @Author : 李如旭
# @File :豆瓣排行榜.py
# @Software: PyCharm
import re
import requests
import csv
# Download the Douban Top 250 page at ``url`` and write one CSV row per film
# (title, year, rating, vote-count text) to data.csv.
url = "https://movie.douban.com/top250"
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
}
with requests.get(url=url, headers=head) as rsp:
    html = rsp.text
# print(html)

# Three equivalent ways to extract only the titles, kept for reference.
#
# Method 1: re.finditer with an inline pattern.
# name = re.finditer(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>', html, re.DOTALL)
# for i in name:
#     print(i.group("name"))
#
# Method 2: pre-compiled pattern + finditer.
# obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>', re.S)
# result = obj.finditer(html)
# for it in result:
#     print(it.group("name"))
#
# Method 3: re.findall returning the captured group directly.
# name = re.findall(r'<li>.*?<div class="item">.*?<span class="title">(.*?)</span>', html, re.S)
# print(name)

# Extract title, year, rating and vote count with a single pattern.
# NOTE(review): the year is delimited by the literal "&nbsp" entity here. The
# published listing showed a plain space at that position, which looks like an
# entity-decoding artifact; with a plain space the lazy group would stop inside
# the whitespace that follows <br> and the year would come out empty.
# Confirm against the live page markup.
obj = re.compile(
    r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
    r'</span>.*?<p class="">.*?<br>(?P<year>.*?)&nbsp.*?'
    r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
    r'<span>(?P<number>.*?)</span>',
    re.S,
)
result = obj.finditer(html)

# newline="" is what the csv module requires to avoid blank rows on Windows;
# an explicit UTF-8 encoding keeps non-ASCII titles writable regardless of the
# platform's default code page. ``with`` closes the file even on error.
with open("data.csv", mode="w", newline="", encoding="utf-8") as f:
    csvwriter = csv.writer(f)
    for it in result:
        # print(it.group("name"))
        # print(it.group("year").strip())
        # print(it.group("score"))
        # print(it.group("number"))
        dic = it.groupdict()
        # The year capture carries surrounding whitespace from the markup.
        dic['year'] = dic['year'].strip()
        csvwriter.writerow(dic.values())
爬取百度图片:
# -*- coding: UTF-8 -*-
# @Time : 2021/6/2 9:59
# @Author : 李如旭
# @File :百度图片.py
# @Software: PyCharm
import requests
import re
import os
# Prompt for a keyword, fetch one Baidu image-search result page and save
# every thumbnail it references into a user-named folder.
word = input("请输入搜索关键词(可以是人名,地名等): ")
url = 'https://image.baidu.com/search/flip'
# Same query parameters the original URL carried; passing them via ``params``
# lets requests URL-encode the keyword.
query = {
    'tn': 'baiduimage',
    'ie': 'utf-8',
    'word': word,
    'pn': '30',
}
head = {
    'Access-Control-Allow-Credentials': 'true',
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
}
with requests.get(url=url, params=query, headers=head) as resp:
    resp.encoding = "utf-8"
    html = resp.text
# print(html)

# Thumbnail URLs appear in the page source as "thumbURL":"<url>" pairs.
urls = re.findall('"thumbURL":"(.*?)"', html)

file = input('请建立一个存储图片的文件夹,输入文件夹名称即可:')
# exist_ok avoids crashing when the script is re-run with the same folder name.
os.makedirs(file, exist_ok=True)

for num, i in enumerate(urls):
    # os.path.join builds a path that works on every platform, replacing the
    # hard-coded backslash separator.
    string = os.path.join(file, word + '_' + str(num) + '.jpg')
    # Both the HTTP response and the output file are closed on each iteration.
    with requests.get(i, timeout=7) as pic, open(string, 'wb') as fp:
        fp.write(pic.content)
以上是关于爬虫多个基础实例的主要内容,如果未能解决你的问题,请参考以下文章。