爬虫多个基础实例

Posted 离落想AC

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫多个基础实例相关的知识,希望对你有一定的参考价值。

输入名字爬取百度搜索的网页源码(初级)

# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 17:13
# @Author : 李如旭
# @File :111.py
# @Software: PyCharm

import requests

# Fetch and print the raw HTML of a Baidu search-results page for a keyword
# typed in by the user.
name = input("请输入要搜索人的名称:")

# The keyword goes through `params` so that requests URL-encodes it and the
# user's input (not the literal text "name") is what gets searched.
url = "https://www.baidu.com/s"
query = {"wd": name}

# Browser-like User-Agent header sent with the request.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
}

# The context manager closes the connection even if reading the body fails;
# the timeout keeps the script from hanging forever on a stalled connection.
with requests.get(url=url, params=query, headers=head, timeout=10) as response:
    html = response.text

print(html)

获取百度翻译结果(初级)

# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 19:19
# @Author : 李如旭
# @File :baidufanyi.py
# @Software: PyCharm


import requests

# Look up an English word through Baidu Translate's suggestion endpoint and
# print the decoded JSON reply.
url = "https://fanyi.baidu.com/sug"

word = input("请输入要翻译的英文单词:")

# Form data for the POST request; "kw" carries the word to look up.
dat = {
    "kw": word,
}

# The context manager closes the connection even if JSON decoding raises;
# the timeout keeps the script from hanging forever on a stalled connection.
with requests.post(url, data=dat, timeout=10) as resp:
    print(resp.json())

爬取豆瓣电影排行榜第一页(初级)

# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 19:33
# @Author : 李如旭
# @File :paihang.py
# @Software: PyCharm

import requests

# Request the first page of a Douban movie chart from its JSON endpoint and
# print the decoded result.
url = "https://movie.douban.com/j/chart/top_list"

# Query-string parameters; requests appends them to the URL.
# "start" and "limit" select the slice of the chart that is returned.
param = {
    "type": "24",
    "interval_id": "100:90",
    "action": "",
    "start": "0",
    "limit": "20",
}

# Browser-like User-Agent header sent with the request.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
}

# The context manager closes the connection even if JSON decoding raises;
# the timeout keeps the script from hanging forever on a stalled connection.
with requests.get(url=url, params=param, headers=head, timeout=10) as resp:
    print(resp.json())

爬取豆瓣电影TOP250 名字,年份,评分,评价人数

# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 21:10
# @Author : 李如旭
# @File :豆瓣排行榜.py
# @Software: PyCharm

import re
import requests
import csv

# Scrape the first page of Douban's Top 250 list and write each movie's
# title, year, score and rating count to data.csv.
url = "https://movie.douban.com/top250"

# Browser-like User-Agent header sent with the request.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
}

# Download the page; the context manager releases the connection once the
# body has been read, and the timeout prevents an indefinite hang.
with requests.get(url=url, headers=head, timeout=10) as rsp:
    html = rsp.text

# Extracting only the titles -- three equivalent ways to drive `re`:
#
# Method 1: module-level finditer with a named group
# name = re.finditer(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>',html,re.DOTALL)
# for i in name:
#     print(i.group("name"))
#
# Method 2: pre-compiled pattern + finditer
# obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>',re.S)
# result = obj.finditer(html)
# for it in result :
#     print(it.group("name"))
#
# Method 3: findall with a plain capturing group
# name = re.findall(r'<li>.*?<div class="item">.*?<span class="title">(.*?)</span>',html,re.S)
# print(name)

# Extract title, year, score and rating count in one pass.
# re.S lets "." match newlines, so one pattern can span a whole <li> item.
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
                 r'</span>.*?<p class="">.*?<br>(?P<year>.*?)&nbsp.*?'
                 r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                 r'<span>(?P<number>.*?)</span>', re.S)
result = obj.finditer(html)

# newline="" is required by the csv module (otherwise blank rows appear on
# Windows); an explicit encoding keeps the Chinese titles portable across
# platforms. The context manager closes the file even if a row fails.
with open("data.csv", mode="w", newline="", encoding="utf-8") as f:
    csvwriter = csv.writer(f)
    for it in result:
        dic = it.groupdict()
        # The year is captured together with surrounding whitespace.
        dic['year'] = dic['year'].strip()
        # Column order follows the named groups: name, year, score, number.
        csvwriter.writerow(dic.values())


爬取百度图片:

# -*- coding: UTF-8 -*-
# @Time : 2021/6/2 9:59
# @Author : 李如旭
# @File :百度图片.py
# @Software: PyCharm
import requests
import re
import os
# Download the thumbnail images from one Baidu image-search result page for
# a user-supplied keyword into a user-named folder.
word = input("请输入搜索关键词(可以是人名,地名等): ")

# The keyword goes through `params` so that requests URL-encodes it instead
# of it being concatenated raw into the query string.
url = 'https://image.baidu.com/search/flip'
query = {
    'tn': 'baiduimage',
    'ie': 'utf-8',
    'word': word,
    'pn': '30',
}

# Request headers sent with the search-page request.
head = {
    'Access-Control-Allow-Credentials': 'true',
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
}

# Fetch the result page; the context manager releases the connection once
# the body has been read, and the timeout prevents an indefinite hang.
with requests.get(url=url, params=query, headers=head, timeout=10) as resp:
    resp.encoding = "utf-8"
    html = resp.text

# Every thumbnail address appears in the page as "thumbURL":"<address>".
urls = re.findall(r'"thumbURL":"(.*?)"', html)

file = input('请建立一个存储图片的文件夹,输入文件夹名称即可:')
# exist_ok avoids a crash when the script is re-run with the same folder.
os.makedirs(file, exist_ok=True)

for num, i in enumerate(urls):
    # Read the image bytes, then release the connection before writing.
    with requests.get(i, timeout=7) as pic:
        content = pic.content
    # os.path.join picks the right separator for the current platform.
    string = os.path.join(file, "{}_{}.jpg".format(word, num))
    # The context manager flushes and closes each image file.
    with open(string, 'wb') as fp:
        fp.write(content)

以上是关于爬虫多个基础实例的主要内容,如果未能解决你的问题,请参考以下文章:

爬虫多个基础实例

爬虫多个基础实例

爬虫基础以及一个简单的实例

爬虫3 requests基础

Python 开发简单爬虫 - 基础框架

网络爬虫技术——淘宝数据采集实例