Basic Usage of the requests Library | Web Scraping
This post walks through the everyday requests patterns a crawler needs: proxies, cookies and sessions, and a few small tips.
## 1. Proxies

To route a request through a proxy, build the URL and request headers as usual, add a `proxies` dict, and pass all three to `requests.get`:

```python
import requests

# Build the target URL
url = 'http://www.baidu.com'

# Build the request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}

# Build the proxies dict; free proxies can be found online
proxies = {
    "http": "http://61.135.217.7:80",
    "https": "https://61.135.217.7:80",
}

# Authenticated ("private") proxies: ordinary public proxies have
# usually already been blocked by the large sites
# proxies = {
#     "http": "http://user:password@IP:PORT",
#     "https": "http://user:password@IP:PORT",
# }

# Send the request through the proxy
response = requests.get(url, headers=headers, proxies=proxies)
```

How do you verify that a proxy works? If the request goes through without raising an error, the proxy is usable; filter out the slow ones and re-validate the rest regularly, as in the sketch below.
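A minimal sketch of such a check, assuming a hypothetical list of candidate proxies; the probe URL and the 3-second timeout are illustrative choices, not part of the original:

```python
import requests

def check_proxy(proxy, timeout=3):
    """Return True if the proxy answers a probe request in time."""
    proxies = {"http": proxy, "https": proxy}
    try:
        response = requests.get('http://www.baidu.com',
                                proxies=proxies, timeout=timeout)
        return response.status_code == 200
    except requests.exceptions.RequestException:
        # Covers timeouts, refused connections, dead proxies, etc.
        return False

# Hypothetical candidates gathered from a free proxy list
candidates = ['http://61.135.217.7:80', 'http://10.10.1.10:8080']
usable = [p for p in candidates if check_proxy(p)]
print(usable)
```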
## 2. Cookies and Sessions

A cookie is either persisted on disk or kept temporarily in the browser cache. Cookies are not secure: because they are stored locally, they can be read and analyzed by a third party and used for spoofing. A session lives on the server side instead, in memory, in files, or in a database; sessions carry an expiry time, and keeping many of them alive consumes server resources. The session id itself travels in a cookie, so if cookies are disabled the site has to fall back on URL rewriting. Cookie data is also size-limited, to roughly 4 KB.

Why send cookies and keep a session at all?

- Benefit: you can request pages that sit behind a login.
- Drawback: one cookie/session set corresponds to one user, and sending too many requests too quickly will get you identified as a crawler; a cookie pool and an account pool help spread the load.

So avoid cookies whenever you do not need them, but to fetch logged-in pages you must send requests that carry them.

### Handling cookies

Page to access: http://www.renren.com/910033035

**Method 1: put the cookie string in the request header.** The `Cookie` header is copied straight from the browser.

```python
import re
import requests

# Build the URL
url = 'http://www.renren.com/910033035'

# Build the request headers, including the raw Cookie string
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    'Cookie': "anonymid=ja66ma6ma1ay1i; depovince=GW; _r01_=1; jebe_key=4f2064ba-bdf0-4120-a73b-40054296547e%7C849ce3a2a3b19cb6be746727b6746f3b%7C1511060907946%7C1%7C1511060908130; JSESSIONID=abcXkOHShmoGs_4isqs-v; __utmt=1; ick=bf8207d8-aadc-4e53-bb04-1d11c600b917; __utma=151146938.2109560930.1511061038.1511061038.1511061038.1; __utmb=151146938.4.10.1511061038; __utmc=151146938; __utmz=151146938.1511061038.1.1.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/; jebecookies=1c38aa9d-1d50-4e9e-bb25-887c2fb6bc4f|||||; ick_login=a7d3eed9-9d2f-420b-b773-80eac19fcbd4; _de=CA265D35DCCFFBBB070BF98752FC884D; p=e7d112ba7e8cf29163d032a0ed0523ab5; first_login_flag=1; ln_uact=18868271201; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=7b2cd7c3519060139fd32d514cbd82955; societyguester=7b2cd7c3519060139fd32d514cbd82955; id=910033035; xnsid=2dbb196a; ch_id=10016; ver=7.0; loginfrom=null; wp_fold=0"
}

# Send the request and get the response
response = requests.get(url, headers=headers)

# Verify the login worked; three options:
# 1. check the URL the response ended up at
# 2. save the page to a file and inspect it
# 3. use a regex to look for a string only shown when logged in
print(re.findall(r'新用户oF0z', response.content.decode()))

with open('renren.html', 'w') as f:
    f.write(response.content.decode())
```

**Method 2: pass the cookies as a parameter.** `requests.get` accepts a `cookies` argument as a plain dict, so split the raw cookie string into name/value pairs first.

```python
import re
import requests

url = 'http://www.renren.com/910033035'

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
}

# The raw cookie string copied from the browser
temp = "anonymid=ja66ma6ma1ay1i; depovince=GW; _r01_=1; jebe_key=4f2064ba-bdf0-4120-a73b-40054296547e%7C849ce3a2a3b19cb6be746727b6746f3b%7C1511060907946%7C1%7C1511060908130; JSESSIONID=abcXkOHShmoGs_4isqs-v; __utmt=1; ick=bf8207d8-aadc-4e53-bb04-1d11c600b917; __utma=151146938.2109560930.1511061038.1511061038.1511061038.1; __utmb=151146938.4.10.1511061038; __utmc=151146938; __utmz=151146938.1511061038.1.1.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/; jebecookies=1c38aa9d-1d50-4e9e-bb25-887c2fb6bc4f|||||; ick_login=a7d3eed9-9d2f-420b-b773-80eac19fcbd4; _de=CA265D35DCCFFBBB070BF98752FC884D; p=e7d112ba7e8cf29163d032a0ed0523ab5; first_login_flag=1; ln_uact=18868271201; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=7b2cd7c3519060139fd32d514cbd82955; societyguester=7b2cd7c3519060139fd32d514cbd82955; id=910033035; xnsid=2dbb196a; ch_id=10016; ver=7.0; loginfrom=null; wp_fold=0"

# Split the string into a dict; split on the first '=' only,
# because cookie values (e.g. __utmz) may themselves contain '='
cookies = dict()
for i in temp.split('; '):
    key, value = i.split('=', 1)
    cookies[key] = value
print(cookies)

# Send the request, passing the cookies separately
response = requests.get(url, headers=headers, cookies=cookies)

# Verify as before
print(re.findall(r'新用户oF0z', response.content.decode()))
```

### Sessions

requests provides a `Session` class that keeps the conversation between client and server alive:

1. instantiate a session object;
2. send GET or POST requests through it: `session.get(url)`, `session.post(url, data=data)`.

```python
import requests

# The URL the login form posts to
url = 'http://www.renren.com/PLogin.do'

# Build the request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
}

# Build the login form data
post_data = {
    'email': '18868271201',
    'password': 'laaimeng2011'
}

# Create a session object and log in through it
session = requests.session()
response = session.post(url, headers=headers, data=post_data)
print(response.url)

# Verify: the session keeps the login state, so other pages can be
# fetched without submitting the form data again
response1 = session.get('http://www.renren.com/910033035')
print(response1.url)
```

## 3. Tips

**Cookie utilities.** `response.cookies` is a cookieJar object; `requests.utils` can convert it to and from a plain dict:

```python
import requests

response = requests.get('http://www.baidu.com')

# The cookies the server set on the response; a cookieJar object
cook = response.cookies
print(cook)
print(type(cook))

# Convert the jar into a dict
dict_cook = requests.utils.dict_from_cookiejar(cook)
print(dict_cook)
print(type(dict_cook))

# And convert it back
jar = requests.utils.cookiejar_from_dict(dict_cook)
print(jar)
print(type(jar))
```

**Disabling SSL verification.** Some sites serve certificates that fail verification; pass `verify=False` to skip the check. requests will emit a warning, but the page source is still readable:

```python
import requests

response = requests.get('https://www.12306.cn/mormhwed/', verify=False)
print(response.content.decode())
```

**Timeouts.** An unreachable host can leave a request hanging for around three minutes by default, so set a short timeout; `timeout=3` is also a handy way to validate proxies. This matters especially for multithreaded crawlers, where long timeout delays kill throughput:

```python
import requests

url = 'http://www.youtube.com'

# Give up after three seconds instead of hanging for minutes
response = requests.get(url, timeout=3)
```

When the timeout fires, no response comes back at all; see the sketch below for handling that.
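On timeout, requests raises `requests.exceptions.Timeout` rather than returning a response. A minimal sketch of catching it; the retry count of three is an illustrative assumption, not something the original specifies:

```python
import requests

url = 'http://www.youtube.com'

# Retry a few times, treating each timeout as a recoverable failure
for attempt in range(3):
    try:
        response = requests.get(url, timeout=3)
        print(response.status_code)
        break
    except requests.exceptions.Timeout:
        print(f'attempt {attempt + 1} timed out, retrying...')
else:
    print('all attempts timed out')
```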