A First Look at a Douban Scraper
Posted by 河南骏
The full script below first harvests free high-anonymity proxy IPs from xicidaili, checks them, builds a proxy pool, and then uses random proxies to scrape Douban movie listings for a given tag (title, rating, number of raters, release date, country, actors) into a CSV file.

```python
# coding: utf-8
# Keys to avoid getting blocked when writing a scraper:
# 1. Spoof the request headers
# 2. Throttle the request rate
# 3. Use proxy IPs
# Below we harvest proxy IPs from the xicidaili high-anonymity proxy list.
import os
import time
import requests
from bs4 import BeautifulSoup
import csv
import random


# Fetch proxy data from the first `num` pages of the xicidaili proxy list
def fetch_proxy(num):
    # switch to the working directory
    os.chdir(r'c:\proxy_for_spider')
    api = 'http://www.xicidaili.com/nn/{}'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1;Win64;x64;rv:59.0) Gecko/20100101 Firefox/59.0'}
    fp = open('host.txt', 'a+', encoding='utf-8')
    for i in range(1, num + 1):
        respones = requests.get(url=api.format(i), headers=header)
        soup = BeautifulSoup(respones.text, 'lxml')
        container = soup.find_all(name='tr', attrs={'class': 'odd'})
        for tag in container:
            try:
                con_soup = BeautifulSoup(str(tag), 'lxml')
                td_list = con_soup.find_all('td')
                ip = str(td_list[1])[4:-5]
                port = str(td_list[2])[4:-5]
                fp.write(ip + '\t' + port + '\n')
            except Exception:
                print('No IP!')
        time.sleep(1)
    fp.close()


# Test each proxy against Baidu (a big site that tolerates short bursts of requests);
# status 200 means the proxy is usable.
def test_proxy():
    N = 1
    os.chdir(r'c:\proxy_for_spider')
    url = 'https://www.baidu.com'
    fp = open('host.txt', 'r')
    ips = fp.readlines()
    fp.close()
    proxys = list()
    for p in ips:
        ip = p.strip('\n').split('\t')
        proxy = 'http://' + ip[0] + ':' + ip[1]
        # requests expects the URL scheme as the proxies dict key
        proxys.append({'http': proxy})
    for pro in proxys:
        try:
            s = requests.get(url, proxies=pro)
            print('Proxy #{} {}: status {}'.format(N, pro, s.status_code))
        except Exception as e:
            print(e)
        N += 1


# Build a proxy pool with at least `num` entries
def proxypool(num):
    n = 1
    os.chdir(r'c:\proxy_for_spider')
    fp = open('host.txt', 'r')
    ips = fp.readlines()
    fp.close()
    proxys = list()
    while n < num:
        for p in ips:
            ip = p.strip('\n').split('\t')
            proxy = 'http://' + ip[0] + ':' + ip[1]
            proxys.append({'http': proxy})
            n += 1
    return proxys


# Scrape Douban movies for a given tag: title, rating, raters, date, country, actors
def fetch_movies(tag, pages, proxys):
    os.chdir(r'c:\proxy_for_spider\douban')
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1;Win64;x64;rv:59.0) Gecko/20100101 Firefox/59.0'}
    # keep the results in a CSV file
    csvFile = open(r'{}_all.csv'.format(tag), 'a+', newline='', encoding='utf-8')
    writer = csv.writer(csvFile)
    writer.writerow(('name', 'score', 'peoples', 'date', 'nation', 'actor'))
    for page in range(0, pages * 20 + 1, 20):
        url = 'https://movie.douban.com/tag/{}?start={}&type=T'.format(tag, page)
        try:
            respones = requests.get(url, headers=header, proxies=random.choice(proxys))
            if respones.status_code != 200:
                # retry once with another random proxy
                respones = requests.get(url, headers=header, proxies=random.choice(proxys))
            soup = BeautifulSoup(respones.text, 'lxml')
            movies = soup.find_all(name='div', attrs={'class': 'pl2'})  # pl2: the letter l, not the digit 1
            for movie in movies:
                movie = BeautifulSoup(str(movie), 'lxml')
                # movie title
                movname = movie.find(name='a')
                movname = movname.contents[0].strip().strip('/').split('\n')[0].strip('\n')
                movInfo = movie.find(name='p').contents[0].split('/')
                # release date
                date = movInfo[0][0:10]
                # country
                nation = movInfo[0][11:-2]
                # actors
                actor_list = [act.strip(' ').replace('...', '') for act in movInfo[1:-1]]
                actors = '\t'.join(actor_list)
                # rating
                score = movie.find('span', {'class': 'rating_nums'}).string
                # number of raters
                peopleNum = movie.find('span', {'class': 'pl'}).string[1:-3]
                writer.writerow((movname, score, peopleNum, date, nation, actors))
        except:
            continue
        print('{} pages in total, {} pages scraped'.format(pages, int(page / 20)))
    csvFile.close()


if __name__ == "__main__":
    # fetch_proxy(10)
    # test_proxy()
    start = time.time()
    proxyPool = proxypool(50)
    # print(proxyPool)
    fetch_movies('烂片', 100, proxyPool)
    end = time.time()
    lastT = int(end - start)
    print('Elapsed: {}s'.format(lastT))
```
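The `test_proxy` helper above only prints each proxy's status; nothing in the script actually drops dead proxies before they reach the pool. Below is a minimal sketch of how one might keep only responsive proxies. It is not part of the original post: `working_proxies` is a hypothetical name, and it simply assumes the same `ip<TAB>port` per-line format that `fetch_proxy()` writes to `host.txt`.

```python
import requests


def working_proxies(path='host.txt', timeout=5):
    """Hypothetical helper (not in the original script): return only the
    proxies from host.txt that answer within `timeout` seconds."""
    good = []
    with open(path, 'r', encoding='utf-8') as fp:
        for line in fp:
            if not line.strip():
                continue
            ip, port = line.strip('\n').split('\t')
            proxy = {'http': 'http://{}:{}'.format(ip, port)}
            try:
                # Baidu tolerates short probes; status 200 means the proxy responded
                r = requests.get('https://www.baidu.com', proxies=proxy, timeout=timeout)
                if r.status_code == 200:
                    good.append(proxy)
            except requests.RequestException:
                continue
    return good
```

With a helper like this, `proxyPool = working_proxies()` could replace `proxypool(50)` in `__main__`, so `fetch_movies` only ever draws from proxies that were alive at startup.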
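Point 2 of the opening checklist (throttling) is only applied to the proxy-list site via `time.sleep(1)`; the Douban loop itself fires as fast as it can. As a rough sketch, the three anti-blocking points could be combined in one small fetch wrapper. `polite_get` is a name made up here, not something from the original article, and the delay and retry values are illustrative assumptions.

```python
import random
import time
import requests


def polite_get(url, headers, proxys, retries=3, min_delay=1.0, max_delay=3.0):
    """Hypothetical wrapper (not in the original post): fetch `url` through a
    random proxy, sleeping a random interval before each attempt."""
    for _ in range(retries):
        time.sleep(random.uniform(min_delay, max_delay))  # throttle the request rate
        try:
            resp = requests.get(url, headers=headers,
                                proxies=random.choice(proxys), timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            continue  # this proxy failed; try another random one
    return None
```

Inside `fetch_movies()` one could then call `polite_get(url, header, proxys)` instead of the bare `requests.get`, and skip the page when it returns `None`.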