python 使用selenium和requests爬取页面数据
Posted singleSpace
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 使用selenium和requests爬取页面数据相关的知识,希望对你有一定的参考价值。
目的:获取某网站某用户下时长大于1000秒的视频信息
1.本想通过接口获得结果,但是使用post发送信息到接口,提示服务端错误。
2.通过requests获取页面结果,使用html解析工具,发现麻烦而且得不到想要的结果
3.直接通过selenium获取控件的属性信息,如图片、视频地址,再对时间进行筛选。将信息保存到以id命名的文件夹下
# -*- coding:utf-8 -*-
from selenium import webdriver
import sys,os,requests,shutil
class GetUserVideo(object):
    """Collect the long videos listed on one user's paginated video page.

    For every listing page, entries whose displayed duration exceeds a
    threshold are recorded in ``<id>/video.txt`` (thumbnail file name,
    duration text and video URL, tab-separated, one entry per line) and
    their thumbnails are downloaded into the ``<id>`` directory.
    """

    def __init__(self, driver, id):
        """Store the driver and build the listing URL for the given user id.

        Args:
            driver: a Selenium WebDriver instance used for all page loads.
            id: user id; converted to ``str`` and also used as the name of
                the output directory.
        """
        self.id = str(id)
        self.driver = driver
        self.base_url = "http://www.xxxxx.com/user/%s?t=2" % (self.id)

    def get_pagecounts(self):
        """Load the first listing page and return the exclusive page bound.

        Returns:
            int: the integer text of the second-to-last pager link plus
            one, suitable as the ``stop`` argument of ``range(1, stop)``.
            Returns 2 (exactly one page) when the pager has fewer than two
            links.
        """
        self.driver.get(self.base_url)
        page_links = self.driver.find_elements_by_xpath("//div[@class='page']/a")
        if len(page_links) < 2:
            # NOTE(review): assumes a missing/short pager means a single
            # page of results -- confirm against the real site markup.
            return 2
        # Presumably the last link is "next" and the one before it carries
        # the highest page number -- verify against the site.
        return int(page_links[-2].text) + 1

    @staticmethod
    def _duration_seconds(text):
        """Convert a colon-separated duration ("H:M:S", "M:S" or "S") to seconds.

        Raises:
            ValueError: if any component is not an integer.
        """
        seconds = 0
        for part in text.strip().split(":"):
            seconds = seconds * 60 + int(part)
        return seconds

    def get_video(self, driver, page, f, min_seconds=1000):
        """Record and download every entry on the current page that is long enough.

        Args:
            driver: WebDriver that already has the listing page loaded.
            page: page number, used as the prefix of saved thumbnail names.
            f: writable text file receiving one tab-separated line per entry.
            min_seconds: entries must be strictly longer than this many
                seconds to be kept (default 1000).
        """
        video_times = driver.find_elements_by_xpath("//i[@class='continue_time']")
        video_urls = driver.find_elements_by_xpath("//div[@class='video']/a[@class='url']")
        video_imgs = driver.find_elements_by_xpath("//a[@class='url']/img")
        # zip() stops at the shortest list, so a page where the three
        # queries return different counts cannot raise IndexError.
        entries = zip(video_times, video_urls, video_imgs)
        for i, (time_el, url_el, img_el) in enumerate(entries):
            video_time = time_el.text
            try:
                time_count = self._duration_seconds(video_time)
            except ValueError:
                # Duration text is empty or malformed; skip this entry
                # rather than abort the whole crawl.
                continue
            if time_count <= min_seconds:
                continue
            video_url = url_el.get_attribute('href')
            video_img = img_el.get_attribute("src")
            img_name = str(page) + "_" + str(i) + "_" + os.path.basename(video_img)
            f.write("\t".join((img_name, video_time, video_url)) + "\n")
            # A timeout keeps one stalled image download from hanging the run.
            response = requests.get(video_img, timeout=30)
            with open(os.path.join(self.id, img_name), "wb") as b:
                b.write(response.content)

    def test(self):
        """Run the full crawl for this user and then quit the driver.

        Any existing directory named after the user id is deleted and
        recreated before results are written. The driver is quit even if
        the crawl fails part-way.
        """
        if os.path.exists(self.id):
            shutil.rmtree(self.id)
        os.mkdir(self.id)
        driver = self.driver
        try:
            page_counts = self.get_pagecounts()
            with open(os.path.join(self.id, "video.txt"), "w") as f:
                for page in range(1, page_counts):
                    detail_url = "&page=%s" % page
                    driver.get(self.base_url + detail_url)
                    self.get_video(driver, page, f)
        finally:
            driver.quit()
if __name__=="__main__":
path = sys.path[0].split("/")
index = path.index("SeleniumOfJenkins") + 1
ph_driver = "/driver/phantomjs-2.1.1-macosx/bin/phantomjs"
if index == len(path):
driver_path = sys.path[0] + ph_driver
else:
driver_path = "/".join(path[:index]) + ph_driver
driver = webdriver.PhantomJS(executable_path=driver_path)
driver.maximize_window()
driver.implicitly_wait(10)
test = GetUserVideo(driver,123456)
test.test()
以上是关于python 使用selenium和requests爬取页面数据的主要内容,如果未能解决你的问题,请参考以下文章
Cloudflare 如何区分 Selenium 和 Requests 流量?
python爬虫之requests+selenium+BeautifulSoup
python 爬虫 简单爬虫教程(requests + selenium )
requests.exceptions.MissingSchema:无效的 URL 'None':尝试通过 Selenium 和 Python 查找断开的链接时未提供架构