爬虫实战斗鱼直播(你想看的都有呀!)
Posted ZSYL
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫实战斗鱼直播(你想看的都有呀!)相关的知识,希望对你有一定的参考价值。
前言
斗鱼直播—每个人的直播平台
闲着没事儿,看起斗鱼的游戏直播了,感觉挺有意思,就想着看看目前有多少人在直播,获取直播的相关信息存入csv中,想看什么一览无余。
1. 获取数据
斗鱼直播间 https://www.douyu.com/directory/all
# Constructor (excerpt): open the target page in a Selenium-driven Chrome.
def __init__(self):
# URL of Douyu's "all rooms" directory listing.
self.url = 'https://www.douyu.com/directory/all'
# Launches a local Chrome browser controlled by Selenium.
self.driver = webdriver.Chrome()
self.driver.get(self.url)
2. 解析数据
self.driver.find_elements_by_xpath('//*[@id="listAll"]/section[2]/div[2]/ul/li/div')
通过上面的 XPath 定位页面上所有直播间的节点,再借助 Chrome 驱动(Selenium)自动化提取各房间信息。
# Collect title / category / owner / viewer-count / cover image URL for
# every live room currently rendered on the page; returns a list of dicts.
def parse_data(self):
room_list = self.driver.find_elements_by_xpath('//*[@id="listAll"]/section[2]/div[2]/ul/li/div')
print(len(room_list))
# Iterate over the room elements, extracting data from each one.
data_list = []
for room in room_list:
temp = {}
temp['title'] = room.find_element_by_xpath('./a[1]/div[2]/div[1]/h3').text
temp['category'] = room.find_element_by_xpath('./a[1]/div[2]/div[1]/span').text
temp['owner'] = room.find_element_by_xpath('./a[1]/div[2]/div[2]/h2/div').text
temp['num'] = room.find_element_by_xpath('./a[1]/div[2]/div[2]/span').text
temp['cover_link'] = room.find_element_by_xpath('./a[1]/div[1]/div[1]/img').get_attribute('src')
data_list.append(temp)
return data_list
3. 自动翻页
# Pagination loop (excerpt from run()): parse and save the current page,
# then click "next page" until no such button exists.
while True:
time.sleep(3)
# parse
data_list = self.parse_data()
# save
self.save_data(data_list)
# next
try:
# el_next = self.driver.find_elements_by_xpath('//*[@class=" dy-Pagination-next"]')
el_next = self.driver.find_element_by_xpath('//*[contains(text(), "下一页")]')
# Scroll all the way down first, otherwise the "next page" button cannot be clicked.
self.driver.execute_script('scrollTo(0, 100000)')
el_next.click()
self.num += 120
# Early stop for testing: cap the run at ~200 rows.
if self.num > 200:
break
except:
break
4. 保存数据
import csv
# self.driver.implicitly_wait(10)
# NOTE(review): csv files should be opened with newline='' per the csv
# module docs, otherwise extra blank rows appear on Windows.
self.csv_file = open('douyuLive.csv', 'w', encoding='utf-8')
# Get a csv writer object for the output file.
self.writer = csv.writer(self.csv_file)
# Write the header row.
self.writer.writerow(['title', 'category', 'owner', 'num', 'cover_link'])
# Append one CSV row per room dict (values in header order).
def save_data(self, data_list):
for data in data_list:
self.writer.writerow(data.values())
5. 完整代码
from selenium import webdriver
import time
import csv
class Douyu(object):
    """Scrape live-room listings from Douyu's "all rooms" directory.

    Drives a local Chrome browser via Selenium, pages through the listing
    with the "下一页" (next page) button, and appends one CSV row per live
    room to ``douyuLive.csv``.

    NOTE(review): this code uses the Selenium 3 ``find_element(s)_by_xpath``
    API, which was removed in Selenium 4 — migrate to
    ``driver.find_element(By.XPATH, ...)`` when upgrading.
    """

    def __init__(self):
        # Directory page listing all live rooms.
        self.url = 'https://www.douyu.com/directory/all'
        self.driver = webdriver.Chrome()
        # newline='' is required by the csv module so it controls line
        # endings itself; without it, blank rows appear on Windows.
        self.csv_file = open('douyuLive.csv', 'w', encoding='utf-8', newline='')
        self.writer = csv.writer(self.csv_file)
        # Header row matching the dict keys produced by parse_data().
        self.writer.writerow(['title', 'category', 'owner', 'num', 'cover_link'])
        # Running count of data rows scraped so far.
        self.num = 0

    def parse_data(self):
        """Extract every live room rendered on the current page.

        Returns:
            list[dict]: one dict per room with keys ``title``, ``category``,
            ``owner``, ``num`` (viewer count as displayed text) and
            ``cover_link`` (cover image URL).
        """
        room_list = self.driver.find_elements_by_xpath('//*[@id="listAll"]/section[2]/div[2]/ul/li/div')
        print(len(room_list))
        # Pull the five fields out of each room card.
        data_list = []
        for room in room_list:
            temp = {}
            temp['title'] = room.find_element_by_xpath('./a[1]/div[2]/div[1]/h3').text
            temp['category'] = room.find_element_by_xpath('./a[1]/div[2]/div[1]/span').text
            temp['owner'] = room.find_element_by_xpath('./a[1]/div[2]/div[2]/h2/div').text
            temp['num'] = room.find_element_by_xpath('./a[1]/div[2]/div[2]/span').text
            temp['cover_link'] = room.find_element_by_xpath('./a[1]/div[1]/div[1]/img').get_attribute('src')
            data_list.append(temp)
        return data_list

    def save_data(self, data_list):
        """Append one CSV row per room dict, in header order."""
        for data in data_list:
            self.writer.writerow(data.values())

    def run(self):
        """Main loop: load the listing, then parse/save/paginate until the
        last page is reached (no "next page" button found)."""
        self.driver.get(self.url)
        while True:
            time.sleep(3)  # give the JS-rendered room list time to load
            # parse
            data_list = self.parse_data()
            # save
            self.save_data(data_list)
            # Count the rows actually scraped instead of assuming a
            # fixed 120 rooms per page.
            self.num += len(data_list)
            # next
            try:
                el_next = self.driver.find_element_by_xpath('//*[contains(text(), "下一页")]')
                # Scroll to the bottom first; the button is not clickable
                # until it is in view.
                self.driver.execute_script('scrollTo(0, 100000)')
                el_next.click()
            except Exception:
                # No "next page" button (or click failed): last page reached.
                break
if __name__ == '__main__':
    # Instantiate the scraper (opens Chrome and the output CSV file).
    douyu = Douyu()
    try:
        # Run the main scraping loop.
        douyu.run()
    finally:
        # Always close the file handle and kill the browser process,
        # even if run() raises — otherwise Chrome is left running and
        # buffered CSV rows may be lost.
        douyu.csv_file.close()
        douyu.driver.quit()
    print(f'共有{douyu.num}条直播数据,下载完毕!')
6. 效果展示
加油!
感谢!
努力!
以上是关于爬虫实战斗鱼直播(你想看的都有呀!)的主要内容,如果未能解决你的问题,请参考以下文章