Python:学习爬取数据以分析(城市注册时间发表时间评分评论内容)
Posted 方舟aark
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python:学习爬取数据以分析(城市注册时间发表时间评分评论内容)相关的知识,希望对你有一定的参考价值。
最近因为泰迪实习需要,做一些这样的数据分析。分享一下相关代码。
修改其中的url(注意应用规则,需要对爬虫有一定了解)即可用。
import requests
import re
from task_getUserInf import getUserInf
import pandas as pd
import emoji
#用户数据获取
res = []
count = 1;
number = -1
for i in range(0,20):
print("========正在爬取第页用户数据,请稍等..========".format(i+1))
num = i * 20
url = 'https://movie.douban.com/subject/26100958/comments?start=&limit=20&status=P&sort=new_score'.format(num)
headers =
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
response = requests.get(url,headers=headers)
html_data = response.text
html_data = emoji.demojize(html_data)
# print(html_data)
getData = re.compile('<a href="(.*?)" class="">.*?</a>')
getData1 = re.compile('<span class="comment-time " title="(.*?)">')
getData2 = re.compile('<span class="short">((?:.|\\n)*?)</span>')
getData3 = re.compile('<span class=".*? rating" title="(.*?)"></span>')
rateData = re.findall(getData3, html_data)
Data = re.findall(getData,html_data)
timeData = re.findall(getData1, html_data)
commmentData = re.findall(getData2,html_data)
for i2 in range(0, len(rateData)):
if rateData[i2] == '很差':
rateData[i2] = 1
elif rateData[i2] == '较差':
rateData[i2] = 2
elif rateData[i2] == '还行':
rateData[i2] = 3
elif rateData[i2] == '推荐':
rateData[i2] = 4
elif rateData[i2] == '力荐':
rateData[i2] = 5
while(len(rateData) - len(Data) != 0):
rateData.append(3)
wait = []
waiters = ""
for waiter in range(0,len(commmentData)):
wait.append('-')
for k in range(0,len(commmentData)):
get = getUserInf(Data[k])
wait[k] = "o"
waiters = waiters.join(wait)
print("No.【\\】:[".format(count, k, len(commmentData)), waiters, "]")
waiters = ""
if len(get) == 2:
count = count + 1
number = number + 1
res.append((get[0],get[1],timeData[k],rateData[k],commmentData[k]))
# print("!!!已爬取第位用户数据,本页还剩位用户。!!!".format((k+1),(len(commmentData)-k-1)))
print(" 爬取数据:", res[number])
else:
print(" 第位用户已注销或信息不完整,略过。".format(k+1))
print("")
name = ['城市','注册时间','发言时间','评分','评论内容']
test = pd.DataFrame(columns=name,data=res)
print(test)
test.to_csv('../data/userData.csv',encoding='gbk')
print("完成!")
以下代码保存名为task_getUserInf,然后运行上面的代码即可。
import requests
import re
#用户数据获取
def getUserInf(url):
headers =
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
response = requests.get(url,headers=headers)
# response.encoding = response.apparent_encoding
html_data = response.text
# print(html_data)
# getData1 = re.compile('<a href="https://www.douban.com/location/.*?">(.*?)</a>')
getData1 = re.compile('<a href=".*?">(.*?)</a><br />')
getData2 = re.compile('<div class="pl">.*? <br/> (.*?)加入</div>')
Data = re.findall(getData1,html_data)
Data2 = re.findall(getData2,html_data)
if Data and Data2 and len(Data[0]) < 10:
# print("用户长居城市为:", Data[0])
# print("用户注册时间为:", Data2[0])
return Data[0],Data2[0]
else:
# print("无数据。")
return ['no','data','!']
以上是关于Python:学习爬取数据以分析(城市注册时间发表时间评分评论内容)的主要内容,如果未能解决你的问题,请参考以下文章
Python:爬取豆瓣短评用户数据(城市注册时间发表时间评分评论内容)
Python爬虫实战,openpyxl模块学习,爬取房价信息并简单的数据分析