在 Python 中从 Indeed 中获取员工评分
Posted
技术标签:
【中文标题】在 Python 中从 Indeed 中获取员工评分【英文标题】:Scrape the employee ratings from Indeed in Python 【发布时间】:2021-12-29 23:15:08 【问题描述】:我是网络抓取的新手,我需要从 Indeed 抓取员工评分和评论,但我的代码无法运行。你能告诉我的代码有什么问题吗?非常感谢您的帮助。
from bs4 import BeautifulSoup
import pandas as pd
import requests
# Scrape the title, text, author and rating of every review card on the EY
# company page, stepping the `start` offset through the listing.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}
rows = []
for i in range(0, 140, 20):
    # The offset has to be interpolated into the query string; a literal
    # "start=i" would request the same page on every iteration.
    url = f'https://www.indeed.com/cmp/Ey/reviews?fcountry=IT&start={i}'
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.content, 'lxml')
    results = soup.find("div", {"id": 'cmp-container'})
    elems = results.find_all(class_='cmp-Review-container')
    for elem in elems:
        title = elem.find(attrs={'class': 'cmp-Review-title'})
        review = elem.find('div', {'class': 'cmp-Review-text'})
        author = elem.find(attrs={'class': 'cmp-Review-author'})
        rating = elem.find(attrs={'class': 'cmp-ReviewRating-text'})
        # NOTE(review): find() returns None when a class is absent, so any of
        # these .text accesses raises AttributeError if the markup differs.
        rows.append({'review_title': title.text,
                     'review': review.text,
                     'author': author.text,
                     'rating': rating.text})

# Build the frame once at the end: DataFrame.append was removed in pandas 2.0
# and copying the frame for every row is quadratic.
df = pd.DataFrame(rows, columns=['review_title', 'review', 'author', 'rating'])
它只返回标题。
采纳 Parikh 的建议后,代码已经可以返回员工评论,但没有显示员工身份(前员工还是现任员工)。我该如何改进代码来获取员工身份?
# Load the Modules
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
import pandas as pd
# Use Big Tech as the samples to scrape the employee reviews on 12/20/2021
# Meta(Facebook),
# Collect title, author, review text and rating for the Meta reviews,
# stepping the `start` offset through the listing in increments of 20.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}
lst = []
for i in range(0, 460, 20):
    print(i)  # progress indicator: the offset currently being fetched
    # The offset must be interpolated; a literal "start=i" never paginates.
    url = f'https://www.indeed.com/cmp/Meta-dd1502f2/reviews?start={i}'
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.content, 'lxml')
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:
        # find() returns None for a missing tag, so each field falls back to
        # NaN when the chained call raises AttributeError.
        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            title = np.nan
        try:
            # Keeps only the part after the first "-" of the author line.
            author = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[1]
        except AttributeError:
            author = np.nan
        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan
        try:
            # First whitespace-separated token of the button's aria-label.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except AttributeError:
            rating = np.nan
        lst.append([title, author, review, rating])

df_meta = pd.DataFrame(data=lst, columns=['title', 'author', 'review', 'rating'])
df_meta
输出如下所示,我也想要员工状态。非常感谢您的帮助。
再次感谢您的帮助和时间。我的最后一个问题是:我想分别抓取优点(pros)和缺点(cons),但结果只返回 NA。我应该如何修改代码?
import numpy as np
# Collect title, author, status, pros, cons, review text and rating for the
# Airbnb reviews, stepping the `start` offset in increments of 20.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}
lst = []
for i in range(0, 240, 20):
    print(i)  # progress indicator: the offset currently being fetched
    # The offset must be interpolated; a literal "start=i" never paginates.
    url = f'https://www.indeed.com/cmp/Airbnb/reviews?start={i}'
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.content, 'lxml')
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:
        # find() returns None for a missing tag, so each field falls back to
        # NaN when the chained call raises AttributeError.
        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            title = np.nan
        try:
            # Part after the first "-" of the author line.
            author = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[1]
        except AttributeError:
            author = np.nan
        try:
            # Part before the first "-" of the same author line.
            status = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[0]
        except AttributeError:
            status = np.nan
        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan
        # NOTE(review): find() returns a single element (or None), not a list,
        # so the [0] subscript below raises and the bare except turns every
        # value into NaN. This is the likely cause of the all-NA pros/cons
        # columns; dropping "[0]" (or using find_all) should be checked first.
        try:
            pros = data.find('div', class_='cmp-review-pro-text')[0].getText(strip=True)
        except:
            pros = np.nan
        try:
            cons = data.find('div', class_='cmp-review-con-text')[0].getText(strip=True)
        except:
            cons = np.nan
        try:
            # First whitespace-separated token of the button's aria-label.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except AttributeError:
            rating = np.nan
        lst.append([title, author, status, pros, cons, review, rating])
【问题讨论】:
【参考方案1】:首先打印出 `main_data`,大致了解其中有哪些标签和数据,然后再据此提取所需的字段。我还添加了 `try`/`except` 块,这样某个字段缺失时会填入 NaN,而不会让程序报错。
import numpy as np
# Collect title, author, status, review text and rating for the EY reviews,
# stepping the `start` offset through the listing in increments of 20.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}
lst = []
for i in range(0, 140, 20):
    print(i)  # progress indicator: the offset currently being fetched
    # The offset must be interpolated; a literal "start=i" never paginates.
    url = f'https://www.indeed.com/cmp/Ey/reviews?fcountry=IT&start={i}'
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.content, 'lxml')
    # Search the parsed page directly; no `results` container is defined in
    # this snippet.
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:
        # find() returns None for a missing tag, so each field falls back to
        # NaN when the chained call raises AttributeError.
        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            title = np.nan
        try:
            # Part after the first "-"; IndexError covers an author line that
            # contains no "-" at all.
            author = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[1]
        except (AttributeError, IndexError):
            author = np.nan
        try:
            # Part before the first "-" of the same author line.
            status = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[0]
        except AttributeError:
            status = np.nan
        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan
        try:
            # First whitespace-separated token of the button's aria-label.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except AttributeError:
            rating = np.nan
        lst.append([title, author, status, review, rating])
现在使用 `lst` 作为 DataFrame 的数据:
import pandas as pd
# Turn the collected rows into a table, one column per scraped field.
df = pd.DataFrame(
    data=lst,
    columns=['title', 'author', 'status', 'review', 'rating'],
)
df  # bare expression; presumably there so a notebook cell displays the table
输出:
title author status review rating
0 good exerccise Provincia di Milano, Lombardia Senior Manager(Former Employee) working here can be challenging but helps buil... 3.0
【讨论】:
如果可能的话,能否请您投票或接受这个答案,以便其他用户参考?
没问题!
您好,我还有一个关于代码的问题:如何抓取员工身份,以区分前员工和现任员工?
我已经编辑了我的答案,请查看!
谢谢,这对我帮助很大。我还有最后一个问题:如何分别抓取优点和缺点?我已经尝试了以下代码,但它只返回 NA。
以上是关于在 Python 中从 Indeed 中获取员工评分的主要内容,如果未能解决你的问题,请参考以下文章:
数据可视化|用Python实现手机抓包,获取当当图书差评数据!
列出在超过 2 名员工的团队中从平均收入中获得平均值(准确率高达 30%)的员工(姓名、base_salary)