豆瓣电影信息查询
Posted pau1fang
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了豆瓣电影信息查询相关的知识,希望对你有一定的参考价值。
目标:输入电影名称,显示其基本信息,并生成一张评论词云图
电影信息来源为豆瓣网,搜索页面地址为 https://search.douban.com/movie/subject_search?search_text=<电影名> 。由于该页面是通过 JS 渲染的,直接用 requests 请求拿不到电影查询结果,所以我先用 selenium 获取该页面的查询结果,并提取出前 10 条电影名和对应的详情页面 url,然后再用 requests 请求所选电影的详情页面以获取电影信息。
在运用selenium的时候需要开启无头模式,具体操作如下:
# Headless Chrome setup (requires `from selenium import webdriver`).
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')      # run without a visible browser window
chrome_options.add_argument('--disable-gpu')
# executable_path must point at the actual location of chromedriver.exe.
# The options object is the local variable defined above (there is no `self` here).
browser = webdriver.Chrome(options=chrome_options, executable_path='chromedriver.exe')
最终效果如下图所示:
完整代码如下:
import re
from urllib.parse import quote

import bs4
import jieba
import requests
import wordcloud
from lxml.html import etree
from matplotlib import pyplot
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Maximum number of search hits offered to the user.
_MAX_RESULTS = 10
# Number of review-list pages concatenated into the word-cloud text.
_REVIEW_PAGES = 5
# Offset step between consecutive review pages.
# NOTE(review): Douban appears to list 20 reviews per page - confirm.
_REVIEWS_PER_PAGE = 20
# Seconds to wait for each HTTP request before giving up.
_REQUEST_TIMEOUT = 10


class Movie:
    """Search Douban for a movie, print its details and draw a review word cloud."""

    def __init__(self, name):
        """Build the search URL for *name* and start a headless Chrome session.

        Args:
            name: Movie title typed by the user; it is URL-encoded before
                being placed in the query string.
        """
        self.url = (
            'https://search.douban.com/movie/subject_search'
            f'?search_text={quote(name)}'
        )
        # Chrome takes the user agent through the --user-agent switch; a bare
        # 'User-Agent="..."' argument is not a Chrome switch.
        self.headers = (
            '--user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/78.0.3904.108 Safari/537.36'
        )
        self.chrome_options = webdriver.ChromeOptions()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        self.chrome_options.add_argument(self.headers)
        # NOTE(review): executable_path is believed to be removed in
        # Selenium >= 4.10 (a Service object is passed instead) - confirm
        # against the installed Selenium version.
        self.browser = webdriver.Chrome(options=self.chrome_options,
                                        executable_path='chromedriver.exe')
        self.wait = WebDriverWait(self.browser, 10)

    def get_search(self):
        """Load the search page and list the first hits.

        Prints a numbered menu and shuts the browser down afterwards, whether
        or not the search succeeded.

        Returns:
            A list of ``[name, url]`` pairs (at most ``_MAX_RESULTS``), or an
            empty list when no result link appeared before the wait timed out.
        """
        try:
            self.browser.get(self.url)
            try:
                links = self.wait.until(
                    EC.presence_of_all_elements_located(
                        (By.CSS_SELECTOR, '.title > a')))
            except TimeoutException:
                # until() raises instead of returning a falsy value, so "no
                # results" has to be handled here.
                print("没有搜到您要的信息,请重新输入")
                return []

            print('请选择:')
            movies = []
            # Slice instead of a fixed range(10): there may be fewer hits.
            for index, link in enumerate(links[:_MAX_RESULTS]):
                name = link.text
                url = link.get_attribute('href')
                print(f'[{index}].{name}')
                movies.append([name, url])
            return movies
        finally:
            # quit() ends the whole driver session, even on errors.
            self.browser.quit()

    def get_movie_info(self, movie):
        """Print the basic information of one movie and build its word cloud.

        Args:
            movie: A ``[name, url]`` pair as returned by :meth:`get_search`.

        Returns:
            Always ``None``. Network failures, non-200 responses and pages
            without the expected ``#info`` / ``.rating_self`` blocks end the
            call early.
        """
        name, url = movie[0], movie[1]
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'}

        try:
            resp = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
        except requests.exceptions.RequestException:
            return None
        if resp.status_code != 200:
            return None

        soup = bs4.BeautifulSoup(resp.text, 'html.parser')
        info = soup.find(name='div', attrs={'id': 'info'})
        rating = soup.find(name='div', attrs={'class': 'rating_self'})
        if info is None or rating is None:
            return None

        print(info.text)
        # The <strong> / <a> children may be missing (presumably for titles
        # without a rating yet), so guard each one.
        if rating.strong is not None:
            print(f'评分: {rating.strong.text}')
        if rating.a is not None:
            print(rating.a.text)

        try:
            text = self.get_reviews(url, headers)
        except requests.exceptions.RequestException:
            return None
        if not text:
            # Skip the word cloud: there are no words to draw.
            print('没有获取到评论,无法生成词云')
            return None
        self.word_cloud(name, text)
        return None

    @staticmethod
    def get_reviews(url, headers):
        """Collect review excerpts from the first review pages of a movie.

        Args:
            url: Detail-page URL of the movie.
            headers: HTTP headers sent with every request.

        Returns:
            One string holding the text of all ``short-content`` elements
            found, with whitespace and empty ``()`` pairs removed.

        Raises:
            requests.exceptions.RequestException: If a page request fails.
        """
        base = url if url.endswith('/') else f'{url}/'
        chunks = []
        for page in range(_REVIEW_PAGES):
            # Build each page URL from the unchanged base and interpolate the
            # offset.
            page_url = f'{base}reviews?start={page * _REVIEWS_PER_PAGE}'
            response = requests.get(page_url, headers=headers,
                                    timeout=_REQUEST_TIMEOUT)
            if not response.text.strip():
                continue
            html = etree.HTML(response.text)
            if html is None:
                continue
            fragments = html.xpath('//*[@class="short-content"]/text()')
            compact = ''.join(''.join(fragments).split())
            # Drop the empty "()" pairs left in the extracted text.
            chunks.append(compact.replace('()', ''))
        return ''.join(chunks)

    @staticmethod
    def word_cloud(name, word):
        """Render a word cloud of *word*, save it as ``<name>.png`` and show it.

        Args:
            name: Movie title; the characters matched by the pattern below
                are stripped before it is used as the file name.
            word: Text to segment with jieba and feed to the word cloud.
        """
        name = re.sub(r'[\\/:*?"<>| 。,.?]+', '', name)
        tokens = jieba.lcut(word)
        text = ' '.join(tokens)
        cloud = wordcloud.WordCloud(font_path='simkai.ttf', width=800,
                                    height=600, background_color='white')
        cloud.generate(text)
        cloud.to_file(f'{name}.png')
        pyplot.imshow(cloud)
        pyplot.axis(False)
        pyplot.show()


def main():
    """Ask for a title, let the user pick a search hit and show its details."""
    movie_name = input("请输入电影名称,即可查询对应的影片信息:")
    m = Movie(movie_name)
    movies = m.get_search()
    if not movies:
        return

    num = input('请输入序号选择:')
    try:
        choice = int(num)
    except ValueError:
        print('输入的序号无效')
        return
    # Explicit range check so negative numbers do not wrap around the list.
    if not 0 <= choice < len(movies):
        print('输入的序号无效')
        return
    m.get_movie_info(movies[choice])


if __name__ == '__main__':
    main()