python爬虫,爬取猫眼电影top100

Posted DRQ丶

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python爬虫,爬取猫眼电影top100相关的知识,希望对你有一定的参考价值。

import requests
from bs4 import BeautifulSoup

url_list = []
all_name = []
all_num  = []
all_actor = []
all_score = []

class Product_url():      # 这个地方传入的url是 https://maoyan.com/board/4?offset=

    global url_list
    def __init__(self, url):
        self.url = url
        for x in range(0, 10):
            one_url = self.url + str(x*10)       # 简单暴力的拼接字符串,储存下top100的是个url 
            url_list.append(one_url)



class Get_one_page():
    def __init__(self, url, headers):
        self.url = url
        self.headers = headers

    def get_response(self):
        response = requests.get(self.url, headers = self.headers)
        return response.text

# 这个类用来 进行抓取
class Spider():


    def __init__(self, html):      
        self.html = html

    global all_name     # 电影名字
    def get_name(self):                     
        soup = BeautifulSoup(self.html, lxml)
        for html_name in soup.select(.name):
            all_name.append(html_name.get_text())

    global all_num    # 所有评分
    def get_num(self):
        soup = BeautifulSoup(self.html, lxml)
        for html_num in soup.select(.board-index):
            all_num.append(html_num.get_text())

    global all_actor     # 演员
    def get_actor(self):
        soup = BeautifulSoup(self.html, lxml)
        for html_actor in soup.select(.star):
            all_actor.append(html_actor.get_text().strip())      #strip() 去除了


    global all_score
    def get_score(self):
        soup = BeautifulSoup(self.html, lxml)
        for html_score_integer in soup.select(.integer):    # 网页里评分是分为两部分的,整数和小数
            for html_score_fraction in soup.select(.fraction):
                all_score.append(html_score_integer.get_text() + html_score_fraction.get_text()) # 把整数和小数部分连接起来

if __name__ == __main__: filename = 猫眼电影top100.txt with open(filename, w) as file_object: file_object.write("猫眼电影top100") file_handle = open(猫眼电影top100.txt, a+) file_handle.write(" sadas") headers = { User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 +(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36 } a_product = Product_url(https://maoyan.com/board/4?offset=) for n in range(0, 10): one_page = Get_one_page(url_list[n], headers) html = one_page.get_response() one_spider = Spider(html) one_spider.get_actor() one_spider.get_score() one_spider.get_num() one_spider.get_name() for n in range(0, 100): num = .join(all_num[n]) actor = .join(all_actor[n]) name = .join(all_name[n]) score = .join(all_score[n]) file_handle = open(猫眼电影top100.txt, a+) file_handle.write( + + num + + name + + actor + + score) file_handle.close()

 

以上是关于python爬虫,爬取猫眼电影top100的主要内容,如果未能解决你的问题,请参考以下文章

# [爬虫Demo] pyquery+csv爬取猫眼电影top100

Python爬虫实战之Requests+正则表达式爬取猫眼电影Top100

[爬虫]requests+正则表达式爬取猫眼电影TOP100

Python爬虫获取猫眼Top100电影信息(上)

爬虫实战01——爬取猫眼电影top100榜单

爬虫:爬取猫眼电影top100