爬虫小案例:豆瓣Top250电影

Posted by keenleung

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫小案例:豆瓣Top250电影相关的知识,希望对你有一定的参考价值。

获取豆瓣Top250电影列表,然后给自己发邮件

直接上代码:

import csv
import os
import smtplib
import time
from email.header import Header
from email.header import make_header
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr

import requests
from bs4 import BeautifulSoup

# Scrape the Douban Top 250 movie list (10 pages x 25 entries), append every
# entry to a CSV file in the current working directory and collect a
# human-readable summary line per movie in ``filmlist``.

# Output CSV; the header row is written only when the file does not exist yet.
file_path = os.path.join(os.getcwd(), "豆瓣Top250电影.csv")
if not os.path.isfile(file_path):
    # utf-8-sig is UTF-8 with a BOM; the ``encoding`` argument of open()
    # requires Python 3.
    with open(file_path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["排名", "电影名称", "上映年份", "地区", "类型", "评分", "推荐语", "链接"])

# Browser-like request headers, to get past the site's anti-crawler checks.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/78.0.3904.108 Safari/537.36 "
                  "OPR/65.0.3467.78 (Edition Baidu)"
}

# One summary string per movie; used later as the e-mail body.
filmlist = []
for x in range(10):
    url = "https://movie.douban.com/top250?start={}&filter=".format(x * 25)

    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(url, headers=headers, timeout=10)
    if res.status_code != 200:
        print("请求失败!")
        continue

    print("正获取第{}页电影数据...".format(x + 1))
    soup = BeautifulSoup(res.text, "html.parser")
    ol = soup.find("ol", class_="grid_view")
    if ol is None:
        # Page layout differs from what the parser expects; skip this page.
        print("请求失败!")
        continue

    page_rows = []
    for li in ol.find_all("li"):
        # Rank
        num = li.find("div", class_="pic").find("em").text

        info = li.find("div", class_="info")
        title_link = info.find("div", class_="hd").find("a")

        # Title: concatenation of every <span> inside the title link
        title = "".join(span.text for span in title_link.find_all("span"))

        # Detail-page link
        link = title_link["href"]

        # Rating
        rating_num = info.find("span", class_="rating_num").text

        # Recommendation quote; not every entry has one, so guard against None
        inq_tag = info.find("span", class_="inq")
        inq = inq_tag.text if inq_tag is not None else ""

        # Release year / region / genre come from the third child node of the
        # first <p>, separated by "/". The last two fields are taken as region
        # and genre; anything before them is kept together as the year field,
        # so an entry with extra "/" separators does not shift the columns.
        bd = info.find("div", class_="bd").find("p").contents[2]
        parts = [part.strip() for part in str(bd).split("/")]
        while len(parts) < 3:
            parts.append("")
        year = " / ".join(parts[:-2])
        region, genre = parts[-2], parts[-1]

        page_rows.append([num, title, year, region, genre, rating_num, inq, link])
        filmlist.append("{0}.{1}:{2}/{3}/{4},评分:{5},推荐语:{6},链接:{7}".format(
            num, title, year, region, genre, rating_num, inq, link))

    # Append this page's rows with a single open() instead of one per row.
    with open(file_path, "a", newline="", encoding="utf-8-sig") as f:
        csv.writer(f).writerows(page_rows)

    # Pause between pages to stay polite to the server.
    time.sleep(0.75)

# Send the collected movie list by e-mail, with the CSV file attached.
# NOTE(review): credentials are hard-coded placeholders; load real values from
# the environment or a config file rather than committing them.
my_sender = "xxx"  # sender e-mail account
my_pass = "xxx"  # sender e-mail password
my_receiver = "lsjljq@163.com"  # recipient e-mail account
attachment_name = "豆瓣Top250电影.csv"


def _stamp():
    """Return the current local time formatted for the progress messages."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


try:
    print(_stamp() + ":准备发送邮件")

    # Multipart container so the message can carry both text and an attachment.
    msg = MIMEMultipart()
    msg["From"] = formataddr(("ljq", my_sender))  # (sender display name, sender account)
    msg["To"] = formataddr(("lsjljq", my_receiver))  # (recipient display name, recipient account)

    # Subject
    subject = "豆瓣Top250电影"
    msg["Subject"] = Header(subject, "utf-8")

    # Plain-text body: one line per movie.
    contenttext = "\n".join(filmlist)
    msg.attach(MIMEText(contenttext, "plain", "utf-8"))

    # Attachment: the CSV file in the current working directory.
    with open(attachment_name, "rb") as attachment_file:
        att1 = MIMEApplication(attachment_file.read(), _subtype="octet-stream")
    # The (charset, language, value) tuple makes the non-ASCII file name
    # RFC 2231-encoded; the name given here is what the mail client displays.
    att1.add_header("Content-Disposition", "attachment",
                    filename=("utf-8", "", attachment_name))
    msg.attach(att1)

    print(_stamp() + ":正连接邮件服务器...")
    # SMTP over SSL on port 465; the context manager sends QUIT even if
    # login or sending raises.
    with smtplib.SMTP_SSL("smtp.exmail.qq.com", 465) as server:
        print(_stamp() + ":登录中...")
        server.login(my_sender, my_pass)  # sender account, sender password
        print(_stamp() + ":正在发送邮件...")
        # sender account, list of recipient accounts, serialized message
        server.sendmail(my_sender, [my_receiver], msg.as_string())
    print("邮件发送成功!")
except Exception as err:
    # Top-level boundary of the script: report the failure instead of crashing.
    print("邮件发送失败!:{0}".format(err))

 

以上是关于爬虫小案例:豆瓣Top250电影的主要内容,如果未能解决你的问题,请参考以下文章

python爬虫入门爬取豆瓣电影top250

运维学python之爬虫高级篇scrapy爬取豆瓣电影TOP250

Python爬虫-豆瓣电影 Top 250

python爬取豆瓣电影Top250(附完整源代码)

Python小爬虫——抓取豆瓣电影Top250数据

[python爬虫] BeautifulSoup和Selenium对比爬取豆瓣Top250电影信息