Python3.5爬取豆瓣电视剧数据并且同步到mysql中

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了如何使用 Python 3.5 爬取豆瓣电视剧数据并同步到 MySQL 的相关知识,希望对你有一定的参考价值。

 

#!/usr/local/bin/python
# -*- coding: utf-8 -*-
# Python:                  3.5
# Author:                  [email protected]
# Program:                 爬取豆瓣网站上所有电视剧的名称并写入数据库。
# Version:                 0.1
# History:                 2017.11.01

import requests,time, pymysql, re, datetime, json
from exchangelib import DELEGATE, Account, Credentials, Message, Mailbox, HTMLBody


# Connection settings and table names used by every function in this script.
# NOTE(review): credentials are hard-coded placeholders; load real values from
# a protected config source rather than committing them.
host = "xxx"
user = "xxx"
passwd = "xxx"
dbme = "crawl"                    # database holding the crawler's own tables
dbtarget = "back_brace"           # database that receives the synced titles
table = "tv_hotwords"             # every title seen so far
tabledelta = "tv_hotwords_delta"  # titles added during the current run
tablesync = "slot_value"          # table in dbtarget that receives new titles
port = 3306
# The recipient address was redacted in the published script; replace the
# placeholder with the real mailbox.
tolist = ["xxx@example.com"]


def get_tvs(urlbase, page):
    """Fetch one page of TV subjects from the Douban search endpoint.

    Args:
        urlbase: URL prefix ending in ``page_start=``; ``page`` is appended.
        page: Start offset of the page to request.

    Returns:
        The list stored under the ``subjects`` key of the JSON response, or
        an empty list when the request or the parsing fails, so callers can
        always iterate over the result.
    """
    url = urlbase + str(page)
    print(url)
    tvs = []
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        result = requests.get(url, timeout=10).text
        tvs = json.loads(result).get("subjects") or []
    except (requests.RequestException, ValueError, AttributeError):
        # ValueError covers invalid JSON; AttributeError covers a JSON
        # payload that is not an object.
        print("爬取" + urlbase + str(page) + "失败!")
        print("获取第%s页电影列表失败" % page)
    # Pause after every request, successful or not, to throttle the crawl.
    time.sleep(2)
    return tvs

# Characters removed from titles before they are stored.
# NOTE(review): the literal in the published script was damaged (an unescaped
# quote and characters replaced by "?"), so this set is rebuilt from the
# characters still legible there plus their full-width forms -- confirm it
# against the intended list. The trailing space is part of the set, so
# spaces are stripped from titles as well.
_TITLE_PUNCTUATION = (
    "!?\"#$%&'()*+,-./:;<=>@[\\]^_`{|}~"
    "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~"
    "、〃》「」『』【】〔〕〖〗〝〞–—‘’“”…﹏· "
)
# re.escape keeps "]", "\\", "^" and "-" literal inside the character class.
_TITLE_PUNCTUATION_RE = re.compile("[" + re.escape(_TITLE_PUNCTUATION) + "]+")


def tv_insert(host, user, passwd, dbme, port, table, tvs_list):
    """Insert crawled shows into ``table`` and return those that were new.

    Each show's ``title`` is stripped of punctuation in place before the
    insert, so the returned dicts carry the cleaned title.

    Args:
        host, user, passwd, dbme, port: MySQL connection settings.
        table: Name of the table receiving ``(tv_id, tv_name)`` rows.
        tvs_list: Iterable of dicts with ``id`` and ``title`` keys; ``None``
            or an empty list is accepted and yields ``[]`` without opening a
            connection.

    Returns:
        The list of show dicts whose insert succeeded.
    """
    if not tvs_list:
        return []

    new_tvs = []
    # The table name cannot be a bound parameter; the values are bound so
    # that scraped titles can never alter the statement.
    insert_sql = "insert into %s(tv_id, tv_name) values(%%s, %%s)" % table
    conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8")
    try:
        cur = conn.cursor()
        try:
            for tv in tvs_list:
                title = tv.get("title")
                if not title:
                    # Nothing to store for an entry without a title.
                    continue
                tv["title"] = _TITLE_PUNCTUATION_RE.sub("", title)
                try:
                    cur.execute(insert_sql, (tv.get("id"), tv["title"]))
                except pymysql.IntegrityError:
                    print(" " * 20, tv["title"], "already exists, skip……")
                except pymysql.Error as err:
                    print(" " * 20, tv["title"], "insert failed, skip:", err)
                else:
                    new_tvs.append(tv)
        finally:
            cur.close()
        conn.commit()
    finally:
        # Close the connection even when an unexpected error escapes.
        conn.close()
    return new_tvs


def tv_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, tvs_list, tablesync):
    """Rebuild the delta table and copy the new titles to the target database.

    All rows of ``dbme.tabledelta`` are deleted first. Then, for every show,
    one row is inserted into ``tabledelta`` and one into
    ``dbtarget.tablesync``. A show whose insert raises a MySQL error is
    reported and skipped.

    Args:
        host, user, passwd, dbme, port: MySQL connection settings.
        dbtarget: Database containing ``tablesync``.
        tabledelta: Table in ``dbme`` holding this run's new titles.
        tvs_list: Iterable of dicts with ``id`` and ``title`` keys.
        tablesync: Table in ``dbtarget`` receiving the titles.
    """
    # Build the qualified name before formatting: "%" binds tighter than "+",
    # so the original expression formatted only the database name.
    delete_sql = "delete from %s " % (dbme + "." + tabledelta)
    # Table names cannot be bound parameters; the scraped values are bound.
    insert_delta_sql = "insert into %s(tv_id, tv_name) values(%%s, %%s)" % tabledelta
    insert_sync_sql = (
        "insert into %s(slot_type_id, slot_value, create_by, modify_by, "
        "gmt_create, gmt_modify, out_value) "
        "values(%%s, %%s, %%s, %%s, %%s, %%s, %%s)" % (dbtarget + "." + tablesync)
    )

    conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8")
    try:
        cur = conn.cursor()
        try:
            cur.execute(delete_sql)
            for tv in tvs_list or ():
                # One timestamp per row, shared by gmt_create and gmt_modify.
                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                try:
                    cur.execute(insert_delta_sql, (tv["id"], tv["title"]))
                    cur.execute(
                        insert_sync_sql,
                        ("xxxxxx", tv["title"], "system", "system", now, now, ""),
                    )
                except pymysql.Error:
                    print(" " * 20, tv["title"], "already exists, skip……")
        finally:
            cur.close()
        conn.commit()
    finally:
        # Close the connection even when an unexpected error escapes.
        conn.close()


def tv_new_to_release(host, user, passwd, dbtarget, port):
    """Queue the two release tasks in ``back_brace.release_task``.

    Inserts one BACKBRACE/testpass row and one SKILL/deploy row, both with
    status ``init`` and the current time as creation and modification time.
    Each statement is printed before it is executed. A MySQL error is
    reported and stops the remaining inserts; whatever succeeded is still
    committed.

    Args:
        host, user, passwd, port: MySQL connection settings.
        dbtarget: Database selected for the connection.
    """
    # One timestamp shared by both rows and both time columns.
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    insert_sql = (
        "insert into back_brace.release_task"
        "(app_type,app_status,type,ref_id,status,register_id,"
        "create_by,modify_by,gmt_create,gmt_modify) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    tasks = [
        ("BACKBRACE", "testpass", "SLOT", "xxxxxx", "init", "SLOT_BACKBRACE_TESTPASS",
         "zhenghai.zhang", "zhenghai.zhang", now, now),
        ("SKILL", "deploy", "SLOT", "xxxxxx", "init", "SLOT_SKILL_DEPLOY",
         "zhenghai.zhang", "zhenghai.zhang", now, now),
    ]

    conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbtarget, port=port, charset="utf8")
    try:
        cur = conn.cursor()
        try:
            for task in tasks:
                # mogrify renders the statement exactly as it will be sent.
                print(cur.mogrify(insert_sql, task))
                cur.execute(insert_sql, task)
        except pymysql.Error:
            print("write into back_brace.release_task error!!!")
        finally:
            cur.close()
        conn.commit()
    finally:
        # Close the connection even when an unexpected error escapes.
        conn.close()


def Email(to, subject, body):
    """Send an HTML mail to one recipient through the Exchange account.

    Args:
        to: Recipient e-mail address.
        subject: Subject line.
        body: HTML markup used as the message body.
    """
    # NOTE(review): credentials are hard-coded placeholders; load real values
    # from a protected config source rather than committing them.
    creds = Credentials(username="xxxxxx", password="xxxxxx")
    account = Account(
        # The sender address was redacted in the published script; replace
        # the placeholder with the real mailbox.
        primary_smtp_address="xxxxxx@example.com",
        credentials=creds,
        autodiscover=True,
        access_type=DELEGATE,
    )
    message = Message(
        account=account,
        subject=subject,
        body=HTMLBody(body),
        to_recipients=[Mailbox(email_address=to)],
    )
    message.send_and_save()


if __name__ == "__main__":
    update_tvs = []
    pages = 25  # Douban lists only the first 500 shows per category (25 pages of 20)
    # Categories, in order: US, UK, Korean, Japanese, mainland Chinese and
    # Hong Kong dramas, Japanese animation, variety shows.
    urlbaselist = [
        "https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%BE%8E%E5%89%A7&sort=recommend&page_limit=20&page_start=",
        "https://movie.douban.com/j/search_subjects?type=tv&tag=%E8%8B%B1%E5%89%A7&sort=recommend&page_limit=20&page_start=",
        "https://movie.douban.com/j/search_subjects?type=tv&tag=%E9%9F%A9%E5%89%A7&sort=recommend&page_limit=20&page_start=",
        "https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%97%A5%E5%89%A7&sort=recommend&page_limit=20&page_start=",
        "https://movie.douban.com/j/search_subjects?type=tv&tag=%E5%9B%BD%E4%BA%A7%E5%89%A7&sort=recommend&page_limit=20&page_start=",
        "https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%B8%AF%E5%89%A7&sort=recommend&page_limit=20&page_start=",
        "https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%97%A5%E6%9C%AC%E5%8A%A8%E7%94%BB&sort=recommend&page_limit=20&page_start=",
        "https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%BB%BC%E8%89%BA&sort=recommend&page_limit=20&page_start=",
    ]

    for urlbase in urlbaselist:
        for i in range(pages):
            print("*" * 30, i, "*" * 30)
            tvs_list = get_tvs(urlbase, i * 20)
            # A failed page yields nothing to store; skip the database
            # round-trip instead of passing None on.
            if tvs_list:
                new_tvs = tv_insert(host, user, passwd, dbme, port, table, tvs_list)
            else:
                new_tvs = []

            for tv in new_tvs:
                print(tv["title"], "Added")
                update_tvs.append({"id": tv["id"], "title": tv["title"]})
            time.sleep(1)
        print(update_tvs)

        # Write every title added so far to the delta table and sync it to
        # the target database. This runs once per category with the
        # cumulative list.
        # NOTE(review): presumably the sync table rejects titles it already
        # holds -- confirm, otherwise earlier categories are inserted again.
        try:
            tv_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, update_tvs, tablesync)
        except Exception as err:
            print("tv update and sync Error!", err)

    try:
        tv_new_to_release(host, user, passwd, dbtarget, port)
    except Exception as err:
        print("tv_new_to_release error!!!", err)

    subject = "本次新增电视剧名称"
    body = "本次新增的电影名称为:<hr>" + "".join(
        movie["title"] + "<br>" for movie in update_tvs
    )
    for to in tolist:
        Email(to, subject, body)

 

还请各位大侠指点

以上是关于Python3.5爬取豆瓣电视剧数据并且同步到mysql中的主要内容,如果未能解决你的问题,请参考以下文章

requests爬取豆瓣

requests爬取豆瓣

豆瓣告诉你《扫黑风暴》如何短评爬取+词云

快速收集信息,Python爬虫教你一招爬取豆瓣Top250信息!

团队-爬取豆瓣电影-设计文档

《团队-爬取豆瓣电影TOP250-设计文档》