Python3.5爬取豆瓣电视剧数据并且同步到mysql中
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python3.5爬取豆瓣电视剧数据并且同步到mysql中相关的知识,希望对你有一定的参考价值。
#!/usr/local/bin/python # -*- coding: utf-8 -*- # Python: 3.5 # Author: [email protected] # Program: 爬取豆瓣网站上所有电视剧的名称并写入数据库。 # Version: 0.1 # History: 2017.11.01 import requests,time, pymysql, re, datetime, json from exchangelib import DELEGATE, Account, Credentials, Message, Mailbox, HTMLBody host = ‘xxx‘ user = ‘xxx‘ passwd = ‘xxx‘ dbme = ‘crawl‘ dbtarget = ‘back_brace‘ table = ‘tv_hotwords‘ tabledelta = ‘tv_hotwords_delta‘ tablesync = ‘slot_value‘ port = 3306 tolist = [‘[email protected]‘] def get_tvs(urlbase, page): try: url = urlbase + str(page) print(url) try: result = requests.get(url).text jresult = json.loads(result) tvs = jresult.get(‘subjects‘) except: print(‘爬取‘ + urlbase + str(page) + ‘失败!‘) time.sleep(2) return tvs except: print(‘获取第%s页电影列表失败‘ % page) def tv_insert(host, user, passwd, dbme, port, table, tvs_list): conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8") cur = conn.cursor() new_tvs = [] punc = "!??"#$%&'()*+,-/:;<=>@[\]^_`{|}~?????、〃》「」『』【】〔〕〖〗?????〝〞????–—‘’?“”??…?﹏.()::。· " punctuation = punc for tv in tvs_list: try: tv[‘title‘] = re.sub(r"[%s]+" % punctuation, "", tv.get(‘title‘)) cmd = ‘insert into %s(tv_id, tv_name) values("%s", "%s")‘ % ( table, tv.get(‘id‘), tv.get(‘title‘)) cur.execute(cmd) new_tvs.append(tv) except: print(" " * 20, tv.get(‘title‘), "already exists, skip……") cur.close() conn.commit() conn.close() return new_tvs def tv_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, tvs_list, tablesync): conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8") cur = conn.cursor() cur.execute("delete from %s " % dbme+"."+tabledelta) for tv in tvs_list: try: cmd = ‘insert into %s(tv_id, tv_name) values("%s", "%s")‘ % (tabledelta, tv[‘id‘], tv[‘title‘]) cmdsync = ‘insert into %s(slot_type_id, slot_value, create_by, modify_by, gmt_create, gmt_modify, out_value) values("%s", "%s", "%s", "%s", "%s", "%s", "%s")‘ % (dbtarget+"."+tablesync, "xxxxxx", tv[‘title‘], "system", "system", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "") cur.execute(cmd) cur.execute(cmdsync) except pymysql.Error: print(" " * 20, tv[‘title‘], "already exists, skip……") cur.close() conn.commit() conn.close() def tv_new_to_release(host, user, passwd, dbtarget, port): conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbtarget, port=port, charset="utf8") cur = conn.cursor() try: cmdbacktoskill = ‘insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("BACKBRACE","testpass","SLOT","xxxxxx","init","SLOT_BACKBRACE_TESTPASS" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")‘ % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) cmdskilltoskillpro = ‘insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("SKILL","deploy","SLOT","xxxxxx","init","SLOT_SKILL_DEPLOY" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")‘ % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) print(cmdbacktoskill) cur.execute(cmdbacktoskill) print(cmdskilltoskillpro) cur.execute(cmdskilltoskillpro) except pymysql.Error: print("write into back_brace.release_task error!!!") cur.close() conn.commit() conn.close() def Email(to, subject, body): creds = Credentials( username=‘xxxxxx‘, password=‘xxxxxx‘) account = Account( primary_smtp_address=‘[email protected]‘, credentials=creds, autodiscover=True, access_type=DELEGATE) m = Message( account=account, subject=subject, body=HTMLBody(body), to_recipients=[Mailbox(email_address=to)]) m.send_and_save() if __name__ == ‘__main__‘: update_tvs = [] pages = 25 # 豆瓣每项电视剧只有前500部 # 美剧 英剧 韩剧 日剧 国产剧 港剧 日本动画 综艺 urlbaselist = [‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%BE%8E%E5%89%A7&sort=recommend&page_limit=20&page_start=‘, ‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E8%8B%B1%E5%89%A7&sort=recommend&page_limit=20&page_start=‘, ‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E9%9F%A9%E5%89%A7&sort=recommend&page_limit=20&page_start=‘, ‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%97%A5%E5%89%A7&sort=recommend&page_limit=20&page_start=‘, ‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E5%9B%BD%E4%BA%A7%E5%89%A7&sort=recommend&page_limit=20&page_start=‘, ‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%B8%AF%E5%89%A7&sort=recommend&page_limit=20&page_start=‘, ‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%97%A5%E6%9C%AC%E5%8A%A8%E7%94%BB&sort=recommend&page_limit=20&page_start=‘, ‘https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%BB%BC%E8%89%BA&sort=recommend&page_limit=20&page_start=‘] for urlbase in urlbaselist: for i in range(pages): print("*"*30, i, "*"*30) tvs_list = get_tvs(urlbase, i * 20) new_tvs = tv_insert(host, user, passwd, dbme, port, table, tvs_list) for tv in new_tvs: print(tv[‘title‘],"Added") onetv = {} onetv["id"] = tv["id"] onetv["title"] = tv["title"] update_tvs.append(onetv) time.sleep(1) print(update_tvs) try: tv_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, update_tvs, tablesync) # 将增加的电影写入movie_hotwords_delta表中 except: print("tv update and sync Error!") try: tv_new_to_release(host, user, passwd, dbtarget, port) except: print("tv_new_to_release error!!!") subject = ‘本次新增电视剧名称‘ body = "本次新增的电影名称为:<hr>" for movie in update_tvs: body += movie[‘title‘] + "<br>" for to in tolist: Email(to, subject, body)
还请各位大侠指点
以上是关于Python3.5爬取豆瓣电视剧数据并且同步到mysql中的主要内容,如果未能解决你的问题,请参考以下文章