python 抓取电影天堂电影信息放入数据库

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 抓取电影天堂电影信息放入数据库相关的知识,希望对你有一定的参考价值。

# coding:utf-8
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
import urllib2
import re
import json
import chardet
import pymysql
# url = "http://dytt8.net/"
# page = requests.get(url).content
# page_html = BeautifulSoup(page,'lxml')

# name = page_html.select("td.inddline > a:nth-of-type(2)")
# for n in name:
#     if 'dyzz' in n.encode('gbk'):
#         print n.encode('gbk')
#         file = open("move.txt","a+")
#         file.write(n.encode('utf-8')+'\n')
#         file.close()

# (result key, regex) pairs for the labelled fields on a movie detail page.
# Each regex has exactly one capture group holding the field's raw value.
# NOTE(review): the labels are matched with two plain spaces between the
# characters, exactly as written here; confirm against the live page markup.
_MOVE_FIELD_PATTERNS = (
    ('yiming', r"译  名(.*?)<br/>"),
    ('leibie', r"类  别(.*?)<br/>"),
    ('yuyan', r"语  言(.*?)<br/>"),
    ('zimu', r"字  幕(.*?)<br/>"),
    ('date', r"上映日期(.*?)<br/>"),
    ('douban', r"豆瓣评分(.*?)<br/>"),
    ('pianchang', r"片  长(.*?)<br/>"),
    ('daoyan', r"导  演(.*?)<br/>"),
    ('zhuyan', r"主  演(.*?)<br/>"),
    ('jianjie', r"简  介(.*?)【下载地址】"),
)


def _first_match(pattern, text):
    """Return the first capture group of *pattern* in *text*, or '' if absent."""
    found = re.search(pattern, text)
    return found.group(1) if found else ''


def getmoveinfo(url):
    """Download one movie detail page and extract its metadata.

    Args:
        url: Absolute URL of the detail page to fetch.

    Returns:
        A dict with the keys ``title``, ``yiming``, ``leibie``, ``yuyan``,
        ``zimu``, ``date``, ``douban``, ``pianchang``, ``daoyan``,
        ``zhuyan``, ``jianjie`` and ``addres``.  ``title`` and ``addres``
        are UTF-8 encoded; the remaining values are the raw text captured
        from the serialised HTML (with ``<br/>`` stripped from ``jianjie``).
        Any field that cannot be found on the page is the empty string.
    """
    page = requests.get(url).content
    page_html = BeautifulSoup(page, 'lxml')
    # Serialise the parsed document once; every field regex runs against it.
    html_text = str(page_html)

    # The title is the first child of the first <font color="#07519a"> tag.
    # Fall back to '' like every other field instead of raising IndexError.
    title_tags = page_html.find_all("font", attrs={"color": "#07519a"})
    title_content = title_tags[0].contents if title_tags else []
    title = title_content[0].encode("utf-8") if title_content else ''

    res = {'title': title}
    for key, pattern in _MOVE_FIELD_PATTERNS:
        res[key] = _first_match(pattern, html_text)
    res['jianjie'] = res['jianjie'].replace("<br/>", "")

    # The download link lives in the first <td bgcolor="#fdfddf"> cell.
    # Look for an anchor that actually carries an href so that a leading
    # text node or an href-less tag does not raise AttributeError.
    addres = ''
    cells = page_html.find_all("td", attrs={"bgcolor": "#fdfddf"})
    if cells:
        link = cells[0].find("a", href=True)
        if link is not None:
            addres = link.get("href").encode('utf-8')
    res['addres'] = addres
    return res

# Crawl the site's front page, follow every link whose markup mentions
# "dyzz", scrape the linked detail page and store one row per movie.
url = "http://dytt8.net/"
page = requests.get(url).content
page_html = BeautifulSoup(page, 'lxml')

# Second <a> child of every <td class="inddline"> cell on the front page.
name = page_html.select("td.inddline > a:nth-of-type(2)")

# Column order of the INSERT below; the keys match the dict returned by
# getmoveinfo().
columns = ('title', 'yiming', 'leibie', 'yuyan', 'zimu', 'date', 'douban',
           'pianchang', 'daoyan', 'zhuyan', 'jianjie', 'addres')
# Placeholders instead of string formatting: the values are scraped text and
# may contain quotes, which would break (or inject into) a hand-built query.
insert_sql = "INSERT INTO move_info({0}) VALUES ({1})".format(
    ",".join(columns), ",".join(["%s"] * len(columns)))

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='root', db='moves', charset='utf8')
try:
    cursor = conn.cursor()
    try:
        for n in name:
            href = n.get("href")
            # Same filter as before (the tag's markup must mention "dyzz"),
            # but compared as text rather than against GBK-encoded bytes.
            # Anchors without an href cannot be followed, so skip them.
            if not href or 'dyzz' not in str(n):
                continue
            info = getmoveinfo("http://dytt8.net" + href)
            cursor.execute(insert_sql, tuple(info[c] for c in columns))
            # Commit per movie so earlier rows survive a later failure.
            conn.commit()
    finally:
        cursor.close()
finally:
    # Release the connection even if a fetch or insert raised.
    conn.close()
print('ok')

以上是关于python 抓取电影天堂电影信息放入数据库的主要内容,如果未能解决你的问题,请参考以下文章

Python抓取电影天堂, 零基础都可以学? 源码&视频教程, 大赞!

Python多线程爬虫爬取电影天堂资源

[Java] 用java实现的电影天堂,飘花电影网的电影的下载地址抓取

实例练习----电影天堂抓取下载链接

python爬虫(十七) 电影天堂爬虫1

1-1 用Python抓取豆瓣及IMDB上的电影信息