IMDB TOP 250 Crawler
For this short-semester Python course project I built a crawler that fetches the complete information for every film in the IMDB TOP 250. It is the second crawler I have written, and it went much more smoothly than the one from summer training camp. Comments are welcome.
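One note before the full listing: the script's downloader, getHTMLText(), swallows every exception and returns a sentinel string. A more defensive wrapper might look like the sketch below (my own variant, not part of the original project; the fetch name, retry count, and timeout are arbitrary assumptions):

import requests

def fetch(url, headers=None, retries=3, timeout=10):
    """Fetch a page, retrying a few times before giving up."""
    for _ in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()      # turn HTTP 4xx/5xx into an exception
            r.encoding = 'utf-8'
            return r.text
        except requests.RequestException:
            continue                  # network hiccup: try again
    return None                       # caller checks for None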
'''
************************************************
*Time:      2017.9.11
*Target:    All movies' information of IMDB TOP_250
*Resources: http://www.imdb.cn/IMDB250/
************************************************
'''

import re
import requests
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

num = 1        # movie counter
All_txt = []   # collected text for every movie
# browser User-Agent so the site does not reject the requests
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}

def getHTMLText(url):
    try:
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
        return r.text
    except Exception:
        return "error"

# scrape every field from a single movie's detail page
def get_all_information(url, page):
    global num, All_txt
    txt = getHTMLText(url)
    if txt != "error":
        print('page' + str(page) + ' NO.' + str(num) + ' Get it!')
        if num == 247:   # completion marker for the final movie
            print('Finished!!!')
        soup = BeautifulSoup(txt, "html.parser")
        Cname, Ename, Score, title, Actor, Starring, Infor = '', '', '', '', '', '', ''

        # Chinese name & score live in the <div class="hdd"> header
        infor_1 = soup.find_all('div', class_='hdd')
        pattern = re.compile(r'<h3>[\s\S]*?</h3>')
        Cname = ''.join(pattern.findall(str(infor_1[0])))
        Cname = Cname.replace('<h3>', '').replace('</h3>', '')

        # pull the year out of the heading and save it for the chart;
        # the parentheses must be escaped or re reads them as a group
        pattern = re.compile(r'\(([\s\S]*?)\)')
        time_ = ''.join(pattern.findall(Cname))
        with open('time.txt', 'a', encoding='utf-8') as t:
            t.write(time_ + '\n')

        # the score sits inside <i>...</i>
        pattern = re.compile(r'<i>[\s\S]*?</i>')
        Score = ''.join(pattern.findall(str(infor_1[0])))
        Score = Score.replace('<i>', '').replace('</i>', '')

        # the remaining fields are <li> items in <div class="bdd clear">
        now = soup.find_all('div', class_='bdd clear')
        a = BeautifulSoup(str(now[0]), "html.parser")
        many_infor = a.find_all('li')

        # English name
        Ename = str(many_infor[0]).replace('<li>', '').replace('<i>', '').replace('</i>', '').replace('</li>', '').replace('<a>', '').replace('</a>', '')
        # director
        Actor_temp = BeautifulSoup(str(many_infor[2]), "html.parser").find_all('a')
        Actor = Actor_temp[0].get_text().replace('导演:', '')
        # starring cast
        Starring_temp = BeautifulSoup(str(many_infor[3]), "html.parser").find_all('a')
        for i in Starring_temp:
            Starring += i.get_text().replace(' ', '') + ' '

        # genre / region / release-date lines
        for j in range(4, 7):
            Infor_temp = BeautifulSoup(str(many_infor[j]), "html.parser")
            for i in Infor_temp.children:
                Infor += i.get_text().replace(' ', '') + ' '
            Infor += '\n'

        # synopsis block
        content = soup.find_all('div', class_='fk-4 clear')
        soup_con = BeautifulSoup(str(content[0]), "html.parser")
        title = soup_con.find_all('div', class_='hdd')
        title = str(title[0]).replace('<div class="hdd">', '').replace('</div>', '\n')
        content_1 = soup_con.find_all('div', class_='bdd clear')
        content_1 = str(content_1[0]).replace('<div class="bdd clear" style="font-size:15px">', '').replace('</div>', '')
        content_1 = content_1.replace('<!-- <p><a href="#">更多剧情 >></a></p> -->', '').replace('<br/>', '\n')

        # collect everything for this movie (labels kept in Chinese,
        # matching the site's own field names)
        All_txt.append('第' + str(num) + '部' + '\n')
        All_txt.append(Cname + '\n')
        All_txt.append('【英文名】' + Ename + '\n')
        All_txt.append('【评分】' + Score + '\n')
        All_txt.append('【导演】' + Actor + '\n')
        All_txt.append('【主演】' + Starring + '\n')
        All_txt.append(Infor + '\n')
        All_txt.append(title + '\n' + content_1 + '\n')
        All_txt.append('\n')
        num += 1

# collect the detail-page URLs of every movie on one list page
def getin_one(url, page):
    txt = getHTMLText(url)
    if txt == "error":
        return
    soup = BeautifulSoup(txt, "html.parser")
    temp = soup.find_all('div', class_="ss-3 clear")
    pattern = re.compile(r'<a href="[\s\S]*?">')
    All_url = pattern.findall(str(temp[0]))
    for i in range(len(All_url)):
        temp_url = 'http://www.imdb.cn' + All_url[i].replace('<a href="', '').replace('">', '')
        get_all_information(temp_url, page)

# tally the release years of all movies and draw a bar chart
def Analyze_some_infor():
    plt.rc('font', family='SimHei', size=13)   # a font that can render the Chinese labels
    a, b, c, d, e, f = 0, 0, 0, 0, 0, 0
    with open('time.txt', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            year = int(line) if line.isdigit() else 0   # 0 = no year found
            if year == 0:
                f += 1
            elif 1920 <= year < 1940:
                a += 1
            elif 1940 <= year < 1960:
                b += 1
            elif 1960 <= year < 1980:
                c += 1
            elif 1980 <= year < 2000:
                d += 1
            else:
                e += 1
    times = [a, b, c, d, e, f]
    range_time = ['1920-1940', '1940-1960', '1960-1980', '1980-2000', '2000-现在', '无信息']
    idx = np.arange(len(range_time))
    width = 0.5
    plt.bar(idx, times, width, color='green')
    plt.xticks(idx, range_time, rotation=40)   # bars are center-aligned since matplotlib 2.0
    plt.xlabel('电影年代')
    plt.ylabel('数目')
    plt.savefig('time_pic.jpg')
    plt.show()

def main():
    # start from a clean slate so reruns do not double-count years
    open('time.txt', 'w', encoding='utf-8').close()
    getin_one('http://www.imdb.cn/IMDB250/', 1)
    for i in range(2, 10):   # list pages 2 through 9
        getin_one('http://www.imdb.cn/imdb250/' + str(i), i)
    with open('All_infor.txt', 'w', encoding='utf-8') as x:
        for i in All_txt:
            x.write(i)
    Analyze_some_infor()

main()
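One detail worth calling out in the listing above: the year-extraction pattern must escape its parentheses, otherwise re treats them as an empty lazy capture group and extracts nothing useful. A quick offline check, with no network access needed (the sample heading is a made-up example in the style of the site's movie titles):

import re

sample = '肖申克的救赎(1994)'               # hypothetical heading in the site's format
pattern = re.compile(r'\(([\s\S]*?)\)')     # escaped parens, lazy capture of the year
print(pattern.findall(sample))              # -> ['1994']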
Author: LB919
Source: http://www.cnblogs.com/L1B0/
This article is LB919's original work and took real time and effort;
if you repost it, I'd be honored — please just note the source.