IMDB TOP 250 Crawler
For this short-semester Python course project I built a crawler that fetches the complete information for every film in the IMDB TOP 250. It is the second crawler I have written, and it went much more smoothly than the one from summer training camp. Comments are welcome.
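One note before the full listing: the script's downloader, getHTMLText(), swallows every exception and returns a sentinel string. A more defensive wrapper might look like the sketch below (my own variant, not part of the original project; the fetch name, retry count, and timeout are arbitrary assumptions):

import requests

def fetch(url, headers=None, retries=3, timeout=10):
    """Fetch a page, retrying a few times before giving up."""
    for _ in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()      # turn HTTP 4xx/5xx into an exception
            r.encoding = 'utf-8'
            return r.text
        except requests.RequestException:
            continue                  # network hiccup: try again
    return None                       # caller checks for None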
'''
************************************************
*Time:      2017.9.11
*Target:    All movies' information of IMDB TOP_250
*Resources: http://www.imdb.cn/IMDB250/
************************************************
'''

import re
import requests
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

num = 1        # movie counter
All_txt = []   # collected text for every movie
# browser User-Agent so the site does not reject the requests
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}

def getHTMLText(url):
    try:
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
        return r.text
    except Exception:
        return "error"

# scrape every field from a single movie's detail page
def get_all_information(url, page):
    global num, All_txt
    txt = getHTMLText(url)
    if txt != "error":
        print('page' + str(page) + ' NO.' + str(num) + ' Get it!')
        if num == 247:   # completion marker for the final movie
            print('Finished!!!')
        soup = BeautifulSoup(txt, "html.parser")
        Cname, Ename, Score, title, Actor, Starring, Infor = '', '', '', '', '', '', ''

        # Chinese name & score live in the <div class="hdd"> header
        infor_1 = soup.find_all('div', class_='hdd')
        pattern = re.compile(r'<h3>[\s\S]*?</h3>')
        Cname = ''.join(pattern.findall(str(infor_1[0])))
        Cname = Cname.replace('<h3>', '').replace('</h3>', '')

        # pull the year out of the heading and save it for the chart;
        # the parentheses must be escaped or re reads them as a group
        pattern = re.compile(r'\(([\s\S]*?)\)')
        time_ = ''.join(pattern.findall(Cname))
        with open('time.txt', 'a', encoding='utf-8') as t:
            t.write(time_ + '\n')

        # the score sits inside <i>...</i>
        pattern = re.compile(r'<i>[\s\S]*?</i>')
        Score = ''.join(pattern.findall(str(infor_1[0])))
        Score = Score.replace('<i>', '').replace('</i>', '')

        # the remaining fields are <li> items in <div class="bdd clear">
        now = soup.find_all('div', class_='bdd clear')
        a = BeautifulSoup(str(now[0]), "html.parser")
        many_infor = a.find_all('li')

        # English name
        Ename = str(many_infor[0]).replace('<li>', '').replace('<i>', '').replace('</i>', '').replace('</li>', '').replace('<a>', '').replace('</a>', '')
        # director
        Actor_temp = BeautifulSoup(str(many_infor[2]), "html.parser").find_all('a')
        Actor = Actor_temp[0].get_text().replace('导演:', '')
        # starring cast
        Starring_temp = BeautifulSoup(str(many_infor[3]), "html.parser").find_all('a')
        for i in Starring_temp:
            Starring += i.get_text().replace(' ', '') + ' '

        # genre / region / release-date lines
        for j in range(4, 7):
            Infor_temp = BeautifulSoup(str(many_infor[j]), "html.parser")
            for i in Infor_temp.children:
                Infor += i.get_text().replace(' ', '') + ' '
            Infor += '\n'

        # synopsis block
        content = soup.find_all('div', class_='fk-4 clear')
        soup_con = BeautifulSoup(str(content[0]), "html.parser")
        title = soup_con.find_all('div', class_='hdd')
        title = str(title[0]).replace('<div class="hdd">', '').replace('</div>', '\n')
        content_1 = soup_con.find_all('div', class_='bdd clear')
        content_1 = str(content_1[0]).replace('<div class="bdd clear" style="font-size:15px">', '').replace('</div>', '')
        content_1 = content_1.replace('<!-- <p><a href="#">更多剧情 >></a></p> -->', '').replace('<br/>', '\n')

        # collect everything for this movie (labels kept in Chinese,
        # matching the site's own field names)
        All_txt.append('第' + str(num) + '部' + '\n')
        All_txt.append(Cname + '\n')
        All_txt.append('【英文名】' + Ename + '\n')
        All_txt.append('【评分】' + Score + '\n')
        All_txt.append('【导演】' + Actor + '\n')
        All_txt.append('【主演】' + Starring + '\n')
        All_txt.append(Infor + '\n')
        All_txt.append(title + '\n' + content_1 + '\n')
        All_txt.append('\n')
        num += 1

# collect the detail-page URLs of every movie on one list page
def getin_one(url, page):
    txt = getHTMLText(url)
    if txt == "error":
        return
    soup = BeautifulSoup(txt, "html.parser")
    temp = soup.find_all('div', class_="ss-3 clear")
    pattern = re.compile(r'<a href="[\s\S]*?">')
    All_url = pattern.findall(str(temp[0]))
    for i in range(len(All_url)):
        temp_url = 'http://www.imdb.cn' + All_url[i].replace('<a href="', '').replace('">', '')
        get_all_information(temp_url, page)

# tally the release years of all movies and draw a bar chart
def Analyze_some_infor():
    plt.rc('font', family='SimHei', size=13)   # a font that can render the Chinese labels
    a, b, c, d, e, f = 0, 0, 0, 0, 0, 0
    with open('time.txt', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            year = int(line) if line.isdigit() else 0   # 0 = no year found
            if year == 0:
                f += 1
            elif 1920 <= year < 1940:
                a += 1
            elif 1940 <= year < 1960:
                b += 1
            elif 1960 <= year < 1980:
                c += 1
            elif 1980 <= year < 2000:
                d += 1
            else:
                e += 1
    times = [a, b, c, d, e, f]
    range_time = ['1920-1940', '1940-1960', '1960-1980', '1980-2000', '2000-现在', '无信息']
    idx = np.arange(len(range_time))
    width = 0.5
    plt.bar(idx, times, width, color='green')
    plt.xticks(idx, range_time, rotation=40)   # bars are center-aligned since matplotlib 2.0
    plt.xlabel('电影年代')
    plt.ylabel('数目')
    plt.savefig('time_pic.jpg')
    plt.show()

def main():
    # start from a clean slate so reruns do not double-count years
    open('time.txt', 'w', encoding='utf-8').close()
    getin_one('http://www.imdb.cn/IMDB250/', 1)
    for i in range(2, 10):   # list pages 2 through 9
        getin_one('http://www.imdb.cn/imdb250/' + str(i), i)
    with open('All_infor.txt', 'w', encoding='utf-8') as x:
        for i in All_txt:
            x.write(i)
    Analyze_some_infor()

main()
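One detail worth calling out in the listing above: the year-extraction pattern must escape its parentheses, otherwise re treats them as an empty lazy capture group and extracts nothing useful. A quick offline check, with no network access needed (the sample heading is a made-up example in the style of the site's movie titles):

import re

sample = '肖申克的救赎(1994)'               # hypothetical heading in the site's format
pattern = re.compile(r'\(([\s\S]*?)\)')     # escaped parens, lazy capture of the year
print(pattern.findall(sample))              # -> ['1994']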
Author: LB919
Source: http://www.cnblogs.com/L1B0/
This article is LB919's original work and took real time and effort;
if you repost it, I'd be honored — please just note the source.