爬虫初识(爬取dytt电影列表及下载地址)
Posted zhoushibin-1
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫初识(爬取dytt电影列表及下载地址)相关的知识,希望对你有一定的参考价值。
"""Scrape movie details and download links from dytt8.net list pages.

For each list page the script follows the first movie link it finds,
extracts the translated title, original title, director, cast and download
address from the detail page, prints each record and appends it to the
file ``move_list`` (one record per line).
"""
import re
from typing import Dict, Iterator, Optional
from urllib.request import urlopen

BASE_URL = "http://www.dytt8.net"
LIST_URL_TEMPLATE = "http://www.dytt8.net/html/gndy/dyzz/list_23_%s.html"
OUTPUT_PATH = "move_list"

# Link to a movie detail page inside a list page.
_LIST_LINK_RE = re.compile(
    r'<td height="26">.*?<b>.*?<a href="(?P<url_name>.*?)" class="ulink">.*?',
    re.S,
)

# Labelled fields of a detail page, up to the first download link.
_DETAIL_RE = re.compile(
    r'<div id="Zoom">.*?译.*?名(?P<name>.*?)<br />◎片.*?名(?P<pianname>.*?)<br />'
    r'.*?◎导.*?演(?P<daoyan>.*?)<br />'
    r'◎主.*?演(?P<zhuyan>.*?)<br /><br />◎简.*?介.*?<td.*?>'
    r'<a href="(?P<xiazaidizhi>.*?)">',
    re.S,
)

# The pages pad their field labels and values with ideographic spaces.
_IDEOGRAPHIC_SPACE = "\u3000"


def getPage(url):
    """Download ``url`` and return its body decoded as GBK text.

    Bytes that are not valid GBK are dropped (``errors='ignore'``).
    The HTTP response is closed before returning.
    """
    with urlopen(url) as response:
        return response.read().decode('gbk', errors='ignore')


def parsePage(s) -> Optional[str]:
    """Return the absolute URL of the first movie link in list page ``s``.

    Returns ``None`` when the page contains no matching link.
    """
    match = _LIST_LINK_RE.search(s)
    if match is None:
        return None
    return BASE_URL + match.group("url_name")


def _strip_padding(text: str) -> str:
    """Remove ideographic spaces (U+3000) from ``text``."""
    return text.replace(_IDEOGRAPHIC_SPACE, "")


def parsePage1(s) -> Iterator[Dict[str, str]]:
    """Yield one dict per movie record found in detail page ``s``.

    Keys: ``yiming`` (translated title), ``pianming`` (original title),
    ``daoyan`` (director), ``zhuyan`` (cast), ``xiazaidizhi`` (download
    address). Only U+3000 padding is removed from the values; every other
    character, including the digits in download URLs, is preserved.
    """
    for match in _DETAIL_RE.finditer(s):
        yield {
            "yiming": _strip_padding(match.group('name')),
            "pianming": _strip_padding(match.group("pianname")),
            "daoyan": _strip_padding(match.group("daoyan")),
            "zhuyan": _strip_padding(match.group("zhuyan")),
            "xiazaidizhi": _strip_padding(match.group("xiazaidizhi")),
        }


def main(num):
    """Scrape list page number ``num`` and append its records to ``move_list``.

    Only the first movie linked from the list page is followed, because
    ``parsePage`` returns a single URL. A list page without a recognisable
    movie link is skipped instead of raising.
    """
    list_html = getPage(LIST_URL_TEMPLATE % num)
    detail_url = parsePage(list_html)
    if detail_url is None:
        return

    detail_html = getPage(detail_url)
    # The context manager closes the file even if writing fails part-way.
    with open(OUTPUT_PATH, "a", encoding="utf8") as f:
        for obj in parsePage1(detail_html):
            print(obj)
            f.write(str(obj) + "\n")


if __name__ == "__main__":
    # Guarded so that importing the module does not start the crawl.
    for i in range(1, 181):
        main(i)
以上是关于爬虫初识(爬取dytt电影列表及下载地址)的主要内容,如果未能解决你的问题,请参考以下文章