python requests 正则爬虫
Posted 石
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python requests 正则爬虫相关的知识,希望对你有一定的参考价值。
代码:
import json
import re
from multiprocessing import Pool

# Compiled once at import time instead of on every parse_one_page() call.
# Raw strings keep the backslash in ``\d`` intact; re.S lets ``.`` span the
# newlines inside each <dd>...</dd> entry.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)


def get_one_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a pool worker forever.

    Returns:
        The page text when the server answers with HTTP 200; ``None`` for an
        empty URL, any other status code, or a request-level failure.
    """
    if not url:
        return None

    # Imported lazily so the parsing and file-writing helpers can be imported
    # (and tested) without the third-party dependency installed.
    import requests
    from requests.exceptions import RequestException

    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Yield one dict per ``<dd>`` movie entry found in *html*.

    Args:
        html: Page source as returned by get_one_page(); may be ``None`` or
            empty when the download failed, in which case nothing is yielded.

    Yields:
        Dicts with the keys ``index``, ``image``, ``title``, ``actor``,
        ``time`` and ``score`` (all string values).
    """
    if not html:
        return
    for match in _MOVIE_PATTERN.finditer(html):
        index, image, title, star, release, integer, fraction = match.groups()
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Drops the first 3 characters -- presumably the "starring"
            # label in front of the names; verify against the live markup.
            'actor': star.strip()[3:],
            # Drops the first 5 characters -- presumably the "release time"
            # label in front of the date; verify against the live markup.
            'time': release.strip()[5:],
            # The score is split into integer and fraction parts in the page.
            'score': integer + fraction,
        }


def write_to_file(content, path='result.txt'):
    """Append *content* to *path* as a single JSON line.

    Args:
        content: JSON-serialisable object (here: one parsed movie dict).
        path: Destination file; opened in append mode with UTF-8 encoding.
    """
    with open(path, 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output;
        # the trailing newline puts each record on its own line.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Download, parse and store the board page starting at *offset*."""
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    # parse_one_page() yields nothing when the download failed (html is None).
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Fetch the ten pages (offsets 0, 10, ..., 90) in a pool of worker
    # processes; the context manager releases the workers once map() returns.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])