Scraping News with a Web Crawler
After using the first version of the news crawler, the two biggest pain points were:
- the number of articles to fetch could not be configured
- it was far too slow
This version addresses both points: the number of articles to fetch per channel is read from user input, and the per-channel downloads run in multiple threads to speed up the crawl.
```python
# -*- coding:utf-8 -*-

import os
import re
import threading
import time
from urllib import request

from lxml import etree


def StringListSave(save_path, filename, slist):
    """Save a list of (title, url) pairs to a text file."""
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    path = save_path + "/" + filename + ".txt"
    with open(path, "w+", encoding='GB18030') as fp:
        for s in slist:
            fp.write("%s\t\t%s\n" % (s[0], s[1]))


def CellPage(save_path, filename, slist, num=50):
    """Save the HTML of each individual news article."""
    folder = save_path + '/' + filename
    print(folder)
    if not os.path.exists(folder):
        os.mkdir(folder)
    i = 0
    for item, url in slist:
        # Only keep `num` articles per channel
        if i >= num:
            break
        # Strip characters that are not allowed in Windows file names
        newitem = re.sub(r"[\/\\\:\*\?\"\<\>\|]", "", item)
        print(item)
        with open(folder + '/' + newitem + '.html', "w+", encoding='GB18030') as fp:
            PageContent = request.urlopen(url).read().decode("GB18030")
            fp.write("%s\n" % PageContent)
        i += 1


def Page_Info(myPage):
    """Extract (channel title, channel url) pairs from the ranking page with a regex."""
    mypage_Info = re.findall(
        r'<div class="titleBar" id=".*?"><h2>(.*?)</h2><div class="more"><a href="(.*?)">.*?</a></div></div>',
        myPage, re.S)
    return mypage_Info


def New_Page_Info(new_page):
    """Extract (title, url) pairs from a channel page; XPath here is faster than a regex."""
    dom = etree.HTML(new_page)
    new_items = dom.xpath('//tr/td/a/text()')
    new_urls = dom.xpath('//tr/td/a/@href')
    assert len(new_items) == len(new_urls)
    return list(zip(new_items, new_urls))


def Save(channel, i, save_path, num):
    """Download one channel page, save its link list, then save each article."""
    item, url = channel
    print("downloading ", url)
    new_page = request.urlopen(url).read().decode("GB18030")
    newPageResults = New_Page_Info(new_page)
    filename = str(i) + "_" + item
    StringListSave(save_path, filename, newPageResults)
    if num:  # the user entered a number
        CellPage(save_path, filename, newPageResults, num)
    else:    # empty input: fall back to the default of 50
        CellPage(save_path, filename, newPageResults)


def Spider(url):
    i = 0
    # Ask how many articles to fetch per channel (press Enter for the default of 50)
    num = input("请输入每个频道需要爬取的新闻数量(直接回车默认50):")
    # Debug setting
    # num = '2'
    if num.strip():
        num = int(num)
    print("downloading ", url)
    myPage = request.urlopen(url).read().decode("GB18030")
    myPageResults = Page_Info(myPage)
    ntime = time.strftime("%Y%m%d", time.localtime(time.time()))
    save_path = "news-" + ntime
    filename = str(i) + "_" + "Ranking"
    StringListSave(save_path, filename, myPageResults)
    i += 1

    channels = myPageResults
    threads = []
    indexes = range(len(channels))
    # Create one thread per channel
    for i in indexes:
        t = threading.Thread(target=Save, args=(channels[i], i, save_path, num))
        threads.append(t)
    # Start the threads, then wait for them all to finish
    for i in indexes:
        threads[i].start()
    for i in indexes:
        threads[i].join()


if __name__ == '__main__':
    print("start")
    start_url = "http://news.163.com/rank/"
    Spider(start_url)
    print("end")
```
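The script above starts one thread per channel with no upper bound, which is fine for a handful of channels but can hammer the site (or the local machine) if the ranking page grows. As a hedged alternative, not part of the original post, the same fan-out can be done with `concurrent.futures.ThreadPoolExecutor` to cap concurrency; the sketch below assumes the `Save()` function, `save_path`, and `num` defined above, and the `spider_pooled` name and `max_workers=8` value are illustrative choices.

```python
# Minimal sketch: bounded-concurrency version of the per-channel fan-out.
# Assumes Save(channel, i, save_path, num) from the script above; the
# max_workers value is an arbitrary illustrative choice.
from concurrent.futures import ThreadPoolExecutor, as_completed

def spider_pooled(channels, save_path, num, max_workers=8):
    # Submit one task per channel, but let the pool cap concurrency at
    # max_workers instead of starting len(channels) threads at once.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(Save, channel, i, save_path, num)
                   for i, channel in enumerate(channels)]
        for future in as_completed(futures):
            # Re-raise any exception from a worker; the bare threading.Thread
            # version would let a failed channel pass silently.
            future.result()
```

A pool also makes it easy to tune politeness: lowering `max_workers` throttles the crawl without touching the rest of the code.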