Python Crawler: Scraping cnblogs (博客园) Posts and Saving Them Locally


Crawl all posts of a given cnblogs user, clean up the markup, and save everything locally.

First, define the crawler's module files (the package layout they are assumed to live in is sketched after the list):

  1. crawlers_main.py: the entry point
  2. url_manager.py: the URL manager
  3. download_manager.py: the download module
  4. parser_manager.py: the HTML parser (pulls out the content we need from each page)
  5. output_manager.py: writes out the complete HTML page and all of its assets (CSS, PNG, JS, etc.)
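
The entry script imports these modules from a com.crawlers package, so the files are assumed to live in a layout like the following (the __init__.py files are empty; this is only the structure the imports below expect, not something stated in the original post):

com/
    __init__.py
    crawlers/
        __init__.py
        crawlers_main.py
        url_manager.py
        download_manager.py
        parser_manager.py
        output_manager.py
output/          # destination for the saved pages and assets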


 

crawlers_main.py: the entry point

# coding:utf8
from com.crawlers import download_manager
from com.crawlers import output_manager
from com.crawlers import parser_manager
from com.crawlers import url_manager


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = download_manager.DownloadManager()
        self.parser = parser_manager.ParserManager()
        self.output = output_manager.OutputManager()

    def craw(self, root_url):
        # Fetch the blog index page and collect the per-post URLs from it
        html_root = self.downloader.download(root_url)
        new_urls = self.parser.parseUrls(root_url, html_root)
        self.urls.add_new_urls(new_urls)
        count = 1
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_data = self.parser.parse(new_url, html_cont)
                self.output.collect_data(new_data)
                if count == 1000:
                    break
                count += 1
            except:
                print('craw failed')

        self.output.output_html()


if __name__ == "__main__":
    root_url = "http://www.cnblogs.com/zhuyuliang/"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
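
With the modules in place, the crawl is started from the project root; each fetched post is echoed by the print call inside craw. The invocation and sample output below are illustrative (the -m form assumes the package layout sketched above):

python -m com.crawlers.crawlers_main
craw 1 : http://www.cnblogs.com/zhuyuliang/p/5218635.html
craw 2 : ...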

url_manager.py: the URL manager

# coding:utf8
class UrlManager(object):

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
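
A minimal sanity check of the de-duplication logic, assuming the class is importable from the com.crawlers package as above:

# coding:utf8
from com.crawlers.url_manager import UrlManager

manager = UrlManager()
# The same URL added twice is only queued once
manager.add_new_url("http://www.cnblogs.com/zhuyuliang/p/5218635.html")
manager.add_new_url("http://www.cnblogs.com/zhuyuliang/p/5218635.html")
print manager.has_new_url()          # True
url = manager.get_new_url()          # moves the URL into old_urls
manager.add_new_url(url)             # already seen, so it is ignored
print manager.has_new_url()          # False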

download_manager.py: the download module

# coding:utf8
import urllib2


class DownloadManager(object):

    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.code != 200:
            return None
        return response.read()
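
cnblogs can reject the default urllib2 User-Agent, and a stalled connection would hang the whole crawl. A hedged variant of download that sends a browser-like header and a timeout (the header string and the 10-second timeout are my own choices, not from the original post):

# coding:utf8
import urllib2


class DownloadManager(object):

    def download(self, url):
        if url is None:
            return None
        # Present a browser-like User-Agent and give up after 10 seconds
        request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        try:
            response = urllib2.urlopen(request, timeout=10)
        except urllib2.URLError:
            return None
        if response.code != 200:
            return None
        return response.read()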

parser_manager.py: the HTML parser (pulls out the content we need from each page)

# coding:utf8
import re
import sys
import urlparse
from HTMLParser import HTMLParser

from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf-8')


class ParserManager(HTMLParser):

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []   # src/href of the assets referenced by the page

    def handle_starttag(self, tag, attrs):
        # Collect every asset URL: <img>/<script> src or href, plus stylesheet <link>s
        if tag == "img" or tag == "script":
            for (variable, value) in attrs:
                if variable == "src" or variable == "href":
                    self.links.append(value)
        if tag == "link":
            dic = dict(attrs)
            if dic.get("rel") == "stylesheet":
                self.links.append(dic["href"])

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, "html.parser", from_encoding="utf-8")
        new_data = self._get_new_data(page_url, soup)
        return new_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # e.g. href="http://www.cnblogs.com/zhuyuliang/p/5218635.html"
        links = soup.find_all("a", href=re.compile(r"http://www.cnblogs.com/zhuyuliang/p/..."))
        for link in links:
            new_url = link["href"]
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data["url"] = page_url

        # <a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/zhuyuliang/p/5218635.html">Android开发代码规范</a>
        title_node = soup.find("a", class_="postTitle2")
        res_data["title"] = title_node.get_text()

        # The post body lives in <div class="post">
        summary_node = soup.find("div", class_="post")
        res_data["summary"] = summary_node

        # Replace the original <body> with just the post content, keeping the rest of the page as a template
        new_tag = soup.new_tag("body")
        new_tag.string = summary_node.encode("utf-8")
        soup.body.replace_with(new_tag)
        res_data["template"] = soup

        return res_data

    def parseUrls(self, root_url, html_cont):
        soup = BeautifulSoup(html_cont, "html.parser", from_encoding="utf-8")
        new_urls = self._get_new_urls(root_url, soup)
        return new_urls
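
A small, self-contained way to exercise the parser against a page that has already been saved to disk (the file name list_page.html is only an example):

# coding:utf8
from com.crawlers.parser_manager import ParserManager

parser = ParserManager()
root_url = 'http://www.cnblogs.com/zhuyuliang/'
html_cont = open('list_page.html').read()   # a saved copy of the blog's index page

# parseUrls returns the set of per-post URLs matched by the regex in _get_new_urls
for url in parser.parseUrls(root_url, html_cont):
    print url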

output_manager.py: writes out the complete HTML page and all of its assets (CSS, PNG, JS, etc.)

# -*- coding:utf-8 -*-
import os
import urllib

from com.crawlers.parser_manager import ParserManager


class OutputManager(object):

    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        for data in self.datas:
            # Save the bare post content as a .txt file
            fout = open('output/%s.txt' % data['title'].encode('utf-8'), 'w')
            fout.write("%s" % data['summary'].encode('utf-8'))
            fout.close()

            url = data['url'].encode('utf-8')
            pagename = data['title'].encode('utf-8')
            # Run the page template through ParserManager to collect its asset links
            hp = ParserManager()
            html_code = data['template'].encode('utf-8')
            html_code = hp.unescape(html_code)
            hp.feed(html_code)
            hp.close()
            durl = url.rsplit('/', 1)[0]
            self.download(pagename, html_code, durl, hp.links)

    def download(self, pagename, html_code, durl, links):
        if not os.path.exists('output/' + pagename + '_files'):
            os.mkdir('output/' + pagename + '_files')
            for link in links:
                fname = link.split('/')[-1]
                fname = fname.split('?')[0]
                localpath = '%s%s' % ('output/' + pagename + '_files/', fname)
                replacelocalpath = '%s%s' % (pagename + '_files/', fname)
                try:
                    # Fetch each asset, then point the page at the local copy
                    urllib.urlretrieve("http://www.cnblogs.com" + link, localpath)
                except Exception, error:
                    print 'download error:', error
                else:
                    print 'download ' + fname
                    html_code = html_code.replace(link, replacelocalpath)
                open('output/' + pagename + '.html', 'w').write(html_code)
        return True
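
Note that output_html writes output/<title>.txt and output/<title>.html, but nothing in the module creates the output/ directory itself, so it has to exist before the crawl runs. A minimal guard, placed at the top of output_html (the placement is my own suggestion):

        # Create the destination directory on first use
        if not os.path.exists('output'):
            os.mkdir('output')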

 

Finally, the output: each crawled post ends up under the output/ directory as <title>.txt (the bare post content), <title>.html (the rewritten page), and a <title>_files/ folder holding its downloaded assets.

 

>End
