A Simple Python Crawler
A basic crawler needs the following components:
- URL manager, which keeps track of the URLs that have already been crawled and those that have not
- Page downloader
- Content parser, which extracts the valuable information from each page, plus any new URLs
- Content collector
URL manager implementation
The URL manager exposes methods for getting and adding URLs: on one hand it supplies the downloader with links to fetch; on the other, it stores the URLs found by the parser, replenishing its own pool of URLs.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
URL manager: keeps track of crawled and not-yet-crawled URLs.
'''

__author__ = 'wlong.yi@gmail.com'


class UrlManager(object):
    """Maintains a set of new (uncrawled) URLs and a set of old (crawled) URLs."""

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_url(self, url):
        # Ignore empty input and URLs we have already seen.
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_url(url)

    def get_new_url(self):
        # Pop an uncrawled URL and move it to the crawled set.
        if len(self.new_urls) == 0:
            return None
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def has_new_url(self):
        return len(self.new_urls) > 0
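A quick interactive check of the manager's behaviour (a minimal sketch; the URLs below are just example links):
# Minimal usage sketch for UrlManager (the URLs are arbitrary examples).
um = UrlManager()
um.add_url("http://baike.baidu.com/view/21087.htm")
um.add_urls(["http://baike.baidu.com/view/20965.htm",
             "http://baike.baidu.com/view/21087.htm"])  # duplicate is ignored
print um.has_new_url()   # True
print um.get_new_url()   # one of the two stored URLs, now marked as crawled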
Page downloader
Fetches the page content for a given URL; urllib2 (Python 2) is used here.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Downloads the content of a web page.
'''

__author__ = 'wlong.yi@gmail.com'

import urllib2


class HtmlDownloader(object):
    """Fetches a page over HTTP and returns its body, or None on failure."""

    def __init__(self):
        pass

    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        # Only accept a successful response.
        if response.getcode() != 200:
            return None
        return response.read()
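urllib2 only exists on Python 2. If you are on Python 3, a rough equivalent can be sketched with urllib.request (this port is my own assumption, not part of the original article):
# Rough Python 3 equivalent of HtmlDownloader, using urllib.request.
import urllib.request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()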
Content parser
Parses the page content according to the given rules, extracting both the valuable data and new URLs; here it parses the title and summary of Baidu Baike entries.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
HTML parser: extracts new URLs and the valuable data from a page.
'''

__author__ = 'wlong.yi@gmail.com'

from bs4 import BeautifulSoup
import re


class HtmlParser(object):
    """Parses Baidu Baike pages for entry links, titles and summaries."""

    def __init__(self):
        pass

    def _get_urls(self, page_url, soup):
        new_urls = set()
        # Entry links look like: <a target="_blank" href="/view/20965.htm">自由软件</a>
        url_nodes = soup.find_all('a', href=re.compile(r'/view/\d+\.htm'))
        for node in url_nodes:
            url = node['href']
            # The href is relative, so prepend the site root.
            full_url = "http://baike.baidu.com" + url
            new_urls.add(full_url)
        return new_urls

    def _get_data(self, page_url, soup):
        data = {}
        data['page_url'] = page_url
        # Entry title, e.g. "Python".
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        data['title'] = title_node.get_text()
        # Entry summary paragraph.
        summary_node = soup.find('div', class_='lemma-summary')
        data['summary'] = summary_node.get_text()
        return data

    def parse(self, page_url, html_content):
        if html_content is None or len(html_content) == 0:
            return None
        soup = BeautifulSoup(html_content, 'html.parser', from_encoding='utf-8')
        urls = self._get_urls(page_url, soup)
        data = self._get_data(page_url, soup)
        return urls, data
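The parser can be exercised on its own against a hand-written snippet (a minimal sketch; the fragment below only mimics Baidu Baike's markup and is not a real page):
# Standalone test of HtmlParser on an illustrative HTML fragment.
html = '''
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary">Python is a programming language.</div>
<a target="_blank" href="/view/20965.htm">free software</a>
'''
hp = HtmlParser()
urls, data = hp.parse("http://baike.baidu.com/view/21087.htm", html)
print urls             # set(['http://baike.baidu.com/view/20965.htm'])
print data['title']    # Python
print data['summary']  # Python is a programming language.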
Content collector
Collects the valuable data produced by the parser and writes it out.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Collects the parsed data and writes it out.
'''

__author__ = 'wlong.yi@gmail.com'


class HtmlOutputer(object):
    """Accumulates the parsed records and dumps them to a result file."""

    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        # Skip empty records so print_data never hits a None entry.
        if data is not None:
            self.datas.append(data)

    def print_data(self):
        fout = open('result', 'w+')
        for data in self.datas:
            fout.write("Title: %s\nPageUrl: %s\nSummary: %s\n\n" % (
                data['title'].encode("utf-8"),
                data['page_url'].encode("utf-8"),
                data['summary'].encode("utf-8")))
        fout.close()
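A minimal usage sketch of the collector (the record below is made up for illustration):
# Collect one record and dump it to the "result" file.
ho = HtmlOutputer()
ho.collect_data({'page_url': u'http://baike.baidu.com/view/21087.htm',
                 'title': u'Python',
                 'summary': u'Python is a programming language.'})
ho.print_data()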
Test run
Starting from the Baidu Baike entry for Python as the root page, crawl the related entries.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Crawler entry point.
'''

__author__ = 'wlong.yi@gmail.com'

from html_downloader import HtmlDownloader
from url_manager import UrlManager
from html_parser import HtmlParser
from html_outputer import HtmlOutputer

um = UrlManager()
hd = HtmlDownloader()
hp = HtmlParser()
ho = HtmlOutputer()

# Seed the crawl with the Baidu Baike entry for Python.
um.add_url("http://baike.baidu.com/view/21087.htm")

count = 1
while um.has_new_url():
    try:
        page_url = um.get_new_url()
        print "[%d] reading... [%s]" % (count, page_url)
        res = hd.download(page_url)
        urls, data = hp.parse(page_url, res)
        ho.collect_data(data)
        um.add_urls(urls)
        count = count + 1
        # Stop after 10 pages so the test run stays small.
        if count > 10:
            break
    except Exception:
        # A failed page should not abort the whole crawl.
        print "[%d] reading... [%s], failed." % (count, page_url)
        continue

ho.print_data()
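Assuming the snippets above are saved as url_manager.py, html_downloader.py, html_parser.py and html_outputer.py (the module names are implied by the imports) and the driver as, say, spider_main.py (my own choice of name), the crawl is started with `python spider_main.py` under Python 2; the collected titles and summaries end up in a file named `result`.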