Complete Crawler Steps (Advanced)
Posted by 542684416-qq
This post walks through a complete, more advanced crawler in Python: it parses robots.txt, rotates User-Agent headers and proxies, retries failed downloads, throttles requests per domain, caches downloaded pages in MongoDB, and follows extracted links up to a fixed crawl depth.
import random
import requests
from fake_useragent import UserAgent
from retrying import retry
import hashlib                        # message digest (md5)
import queue                          # queue for the crawl frontier
import re                             # regular expressions
from urllib import robotparser        # parse the site's robots.txt file
from urllib.parse import urlparse, urljoin, urldefrag   # URL parsing
from threading import Thread          # multithreading
from datetime import datetime
import time
from day03 import mongo_cache         # project-local MongoDB cache module

MAX_DEP = 2  # maximum crawl depth


def get_robots(url):
    """Parse the site's robots.txt file."""
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))  # absolute path so the file is resolved from the site root
    rp.read()
    return rp


def save_url(html_content, url_str):
    """Save the downloaded content to disk."""
    md5 = hashlib.md5()
    md5.update(html_content)
    # file_path = './download/' + md5.hexdigest() + '.html'
    file_path = './download/' + gen_html_name(url_str) + '.html'
    with open(file_path, 'wb') as f:
        f.write(html_content)


def gen_html_name(url_str):
    """Derive a file name from the last path segment of the URL."""
    path = urlparse(url_str).path
    path_array = path.split('/')
    return path_array[len(path_array) - 1]


def extractor_url_lists(html_content):
    """Extract the other links contained in a page."""
    url_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return url_regex.findall(html_content)


class CrawlerCommon(Thread):
    """A general-purpose crawler covering the basic crawling features plus a few anti-anti-crawling techniques."""

    def __init__(self, init_url):
        super(CrawlerCommon, self).__init__()
        __ua = UserAgent()                          # random User-Agent generator
        self.seed_url = init_url                    # seed URL to start crawling from
        self.crawler_queue = queue.Queue()          # a FIFO queue gives BFS, a LIFO queue gives DFS
        self.crawler_queue.put(init_url)            # put the seed URL into the queue
        self.visited = {init_url: 0}                # the seed URL starts at depth 0
        self.rp = get_robots(init_url)              # robots.txt parser
        self.headers = {'User-Agent': __ua.random}  # pick a random User-Agent
        self.link_regex = '(index|view)'            # filter for which links to follow
        self.throttle = Throttle(2.0)               # throttle: wait 2 seconds between requests to a domain
        self.mcache = mongo_cache.MongoCache()      # MongoDB cache

    @retry(stop_max_attempt_number=3)
    def retry_download(self, url_str, data, method, proxies):
        """Download with retries (via the retrying decorator)."""
        if method == 'POST':
            result = requests.post(url_str, data=data, headers=self.headers, proxies=proxies)
        else:
            result = requests.get(url_str, headers=self.headers, timeout=3, proxies=proxies)
        assert result.status_code == 200  # raise (and trigger a retry) unless the status code is 200
        return result.content

    def download(self, url_str, data=None, method='GET', proxies={}):
        """The actual download method."""
        print('download url is :::::', url_str)
        try:
            # pick a random proxy (these sample addresses are almost certainly stale)
            ip = random.choice(['27.155.84.233:8081', '61.135.217.7:80',
                                '183.47.40.35:8088', '123.244.148.5:60230'])
            proxies = {'http': ip}
            print(proxies)
            result = self.retry_download(url_str, data, method, proxies)
        except Exception as e:
            print(e)
            result = None
        return result

    def nomalize(self, url_str):
        """Strip the fragment and resolve the link against the seed URL."""
        real_url, _ = urldefrag(url_str)
        return urljoin(self.seed_url, real_url)

    def save_result(self, html_content, url_str):
        """Store the result in the database, checking first whether the content already exists.

        :param html_content: downloaded binary content
        :param url_str: URL of the downloaded page
        :return: None
        """
        if url_str not in self.mcache:
            self.mcache[url_str] = html_content
        else:
            data_from_mongo = self.mcache[url_str]
            # initialize the md5 digests
            md5_func_mongo = hashlib.md5()
            md5_func_download = hashlib.md5()
            # digest of the record already in the database
            md5_func_mongo.update(data_from_mongo)
            mongo_md5_str = md5_func_mongo.hexdigest()
            # digest of the freshly downloaded data
            md5_func_download.update(html_content)
            download_md5_str = md5_func_download.hexdigest()
            # only overwrite the database record if the content differs
            if download_md5_str != mongo_md5_str:
                self.mcache[url_str] = html_content

    def run(self):
        """Main crawl loop."""
        while not self.crawler_queue.empty():
            url_str = self.crawler_queue.get()
            # check the robots.txt rules
            if self.rp.can_fetch(self.headers['User-Agent'], url_str):
                self.throttle.wait(url_str)
                depth = self.visited[url_str]
                if depth < MAX_DEP:
                    # download the page
                    html_content = self.download(url_str)
                    # store the page
                    if html_content is not None:
                        self.mcache[url_str] = html_content
                        save_url(html_content, url_str)
                        # extract every link on the page
                        url_list = extractor_url_lists(html_content.decode('utf-8'))
                        # keep only the links we want to crawl
                        filter_urls = [link for link in url_list if re.search('/(css3)', link)]
                        for url in filter_urls:
                            # resolve the link to an absolute URL
                            real_url = self.nomalize(url)
                            # skip links we have already visited
                            if real_url not in self.visited:
                                self.visited[real_url] = depth + 1
                                self.crawler_queue.put(real_url)
            else:
                print('blocked by robots.txt:', url_str)


class Throttle(object):
    """Download throttle: enforce a minimum delay between requests to the same domain."""

    def __init__(self, delay):
        self.domains = {}
        self.delay = delay

    def wait(self, url_str):
        domain = urlparse(url_str).netloc      # domain part (netloc) of the URL
        last_down = self.domains.get(domain)   # time of the last download from this domain
        if self.delay > 0 and last_down is not None:
            # subtract the time elapsed since the last download from the delay;
            # if the result is positive, sleep for that long, otherwise download immediately
            sleep_sec = self.delay - (datetime.now() - last_down).total_seconds()
            if sleep_sec > 0:
                time.sleep(sleep_sec)
        self.domains[domain] = datetime.now()  # record the current time for this domain


if __name__ == '__main__':
    crawler = CrawlerCommon('http://www.runoob.com/css3/css3-tutorial.html')
    crawler.run()
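The script imports mongo_cache from a local day03 package that is not included in the post. Based purely on how it is used above (membership tests, item reads, and item writes keyed by URL), here is a minimal sketch of what that module might look like; the database and collection names are assumptions for illustration, not part of the original code.

# day03/mongo_cache.py -- minimal sketch of the cache module assumed by the crawler.
# It only needs to support `url in cache`, `cache[url]`, and `cache[url] = content`.
from datetime import datetime

from pymongo import MongoClient


class MongoCache(object):
    def __init__(self, client=None):
        # database/collection names below are assumptions, not from the original post
        self.client = client or MongoClient('localhost', 27017)
        self.db = self.client.cache
        self.collection = self.db.webpage

    def __contains__(self, url):
        return self.collection.find_one({'_id': url}) is not None

    def __getitem__(self, url):
        record = self.collection.find_one({'_id': url})
        if record is None:
            raise KeyError(url + ' does not exist')
        return record['result']

    def __setitem__(self, url, result):
        record = {'result': result, 'timestamp': datetime.utcnow()}
        self.collection.update_one({'_id': url}, {'$set': record}, upsert=True)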
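As the constructor comment notes, the choice of queue decides the traversal order: queue.Queue (FIFO) makes the crawler breadth-first, while queue.LifoQueue (LIFO) makes it depth-first. A tiny standalone illustration of the difference, independent of the crawler itself:

import queue

# FIFO: items come out in insertion order -> breadth-first frontier
bfs_frontier = queue.Queue()
for link in ['a', 'b', 'c']:
    bfs_frontier.put(link)
print([bfs_frontier.get() for _ in range(3)])   # ['a', 'b', 'c']

# LIFO: the most recently added item comes out first -> depth-first frontier
dfs_frontier = queue.LifoQueue()
for link in ['a', 'b', 'c']:
    dfs_frontier.put(link)
print([dfs_frontier.get() for _ in range(3)])   # ['c', 'b', 'a']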