Python web crawler (14): scraping cnblogs (博客园) user profiles
Posted by bai2018
Notes
Only the code is given here; the approach and techniques are unchanged.
Code notes
Cookies are needed to bypass the login; the login session is simulated with Selenium driving Firefox, which requires the geckodriver executable to be installed. The scraped data is stored in SQLite, which also needs to be set up.
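The post does not show how cookies.json is produced, so here is a minimal sketch of one way to export it. The file name export_cookies.py, the sign-in URL, and the manual-login step are my assumptions and are not part of the original code; the output format matches what HtmlDownloader.py reads below.

# export_cookies.py -- hypothetical helper, not part of the original post.
# Log in by hand in the opened Firefox window, then press Enter in the terminal
# to dump the session cookies into cookies.json.
import json
from selenium import webdriver

driver = webdriver.Firefox()  # requires geckodriver on PATH
driver.get('https://passport.cnblogs.com/user/signin')  # assumed login page
input('Log in manually in the browser, then press Enter here...')
with open('cookies.json', 'w', encoding='utf-8') as f:
    json.dump(driver.get_cookies(), f, ensure_ascii=False, indent=2)
driver.quit()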
Spider.py
import HtmlDownloader
import HtmlParser
import DataOutput
import UrlManager
import re
from selenium import webdriver


class Spider(object):
    def __init__(self):
        self.downloader = HtmlDownloader.HtmlDownloader()
        self.parser = HtmlParser.HtmlParser()
        self.output = DataOutput.DataOutput()
        self.urlManager = UrlManager.UrlManager()
        self.driver = webdriver.Firefox()

    def crawl(self, root_url):
        # seed the url manager from the site front page
        content = self.downloader.download_root(root_url, self.driver)
        urls = self.parser.parser_url(content)
        self.urlManager.add_urls(urls)
        i = 0
        # stop once 2000 user pages have been visited
        while self.urlManager.new_urls_size() > 0 and self.urlManager.old_urls_size() < 2000:
            url = self.urlManager.get_new_url()
            i = i + 1
            print(str(i) + ':' + str(url))
            # the second '/.../' match in the blog url is the '/username/' segment;
            # the profile page lives at https://home.cnblogs.com/username/
            pattern = re.compile('/.*?/')
            user_name = re.findall(pattern, url)
            url = 'https://home.cnblogs.com' + user_name[1]
            content = self.downloader.download(self.driver, url)
            new_urls = self.parser.parser_url(content)
            self.urlManager.add_urls(new_urls)
            try:
                content = self.parser.parser_data(self.driver)
                self.output.store_data(content)
            except:
                i = i - 1
                print('error, url may not exist:' + self.driver.current_url)
        self.output.output_end()
        self.urlManager.save_status()
        # self.driver.close()
        print('end')


if __name__ == '__main__':
    spider = Spider()
    spider.crawl('https://www.cnblogs.com/')
UrlManager.py
import pickle
import hashlib


class UrlManager():
    def __init__(self):
        # crawl state is persisted with pickle: pending urls and md5 digests of visited urls
        self.new_urls = self.load_process('new_urls')
        self.old_urls = self.load_process('old_urls')

    def load_process(self, file_name):
        print('loading ' + file_name)
        try:
            with open(file_name, 'rb') as f:
                tmp = pickle.load(f)
                return tmp
        except:
            print('file may not exist, will create it')
            new_set = set()
            self.save_process(file_name, new_set)
            return new_set

    def save_process(self, file_name, data):
        with open(file_name, 'wb') as f:
            pickle.dump(data, f)

    def save_status(self):
        self.save_process('new_urls', self.new_urls)
        self.save_process('old_urls', self.old_urls)

    def add_urls(self, urls):
        for url in urls:
            # visited urls are stored as the middle 16 hex chars of their md5 digest
            m = hashlib.md5()
            m.update(url.encode('utf8'))
            url_md5 = m.hexdigest()[8:-8]
            if url not in self.new_urls and url_md5 not in self.old_urls:
                self.new_urls.add(url)

    def get_new_url(self):
        new_url = self.new_urls.pop()
        m = hashlib.md5()
        m.update(new_url.encode('utf8'))
        url_md5 = m.hexdigest()[8:-8]
        self.old_urls.add(url_md5)
        return new_url

    def new_urls_size(self):
        return len(self.new_urls)

    def old_urls_size(self):
        return len(self.old_urls)
HtmlParser.py
import re


class HtmlParser(object):
    def parser_url(self, content):
        # collect every user homepage link of the form https://www.cnblogs.com/<name>/
        pattern = re.compile(u'https://www.cnblogs.com/\w*/')
        all_urls = re.findall(pattern, content)
        all_urls = list(set(all_urls))
        return all_urls

    def parser_data(self, driver):
        dict = {}
        user_id = driver.find_element_by_class_name('display_name').text
        all_message = driver.find_element_by_class_name('user_profile').text
        all_message = all_message.split('\n')
        all_message.insert(0, '用户ID:' + user_id + '\n')
        # map the Chinese field labels on the profile page to database column names
        switch = {'用户ID': 'user_id',
                  '姓名': 'name',
                  '性别': 'sex',
                  '出生日期': 'birth_day',
                  '家乡': 'hometown',
                  '现居住地': 'live_place',
                  '单位': 'work_for',
                  '工作状况': 'job_status',
                  '感兴趣的技术': 'interest_technology',
                  '最近目标': 'recent_goal',
                  '座右铭': 'mark_words',
                  '自我介绍': 'introduce',
                  '园龄': 'blog_age',
                  '博客': 'blog_address',
                  '婚姻': 'marriage',
                  '职位': 'position',
                  'QQ': 'qq',
                  'Email': 'email'}
        key = ''
        value = ''
        for each in all_message:
            try:
                each = each.replace('\n', '')
                key = switch[each.split(':')[0]]
                value = each.split(':')[1]
                dict[key] = value
            except:
                # lines that are not in "label:value" form belong to the previous field,
                # so append them to the last value
                print('split error:' + each + ', auto fixed..')
                value = value + each
                dict[key] = value
        print(dict)
        return dict
HtmlDownloader.py
import json class HtmlDownloader(object): def download_root(self,url,driver): driver.get(url) with open(‘cookies.json‘, ‘r‘, encoding=‘utf-8‘) as f: listCookies = json.loads(f.read()) for cookie in listCookies: driver.add_cookie( ‘domain‘: cookie[‘domain‘], # 此处xxx.com前,需要带点 ‘name‘: cookie[‘name‘], ‘value‘: cookie[‘value‘] ) driver.refresh() return driver.page_source def download(self,driver,url): driver.get(url) return driver.page_source
DataOutput.py
import sqlite3


class DataOutput(object):
    def __init__(self):
        self.cx = sqlite3.connect("cnblog.db")
        self.table_name = 'cnblog'
        self.create_table()

    def create_table(self):
        values = '''
        id integer primary key autoincrement,
        user_id varchar(50) not null,
        name varchar(50),
        sex varchar(6),
        birth_day varchar(30),
        hometown varchar(50),
        live_place varchar(50),
        marriage varchar(20),
        position varchar(30),
        work_for varchar(50),
        job_status varchar(20),
        interest_technology varchar(200),
        recent_goal varchar(500),
        mark_words varchar(500),
        introduce varchar(500),
        blog_age varchar(30),
        blog_address varchar(100),
        qq varchar(15),
        email varchar(30)
        '''
        self.cx.execute('create table if not exists %s(%s)' % (self.table_name, values))

    def store_data(self, data):
        # the first key/value pair is always user_id: insert a new row for it,
        # then fill in the remaining columns with updates keyed on that user_id
        flag = 0
        user_id = ''
        for key, value in data.items():
            if flag == 0:
                cmd = "insert into %s (%s) values ('%s')" % (self.table_name, key, value)
                user_id = value
                flag = 1
            else:
                cmd = 'update %s set %s="%s" where user_id="%s"' % (self.table_name, key, value, user_id)
            self.cx.execute(cmd)
        self.cx.commit()

    def output_end(self):
        self.cx.close()
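Once a crawl has run, the results end up in cnblog.db. As a quick check (this snippet is my addition, not part of the original post), the stored profiles can be inspected with the standard sqlite3 module:

# inspect_db.py -- hypothetical helper for looking at the data written by DataOutput
import sqlite3

cx = sqlite3.connect('cnblog.db')
# print a few columns of the first ten scraped profiles
for row in cx.execute('select user_id, blog_age, blog_address from cnblog limit 10'):
    print(row)
print('total rows:', cx.execute('select count(*) from cnblog').fetchone()[0])
cx.close()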