A Python crawler for 简书 (Jianshu) users
This post walks through a small Python crawler for 简书 (Jianshu) user profiles: starting from a seed user, it pages through each user's "following" list, writes every discovered user's stats to a CSV file, and then expands the crawl to the newly discovered users.
# python
# -*- coding: utf-8 -*-
"""
__title__ = ''
__author__ = 'wlc'
__mtime__ = '2017/10/15'
"""
import re
import time
import math
import csv
import requests
from bs4 import BeautifulSoup
from collections import deque

# Open a CSV file to store the collected user info
path = 'dataCollection/userInfo.csv'
csvFile = open(path, 'a+', newline='', encoding='utf-8')
writer = csv.writer(csvFile)
writer.writerow(('id', 'name', 'following', 'follower', 'article', 'word', 'like'))

# Global set of user ids that have already been crawled
idContainer = set()
# Double-ended queue of (userId, followingCount) pairs still to visit
linkDeque = deque()

class jianshu(object):
    def __init__(self):
        # URL template for a user's "following" list
        self.url = 'http://www.jianshu.com/users/{userId}/following?page={page}'
        # Pattern for a followed user's id and name
        self.idPattern = re.compile(r'<a class="name" href="/u/(.*?)">(.*?)</a>')
        # Patterns for the following / follower / article counts and the word / like counts
        self.metalPattern = re.compile(r'<span>关注 (\d+)</span><span>粉丝 (\d+)</span><span>文章 (\d+)</span>')
        self.meta = re.compile(r'写了 (\d+) 字,获得了 (\d+) 个喜欢')
        # Masquerade as a browser
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}

    def createRequest(self, userId, page):
        # Fetch one page of the user's "following" list
        url = self.url.format(userId=userId, page=page)
        return requests.get(url, headers=self.header).text

    def pageResponse(self, requ):
        # Extract the <ul class="user-list"> entries, one per followed user
        bsObj = BeautifulSoup(requ, 'lxml')
        userContainer = bsObj.find_all('ul', {'class': 'user-list'})[0]
        return [str(user) for user in userContainer.contents if user != '\n']

    def parserUserInfo(self, user):
        # Parse one user entry and append it to the CSV file
        id, name = re.findall(self.idPattern, user)[0]
        followingNum, followerNum, articleNum = re.findall(self.metalPattern, user)[0]
        # Users who have written nothing have no word/like counts, which
        # would raise an IndexError here, so fall back to zero
        try:
            wordNum, likeNum = re.findall(self.meta, user)[0]
        except IndexError:
            wordNum, likeNum = 0, 0
        content = (id, name, followingNum, followerNum, articleNum, wordNum, likeNum)
        writer.writerow(content)
        return content

    def getUserList(self, userId, following):
        idContainer.add(userId)
        # Jianshu lists 10 users per "following" page
        page = math.ceil(int(following) / 10)
        for pg in range(1, page + 1):
            requ = self.createRequest(userId, pg)
            for user in self.pageResponse(requ):
                content = self.parserUserInfo(user)
                linkDeque.append((content[0], content[2]))
            time.sleep(1)
        # Pop queued users rather than iterating the deque directly:
        # mutating a deque while iterating it raises a RuntimeError
        # (see the iterative sketch below for a flatter alternative)
        while linkDeque:
            deq = linkDeque.popleft()
            if deq[0] not in idContainer:
                self.getUserList(deq[0], deq[1])

jianshu().getUserList('652fbdd1e7b3', 162)
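One caveat with the script as written: getUserList hands control to itself for every unvisited user it pops off the queue, so the call stack grows with the length of the discovery chain and a long crawl can hit Python's recursion limit. Below is a minimal sketch of a flatter, queue-driven alternative. It reuses the jianshu class and the linkDeque/idContainer globals from the script above; crawl is a helper name introduced here for illustration, not part of the original code, and it assumes Jianshu still serves 10 users per "following" page.

# A hypothetical iterative driver (a sketch, not the original author's code):
# one flat loop consumes the deque, so no recursion is needed.
def crawl(seedId, seedFollowing):
    spider = jianshu()
    linkDeque.append((seedId, seedFollowing))
    while linkDeque:
        userId, following = linkDeque.popleft()
        if userId in idContainer:
            continue  # already written to the CSV, skip
        idContainer.add(userId)
        for pg in range(1, math.ceil(int(following) / 10) + 1):
            for user in spider.pageResponse(spider.createRequest(userId, pg)):
                content = spider.parserUserInfo(user)
                linkDeque.append((content[0], content[2]))
            time.sleep(1)

# Called in place of the recursive entry point at the bottom of the script:
crawl('652fbdd1e7b3', 162)

Because new users are appended to the back of the deque and taken from the front, this visits users in discovery order (breadth-first), and the only state that grows is the queue itself rather than the call stack.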