python 使用 pycurl 以 cURL 方式抓取 Fitbit 社区组,使用 BeautifulSoup 解析页面,发送给 Redis(写入/获取数据以创建 Fitbit 仪表板)

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了“python 使用 pycurl 以 cURL 方式抓取 Fitbit 社区组,使用 BeautifulSoup 解析页面,发送给 Redis(写入/获取数据以创建 Fitbit 仪表板)”相关的知识,希望对你有一定的参考价值。

#!/usr/bin/python
from bs4 import BeautifulSoup
import pycurl
import re
import os
from urllib import urlencode
from io import BytesIO
from StringIO import StringIO
import sys
import redis
import time


class getFitbitData:

	cookieDir	= './fbcookie.txt'	#where we're storing our cookies

	#Config Redis server we're connecting to
	pool = redis.ConnectionPool( host='0.0.0.0', port=6379,password='ifneeded',db=12 )
	redisServer = redis.Redis( connection_pool=pool )
	pipe = redisServer.pipeline()

	# 2D array group name used in Redis key, and groups fitbit URL
	groups = [["XXXX","https://www.fitbit.com/group/XXXXXX"],["XXXX","https://www.fitbit.com/group/XXXXXX"]]

	date = time.strftime("%Y-%m-%d")

	def __init__( self ):
		#Where we store our Unity Crap
		self.password =	'yourpass'
		self.user = 'youruser'

	def getHTML( self, groupURL, page ):
		print "authenticate"
		buffer = BytesIO()
		c = pycurl.Curl()
		c.setopt(c.SSL_VERIFYPEER, False)
		c.setopt(c.FOLLOWLOCATION, True)
		c.setopt(c.TIMEOUT, 60)
		c.setopt(c.USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0')
		c.setopt(c.URL, 'https://www.fitbit.com/login')
		c.setopt(c.WRITEFUNCTION, buffer.write)
		c.perform()
		html = str(buffer.getvalue())


		#Get hidden values for post
		if "_sourcePage" in html:
			rex = re.compile( "input type=\"hidden\" name=\"_sourcePage\" value=\"(.*?)\"")
			sourcepage = rex.search( html ).groups()[0]

		if "__fp" in html:
			rex = re.compile( "input type=\"hidden\" name=\"__fp\" value=\"(.*)\"")
			fp = rex.search( html ).groups()[0]

		datastuff = {'login':'Log In','disableThirdPartyLogin':'false','email':self.user,'password':self.password,'rememberMe':'true'}

		#post datastuff
		c.setopt(c.USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0')
		c.setopt(c.URL, 'https://www.fitbit.com/login' )
		c.setopt(c.COOKIEJAR, self.cookieDir)
		c.setopt(c.COOKIEFILE, self.cookieDir )
		c.setopt(c.WRITEFUNCTION, buffer.write)
		c.setopt(c.FOLLOWLOCATION, True)
		c.setopt(c.POST, True)
		c.setopt(c.POSTFIELDS, urlencode( datastuff ))
		c.perform()

		buffer.flush()
		buffer = BytesIO()
		c.setopt(c.USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0')
		c.setopt(c.URL, groupURL+'/leaders?timeWindow=CURRENT_MONTH&page='+page)
		c.setopt(c.COOKIEJAR, self.cookieDir)
		c.setopt(c.COOKIEFILE, self.cookieDir )
		c.setopt(c.WRITEFUNCTION, buffer.write)
		c.perform()
		html = str(buffer.getvalue())
		return html
		# c.close()

	def parseHTML( self, html, group ):
		soup = BeautifulSoup(html, "html.parser")
		count = 0

		# find all a hrefs with class formlink
		for leftCell in soup.find_all("div", {"class": "leaderboardCell left"}):
			for mylink in leftCell.find_all("div", {"class": "info"}):
				for link in mylink.find_all("a"):
					name = (link.get_text())
				for link in mylink.find_all("li", {"class": "stat ellipsis"}):
					t = (link.get_text())
					t = "".join(t.split())
					t = t[:-5]
					steps = int(t.replace(',', ''))
				for link in mylink.find_all("li", {"class": "average ellipsis"}):
					a = (link.get_text()[:-5])
					avg = int(a.replace(',', ''))
				print name
				print steps

				self.redisServer.zadd("all:steps",name,steps)
				self.redisServer.zadd(group+":steps",name,steps)
				count += 1

		return count

fit =  getFitbitData()

#Run through groups if more then 25 listings on a page go to the next page
for group in fit.groups:
	html = fit.getHTML( group[1], "0" )
	listCount = fit.parseHTML( html, group[0] )
	page = 1
	while listCount == 25:
		print "run again"
		html = fit.getHTML( group[1], str(page) )
		listCount = fit.parseHTML( html, group[0] )
		page +=1

	# Calculate Average - Pull all data from redis
	# get total and start counter for every step count greater then 0
	s = 0
	d = 0
	z = fit.redisServer.zrange(group[0]+':steps',0,-1,withscores=True)
	for x in z:
		if x[1] > 0:
			s = s +x[1]
			d += 1

	# math for average
	avg = s/d
	fit.redisServer.delete(group[0]+":avg:"+fit.date)
	fit.redisServer.sadd(group[0]+":avg:"+fit.date,avg)

print "main done"

以上是关于“python 使用 pycurl 以 cURL 方式抓取 Fitbit 社区组,使用 BeautifulSoup 解析页面,发送给 Redis(写入/获取数据以创建 Fitbit 仪表板)”的主要内容,如果未能解决你的问题,请参考以下文章:

转python版本的curl工具pycurl学习

curl库pycurl实例及参数详解

Python学习-pycurl模块

PHP的curl模块和python的pycurl模块的区别

请问一下python里的pycurl初始化,pycurl.Curl()必须能够联网才能初始化正确吗? 我本想在局域网内部测试。

python使用pycurl获取web连接信息