爬虫实例:模拟登陆新浪
Posted 上后谈爱情
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫实例:模拟登陆新浪相关的知识,希望对你有一定的参考价值。
1. 模拟登陆的第一步是获取登陆前的预登录信息:用户名和密码在提交前会先由 js 加密,所以必须先拿到加密时要用到的 servertime、nonce 和 pubkey。下面用 re 模块从预登录接口的返回内容中取出 JSON 部分,再用 json 模块解析,得到这些信息:
# -*- coding: utf-8 -*-
"""Fetch the Sina SSO pre-login parameters (servertime and nonce)."""
import json
import re
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2, which the original snippet targeted
    from urllib2 import urlopen

PRELOGIN_URL = (
    "http://login.sina.com.cn/sso/prelogin.php?entry=weibo"
    "&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk"
    "&client=ssologin.js(v1.3.18)&_=1329806375939"
)

# The endpoint answers with JSONP, i.e. ``callback({...})``; the JSON document
# is whatever sits between the outermost parentheses.
_JSONP_BODY = re.compile(r'\((.*)\)')


def get_servertime(url=PRELOGIN_URL):
    """Download the pre-login response and extract ``servertime`` and ``nonce``.

    The raw JSON and the two extracted values are printed, as in the original
    snippet, and are additionally returned so callers can use them.

    Args:
        url: Pre-login endpoint to query. Defaults to the Sina SSO URL used by
            the original snippet.

    Returns:
        A ``(servertime, nonce)`` tuple, with ``servertime`` converted to a
        string, or ``None`` when the response cannot be parsed.

    Network errors raised by ``urlopen`` are not caught and propagate to the
    caller.
    """
    # closing() works for both the Python 2 and Python 3 response objects.
    with closing(urlopen(url)) as response:
        text = response.read().decode('utf-8')

    try:
        json_data = _JSONP_BODY.search(text).group(1)
        print(json_data)
        # json.loads yields a dict from which the needed fields are read.
        data = json.loads(json_data)
        servertime = str(data['servertime'])
        nonce = data['nonce']
    except (AttributeError, ValueError, KeyError, TypeError):
        # AttributeError: no "(...)" in the response (search() returned None);
        # ValueError: malformed JSON; KeyError/TypeError: unexpected payload.
        print("Failed to get servertime data")
        return None

    print("%s\n%s" % (servertime, nonce))
    return servertime, nonce


if __name__ == '__main__':
    get_servertime()
结果上显示:
1 {"retcode":0,"servertime":1472783606,"pcid":"gz-32dce7bbd55e33948992c2978d847ff601de","nonce":"26ISKM","pubkey":"-----BEGIN PUBLIC KEY-----\\nMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDrKjhWhmGIf6GAvdtcq9XyHHv9\\nWcCQyy0kWoesJTBiiCcpKT5VBjUFCOf5qju3f0MzIxSQ+RX21jxV\\/i8IpJs1P0RK\\n05k8rMAtt4Sru45CqbG7\\/\\/s4vhjXjoeg5Bubj3OpKO4MzuH2c5iEuXd+T+noihu+\\nSVknrEp5mzGB1kQkQwIDAQAB\\n-----END PUBLIC KEY-----","rsakv":"1330428213","is_openlock":0,"exectime":10} 2 1472783606 3 26ISKM 4 -----BEGIN PUBLIC KEY----- 5 MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDrKjhWhmGIf6GAvdtcq9XyHHv9 6 WcCQyy0kWoesJTBiiCcpKT5VBjUFCOf5qju3f0MzIxSQ+RX21jxV/i8IpJs1P0RK 7 05k8rMAtt4Sru45CqbG7//s4vhjXjoeg5Bubj3OpKO4MzuH2c5iEuXd+T+noihu+ 8 SVknrEp5mzGB1kQkQwIDAQAB 9 -----END PUBLIC KEY----- 10 1330428213
在这里密码 pwd 采用 SHA1 连续做三次哈希(注意并不是 RSA,后面的完整代码才使用 RSA 加密),用户名则先做 URL 编码,再进行 base64 编码:
# -*- coding: utf-8 -*-
"""Pre-login helpers for Sina SSO: server parameters, password hash, user name."""
import base64
import hashlib
import json
import re
from contextlib import closing

try:  # Python 3
    from urllib.parse import quote
    from urllib.request import urlopen
except ImportError:  # Python 2, which the original snippet targeted
    from urllib import quote
    from urllib2 import urlopen

PRELOGIN_URL = (
    "http://login.sina.com.cn/sso/prelogin.php?entry=weibo"
    "&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk"
    "&client=ssologin.js(v1.3.18)&_=1329806375939"
)

# The endpoint answers with JSONP, i.e. ``callback({...})``; the JSON document
# is whatever sits between the outermost parentheses.
_JSONP_BODY = re.compile(r'\((.*)\)')


def get_servertime(url=PRELOGIN_URL):
    """Download the pre-login response and extract the login parameters.

    The four extracted values are printed, as in the original snippet, and are
    additionally returned so they can be passed on to ``get_pwd``.

    Args:
        url: Pre-login endpoint to query. Defaults to the Sina SSO URL used by
            the original snippet.

    Returns:
        A ``(servertime, nonce, pubkey, rsakv)`` tuple, with ``servertime``
        converted to a string, or ``None`` when the response cannot be parsed.

    Network errors raised by ``urlopen`` are not caught and propagate to the
    caller.
    """
    # closing() works for both the Python 2 and Python 3 response objects.
    with closing(urlopen(url)) as response:
        text = response.read().decode('utf-8')

    try:
        json_data = _JSONP_BODY.search(text).group(1)
        # json.loads yields a dict from which the needed fields are read.
        data = json.loads(json_data)
        servertime = str(data['servertime'])
        nonce = data['nonce']
        pubkey = data['pubkey']
        rsakv = data['rsakv']
    except (AttributeError, ValueError, KeyError, TypeError):
        # AttributeError: no "(...)" in the response (search() returned None);
        # ValueError: malformed JSON; KeyError/TypeError: unexpected payload.
        print("Failed to get servertime data")
        return None

    print("%s\n%s\n%s\n%s" % (servertime, nonce, pubkey, rsakv))
    return servertime, nonce, pubkey, rsakv


def _sha1_hex(text):
    """Return the SHA-1 hex digest of *text*, encoding it as UTF-8 if needed."""
    if not isinstance(text, bytes):
        text = text.encode('utf-8')
    return hashlib.sha1(text).hexdigest()


def get_pwd(pwd, servertime, nonce):
    """Hash the password with three rounds of SHA-1 (this is not RSA).

    The password is hashed twice, then ``servertime`` and ``nonce`` are
    appended to the second hex digest and the result is hashed once more.

    Args:
        pwd: Plain-text password.
        servertime: Server time from the pre-login response.
        nonce: Nonce from the pre-login response.

    Returns:
        The final 40-character hex digest.
    """
    first = _sha1_hex(pwd)
    second = _sha1_hex(first)
    return _sha1_hex(second + str(servertime) + nonce)


def get_user(username):
    """URL-quote *username* and return the result base64-encoded as text."""
    quoted = quote(username)
    # b64encode, unlike the removed base64.encodestring, adds neither a
    # trailing newline nor line breaks every 76 characters.
    return base64.b64encode(quoted.encode('ascii')).decode('ascii')


if __name__ == '__main__':
    get_servertime()
下面是完整的代码。模拟登陆中最重要的是弄清它的加密机制,之后就是模拟登陆的一般步骤:构造 post_data,并对请求头(headers)进行包装:
在这里使用的是 Python 3 的 urllib.request 和 http.cookiejar 库,密码则用 rsa 库进行加密:
"""Log in to Sina Weibo through the SSO endpoint and keep the session cookies."""
import urllib.request
import http.cookiejar
import base64
import json
import urllib.parse
import rsa
import binascii
import os
import re
import time
import datetime
import random


# Account credentials; fill these in before running.
username = ''
password = ''


# The cookie jar is backed by a file named after the account. Installing the
# opener globally makes urllib.request.urlopen() share the same cookies.
cookiejar = http.cookiejar.LWPCookieJar(username)
cookie = urllib.request.HTTPCookieProcessor(cookiejar)
httphandle = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(cookie, httphandle)
urllib.request.install_opener(opener)


# Hex modulus of the RSA public key used by get_sp() together with the
# exponent 65537.
# NOTE(review): presumably the fixed key served by Sina for rsakv 1330428213
# -- confirm against a live pre-login response.
publickey = (
    'EB2A38568661887FA180BDDB5CABD5F21C7BFD59C090CB2D245A87AC253062882729293E5506350508E7F9AA3BB77F4333231490F915F6D63C55FE2F08A49B353F444AD3993CACC02DB'
    '784ABBB8E42A9B1BBFFFB38BE18D78E87A0E41B9B8F73A928EE0CCEE1F6739884B9777E4FE9E88A1BBE495927AC4A799B3181D6442443'
)
pubkey = int(publickey, 16)

# Form fields posted to the login endpoint; 'su', 'sp', 'servertime' and
# 'nonce' are filled in by login().
postdata = {
    'entry': 'weibo',
    'gateway': '1',
    'from': '',
    'savestate': '7',
    'useticket': '1',
    'pagerefer': 'http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D%252F',
    'vsnf': '1',
    'su': '',
    'service': 'miniblog',
    'servertime': '',
    'nonce': '',
    'pwencode': 'rsa2',
    'rsakv': '1330428213',
    'sp': '',
    'sr': '1920*1080',
    'encoding': 'UTF-8',
    'prelt': '269',
    'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
    'returntype': 'META',
    'showpin': '0',
}


def gettime():
    """Return the current local time as a Unix timestamp (float)."""
    return time.mktime(datetime.datetime.now().timetuple())


def openurl(url, chart='utf-8', data=None):
    """Open *url* with the shared opener and return the response body.

    Args:
        url: URL (or Request object) to open.
        chart: Character set used to decode the body; pass ``'null'`` to get
            the raw bytes instead.
        data: Optional request body forwarded to ``opener.open``.
    """
    with opener.open(url, data) as response:
        body = response.read()
    if chart != 'null':
        return body.decode(chart)
    return body


# ---- login parameter helpers ----

def b64(sth):
    """Return the base64 encoding of the string *sth* as text."""
    return base64.b64encode(sth.encode()).decode('utf-8')


def get_su():
    """Return the 'su' form field: the URL-quoted user name, base64-encoded."""
    return b64(urllib.parse.quote(username))


def get_sp(st, nc):
    """Return the 'sp' form field: the RSA-encrypted password as hex text.

    Args:
        st: Server time from the pre-login response.
        nc: Nonce from the pre-login response.
    """
    key = rsa.PublicKey(pubkey, 65537)
    # The plaintext is "<servertime>\t<nonce>\n<password>".
    message = str(st) + '\t' + str(nc) + '\n' + password
    encrypted = rsa.encrypt(message.encode(), key)
    return binascii.b2a_hex(encrypted).decode('utf-8')


def get_servertime():
    """Query the pre-login endpoint.

    Returns:
        A list ``[servertime, nonce, pcid]`` with every item as a string.
    """
    url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&su=%s&checkpin=1&rsakt=mod' % (get_su())
    with opener.open(url) as page:
        data = json.loads(page.read().decode('utf-8'))
    return [str(data['servertime']), str(data['nonce']), str(data['pcid'])]


# ---- regex helpers ----

def match(pattern, string):
    """Return every match of *pattern* in *string* (see ``re.findall``)."""
    return re.findall(re.compile(pattern), string)


def match_fanscount(string):
    """Return the first fan count found in *string*, or 20000 if none is found."""
    result = match(r'fans" >([0-9]+)', string)
    if result:
        return result[0]
    return 20000


def match_login_url(string):
    """Return the first URL in *string* that ends with ``=0``.

    Raises:
        IndexError: If *string* contains no such URL.
    """
    # The scheme class was written [a-zA-z], which also matched "[\]^_`".
    result = match(r'[a-zA-Z]+://\S*=0', string)
    if not result:
        raise IndexError('no login redirect URL found in the response')
    return result[0]


def match_uid(string):
    """Return all user ids found in ``usercard="id=..."`` attributes."""
    return match(r'usercard="id=([0-9]+)" href="', string)


def match_name(string):
    """Return the nickname assigned to ``CONFIG['onick']`` in *string*."""
    result = match(r"CONFIG\['onick'\]='(.+)'", string)
    return result[0]


def replace_(st):
    """Return *st* with every backslash removed."""
    return st.replace('\\', '')


# ---- login ----

def login(postdata):
    """Perform a fresh login and save the resulting cookies to disk.

    Args:
        postdata: Form fields for the login request. The dict is updated in
            place with the 'su', 'sp', 'servertime' and 'nonce' values.

    Raises:
        IndexError: If the login response contains no redirect URL.
    """
    servertime, nonce = get_servertime()[:2]

    postdata['su'] = get_su()
    postdata['sp'] = get_sp(servertime, nonce)
    postdata['servertime'] = servertime
    postdata['nonce'] = nonce

    encoded = urllib.parse.urlencode(postdata)
    url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/40.0.2214.94 Safari/537.36 OPR/27.0.1689.66 (Edition Baidu)',
    }
    req = urllib.request.Request(url, encoded.encode(), headers)
    with urllib.request.urlopen(req) as response:
        text = response.read().decode('gbk')

    # Opening the redirect URL found in the response sets the session cookies
    # in the jar, which is then written to disk.
    opener.open(match_login_url(text))
    cookiejar.save()


def auto_login():
    """Load the saved cookies and follow the login redirect from weibo.com."""
    cookiejar.load()
    html = openurl('http://weibo.com/', 'gbk')
    opener.open(match_login_url(html))


# Disabled example kept from the original listing:
# opener.open('http://passport.weibo.cn/sso/crossdomain?action=login&savestate=1&retcode=0')
# result=openurl('http://weibo.com/2598335181/follow?rightmod=1&wvr=6')
# print(result)


# Reuse the saved cookie file when it exists; otherwise log in from scratch.
if os.path.exists(username):
    print('检测到cookie,自动登录')
    auto_login()
else:
    login(postdata)
接下来采用 requests 库的 Session 对象。在 Session 会话中,不需要在每一次提交时都手动保存 cookies,调用 session.post() 方法时 cookies 会自动带上,使用 Session 可以一直与服务器保持会话:
但是自己编写的程序在最后对网页上的内容进行重定向的时候出现了一些问题:
1 #!/usr/bin/env python 2 #---coding:utf-8---- 3 import requests 4 import json 5 import urllib 6 import re 7 import base64 8 import rsa 9 import binascii 10 from matplotlib._image import Image 11 12 username="*********" 13 pwd=\'***************\' 14 def get_su(username): 15 username_html=urllib.quote(username).encode(\'utf-8\') 16 su=base64.b64encode(username_html).decode(\'utf-8\') 17 # print su 18 return su 19 20 def get_sth(su): 21 # 改字典内的数据经过精简,只有在这些数据存在下才不影响获得所需的准确数据 22 payload = {\'entry\': \'weibo\', \'rsakt\': \'mod\', \'su\': su, \'checkpin\': \'1\' } 23 res = requests.get(\'http://login.sina.com.cn/sso/prelogin.php\', 24 params=payload).text 25 res = eval(res)##将字符串以一种理解的文档进行表达,得到pubkey可以运用任何形式 26 # print(res) 27 return res 28 29 def get_sp(pwd,nonce,servertime,pubkey): 30 ##创建公钥的长度:pubkey的公钥在新浪中得到是固定的 31 key=rsa.PublicKey(int(pubkey,16),65537) 32 message=str(servertime)+\'\\t\'+str(nonce)+\'\\n\'+pwd 33 34 ##对集进行加密: 35 passwd=rsa.encrypt(message.encode(\'utf-8\'),key) 36 ##将加密信息转换成16进制 37 sp=binascii.b2a_hex(passwd) 38 return sp 39 def get_pin(pcid): 40 payload={\'s\':\'0\',\'p\':pcid} 41 pin_url = "http://login.sina.com.cn/cgi/pin.php" 42 43 Res=requests.Session().get(pin_url,params=payload) 44 45 ##把图片进行显示 46 with open(\'cha.jpg\',\'w+\') as f: 47 f.write(Res.content) 48 f.close() 49 try: 50 51 im=Image.open(\'cha.jpg\') 52 im.show() 53 im.close() 54 except: 55 print(\'在当前目录下没有找到图片\') 56 57 if __name__==\'__main__\': 58 su=get_su(username)##直接对用户名进行加密,采用requets模块得到密码加密信息 59 res=get_sth(su) 60 print res 61 nonce=res[\'nonce\'] 62 rsakv=res[\'rsakv\'] 63 servertime=res[\'servertime\'] 64 pcid=res[\'pcid\'] 65 pubkey=res[\'pubkey\'] 66 showin=False##判断代码是否用验证码 67 ##获取其中密码,通过RSA加密 68 sp=get_sp(pwd,nonce,servertime,pubkey) 69 print \'加密用户名:\',su,\'加密的密码:\',sp 70 71 ####开始向客户端进行post_data: 72 73 headers = { 74 \'User-Agent\': \'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) \' 75 \'AppleWebKit/537.36 (KHTML, like Gecko) \' 76 \'Chrome/47.0.2526.80 
Safari/537.36\' 77 } 78 payload = { 79 \'entry\': \'weibo\', 80 \'gateway\': \'1以上是关于爬虫实例:模拟登陆新浪的主要内容,如果未能解决你的问题,请参考以下文章