爬虫实例:模拟登陆新浪

Posted 上后谈爱情

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫实例:模拟登陆新浪相关的知识,希望对你有一定的参考价值。

1. 模拟登录的第一步是获取登录前(prelogin)信息:用户名和密码在提交之前会由 JS 先行加密,因此必须先拿到加密时用到的 servertime、nonce 和 pubkey。下面用 re 和 json 模块从 prelogin 接口的返回结果中提取这些信息:

 1 #---coding:utf-8---
 2 import urllib2
 3 import re
 4 import json
 5 def get_servertime():
 6     url="http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939"
 7     
 8     data=urllib2.urlopen(url).read()
 9     
10     p=re.compile(\'\\((.*)\\)\')
11     
12     try:
13         json_data=p.search(data).group(1)
14         print json_data
15         data=json.loads(json_data)
16         ##采用json。loads进行解码,得到dict类型数据,从之中得到需要的数据
17         servertime = str(data[\'servertime\'])
18         nonce = data[\'nonce\']
19         print servertime,"\\n",nonce
20     except:
21        print "Get servertime data"
22        
23 if __name__==\'__main__\':
24     get_servertime()    
25         
View Code

运行结果如下:

 1 {"retcode":0,"servertime":1472783606,"pcid":"gz-32dce7bbd55e33948992c2978d847ff601de","nonce":"26ISKM","pubkey":"-----BEGIN PUBLIC KEY-----\\nMIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDrKjhWhmGIf6GAvdtcq9XyHHv9\\nWcCQyy0kWoesJTBiiCcpKT5VBjUFCOf5qju3f0MzIxSQ+RX21jxV\\/i8IpJs1P0RK\\n05k8rMAtt4Sru45CqbG7\\/\\/s4vhjXjoeg5Bubj3OpKO4MzuH2c5iEuXd+T+noihu+\\nSVknrEp5mzGB1kQkQwIDAQAB\\n-----END PUBLIC KEY-----","rsakv":"1330428213","is_openlock":0,"exectime":10}
 2 1472783606 
 3 26ISKM 
 4 -----BEGIN PUBLIC KEY-----
 5 MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDrKjhWhmGIf6GAvdtcq9XyHHv9
 6 WcCQyy0kWoesJTBiiCcpKT5VBjUFCOf5qju3f0MzIxSQ+RX21jxV/i8IpJs1P0RK
 7 05k8rMAtt4Sru45CqbG7//s4vhjXjoeg5Bubj3OpKO4MzuH2c5iEuXd+T+noihu+
 8 SVknrEp5mzGB1kQkQwIDAQAB
 9 -----END PUBLIC KEY----- 
10 1330428213
View Code
在这里,密码 pwd 用 SHA1 连续做三次哈希(代码中调用的是 hashlib.sha1,并不是 RSA 加密),用户名则先做 URL 编码再做 base64 编码(base64 只是编码方式,不是加密):
 1 #---coding:utf-8---
 2 import urllib2
 3 import re
 4 import json
 5 import hashlib
 6 import urllib
 7 def get_servertime():
 8     url="http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939"
 9     
10     data=urllib2.urlopen(url).read()
11     
12     p=re.compile(\'\\((.*)\\)\')
13     
14     try:
15         json_data=p.search(data).group(1)
16         
17         data=json.loads(json_data)
18         ##采用json。loads进行解码,得到dict类型数据,从之中得到需要的数据
19         servertime = str(data[\'servertime\'])
20         nonce = data[\'nonce\']
21         pubkey=data[\'pubkey\']
22         rsakv=data[\'rsakv\']
23         print servertime,"\\n",nonce,"\\n",pubkey,"\\n",rsakv
24        # return servertime,nonce,rsakv
25     except:
26        print "Get servertime data"
27        return None
28 #Hash the password with SHA-1 three times (hashlib.sha1 is a hash function, not RSA encryption)
29 def get_pwd(pwd,servertime,nonce):
30     pwd1=hashlib.sha1(pwd).hexdigest()  #round 1: hex digest of the raw password
31     pwd2=hashlib.sha1(pwd1).hexdigest()  #round 2: hex digest of the round-1 hex string
32     pwd3_=pwd2+servertime+nonce  #append servertime and nonce; both must already be strings
33     pwd3 = hashlib.sha1(pwd3_).hexdigest()  #round 3: hex digest of the combined string
34     return pwd3
35 
36 def get_user(username):
37     ##采用base64加密
38     username_=urllib.quote(username)
39     username=base64.encodestring(username_)[:-1]
40     return username
41     
42           
43 if __name__==\'__main__\':
44     get_servertime()    
45         

 

 下面是完整的代码。模拟登录中最重要的是弄清它的加密机制,之后就是模拟登录的一般步骤:构造 post_data 并包装请求头(headers):

这里使用的是 urllib 和 http.cookiejar 库:

  1 import urllib.request
  2 import http.cookiejar
  3 import base64
  4 import json
  5 import urllib.parse
  6 import rsa
  7 import binascii
  8 import os
  9 import re
 10 import time
 11 import datetime
 12 import random
 13 
 14 
 15 username=\'\'
 16 password=\'\'
 17 
 18 
 19 cookiejar=http.cookiejar.LWPCookieJar(username)
 20 cookie=urllib.request.HTTPCookieProcessor(cookiejar)
 21 httphandle=urllib.request.HTTPHandler()
 22 opener=urllib.request.build_opener(cookie,httphandle)
 23 urllib.request.install_opener(opener)
 24 
 25 
 26 publickey=\'EB2A38568661887FA180BDDB5CABD5F21C7BFD59C090CB2D245A87AC253062882729293E5506350508E7F9AA3BB77F4333231490F915F6D63C55FE2F08A49B353F444AD3993CACC02DB\\
 27 784ABBB8E42A9B1BBFFFB38BE18D78E87A0E41B9B8F73A928EE0CCEE1F6739884B9777E4FE9E88A1BBE495927AC4A799B3181D6442443\'
 28 pubkey=int(publickey,16)
 29 
 30 postdata={
 31     \'entry\':\'weibo\',
 32     \'gateway\':\'1\',
 33     \'from\':\'\',
 34     \'savestate\':\'7\',
 35     \'useticket\':\'1\',
 36     \'pagerefer\':\'http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D%252F\',
 37     \'vsnf\':\'1\',
 38     \'su\':\'\',
 39     \'service\':\'miniblog\',
 40     \'servertime\':\'\',
 41     \'nonce\':\'\',
 42     \'pwencode\':\'rsa2\',
 43     \'rsakv\':\'1330428213\',
 44     \'sp\':\'\',
 45     \'sr\':\'1920*1080\',
 46     \'encoding\':\'UTF-8\',
 47     \'prelt\':\'269\',
 48     \'url\':\'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack\',
 49     \'returntype\':\'META\',
 50     \'showpin\':\'0\'
 51     }
 52 def gettime():  # Return the current local time as a float timestamp with whole-second precision
 53     return time.mktime(datetime.datetime.now().timetuple())
 54     
 55 def openurl(url,chart=\'utf-8\',data=None):  # Open url through the module-level opener and return the response body
 56     result=opener.open(url,data)
 57     result=result.read()
 58     if(chart!=\'null\'):  # the literal string null (not None) asks for the raw bytes
 59         return result.decode(chart)
 60     else:
 61         return result
 62 
 63 """
 64 login_about get begin
 65 """
 66 def b64(sth):  # Base64-encode the text sth and return the result as a str
 67     return base64.b64encode(sth.encode()).decode(\'utf-8\')
 68 
 69 def get_su():  # Build the su value: the module-level username, URL-quoted and then Base64-encoded
 70     string=urllib.parse.quote(username)
 71     return b64(string)
 72 
 73 def get_sp(st,nc):  # Build the sp value: RSA-encrypt the login message and return it as hex text
 74     key=rsa.PublicKey(pubkey,65537)  # module-level pubkey is used as the modulus, 65537 as the exponent
 75     message=str(st)+\'\\t\'+str(nc)+\'\\n\'+password  # plaintext layout: st, tab, nc, newline, module-level password
 76     sp=rsa.encrypt(message.encode(),key)
 77     sp=binascii.b2a_hex(sp)  # ciphertext bytes to hex bytes
 78     return sp.decode(\'utf-8\')
 79 def get_servertime():#and nonce
 80     # Query the prelogin endpoint and return [servertime, nonce, pcid], each converted to str
 81     url=\'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&su=%s&checkpin=1&rsakt=mod\' %(get_su())
 82     page=opener.open(url)
 83     data=json.loads(page.read().decode(\'utf-8\'))  # the body is parsed directly as JSON
 84     
 85     result=[]
 86     result.append(str(data[\'servertime\']))
 87     result.append(str(data[\'nonce\']))
 88     result.append(str(data[\'pcid\']))
 89     return result
 90 """
 91 login_about get end
 92 match begin
 93 """
 94 def match(pattern,string):  # Return the re.findall results for pattern over string
 95     a=re.compile(pattern)
 96     result=re.findall(a,string)
 97     return result
 98 def match_fanscount(string):  # Return the first fans count found in string
 99     a=r\'fans" >([0-9]+)\'
100     result=match(a,string)
101     if(result!=[]):
102         return result[0]  # a str of digits
103     else:
104         return 20000  # fallback when nothing matches; note this is an int, not a str
105 def match_login_url(string):
106     a=r\'[a-zA-z]+://[^\\s]*=0\'
107     result=match(a,string)
108     return result[0]
109 def match_uid(string):  # Return the list of numeric ids captured from usercard="id=..." attributes (may be empty)
110     a=r\'usercard="id=([0-9]+)" href="\'
111     result=match(a,string)
112     return result
113 def match_name(string):  # Return the first value assigned to CONFIG[onick] in string; raises IndexError if there is none
114     a=r"CONFIG\\[\'onick\'\\]=\'(.+)\'"
115     result=match(a,string)
116     return result[0]
117 def replace_(st):  # Return st with every backslash character removed
118     a=\'\\\\\'
119     
120     return st.replace(a,\'\')
121 """
122 match end
123 do login begin
124 """
125 def login(postdata):  # Log in with credentials: fill in postdata, POST it, open the URL found in the reply, save cookies
126     result=get_servertime()
127     servertime=result[0]
128     nonce=result[1]
129 
130     postdata[\'su\']=get_su()  # note: these four assignments modify the dict object the caller passed in
131     postdata[\'sp\']=get_sp(servertime,nonce)
132     postdata[\'servertime\']=servertime
133     postdata[\'nonce\']=nonce
134     
135     postdata=urllib.parse.urlencode(postdata)  # from here on postdata is the encoded form string
136     url=\'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)\'
137     
138     headers={
139     \'User-Agent\':\'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/40.0.2214.94 Safari/537.36 OPR/27.0.1689.66 (Edition Baidu)\',
140          }
141     req=urllib.request.Request(url,postdata.encode(),headers)
142     text=urllib.request.urlopen(req)
143     text=text.read()
144     
145     text=text.decode(\'gbk\')  # the reply is decoded as GBK
146     result=match_login_url(text)  # raises IndexError when the reply contains no matching URL
147     opener.open(result)
148     cookiejar.save()  # write the cookies to the file the cookiejar was created with
149 def auto_login():  # Log in with the previously saved cookie file instead of posting credentials
150     cookiejar.load()
151     html=openurl(\'http://weibo.com/\',\'gbk\')
152     url=match_login_url(html)  # raises IndexError when the page contains no matching URL
153     opener.open(url)
154     """
155     opener.open(\'http://passport.weibo.cn/sso/crossdomain?action=login&savestate=1&retcode=0\')
156     result=openurl(\'http://weibo.com/2598335181/follow?rightmod=1&wvr=6\')
157     print(result)"""
158 
159 """
160 """
161 
162 
163 if(os.path.exists(username)==True):
164     print(\'检测到cookie,自动登录\')
165     auto_login()
166 else:
167     login(postdata)

 接下来改用 requests 的 Session:在同一个 Session 会话中,不需要在每次提交时手动保存 cookies,调用 session.post() 时 cookies 会被自动带上,这样就可以用 Session 一直与服务器保持会话:

不过,我自己编写的程序在最后一步处理页面重定向时出现了一些问题:

  1 #!/usr/bin/env python
  2 #---coding:utf-8----
  3 import requests
  4 import json
  5 import urllib
  6 import re
  7 import base64
  8 import rsa
  9 import binascii
 10 from matplotlib._image import Image
 11 
 12 username="*********"
 13 pwd=\'***************\'
 14 def get_su(username):  ##Return the su value: username URL-quoted, then Base64-encoded
 15     username_html=urllib.quote(username).encode(\'utf-8\')
 16     su=base64.b64encode(username_html).decode(\'utf-8\')
 17    # print su
 18     return su
 19 
 20 def get_sth(su):
 21     # 改字典内的数据经过精简,只有在这些数据存在下才不影响获得所需的准确数据
 22     payload = {\'entry\': \'weibo\', \'rsakt\': \'mod\', \'su\': su, \'checkpin\': \'1\' }
 23     res = requests.get(\'http://login.sina.com.cn/sso/prelogin.php\',
 24                        params=payload).text
 25     res = eval(res)##将字符串以一种理解的文档进行表达,得到pubkey可以运用任何形式
 26    # print(res)
 27     return res
 28 
 29 def get_sp(pwd,nonce,servertime,pubkey):
 30     ##Build the public key from the hex modulus pubkey and exponent 65537 (original note: the pubkey Sina returns is fixed)
 31     key=rsa.PublicKey(int(pubkey,16),65537)
 32     message=str(servertime)+\'\\t\'+str(nonce)+\'\\n\'+pwd
 33     
 34     ##Encrypt the message with the public key
 35     passwd=rsa.encrypt(message.encode(\'utf-8\'),key)
 36     ##Convert the ciphertext to hexadecimal
 37     sp=binascii.b2a_hex(passwd)
 38     return sp
 39 def get_pin(pcid):
 40     payload={\'s\':\'0\',\'p\':pcid}
 41     pin_url = "http://login.sina.com.cn/cgi/pin.php"
 42     
 43     Res=requests.Session().get(pin_url,params=payload)
 44     
 45     ##把图片进行显示
 46     with open(\'cha.jpg\',\'w+\') as f:
 47         f.write(Res.content)
 48         f.close()
 49     try:
 50         
 51         im=Image.open(\'cha.jpg\')
 52         im.show()
 53         im.close()
 54     except:
 55         print(\'在当前目录下没有找到图片\')
 56     
 57 if __name__==\'__main__\':
 58     su=get_su(username)##直接对用户名进行加密,采用requets模块得到密码加密信息
 59     res=get_sth(su)
 60     print res
 61     nonce=res[\'nonce\']
 62     rsakv=res[\'rsakv\']
 63     servertime=res[\'servertime\']
 64     pcid=res[\'pcid\']
 65     pubkey=res[\'pubkey\']
 66     showin=False##判断代码是否用验证码
 67     ##获取其中密码,通过RSA加密
 68     sp=get_sp(pwd,nonce,servertime,pubkey)
 69     print \'加密用户名:\',su,\'加密的密码:\',sp
 70     
 71     ####开始向客户端进行post_data:
 72     
 73     headers = {
 74             \'User-Agent\': \'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) \'
 75                           \'AppleWebKit/537.36 (KHTML, like Gecko) \'
 76                           \'Chrome/47.0.2526.80 Safari/537.36\'
 77         }
 78     payload = {
 79         \'entry\': \'weibo\',
 80         \'gateway\': \'1以上是关于爬虫实例:模拟登陆新浪的主要内容,如果未能解决你的问题,请参考以下文章

Java 模拟新浪登录 2016

定向爬虫 - Python模拟新浪微博登录

定向爬虫 - Python模拟新浪微博登录

java+selenium模拟登陆新浪微博demo

Python 爬虫实例—— 爬取 新浪军事新闻

爬虫新浪微博爬虫——环境部署