新浪微博模拟登陆+数据抓取(java实现)

Posted EliteQing

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了新浪微博模拟登陆+数据抓取(java实现)相关的知识,希望对你有一定的参考价值。

 模拟登陆部分实现:

 
package token.exe;

import java.math.BigInteger;
import java.util.Random;

import org.apache.commons.codec.binary.Base64;

public class WeiboEncoder {

    private static BigInteger n = null;
    private static BigInteger e = null;
    
    /**
     * 使用Base64加密用户名(su的获取)
     * @param account
     * @return
     */
    @SuppressWarnings("deprecation")
    public static String encodeAccount(String account){        
        return new String(Base64.encodeBase64(account.getBytes()));
    }
    
    /**
     * 使用RSAEncrypt对用户密码进行加密(sp的获取)
     * @param pwd
     * @param nStr
     * @param eStr
     * @return
     */
    public static String RSAEncrypt(String pwd, String nStr, String eStr){
        n = new BigInteger(nStr,16);
        e = new BigInteger(eStr,16);
        
        BigInteger r = RSADoPublic(pkcs1pad2(pwd,(n.bitLength()+7)>>3));
        String sp = r.toString(16);
        if((sp.length()&1) != 0 ) 
            sp = "0" + sp;
        return sp;
    }
    
    private static BigInteger RSADoPublic(BigInteger x){
         return x.modPow(e, n);
    }
    
    private static BigInteger pkcs1pad2(String s, int n){
        if(n < s.length() + 11) { // TODO: fix for utf-8
            System.err.println("Message too long for RSA");
            return null;
          }
        byte[] ba = new byte[n];
        int i = s.length()-1;
        while(i >= 0 && n > 0) {
            int c = s.codePointAt(i--);
            if(c < 128) { // encode using utf-8
              ba[--n] = new Byte(String.valueOf(c));
            }
            else if((c > 127) && (c < 2048)) {
              ba[--n] = new Byte(String.valueOf((c & 63) | 128));
              ba[--n] = new Byte(String.valueOf((c >> 6) | 192));
            }
            else {
              ba[--n] = new Byte(String.valueOf((c & 63) | 128));
              ba[--n] = new Byte(String.valueOf(((c >> 6) & 63) | 128));
              ba[--n] = new Byte(String.valueOf((c >> 12) | 224));
            }
          }
        ba[--n] = new Byte("0");
        
        byte[] temp = new byte[1];
        Random rdm = new Random(47L);
        
        while(n > 2) { // random non-zero pad
            temp[0] = new Byte("0");
            while(temp[0] == 0) 
                rdm.nextBytes(temp);
            ba[--n] = temp[0];
        }
        ba[--n] = 2;
        ba[--n] = 0;
        
        return new BigInteger(ba);
    }
     
    
}

 参数实体:

 

package token.def;

import java.io.Serializable;

public class LoginParams implements Serializable {
    
    private static final long serialVersionUID = -5775728968372860382L;
    private String pcid;
    private String servertime;
    private String nonce;
    private String rsakv;
    private String imgUrl;
    private String sp;
    private String code;
    private boolean isLogin = true;
    
    public String getPcid() {
        return pcid;
    }
    
    public void setPcid(String pcid) {
        this.pcid = pcid;
    }
    
    public String getServertime() {
        return servertime;
    }
    
    public void setServertime(String servertime) {
        this.servertime = servertime;
    }
    
    public String getNonce() {
        return nonce;
    }
    public void setNonce(String nonce) {
        this.nonce = nonce;
    }
    
    public String getRsakv() {
        return rsakv;
    }
    
    public void setRsakv(String rsakv) {
        this.rsakv = rsakv;
    }
    
    public String getImgUrl() {
        return imgUrl;
    }

    public void setImgUrl(String imgUrl) {
        this.imgUrl = imgUrl;
    }
    
    public String getSp() {
        return sp;
    }

    public void setSp(String sp) {
        this.sp = sp;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public boolean isLogin() {
        return isLogin;
    }

    public void setLogin(boolean isLogin) {
        this.isLogin = isLogin;
    }

    @Override
    public String toString() {
        return "LoginParams [pcid=" + pcid + ", servertime=" + servertime
                + ", nonce=" + nonce + ", rsakv=" + rsakv + ", imgUrl="
                + imgUrl + ", sp=" + sp + ", code=" + code + ", isLogin="
                + isLogin + "]";
    }
    
}

登陆部分实现: package token.exe; import java.io.FileOutputStream; import java.io.IOException; import java.net.URLEncoder; import java.security.KeyManagementException; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Properties; import java.util.Scanner; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.HttpVersion; import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; import org.apache.commons.httpclient.NameValuePair; import org.apache.commons.httpclient.cookie.CookiePolicy; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.methods.PostMethod; import org.apache.commons.httpclient.params.HttpClientParams; import org.apache.commons.httpclient.params.HttpConnectionManagerParams; import org.apache.commons.httpclient.protocol.Protocol; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import token.SinaWeiboOAuth; import token.def.LoginParams; import weibo4j.model.MySSLSocketFactory; public class WeiboLoginer { private HttpClient httpClient; //httpClient实例初始化 public WeiboLoginer() { //httpclient连接配置 MultiThreadedHttpConnectionManager httpManager = new MultiThreadedHttpConnectionManager(); HttpConnectionManagerParams connectParams = httpManager.getParams(); connectParams.setConnectionTimeout(3000); connectParams.setDefaultMaxConnectionsPerHost(100); connectParams.setSoTimeout(3000); //httpclient参数配置 HttpClientParams httpParams = new HttpClientParams(); httpParams.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); httpParams.setVersion(HttpVersion.HTTP_1_1); //设置默认Header List<Header> headers = new ArrayList<Header>(); headers.add(new Header("Content-Type", "application/x-www-form-urlencoded")); headers.add(new Header("Host", "login.sina.com.cn")); headers.add(new Header("User-Agent","Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0")); headers.add(new Header("API-RemoteIP", "192.168.0.1"));//伪造新浪验证IP headers.add(new Header("X-Forwarded-For","192.168.0.1"));//伪造真实IP headers.add(new Header("CLIENT-IP", "192.168.0.1"));//伪造客户端IP //初始化httpclient httpClient = new HttpClient(httpParams, httpManager); httpClient.getHostConfiguration().getParams().setParameter("http.default-headers", headers); //设置ssl协议 Protocol protocol = new Protocol("https",new MySSLSocketFactory(), 443); Protocol.registerProtocol("https", protocol); //设置代理 // httpClient.getHostConfiguration().setProxy("", 0); // httpClient.getParams().setAuthenticationPreemptive(false); } /** * 登陆并获取code值,如果出现验证码则返回还有验证码的参数信息 * @return */ public LoginParams doLogin(String username, String password) { Properties properties = initProperties(); String base64UserCount = WeiboEncoder.encodeAccount(username); HashMap<String, String> pubkeyMap = null; String sp = null; String imgUrl = null; LoginParams loginParams = new LoginParams(); try { pubkeyMap = pubKeyMap(base64UserCount); sp = WeiboEncoder.RSAEncrypt(password, pubkeyMap.get("pubkey"),"10001"); imgUrl = getPin(pubkeyMap); if (imgUrl != null) { loginParams.setPcid(pubkeyMap.get("pcid")); loginParams.setNonce(pubkeyMap.get("nonce")); loginParams.setServertime(pubkeyMap.get("servertime")); loginParams.setRsakv(pubkeyMap.get("rsakv")); loginParams.setImgUrl(imgUrl); loginParams.setSp(sp); return loginParams; } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } HashMap<String, String> ticketMap = null; try { ticketMap = getTicket(base64UserCount, sp, pubkeyMap); } catch (Exception e1) { // TODO Auto-generated catch block e1.printStackTrace(); } //确认在最终登陆后是否再需要验证码(账号为新浪的注册邮箱) String vcUrl = isHasPinAgain(pubkeyMap, ticketMap); if (vcUrl != null) { loginParams.setPcid(pubkeyMap.get("pcid")); loginParams.setNonce(pubkeyMap.get("nonce")); loginParams.setServertime(pubkeyMap.get("servertime")); loginParams.setRsakv(pubkeyMap.get("rsakv")); loginParams.setImgUrl(imgUrl); loginParams.setSp(sp); return loginParams; } try { String code = authorize(ticketMap.get("ticket"), properties.getProperty("authorizeURL"), properties.getProperty("redirect_URI"), properties.getProperty("client_ID"), username, ticketMap.get("uid")); loginParams.setCode(code); } catch (KeyManagementException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (NoSuchAlgorithmException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return loginParams; } /** * 有验证码时登陆 * @param sp * @param pin * @param pcid * @param servertime * @param nonce * @param rsakv * @return */ public LoginParams doLoginByPin(String username, String sp, String pin, String pcid, String servertime,String nonce,String rsakv ) { Properties properties = initProperties(); String base64UserCount = WeiboEncoder.encodeAccount(username); HashMap<String, String> ticketMap = null; LoginParams params = new LoginParams(); try { ticketMap = getTicket(base64UserCount, sp, pin, pcid, servertime, nonce, rsakv); if (ticketMap.containsKey("reason")) { //意为"输入的验证码不正确" String reply = "\\\\u8f93\\\\u5165\\\\u7684\\\\u9a8c\\\\u8bc1\\\\u7801\\\\u4e0d\\\\u6b63\\\\u786e"; String reasonStr = ticketMap.get("reason"); if (reasonStr.equals(reply)) { params.setLogin(false); return params; } } String code = authorize(ticketMap.get("ticket"), properties.getProperty("authorizeURL"), properties.getProperty("redirect_URI"), properties.getProperty("client_ID"), username, ticketMap.get("uid")); params.setCode(code); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return params; } /** * 模拟新浪授权 * @param ticket ticket参数 * @param redirectURI 回调地址 * @param clientId appKey * @param username 用户名 * @return token * @throws IOException * @throws KeyManagementException * @throws NoSuchAlgorithmException */ private String authorize(String ticket, String authorizeURL, String redirectURI, String clientId, String username, String uid) throws IOException, KeyManagementException, NoSuchAlgorithmException { String code = null; String url = authorizeURL + "?client_id=" + clientId + "&redirect_uri=" + redirectURI + "&response_type=code&forcelogin=true"; String regCallback = authorizeURL + "?client_id=" + clientId + "&redirect_uri=" + redirectURI + "&response_type=code&display=default&from=&with_cookie="; PostMethod post = new PostMethod(authorizeURL); //模拟申请token的链接,如果不添加,那么回调地址返回则为空 post.setRequestHeader("Referer",url); // 模拟登录时所要提交的参数信息 NameValuePair[] formpPairs=new NameValuePair[]{ new NameValuePair("action", "login"), new NameValuePair("userId",username), new NameValuePair("ticket", ticket), new NameValuePair("response_type", "code"), new NameValuePair("redirect_uri", redirectURI), new NameValuePair("client_id", clientId), new NameValuePair("regCallback", URLEncoder.encode(regCallback, "UTF-8")) }; post.setRequestBody(formpPairs); int status = httpClient.executeMethod(post); if (status == HttpStatus.SC_OK) { byte[] htmlDatas = post.getResponseBody(); code = authorizeAgain(htmlDatas, ticket, authorizeURL, redirectURI, clientId, username, uid); }else if (status == 302) { Header locationHeader = post.getResponseHeader("location"); String location = locationHeader.getValue(); code = location.substring(location.indexOf("=")+1); } return code; } /** * 二次提交授权申请 * @param htmlDatas 第一次授权申请返回的页面数据 * @return * @throws IOException * @throws HttpException */ private String authorizeAgain(byte[] htmlDatas, String ticket, String authorizeURL, String redirectURI,String clientId, String username, String uid) throws HttpException, IOException { String verifyToken = null; String html = new String(htmlDatas, "utf-8"); Document doc = Jsoup.parse(html); Element verifyTokeneElement = doc.select("input[name=verifyToken]").first(); verifyToken = verifyTokeneElement.attr("value"); String code = null; String url = authorizeURL + "?client_id=" + clientId + "&redirect_uri=" + redirectURI + "&response_type=code&forcelogin=true"; String regCallback = authorizeURL + "?client_id=" + clientId + "&redirect_uri=" + redirectURI + "&response_type=code&display=default&from=&with_cookie="; PostMethod post = new PostMethod(authorizeURL); //模拟申请token的链接,如果不添加,那么回调地址返回则为空 post.setRequestHeader("Referer",authorizeURL); // 模拟登录时所要提交的参数信息 NameValuePair[] formpPairs=new NameValuePair[]{ new NameValuePair("action", "authorize"), new NameValuePair("uid",uid), new NameValuePair("url", url), new NameValuePair("response_type", "code"), new NameValuePair("redirect_uri", redirectURI), new NameValuePair("client_id", clientId), new NameValuePair("verifyToken", verifyToken), new NameValuePair("regCallback", URLEncoder.encode(regCallback, "UTF-8")) }; post.setRequestBody(formpPairs); int status = httpClient.executeMethod(post); if (status == 302) { Header locationHeader = post.getResponseHeader("location"); String location = locationHeader.getValue(); if (location == null) { throw new NullPointerException("redirect_uri is null"); } code = location.substring(location.indexOf("=")+1); } return code; } /** * 模拟用户预登录 * @param unameBase64 * @return * @throws IOException */ private HashMap<String, String> pubKeyMap(String unameBase64) throws IOException { String url = "https://login.sina.com.cn/sso/prelogin.php?" + "entry=openapi&" + "callback=sinaSSOController.preloginCallBack&" + "su=" + unameBase64 + "&" + "rsakt=mod&" + "checkpin=1&" + "client=ssologin.js(v1.4.5)" + "&_=" + new Date().getTime(); return getParaFromResult(get(url)); } /** * 预登陆是否需要验证码 * @param pubkeyMap * @return */ private String getPin(HashMap<String, String> pubkeyMap) { String imgUrl = null; int isShowpin = 0; if (pubkeyMap != null) { String showpin = pubkeyMap.get("showpin"); if (showpin != null) { isShowpin = Integer.parseInt(showpin); if (isShowpin == 1) { String url = "https://login.sina.com.cn/cgi/pin.php?" + "r=" + Math.floor(Math.random() * 100000000) + "&s=0" + "&p=" + pubkeyMap.get("pcid"); imgUrl = url; } } } return imgUrl; } /** * 确认登陆后是否需要再验证 * @return */ private String isHasPinAgain(HashMap<String, String> pubkeyMap, HashMap<String, String> ticketMap) { String imgUrl = null; int isHasPin = 0; if ((pubkeyMap != null) && (ticketMap != null)) { //意为"为了您的帐号安全,请输入验证码" String str = "\\\\u4e3a\\\\u4e86\\\\u60a8\\\\u7684\\\\u5e10\\\\u53f7\\\\u5b89" + "\\\\u5168\\\\uff0c\\\\u8bf7\\\\u8f93\\\\u5165\\\\u9a8c\\\\u8bc1\\\\u7801"; if (ticketMap.containsKey("reason")) { String reasonStr = ticketMap.get("reason"); if (reasonStr.equals(str)) { isHasPin = 1; String url = "https://login.sina.com.cn/cgi/pin.php?" + "r=" + Math.floor(Math.random() * 100000000) + "&s=0" + "&p=" + pubkeyMap.get("pcid"); imgUrl = url; } } } return imgUrl; } /** * 获取验证码 */ public String getVCode(String pcid) { String imgUrl = null; if (pcid != null) { String url = "https://login.sina.com.cn/cgi/pin.php?" + "r=" + Math.floor(Math.random() * 100000000) + "&s=0" + "&p=" + pcid; imgUrl = url; } return imgUrl; } /** * 保存验证码 * @param url 验证码链接 */ public void saveVCodeImg(String url) { GetMethod getImages = new GetMethod(url); try { int status = httpClient.executeMethod(getImages); if (status == HttpStatus.SC_OK) { FileOutputStream outputStream = new FileOutputStream("vc.jpg"); outputStream.write(getImages.getResponseBody()); outputStream.close(); } } catch (HttpException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * 无验证码时模拟用户登录,并获取ticket * @param usernameBase64 使用Base64加密的用户名 * @param sp 使用SHA1加密后的用户密码 * @return * @throws Exception */ private HashMap<String, String> getTicket(String usernameBase64, String sp, HashMap<String, String> pubkeyMap) throws Exception { String url = null; if (pubkeyMap != null) { url = "https://login.sina.com.cn/sso/login.php?" + "entry=openapi&" + "gateway=1&" + "from=&" + "savestate=0&" + "useticket=1&" + "pagerefer=&" + "ct=1800&" + "s=1&" + "vsnf=1&" + "vsnval=&" + "door=&" + "su="+ usernameBase64 + "&" + "service=miniblog&" + "servertime="+ pubkeyMap.get("servertime") + "&" + "nonce="+ pubkeyMap.get("nonce") + "&" + "pwencode=rsa&" + "rsakv="+ pubkeyMap.get("rsakv") + "&" + "sp="+ sp + "&" + "encoding=UTF-8&" + "callback=sinaSSOController.loginCallBack&" + "cdult=2&" + "domain=weibo.com&" + "prelt=37&" + "returntype=TEXT&" + "client=ssologin.js(v1.4.5)&" + "_=" + new Date().getTime(); } return getParaFromResult(get(url)); } /** * 有验证码时模拟用户登录,并获取ticket * @param usernameBase64 * @param sp * @param pin * @param pcid * @param servertime * @param nonce * @param rsakv * @return * @throws Exception */ public HashMap<String, String> getTicket(String usernameBase64, String sp, String pin, String pcid, String servertime,String nonce,String rsakv) throws Exception { String url = "https://login.sina.com.cn/sso/login.php?" + "entry=openapi&" + "gateway=1&" + "from=&" + "savestate=0&" + "useticket=1&" + "pagerefer=&" + "pcid=" + pcid + "&" + "ct=1800&" + "s=1&" + "vsnf=1&" + "vsnval=&" + "door=" + pin + "&" + "su="+ usernameBase64 + "&" + "service=miniblog&" + "servertime="+ servertime + "&" + "nonce="+ nonce + "&" + "pwencode=rsa&" + "rsakv="+ rsakv + "&" + "sp="+ sp + "&" + "encoding=UTF-8&" + "callback=sinaSSOController.loginCallBack&" + "cdult=2&" + "domain=weibo.com&" + "prelt=37&" + "returntype=TEXT&" + "client=ssologin.js(v1.4.5)&" + "_=" + new Date().getTime(); return getParaFromResult(get(url)); } /** * 分析结果,取出所需参数 * @param result 页面内容 * @return */ private HashMap<String, String> getParaFromResult(String result) { HashMap<String, String> hm = new HashMap<String, String>(); result = result.substring(result.indexOf("{") + 1, result.indexOf("}")); String[] r = result.split(","); String[] temp; for (int i = 0; i < r.length; i++) { temp = r[i].split(":"); for (int j = 0; j < 2; j++) { if (temp[j].contains("\\"")) temp[j] = temp[j].substring(1, temp[j].length() - 1); } hm.put(temp[0], temp[1]); } return hm; } /** * 执行给定的URL,并输出目标URL返回的页面结果 * @param url * @return * @throws IOException */ private String get(String url) throws IOException { String surl = null; GetMethod getMethod = new GetMethod(url); int status = httpClient.executeMethod(getMethod); if (status == HttpStatus.SC_OK) { surl = new String(getMethod.getResponseBody(), "UTF-8"); } getMethod.releaseConnection(); return surl; } /** * 配置信息初始化 * @return */ private Properties initProperties() { Properties prop = new Properties(); try { prop.load(Thread.currentThread().getContextClassLoader(). getResourceAsStream("config.properties")); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return prop; } /** * @param args */ public static void main(String[] args) { WeiboLoginer loginer = new WeiboLoginer(); LoginParams loginParams = loginer.doLogin("",""); //有验证码时 if (loginParams.getCode() == null) { String pcid = loginParams.getPcid(); String nonce = loginParams.getNonce(); String rsakv = loginParams.getRsakv(); String servertime = loginParams.getServertime(); String sp = loginParams.getSp(); System.err.println(loginParams.getImgUrl()); //再次获取验证码 System.err.println(loginer.getVCode(pcid)); Scanner input = new Scanner(System.in); String pin = input.nextLine(); LoginParams loginResult = loginer.doLoginByPin("",sp, pin, pcid, servertime, nonce, rsakv); if (!loginResult.isLogin()) { System.err.println("验证码错误!重新录入"); //获取验证码并保存(测试) String imgUrl = loginer.getVCode(pcid); loginer.saveVCodeImg(imgUrl); Scanner input1= new Scanner(System.in); String pin1 = input1.nextLine(); String code = loginer.doLoginByPin("",sp, pin1, pcid, servertime, nonce, rsakv).getCode(); System.out.println(SinaWeiboOAuth.getToken(code)); } }else { //无验证码时 String code = loginParams.getCode(); System.out.println(SinaWeiboOAuth.getToken(code)); } } }

参考地址 http://www.cnblogs.com/zhengbing/p/3459249.html

以上是关于新浪微博模拟登陆+数据抓取(java实现)的主要内容,如果未能解决你的问题,请参考以下文章

Java模拟新浪微博登陆抓取数据

Java模拟新浪微博登陆抓取数据

腾讯微博模拟登陆+数据抓取(java实现)

java+selenium模拟登陆新浪微博demo

定向爬虫 - Python模拟新浪微博登录

定向爬虫 - Python模拟新浪微博登录