httpclient:Ip 代理

Posted Michael2397

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了httpclient:Ip 代理相关的知识,希望对你有一定的参考价值。

参考:http://blog.csdn.net/sdfiiiiii/article/details/70432060  http://blog.csdn.net/qy20115549/article/details/54945974

第一篇博客的代码可以从 http://www.xicidaili.com/ 等网站抓取代理 IP,并逐个测试是否可用(测试结果貌似不是很准),然后把可用的代理 IP 放到一个 List 中。

第二篇博客的代码是直接把代理 IP 和端口写进代码里再发起请求,可以用来测试某个代理 IP 是否可用。

第一篇博客

<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.28</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
import com.alibaba.fastjson.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls free-proxy listing pages and collects the proxies that pass a
 * connectivity check.
 *
 * <p>Requires com.alibaba.fastjson.JSONObject and Jsoup on the classpath.
 * Per-call state is kept in {@link ThreadLocal}s and is reset at the start of
 * every {@link #startCrawler(int)} call.
 */
public class ProxyCralwerUnusedVPN {

    /**
     * Matches an "ip port" pair in the page text: a dotted quad, one space,
     * then a 1-6 digit port. Compiled once because it never changes.
     */
    private static final Pattern IP_PORT_PATTERN =
            Pattern.compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}");

    ThreadLocal<Integer> localWantedNumber = new ThreadLocal<Integer>();
    ThreadLocal<List<ProxyInfo>> localProxyInfos = new ThreadLocal<List<ProxyInfo>>();

    public static void main(String[] args) {
        ProxyCralwerUnusedVPN proxyCrawler = new ProxyCralwerUnusedVPN();
        // Number of proxies wanted; chosen by the caller. Asking for many
        // makes the call slower because every candidate is probed.
        proxyCrawler.startCrawler(1);
    }

    /**
     * Entry point for external callers.
     *
     * @param wantedNumber number of working proxies the caller would like to get
     * @return a JSON string built from a {@code ProxyResponse} whose data map
     *         holds {@code numFound}, {@code pageNum} and the {@code proxy} list
     */
    public String startCrawler(int wantedNumber) {
        localWantedNumber.set(wantedNumber);
        // Start from an empty list so results from an earlier call on this
        // thread are not returned again.
        localProxyInfos.set(new ArrayList<ProxyInfo>());

        kuaidailiCom("http://www.xicidaili.com/nn/", 15);
        kuaidailiCom("http://www.xicidaili.com/nt/", 15);
        kuaidailiCom("http://www.xicidaili.com/wt/", 15);
        kuaidailiCom("http://www.kuaidaili.com/free/inha/", 15);
        kuaidailiCom("http://www.kuaidaili.com/free/intr/", 15);
        kuaidailiCom("http://www.kuaidaili.com/free/outtr/", 15);

        // Build the response payload.
        List<ProxyInfo> found = currentProxies();
        ProxyResponse response = new ProxyResponse();
        response.setSuccess("true");
        Map<String, Object> dataInfoMap = new HashMap<String, Object>();
        dataInfoMap.put("numFound", found.size());
        dataInfoMap.put("pageNum", 1);
        dataInfoMap.put("proxy", found);
        response.setData(dataInfoMap);
        String responseString = JSONObject.toJSON(response).toString();
        System.out.println(responseString);
        return responseString;
    }

    /**
     * Walks pages {@code 1..totalPage} under {@code baseUrl}, extracts every
     * "ip port" pair from the page text and keeps the ones that pass
     * {@link #checkProxy(String, Integer)}. Stops as soon as the wanted number
     * of proxies has been collected.
     *
     * @param baseUrl   listing URL prefix; the page number and "/" are appended
     * @param totalPage number of pages to visit (inclusive)
     */
    private void kuaidailiCom(String baseUrl, int totalPage) {
        int wanted = localWantedNumber.get();

        // "<=" so that all totalPage pages are visited, not totalPage - 1.
        for (int page = 1; page <= totalPage; page++) {
            if (getCurrentProxyNumber() >= wanted) {
                return;
            }
            try {
                // NOTE(review): Host and Referer are hard-coded for
                // kuaidaili.com even when baseUrl points at xicidaili.com.
                Document doc = Jsoup.connect(baseUrl + page + "/")
                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                        .header("Accept-Encoding", "gzip, deflate, sdch")
                        .header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6")
                        .header("Cache-Control", "max-age=0")
                        .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36")
                        .header("Cookie", "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=GA1.2.1061361785.1462812244")
                        .header("Host", "www.kuaidaili.com")
                        .header("Referer", "http://www.kuaidaili.com/free/outha/")
                        .timeout(30 * 1000)
                        .get();
                Matcher m = IP_PORT_PATTERN.matcher(doc.text());

                while (m.find()) {
                    if (getCurrentProxyNumber() >= wanted) {
                        break;
                    }
                    String[] strs = m.group().split(" ");
                    String ip = strs[0];
                    String port = strs[1];
                    if (checkProxy(ip, Integer.parseInt(port))) {
                        System.out.println("获取到可用代理IP\t" + ip + "\t" + port);
                        addProxy(ip, port, "http");
                    }
                }
            } catch (Exception e) {
                // Best effort: a page that fails to load or parse must not
                // abort the remaining pages.
                e.printStackTrace();
            }
        }
    }

    /**
     * Probes a proxy by fetching a fixed page through it with a 2 second timeout.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return {@code true} if the request completed without an exception
     */
    private static boolean checkProxy(String ip, Integer port) {
        try {
            // http://1212.ip138.com/ic.asp can be replaced by any fast page.
            Jsoup.connect("http://1212.ip138.com/ic.asp")
                    .timeout(2 * 1000)
                    .proxy(ip, port)
                    .get();
            return true;
        } catch (Exception e) {
            return false;
        }
    }

    /** Returns how many proxies the current thread has collected so far. */
    private int getCurrentProxyNumber() {
        return currentProxies().size();
    }

    /** Appends a proxy to the current thread's result list. */
    private void addProxy(String ip, String port, String protocol) {
        currentProxies().add(new ProxyInfo(ip, port, protocol));
    }

    /**
     * Returns the current thread's result list, creating AND storing it on
     * first use so that additions are never made to a throw-away list.
     */
    private List<ProxyInfo> currentProxies() {
        List<ProxyInfo> proxyInfos = localProxyInfos.get();
        if (proxyInfos == null) {
            proxyInfos = new ArrayList<ProxyInfo>();
            localProxyInfos.set(proxyInfos);
        }
        return proxyInfos;
    }
}



/**
 * Mutable bean describing one proxy: address, protocol type and optional
 * credentials. Exposed through getters/setters so it can be serialized as a
 * bean.
 */
class ProxyInfo {
    // Address and protocol, supplied through the constructor.
    private String ip;
    private String port;
    private String type;

    // Credentials default to empty strings when none are set.
    private String userName = "";
    private String password = "";

    // Defaults to 1. NOTE(review): the meaning of this flag is not visible here.
    private int is_internet = 1;

    /**
     * @param ip   proxy host
     * @param port proxy port, kept as text
     * @param type protocol label for the proxy
     */
    public ProxyInfo(String ip, String port, String type) {
        setIp(ip);
        setPort(port);
        setType(type);
    }

    public String getIp() {
        return this.ip;
    }

    public void setIp(String newIp) {
        ip = newIp;
    }

    public String getPort() {
        return this.port;
    }

    public void setPort(String newPort) {
        port = newPort;
    }

    public String getType() {
        return this.type;
    }

    public void setType(String newType) {
        type = newType;
    }

    public String getUserName() {
        return this.userName;
    }

    public void setUserName(String newUserName) {
        userName = newUserName;
    }

    public String getPassword() {
        return this.password;
    }

    public void setPassword(String newPassword) {
        password = newPassword;
    }

    public int getIs_internet() {
        return this.is_internet;
    }

    public void setIs_internet(int flag) {
        is_internet = flag;
    }
}

/**
 * Response envelope: a success marker plus a free-form data map. Both fields
 * start out {@code null} until set.
 */
class ProxyResponse {
    // Payload entries keyed by name.
    private Map<String, Object> data;
    // Success marker, stored as text.
    private String success;

    public Map<String, Object> getData() {
        return this.data;
    }

    public void setData(Map<String, Object> payload) {
        data = payload;
    }

    public String getSuccess() {
        return this.success;
    }

    public void setSuccess(String flag) {
        success = flag;
    }
}

 

 第二篇博客

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Proxy;
import java.net.URL;
import java.net.URLConnection;

/**
 * Fetches a page through an explicitly specified HTTP proxy. Useful for
 * checking by hand whether a given proxy works.
 */
public class GetHtml {
    /** Connect timeout in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 3000;
    /** Read timeout in milliseconds, so an unresponsive proxy cannot hang the call forever. */
    private static final int READ_TIMEOUT_MS = 10000;

    public static void main(String[] args) throws UnsupportedEncodingException {
        // Proxy IP, proxy port, and the URL to fetch.
        gethtml("121.61.101.222",808,"http://club.autohome.com.cn/bbs/forum-c-2533-1.html?orderby=dateline&qaType=-1");

    }

    /**
     * Downloads {@code url} through the HTTP proxy at {@code ip:port}, prints
     * the body and returns it.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @param url  URL of the page to fetch
     * @return the page body decoded as gb2312 with each line terminated by
     *         {@code '\n'}, or an empty string when the URL is malformed or
     *         the request fails
     * @throws UnsupportedEncodingException if the gb2312 charset is not available
     */
    public static String gethtml(String ip,int port,String url) throws UnsupportedEncodingException{
        URL target;
        try {
            target = new URL(url);
        } catch (MalformedURLException e1) {
            // A bad URL is the caller's mistake, not a proxy failure; report
            // it as such and stop instead of dereferencing null later.
            e1.printStackTrace();
            return "";
        }
        // Address of the proxy server.
        InetSocketAddress addr = new InetSocketAddress(ip, port);
        Proxy proxy = new Proxy(Proxy.Type.HTTP, addr); // http proxy
        InputStream in = null;
        try {
            URLConnection conn = target.openConnection(proxy);
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);
            in = conn.getInputStream();
        } catch (Exception e) {
            // Request through the proxy failed; name the proxy that failed.
            System.out.println("ip " + ip + ":" + port + " is not available");
        }

        String s = convertStreamToString(in);
        System.out.println(s);
        return s;

    }

    /**
     * Reads the whole stream as gb2312 text and closes it.
     *
     * @param is stream to read; may be {@code null}
     * @return the text with every line terminated by {@code '\n'}; an empty
     *         string when {@code is} is {@code null}. If reading fails midway
     *         the text read so far is returned.
     * @throws UnsupportedEncodingException if the gb2312 charset is not available
     */
    public static String convertStreamToString(InputStream is) throws UnsupportedEncodingException {
        if (is == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        try {
            // Created inside the try so the stream is closed even when the
            // charset lookup fails.
            BufferedReader reader = new BufferedReader(new InputStreamReader(is,"gb2312"));
            String line;
            while ((line = reader.readLine()) != null) {
                // A real newline; the previous "/n" put a literal slash-n
                // between lines.
                sb.append(line).append('\n');
            }
        } catch (UnsupportedEncodingException e) {
            throw e;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return sb.toString();

    }
}

 

以上是关于httpclient:Ip 代理的主要内容,如果未能解决你的问题,请参考以下文章

scrapy按顺序启动多个爬虫代码片段(python3)

spring aop中this和target区别

JAVA之AOP

Extjs使用商店代理api; CRUD

ios block和delegate的区别

Yarn: 一个新的JavaScript模块管理器