Java Web Crawler (httpclient & jsoup)
Posted by Wenqihe
Introduction
A web crawler is a program or script that automatically fetches information from web pages according to certain rules.
httpclient and jsoup can be used together to fetch and parse web pages.
httpclient
- GET request

public static void main(String[] args) throws IOException {
    // Create the HttpClient
    CloseableHttpClient httpClient = HttpClients.createDefault();
    // Create the HttpGet request
    HttpGet httpGet = new HttpGet("http://www.jd.com/");
    CloseableHttpResponse response = null;
    try {
        // Send the request with HttpClient
        response = httpClient.execute(httpGet);
        // Check whether the response status code is 200
        if (response.getStatusLine().getStatusCode() == 200) {
            // 200 means the request succeeded; read the response body
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            // Print the content
            System.out.println(content);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the connection
        if (response != null) {
            try {
                response.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        httpClient.close();
    }
}
- GET request (with query parameters)

public static void main(String[] args) throws IOException {
    // Create the HttpClient
    CloseableHttpClient httpClient = HttpClients.createDefault();
    // Create the HttpGet request, with the query parameter in the URI
    String uri = "http://www.jd.com/search?keys=Java";
    HttpGet httpGet = new HttpGet(uri);
    CloseableHttpResponse response = null;
    try {
        // Send the request with HttpClient
        response = httpClient.execute(httpGet);
        // Check whether the response status code is 200
        if (response.getStatusLine().getStatusCode() == 200) {
            // 200 means the request succeeded; read the response body
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            // Print the content
            System.out.println(content);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the connection
        if (response != null) {
            try {
                response.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        httpClient.close();
    }
}
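Instead of concatenating the query string by hand, the parameters can also be assembled with HttpClient's URIBuilder (from org.apache.http.client.utils). A minimal sketch, reusing the same illustrative host and parameter as above:

// Build the URI programmatically; yields http://www.jd.com/search?keys=Java
// Note: the URIBuilder constructor and build() throw URISyntaxException,
// so the enclosing method must declare or handle it
URIBuilder uriBuilder = new URIBuilder("http://www.jd.com/search");
uriBuilder.setParameter("keys", "Java");
HttpGet httpGet = new HttpGet(uriBuilder.build());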
- POST request

public static void main(String[] args) throws IOException {
    // Create the HttpClient
    CloseableHttpClient httpClient = HttpClients.createDefault();
    // Create the HttpPost request
    HttpPost httpPost = new HttpPost("http://www.jd.com/");
    CloseableHttpResponse response = null;
    try {
        // Send the request with HttpClient
        response = httpClient.execute(httpPost);
        // Check whether the response status code is 200
        if (response.getStatusLine().getStatusCode() == 200) {
            // 200 means the request succeeded; read the response body
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            // Print the content
            System.out.println(content);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the connection
        if (response != null) {
            try {
                response.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        httpClient.close();
    }
}
- POST request (with form parameters)

public static void main(String[] args) throws IOException {
    // Create the HttpClient
    CloseableHttpClient httpClient = HttpClients.createDefault();
    // Create the HttpPost request
    HttpPost httpPost = new HttpPost("http://www.jd.com/");
    // Declare a List to hold the form parameters
    List<NameValuePair> params = new ArrayList<NameValuePair>();
    params.add(new BasicNameValuePair("keys", "java"));
    // Create the form-data entity
    UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "UTF-8");
    // Attach the form entity to the HttpPost request
    httpPost.setEntity(formEntity);
    CloseableHttpResponse response = null;
    try {
        // Send the request with HttpClient
        response = httpClient.execute(httpPost);
        // Check whether the response status code is 200
        if (response.getStatusLine().getStatusCode() == 200) {
            // 200 means the request succeeded; read the response body
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            // Print the content
            System.out.println(content);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the connection
        if (response != null) {
            try {
                response.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        httpClient.close();
    }
}
- httpclient connection pool

private static void doGet(PoolingHttpClientConnectionManager cm) {
    // Create an HttpClient backed by the connection manager
    CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    // Create the HttpGet request
    HttpGet httpGet = new HttpGet("http://www.jd.com/");
    CloseableHttpResponse response = null;
    try {
        response = httpClient.execute(httpGet);
        // Check whether the status code is 200
        if (response.getStatusLine().getStatusCode() == 200) {
            // Parse the data
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(content.length());
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the connection
        if (response != null) {
            try {
                response.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        // Do not close the HttpClient here: its connections belong to the pool
        //httpClient.close();
    }
}
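The doGet method above expects an already-configured connection manager. A minimal sketch of the setup, with illustrative pool sizes (100 connections in total, 10 per target host):

public static void main(String[] args) {
    // Create the pooling connection manager shared by all requests
    PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
    // Maximum number of connections in the whole pool (illustrative value)
    cm.setMaxTotal(100);
    // Maximum number of connections per route, i.e. per target host (illustrative value)
    cm.setDefaultMaxPerRoute(10);
    // Each call borrows a connection from the pool and returns it afterwards
    doGet(cm);
    doGet(cm);
}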
- Setting request parameters

public static void main(String[] args) throws IOException {
    // Create the HttpClient
    CloseableHttpClient httpClient = HttpClients.createDefault();
    // Create the HttpGet request
    HttpGet httpGet = new HttpGet("http://www.jd.com/");
    // Configure the request
    RequestConfig requestConfig = RequestConfig.custom()
            .setConnectTimeout(1000)            // max time to establish the connection
            .setConnectionRequestTimeout(500)   // max time to obtain a connection from the manager
            .setSocketTimeout(10 * 1000)        // max time for data transfer
            .build();
    httpGet.setConfig(requestConfig);
    CloseableHttpResponse response = null;
    try {
        // Send the request with HttpClient
        response = httpClient.execute(httpGet);
        // Check whether the response status code is 200
        if (response.getStatusLine().getStatusCode() == 200) {
            // 200 means the request succeeded; read the response body
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            // Print the content
            System.out.println(content);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the connection
        if (response != null) {
            try {
                response.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        httpClient.close();
    }
}
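The same configuration can also be applied to every request at once by registering it as the client default, a sketch reusing the illustrative timeouts above:

RequestConfig defaultConfig = RequestConfig.custom()
        .setConnectTimeout(1000)
        .setConnectionRequestTimeout(500)
        .setSocketTimeout(10 * 1000)
        .build();
// Every request sent through this client inherits defaultConfig
CloseableHttpClient httpClient = HttpClients.custom()
        .setDefaultRequestConfig(defaultConfig)
        .build();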
jsoup (generally used only as a parsing tool)
Main features of jsoup:
- Parse HTML from a URL, file, or string (see the sketch after this list)
- Find and extract data using DOM traversal or CSS selectors
- Manipulate HTML elements, attributes, and text
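For example, besides URLs, jsoup can parse HTML from a string or a local file. A minimal sketch; the HTML snippet and the file path test.html are illustrative, and the enclosing method must declare IOException for the file variant (imports: java.io.File, org.jsoup.Jsoup, org.jsoup.nodes.Document):

// Parse HTML from a string
String html = "<html><head><title>test page</title></head><body>hello</body></html>";
Document doc = Jsoup.parse(html);
System.out.println(doc.title()); // test page

// Parse HTML from a local file (illustrative path), specifying the charset
Document fileDoc = Jsoup.parse(new File("test.html"), "UTF-8");
System.out.println(fileDoc.title());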
Parsing a URL (jsoup can take a URL directly, issue the request, and wrap the returned data in a Document object)
public void testJsoupUrl() throws Exception {
    // Parse the URL (the second argument is the timeout in milliseconds)
    Document document = Jsoup.parse(new URL("http://www.jd.com/"), 1000);
    // Get the content of the title element
    Element title = document.getElementsByTag("title").first();
    System.out.println(title.text());
}
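jsoup also ships a fluent connect API that fetches and parses in one call; a minimal sketch against the same illustrative URL:

// Fetch the page and parse it into a Document in one call
Document document = Jsoup.connect("http://www.jd.com/")
        .timeout(1000) // request timeout in milliseconds
        .get();        // issues a GET; throws IOException on failure
System.out.println(document.title());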
Traversing the document with the DOM
- Getting elements

1. By id: getElementById
2. By tag: getElementsByTag
3. By class: getElementsByClass
4. By attribute: getElementsByAttribute

// 1. Get an element by id with getElementById
Element element = document.getElementById("city_bj");
// 2. Get elements by tag with getElementsByTag
element = document.getElementsByTag("title").first();
// 3. Get elements by class with getElementsByClass
element = document.getElementsByClass("s_name").last();
// 4. Get elements by attribute with getElementsByAttribute
element = document.getElementsByAttribute("abc").first();
element = document.getElementsByAttributeValue("class", "city_con").first();
- Getting data from an element

1. Get the element's id
2. Get the element's className
3. Get an attribute value: attr
4. Get all attributes: attributes
5. Get the text content: text

// Get the element
Element element = document.getElementById("test");
// 1. Get the element's id
String str = element.id();
// 2. Get the element's className
str = element.className();
// 3. Get an attribute value with attr
str = element.attr("id");
// 4. Get all attributes with attributes
str = element.attributes().toString();
// 5. Get the text content with text
str = element.text();
- Finding elements with selector syntax
tagname: find elements by tag, e.g. span
// tagname: find elements by tag, e.g. span
Elements span = document.select("span");
for (Element element : span) {
System.out.println(element.text());
}
#id: find elements by ID, e.g. #city_bj
// #id: find elements by ID, e.g. #city_bj
String str = document.select("#city_bj").text();
.class: find elements by class name, e.g. .class_a
// .class: find elements by class name, e.g. .class_a
str = document.select(".class_a").text();
[attribute]: find elements by attribute, e.g. [abc]
// [attribute]: find elements by attribute, e.g. [abc]
str = document.select("[abc]").text();
[attr=value]: find elements by attribute value, e.g. [class=s_name]
// [attr=value]: find elements by attribute value, e.g. [class=s_name]
str = document.select("[class=s_name]").text();
- Combining selectors
el#id: element + ID, e.g. h3#city_bj
// el#id: element + ID, e.g. h3#city_bj
String str = document.select("h3#city_bj").text();
el.class: element + class, e.g. li.class_a
// el.class: element + class, e.g. li.class_a
str = document.select("li.class_a").text();
el[attr]: element + attribute name, e.g. span[abc]
// el[attr]: element + attribute name, e.g. span[abc]
str = document.select("span[abc]").text();
Arbitrary combinations, e.g. span[abc].s_name
// Arbitrary combinations, e.g. span[abc].s_name
str = document.select("span[abc].s_name").text();
ancestor child: find descendants of an element, e.g. .city_con li finds all li under .city_con
// ancestor child: find descendants of an element, e.g. .city_con li finds all li under .city_con
str = document.select(".city_con li").text();
parent > child: find the direct children of a parent, e.g. .city_con > ul > li first finds the ul elements that are direct children of .city_con, then the li elements that are direct children of those ul
// parent > child: find the direct children of a parent
// e.g. .city_con > ul > li: the direct-child ul under .city_con, then the direct-child li under each ul
str = document.select(".city_con > ul > li").text();
parent > *: find all direct children of a parent
// parent > *: find all direct children of a parent, e.g. .city_con > *
str = document.select(".city_con > *").text();