HttpClient GET 方式抓取数据

Posted 2020-09-23 啄木鸟伍迪

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了 HttpClient GET 方式抓取数据的相关知识，希望对你有一定的参考价值。

/**
    * Downloads the page at {@code url} with an HTTP GET and returns its body as a single string.
    *
    * <p>The body is decoded as GBK and read line by line. Line terminators are dropped, so the
    * result is the plain concatenation of all lines (callers receive exactly what the previous
    * implementation produced).
    *
    * <p>When the response has no entity, {@code Remind.remind} is called with the value of
    * {@code ErrorLog.getVerietyLog()} and a failure message, and {@code null} is returned.
    * I/O and parse errors are printed via {@code printStackTrace()} and also yield {@code null}.
    *
    * @param url address of the page to fetch
    * @return the page text without line breaks, or {@code null} if nothing could be read
    */
   private static String pickData(String url) {
       CloseableHttpClient httpclient = HttpClients.createDefault();
       try {
           HttpGet httpget = new HttpGet(url);
           // try-with-resources closes the response even if reading the body fails.
           try (CloseableHttpResponse response = httpclient.execute(httpget)) {
               HttpEntity entity = response.getEntity();
               if (entity == null) {
                   String content = "热门综艺节目抓取失败,请检查";
                   ErrorLog el = new ErrorLog();
                   Remind remind = new Remind();
                   remind.remind(el.getVerietyLog(), content);
                   return null;
               }
               // The reader is closed before the response, which also closes the entity stream.
               try (BufferedReader br =
                       new BufferedReader(new InputStreamReader(entity.getContent(), "gbk"))) {
                   // StringBuilder avoids the quadratic cost of String += inside the loop.
                   StringBuilder page = new StringBuilder();
                   String line;
                   while ((line = br.readLine()) != null) {
                       page.append(line);
                   }
                   return page.toString();
               }
           }
       } catch (IOException | ParseException e) {
           // ClientProtocolException is an IOException, so it is handled here as well.
           e.printStackTrace();
       } finally {
           // Best-effort close: a failure here must not discard a page that was already read.
           try {
               httpclient.close();
           } catch (IOException e) {
               e.printStackTrace();
           }
       }
       return null;
   }

   /**
    * Parses {@code html} with jsoup and collects up to three entries from elements whose
    * class is {@code keyword}.
    *
    * <p>For each of the first three matching elements, the text of its first child is stored in
    * {@code hot1}, {@code hot2} and {@code hot3} of the returned {@code Variety}, in document
    * order. Slots without a matching element are set to {@code null}.
    *
    * <p>When {@code html} is {@code null} or no {@code keyword} element is found,
    * {@code Remind.remind} is called with the value of {@code ErrorLog.getVerietyLog()} and a
    * failure message, and {@code null} is returned.
    *
    * @param html page markup to parse; may be {@code null}
    * @return the populated {@code Variety}, or {@code null} when nothing could be extracted
    */
   private static Variety analyzehtmlByString(String html) {
       Elements array = null;
       // A null page (a failed download) is routed to the failure path instead of
       // being passed to Jsoup.parse, which rejects null input.
       if (html != null) {
           array = Jsoup.parse(html).getElementsByClass("keyword");
           System.out.println(array.size());
       }
       if (array == null || array.isEmpty()) {
           String content = "热门综艺节目抓取失败,请检查";
           ErrorLog el = new ErrorLog();
           Remind remind = new Remind();
           remind.remind(el.getVerietyLog(), content);
           return null;
       }

       String[] arr = new String[3];
       // One loop covers both "three or more" and "fewer than three" matches.
       int count = Math.min(array.size(), arr.length);
       for (int i = 0; i < count; i++) {
           arr[i] = array.get(i).child(0).text();
       }

       Variety v = new Variety();
       v.setHot1(arr[0]);
       v.setHot2(arr[1]);
       v.setHot3(arr[2]);
       return v;
   }

以上是关于 HttpClient GET 方式抓取数据的主要内容，如果未能解决你的问题，请参考以下文章：