Java爬虫

Posted 2020-10-10 一抹微笑~的博客

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了Java爬虫相关的知识，希望对你有一定的参考价值。

作为一位Java爬虫的初学者，分享一下自己的心得。
所用到的jar包

org.codehaus.jettison.jar

jsoup-1.7.3.jar

个人认为爬虫的实现机制：
获取Document对象 —> 获取节点 —> 输出或者持久化

获取页面的图片地址

获取Document对象 —> 获取Img元素 —> 输出地址

 1 package com.cn.basic;
 2 
 3 import java.io.IOException;
 4 import org.jsoup.Jsoup;
 5 import org.jsoup.nodes.Document;
 6 import org.jsoup.nodes.Element;
 7 import org.jsoup.select.Elements;
 8 
 9 public class ImageDemo1 {
10 
11     public static void Get_Url(String htmlUrl, String path) {
12 
13         try {
14             Document doc = Jsoup.connect(htmlUrl).get();
15 
16             Element body = doc.body();
17             Elements elements = body.select("img");
18 
19             String src = "";
20             for (Element element : elements) {
21 
22                 src = element.attr("src");
23 
24                 System.out.println(path + src);
25 
26             }
27 
28             System.out.println("elements-size: " + elements.size());
29 
30         } catch (IOException e) {
31             e.printStackTrace();
32         }
33 
34     }
35 
36     public static void main(String[] args) {
37 
38         String url = "http://pic.netbian.com/4kkatongdongman/index_2.html";
39         String path = "http://pic.netbian.com";
40         Get_Url(url, path);
41 
42     }
43 
44 }

View Code

将图片写入本地

获取Document对象 —> 获取Img元素 —> 将图片保存本地

  1 package com.cn.basic;
  2 
  3 import java.io.ByteArrayOutputStream;
  4 import java.io.File;
  5 import java.io.FileOutputStream;
  6 import java.io.IOException;
  7 import java.io.InputStream;
  8 import java.net.HttpURLConnection;
  9 import java.net.URL;
 10 import java.util.Date;
 11 
 12 import org.jsoup.Jsoup;
 13 import org.jsoup.nodes.Document;
 14 import org.jsoup.nodes.Element;
 15 import org.jsoup.select.Elements;
 16 
 17 public class ImageDemo2 {
 18 
 19     public static void saveImage(String htmlUrl, String path) {
 20 
 21         try {
 22             Document doc = Jsoup.connect(htmlUrl).get();
 23             Element body = doc.body();
 24             Elements elements = body.select("img");
 25             
 26             String outputFilePath="E:/pythonTest/javaPython/imgs/";
 27             String src = "";
 28 
 29             HttpURLConnection conn = null;
 30             InputStream inStream = null;
 31             byte[] data = null;
 32             String filePath = null;
 33             FileOutputStream outStream = null;
 34             
 35             Long startTime=new Date().getTime();
 36             
 37             for (Element element : elements) {
 38 
 39                 src = element.attr("src");
 40 
 41                 System.out.println(path + src);
 42                 // new一个URL对象
 43 
 44                 if (!src.contains(".jpg")) {
 45                     continue;
 46                 }
 47 
 48                 URL url = new URL(path + src);
 49                 // 打开链接
 50                 conn = (HttpURLConnection) url.openConnection();
 51                 // 设置请求方式为"GET"
 52                 conn.setRequestMethod("GET");
 53                 // 超时响应时间为5秒
 54                 conn.setConnectTimeout(5 * 1000);
 55                 // 通过输入流获取图片数据
 56                 inStream = conn.getInputStream();
 57                 // 得到图片的二进制数据，以二进制封装得到数据，具有通用性
 58                 data = readInputStream(inStream);
 59                 // new一个文件对象用来保存图片，默认保存当前工程根目录
 60                 filePath = outputFilePath + System.currentTimeMillis() + ".jpg";
 61                 // 创建输出流
 62                 outStream = new FileOutputStream(new File(filePath));
 63                 // 写入数据
 64                 outStream.write(data);
 65                 // 关闭输出流
 66                 outStream.close();
 67 
 68             }
 69             System.out.println(elements.size());
 70             System.out.println("读写速度："+(new Date().getTime()-startTime)+"毫秒");
 71             
 72 
 73         } catch (IOException e) {
 74             e.printStackTrace();
 75         } catch (Exception e) {
 76             e.printStackTrace();
 77         }
 78 
 79     }
 80 
 81     public static byte[] readInputStream(InputStream inStream) throws Exception {
 82         ByteArrayOutputStream outStream = new ByteArrayOutputStream();
 83         // 创建一个Buffer字符串
 84         byte[] buffer = new byte[1024];
 85         // 每次读取的字符串长度，如果为-1，代表全部读取完毕
 86         int len = 0;
 87         // 使用一个输入流从buffer里把数据读取出来
 88         while ((len = inStream.read(buffer)) != -1) {
 89             // 用输出流往buffer里写入数据，中间参数代表从哪个位置开始读，len代表读取的长度
 90             outStream.write(buffer, 0, len);
 91         }
 92         // 关闭输入流
 93         inStream.close();
 94         // 把outStream里的数据写入内存
 95         return outStream.toByteArray();
 96     }
 97 
 98     public static void main(String[] args) {
 99         String url = "http://pic.netbian.com/4kkatongdongman/index_2.html";
100         String path = "http://pic.netbian.com";
101         saveImage(url, path);
102     }
103 
104 }

View Code

以上是关于Java爬虫的主要内容，如果未能解决你的问题，请参考以下文章