Java 爬取网页图片并保存到本地
Posted 你个佬六
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了使用 Java 爬取网页图片并保存到本地的相关知识,希望对你有一定的参考价值。
一、前言
如何用 Java 抓取网页中的图片并保存到本地呢?
二、看代码
package com.expt.ares.web;
import com.alibaba.fastjson2.JSON;
import com.expt.ares.vo.GetImgVO;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@RestController
@RequestMapping("/img")
@Slf4j
public class ImgController
// 获取img标签正则
private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
// 获取src路径的正则
// private static final String IMGSRC_REG = "[a-zA-z]+://[^\\\\s]*";
private static final String IMGSRC_REG = "/uploadfile[^\\\\\\\\s]*.jpg";
/**
* 单条抓取
* @param getImgVO
* @throws Exception
*
* eg:
*
*
* "url":"https://www.xiurenb.vip/XiaoYu/11486#x#.html",
* "downloadPath":"/Users/wanglei/Documents/imgs/ycc/1/"
*
*/
@PostMapping("/getImg")
public void getImg(@RequestBody GetImgVO getImgVO) throws Exception
String url = getImgVO.getUrl();
String path = getImgVO.getDownloadPath();
String mUrl = new String();
int count = 1;
for (int i = 0; i < 30; i++)
mUrl = url;
if (i == 0)
mUrl = mUrl.replaceAll("#x#","");
else
mUrl = mUrl.replaceAll("#x#","_"+i);
log.info(mUrl);
String html = getHtml(mUrl);
log.info(html);
List<String> imageUrl = getImageUrl(html);
log.info(JSON.toJSONString(imageUrl));
List<String> imageSrc = getImageSrc(imageUrl);
log.info(JSON.toJSONString(imageSrc));
count = download(imageSrc,path,count);
mUrl = new String();
/**
* 批量抓取
* @param getImgVOList
* @throws Exception
*
* eg:
*
* [
*
* "url":"https://www.xiurenb.vip/XiaoYu/11526#x#.html",
* "downloadPath":"/Users/wanglei/Documents/imgs/ycc/7/"
* ,
*
* "url":"https://www.xiurenb.vip/XiuRen/11808#x#.html",
* "downloadPath":"/Users/wanglei/Documents/imgs/ycc/4/"
* ,
*
* "url":"https://www.xiurenb.vip/XiaoYu/11775#x#.html",
* "downloadPath":"/Users/wanglei/Documents/imgs/ycc/6/"
* ,
* ]
*
*
*/
@PostMapping("/getImgs")
public void getImgs(@RequestBody List<GetImgVO> getImgVOList) throws Exception
for (GetImgVO vo : getImgVOList)
getImg(vo);
//获取HTML内容
private String getHtml(String url) throws Exception
URL url1 = new URL(url);//使用java.net.URL
URLConnection connection = url1.openConnection();//打开链接
InputStream in = connection.getInputStream();//获取输入流
InputStreamReader isr = new InputStreamReader(in);//流的包装
BufferedReader br = new BufferedReader(isr);
String line;
StringBuffer sb = new StringBuffer();
while ((line = br.readLine()) != null) //整行读取
sb.append(line, 0, line.length());//添加到StringBuffer中
sb.append('\\n');//添加换行符
//关闭各种流,先声明的后关闭
br.close();
isr.close();
in.close();
return sb.toString();
//获取ImageUrl地址
private List<String> getImageUrl(String html)
Matcher matcher = Pattern.compile(IMGURL_REG).matcher(html);
List<String> listimgurl = new ArrayList<String>();
while (matcher.find())
listimgurl.add(matcher.group());
return listimgurl;
//获取ImageSrc地址
private List<String> getImageSrc(List<String> listimageurl)
List<String> listImageSrc = new ArrayList<String>();
for (String image : listimageurl)
Matcher matcher = Pattern.compile(IMGSRC_REG).matcher(image);
while (matcher.find())
listImageSrc.add("https://p.xiurenb.top/" + matcher.group().substring(0, matcher.group().length()));
return listImageSrc;
//下载图片
private int download(List<String> listImgSrc,String path,int count)
try
//开始时间
Date begindate = new Date();
for (String url : listImgSrc)
//开始时间
Date begindate2 = new Date();
String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
URL uri = new URL(url);
InputStream in = uri.openStream();
File file = new File(path);
if (!file.exists() && !file.isDirectory())
file.mkdirs();
FileOutputStream fo = new FileOutputStream(new File(path + count + ".jpg"));//文件输出流
byte[] buf = new byte[1024];
int length = 0;
log.info("开始下载:" + url);
while ((length = in.read(buf, 0, buf.length)) != -1)
fo.write(buf, 0, length);
//关闭流
in.close();
fo.close();
log.info(imageName +"____"+ count + "_____"+ "下载完成");
count = count + 1;
//结束时间
Date overdate2 = new Date();
double time = overdate2.getTime() - begindate2.getTime();
log.info("耗时:" + time / 1000 + "s");
Date overdate = new Date();
double time = overdate.getTime() - begindate.getTime();
log.info("总耗时:" + time / 1000 + "s");
catch (Exception e)
log.error("下载失败",e);
return count;
三、看结果
以上是关于java爬网页图片到本地的主要内容,如果未能解决你的问题,请参考以下文章