Java 爬取网页图片并保存到本地

Posted 你个佬六

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了java爬网页图片到本地相关的知识,希望对你有一定的参考价值。

一、前言

如何用 Java 抓取网页中的图片并保存到本地呢?

二、看代码

package com.expt.ares.web;

import com.alibaba.fastjson2.JSON;
import com.expt.ares.vo.GetImgVO;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@RestController
@RequestMapping("/img")
@Slf4j
public class ImgController 

    // 获取img标签正则
    private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
    // 获取src路径的正则
//    private static final String IMGSRC_REG = "[a-zA-z]+://[^\\\\s]*";
    private static final String IMGSRC_REG = "/uploadfile[^\\\\\\\\s]*.jpg";


    /**
     * 单条抓取
     * @param getImgVO
     * @throws Exception
     *
     * eg:
     *
     * 
     *     "url":"https://www.xiurenb.vip/XiaoYu/11486#x#.html",
     *     "downloadPath":"/Users/wanglei/Documents/imgs/ycc/1/"
     * 
     */
    @PostMapping("/getImg")
    public void getImg(@RequestBody GetImgVO getImgVO) throws Exception 
        String url = getImgVO.getUrl();
        String path = getImgVO.getDownloadPath();
        String mUrl = new String();
        int count = 1;
        for (int i = 0; i < 30; i++) 
            mUrl = url;
            if (i == 0)
                mUrl = mUrl.replaceAll("#x#","");
            else 
                mUrl = mUrl.replaceAll("#x#","_"+i);
            
            log.info(mUrl);
            String html = getHtml(mUrl);
            log.info(html);
            List<String> imageUrl = getImageUrl(html);
            log.info(JSON.toJSONString(imageUrl));

            List<String> imageSrc = getImageSrc(imageUrl);
            log.info(JSON.toJSONString(imageSrc));

            count = download(imageSrc,path,count);
            mUrl = new String();
        
    


    /**
     * 批量抓取
     * @param getImgVOList
     * @throws Exception
     *
     * eg:
     *
     * [
     *     
     *     "url":"https://www.xiurenb.vip/XiaoYu/11526#x#.html",
     *     "downloadPath":"/Users/wanglei/Documents/imgs/ycc/7/"
     *     ,
     *     
     *     "url":"https://www.xiurenb.vip/XiuRen/11808#x#.html",
     *     "downloadPath":"/Users/wanglei/Documents/imgs/ycc/4/"
     *     ,
     *     
     *     "url":"https://www.xiurenb.vip/XiaoYu/11775#x#.html",
     *     "downloadPath":"/Users/wanglei/Documents/imgs/ycc/6/"
     *     ,
     * ]
     *
     *
     */
    @PostMapping("/getImgs")
    public void getImgs(@RequestBody List<GetImgVO> getImgVOList) throws Exception 
        for (GetImgVO vo : getImgVOList)
            getImg(vo);
        
    


    //获取HTML内容
    private String getHtml(String url) throws Exception 
        URL url1 = new URL(url);//使用java.net.URL
        URLConnection connection = url1.openConnection();//打开链接
        InputStream in = connection.getInputStream();//获取输入流
        InputStreamReader isr = new InputStreamReader(in);//流的包装
        BufferedReader br = new BufferedReader(isr);

        String line;
        StringBuffer sb = new StringBuffer();
        while ((line = br.readLine()) != null) //整行读取
            sb.append(line, 0, line.length());//添加到StringBuffer中
            sb.append('\\n');//添加换行符
        
        //关闭各种流,先声明的后关闭
        br.close();
        isr.close();
        in.close();
        return sb.toString();
    

    //获取ImageUrl地址
    private List<String> getImageUrl(String html) 
        Matcher matcher = Pattern.compile(IMGURL_REG).matcher(html);
        List<String> listimgurl = new ArrayList<String>();
        while (matcher.find()) 
            listimgurl.add(matcher.group());
        
        return listimgurl;
    

    //获取ImageSrc地址
    private List<String> getImageSrc(List<String> listimageurl) 
        List<String> listImageSrc = new ArrayList<String>();
        for (String image : listimageurl) 
            Matcher matcher = Pattern.compile(IMGSRC_REG).matcher(image);
            while (matcher.find()) 
                listImageSrc.add("https://p.xiurenb.top/" + matcher.group().substring(0, matcher.group().length()));
            
        
        return listImageSrc;
    

    //下载图片
    private int download(List<String> listImgSrc,String path,int count) 
        try 
            //开始时间
            Date begindate = new Date();
            for (String url : listImgSrc) 
                //开始时间
                Date begindate2 = new Date();
                String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
                URL uri = new URL(url);
                InputStream in = uri.openStream();
                File file = new File(path);
                if (!file.exists() && !file.isDirectory())
                    file.mkdirs();
                
                FileOutputStream fo = new FileOutputStream(new File(path + count + ".jpg"));//文件输出流
                byte[] buf = new byte[1024];
                int length = 0;
                log.info("开始下载:" + url);
                while ((length = in.read(buf, 0, buf.length)) != -1) 
                    fo.write(buf, 0, length);
                
                //关闭流
                in.close();
                fo.close();
                log.info(imageName +"____"+ count + "_____"+ "下载完成");
                count = count + 1;
                //结束时间
                Date overdate2 = new Date();
                double time = overdate2.getTime() - begindate2.getTime();
                log.info("耗时:" + time / 1000 + "s");
            
            Date overdate = new Date();
            double time = overdate.getTime() - begindate.getTime();
            log.info("总耗时:" + time / 1000 + "s");

         catch (Exception e) 
            log.error("下载失败",e);
        
        return count;
    

三、看结果


以上是关于 Java 爬取网页图片到本地的主要内容。如果未能解决你的问题,请参考以下文章:

java读取网页图片路径并下载到本地

Python爬虫爬取网页上的所有图片

python爬虫 将在线html网页中的图片链接替换成本地链接并将html文件下载到本地

java相关。爬虫问题,关于新浪微博。谢谢!

百度鲜花图像爬取

用 Java 爬美女图片,这个厉害了。。