java poi读取pdf word excel文档,读取pdf文字图片

Posted 宇宙磅礴而冷漠

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍如何在 Java 中使用 Apache POI 读取 Word、Excel 文档,以及使用 Apache PDFBox 读取 PDF 中的文字和图片,希望对你有一定的参考价值。

依赖

<dependency>
     	    <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>4.1.0</version>
</dependency>
<dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>4.1.0</version>
</dependency>
 <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml-schemas</artifactId>
            <version>4.1.0</version>
</dependency>
<dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.1.0</version>
</dependency>
        <!--pdf-->
<dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.4</version>
</dependency>

读取pdf文本和图片

/**
 * Extracts the text of every page of a PDF file.
 *
 * <p>Pages are processed one at a time, in page order, and the text of each page is
 * appended to a single result string. The input stream and the parsed document are
 * closed before returning, including when extraction fails part-way.
 *
 * @param file path of the PDF file to read
 * @return the concatenated text of all pages (empty if no text was extracted)
 * @throws IOException if the file cannot be opened, parsed, or read
 */
public String readPDF(String file) throws IOException {
    StringBuilder result = new StringBuilder();
    try (FileInputStream is = new FileInputStream(file)) {
        PDFParser parser = new PDFParser(new RandomAccessBuffer(is));
        parser.parse();
        try (PDDocument doc = parser.getPDDocument()) {
            PDFTextStripper textStripper = new PDFTextStripper();
            // Read lines in positional order; the setting does not change per page,
            // so it is applied once instead of on every iteration.
            textStripper.setSortByPosition(true);
            int pageCount = doc.getNumberOfPages();
            // PDFTextStripper page numbers are 1-based and the end page is inclusive.
            for (int i = 1; i <= pageCount; i++) {
                textStripper.setStartPage(i);
                textStripper.setEndPage(i);
                result.append(textStripper.getText(doc));
            }
            // Optional extension (left disabled): when the document has no text layer,
            // pull the embedded images out of each page and hand them to an OCR service.
            //
            // if (result.toString().trim().length() == 0) {
            //     for (int i = 1; i <= doc.getNumberOfPages(); i++) {
            //         PDPage page = doc.getPage(i - 1);
            //         PDResources resources = page.getResources();
            //         Iterable<COSName> xobjects = resources.getXObjectNames();
            //         if (xobjects != null) {
            //             Iterator<COSName> imageIter = xobjects.iterator();
            //             while (imageIter.hasNext()) {
            //                 COSName cosName = imageIter.next();
            //                 boolean isImageXObject = resources.isImageXObject(cosName);
            //                 if (isImageXObject) {
            //                     // Fetch the image resource of the current page.
            //                     PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName);
            //                     File outputfile = new File("D:\\\\tmp\\\\" + cosName.getName() + ".jpg");
            //                     // The image can be saved locally ...
            //                     ImageIO.write(ixt.getImage(), "jpg", outputfile);
            //                     // ... and then passed to an image-to-text (OCR) API.
            //                 }
            //             }
            //         }
            //     }
            // }
        }
    }
    return result.toString();
}

简单读取word docx/doc文字

/**
 * Reads the plain text of a Word document, picking the extractor from the file extension.
 *
 * <p>{@code .doc} files are read with {@code WordExtractor}; {@code .docx} files are read
 * with {@code XWPFWordExtractor}. The extension check is case-insensitive. All streams and
 * extractors are closed before returning, including when extraction throws.
 *
 * @param path path of the Word file
 * @return the extracted text, or an empty string when the path ends with neither
 *         {@code .doc} nor {@code .docx}
 * @throws IOException if the file cannot be opened or read
 * @throws XmlException declared by the {@code .docx} extractor
 * @throws OpenXML4JException declared by the {@code .docx} extractor
 */
public static String readDoc(String path) throws IOException, XmlException, OpenXML4JException {
    String lowerPath = path.toLowerCase();
    if (lowerPath.endsWith(".doc")) {
        try (InputStream is = new FileInputStream(new File(path));
             WordExtractor extractor = new WordExtractor(is)) {
            return extractor.getText();
        }
    }
    if (lowerPath.endsWith(".docx")) {
        // Only the extractor is managed here; the original code likewise relied on
        // extractor.close() to release the underlying package.
        try (POIXMLTextExtractor extractor = new XWPFWordExtractor(POIXMLDocument.openPackage(path))) {
            return extractor.getText();
        }
    }
    return "";
}
    
/**
 * Alternative way to read a {@code .doc} file: walks the document range paragraph by
 * paragraph and prints each paragraph's text to standard output.
 *
 * <p>The file path is hard-coded in this sample. The input stream and the document are
 * closed when the method finishes, even if reading fails.
 *
 * @throws Exception if the file cannot be opened or parsed
 */
public void readWords() throws Exception {
    String file = "C:\\\\xx.doc";
    try (FileInputStream in = new FileInputStream(file);
         HWPFDocument document = new HWPFDocument(in)) {
        Range r = document.getRange();
        int paragraphCount = r.numParagraphs();
        for (int i = 0; i < paragraphCount; i++) {
            System.out.println(r.getParagraph(i).text());
        }
    }
}

读取word doc表格内容

/**
 * Prints the contents of every table found in a {@code .doc} file.
 *
 * <p>For each cell one line is printed containing the row index, the column index and
 * the cell text, separated by single spaces. A blank separator is printed after each row.
 * The file path is hard-coded in this sample. The input stream and the document are
 * closed when the method finishes, even if reading fails.
 *
 * @throws Exception if the file cannot be opened or parsed
 */
public void read() throws Exception {
    String file = "C:\\\\xx.doc";
    try (FileInputStream in = new FileInputStream(file);
         HWPFDocument document = new HWPFDocument(in)) {
        Range r = document.getRange();
        TableIterator tableIterator = new TableIterator(r);
        while (tableIterator.hasNext()) {
            Table table = tableIterator.next();
            int numRows = table.numRows();
            for (int j = 0; j < numRows; j++) {
                TableRow row = table.getRow(j);
                for (int k = 0; k < row.numCells(); k++) {
                    TableCell cell = row.getCell(k);
                    // row index + column index + cell text
                    System.out.println(j + " " + k + " " + cell.text());
                }
                // Real newline as row separator; the previous "\\n" literal printed the
                // two characters backslash and 'n' instead of an empty line.
                System.out.println("\n");
            }
        }
    }
}

读取word doc图片

/**
 * Walks a {@code .doc} file paragraph by paragraph and writes out the picture attached
 * to each character run that carries one.
 *
 * <p>All pictures are fetched once and indexed by their start offset; each character run
 * that the pictures table reports as holding a picture is then resolved through that
 * index using the run's picture offset. The total number of pictures is printed first.
 * The file path is hard-coded in this sample.
 *
 * @throws Exception if the file cannot be opened or parsed, or a picture cannot be written
 */
public void readPicture() throws Exception {
    String file = "C:\\\\xx.doc";
    try (FileInputStream in = new FileInputStream(file);
         HWPFDocument document = new HWPFDocument(in)) {
        Range r = document.getRange();
        PicturesTable picturesTable = document.getPicturesTable();
        // Fetched in one go; the map below allows resolving them in paragraph order.
        List<Picture> pictureList = picturesTable.getAllPictures();
        Map<Integer, Picture> lookup = new HashMap<Integer, Picture>();
        for (Picture p : pictureList) {
            lookup.put(p.getStartOffset(), p);
        }
        System.out.println(pictureList.size());
        for (int i = 0; i < r.numParagraphs(); i++) {
            Paragraph paragraph = r.getParagraph(i);
            for (int j = 0; j < paragraph.numCharacterRuns(); j++) {
                org.apache.poi.hwpf.usermodel.CharacterRun run = paragraph.getCharacterRun(j);
                if (!picturesTable.hasPicture(run)) {
                    continue;
                }
                System.out.println("有图片");
                // The map is keyed by Integer offset, so it must be queried with the run's
                // picture offset; querying with the run object itself never matches.
                Picture picture = lookup.get(run.getPicOffset());
                if (picture == null) {
                    // No indexed picture at this offset; skip instead of failing.
                    continue;
                }
                // NOTE(review): "  " is the sample's placeholder output path - replace it
                // with a real, per-picture file name before using this code.
                try (FileOutputStream out = new FileOutputStream("  ")) {
                    picture.writeImageContent(out);
                }
            }
        }
    }
}

读取excel

xlsx 文件使用 XSSFWorkbook 读取;
xls 文件使用 HSSFWorkbook 读取(下面的示例只演示 xlsx,读取 xls 时把对应的 XSSF 类换成 HSSF 类即可)。

/**
 * Reads every cell of every sheet of an {@code .xlsx} workbook into one string.
 *
 * <p>Sheets, rows and cells are visited in index order. For each cell position in a
 * non-missing row, the converted cell text (possibly empty) is appended followed by a
 * newline; missing rows are skipped entirely. The workbook and the underlying stream are
 * closed before returning, including when reading fails.
 *
 * @param file path of the {@code .xlsx} file
 * @return the text of all cells, one cell per line
 * @throws IOException if the file cannot be opened or parsed
 */
public static String readEXCELxlsx(String file) throws IOException {
    StringBuilder content = new StringBuilder();
    // NOTE(review): the workbook is opened from a stream rather than from the path so that
    // closing it cannot write anything back to the source file - confirm against the
    // POI documentation for the version in use.
    try (InputStream in = new FileInputStream(file);
         XSSFWorkbook workbook = new XSSFWorkbook(in)) {
        int sheetCount = workbook.getNumberOfSheets();
        for (int sheetIndex = 0; sheetIndex < sheetCount; sheetIndex++) {
            XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            if (sheet == null) {
                continue;
            }
            // getLastRowNum() is the index of the last row, hence the inclusive bound.
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                XSSFRow row = sheet.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // getLastCellNum() is the last cell index PLUS ONE, hence the exclusive bound.
                for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                    XSSFCell cell = row.getCell(cellIndex);
                    if (cell != null) {
                        content.append(convertCell(cell));
                    }
                    // Real newline; the previous "\\n" literal appended backslash + 'n'.
                    content.append("\n");
                }
            }
        }
    }
    return content.toString();
}

格式化单元格内容

/**
 * Converts a cell to trimmed display text according to its cell type.
 *
 * <ul>
 *   <li>numeric, date-formatted: formatted as {@code yyyy-MM-dd HH:mm:ss}</li>
 *   <li>numeric, other: plain decimal text without grouping, exponent or trailing zeros
 *       (so {@code 12.0} becomes {@code "12"} and {@code 3.14} stays {@code "3.14"})</li>
 *   <li>string / blank: the string cell value</li>
 *   <li>boolean: {@code "true"} or {@code "false"}</li>
 *   <li>error: the numeric error code</li>
 *   <li>any other type: empty string</li>
 * </ul>
 *
 * @param cell the cell to convert; may be {@code null}
 * @return the trimmed text, or an empty string for {@code null} or unhandled cell types
 */
private static String convertCell(Cell cell) {
    if (cell == null) {
        return "";
    }
    String cellValue;
    switch (cell.getCellType()) {
        case NUMERIC:
            // Dates are formatted explicitly so that a value such as 2021/8/9 is not
            // rendered through a locale-dependent number/date format.
            if (org.apache.poi.ss.usermodel.DateUtil.isCellDateFormatted(cell)) {
                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                cellValue = dateFormat.format(cell.getDateCellValue());
            } else {
                double number = cell.getNumericCellValue();
                if (Double.isNaN(number) || Double.isInfinite(number)) {
                    // BigDecimal cannot represent these values.
                    cellValue = String.valueOf(number);
                } else {
                    // Keeps fractional digits; the previous "0" pattern rounded every
                    // value to a whole number.
                    cellValue = java.math.BigDecimal.valueOf(number)
                            .stripTrailingZeros()
                            .toPlainString();
                }
            }
            break;
        case STRING:
        case BLANK:
            cellValue = cell.getStringCellValue();
            break;
        case BOOLEAN:
            cellValue = String.valueOf(cell.getBooleanCellValue());
            break;
        case ERROR:
            cellValue = String.valueOf(cell.getErrorCellValue());
            break;
        default:
            // Remaining types are not converted and yield an empty string.
            cellValue = "";
            break;
    }
    return cellValue.trim();
}

输出到excel

/**
 * Writes a sample list of students to an {@code .xlsx} file containing two sheets.
 *
 * <p>Both sheets have the same header row and one row per student (name, age, date).
 * The first sheet stores the date as text formatted {@code yyyy-MM-dd}; the second sheet
 * passes the {@code Date} object to the cell directly. The output location is hard-coded
 * in this sample. The workbook and the output stream are closed when the method
 * finishes, even if writing fails.
 *
 * @throws Exception if the workbook cannot be written
 */
public void exportToExcel() throws Exception {
    List<Student> studentList = new ArrayList<>();
    studentList.add(new Student().setName("张三").setAge(12).setDate(new Date()));
    studentList.add(new Student().setName("李四").setAge(12).setDate(new Date()));
    studentList.add(new Student().setName("王五").setAge(12).setDate(new Date()));

    String[] headers = {"姓名", "年龄", "时间"};
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

    try (XSSFWorkbook xssfWorkbook = new XSSFWorkbook()) {
        // Sheet 1: date written as formatted text.
        XSSFSheet sheet = xssfWorkbook.createSheet("第一个表单");
        XSSFRow rowTitle = sheet.createRow(0);
        for (int col = 0; col < headers.length; col++) {
            rowTitle.createCell(col).setCellValue(headers[col]);
        }
        for (int i = 0; i < studentList.size(); i++) {
            Student student = studentList.get(i);
            XSSFRow valueRow = sheet.createRow(i + 1);
            valueRow.createCell(0).setCellValue(student.getName());
            valueRow.createCell(1).setCellValue(student.getAge());
            valueRow.createCell(2).setCellValue(dateFormat.format(student.getDate()));
        }

        // Sheet 2: date written as a Date value.
        // NOTE(review): no date cell style is applied here, so the column presumably
        // shows as a raw number in Excel - confirm whether that is intended.
        XSSFSheet sheet2 = xssfWorkbook.createSheet("第二个表单");
        XSSFRow rowTitle2 = sheet2.createRow(0);
        for (int col = 0; col < headers.length; col++) {
            rowTitle2.createCell(col).setCellValue(headers[col]);
        }
        for (int i = 0; i < studentList.size(); i++) {
            Student student = studentList.get(i);
            XSSFRow valueRow = sheet2.createRow(i + 1);
            valueRow.createCell(0).setCellValue(student.getName());
            valueRow.createCell(1).setCellValue(student.getAge());
            valueRow.createCell(2).setCellValue(student.getDate());
        }

        File file = new File("D:\\\\tmp", "student.xlsx");
        try (FileOutputStream out = new FileOutputStream(file)) {
            xssfWorkbook.write(out);
        }
    }
}

office转pdf

https://blog.csdn.net/UnicornRe/article/details/119677482?spm=1001.2014.3001.5501

以上是关于java poi读取pdf word excel文档,读取pdf文字图片的主要内容,如果未能解决你的问题,请参考以下文章

java poi读取pdf word excel文档,读取pdf文字图片

java操作office和pdf文件java读取word,excel和pdf文档内容

java操作excel,pdf,word等文件的方法

用JAVA能把Word和PDF文档的表格内容和格式识别出来吗

java 如何从word中把excel表抽取出来

Java实现word文档在线预览,读取office文件