Java 使用 POI 读取 Word、Excel 文档,使用 PDFBox 读取 PDF 文字和图片
Posted 宇宙磅礴而冷漠
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了java poi读取pdf word excel文档,读取pdf文字图片相关的知识,希望对你有一定的参考价值。
依赖
<!-- Apache POI modules, all pinned to the same version (4.1.0). -->
<!-- NOTE(review): poi is presumably the core module backing the HSSF/.xls support mentioned below; confirm. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.0</version>
</dependency>
<!-- NOTE(review): poi-ooxml presumably supplies the XSSF/XWPF classes used by the .xlsx/.docx samples; confirm. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.0</version>
</dependency>
<!-- NOTE(review): likely already pulled in transitively by poi-ooxml; verify before removing. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>4.1.0</version>
</dependency>
<!-- NOTE(review): poi-scratchpad presumably supplies the HWPF classes used by the .doc samples; confirm. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.0</version>
</dependency>
<!-- PDF support: Apache PDFBox -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.4</version>
</dependency>
读取pdf文本和图片
/**
 * Extracts the text of every page of a PDF file.
 *
 * <p>Pages are extracted one at a time (page numbers passed to the stripper start at 1)
 * with position sorting enabled, and the per-page text is concatenated in page order.
 *
 * @param file path of the PDF file to read
 * @return the concatenated text of all pages; may be empty when the PDF contains no text
 * @throws IOException if the file cannot be opened, parsed, or read
 */
public String readPDF(String file) throws IOException {
    StringBuilder result = new StringBuilder();
    // try-with-resources closes the input stream and the parsed document even when
    // parsing or text extraction throws.
    try (FileInputStream is = new FileInputStream(file)) {
        PDFParser parser = new PDFParser(new RandomAccessBuffer(is));
        parser.parse();
        try (PDDocument doc = parser.getPDDocument()) {
            PDFTextStripper textStripper = new PDFTextStripper();
            textStripper.setSortByPosition(true); // read lines in positional order
            int pageCount = doc.getNumberOfPages();
            for (int i = 1; i <= pageCount; i++) {
                textStripper.setStartPage(i);
                textStripper.setEndPage(i);
                result.append(textStripper.getText(doc));
            }
            // Sketch (disabled): when no text was found, pull the images out of each page
            // so they can be passed to an OCR service.
            // if (result.toString().trim().length() == 0) {
            //     for (int i = 1; i <= doc.getNumberOfPages(); i++) {
            //         PDPage page = doc.getPage(i - 1);
            //         PDResources resources = page.getResources();
            //         Iterable<COSName> xobjects = resources.getXObjectNames();
            //         if (xobjects != null) {
            //             Iterator<COSName> imageIter = xobjects.iterator();
            //             while (imageIter.hasNext()) {
            //                 COSName cosName = imageIter.next();
            //                 boolean isImageXObject = resources.isImageXObject(cosName);
            //                 if (isImageXObject) {
            //                     // fetch the image resource of the current page
            //                     PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName);
            //                     File outputfile = new File("D:\\\\tmp\\\\" + cosName.getName() + ".jpg");
            //                     ImageIO.write(ixt.getImage(), "jpg", outputfile); // optionally save the image locally
            //                     // call the image text-recognition API here
            //                     // ...
            //                 }
            //             }
            //         }
            //     }
            // }
        }
    }
    return result.toString();
}
简单读取word docx/doc文字
/**
 * Reads the plain text of a Word document, choosing the extractor by file extension.
 *
 * <p>Paths ending in {@code .doc} are read with {@code WordExtractor}; paths ending in
 * {@code .docx} are read with {@code XWPFWordExtractor}. The extension check is
 * case-insensitive. Any other extension yields an empty string.
 *
 * @param path path of the Word file
 * @return the extracted text, or an empty string for an unsupported extension
 * @throws IOException if the file cannot be read
 * @throws XmlException if the .docx content cannot be parsed
 * @throws OpenXML4JException if the .docx package cannot be processed
 */
public static String readDoc(String path) throws IOException, XmlException, OpenXML4JException {
    String lowerPath = path.toLowerCase();
    if (lowerPath.endsWith(".doc")) {
        // Both the stream and the extractor are released even if extraction fails.
        try (InputStream is = new FileInputStream(new File(path));
             WordExtractor extractor = new WordExtractor(is)) {
            return extractor.getText();
        }
    }
    if (lowerPath.endsWith(".docx")) {
        OPCPackage opcPackage = POIXMLDocument.openPackage(path);
        POIXMLTextExtractor docxExtractor;
        try {
            docxExtractor = new XWPFWordExtractor(opcPackage);
        } catch (IOException | XmlException | OpenXML4JException | RuntimeException e) {
            // No extractor owns the package yet, so release it here without saving.
            opcPackage.revert();
            throw e;
        }
        try (POIXMLTextExtractor extractor = docxExtractor) {
            return extractor.getText();
        }
    }
    return "";
}
//针对 .doc 文件的另一种写法:通过 HWPFDocument 按段落逐段读取
/**
 * Prints every paragraph of a .doc file to standard output, one paragraph per line.
 *
 * <p>The file path is hard-coded in this sample.
 *
 * @throws Exception if the file cannot be opened or parsed
 */
public void readWords() throws Exception {
    String file = "C:\\\\xx.doc";
    // try-with-resources releases the stream and the document even on failure.
    try (FileInputStream in = new FileInputStream(file);
         HWPFDocument document = new HWPFDocument(in)) {
        Range r = document.getRange();
        int paragraphCount = r.numParagraphs();
        for (int i = 0; i < paragraphCount; i++) {
            Paragraph paragraph = r.getParagraph(i);
            System.out.println(paragraph.text());
        }
    }
}
读取word doc表格内容
/**
 * Prints the content of every table cell in a .doc file to standard output.
 *
 * <p>Each cell is printed as "rowIndex columnIndex text"; a blank separator is printed
 * after each row. The file path is hard-coded in this sample.
 *
 * @throws Exception if the file cannot be opened or parsed
 */
public void read() throws Exception {
    String file = "C:\\\\xx.doc";
    // try-with-resources releases the stream and the document even on failure.
    try (FileInputStream in = new FileInputStream(file);
         HWPFDocument document = new HWPFDocument(in)) {
        Range r = document.getRange();
        TableIterator tableIterator = new TableIterator(r);
        while (tableIterator.hasNext()) {
            Table table = tableIterator.next();
            int numRows = table.numRows();
            for (int j = 0; j < numRows; j++) {
                TableRow row = table.getRow(j);
                int numCells = row.numCells();
                for (int k = 0; k < numCells; k++) {
                    TableCell cell = row.getCell(k);
                    System.out.println(j + " " + k + " " + cell.text()); // row index + column index + text
                }
                // A real newline; the previous literal printed the two characters '\' and 'n'.
                System.out.println("\n");
            }
        }
    }
}
读取word doc图片
/**
 * Extracts the pictures embedded in a .doc file, in the order their character runs
 * appear in the document, and writes each one to its own file.
 *
 * <p>All pictures are fetched once and indexed by their start offset; each character run
 * that carries a picture is then matched through the run's picture offset. Each picture is
 * written to the file name suggested by the picture itself, resolved against the current
 * working directory. The input path is hard-coded in this sample.
 *
 * @throws Exception if the document cannot be read or a picture cannot be written
 */
public void readPicture() throws Exception {
    String file = "C:\\\\xx.doc";
    // try-with-resources releases the stream and the document even on failure.
    try (FileInputStream in = new FileInputStream(file);
         HWPFDocument document = new HWPFDocument(in)) {
        Range r = document.getRange();
        PicturesTable picturesTable = document.getPicturesTable();
        // Fetch everything once, then index by offset so pictures can be resolved per run.
        List<Picture> pictureList = picturesTable.getAllPictures();
        Map<Integer, Picture> lookup = new HashMap<Integer, Picture>();
        for (Picture p : pictureList) {
            lookup.put(p.getStartOffset(), p);
        }
        System.out.println(pictureList.size());
        for (int i = 0; i < r.numParagraphs(); i++) {
            Paragraph paragraph = r.getParagraph(i);
            for (int j = 0; j < paragraph.numCharacterRuns(); j++) {
                org.apache.poi.hwpf.usermodel.CharacterRun run = paragraph.getCharacterRun(j);
                if (!picturesTable.hasPicture(run)) {
                    continue;
                }
                System.out.println("有图片");
                // The map is keyed by picture start offset, so it must be queried with the
                // run's picture offset; querying with the run object itself never matches.
                Picture picture = lookup.get(run.getPicOffset());
                if (picture == null) {
                    continue;
                }
                try (FileOutputStream out = new FileOutputStream(picture.suggestFullFileName())) {
                    picture.writeImageContent(out);
                }
            }
        }
    }
}
读取excel
.xlsx 文件使用 XSSFWorkbook 读取
.xls 文件使用 HSSFWorkbook 读取
/**
 * Reads all cell values of an .xlsx workbook into a single string.
 *
 * <p>Sheets, rows and columns are visited in index order. For every column index up to the
 * last cell of a row, the formatted cell value (if the cell exists) is appended followed by
 * a newline character; missing rows are skipped.
 *
 * <p>The workbook is loaded from an input stream rather than by path. NOTE(review): POI
 * documents that closing a package opened read-write from a file saves it back, which a
 * read-only extraction should avoid; confirm against the POI version in use.
 *
 * @param file path of the .xlsx file
 * @return the concatenated cell values, one per line
 * @throws IOException if the file cannot be opened or parsed
 */
public static String readEXCELxlsx(String file) throws IOException {
    StringBuilder content = new StringBuilder();
    try (FileInputStream in = new FileInputStream(file);
         XSSFWorkbook workbook = new XSSFWorkbook(in)) {
        int sheetCount = workbook.getNumberOfSheets();
        for (int sheetIndex = 0; sheetIndex < sheetCount; sheetIndex++) {
            XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            if (sheet == null) {
                continue;
            }
            // getLastRowNum() is the index of the last row, hence the inclusive bound.
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                XSSFRow row = sheet.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // getLastCellNum() is the last cell index PLUS ONE, hence the exclusive bound.
                int cellCount = row.getLastCellNum();
                for (int cellIndex = 0; cellIndex < cellCount; cellIndex++) {
                    XSSFCell cell = row.getCell(cellIndex);
                    if (cell != null) {
                        // Appending an empty string is a no-op, so no length check is needed.
                        content.append(convertCell(cell));
                    }
                    // A real newline; the previous literal appended the characters '\' and 'n'.
                    content.append("\n");
                }
            }
        }
    }
    return content.toString();
}
格式化单元格内容
/**
 * Converts a cell to its trimmed text representation.
 *
 * <ul>
 *   <li>Numeric cells formatted as dates are rendered as {@code yyyy-MM-dd HH:mm:ss}.</li>
 *   <li>Other numeric cells are rendered without grouping or scientific notation; whole
 *       numbers print with no decimal point and fractions keep up to ten decimal digits.</li>
 *   <li>String, boolean and error cells are rendered from their respective values.</li>
 *   <li>A {@code null} cell, a blank cell, or any other cell type yields an empty string.</li>
 * </ul>
 *
 * @param cell the cell to convert; may be {@code null}
 * @return the trimmed text of the cell, never {@code null}
 */
private static String convertCell(Cell cell) {
    if (cell == null) {
        return "";
    }
    String cellValue;
    switch (cell.getCellType()) {
        case NUMERIC:
            // Date cells are stored as numbers; format them explicitly so a date such as
            // 2021/8/9 is not rendered through a locale-dependent pattern.
            if (org.apache.poi.ss.usermodel.DateUtil.isCellDateFormatted(cell)) {
                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                cellValue = dateFormat.format(cell.getDateCellValue());
            } else {
                // Optional fraction digits keep values like 1.5 intact; a bare "0" pattern
                // rounded every number to an integer.
                DecimalFormat numberFormat = new DecimalFormat("0.##########");
                cellValue = numberFormat.format(cell.getNumericCellValue());
            }
            break;
        case STRING:
            cellValue = cell.getStringCellValue();
            break;
        case BOOLEAN:
            cellValue = String.valueOf(cell.getBooleanCellValue());
            break;
        case ERROR:
            cellValue = String.valueOf(cell.getErrorCellValue());
            break;
        default:
            // Blank cells and cell types without a dedicated branch produce no text.
            cellValue = "";
            break;
    }
    return cellValue.trim();
}
输出到excel
/**
 * Writes a sample list of students to an .xlsx file containing two sheets.
 *
 * <p>Both sheets share the same header row. The first sheet stores the date column as
 * pre-formatted text ({@code yyyy-MM-dd}); the second stores real date values and applies
 * a date cell style to them. The output location is hard-coded in this sample.
 *
 * @throws Exception if the workbook cannot be written
 */
public void exportToExcel() throws Exception {
    List<Student> studentList = new ArrayList<>();
    studentList.add(new Student().setName("张三").setAge(12).setDate(new Date()));
    studentList.add(new Student().setName("李四").setAge(12).setDate(new Date()));
    studentList.add(new Student().setName("王五").setAge(12).setDate(new Date()));

    File file = new File("D:\\\\tmp", "student.xlsx");
    // The workbook and the output stream are both closed even if writing fails.
    try (XSSFWorkbook xssfWorkbook = new XSSFWorkbook()) {
        // Sheet 1: dates written as formatted strings.
        XSSFSheet sheet = xssfWorkbook.createSheet("第一个表单");
        writeStudentHeader(sheet);
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        for (int i = 0; i < studentList.size(); i++) {
            Student student = studentList.get(i);
            XSSFRow valueRow = sheet.createRow(i + 1);
            valueRow.createCell(0).setCellValue(student.getName());
            valueRow.createCell(1).setCellValue(student.getAge());
            valueRow.createCell(2).setCellValue(dayFormat.format(student.getDate()));
        }

        // Sheet 2: dates written as date values. A date value is stored as a number, so
        // the cell needs a date format to be displayed as a date.
        XSSFSheet sheet2 = xssfWorkbook.createSheet("第二个表单");
        writeStudentHeader(sheet2);
        org.apache.poi.ss.usermodel.CellStyle dateStyle = xssfWorkbook.createCellStyle();
        dateStyle.setDataFormat(
                xssfWorkbook.getCreationHelper().createDataFormat().getFormat("yyyy-mm-dd"));
        for (int i = 0; i < studentList.size(); i++) {
            Student student = studentList.get(i);
            XSSFRow valueRow = sheet2.createRow(i + 1);
            valueRow.createCell(0).setCellValue(student.getName());
            valueRow.createCell(1).setCellValue(student.getAge());
            XSSFCell dateCell = valueRow.createCell(2);
            dateCell.setCellValue(student.getDate());
            dateCell.setCellStyle(dateStyle);
        }

        // Open the target only once the workbook is fully built.
        try (FileOutputStream out = new FileOutputStream(file)) {
            xssfWorkbook.write(out);
        }
    }
}

/** Creates row 0 of the given sheet with the name / age / time column titles. */
private static void writeStudentHeader(XSSFSheet sheet) {
    XSSFRow rowTitle = sheet.createRow(0);
    rowTitle.createCell(0).setCellValue("姓名");
    rowTitle.createCell(1).setCellValue("年龄");
    rowTitle.createCell(2).setCellValue("时间");
}
office转pdf
https://blog.csdn.net/UnicornRe/article/details/119677482?spm=1001.2014.3001.5501
以上是关于java poi读取pdf word excel文档,读取pdf文字图片的主要内容,如果未能解决你的问题,请参考以下文章
java poi读取pdf word excel文档,读取pdf文字图片
java操作office和pdf文件java读取word,excel和pdf文档内容