利用pdfbox和poi抽取pdf、doc以及docx格式的内容

Posted 2020-10-09 石头木

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了利用pdfbox和poi抽取pdf、doc以及docx格式的内容的相关知识，希望对你有一定的参考价值。

使用pdfbox1.5.0抽取pdf格式文档内容，使用poi3.7抽取doc及docx文档内容：

  1 /**
  2  * Created by yan.shi on 2017/9/25.
  3  */
  4 import org.apache.pdfbox.pdfparser.PDFParser;
  5 import org.apache.pdfbox.pdmodel.PDDocument;
  6 import org.apache.pdfbox.util.PDFTextStripper;
  7 
  8 import org.apache.poi.POIXMLDocument;
  9 import org.apache.poi.POIXMLTextExtractor;
 10 import org.apache.poi.hwpf.extractor.WordExtractor;
 11 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 12 import org.apache.poi.openxml4j.opc.OPCPackage;
 13 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 14 import org.apache.xmlbeans.XmlException;
 15 
 16 import java.io.File;
 17 import java.io.FileInputStream;
 18 import java.io.IOException;
 19 
 20 /**
 21  * 这里使用pdfbox解析pdf类型文档
 22  * 使用poi解析doc与docx类型文档
 23  */
 24 public class ExtractText {
 25 
 26     public static void main(String[] args) {
 27         ExtractText text=new ExtractText();
 28         String filePath="文件";
 29         String content=text.getText(filePath);
 30         if(null!=content)
 31             System.out.println("content: "+content);
 32     }
 33 
 34     public ExtractText(){
 35     }
 36     public ExtractText(String filePath){
 37     }
 38 
 39     /**
 40      * 根据不同的文档类型读取，这里只使用pdf、doc、docs类型
 41      * @param filePath
 42      * @return
 43      */
 44     public String getText(String filePath){
 45         File file = new File(filePath);
 46         String fileName=file.getName();
 47         String postfix=fileName.substring(fileName.lastIndexOf(".")+1);
 48         String content=null;
 49         if(postfix.equalsIgnoreCase("pdf")){
 50             content=getPDFText(file);
 51         }else if(postfix.equalsIgnoreCase("doc")){
 52             content=getDocText(file);
 53         }else if(postfix.equalsIgnoreCase("docx")){
 54             content=getDocxText(filePath);
 55         }else {
 56             System.out.println("输入的文件格式不支持！");
 57             return null;
 58         }
 59         if(null!=content && !"".equals(content))
 60             return content;
 61         else
 62             return null;
 63     }
 64 
 65     /**
 66      * 利用pdfbox解析pdf内容
 67      * @param file
 68      * @return
 69      */
 70     private String getPDFText(File file){
 71         FileInputStream fileinput=null;
 72         String text=null;
 73         try {
 74             fileinput=new FileInputStream(file);
 75             PDFParser parser=new PDFParser(fileinput);//pdf解析器
 76             parser.parse();//解析
 77             PDDocument pdfdocument=parser.getPDDocument();//pdf文档
 78             PDFTextStripper stripper=new PDFTextStripper();//文本剥离
 79             //List allPages=pdfdocument.getDocumentCatalog().getAllPages();
 80             text=stripper.getText(pdfdocument);//从pdf文档剥离文本
 81         } catch (IOException e) {
 82             e.printStackTrace();
 83         }finally {
 84             if(fileinput!=null){
 85                 try {
 86                     fileinput.close();
 87                 } catch (IOException e) {
 88                     e.printStackTrace();
 89                 }
 90             }
 91         }
 92         return text;
 93     }
 94 
 95     /**
 96      * 读取doc文档类型
 97      * @param file
 98      * @return
 99      */
100     private String getDocText(File file){
101         FileInputStream fileinput=null;
102         String text=null;
103 
104         try {
105             fileinput=new FileInputStream(file);
106             WordExtractor we=new WordExtractor(fileinput);
107             //text=we.getText();
108             String s[]=we.getParagraphText();
109             for(String str:s){
110                 str=str.trim();
111                 if(str.equals("") || str==null)
112                     continue;
113                 //System.out.println(str);
114             }
115         } catch (IOException e) {
116             e.printStackTrace();
117         }finally {
118             if(fileinput!=null){
119                 try {
120                     fileinput.close();
121                 } catch (IOException e) {
122                     e.printStackTrace();
123                 }
124             }
125         }
126         return text;
127     }
128 
129     /**
130      * 读取docx文档类型
131      * @param file
132      * @return
133      */
134     private String getDocxText(String file){
135         String text=null;
136         try {
137             OPCPackage opcPackage=POIXMLDocument.openPackage(file);
138             POIXMLTextExtractor extractor=new XWPFWordExtractor(opcPackage);
139             text=extractor.getText();
140            //InputStream is=new FileInputStream(file);
141             //XWPFWordExtractor doc=new XWPFWordExtractor(OPCPackage.open(is));
142             //List<XWPFParagraph> paras=doc.get
143             //System.out.println(text);
144         } catch (IOException e) {
145             e.printStackTrace();
146         } catch (XmlException e) {
147             e.printStackTrace();
148         } catch (OpenXML4JException e) {
149             e.printStackTrace();
150         }
151         return text;
152     }
153 
154 }

以上是关于利用pdfbox和poi抽取pdf、doc以及docx格式的内容的主要内容，如果未能解决你的问题，请参考以下文章

用java读取多种文件格式的文件（pdf,pptx,ppt,doc,docx..)

基于PDFBox的PDF文字坐标抽取API文档

利用POI操作不同版本号word文档中的图片以及创建word文档

Apache的poi到底是一个怎样的东西？

基于pdfbox实现的pdf添加文字水印工具

doc以及docx文档转html文件（同时解析图片音频和视频）