pdfBox 读取pdf文件
Posted yinz163diudiu
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了pdfBox 读取pdf文件相关的知识,希望对你有一定的参考价值。
1、引入maven依赖
<!-- Apache PDFBox: provides the PDDocument and PDFTextStripper classes used by PdfParser. -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.4</version>
</dependency>
2、相关工具类:PdfParser.java
package com.insurance.tool;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripper;

import com.insurance.pojo.Insurance;
import com.insurance.pojo.InsuranceOrder;
import com.insurance.pojo.InsuranceProgram;

/**
 * Extracts insurance policy data from the text layer of a PDF.
 *
 * <p>Each page is converted to text with {@link PDFTextStripper} (sorted by position) and
 * then scanned with regular expressions. Only pages whose text starts with the policy
 * title are turned into an {@link InsuranceOrder}; every other page is ignored.
 */
public class PdfParser {

    /** Title that the text of a policy page must start with. */
    private static final String POLICY_PAGE_TITLE = "个 人 人 身 保 险 保 险 单";

    // Whitespace is written as \\s (regex \s). The line-oriented patterns accept both
    // "\r\n" and "\n" because the stripper's line separator is not fixed to "\r\n".
    private static final Pattern INSURANCE_POLICE_NO_P = Pattern.compile("保险单号\\s(.*?)\\s");
    private static final Pattern INSURANCE_APPLICATION_NO_P = Pattern.compile("投保单号\\s(.*?)\\s");
    private static final Pattern POLICE_HOLDER_P = Pattern.compile("投 保 人.*\\r?\\n");
    private static final Pattern INSURED_P = Pattern.compile("被保险人.*\\r?\\n");
    private static final Pattern INSURED_AGE_P = Pattern.compile("被保险人投保年龄\\s(.*?)(\\r\\n|\\s)");
    private static final Pattern BENEFICIARY_P = Pattern.compile("身故受益人及分配方式\\s(.*?)(\\r\\n|\\s)");
    private static final Pattern INSURANCE_NAME_P = Pattern.compile("险种名称及款式\\s(.*?)(\\r\\n|\\s)");
    private static final Pattern VALID_PERIOD_P =
            Pattern.compile("保险期间\\s(.*?)\\s合同生效日", Pattern.DOTALL);
    private static final Pattern EFFECTIVE_DATE_P = Pattern.compile("合同生效日\\s(.*?)(\\r\\n|\\s)");
    private static final Pattern CHARGE_WAY_P = Pattern.compile("交费方式\\s(.*?)\\s");
    private static final Pattern FEE_P = Pattern.compile("保 险 费\\s(.*?)(\\r\\n|\\s)");
    private static final Pattern POLICE_HOLDER_COUNT_P = Pattern.compile("投保份数\\s(.*?)(\\r\\n|\\s)");
    private static final Pattern PROGRAM_LIST_P =
            Pattern.compile("保险金额(.*?)保险责任与责任免除详见条款", Pattern.DOTALL);

    // Sub-field patterns applied to a single "policy holder" / "insured" line.
    private static final Pattern POLICE_HOLDER_NAME_P = Pattern.compile("投 保 人(.*?)性别");
    private static final Pattern INSURED_NAME_P = Pattern.compile("被保险人(.*?)性别");
    private static final Pattern GENDER_P = Pattern.compile("性别(.*?)出生日期");
    private static final Pattern BIRTHDAY_P = Pattern.compile("出生日期(.*?)证件号码");
    private static final Pattern ID_P = Pattern.compile("证件号码(.*)$");

    public static void main(String[] args) {
        readPDF("C:\\\\Users\\\\yinz\\\\Desktop\\\\场景1\\\\场景1_样例_电子保单识别.pdf");
    }

    /**
     * Parses the PDF supplied as a stream, prints the result to standard output and returns it.
     *
     * @param stream PDF content
     * @return the orders found, one per recognised policy page; empty if none were recognised
     * @throws Exception if the document cannot be loaded or its text cannot be extracted
     */
    public static List<InsuranceOrder> readPDF(InputStream stream) throws Exception {
        List<InsuranceOrder> orderList = extractOrders(PDDocument.load(stream));
        System.out.println(orderList);
        return orderList;
    }

    /**
     * Parses the PDF at {@code filePath} and prints the orders found to standard output.
     * Any failure is reported on standard output instead of being propagated.
     *
     * @param filePath path of the PDF file
     */
    public static void readPDF(String filePath) {
        try {
            List<InsuranceOrder> orderList = extractOrders(PDDocument.load(new File(filePath)));
            System.out.println(orderList);
        } catch (Exception e) {
            System.out.println(e);
        }
    }

    /**
     * Extracts the text of every page of {@code document}, feeds each page to
     * {@link #parseContent}, and always closes the document afterwards.
     */
    private static List<InsuranceOrder> extractOrders(PDDocument document) throws IOException {
        List<InsuranceOrder> orderList = new ArrayList<InsuranceOrder>();
        try {
            int pages = document.getNumberOfPages();
            PDFTextStripper stripper = new PDFTextStripper();
            // Emit text in positional (reading) order.
            stripper.setSortByPosition(true);
            // Pages are processed one at a time because parseContent works per page.
            for (int page = 1; page <= pages; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                parseContent(stripper.getText(document), orderList);
            }
        } finally {
            document.close();
        }
        return orderList;
    }

    /**
     * Parses the text of one page and, if it is a policy page with a policy number,
     * appends a populated {@link InsuranceOrder} to {@code list}.
     */
    private static void parseContent(String content, List<InsuranceOrder> list) {
        if (content == null || content.trim().length() == 0) {
            return;
        }
        if (!content.startsWith(POLICY_PAGE_TITLE)) {
            return;
        }

        // Personal information.
        String insurancePoliceNo = retriveText(content, INSURANCE_POLICE_NO_P, 1);
        if (insurancePoliceNo.length() == 0) {
            // Without a policy number the page is not recorded.
            return;
        }
        InsuranceOrder order = new InsuranceOrder();
        list.add(order);
        order.setInsurancePoliceNo(insurancePoliceNo);
        order.setInsuranceApplicationNo(retriveText(content, INSURANCE_APPLICATION_NO_P, 1));

        // The holder line keeps its inner blanks so the sub-field patterns can match it.
        String policeHolderInfo = retriveTextWithInnnerBlank(content, POLICE_HOLDER_P, 0);
        order.setPoliceHolderName(retriveText(policeHolderInfo, POLICE_HOLDER_NAME_P, 1));
        order.setPoliceHolderGender(retriveText(policeHolderInfo, GENDER_P, 1));
        order.setPoliceHolderBirthday(retriveText(policeHolderInfo, BIRTHDAY_P, 1));
        order.setPoliceHolderID(retriveText(policeHolderInfo, ID_P, 1));

        String insuredInfo = retriveTextWithInnnerBlank(content, INSURED_P, 0);
        order.setInsuredName(retriveText(insuredInfo, INSURED_NAME_P, 1));
        order.setInsuredGender(retriveText(insuredInfo, GENDER_P, 1));
        order.setInsuredBirthday(retriveText(insuredInfo, BIRTHDAY_P, 1));
        order.setInsuredID(retriveText(insuredInfo, ID_P, 1));

        order.setInsuredAge(retriveText(content, INSURED_AGE_P, 1));
        order.setBeneficiary(retriveText(content, BENEFICIARY_P, 1));

        // Insurance product information.
        Insurance insurance = new Insurance();
        order.setInsurance(insurance);
        insurance.setName(retriveText(content, INSURANCE_NAME_P, 1));
        insurance.setValidPeriod(retriveText(content, VALID_PERIOD_P, 1).replaceAll("\\r?\\n", ""));
        insurance.setEffectiveDate(retriveText(content, EFFECTIVE_DATE_P, 1));
        insurance.setChargeWay(retriveText(content, CHARGE_WAY_P, 1));
        insurance.setFee(retriveText(content, FEE_P, 1));
        insurance.setPoliceHolderCount(retriveText(content, POLICE_HOLDER_COUNT_P, 1));

        // Insurance program rows: one "name amount" pair per line.
        String programList = retriveTextWithInnnerBlank(content, PROGRAM_LIST_P, 1);
        for (String line : programList.split("\\r?\\n")) {
            String row = line.trim();
            if (row.length() == 0) {
                continue;
            }
            String[] columns = row.split("\\s+");
            InsuranceProgram program = new InsuranceProgram();
            order.getProgramList().add(program);
            program.setName(columns[0]);
            // A row without a second column keeps its name and leaves the fee unset
            // instead of failing with ArrayIndexOutOfBoundsException.
            if (columns.length > 1) {
                program.setFee(columns[1]);
            }
        }
    }

    /**
     * Returns group {@code position} of the first match of {@code p} in {@code content},
     * trimmed and with all space characters removed, or {@code ""} when there is no match.
     */
    private static String retriveText(String content, Pattern p, int position) {
        Matcher m = p.matcher(content);
        if (m.find()) {
            return m.group(position).trim().replace(" ", "");
        }
        return "";
    }

    /**
     * Same as {@link #retriveText} but only trims the ends, keeping blanks inside the match.
     */
    private static String retriveTextWithInnnerBlank(String content, Pattern p, int position) {
        Matcher m = p.matcher(content);
        if (m.find()) {
            return m.group(position).trim();
        }
        return "";
    }
}
相关实体类:InsuranceOrder.java
package com.insurance.pojo;

import java.util.ArrayList;
import java.util.List;

/**
 * One parsed insurance policy: policy identifiers, policy holder, insured person,
 * the insurance product and its list of programs. All scalar values are kept as strings.
 */
public class InsuranceOrder {

    /** Policy number. */
    private String insurancePoliceNo;
    /** Application number. */
    private String insuranceApplicationNo;

    /** Policy holder name. */
    private String policeHolderName;
    /** Policy holder date of birth. */
    private String policeHolderBirthday;
    /** Policy holder gender. */
    private String policeHolderGender;
    /** Policy holder identity document number. */
    private String policeHolderID;

    /** Insured person name. */
    private String insuredName;
    /** Insured person gender. */
    private String insuredGender;
    /** Insured person date of birth. */
    private String insuredBirthday;
    /** Insured person identity document number. */
    private String insuredID;
    /** Insured person age at application. */
    private String insuredAge;

    /** Death beneficiary and allocation method. */
    private String beneficiary;

    /** Insurance product. */
    private Insurance insurance;
    /** Insurance programs; starts as an empty list. */
    private List<InsuranceProgram> programList = new ArrayList<InsuranceProgram>();

    public String getInsurancePoliceNo() {
        return this.insurancePoliceNo;
    }

    public void setInsurancePoliceNo(String insurancePoliceNo) {
        this.insurancePoliceNo = insurancePoliceNo;
    }

    public String getInsuranceApplicationNo() {
        return this.insuranceApplicationNo;
    }

    public void setInsuranceApplicationNo(String insuranceApplicationNo) {
        this.insuranceApplicationNo = insuranceApplicationNo;
    }

    public String getPoliceHolderName() {
        return this.policeHolderName;
    }

    public void setPoliceHolderName(String policeHolderName) {
        this.policeHolderName = policeHolderName;
    }

    public String getPoliceHolderBirthday() {
        return this.policeHolderBirthday;
    }

    public void setPoliceHolderBirthday(String policeHolderBirthday) {
        this.policeHolderBirthday = policeHolderBirthday;
    }

    public String getPoliceHolderGender() {
        return this.policeHolderGender;
    }

    public void setPoliceHolderGender(String policeHolderGender) {
        this.policeHolderGender = policeHolderGender;
    }

    public String getPoliceHolderID() {
        return this.policeHolderID;
    }

    public void setPoliceHolderID(String policeHolderID) {
        this.policeHolderID = policeHolderID;
    }

    public String getInsuredName() {
        return this.insuredName;
    }

    public void setInsuredName(String insuredName) {
        this.insuredName = insuredName;
    }

    public String getInsuredGender() {
        return this.insuredGender;
    }

    public void setInsuredGender(String insuredGender) {
        this.insuredGender = insuredGender;
    }

    public String getInsuredBirthday() {
        return this.insuredBirthday;
    }

    public void setInsuredBirthday(String insuredBirthday) {
        this.insuredBirthday = insuredBirthday;
    }

    public String getInsuredID() {
        return this.insuredID;
    }

    public void setInsuredID(String insuredID) {
        this.insuredID = insuredID;
    }

    public String getInsuredAge() {
        return this.insuredAge;
    }

    public void setInsuredAge(String insuredAge) {
        this.insuredAge = insuredAge;
    }

    public String getBeneficiary() {
        return this.beneficiary;
    }

    public void setBeneficiary(String beneficiary) {
        this.beneficiary = beneficiary;
    }

    public Insurance getInsurance() {
        return this.insurance;
    }

    public void setInsurance(Insurance insurance) {
        this.insurance = insurance;
    }

    public List<InsuranceProgram> getProgramList() {
        return this.programList;
    }

    public void setProgramList(List<InsuranceProgram> programList) {
        this.programList = programList;
    }

    /** Renders every field as {@code name=value} inside {@code InsuranceOrder [...]}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("InsuranceOrder [insurancePoliceNo=").append(insurancePoliceNo);
        text.append(", insuranceApplicationNo=").append(insuranceApplicationNo);
        text.append(", policeHolderName=").append(policeHolderName);
        text.append(", policeHolderBirthday=").append(policeHolderBirthday);
        text.append(", policeHolderGender=").append(policeHolderGender);
        text.append(", policeHolderID=").append(policeHolderID);
        text.append(", insuredName=").append(insuredName);
        text.append(", insuredGender=").append(insuredGender);
        text.append(", insuredBirthday=").append(insuredBirthday);
        text.append(", insuredID=").append(insuredID);
        text.append(", insuredAge=").append(insuredAge);
        text.append(", beneficiary=").append(beneficiary);
        text.append(", insurance=").append(insurance);
        text.append(", programList=").append(programList);
        text.append("]");
        return text.toString();
    }
}
InsuranceProgram.java(注:PdfParser 中引用的 Insurance 实体类原文未给出)
package com.insurance.pojo; /** * 保险项目 * @author yinz * */ public class InsuranceProgram { private String name; //项目名称 private String fee; //金额 public String getName() { return name; } public void setName(String name) { this.name = name; } public String getFee() { return fee; } public void setFee(String fee) { this.fee = fee; } @Override public String toString() { return "InsuranceProgram [name=" + name + ", fee=" + fee + "]"; } }
此处用于读取的pdf文件:http://files.cnblogs.com/files/yinz/场景1_样例_电子保单识别.rar
以上是关于pdfBox 读取pdf文件的主要内容,如果未能解决你的问题,请参考以下文章