pdfBox 读取pdf文件

Posted yinz163diudiu

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了pdfBox 读取pdf文件相关的知识,希望对你有一定的参考价值。

1、引入maven依赖

        <dependency>
          <groupId>org.apache.pdfbox</groupId>
          <artifactId>pdfbox</artifactId>
          <version>2.0.4</version>
        </dependency>

 

2、相关工具类:PdfParser.java

package com.insurance.tool;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripper;

import com.insurance.pojo.Insurance;
import com.insurance.pojo.InsuranceOrder;
import com.insurance.pojo.InsuranceProgram;


/**
 * Extracts insurance policy data from the text layer of a PDF using PDFBox.
 *
 * <p>Every page is converted to text on its own and matched against regular
 * expressions keyed on the Chinese field labels printed on the policy. Only
 * pages whose text starts with the policy title are parsed; each such page
 * yields at most one {@link InsuranceOrder}.
 */
public class PdfParser {

    /** Title a page's text must start with to be treated as a policy page. */
    private static final String POLICY_PAGE_TITLE = "个 人 人 身 保 险 保 险 单";

    /**
     * Line break as a regex fragment. Accepts both CRLF and bare LF so parsing
     * does not depend on which line separator the extracted text uses
     * (NOTE(review): PDFTextStripper appears to default to the platform
     * separator - confirm against the PDFBox version in use).
     */
    private static final String LINE_BREAK = "\\r?\\n";

    // Field patterns applied to the text of a whole page. "\\s" is the regex
    // whitespace class; the label text itself is matched literally.
    private static final Pattern insurancePoliceNoP = Pattern.compile("保险单号\\s(.*?)\\s");
    private static final Pattern insuranceApplicationNoP = Pattern.compile("投保单号\\s(.*?)\\s");
    private static final Pattern policeHolderP = Pattern.compile("投 保 人.*" + LINE_BREAK);
    private static final Pattern insuredP = Pattern.compile("被保险人.*" + LINE_BREAK);
    private static final Pattern insuredAgeP = Pattern.compile("被保险人投保年龄\\s(.*?)(\\r\\n|\\s)");
    private static final Pattern beneficiaryP = Pattern.compile("身故受益人及分配方式\\s(.*?)(\\r\\n|\\s)");
    private static final Pattern insuranceNameP = Pattern.compile("险种名称及款式\\s(.*?)(\\r\\n|\\s)");
    private static final Pattern validPeriodP = Pattern.compile("保险期间\\s(.*?)\\s合同生效日", Pattern.DOTALL);
    private static final Pattern effectiveDateP = Pattern.compile("合同生效日\\s(.*?)(\\r\\n|\\s)");
    private static final Pattern chargeWayP = Pattern.compile("交费方式\\s(.*?)\\s");
    private static final Pattern feeP = Pattern.compile("保 险 费\\s(.*?)(\\r\\n|\\s)");
    private static final Pattern policeHolderCount = Pattern.compile("投保份数\\s(.*?)(\\r\\n|\\s)");
    private static final Pattern programListP = Pattern.compile("保险金额(.*?)保险责任与责任免除详见条款", Pattern.DOTALL);

    // Sub-field patterns applied to a single "person" line (policy holder or
    // insured). Compiled once instead of on every parsed page.
    private static final Pattern policeHolderNameP = Pattern.compile("投 保 人(.*?)性别");
    private static final Pattern insuredNameP = Pattern.compile("被保险人(.*?)性别");
    private static final Pattern personGenderP = Pattern.compile("性别(.*?)出生日期");
    private static final Pattern personBirthdayP = Pattern.compile("出生日期(.*?)证件号码");
    private static final Pattern personIdP = Pattern.compile("证件号码(.*)$");

    /**
     * Demo entry point: parses a sample policy from a hard-coded local path
     * and prints the result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        readPDF("C:\\\\Users\\\\yinz\\\\Desktop\\\\场景1\\\\场景1_样例_电子保单识别.pdf");
    }

    /**
     * Parses every page of the PDF read from {@code stream} and returns the
     * orders found. The result is also printed to standard output.
     *
     * <p>The loaded document is always closed before returning. This method
     * does not explicitly close {@code stream}.
     *
     * @param stream source of the PDF bytes
     * @return the parsed orders, possibly empty, never {@code null}
     * @throws Exception if the PDF cannot be loaded or its text cannot be extracted
     */
    public static List<InsuranceOrder> readPDF(InputStream stream) throws Exception {
        PDDocument document = PDDocument.load(stream);
        try {
            List<InsuranceOrder> orderList = extractOrders(document);
            System.out.println(orderList);
            return orderList;
        } finally {
            // Release the buffers/handles held by the document even on failure.
            document.close();
        }
    }

    /**
     * Parses every page of the PDF at {@code filePath} and prints the orders
     * found to standard output. Best-effort: any failure is printed instead of
     * being propagated.
     *
     * @param filePath path of the PDF file to read
     */
    public static void readPDF(String filePath) {
        try {
            PDDocument document = PDDocument.load(new File(filePath));
            try {
                System.out.println(extractOrders(document));
            } finally {
                // Release the file handle held by the document even on failure.
                document.close();
            }
        } catch (Exception e) {
            System.out.println(e);
        }
    }

    /**
     * Extracts the text of each page in turn and collects the orders parsed
     * from it. Shared by both {@code readPDF} overloads.
     */
    private static List<InsuranceOrder> extractOrders(PDDocument document) throws IOException {
        List<InsuranceOrder> orderList = new ArrayList<InsuranceOrder>();
        PDFTextStripper stripper = new PDFTextStripper();
        // Emit text in positional (reading) order rather than content-stream order.
        stripper.setSortByPosition(true);

        int pages = document.getNumberOfPages();
        // Page numbers are 1-based; restricting start == end yields one page at a time.
        for (int page = 1; page <= pages; page++) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
            parseContent(stripper.getText(document), orderList);
        }
        return orderList;
    }

    /**
     * Parses the text of one page and, if it is a policy page carrying a
     * policy number, appends the resulting order to {@code list}.
     *
     * @param content text of a single page; {@code null} or blank is ignored
     * @param list    receives the parsed order
     */
    private static void parseContent(String content, List<InsuranceOrder> list) {
        if (content == null || content.trim().length() == 0) {
            return;
        }
        if (!content.startsWith(POLICY_PAGE_TITLE)) {
            return;
        }

        // A page without a policy number is not treated as an order.
        String insurancePoliceNo = retriveText(content, insurancePoliceNoP, 1);
        if (insurancePoliceNo.length() == 0) {
            return;
        }

        // Personal information.
        InsuranceOrder order = new InsuranceOrder();
        list.add(order);
        order.setInsurancePoliceNo(insurancePoliceNo);
        order.setInsuranceApplicationNo(retriveText(content, insuranceApplicationNoP, 1));

        // The retrieve helpers return "" (never null) when nothing matches, so
        // a missing person line simply yields empty sub-fields.
        String policeHolderInfo = retriveTextWithInnnerBlank(content, policeHolderP, 0);
        order.setPoliceHolderName(retriveText(policeHolderInfo, policeHolderNameP, 1));
        order.setPoliceHolderGender(retriveText(policeHolderInfo, personGenderP, 1));
        order.setPoliceHolderBirthday(retriveText(policeHolderInfo, personBirthdayP, 1));
        order.setPoliceHolderID(retriveText(policeHolderInfo, personIdP, 1));

        String insuredInfo = retriveTextWithInnnerBlank(content, insuredP, 0);
        order.setInsuredName(retriveText(insuredInfo, insuredNameP, 1));
        order.setInsuredGender(retriveText(insuredInfo, personGenderP, 1));
        order.setInsuredBirthday(retriveText(insuredInfo, personBirthdayP, 1));
        order.setInsuredID(retriveText(insuredInfo, personIdP, 1));

        order.setInsuredAge(retriveText(content, insuredAgeP, 1));
        order.setBeneficiary(retriveText(content, beneficiaryP, 1));

        // Insurance product information.
        Insurance insurance = new Insurance();
        order.setInsurance(insurance);
        insurance.setName(retriveText(content, insuranceNameP, 1));
        // The validity period may wrap across lines; join them.
        insurance.setValidPeriod(retriveText(content, validPeriodP, 1).replaceAll(LINE_BREAK, ""));
        insurance.setEffectiveDate(retriveText(content, effectiveDateP, 1));
        insurance.setChargeWay(retriveText(content, chargeWayP, 1));
        insurance.setFee(retriveText(content, feeP, 1));
        insurance.setPoliceHolderCount(retriveText(content, policeHolderCount, 1));

        // Coverage items: one per non-blank line, first column is the name and
        // second column the amount.
        String programList = retriveTextWithInnnerBlank(content, programListP, 1);
        for (String line : programList.split(LINE_BREAK)) {
            String row = line.trim();
            if (row.length() == 0) {
                continue;
            }
            String[] columns = row.split("\\s+");
            InsuranceProgram program = new InsuranceProgram();
            order.getProgramList().add(program);
            program.setName(columns[0]);
            // A row without a second column must not abort the whole page.
            program.setFee(columns.length > 1 ? columns[1] : "");
        }
    }

    /**
     * Returns the given group of the first match of {@code p} in
     * {@code content}, trimmed and with all space characters removed.
     *
     * @return the cleaned group text, or {@code ""} when there is no match
     */
    private static String retriveText(String content, Pattern p, int position) {
        Matcher m = p.matcher(content);
        if (m.find()) {
            return m.group(position).trim().replace(" ", "");
        }
        return "";
    }

    /**
     * Returns the given group of the first match of {@code p} in
     * {@code content}, trimmed at both ends but with inner blanks preserved.
     *
     * @return the trimmed group text, or {@code ""} when there is no match
     */
    private static String retriveTextWithInnnerBlank(String content, Pattern p, int position) {
        Matcher m = p.matcher(content);
        if (m.find()) {
            return m.group(position).trim();
        }
        return "";
    }
}

 

相关实体类:InsuranceOrder.java

package com.insurance.pojo;

import java.util.ArrayList;
import java.util.List;

/**
 * Plain data holder for one parsed insurance policy: identifiers, policy
 * holder and insured person details, the insurance product and its list of
 * coverage items. All scalar values are kept as strings.
 */
public class InsuranceOrder {

    /** Policy number. */
    private String insurancePoliceNo;
    /** Application number. */
    private String insuranceApplicationNo;

    /** Policy holder name. */
    private String policeHolderName;
    /** Policy holder date of birth. */
    private String policeHolderBirthday;
    /** Policy holder gender. */
    private String policeHolderGender;
    /** Policy holder identity document number. */
    private String policeHolderID;

    /** Insured person name. */
    private String insuredName;
    /** Insured person gender. */
    private String insuredGender;
    /** Insured person date of birth. */
    private String insuredBirthday;
    /** Insured person identity document number. */
    private String insuredID;
    /** Insured person age at application. */
    private String insuredAge;

    /** Death beneficiary and allocation method. */
    private String beneficiary;

    /** Insurance product. */
    private Insurance insurance;
    /** Coverage items; starts out as an empty, mutable list. */
    private List<InsuranceProgram> programList = new ArrayList<InsuranceProgram>();

    // ---- policy identifiers ----

    public String getInsurancePoliceNo() {
        return this.insurancePoliceNo;
    }

    public void setInsurancePoliceNo(String insurancePoliceNo) {
        this.insurancePoliceNo = insurancePoliceNo;
    }

    public String getInsuranceApplicationNo() {
        return this.insuranceApplicationNo;
    }

    public void setInsuranceApplicationNo(String insuranceApplicationNo) {
        this.insuranceApplicationNo = insuranceApplicationNo;
    }

    // ---- policy holder ----

    public String getPoliceHolderName() {
        return this.policeHolderName;
    }

    public void setPoliceHolderName(String policeHolderName) {
        this.policeHolderName = policeHolderName;
    }

    public String getPoliceHolderBirthday() {
        return this.policeHolderBirthday;
    }

    public void setPoliceHolderBirthday(String policeHolderBirthday) {
        this.policeHolderBirthday = policeHolderBirthday;
    }

    public String getPoliceHolderGender() {
        return this.policeHolderGender;
    }

    public void setPoliceHolderGender(String policeHolderGender) {
        this.policeHolderGender = policeHolderGender;
    }

    public String getPoliceHolderID() {
        return this.policeHolderID;
    }

    public void setPoliceHolderID(String policeHolderID) {
        this.policeHolderID = policeHolderID;
    }

    // ---- insured person ----

    public String getInsuredName() {
        return this.insuredName;
    }

    public void setInsuredName(String insuredName) {
        this.insuredName = insuredName;
    }

    public String getInsuredGender() {
        return this.insuredGender;
    }

    public void setInsuredGender(String insuredGender) {
        this.insuredGender = insuredGender;
    }

    public String getInsuredBirthday() {
        return this.insuredBirthday;
    }

    public void setInsuredBirthday(String insuredBirthday) {
        this.insuredBirthday = insuredBirthday;
    }

    public String getInsuredID() {
        return this.insuredID;
    }

    public void setInsuredID(String insuredID) {
        this.insuredID = insuredID;
    }

    public String getInsuredAge() {
        return this.insuredAge;
    }

    public void setInsuredAge(String insuredAge) {
        this.insuredAge = insuredAge;
    }

    // ---- beneficiary, product and coverage items ----

    public String getBeneficiary() {
        return this.beneficiary;
    }

    public void setBeneficiary(String beneficiary) {
        this.beneficiary = beneficiary;
    }

    public Insurance getInsurance() {
        return this.insurance;
    }

    public void setInsurance(Insurance insurance) {
        this.insurance = insurance;
    }

    public List<InsuranceProgram> getProgramList() {
        return this.programList;
    }

    public void setProgramList(List<InsuranceProgram> programList) {
        this.programList = programList;
    }

    /**
     * Renders every field as {@code name=value} pairs inside
     * {@code InsuranceOrder [...]}; unset fields print as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("InsuranceOrder [insurancePoliceNo=");
        text.append(insurancePoliceNo);
        text.append(", insuranceApplicationNo=").append(insuranceApplicationNo);
        text.append(", policeHolderName=").append(policeHolderName);
        text.append(", policeHolderBirthday=").append(policeHolderBirthday);
        text.append(", policeHolderGender=").append(policeHolderGender);
        text.append(", policeHolderID=").append(policeHolderID);
        text.append(", insuredName=").append(insuredName);
        text.append(", insuredGender=").append(insuredGender);
        text.append(", insuredBirthday=").append(insuredBirthday);
        text.append(", insuredID=").append(insuredID);
        text.append(", insuredAge=").append(insuredAge);
        text.append(", beneficiary=").append(beneficiary);
        text.append(", insurance=").append(insurance);
        text.append(", programList=").append(programList);
        text.append("]");
        return text.toString();
    }
}

InsuranceProgram.java

package com.insurance.pojo;

/**
 * A single insurance coverage item: its name and its amount, both kept as
 * plain strings.
 *
 * @author yinz
 */
public class InsuranceProgram {

    /** Item name. */
    private String name;
    /** Amount. */
    private String fee;

    public void setName(String name) {
        this.name = name;
    }

    public String getName() {
        return this.name;
    }

    public void setFee(String fee) {
        this.fee = fee;
    }

    public String getFee() {
        return this.fee;
    }

    /**
     * Renders the item as {@code InsuranceProgram [name=..., fee=...]};
     * unset fields print as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("InsuranceProgram [name=");
        text.append(name).append(", fee=").append(fee).append("]");
        return text.toString();
    }
}

 

此处用于读取的pdf文件:http://files.cnblogs.com/files/yinz/场景1_样例_电子保单识别.rar

以上是关于pdfBox 读取pdf文件的主要内容,如果未能解决你的问题,请参考以下文章

java 如何读取PDF文件内容

PDFBox 解析PDF文件-解析服务器文件

java读取pdf和MS Office文档

使用 PDFBox 从 PDF 文档中读取特定页面

Apache PDFbox开发指南之PDF文档读取

用java读取多种文件格式的文件(pdf,pptx,ppt,doc,docx..)