ExcelPDF文档解析
Posted lplucky
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了ExcelPDF文档解析相关的知识,希望对你有一定的参考价值。
from __future__ import print_function

import datetime
import sys

import xlrd
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFTextExtractionNotAllowed, PDFPage
from pdfminer.pdfparser import PDFParser

# Python 2 only: switch the interpreter's default codec to UTF-8 so implicit
# str/unicode conversions of the Chinese text do not fail.  `reload` is not a
# builtin on Python 3, where this hack is neither available nor needed.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

# (result key, spreadsheet column index) pairs copied into each record.
# Column 0 is not read; column 10 ("issued at the same time as listing,
# yes/no") is skipped on purpose, as in the original code.
_EXCEL_COLUMNS = (
    ('company_name', 1),
    ('province', 2),
    ('industry', 3),
    ('broker_company', 4),
    ('broker_person', 5),
    ('law_firm', 6),
    # NOTE(review): 'laywyer' looks like a typo for 'lawyer', but the key is
    # consumed by saveOrUpdateNew3CompanyBaseInfo (not visible here), so it
    # is kept as-is -- confirm before renaming.
    ('laywyer', 7),
    ('accounting_firm', 8),
    ('accountant', 9),
    ('progress', 11),
)


def _timestamp():
    """Return the current local time formatted as 'YYYY-mm-dd HH:MM:SS'."""
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def pdf_transform_text(pdf_path='tempPdfFile_new3.pdf'):
    """Extract the text of a PDF file with pdfminer.

    Args:
        pdf_path: Path of the PDF to read.  Defaults to the temp file name
            the original code hard-coded.

    Returns:
        The text of every ``LTTextBoxHorizontal`` layout element, in
        iteration order, each followed by a newline, joined into one string.

    Raises:
        PDFTextExtractionNotAllowed: If ``document.is_extractable`` is false.
    """
    print("开始解析pdf" + _timestamp())
    pieces = []
    # `with` guarantees the file handle is released, including on the raise.
    with open(pdf_path, 'rb') as fp:
        # Parser reads the raw file; the document object holds its structure.
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # Refuse files that do not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources, default layout-analysis
        # parameters, an aggregating device and the page interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # The device now holds the LTPage layout of the page just processed.
            layout = device.get_result()
            for element in layout:
                if isinstance(element, LTTextBoxHorizontal):
                    pieces.append(element.get_text())
                    pieces.append(u'\n')
    print("解析pdf成功" + _timestamp())
    # Join once instead of growing a string with += and re-encoding each piece.
    return u''.join(pieces)


def _row_to_record(row):
    """Map one spreadsheet row (a list of cell values) to a result dict."""
    record = dict((key, row[index]) for key, index in _EXCEL_COLUMNS)
    # NOTE(review): receive_date is only taken when the row has exactly 13
    # cells, as in the original; wider rows are left without it -- confirm
    # whether `>= 13` is intended.
    if len(row) == 13:
        record['receive_date'] = row[12]
    return record


def parse_excel(url, filename):
    """Download an Excel workbook and save one record per data row.

    The workbook is fetched with ``WebRequests().get`` (a helper defined
    outside this snippet), written to ``tempExcelFile_new3.xls`` and opened
    with xlrd.  Every sheet is processed; in each sheet rows from index 2
    onward (presumably the first two rows are headers -- confirm) that have
    at least 12 cells are converted to a dict and passed to
    ``saveOrUpdateNew3CompanyBaseInfo``.

    Args:
        url: Address the workbook is downloaded from.
        filename: Only used in the start-up log line.
    """
    print('开始解析excel文档', filename, url)
    req = WebRequests()
    response = req.get(url, timeout=180)
    # Stage the download in a local file so xlrd can open it.
    with open("tempExcelFile_new3.xls", "wb") as xls:
        xls.write(response.content)
    workbook = xlrd.open_workbook("tempExcelFile_new3.xls")
    for table in workbook.sheets():
        print(table.name)
        print(table.nrows)
        for row_index in range(2, table.nrows):
            # Read the row once instead of once per extracted column.
            row = table.row_values(row_index)
            if len(row) >= 12:
                saveOrUpdateNew3CompanyBaseInfo(_row_to_record(row))
注意:解析 Excel 时要先确认工作簿中是否包含多个 sheet。该 PDF 解析方法只能解析内容为文本的 PDF。
以上是关于ExcelPDF文档解析的主要内容,如果未能解决你的问题,请参考以下文章
片段(Java) | 机试题+算法思路+考点+代码解析 2023