如何在 JavaScript 中从 PDF 中提取文本

Posted 2023-02-24

技术标签:

【中文标题】如何在 JavaScript 中从 PDF 中提取文本【英文标题】：How to extract text from PDF in JavaScript 【发布时间】：2010-12-05 23:20:36 【问题描述】：

我想知道是否可以仅使用 javascript 来获取 PDF 文件中的文本？如果是的话，谁能告诉我怎么做？

我知道有一些服务器端 java、c# 等库，但我不想使用服务器。谢谢

【问题讨论】：

【参考方案1】：

这是一个古老的问题，但由于pdf.js多年来一直在发展，我想给出一个新的答案。也就是说，它可以在本地完成，而不涉及任何服务器或外部服务。新的 pdf.js 有一个函数：page.getTextContent()。您可以从中获取文本内容。我已经使用以下代码成功完成了它。

您在每一步中得到的都是一个承诺。您需要这样编码：.then( function()...) 才能继续下一步。

1) PDFJS.getDocument( data ).then( function(pdf)

2) pdf.getPage(i).then( function(page)

3)page.getTextContent().then( function(textContent)

你最终得到的是一个字符串数组textContent.bidiTexts[]。您将它们连接起来以获取 1 页的文本。文本块的坐标用于判断是否需要插入换行符或空格。（这可能并不完全健壮，但从我的测试看来还可以。）

输入参数 data 需要是 URL 或 ArrayBuffer 类型的数据。我使用 FileReader API 中的 readAsArrayBuffer(file) 函数来获取数据。

希望这会有所帮助。

注意：根据其他一些用户的说法，该库已更新并导致代码中断。根据下面 async5 的评论，您需要将textContent.bidiTexts 替换为textContent.items。

    /**
     * Extracts the plain text of a PDF through the legacy PDF.js promise API
     * (global `PDFJS`, text blocks exposed as `textContent.bidiTexts`).
     * According to the note accompanying this snippet, newer library versions
     * expose the blocks as `textContent.items` instead.
     */
    function Pdf2TextClass() {
      var self = this;
      // Number of pages processed so far by the current pdfToText() run.
      this.complete = 0;

      /**
       *
       * @param data ArrayBuffer of the pdf file content, or a URL string
       * @param callbackPageDone To inform the progress each time
       *        when a page is finished. The callback function's input parameters are:
       *        1) number of pages done;
       *        2) total number of pages in file.
       * @param callbackAllDone The input parameter of callback function is
       *        the result of extracted text from pdf file.
       *
       */
      this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
        console.assert(data instanceof ArrayBuffer || typeof data == 'string');
        // Restart the counter on every run so the same instance can be reused.
        self.complete = 0;
        PDFJS.getDocument(data).then(function (pdf) {
          var total = pdf.numPages;
          callbackPageDone(0, total);
          // Page texts keyed by page number, because pages may finish out of order.
          var layers = {};
          for (var i = 1; i <= total; i++) {
            pdf.getPage(i).then(function (page) {
              var n = page.pageNumber;
              page.getTextContent().then(function (textContent) {
                if (null != textContent.bidiTexts) {
                  var page_text = "";
                  var last_block = null;
                  for (var k = 0; k < textContent.bidiTexts.length; k++) {
                    var block = textContent.bidiTexts[k];
                    // Only consider a separator when the previous block does not already end with a space.
                    if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
                      if (block.x < last_block.x) {
                        page_text += "\r\n";
                      } else if (last_block.y != block.y && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null)) {
                        // No space is added after a block that is, or ends with, a lone letter.
                        page_text += ' ';
                      }
                    }
                    page_text += block.str;
                    last_block = block;
                  }

                  console.log("page " + n + " finished.");
                  layers[n] = page_text + "\n\n";
                }
                ++self.complete;
                callbackPageDone(self.complete, total);
                if (self.complete == total) {
                  // The counter is bumped only after a page's text has been stored,
                  // so every page is available here and no timer is needed.
                  var full_text = "";
                  for (var j = 1; j <= total; j++) {
                    // Pages without text content have no entry; skip them.
                    if (layers[j] != null) {
                      full_text += layers[j];
                    }
                  }
                  callbackAllDone(full_text);
                }
              }); // end  of page.getTextContent().then
            }); // end of page.then
          } // of for
        });
      }; // end of pdfToText()
    } // end of class

【讨论】：

Ancient question 但很好的答案。您知道如何让 textLayer 不在单个 div 中呈现字符而是将它们呈现为整个单词？尝试使用与绝对定位的 div 重叠的文本层对我的性能造成了很大的影响，因为它们太多了。如果您希望将此作为单独的实际 *** 问题，我会提出一个。 @gm2008 我一直在尝试使用您的功能从 PDF 中提取文本。但是，我无法提取文本。 full_text 最后返回一个空字符串。你能帮忙吗？我也无法让它工作（API 已更改）。在下面添加了我自己的示例。在答案中添加更多示例：github.com/mozilla/pdf.js/blob/master/examples/text-only/… 和 github.com/mozilla/pdf.js/blob/master/examples/node/getinfo.js 用 textContent.items 替换 textContent.bidiTexts【参考方案2】：

我无法让 gm2008 的示例工作（pdf.js 上的内部数据结构明显改变了），所以我编写了自己的完全基于 Promise 的解决方案，不使用任何 DOM 元素、查询选择器或画布，使用mozilla 示例中更新的 pdf.js

因为我将它与 node-webkit 一起使用，所以它接收的是上传文件的路径。您需要确保已下载 cmaps 并将其路径指向正确的位置，并且需要 pdf.js 和 pdf.worker.js 才能使其正常工作。

    /**
     * Extract text from PDFs with PDF.js
     * Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
     *
     * @param data Document source handed to PDFJS.getDocument() (the usage
     *        example passes a file path).
     * @returns A promise resolving to one string: the text items of each page
     *          joined with spaces, and the pages joined with "\r\n".
     */
    this.pdfToText = function (data) {

        PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
        PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
        PDFJS.cMapPacked = true;

        // Some library versions return a loading task that is not itself
        // thenable and only exposes `.promise`; accept both shapes.
        var loadingTask = PDFJS.getDocument(data);
        var documentReady = loadingTask.promise || loadingTask;

        return documentReady.then(function (pdf) {
            var pageTexts = [];
            // Page numbers are 1-based.
            for (var pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
                pageTexts.push(pdf.getPage(pageNumber).then(function (page) {
                    return page.getTextContent();
                }).then(function (textContent) {
                    return textContent.items.map(function (item) {
                        return item.str;
                    }).join(' ');
                }));
            }
            // Promise.all keeps the results in page order regardless of completion order.
            return Promise.all(pageTexts);
        }).then(function (pages) {
            return pages.join("\r\n");
        });
    };

用法：

 // Extract the text of the first selected file and log it; failures are logged too.
 self.pdfToText(files[0].path).then(function (result) {
      console.log("PDF done!", result);
 }).catch(function (err) {
      console.error("PDF failed!", err);
 });

【讨论】：

另见github.com/mozilla/pdf.js/blob/master/examples/text-only/…和github.com/mozilla/pdf.js/blob/master/examples/node/getinfo.js "PDFJS.getDocument(...).then 不是函数"【参考方案3】：

这里有一些 JavaScript 代码，可以使用来自 http://hublog.hubmed.org/archives/001948.html 的 Pdf.js 执行您想要的操作：

// Elements looked up by id: "input" carries the PDF address in its `src`
// attribute, "processor" exposes a `contentWindow` that does the extraction,
// and "output" receives the resulting text.
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");

// listen for messages from the processor
window.addEventListener("message", function (event) {
  // Ignore messages that do not come from the processor window.
  if (event.source != processor.contentWindow) return;

  switch (event.data) {
    // "ready" = the processor is ready, so fetch the PDF file
    case "ready": {
      var xhr = new XMLHttpRequest();
      xhr.open('GET', input.getAttribute("src"), true);
      xhr.responseType = "arraybuffer";
      xhr.onload = function () {
        // NOTE(review): "*" lets any origin receive the bytes; consider
        // restricting the target origin if the processor's origin is known.
        processor.contentWindow.postMessage(this.response, "*");
      };
      xhr.send();
      break;
    }

    // anything else = the processor has returned the text of the PDF
    default:
      // Collapse every run of whitespace into a single space.
      output.textContent = event.data.replace(/\s+/g, " ");
      break;
  }
}, true);

...这里有一个例子：

http://git.macropus.org/2011/11/pdftotext/example/

【讨论】：

虽然这些链接可能会回答问题，但最好在此处包含答案的基本部分并提供链接以供参考。如果链接页面发生更改，仅链接的答案可能会失效。嗨，我正在尝试这个，但这仍然需要将文件上传到服务器。如何在客户端本地处理文件？【参考方案4】：

注意：此代码假定您使用的是 nodejs。这意味着您正在解析本地文件而不是来自网页的文件，因为原始问题没有明确询问有关在网页上解析 pdf 的问题。

@gm2008 的回答是一个很好的起点（请阅读它及其评论了解更多信息），但需要一些更新 (08/19) 并且有一些未使用的代码。我也喜欢更完整的例子。可以进行更多重构和调整（例如使用await），但目前它已尽可能接近原始答案。

和以前一样，它使用 Mozilla 的 PDFjs 库。 npmjs 包位于https://www.npmjs.com/package/pdfjs-dist。

~~根据我的经验，这在寻找放置空间的位置方面效果不佳，但这是另一个问题。~~

[编辑：我相信.transform 使用的更新已经恢复了原来的空白。]

// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');

let pathToPDF = 'path/to/myPDFfileToText.pdf';

// Instantiate with `new` so that `this` inside Pdf2TextObj is always defined.
let toText = new Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText); };
toText.pdfToText(pathToPDF, onPageDone, onFinish);

/**
 * Object factory for a PDF-to-text extractor built on the `PDFJS` module.
 * The extractor is a plain object, so the factory works both as
 * `Pdf2TextObj()` and as `new Pdf2TextObj()` and never touches `this`.
 */
function Pdf2TextObj() {
    let self = {};
    // Number of pages processed so far by the current pdfToText() run.
    self.complete = 0;

    /**
     *
     * @param path Path to the pdf file.
     * @param callbackPageDone To inform the progress each time
     *        when a page is finished. The callback function's input parameters are:
     *        1) number of pages done.
     *        2) total number of pages in file.
     *        3) the `page` object itself or null.
     * @param callbackAllDone Called after all text has been collected. Input parameters:
     *        1) full text of parsed pdf.
     *
     */
    self.pdfToText = function(path, callbackPageDone, callbackAllDone) {
        // console.assert(typeof path == 'string');
        // Restart the counter on every run so the same object can be reused.
        self.complete = 0;
        PDFJS.getDocument(path).promise.then(function(pdf) {

            let total = pdf.numPages;
            callbackPageDone(0, total, null);

            // For some (pdf?) reason these don't all come in consecutive
            // order. That's why they're stored as an object and then
            // processed one final time at the end.
            let pages = {};
            for (let pagei = 1; pagei <= total; pagei++) {
                pdf.getPage(pagei).then(function(page) {
                    let pageNumber = page.pageNumber;
                    page.getTextContent().then(function(textContent) {
                        if (null != textContent.items) {
                            let page_text = "";
                            let last_item = null;
                            for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
                                let item = textContent.items[itemsi];
                                // I think to add whitespace properly would be more complex and
                                // would require two loops.
                                if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
                                    // NOTE(review): in PDF.js the last two transform entries are
                                    // presumably the horizontal (index 4) and vertical (index 5)
                                    // offsets, so "itemX" below would actually be the vertical
                                    // position - confirm. The comparisons are kept as they were.
                                    let itemX = item.transform[5];
                                    let lastItemX = last_item.transform[5];
                                    let itemY = item.transform[4];
                                    let lastItemY = last_item.transform[4];
                                    if (itemX < lastItemX) {
                                        page_text += "\r\n";
                                    } else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null)) {
                                        page_text += ' ';
                                    }
                                } // ends if may need to add whitespace

                                page_text += item.str;
                                last_item = item;
                            } // ends for every item of text

                            console.log("page " + pageNumber + " finished.");
                            pages[pageNumber] = page_text + "\n\n";
                        } // ends if has items

                        ++self.complete;

                        callbackPageDone(self.complete, total, page);

                        // If all done, put pages in order and combine all
                        // text, then pass that to the callback
                        if (self.complete == total) {
                            // The counter is bumped only after a page's text has been
                            // stored, so every page is available here without a timer.
                            let full_text = "";
                            for (let pageNum = 1; pageNum <= total; pageNum++) {
                                // Pages without text items have no entry; skip them.
                                if (pages[pageNum] != null) {
                                    full_text += pages[pageNum];
                                }
                            }
                            callbackAllDone(full_text);
                        }
                    }); // ends page.getTextContent().then
                }); // ends page.then
            } // ends for every page
        });
    }; // Ends pdfToText()

    return self;
} // Ends object factory

在终端中运行：

node myPDFfileToText.js

【讨论】：

"无法设置未定义的“完整”属性"【参考方案5】：

2021 年 2 月更新

<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
    <script>
    
/**
 * Extracts the plain text of a PDF using the global `pdfjsLib`.
 */
function Pdf2TextClass() {
    var self = this;
    // Number of pages processed so far by the current pdfToText() run.
    this.complete = 0;

    /**
     * @param data ArrayBuffer of the pdf file content, or a URL string.
     * @param callbackPageDone Optional. Called with (pages done, total pages)
     *        once before the first page and again after each finished page.
     * @param callbackAllDone Optional. Called with the full extracted text;
     *        when omitted the text is written to the console instead.
     */
    this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
        console.assert(data instanceof ArrayBuffer || typeof data == 'string');
        // Both callbacks are optional; fall back to no-op / console output.
        var reportProgress = typeof callbackPageDone === 'function' ? callbackPageDone : function () {};
        var deliver = typeof callbackAllDone === 'function' ? callbackAllDone : function (text) { console.log(text); };
        // Restart the counter on every run so the same instance can be reused.
        self.complete = 0;

        var loadingTask = pdfjsLib.getDocument(data);
        loadingTask.promise.then(function (pdf) {
            var total = pdf.numPages;
            reportProgress(0, total);
            // Page texts keyed by page number, because pages may finish out of order.
            var layers = {};
            for (var i = 1; i <= total; i++) {
                pdf.getPage(i).then(function (page) {
                    var n = page.pageNumber;
                    page.getTextContent().then(function (textContent) {
                        if (null != textContent.items) {
                            var page_text = "";
                            var last_block = null;
                            for (var k = 0; k < textContent.items.length; k++) {
                                var block = textContent.items[k];
                                if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
                                    // Entries of `items` carry their position in `transform`
                                    // rather than in `x`/`y` fields.
                                    // NOTE(review): indices 5 and 4 are presumably the vertical
                                    // and horizontal offsets - confirm against the PDF.js docs.
                                    if (block.transform[5] < last_block.transform[5]) {
                                        page_text += "\r\n";
                                    } else if (last_block.transform[4] != block.transform[4] && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null)) {
                                        page_text += ' ';
                                    }
                                }
                                page_text += block.str;
                                last_block = block;
                            }

                            console.log("page " + n + " finished.");
                            layers[n] = page_text + "\n\n";
                        }
                        ++self.complete;
                        reportProgress(self.complete, total);
                        if (self.complete == total) {
                            // The counter is bumped only after a page's text has been
                            // stored, so every page is available here without a timer.
                            var full_text = "";
                            for (var j = 1; j <= total; j++) {
                                // Pages without text items have no entry; skip them.
                                if (layers[j] != null) {
                                    full_text += layers[j];
                                }
                            }
                            deliver(full_text);
                        }
                    }); // end  of page.getTextContent().then
                }); // end of page.then
            } // of for
        });
    }; // end of pdfToText()
} // end of class
// Create an extractor and start processing; 'PDF_URL' is presumably a
// placeholder for the address of the PDF to read. No callbacks are passed.
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
    </script>

【讨论】：

【参考方案6】：

在这里留下一个完整的工作示例。

<html>
    <head>
        <script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
    </head>
    <body>
        <input id="pdffile" name="pdffile" type="file" />
        <button id="btn" onclick="convert()">Process</button>
        <div id="result"></div>
    </body>
</html>

<script>

    /**
     * Click handler of the "Process" button: reads the file chosen in the
     * #pdffile input as a data URL, extracts its text and appends the text
     * to the #result element.
     */
    function convert() {
        var file = document.getElementById('pdffile').files[0];
        if (!file) {
            // Nothing selected yet; reading `undefined` would throw.
            return;
        }
        var fr = new FileReader();
        var pdff = new Pdf2TextClass();
        fr.onload = function () {
            pdff.pdfToText(fr.result, null, (text) => { document.getElementById('result').innerText += text; });
        };
        fr.readAsDataURL(file);
    }

    /**
     * Extracts the plain text of a PDF using the global `pdfjsLib`.
     */
    function Pdf2TextClass() {
        var self = this;
        // Number of pages processed so far by the current pdfToText() run.
        this.complete = 0;

        /**
         * @param data ArrayBuffer of the pdf file content, or a URL string.
         * @param callbackPageDone Optional. Called with (pages done, total pages)
         *        once before the first page and again after each finished page.
         * @param callbackAllDone Optional. Called with the full extracted text;
         *        when omitted the text is written to the console instead.
         */
        this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
            console.assert(data instanceof ArrayBuffer || typeof data == 'string');
            // Both callbacks are optional; fall back to no-op / console output.
            var reportProgress = typeof callbackPageDone === 'function' ? callbackPageDone : function () {};
            var deliver = typeof callbackAllDone === 'function' ? callbackAllDone : function (text) { console.log(text); };
            // Restart the counter on every run so the same instance can be reused.
            self.complete = 0;

            var loadingTask = pdfjsLib.getDocument(data);
            loadingTask.promise.then(function (pdf) {
                var total = pdf.numPages;
                reportProgress(0, total);
                // Page texts keyed by page number, because pages may finish out of order.
                var layers = {};
                for (var i = 1; i <= total; i++) {
                    pdf.getPage(i).then(function (page) {
                        var n = page.pageNumber;
                        page.getTextContent().then(function (textContent) {
                            if (null != textContent.items) {
                                var page_text = "";
                                var last_block = null;
                                for (var k = 0; k < textContent.items.length; k++) {
                                    var block = textContent.items[k];
                                    if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
                                        // Entries of `items` carry their position in `transform`
                                        // rather than in `x`/`y` fields.
                                        // NOTE(review): indices 5 and 4 are presumably the vertical
                                        // and horizontal offsets - confirm against the PDF.js docs.
                                        if (block.transform[5] < last_block.transform[5]) {
                                            page_text += "\r\n";
                                        } else if (last_block.transform[4] != block.transform[4] && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null)) {
                                            page_text += ' ';
                                        }
                                    }
                                    page_text += block.str;
                                    last_block = block;
                                }

                                console.log("page " + n + " finished.");
                                layers[n] = page_text + "\n\n";
                            }
                            ++self.complete;
                            reportProgress(self.complete, total);
                            if (self.complete == total) {
                                // The counter is bumped only after a page's text has been
                                // stored, so every page is available here without a timer.
                                var full_text = "";
                                for (var j = 1; j <= total; j++) {
                                    // Pages without text items have no entry; skip them.
                                    if (layers[j] != null) {
                                        full_text += layers[j];
                                    }
                                }
                                deliver(full_text);
                            }
                        }); // end  of page.getTextContent().then
                    }); // end of page.then
                } // of for
            });
        }; // end of pdfToText()
    } // end of class

</script>

【讨论】：

【参考方案7】：

对于所有真正想在节点服务器上使用它的人：

/**
 * Created by velten on 25.04.16.
 *
 * Downloads a PDF over HTTP, streams it into pdf2json and prints the total
 * number of text blocks found across all pages.
 */
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
let PDFParser = require('pdf2json');

// pipe() needs a stream instance; handing it the module export itself is
// what fails with "dest.on is not a function".
let pdfParser = new PDFParser();

// encoding: null keeps the response body as raw bytes instead of a string.
let pdfPipe = request({url: pdfUrl, encoding: null}).pipe(pdfParser);

pdfPipe.on("pdfParser_dataError", err => console.error(err));
pdfPipe.on("pdfParser_dataReady", pdf => {
    //optionally:
    //let pdf = pdfParser.getMergedTextBlocksIfNeeded();

    let count1 = 0;
    // count the text blocks of every page
    for (let page of pdf.formImage.Pages) {
        count1 += page.Texts.length;
    }

    console.log(count1);
    pdfParser.destroy();
});

【讨论】：

"dest.on 不是函数" @BartusZak foo.bar 也不是函数 ;)【参考方案8】：

有可能，但是：

无论如何，您都必须使用服务器：如果不将文件传输到服务器并返回，您将无法在用户计算机上获取文件的内容。我觉得还没有人写过这样的库。

所以如果你有空闲时间你可以学习pdf格式并自己写一个这样的库，或者你当然可以使用服务器端库。

【讨论】：

以上是关于如何在 JavaScript 中从 PDF 中提取文本的主要内容，如果未能解决你的问题，请参考以下文章