PDF.js可以实现在html下直接浏览pdf文档,是一款开源的pdf文档读取解析插件,非常强大,能将PDF文件渲染成Canvas。PDF.js主要包含两个库文件,一个pdf.js和一个pdf.worker.js,一个负责API解析,一个负责核心解析。
首先引入pdf.js文件:<script type="text/javascript" src="pdf.js"></script>
PDF.js大部分用法都是基于Promise的,PDFJS.getDocument(url)方法返回的就是一个Promise:
// Load a PDF by URL; the callback receives the loaded document object.
PDFJS.getDocument('helloworld.pdf').then(function (pdf) {
  // Work with the loaded document through `pdf` here.
});
PDF的解析工作需要通过pdf.getPage(page)去执行,这个方法返回的也是一个Promise,因此可以去逐页解析PDF:
// Request page 1 of the document; the callback runs once the promise
// returned by getPage() resolves and receives the page object.
pdf.getPage(1).then(function (page) {
  // Work with the parsed page through `page` here.
});
官网地址:http://mozilla.github.io/pdf.js/
渲染页面
每个PDF页面都有自己的视口(viewport),它定义了以像素为单位的尺寸(按72 DPI计算)和初始旋转角度。默认情况下,视口会缩放到PDF的原始大小,但可以通过修改视口来改变这一行为。创建视口时,还会创建一个初始变换矩阵,它会考虑期望的缩放比例和旋转角度,并转换坐标系(PDF文档的(0,0)点位于左下角,而canvas的(0,0)点位于左上角)。
// Render `page` at 1.5x scale into the element with id "the-canvas".
var scale = 1.5;
var viewport = page.getViewport(scale);

// Size the canvas to match the viewport dimensions before drawing.
var canvas = document.getElementById('the-canvas');
var context = canvas.getContext('2d');
canvas.height = viewport.height;
canvas.width = viewport.width;

// Hand the 2D context and the viewport to the page renderer.
var renderContext = {
  canvasContext: context,
  viewport: viewport
};
page.render(renderContext);
还可以自定义canvas大小:
// Fit the page to a fixed width: read the page width at scale 1, derive the
// scale factor that produces the desired width, then build the final viewport.
var desiredWidth = 100;
var viewport = page.getViewport(1);
var scale = desiredWidth / viewport.width;
var scaledViewport = page.getViewport(scale);
官方给出的示例:
// Location of the PDF to display.
var url = '//cdn.mozilla.net/pdfjs/helloworld.pdf';

// Tell PDF.js where to find its worker script.
PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

// Start loading the document; the loading task exposes a promise.
var loadingTask = PDFJS.getDocument(url);
loadingTask.promise.then(function (pdf) {
  console.log('PDF loaded');

  // Fetch the first page.
  var pageNumber = 1;
  pdf.getPage(pageNumber).then(function (page) {
    console.log('Page loaded');

    var scale = 1.5;
    var viewport = page.getViewport(scale);

    // Size the canvas to match the viewport dimensions before drawing.
    var canvas = document.getElementById('the-canvas');
    var context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    // Render the page into the canvas context.
    var renderContext = {
      canvasContext: context,
      viewport: viewport
    };
    var renderTask = page.render(renderContext);
    // Wait on the task's promise, matching the paging example in this article.
    renderTask.promise.then(function () {
      console.log('Page rendered');
    });
  });
}, function (reason) {
  // The document failed to load.
  console.error(reason);
});
另外,较大的PDF文件可以用Base64编码方式加载,例如:
// Base64-encoded PDF, decoded with atob() into a binary string that can be
// passed to PDFJS.getDocument({data: pdfData}).
var pdfData = atob(
  'JVBERi0xLjcKCjEgMCBvYmogICUgZW50cnkgcG9pbnQKPDwKICAvVHlwZSAvQ2F0YWxvZwog' +
  'IC9QYWdlcyAyIDAgUgo+PgplbmRvYmoKCjIgMCBvYmoKPDwKICAvVHlwZSAvUGFnZXMKICAv' +
  'TWVkaWFCb3ggWyAwIDAgMjAwIDIwMCBdCiAgL0NvdW50IDEKICAvS2lkcyBbIDMgMCBSIF0K' +
  'Pj4KZW5kb2JqCgozIDAgb2JqCjw8CiAgL1R5cGUgL1BhZ2UKICAvUGFyZW50IDIgMCBSCiAg' +
  'L1Jlc291cmNlcyA8PAogICAgL0ZvbnQgPDwKICAgICAgL0YxIDQgMCBSIAogICAgPj4KICA+' +
  'PgogIC9Db250ZW50cyA1IDAgUgo+PgplbmRvYmoKCjQgMCBvYmoKPDwKICAvVHlwZSAvRm9u' +
  'dAogIC9TdWJ0eXBlIC9UeXBlMQogIC9CYXNlRm9udCAvVGltZXMtUm9tYW4KPj4KZW5kb2Jq' +
  'Cgo1IDAgb2JqICAlIHBhZ2UgY29udGVudAo8PAogIC9MZW5ndGggNDQKPj4Kc3RyZWFtCkJU' +
  'CjcwIDUwIFRECi9GMSAxMiBUZgooSGVsbG8sIHdvcmxkISkgVGoKRVQKZW5kc3RyZWFtCmVu' +
  'ZG9iagoKeHJlZgowIDYKMDAwMDAwMDAwMCA2NTUzNSBmIAowMDAwMDAwMDEwIDAwMDAwIG4g' +
  'CjAwMDAwMDAwNzkgMDAwMDAgbiAKMDAwMDAwMDE3MyAwMDAwMCBuIAowMDAwMDAwMzAxIDAw' +
  'MDAwIG4gCjAwMDAwMDAzODAgMDAwMDAgbiAKdHJhaWxlcgo8PAogIC9TaXplIDYKICAvUm9v' +
  'dCAxIDAgUgo+PgpzdGFydHhyZWYKNDkyCiUlRU9G');
// Tell PDF.js where to find its worker script.
PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

// Load the document from the in-memory data instead of a URL.
var loadingTask = PDFJS.getDocument({data: pdfData});
loadingTask.promise.then(function (pdf) {
  console.log('PDF loaded');

  // Fetch the first page.
  var pageNumber = 1;
  pdf.getPage(pageNumber).then(function (page) {
    console.log('Page loaded');

    var scale = 1.5;
    var viewport = page.getViewport(scale);

    // Size the canvas to match the viewport dimensions before drawing.
    var canvas = document.getElementById('the-canvas');
    var context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    // Render the page into the canvas context.
    var renderContext = {
      canvasContext: context,
      viewport: viewport
    };
    var renderTask = page.render(renderContext);
    // Wait on the task's promise, matching the paging example in this article.
    renderTask.promise.then(function () {
      console.log('Page rendered');
    });
  });
}, function (reason) {
  // The document failed to load.
  console.error(reason);
});
pdf翻页处理:
// If absolute URL from the remote server is provided, configure the CORS
// header on that server.
var url = '//cdn.mozilla.net/pdfjs/tracemonkey.pdf';

// The workerSrc property shall be specified.
PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

// Shared pager state:
//   pdfDoc         - the loaded document (set once loading completes)
//   pageNum        - the page currently shown
//   pageRendering  - true while a render is in progress
//   pageNumPending - page requested while a render was in progress, or null
var pdfDoc = null,
    pageNum = 1,
    pageRendering = false,
    pageNumPending = null,
    scale = 0.8,
    canvas = document.getElementById('the-canvas'),
    ctx = canvas.getContext('2d');

/**
 * Get page info from document, resize canvas accordingly, and render page.
 * @param num Page number.
 */
function renderPage(num) {
  pageRendering = true;

  pdfDoc.getPage(num).then(function (page) {
    var viewport = page.getViewport(scale);
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    // Render the page into the canvas context.
    var renderContext = {
      canvasContext: ctx,
      viewport: viewport
    };
    var renderTask = page.render(renderContext);

    // Once rendering completes, process any page that was requested meanwhile.
    renderTask.promise.then(function () {
      pageRendering = false;
      if (pageNumPending !== null) {
        renderPage(pageNumPending);
        pageNumPending = null;
      }
    });
  });

  // Update the page counter immediately, without waiting for the render.
  document.getElementById('page_num').textContent = num;
}
/**
 * Render the requested page now, or remember it for later if another page
 * is still being rendered.
 * @param num Page number.
 */
function queueRenderPage(num) {
  if (!pageRendering) {
    renderPage(num);
    return;
  }
  // A render is in progress: keep only the most recent request.
  pageNumPending = num;
}
/**
 * Show the previous page; does nothing when already on the first page.
 */
function onPrevPage() {
  if (pageNum <= 1) {
    return;
  }
  pageNum--;
  queueRenderPage(pageNum);
}
document.getElementById('prev').addEventListener('click', onPrevPage);
/**
 * Show the next page; does nothing when already on the last page.
 */
function onNextPage() {
  if (pageNum >= pdfDoc.numPages) {
    return;
  }
  pageNum++;
  queueRenderPage(pageNum);
}
document.getElementById('next').addEventListener('click', onNextPage);
// Load the document, show the total page count, and render the initial page.
PDFJS.getDocument(url).then(function (pdfDoc_) {
  pdfDoc = pdfDoc_;
  document.getElementById('page_count').textContent = pdfDoc.numPages;

  renderPage(pageNum);
}, function (reason) {
  // The document failed to load; report it like the other examples do.
  console.error(reason);
});
关于page对象方法的使用:
page对象就是解析的结果,我们可以看下这个对象提供的方法:
方法 | 返回 |
---|---|
getAnnotations | A promise that is resolved with an {Array} of the annotation objects. |
getTextContent | A promise that is resolved with a TextContent object that represents the page text content. |
getViewport | Contains ‘width’ and ‘height’ properties along with transforms required for rendering. |
render | An object that contains the promise, which is resolved when the page finishes rendering. |
我们可以试试调用getTextContent方法,并将其结果打印出来:
// Fetch page 1, extract its text content, and print the result.
pdf.getPage(1).then(function (page) {
  // getTextContent() is asynchronous; return its promise to chain on it.
  return page.getTextContent();
}).then(function (textContent) {
  console.log(textContent);
});
输出格式大致如下:
{ "items": [ { "str": "xxx", "dir": "xxx", "width": xxx, "height": xxx, "transform": [ 48, 0, 0, 48, 45.32495, 679.04 ], "fontName": "g_d0_f1" }, { "str": " ", "dir": "ltr", "width": 9.600000000000001, "height": 2304, "transform": [ 48, 0, 0, 48, 285.325, 679.04 ], "fontName": "g_d0_f2" } ], "styles": { "g_d0_f1": { "fontFamily": "monospace", "ascent": 1.05810546875, "descent": -0.26171875, "vertical": false }, "g_d0_f2": { "fontFamily": "sans-serif", "ascent": 0.74365234375, "descent": -0.25634765625 } } }
PDF.js能将每页文本的字符串、位置、字体都解析出来。
官网的在线阅读器用的是viewer.js:http://mozilla.github.io/pdf.js/web/viewer.html 。首先底图是一个Canvas,内容和PDF一样(通过上面介绍的page.render方法可以得到);底图之上是一个textLayer,这一层通过page.getTextContent()得到文字的位置和样式,再覆盖在Canvas上。
我们可以直接使用官网viewer.html的demo,然后修改样式、去掉用不到的功能,简单粗暴。只需要在跳转链接后面加上file参数就行,例:http://xxxx/viewer.html?file=xxxx.pdf