java将Word转换为html
Posted 磨人小妖精
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了java将Word转换为html相关的知识,希望对你有一定的参考价值。
我这里是maven项目,只需在pom.xml中配置依赖,maven会自动下载所需的jar包
在pom.xml中配置
<!-- Word to HTML conversion: https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.17</version> </dependency> <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.17</version> </dependency> <!-- https://mvnrepository.com/artifact/fr.opensagres.xdocreport/fr.opensagres.xdocreport.converter.docx.xwpf --> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId> <version>2.0.1</version> </dependency>
java代码
package com.lmt.service.file; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.util.UUID; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.converter.PicturesManager; import org.apache.poi.hwpf.converter.WordToHtmlConverter; import org.apache.poi.hwpf.usermodel.PictureType; import org.apache.poi.util.IOUtils; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import org.springframework.web.multipart.MultipartFile; import org.w3c.dom.Document; import fr.opensagres.poi.xwpf.converter.core.ImageManager; import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter; import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions; @Component public class WordToHtml { private static final Logger logger = LoggerFactory.getLogger(WordToHtml.class); @Autowired private ParseFile parseFile; public File convert(MultipartFile file) { String filename = file.getOriginalFilename(); String suffix=filename.substring(filename.lastIndexOf(".")); String newName=UUID.randomUUID().toString(); // TODO 需要保存在一个新的位置 File convFile = new File("D:/test/" + newName +suffix); FileOutputStream fos = null; try { convFile.createNewFile(); fos = new FileOutputStream(convFile); fos.write(file.getBytes()); } catch (IOException ex) { logger.error("上传文件出错!", 
ex); return null; } finally { IOUtils.closeQuietly(fos); } // 输入文件名的所在文件夹 // 加上反斜杠 String parentDirectory = convFile.getParent(); if (!parentDirectory.endsWith("\")) { parentDirectory = parentDirectory + "\"; } if (filename.endsWith(".docx")) { return docxConvert(parentDirectory, convFile.getAbsolutePath(),newName); } else if (filename.endsWith(".doc")) { return docConvert(parentDirectory, convFile.getAbsolutePath(),newName); } else { logger.error("不支持的文件格式!"); return null; } } private File docxConvert(String parentDirectory, String filename,String newName) { try { XWPFDocument document = new XWPFDocument(new FileInputStream(filename)); XHTMLOptions options = XHTMLOptions.create().setImageManager(new ImageManager(new File(parentDirectory), UUID.randomUUID().toString())).indent(4); FileOutputStream out = new FileOutputStream(new File(parentDirectory + newName+ ".html")); XHTMLConverter.getInstance().convert(document, out, options); return new File(parentDirectory + newName+ ".html"); } catch (IOException ex) { logger.error("word转化出错!", ex); return null; } } private File docConvert(String parentDirectory, String filename,String newName) { try { HWPFDocument document = new HWPFDocument(new FileInputStream(filename)); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( DocumentBuilderFactory.newInstance().newDocumentBuilder() .newDocument()); // converter默认对图片不作处理,需要手动下载图片并嵌入到html中 wordToHtmlConverter.setPicturesManager(new PicturesManager() { @Override public String savePicture(byte[] bytes, PictureType pictureType, String s, float v, float v1) { String imageFilename = parentDirectory + ""; String identity=UUID.randomUUID().toString(); File imageFile = new File(imageFilename, identity+s); imageFile.getParentFile().mkdirs(); InputStream in = null; FileOutputStream out = null; try { in = new ByteArrayInputStream(bytes); out = new FileOutputStream(imageFile); IOUtils.copy(in, out); } catch (IOException ex) { logger.error("word转化出错!", ex); } finally { if 
(in != null) { IOUtils.closeQuietly(in); } if (out != null) { IOUtils.closeQuietly(out); } } return imageFile.getName(); } }); wordToHtmlConverter.processDocument(document); Document htmlDocument = wordToHtmlConverter.getDocument(); ByteArrayOutputStream out = new ByteArrayOutputStream(); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(out); TransformerFactory tf = TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, streamResult); out.close(); String result = new String(out.toByteArray()); FileWriter writer = new FileWriter(parentDirectory + newName + ".html"); writer.write(result); writer.close(); } catch (IOException | TransformerException | ParserConfigurationException ex) { logger.error("word转化出错!", ex); } return new File(parentDirectory + newName + ".html"); } /** * 将上传的Word文档转化成HTML字符串 * @param attachfile * @return */ public String convertToHtml(MultipartFile attachfile) { String wordContent = ""; // 将Word文件转换为html File file = convert(attachfile); // 读取html文件 if (file != null) { wordContent = parseFile.readHtml(file); } return wordContent; } }
以上是关于java将Word转换为html的主要内容,如果未能解决你的问题,请参考以下文章
使用freemarker生成的word文档,如何利用java代码将其转换为pdf格式?
HTML Bookmarklet模板:将任何JavaScript代码片段转换为Bookmarklet