1. 引入 Maven 依赖
<properties> <poi.version>5.2.3</poi.version> <xhtml.version>2.0.4</xhtml.version> </properties> <!-- Word to HTML --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>${poi.version}</version> </dependency> <!-- Word to HTML --> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId> <version>${xhtml.version}</version> </dependency> <!-- Office document/table handling, 2007+ version --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>${poi.version}</version> </dependency> <!-- Office document/table handling, 2003 version --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>${poi.version}</version> </dependency>
复制
2.Java代码
/** * Word2007(docx)格式转html * @param filePath 文件路径 * @return 返回转成String类型的html字符串 * @throws IOException */ public static String docxToHtml(String filePath) { try (ByteArrayOutputStream htmlStream = new ByteArrayOutputStream(); XWPFDocument docxDocument = new XWPFDocument(Files.newInputStream(Paths.get(filePath)))) { XHTMLOptions options = XHTMLOptions.create(); // 是否忽略未使用的样式 options.setIgnoreStylesIfUnused(false); // 设置片段模式,<div>标签包裹 options.setFragment(true); // 图片转base64 options.setImageManager(new Base64EmbedImgManager()); // 转换htm1 XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options); return htmlStream.toString(); } catch (Exception e) { log.error("Word转Html过程出现异常!", e); } return null; } /** * Word2003(doc)格式转html * @param filePath 文件路径 * @return 返回转成String类型的html字符串 * @throws Exception */ public static String docToHtml(String filePath) { try (StringWriter writer = new StringWriter(); HWPFDocument document = new HWPFDocument(Files.newInputStream(new File(filePath).toPath()))) { WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()); //将图片转成base64的格式 wordToHtmlConverter.setPicturesManager((bytes, pictureType, s, v, v1) -> "data:image/png;base64," + Base64.encodeBase64String(bytes)); wordToHtmlConverter.processDocument(document); org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument(); DOMSource domSource = new DOMSource(htmlDocument); TransformerFactory factory = TransformerFactory.newInstance(); Transformer serializer = factory.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, new StreamResult(writer)); return writer.toString(); } catch (Exception e) { log.error("Word转Html过程出现异常!", e); } return null; } /** * word 转 html * 自动检测文件格式转换 * @param filePath 文件本地路径 * @return 
成功返回转换后的html字符串;失败返回null */ public static String autoWord2Html(String filePath) { int lastIndexOf = filePath.lastIndexOf("."); String suffix = filePath.substring(lastIndexOf + 1); if ("doc".equalsIgnoreCase(suffix)) { return docToHtml(filePath); } else if ("docx".equalsIgnoreCase(suffix)) { return docxToHtml(filePath); } else { log.info("文件格式错误,只支持Docx和Doc格式的文档!"); return null; } }
复制