1.引入maven依赖
<properties>
<poi.version>5.2.3</poi.version>
<xhtml.version>2.0.4</xhtml.version>
</properties>
<!-- Word to HTML, legacy .doc side: POI scratchpad module (HWPF classes such as HWPFDocument / WordToHtmlConverter used by docToHtml) -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>${poi.version}</version>
</dependency>
<!-- Word to HTML, .docx side: xdocreport XWPF-to-XHTML converter (XHTMLConverter / XHTMLOptions used by docxToHtml) -->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
<version>${xhtml.version}</version>
</dependency>
<!-- Office document handling for the 2007+ (OOXML) formats -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>${poi.version}</version>
</dependency>
<!-- Office document handling for the 2003 (binary) formats; POI core -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>${poi.version}</version>
</dependency>
2.Java代码
/**
 * Converts a Word 2007+ (.docx) file to an HTML string.
 *
 * <p>Conversion runs in fragment mode and embedded images are handled by
 * {@code Base64EmbedImgManager}, so the result carries its images inline.
 *
 * <p>This method never throws: any failure (missing file, unreadable document,
 * {@code null} path, converter error) is logged and reported as {@code null}.
 *
 * @param filePath path of the .docx file to convert
 * @return the generated HTML, or {@code null} if the conversion failed
 */
public static String docxToHtml(String filePath) {
    // The input stream is declared as its own resource so it is closed even when
    // the XWPFDocument constructor throws (e.g. the file is not a valid .docx).
    try (java.io.InputStream docxInput = Files.newInputStream(Paths.get(filePath));
         XWPFDocument docxDocument = new XWPFDocument(docxInput);
         ByteArrayOutputStream htmlStream = new ByteArrayOutputStream()) {
        XHTMLOptions options = XHTMLOptions.create();
        // Keep styles even when they are not referenced by the document body.
        options.setIgnoreStylesIfUnused(false);
        // Fragment mode: the output is wrapped in a <div> rather than a full page.
        options.setFragment(true);
        // Embed images as Base64 instead of writing them to separate files.
        options.setImageManager(new Base64EmbedImgManager());
        // Run the docx -> HTML conversion into the in-memory buffer.
        XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
        // NOTE(review): toString() decodes with the platform default charset; confirm
        // this matches the charset the converter writes with before changing it.
        return htmlStream.toString();
    } catch (Exception e) {
        log.error("Word转Html过程出现异常!", e);
    }
    return null;
}
/**
 * Converts a Word 97-2003 (.doc) file to an HTML string.
 *
 * <p>The document is rendered into a DOM by {@code WordToHtmlConverter} and then
 * serialized as indented HTML. Pictures are inlined as Base64 {@code data:} URIs.
 *
 * <p>This method never throws: any failure (missing file, unreadable document,
 * {@code null} path, transformer error) is logged and reported as {@code null}.
 *
 * @param filePath path of the .doc file to convert
 * @return the generated HTML, or {@code null} if the conversion failed
 */
public static String docToHtml(String filePath) {
    // The input stream is declared as its own resource so it is always closed,
    // including when the HWPFDocument constructor rejects the file.
    try (java.io.InputStream docInput = Files.newInputStream(Paths.get(filePath));
         HWPFDocument document = new HWPFDocument(docInput);
         StringWriter writer = new StringWriter()) {
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        // Inline every picture as a Base64 data URI. The MIME type comes from the
        // detected picture type instead of labelling every image as PNG.
        wordToHtmlConverter.setPicturesManager(
                (content, pictureType, suggestedName, widthInches, heightInches) -> {
                    String mime = pictureType == null ? "image/png" : pictureType.getMime();
                    return "data:" + mime + ";base64," + Base64.encodeBase64String(content);
                });
        wordToHtmlConverter.processDocument(document);
        org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();

        // Serialize the resulting DOM to HTML text.
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(writer));
        return writer.toString();
    } catch (Exception e) {
        log.error("Word转Html过程出现异常!", e);
    }
    return null;
}
/**
 * Converts a Word file to HTML, choosing the converter from the file extension.
 *
 * <p>{@code .doc} is routed to {@link #docToHtml(String)} and {@code .docx} to
 * {@link #docxToHtml(String)}; the extension match is case-insensitive. A
 * {@code null} path, a path without an extension, or any other extension is
 * treated as an unsupported format.
 *
 * @param filePath local path of the file to convert
 * @return the converted HTML on success; {@code null} if the format is
 *         unsupported or the conversion failed
 */
public static String autoWord2Html(String filePath) {
    // Without a dot there is no extension; use an empty suffix so that a file
    // literally named "doc" or "docx" is not mistaken for one with that extension.
    int lastIndexOf = filePath == null ? -1 : filePath.lastIndexOf('.');
    String suffix = lastIndexOf < 0 ? "" : filePath.substring(lastIndexOf + 1);
    if ("doc".equalsIgnoreCase(suffix)) {
        return docToHtml(filePath);
    } else if ("docx".equalsIgnoreCase(suffix)) {
        return docxToHtml(filePath);
    } else {
        log.info("文件格式错误,只支持Docx和Doc格式的文档!");
        return null;
    }
}