Java word转为html 两种方式

滴滴滴上重点。。。

方式一：使用apache提供的工具包poi，poi使用的是4.1.2版本

缺点：对字体样式的处理不够精确；wmf公式图片的转换部分不精确；本方式（下文代码）只支持doc格式

优点：转换速度相对很快，本地也方便调试

方式二：使用libreoffice，使用的是7.5版本

地址：下载 LibreOffice | LibreOffice 简体中文官方网站 - 自由免费的办公套件

Linux安装libreoffice案例：linux centos7工具安装之 libreOffice篇 libreOffice安装教程_centos7 安装libreoffice_the_bog的博客-CSDN博客

缺点：转换速度相对慢

优点：字体样式十分精确，支持doc、docx等多种格式。转换为pdf等其他格式的相关命令可自行搜索获取

废话不多说直接上代码！！！

方式一代码实现：

相关jar包地址：

  <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi</artifactId>
      <version>4.1.2</version>
    </dependency>
 
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-scratchpad</artifactId>
      <version>4.1.2</version>
    </dependency>
 
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-ooxml</artifactId>
      <version>4.1.2</version>
    </dependency>
  <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.9.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.xmlgraphics</groupId>
      <artifactId>batik-codec</artifactId>
      <version>1.7</version>
    </dependency>
    <dependency>
      <groupId>net.arnx</groupId>
      <artifactId>wmf2svg</artifactId>
      <version>0.9.5</version>
    </dependency>

 package cn.hls.winner.winner_problem_manage.utils;
 
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.util.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.FileCopyUtils;
import org.springframework.web.multipart.MultipartFile;
import org.w3c.dom.Document;
 
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
 
/**
 * @author lhz
 * @description TODO
 * @date 2023/9/18 10:14
 */
public class Word2003Util {
 
    private static final Logger logger = LoggerFactory.getLogger(Word2003Util.class);
 
 
    /**
     *
     * @param multipartFile  上传的文件
     * @param htmlFile       html上传路径
     * @param htmlFileImgUrl html图片上传路径
     * @param wordFileUrl    word上传路径
     * @return
     */
    public static String word2003ToHtml(MultipartFile multipartFile, String htmlFile, String htmlFileImgUrl, String wordFileUrl) {
        // 需要判断文件是否为doc,docx
        if (multipartFile == null) {
            return "word文档上传为空！";
        }
        if (multipartFile.getOriginalFilename().endsWith("docx")) {
            return "word文档格式有误，请上传doc格式的！";
        }
        logger.info("***** word2003ToHtml start file:{}", multipartFile);
        //返回服务器代理地址
        String htmlUrl = "";
        //随机命名html文件
        String uuid = UUID.randomUUID().toString();
        String htmlFileName = uuid + "." + "html";
        logger.info("==== 初始化====（htmlFileName）{参数} " + htmlFileName);
        try {
            //上传服务器的图片本地地址
            logger.info("==== htmlFile{参数} ====" + htmlFile);
            //nginx转发后的图片地址
            logger.info("==== htmlFileImgUrl{参数} ====" + htmlFileImgUrl);
            //生成网页的文件夹地址
            String htmlFileUrl = htmlFile + uuid + "/";
            logger.info("==== htmlFileUrl{参数} ==== " + htmlFileUrl);
            //上传文件到服务器
            boolean flag = upload(multipartFile, wordFileUrl, uuid);
            if (!flag) {
                return "word文档上传失败！";
            }
            logger.info("===== word文档上传成功！====");
            //获取文件名称
            String name = multipartFile.getOriginalFilename();
            String suffix = name.substring(name.lastIndexOf("."));//.后缀名
            String filePath = wordFileUrl + uuid + suffix;
            logger.info("==== filePath ====" + filePath);
            File file = new File(filePath);
            // 1) 加载word文档生成 HWPFDocument对象
            InputStream inputStream = new FileInputStream(file);
            HWPFDocument wordDocument = new HWPFDocument(inputStream);
            WordToHtmlConverter wordToHtmlConverter =
                    new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            //图片地址
            String fileImg = htmlFileUrl + "images/";
            File htmlFile1 = new File(htmlFileUrl);
            if (!htmlFile1.exists()) {
                //创建
                if (htmlFile1.mkdirs()) {
                    logger.info("创建" + htmlFileUrl + "成功");
                } else {
                    logger.info("创建" + htmlFileUrl + "成功");
                }
            }
            //html代理地址
            htmlUrl = htmlFileImgUrl + uuid + "/" + htmlFileName;
            //html生成路径
            htmlFileName = htmlFileUrl + htmlFileName;
            logger.info("==== htmlFileName{ html ======== 输出地址} " + htmlFileName);
            //设置图片存放的位置
            String finalFileImg = fileImg;
            final int[] index = {1};
            //处理图片地址
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
                    File imgPath = new File(finalFileImg);
                    if (!imgPath.exists()) {//图片目录不存在则创建
                        imgPath.mkdirs();
                    }
                    String extension = pictureType.getExtension();
                    //随机生成图片名称
                    suggestedName = finalFileImg + "image" + index[0] + "." + extension;
                    File file = new File(suggestedName);
                    OutputStream os = null;
                    try {
                        os = new FileOutputStream(file);
                        os.write(content);
                        os.close();
                        //处理wmf公式图片
//                        if (extension.equals("wmf") || extension.equals("svg")) {
//                            if (extension.equals("wmf")) {
//                                String svgFile = suggestedName.substring(0,
//                                        suggestedName.lastIndexOf(".wmf"))
//                                        + ".svg";
//                                SvgToPngUtil.wmfToSvg(suggestedName, svgFile);
//                            }
//                            String suggestedNameSVG = suggestedName.substring(0, suggestedName.lastIndexOf(".")) + ".svg";
                            String s = SvgToPngUtil.readToString(suggestedNameSVG);
                            String suggestedNamePng = suggestedName.substring(0, suggestedName.lastIndexOf(".")) + ".png";
                            SvgToPngUtil.convertToPng(s, suggestedNamePng);
                            String s1 = SvgToPngUtil.GetImageStr(suggestedNameSVG);
//                            //删除无用图片
                            deleteFile(suggestedNameSVG, suggestedName);
//                            suggestedName = suggestedNameSVG;
//                        }
                    } catch (FileNotFoundException e) {
                        throw new RuntimeException(e);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                    //这里可以指定word文档中图片的路径。
                    String imgUlr = suggestedName.replace(htmlFile, htmlFileImgUrl);
                    index[0]++;
                    return imgUlr;
                }
            });
            wordToHtmlConverter.processDocument(wordDocument);
            Document htmlDocument = wordToHtmlConverter.getDocument();
            OutputStream outputStream = new FileOutputStream(htmlFileName);
            DOMSource domSource = new DOMSource(htmlDocument);
            StreamResult streamResult = new StreamResult(outputStream);
            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer serializer = factory.newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(domSource, streamResult);
            outputStream.close();
            logger.info("===== 网页样式转换开始 ====");
            String htmlContents = readHtml(htmlFileName);
            FileCopyUtils.copy(htmlContents.getBytes("utf-8"), new File(htmlFileName));
            logger.info("===== 网页样式转换完成 ====");
        } catch (Exception e) {
            logger.error("word2003ToHtml====异常");
            logger.error(e.getMessage());
            throw new RuntimeException(e);
        }
        //
        return htmlUrl;
    }
 
    //获取网页内容
    public static String readHtml(String htmlFileName) throws Exception {
        StringBuilder htmlContents1 = new StringBuilder();
        String htmlContents = "";
        //读图网页内容
        BufferedReader buf = new BufferedReader(
                new InputStreamReader(new FileInputStream(htmlFileName), "utf-8"));
        String c = "";
        while ((c = buf.readLine()) != null) {
            htmlContents1.append(c + "\n");
        }
 
        buf.close();
        htmlContents = htmlContents1.toString();
        htmlContents = htmlContents.replace("hyphenate:auto;font-family:Times New Roman;", "hyphenate:auto;font-family:宋体;").replace("vertical-align:text-bottom;", "vertical-align: middle;").replace("’","'").replace("&rsquo;","'");
        org.jsoup.nodes.Document document = Jsoup.parse(htmlContents);
        formatHtml(document);
        htmlContents = document.toString();
        return htmlContents;
    }
 
    //网页字体样式
    public static void formatHtml(org.jsoup.nodes.Document document) {
        Elements elements = document.getAllElements();
        String title = document.title();
        logger.info("==== formatHtml ====title"+title);
        for (Element element : elements) {
            if ("main".equals(element.className())) {
                continue;
            }
            if (title.contains("物理") || title.contains("数学") || title.contains("化学")) {
                if (element.hasClass("s1")) {
                    element.attr("style", "font-family:Times New Roman;" + element.attr("style"));
                }
            }
            String[] attrs = element.attr("style").split(";");
            List<String> attrList = new ArrayList();
            for (String attr : attrs) {
                if (attr.contains("font-family")) {
                    attrList.add(attr);
                }
            }
            //将<body>标签里的class属性b1 b2去掉
            Elements bodys = element.getElementsByTag("body");
            for(Element body : bodys){
                System.out.println("=======className:" + body.className() + "==========");
                if("b1 b2".equals(body.className())){
                    body.attr("class","");
                }
            }
        }
    }
 
    public static void deleteFile(String... imgUrl) {
        for (String s : imgUrl) {
            File file = new File(s);
            try {
                if (file.isFile()) {
                    // 删除文件
                    if (file.delete()) {
                        logger.info("删除文件成功==== 名称为：" + file.getName());
                    } else {
                    }
                } else {
                }
            } catch (Exception e) {
                logger.error("====== 删除图片失败 ======" + e.getMessage());
                throw new RuntimeException();
            }
        }
    }
 
 
    /**
     * @param file     文件
     * @param htmlFile 文件上传地址
     * @param fileName 文件名称
     * @return
     */
    public static boolean upload(MultipartFile file, String htmlFile, String fileName) {
        InputStream is = null;
        OutputStream os = null;
        try {
            File file1 = new File(htmlFile);
            if (!file1.exists()) {
                file1.mkdirs();
            }
            String name = file.getOriginalFilename();
            String suffix = name.substring(name.lastIndexOf("."));//.后缀名
            is = file.getInputStream();
            os = new FileOutputStream(htmlFile + fileName + suffix);
            //数据对拷
            IOUtils.copy(is, os);
            logger.info("==== 文件写入成功！====");
        } catch (IOException e) {
            logger.error("===== 文件上传失败 ====" + e.getMessage());
            return false;
        } finally {
            if (null != is) {
                try {
                    is.close();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            if (null != os) {
                try {
                    os.close();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
        }
        return true;
    }
}复制

方式二代码实现：

 package com.hls.poi.service;
 
 
import com.hls.poi.controller.WordToHtmlController;
import org.apache.poi.util.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.web.multipart.MultipartFile;
 
import java.io.*;
import java.util.UUID;
 
/**
 * Converts uploaded Word documents to HTML by invoking the LibreOffice {@code soffice}
 * command-line tool.
 */
public class LibreOfficeCommandWordService {

    private static final Logger logger = LoggerFactory.getLogger(LibreOfficeCommandWordService.class);

    /**
     * Default LibreOffice program directory, used when the caller passes no directory.
     * Adjust to the actual installation; the original notes mention /usr/local/bin on macOS
     * and /usr/bin on CentOS.
     * <p>
     * Example invocations:
     * <pre>
     * /opt/libreoffice7.5/program/soffice --headless --invisible --convert-to pdf /opt/a/1.docx --outdir /opt/a/
     * soffice --headless --invisible --convert-to html:HTML ffc75d91-3594-451d-a55f-a941325bc380.doc --outdir mmm
     * </pre>
     * The path after {@code --convert-to <format>} is the source file; {@code --outdir} is the
     * directory that receives the converted file.
     */
    private final static String DEFAULT_SOFFICE_DIR = "/opt/libreoffice7.6/program/";

    /**
     * Stores the uploaded document, converts it to HTML with LibreOffice and returns the
     * public URL of the generated page.
     * <p>
     * All path arguments are concatenated directly, so they must already end with a path
     * separator.
     *
     * @param multipartFile  the uploaded file
     * @param htmlFile       local base directory for the generated HTML
     * @param htmlFileImgUrl public base URL corresponding to {@code htmlFile}
     * @param wordFileUrl    local directory in which the uploaded document is stored
     * @param sofficeDir     LibreOffice program directory; when {@code null} or blank the
     *                       built-in default is used
     * @return the public URL of the generated page, or an error message when the upload is
     *         missing or could not be stored
     * @throws Exception declared for compatibility; I/O failures and interruption are
     *                   rethrown wrapped in {@link RuntimeException}
     */
    public String word2html(MultipartFile multipartFile, String htmlFile, String htmlFileImgUrl, String wordFileUrl, String sofficeDir) throws Exception {
        try {
            logger.info("进入word2html方法");
            if (multipartFile == null) {
                return "word文档上传为空！";
            }
            // A random UUID names the stored document, the output directory and the HTML page.
            String uuid = UUID.randomUUID().toString();
            String htmlFileName = uuid + "." + "html";
            logger.info("==== 初始化====（htmlFileName）{参数} " + htmlFileName);
            logger.info("==== htmlFile{参数} ====" + htmlFile);
            logger.info("==== htmlFileImgUrl{参数} ====" + htmlFileImgUrl);
            String htmlFileUrl = htmlFile + uuid + "/";
            logger.info("==== htmlFileUrl{参数} ==== " + htmlFileUrl);
            // upload() also rejects file names without a safe extension.
            if (!upload(multipartFile, wordFileUrl, uuid)) {
                return "word文档上传失败！";
            }
            logger.info("===== word文档上传成功！====");
            String suffix = extractSuffix(multipartFile.getOriginalFilename());
            String inPath = wordFileUrl + uuid + suffix;
            logger.info("==== inPath ====" + inPath);
            if (!new File(inPath).exists()) {
                return "word文档不存在！";
            }
            File outputDir = new File(htmlFileUrl);
            if (!outputDir.exists()) {
                if (outputDir.mkdirs()) {
                    logger.info("创建" + htmlFileUrl + "成功");
                } else {
                    logger.error("创建" + htmlFileUrl + "失败");
                    throw new IOException("Unable to create output directory " + htmlFileUrl);
                }
            }
            // Public URL handed back to the caller.
            String htmlUrl = htmlFileImgUrl + uuid + "/" + htmlFileName;
            logger.info("==== outPath{ html ======== 输出地址} " + htmlFileUrl);

            String installDir = (sofficeDir == null || sofficeDir.trim().isEmpty())
                    ? DEFAULT_SOFFICE_DIR : sofficeDir;
            // Arguments are passed as an array so paths containing spaces stay intact and
            // nothing derived from the upload can be split into extra options.
            String[] command = {
                    new File(installDir, "soffice").getPath(),
                    "--headless", "--invisible",
                    "--convert-to", "html:HTML",
                    inPath,
                    "--outdir", htmlFileUrl
            };
            String commandLine = String.join(" ", command);
            logger.info("command==================================" + commandLine);
            String output = this.executeCommand(command);
            logger.info("exec command:[{}]\noutput: [{}]", commandLine, output);

            // LibreOffice names the result after the input file, i.e. <uuid>.html.
            File generated = new File(htmlFileUrl, htmlFileName);
            if (!generated.isFile()) {
                throw new IOException("LibreOffice did not produce " + generated.getPath() + "; output: " + output);
            }
            return htmlUrl;
        } catch (IOException e) {
            logger.error("io异常"+e.getMessage(), e);
            throw new RuntimeException(e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can observe it.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /**
     * Runs a whitespace-separated command line and returns its output. Kept for existing
     * callers; prefer {@link #executeCommand(String[])}, which does not split arguments.
     *
     * @param command the command line, tokens separated by whitespace
     * @return the combined standard output and error of the process
     * @throws IOException          if the process cannot be started or its output read
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    protected String executeCommand(String command) throws IOException, InterruptedException {
        return executeCommand(command.trim().split("\\s+"));
    }

    /**
     * Runs the given program with arguments and returns its output.
     * <p>
     * Standard error is merged into standard output and the stream is drained before waiting
     * for exit, so the child cannot block on a full pipe. A non-zero exit code is logged.
     *
     * @param command program followed by its arguments
     * @return the combined standard output and error of the process
     * @throws IOException          if the process cannot be started or its output read
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    protected String executeCommand(String[] command) throws IOException, InterruptedException {
        logger.info("executeCommand{} 执行转化");
        ProcessBuilder builder = new ProcessBuilder(command);
        builder.redirectErrorStream(true);
        Process process = builder.start();
        StringBuilder output = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(process.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                output.append(line).append('\n');
            }
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                logger.warn("executeCommand exit code: {}", exitCode);
            }
        } finally {
            // Make sure the child process does not outlive this call.
            process.destroy();
        }
        return output.toString();
    }

    /**
     * Copies the uploaded file to {@code htmlFile + fileName + suffix}, where the suffix is the
     * extension of the original file name.
     *
     * @param file     the uploaded file
     * @param htmlFile target directory, ending with a path separator
     * @param fileName target file name without extension
     * @return {@code true} when the file was written; {@code false} when the original name has
     *         no usable extension, the directory cannot be created, or the copy fails
     */
    public static boolean upload(MultipartFile file, String htmlFile, String fileName) {
        String suffix = extractSuffix(file == null ? null : file.getOriginalFilename());
        if (suffix == null) {
            logger.error("===== 文件上传失败 ====" + "invalid or missing file extension");
            return false;
        }
        File targetDir = new File(htmlFile);
        if (!targetDir.isDirectory() && !targetDir.mkdirs() && !targetDir.isDirectory()) {
            logger.error("===== 文件上传失败 ====" + "cannot create directory " + htmlFile);
            return false;
        }
        // A failure while closing is reported as a failed upload rather than thrown from finally.
        try (InputStream is = file.getInputStream();
             OutputStream os = new FileOutputStream(htmlFile + fileName + suffix)) {
            IOUtils.copy(is, os);
            logger.info("==== 文件写入成功！====");
            return true;
        } catch (IOException e) {
            logger.error("===== 文件上传失败 ====" + e.getMessage(), e);
            return false;
        }
    }

    /**
     * Returns the extension of {@code fileName} including the leading dot, or {@code null}
     * when there is none or it contains anything but letters and digits. The suffix ends up
     * in a filesystem path and a process argument, so special characters are refused.
     */
    private static String extractSuffix(String fileName) {
        if (fileName == null) {
            return null;
        }
        int dot = fileName.lastIndexOf('.');
        if (dot < 0 || dot == fileName.length() - 1) {
            return null;
        }
        String suffix = fileName.substring(dot);
        for (int i = 1; i < suffix.length(); i++) {
            if (!Character.isLetterOrDigit(suffix.charAt(i))) {
                return null;
            }
        }
        return suffix;
    }
}
 复制

Java word转为html 两种方式

方式一：使用apache提供的工具包poi，poi使用的是4.1.2版本

方式二：使用libreoffice，使用的是7.5版本

如何使用goquery进行HTML解析以及它的源码分析和实现原理

Web10--jQuery进阶

【jQuery】实现列表滚动

jQuery和JavaScript和vue让子盒子滚动到父盒子的指定位置，包括设置scrollTop到指定位置的值为rem

jQuery 和 Zepto 的区别？各自的使用场景？

vue3 vite项目优化。

echarts实现透明间隙

Echarts图表显示不完全（多种图表解决方案）

Echarts 地图设置鼠标hover区块颜色,地图描点,描点选中

npm install(报错)

前端哥

初中生物堂清、日清、周清、月清的具体做法1000字

初中生物四清具体做法1000字

七年级上册生物四清具体做法1000字

七年级上册生物在四清教学方面的具体做法1000字

七年级上册生物苏教版在四清教学方面的具体做法

七年级上册生物苏教版在四清教学方面的优秀做法具体措施

初中生物学科在四清教学方面的优秀做法

如何使用goquery进行HTML解析以及它的源码分析和实现原理

jQuery发送ajax请求出现跨域问题（请求头XXX不被允许）

Web10--jQuery进阶

1
ECharts 饼状图颜色设置

2024-02-16 14:02:001000

2
echarts实现动态渲染多柱图

2024-02-12 14:02:341000

3
移动端css布局大全

2024-02-06 15:02:421000

4
使用HTML5和JS实现日期下拉框功能

2024-02-04 11:02:521000

5
JS生成条形码JsBarcode.all.js，转成图片canvas2image.js，并打印二维码jQuery.print.js

2024-01-27 01:01:181000

6
HTML5期末大作业：我的家乡网站设计5

2024-01-28 12:01:43999

7
web期末作业网页设计——我的家乡（网页源码）

2024-01-24 15:01:48999

8
尚硅谷css3笔记

2024-02-13 10:02:44998

9
如何利用浏览器测试跨域

2024-02-11 10:02:28998

10
html生日祝福网页制作（粉色主题

2024-01-30 20:01:45997

	<dependency>
	<groupId>org.apache.poi</groupId>
	<artifactId>poi</artifactId>
	<version>4.1.2</version>
	</dependency>

	<dependency>
	<groupId>org.apache.poi</groupId>
	<artifactId>poi-scratchpad</artifactId>
	<version>4.1.2</version>
	</dependency>

	<dependency>
	<groupId>org.apache.poi</groupId>
	<artifactId>poi-ooxml</artifactId>
	<version>4.1.2</version>
	</dependency>
	<dependency>
	<groupId>org.jsoup</groupId>
	<artifactId>jsoup</artifactId>
	<version>1.9.2</version>
	</dependency>
	<dependency>
	<groupId>org.apache.xmlgraphics</groupId>
	<artifactId>batik-codec</artifactId>
	<version>1.7</version>
	</dependency>
	<dependency>
	<groupId>net.arnx</groupId>
	<artifactId>wmf2svg</artifactId>
	<version>0.9.5</version>
	</dependency>

	package cn.hls.winner.winner_problem_manage.utils;

	import org.apache.poi.hwpf.HWPFDocument;
	import org.apache.poi.hwpf.converter.PicturesManager;
	import org.apache.poi.hwpf.converter.WordToHtmlConverter;
	import org.apache.poi.hwpf.usermodel.PictureType;
	import org.apache.poi.util.IOUtils;
	import org.jsoup.Jsoup;
	import org.jsoup.nodes.Attributes;
	import org.jsoup.nodes.Element;
	import org.jsoup.select.Elements;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.springframework.util.FileCopyUtils;
	import org.springframework.web.multipart.MultipartFile;
	import org.w3c.dom.Document;

	import javax.xml.parsers.DocumentBuilderFactory;
	import javax.xml.transform.OutputKeys;
	import javax.xml.transform.Transformer;
	import javax.xml.transform.TransformerFactory;
	import javax.xml.transform.dom.DOMSource;
	import javax.xml.transform.stream.StreamResult;
	import java.io.*;
	import java.util.ArrayList;
	import java.util.List;
	import java.util.UUID;

	/**
	* @author lhz
	* @description TODO
	* @date 2023/9/18 10:14
	*/
	public class Word2003Util {

	private static final Logger logger = LoggerFactory.getLogger(Word2003Util.class);


	/**
	*
	* @param multipartFile 上传的文件
	* @param htmlFile html上传路径
	* @param htmlFileImgUrl html图片上传路径
	* @param wordFileUrl word上传路径
	* @return
	*/
	public static String word2003ToHtml(MultipartFile multipartFile, String htmlFile, String htmlFileImgUrl, String wordFileUrl) {
	// 需要判断文件是否为doc,docx
	if (multipartFile == null) {
	return "word文档上传为空！";
	}
	if (multipartFile.getOriginalFilename().endsWith("docx")) {
	return "word文档格式有误，请上传doc格式的！";
	}
	logger.info("***** word2003ToHtml start file:{}", multipartFile);
	//返回服务器代理地址
	String htmlUrl = "";
	//随机命名html文件
	String uuid = UUID.randomUUID().toString();
	String htmlFileName = uuid + "." + "html";
	logger.info("==== 初始化====（htmlFileName）{参数} " + htmlFileName);
	try {
	//上传服务器的图片本地地址
	logger.info("==== htmlFile{参数} ====" + htmlFile);
	//nginx转发后的图片地址
	logger.info("==== htmlFileImgUrl{参数} ====" + htmlFileImgUrl);
	//生成网页的文件夹地址
	String htmlFileUrl = htmlFile + uuid + "/";
	logger.info("==== htmlFileUrl{参数} ==== " + htmlFileUrl);
	//上传文件到服务器
	boolean flag = upload(multipartFile, wordFileUrl, uuid);
	if (!flag) {
	return "word文档上传失败！";
	}
	logger.info("===== word文档上传成功！====");
	//获取文件名称
	String name = multipartFile.getOriginalFilename();
	String suffix = name.substring(name.lastIndexOf("."));//.后缀名
	String filePath = wordFileUrl + uuid + suffix;
	logger.info("==== filePath ====" + filePath);
	File file = new File(filePath);
	// 1) 加载word文档生成 HWPFDocument对象
	InputStream inputStream = new FileInputStream(file);
	HWPFDocument wordDocument = new HWPFDocument(inputStream);
	WordToHtmlConverter wordToHtmlConverter =
	new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
	//图片地址
	String fileImg = htmlFileUrl + "images/";
	File htmlFile1 = new File(htmlFileUrl);
	if (!htmlFile1.exists()) {
	//创建
	if (htmlFile1.mkdirs()) {
	logger.info("创建" + htmlFileUrl + "成功");
	} else {
	logger.info("创建" + htmlFileUrl + "成功");
	}
	}
	//html代理地址
	htmlUrl = htmlFileImgUrl + uuid + "/" + htmlFileName;
	//html生成路径
	htmlFileName = htmlFileUrl + htmlFileName;
	logger.info("==== htmlFileName{ html ======== 输出地址} " + htmlFileName);
	//设置图片存放的位置
	String finalFileImg = fileImg;
	final int[] index = {1};
	//处理图片地址
	wordToHtmlConverter.setPicturesManager(new PicturesManager() {
	public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
	File imgPath = new File(finalFileImg);
	if (!imgPath.exists()) {//图片目录不存在则创建
	imgPath.mkdirs();
	}
	String extension = pictureType.getExtension();
	//随机生成图片名称
	suggestedName = finalFileImg + "image" + index[0] + "." + extension;
	File file = new File(suggestedName);
	OutputStream os = null;
	try {
	os = new FileOutputStream(file);
	os.write(content);
	os.close();
	//处理wmf公式图片
	// if (extension.equals("wmf") \|\| extension.equals("svg")) {
	// if (extension.equals("wmf")) {
	// String svgFile = suggestedName.substring(0,
	// suggestedName.lastIndexOf(".wmf"))
	// + ".svg";
	// SvgToPngUtil.wmfToSvg(suggestedName, svgFile);
	// }
	// String suggestedNameSVG = suggestedName.substring(0, suggestedName.lastIndexOf(".")) + ".svg";
	String s = SvgToPngUtil.readToString(suggestedNameSVG);
	String suggestedNamePng = suggestedName.substring(0, suggestedName.lastIndexOf(".")) + ".png";
	SvgToPngUtil.convertToPng(s, suggestedNamePng);
	String s1 = SvgToPngUtil.GetImageStr(suggestedNameSVG);
	// //删除无用图片
	deleteFile(suggestedNameSVG, suggestedName);
	// suggestedName = suggestedNameSVG;
	// }
	} catch (FileNotFoundException e) {
	throw new RuntimeException(e);
	} catch (IOException e) {
	throw new RuntimeException(e);
	}
	//这里可以指定word文档中图片的路径。
	String imgUlr = suggestedName.replace(htmlFile, htmlFileImgUrl);
	index[0]++;
	return imgUlr;
	}
	});
	wordToHtmlConverter.processDocument(wordDocument);
	Document htmlDocument = wordToHtmlConverter.getDocument();
	OutputStream outputStream = new FileOutputStream(htmlFileName);
	DOMSource domSource = new DOMSource(htmlDocument);
	StreamResult streamResult = new StreamResult(outputStream);
	TransformerFactory factory = TransformerFactory.newInstance();
	Transformer serializer = factory.newTransformer();
	serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
	serializer.setOutputProperty(OutputKeys.INDENT, "yes");
	serializer.setOutputProperty(OutputKeys.METHOD, "html");
	serializer.transform(domSource, streamResult);
	outputStream.close();
	logger.info("===== 网页样式转换开始 ====");
	String htmlContents = readHtml(htmlFileName);
	FileCopyUtils.copy(htmlContents.getBytes("utf-8"), new File(htmlFileName));
	logger.info("===== 网页样式转换完成 ====");
	} catch (Exception e) {
	logger.error("word2003ToHtml====异常");
	logger.error(e.getMessage());
	throw new RuntimeException(e);
	}
	//
	return htmlUrl;
	}

	//获取网页内容
	public static String readHtml(String htmlFileName) throws Exception {
	StringBuilder htmlContents1 = new StringBuilder();
	String htmlContents = "";
	//读图网页内容
	BufferedReader buf = new BufferedReader(
	new InputStreamReader(new FileInputStream(htmlFileName), "utf-8"));
	String c = "";
	while ((c = buf.readLine()) != null) {
	htmlContents1.append(c + "\n");
	}

	buf.close();
	htmlContents = htmlContents1.toString();
	htmlContents = htmlContents.replace("hyphenate:auto;font-family:Times New Roman;", "hyphenate:auto;font-family:宋体;").replace("vertical-align:text-bottom;", "vertical-align: middle;").replace("’","'").replace("’","'");
	org.jsoup.nodes.Document document = Jsoup.parse(htmlContents);
	formatHtml(document);
	htmlContents = document.toString();
	return htmlContents;
	}

	//网页字体样式
	public static void formatHtml(org.jsoup.nodes.Document document) {
	Elements elements = document.getAllElements();
	String title = document.title();
	logger.info("==== formatHtml ====title"+title);
	for (Element element : elements) {
	if ("main".equals(element.className())) {
	continue;
	}
	if (title.contains("物理") \|\| title.contains("数学") \|\| title.contains("化学")) {
	if (element.hasClass("s1")) {
	element.attr("style", "font-family:Times New Roman;" + element.attr("style"));
	}
	}
	String[] attrs = element.attr("style").split(";");
	List<String> attrList = new ArrayList();
	for (String attr : attrs) {
	if (attr.contains("font-family")) {
	attrList.add(attr);
	}
	}
	//将<body>标签里的class属性b1 b2去掉
	Elements bodys = element.getElementsByTag("body");
	for(Element body : bodys){
	System.out.println("=======className:" + body.className() + "==========");
	if("b1 b2".equals(body.className())){
	body.attr("class","");
	}
	}
	}
	}

	public static void deleteFile(String... imgUrl) {
	for (String s : imgUrl) {
	File file = new File(s);
	try {
	if (file.isFile()) {
	// 删除文件
	if (file.delete()) {
	logger.info("删除文件成功==== 名称为：" + file.getName());
	} else {
	}
	} else {
	}
	} catch (Exception e) {
	logger.error("====== 删除图片失败 ======" + e.getMessage());
	throw new RuntimeException();
	}
	}
	}


	/**
	* @param file 文件
	* @param htmlFile 文件上传地址
	* @param fileName 文件名称
	* @return
	*/
	public static boolean upload(MultipartFile file, String htmlFile, String fileName) {
	InputStream is = null;
	OutputStream os = null;
	try {
	File file1 = new File(htmlFile);
	if (!file1.exists()) {
	file1.mkdirs();
	}
	String name = file.getOriginalFilename();
	String suffix = name.substring(name.lastIndexOf("."));//.后缀名
	is = file.getInputStream();
	os = new FileOutputStream(htmlFile + fileName + suffix);
	//数据对拷
	IOUtils.copy(is, os);
	logger.info("==== 文件写入成功！====");
	} catch (IOException e) {
	logger.error("===== 文件上传失败 ====" + e.getMessage());
	return false;
	} finally {
	if (null != is) {
	try {
	is.close();
	} catch (IOException e) {
	throw new RuntimeException(e);
	}
	}
	if (null != os) {
	try {
	os.close();
	} catch (IOException e) {
	throw new RuntimeException(e);
	}
	}
	}
	return true;
	}
	}

	package com.hls.poi.service;


	import com.hls.poi.controller.WordToHtmlController;
	import org.apache.poi.util.IOUtils;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.springframework.web.multipart.MultipartFile;

	import java.io.*;
	import java.util.UUID;

	/**
	 * Converts uploaded Word documents to HTML by invoking the LibreOffice {@code soffice}
	 * command-line tool.
	 */
	public class LibreOfficeCommandWordService {

	    private static final Logger logger = LoggerFactory.getLogger(LibreOfficeCommandWordService.class);

	    /**
	     * Default LibreOffice program directory, used when the caller passes no directory.
	     * Adjust to the actual installation; the original notes mention /usr/local/bin on macOS
	     * and /usr/bin on CentOS.
	     * <p>
	     * Example invocations:
	     * <pre>
	     * /opt/libreoffice7.5/program/soffice --headless --invisible --convert-to pdf /opt/a/1.docx --outdir /opt/a/
	     * soffice --headless --invisible --convert-to html:HTML ffc75d91-3594-451d-a55f-a941325bc380.doc --outdir mmm
	     * </pre>
	     * The path after {@code --convert-to <format>} is the source file; {@code --outdir} is the
	     * directory that receives the converted file.
	     */
	    private final static String DEFAULT_SOFFICE_DIR = "/opt/libreoffice7.6/program/";

	    /**
	     * Stores the uploaded document, converts it to HTML with LibreOffice and returns the
	     * public URL of the generated page.
	     * <p>
	     * All path arguments are concatenated directly, so they must already end with a path
	     * separator.
	     *
	     * @param multipartFile  the uploaded file
	     * @param htmlFile       local base directory for the generated HTML
	     * @param htmlFileImgUrl public base URL corresponding to {@code htmlFile}
	     * @param wordFileUrl    local directory in which the uploaded document is stored
	     * @param sofficeDir     LibreOffice program directory; when {@code null} or blank the
	     *                       built-in default is used
	     * @return the public URL of the generated page, or an error message when the upload is
	     *         missing or could not be stored
	     * @throws Exception declared for compatibility; I/O failures and interruption are
	     *                   rethrown wrapped in {@link RuntimeException}
	     */
	    public String word2html(MultipartFile multipartFile, String htmlFile, String htmlFileImgUrl, String wordFileUrl, String sofficeDir) throws Exception {
	        try {
	            logger.info("进入word2html方法");
	            if (multipartFile == null) {
	                return "word文档上传为空！";
	            }
	            // A random UUID names the stored document, the output directory and the HTML page.
	            String uuid = UUID.randomUUID().toString();
	            String htmlFileName = uuid + "." + "html";
	            logger.info("==== 初始化====（htmlFileName）{参数} " + htmlFileName);
	            logger.info("==== htmlFile{参数} ====" + htmlFile);
	            logger.info("==== htmlFileImgUrl{参数} ====" + htmlFileImgUrl);
	            String htmlFileUrl = htmlFile + uuid + "/";
	            logger.info("==== htmlFileUrl{参数} ==== " + htmlFileUrl);
	            // upload() also rejects file names without a safe extension.
	            if (!upload(multipartFile, wordFileUrl, uuid)) {
	                return "word文档上传失败！";
	            }
	            logger.info("===== word文档上传成功！====");
	            String suffix = extractSuffix(multipartFile.getOriginalFilename());
	            String inPath = wordFileUrl + uuid + suffix;
	            logger.info("==== inPath ====" + inPath);
	            if (!new File(inPath).exists()) {
	                return "word文档不存在！";
	            }
	            File outputDir = new File(htmlFileUrl);
	            if (!outputDir.exists()) {
	                if (outputDir.mkdirs()) {
	                    logger.info("创建" + htmlFileUrl + "成功");
	                } else {
	                    logger.error("创建" + htmlFileUrl + "失败");
	                    throw new IOException("Unable to create output directory " + htmlFileUrl);
	                }
	            }
	            // Public URL handed back to the caller.
	            String htmlUrl = htmlFileImgUrl + uuid + "/" + htmlFileName;
	            logger.info("==== outPath{ html ======== 输出地址} " + htmlFileUrl);

	            String installDir = (sofficeDir == null || sofficeDir.trim().isEmpty())
	                    ? DEFAULT_SOFFICE_DIR : sofficeDir;
	            // Arguments are passed as an array so paths containing spaces stay intact and
	            // nothing derived from the upload can be split into extra options.
	            String[] command = {
	                    new File(installDir, "soffice").getPath(),
	                    "--headless", "--invisible",
	                    "--convert-to", "html:HTML",
	                    inPath,
	                    "--outdir", htmlFileUrl
	            };
	            String commandLine = String.join(" ", command);
	            logger.info("command==================================" + commandLine);
	            String output = this.executeCommand(command);
	            logger.info("exec command:[{}]\noutput: [{}]", commandLine, output);

	            // LibreOffice names the result after the input file, i.e. <uuid>.html.
	            File generated = new File(htmlFileUrl, htmlFileName);
	            if (!generated.isFile()) {
	                throw new IOException("LibreOffice did not produce " + generated.getPath() + "; output: " + output);
	            }
	            return htmlUrl;
	        } catch (IOException e) {
	            logger.error("io异常"+e.getMessage(), e);
	            throw new RuntimeException(e);
	        } catch (InterruptedException e) {
	            // Restore the interrupt flag so callers further up can observe it.
	            Thread.currentThread().interrupt();
	            throw new RuntimeException(e);
	        }
	    }

	    /**
	     * Runs a whitespace-separated command line and returns its output. Kept for existing
	     * callers; prefer {@link #executeCommand(String[])}, which does not split arguments.
	     *
	     * @param command the command line, tokens separated by whitespace
	     * @return the combined standard output and error of the process
	     * @throws IOException          if the process cannot be started or its output read
	     * @throws InterruptedException if the thread is interrupted while waiting
	     */
	    protected String executeCommand(String command) throws IOException, InterruptedException {
	        return executeCommand(command.trim().split("\\s+"));
	    }

	    /**
	     * Runs the given program with arguments and returns its output.
	     * <p>
	     * Standard error is merged into standard output and the stream is drained before waiting
	     * for exit, so the child cannot block on a full pipe. A non-zero exit code is logged.
	     *
	     * @param command program followed by its arguments
	     * @return the combined standard output and error of the process
	     * @throws IOException          if the process cannot be started or its output read
	     * @throws InterruptedException if the thread is interrupted while waiting
	     */
	    protected String executeCommand(String[] command) throws IOException, InterruptedException {
	        logger.info("executeCommand{} 执行转化");
	        ProcessBuilder builder = new ProcessBuilder(command);
	        builder.redirectErrorStream(true);
	        Process process = builder.start();
	        StringBuilder output = new StringBuilder();
	        try (BufferedReader reader = new BufferedReader(
	                new InputStreamReader(process.getInputStream(), "UTF-8"))) {
	            String line;
	            while ((line = reader.readLine()) != null) {
	                output.append(line).append('\n');
	            }
	            int exitCode = process.waitFor();
	            if (exitCode != 0) {
	                logger.warn("executeCommand exit code: {}", exitCode);
	            }
	        } finally {
	            // Make sure the child process does not outlive this call.
	            process.destroy();
	        }
	        return output.toString();
	    }

	    /**
	     * Copies the uploaded file to {@code htmlFile + fileName + suffix}, where the suffix is the
	     * extension of the original file name.
	     *
	     * @param file     the uploaded file
	     * @param htmlFile target directory, ending with a path separator
	     * @param fileName target file name without extension
	     * @return {@code true} when the file was written; {@code false} when the original name has
	     *         no usable extension, the directory cannot be created, or the copy fails
	     */
	    public static boolean upload(MultipartFile file, String htmlFile, String fileName) {
	        String suffix = extractSuffix(file == null ? null : file.getOriginalFilename());
	        if (suffix == null) {
	            logger.error("===== 文件上传失败 ====" + "invalid or missing file extension");
	            return false;
	        }
	        File targetDir = new File(htmlFile);
	        if (!targetDir.isDirectory() && !targetDir.mkdirs() && !targetDir.isDirectory()) {
	            logger.error("===== 文件上传失败 ====" + "cannot create directory " + htmlFile);
	            return false;
	        }
	        // A failure while closing is reported as a failed upload rather than thrown from finally.
	        try (InputStream is = file.getInputStream();
	             OutputStream os = new FileOutputStream(htmlFile + fileName + suffix)) {
	            IOUtils.copy(is, os);
	            logger.info("==== 文件写入成功！====");
	            return true;
	        } catch (IOException e) {
	            logger.error("===== 文件上传失败 ====" + e.getMessage(), e);
	            return false;
	        }
	    }

	    /**
	     * Returns the extension of {@code fileName} including the leading dot, or {@code null}
	     * when there is none or it contains anything but letters and digits. The suffix ends up
	     * in a filesystem path and a process argument, so special characters are refused.
	     */
	    private static String extractSuffix(String fileName) {
	        if (fileName == null) {
	            return null;
	        }
	        int dot = fileName.lastIndexOf('.');
	        if (dot < 0 || dot == fileName.length() - 1) {
	            return null;
	        }
	        String suffix = fileName.substring(dot);
	        for (int i = 1; i < suffix.length(); i++) {
	            if (!Character.isLetterOrDigit(suffix.charAt(i))) {
	                return null;
	            }
	        }
	        return suffix;
	    }
	}

Java word转为html 两种方式

方式一：使用apache提供的工具包poi，poi使用的是4.1.2版本

方式二：使用libreoffice，使用的是7.5版本

微信扫一扫：分享