滴滴滴上重点。。。
方式一:使用apache提供的工具包poi,poi使用的是4.1.2版本
缺点:对字体样式处理不精确;wmf公式图片部分转换不精确;本文的实现只支持doc格式(不支持docx)
优点:转换速度相对很快,本地也方便调试
方式二:使用libreoffice,使用的是7.5版本
地址:下载 LibreOffice | LibreOffice 简体中文官方网站 - 自由免费的办公套件
Linux安装libreoffice案例:linux centos7工具安装之 libreOffice篇 libreOffice安装教程_centos7 安装libreoffice_the_bog的博客-CSDN博客
缺点:转换速度相对慢
优点:字体样式十分精确,本文的实现支持doc、docx等多种格式。转换为pdf等其他格式的相关命令可自行搜索获取
废话不多说直接上代码!!!
方式一代码实现:
相关jar包的Maven依赖:
<!-- Apache POI core library -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- Provides the HWPF classes (org.apache.poi.hwpf.*) imported by Word2003Util -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- Used by Word2003Util to post-process the generated HTML -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.9.2</version>
</dependency>
<!-- batik-codec and wmf2svg: presumably needed by the WMF/SVG image helper (SvgToPngUtil), which is not shown here -->
<dependency>
    <groupId>org.apache.xmlgraphics</groupId>
    <artifactId>batik-codec</artifactId>
    <version>1.7</version>
</dependency>
<dependency>
    <groupId>net.arnx</groupId>
    <artifactId>wmf2svg</artifactId>
    <version>0.9.5</version>
</dependency>
复制
package cn.hls.winner.winner_problem_manage.utils; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.converter.PicturesManager; import org.apache.poi.hwpf.converter.WordToHtmlConverter; import org.apache.poi.hwpf.usermodel.PictureType; import org.apache.poi.util.IOUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.util.FileCopyUtils; import org.springframework.web.multipart.MultipartFile; import org.w3c.dom.Document; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import java.io.*; import java.util.ArrayList; import java.util.List; import java.util.UUID; /** * @author lhz * @description TODO * @date 2023/9/18 10:14 */ public class Word2003Util { private static final Logger logger = LoggerFactory.getLogger(Word2003Util.class); /** * * @param multipartFile 上传的文件 * @param htmlFile html上传路径 * @param htmlFileImgUrl html图片上传路径 * @param wordFileUrl word上传路径 * @return */ public static String word2003ToHtml(MultipartFile multipartFile, String htmlFile, String htmlFileImgUrl, String wordFileUrl) { // 需要判断文件是否为doc,docx if (multipartFile == null) { return "word文档上传为空!"; } if (multipartFile.getOriginalFilename().endsWith("docx")) { return "word文档格式有误,请上传doc格式的!"; } logger.info("***** word2003ToHtml start file:{}", multipartFile); //返回服务器代理地址 String htmlUrl = ""; //随机命名html文件 String uuid = UUID.randomUUID().toString(); String htmlFileName = uuid + "." 
+ "html"; logger.info("==== 初始化====(htmlFileName){参数} " + htmlFileName); try { //上传服务器的图片本地地址 logger.info("==== htmlFile{参数} ====" + htmlFile); //nginx转发后的图片地址 logger.info("==== htmlFileImgUrl{参数} ====" + htmlFileImgUrl); //生成网页的文件夹地址 String htmlFileUrl = htmlFile + uuid + "/"; logger.info("==== htmlFileUrl{参数} ==== " + htmlFileUrl); //上传文件到服务器 boolean flag = upload(multipartFile, wordFileUrl, uuid); if (!flag) { return "word文档上传失败!"; } logger.info("===== word文档上传成功!===="); //获取文件名称 String name = multipartFile.getOriginalFilename(); String suffix = name.substring(name.lastIndexOf("."));//.后缀名 String filePath = wordFileUrl + uuid + suffix; logger.info("==== filePath ====" + filePath); File file = new File(filePath); // 1) 加载word文档生成 HWPFDocument对象 InputStream inputStream = new FileInputStream(file); HWPFDocument wordDocument = new HWPFDocument(inputStream); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()); //图片地址 String fileImg = htmlFileUrl + "images/"; File htmlFile1 = new File(htmlFileUrl); if (!htmlFile1.exists()) { //创建 if (htmlFile1.mkdirs()) { logger.info("创建" + htmlFileUrl + "成功"); } else { logger.info("创建" + htmlFileUrl + "成功"); } } //html代理地址 htmlUrl = htmlFileImgUrl + uuid + "/" + htmlFileName; //html生成路径 htmlFileName = htmlFileUrl + htmlFileName; logger.info("==== htmlFileName{ html ======== 输出地址} " + htmlFileName); //设置图片存放的位置 String finalFileImg = fileImg; final int[] index = {1}; //处理图片地址 wordToHtmlConverter.setPicturesManager(new PicturesManager() { public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) { File imgPath = new File(finalFileImg); if (!imgPath.exists()) {//图片目录不存在则创建 imgPath.mkdirs(); } String extension = pictureType.getExtension(); //随机生成图片名称 suggestedName = finalFileImg + "image" + index[0] + "." 
+ extension; File file = new File(suggestedName); OutputStream os = null; try { os = new FileOutputStream(file); os.write(content); os.close(); //处理wmf公式图片 // if (extension.equals("wmf") || extension.equals("svg")) { // if (extension.equals("wmf")) { // String svgFile = suggestedName.substring(0, // suggestedName.lastIndexOf(".wmf")) // + ".svg"; // SvgToPngUtil.wmfToSvg(suggestedName, svgFile); // } // String suggestedNameSVG = suggestedName.substring(0, suggestedName.lastIndexOf(".")) + ".svg"; String s = SvgToPngUtil.readToString(suggestedNameSVG); String suggestedNamePng = suggestedName.substring(0, suggestedName.lastIndexOf(".")) + ".png"; SvgToPngUtil.convertToPng(s, suggestedNamePng); String s1 = SvgToPngUtil.GetImageStr(suggestedNameSVG); // //删除无用图片 deleteFile(suggestedNameSVG, suggestedName); // suggestedName = suggestedNameSVG; // } } catch (FileNotFoundException e) { throw new RuntimeException(e); } catch (IOException e) { throw new RuntimeException(e); } //这里可以指定word文档中图片的路径。 String imgUlr = suggestedName.replace(htmlFile, htmlFileImgUrl); index[0]++; return imgUlr; } }); wordToHtmlConverter.processDocument(wordDocument); Document htmlDocument = wordToHtmlConverter.getDocument(); OutputStream outputStream = new FileOutputStream(htmlFileName); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(outputStream); TransformerFactory factory = TransformerFactory.newInstance(); Transformer serializer = factory.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, streamResult); outputStream.close(); logger.info("===== 网页样式转换开始 ===="); String htmlContents = readHtml(htmlFileName); FileCopyUtils.copy(htmlContents.getBytes("utf-8"), new File(htmlFileName)); logger.info("===== 网页样式转换完成 ===="); } catch (Exception e) { logger.error("word2003ToHtml====异常"); 
logger.error(e.getMessage()); throw new RuntimeException(e); } // return htmlUrl; } //获取网页内容 public static String readHtml(String htmlFileName) throws Exception { StringBuilder htmlContents1 = new StringBuilder(); String htmlContents = ""; //读图网页内容 BufferedReader buf = new BufferedReader( new InputStreamReader(new FileInputStream(htmlFileName), "utf-8")); String c = ""; while ((c = buf.readLine()) != null) { htmlContents1.append(c + "\n"); } buf.close(); htmlContents = htmlContents1.toString(); htmlContents = htmlContents.replace("hyphenate:auto;font-family:Times New Roman;", "hyphenate:auto;font-family:宋体;").replace("vertical-align:text-bottom;", "vertical-align: middle;").replace("’","'").replace("’","'"); org.jsoup.nodes.Document document = Jsoup.parse(htmlContents); formatHtml(document); htmlContents = document.toString(); return htmlContents; } //网页字体样式 public static void formatHtml(org.jsoup.nodes.Document document) { Elements elements = document.getAllElements(); String title = document.title(); logger.info("==== formatHtml ====title"+title); for (Element element : elements) { if ("main".equals(element.className())) { continue; } if (title.contains("物理") || title.contains("数学") || title.contains("化学")) { if (element.hasClass("s1")) { element.attr("style", "font-family:Times New Roman;" + element.attr("style")); } } String[] attrs = element.attr("style").split(";"); List<String> attrList = new ArrayList(); for (String attr : attrs) { if (attr.contains("font-family")) { attrList.add(attr); } } //将<body>标签里的class属性b1 b2去掉 Elements bodys = element.getElementsByTag("body"); for(Element body : bodys){ System.out.println("=======className:" + body.className() + "=========="); if("b1 b2".equals(body.className())){ body.attr("class",""); } } } } public static void deleteFile(String... 
imgUrl) { for (String s : imgUrl) { File file = new File(s); try { if (file.isFile()) { // 删除文件 if (file.delete()) { logger.info("删除文件成功==== 名称为:" + file.getName()); } else { } } else { } } catch (Exception e) { logger.error("====== 删除图片失败 ======" + e.getMessage()); throw new RuntimeException(); } } } /** * @param file 文件 * @param htmlFile 文件上传地址 * @param fileName 文件名称 * @return */ public static boolean upload(MultipartFile file, String htmlFile, String fileName) { InputStream is = null; OutputStream os = null; try { File file1 = new File(htmlFile); if (!file1.exists()) { file1.mkdirs(); } String name = file.getOriginalFilename(); String suffix = name.substring(name.lastIndexOf("."));//.后缀名 is = file.getInputStream(); os = new FileOutputStream(htmlFile + fileName + suffix); //数据对拷 IOUtils.copy(is, os); logger.info("==== 文件写入成功!===="); } catch (IOException e) { logger.error("===== 文件上传失败 ====" + e.getMessage()); return false; } finally { if (null != is) { try { is.close(); } catch (IOException e) { throw new RuntimeException(e); } } if (null != os) { try { os.close(); } catch (IOException e) { throw new RuntimeException(e); } } } return true; } }
复制
方式二代码实现:
package com.hls.poi.service; import com.hls.poi.controller.WordToHtmlController; import org.apache.poi.util.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.web.multipart.MultipartFile; import java.io.*; import java.util.UUID; public class LibreOfficeCommandWordService { private static final Logger logger = LoggerFactory.getLogger(WordToHtmlController.class); /** * /opt/libreoffice7.5/program/soffice --headless --invisible --convert-to pdf /opt/a/1.docx --outdir /opt/a/ * –convert-to pdf 后面的 /opt/a/1.docx 为原文件路径 * –outdir /opt/a/(转换后文件存放目录) * <p> * soffice --headless --invisible --convert-to html:HTML ffc75d91-3594-451d-a55f-a941325bc380.doc --outdir mmm */ //需要根据实际情况,查找LibreOffice安装的实际目录, //Mac下是默认安装到/usr/local/bin, //CentOS下默认安装在/usr/bin private final static String sofficeDir = "/opt/libreoffice7.6/program/"; /** * @param multipartFile 上传的文件 * @param htmlFile html上传路径 * @param htmlFileImgUrl html图片上传路径 * @param wordFileUrl word上传路径 * @param sofficeDir libreOffice安装地址 * @throws Exception */ public String word2html(MultipartFile multipartFile, String htmlFile, String htmlFileImgUrl, String wordFileUrl, String sofficeDir) throws Exception { try { logger.info("exec command:[{}]\noutput: [{}]", "进入word2pdf{} 方法"); // 需要判断文件是否为doc,docx if (multipartFile == null) { return "word文档上传为空!"; } //返回服务器代理地址 String htmlUrl = ""; //随机命名html文件 String uuid = UUID.randomUUID().toString(); String htmlFileName = uuid + "." 
+ "html"; logger.info("==== 初始化====(htmlFileName){参数} " + htmlFileName); //上传服务器的图片本地地址 logger.info("==== htmlFile{参数} ====" + htmlFile); //nginx转发后的图片地址 logger.info("==== htmlFileImgUrl{参数} ====" + htmlFileImgUrl); //生成网页的文件夹地址 String htmlFileUrl = htmlFile + uuid + "/"; logger.info("==== htmlFileUrl{参数} ==== " + htmlFileUrl); //上传文件到服务器 boolean flag = upload(multipartFile, wordFileUrl, uuid); if (!flag) { return "word文档上传失败!"; } logger.info("===== word文档上传成功!===="); //获取文件名称 String name = multipartFile.getOriginalFilename(); String suffix = name.substring(name.lastIndexOf("."));//.后缀名 //上传后word文档路径 /home/winnersoft/date/tomcat/html-root/office/word/8ea8aec0-7fb5-4fbc-b73c-6f0e47b2857e.doc String inPath = wordFileUrl + uuid + suffix; logger.info("==== inPath ====" + inPath); if (!new File(inPath).exists()) { return "word文档不存在!"; } //图片地址 File htmlFile1 = new File(htmlFileUrl); if (!htmlFile1.exists()) { //创建 if (htmlFile1.mkdirs()) { logger.info("创建" + htmlFileUrl + "成功"); } else { logger.info("创建" + htmlFileUrl + "成功"); } } //html代理地址 //http://172.18.222.25:82/office/html/8ea8aec0-7fb5-4fbc-b73c-6f0e47b2857e/8ea8aec0-7fb5-4fbc-b73c-6f0e47b2857e.html htmlUrl = htmlFileImgUrl + uuid + "/" + htmlFileName; //html生成路径 /home/winnersoft/date/tomcat/html-root/office/html/af7ac82f-71bc-498c-8866-8bf7ef325345/ htmlFileName = htmlFileUrl; logger.info("==== outPath{ html ======== 输出地址} " + htmlFileName); //设置图片存放的位置 // String command = String.format("%s/soffice --convert-to pdf:writer_pdf_Export %s --outdir %s", sofficeDir, inPath, outPath); String command = String.format("%s/soffice --headless --invisible --convert-to html:HTML %s --outdir %s", sofficeDir, inPath, htmlFileName); logger.info("command==================================" + command); String output = this.executeCommand(command); logger.info("exec command:[{}]\noutput: [{}]", command, output); return htmlUrl; } catch (IOException e) { logger.error("io异常"+e.getMessage()); throw new RuntimeException(e); } catch 
(InterruptedException e) { throw new RuntimeException(e); } } protected String executeCommand(String command) throws IOException, InterruptedException { logger.info("executeCommand{} 执行转化"); StringBuffer output = new StringBuffer(); Process p; p = Runtime.getRuntime().exec(command); p.waitFor(); try ( InputStreamReader inputStreamReader = new InputStreamReader(p.getInputStream(), "UTF-8"); BufferedReader reader = new BufferedReader(inputStreamReader) ) { String line = ""; while ((line = reader.readLine()) != null) { output.append(line + "\n"); } } // 销毁子进程 p.destroy(); return output.toString(); } /** * @param file 文件 * @param htmlFile 文件上传地址 * @param fileName 文件名称 * @return */ public static boolean upload(MultipartFile file, String htmlFile, String fileName) { InputStream is = null; OutputStream os = null; try { File file1 = new File(htmlFile); if (!file1.exists()) { file1.mkdirs(); } String name = file.getOriginalFilename(); String suffix = name.substring(name.lastIndexOf("."));//.后缀名 is = file.getInputStream(); os = new FileOutputStream(htmlFile + fileName + suffix); //数据对拷 IOUtils.copy(is, os); logger.info("==== 文件写入成功!===="); } catch (IOException e) { logger.error("===== 文件上传失败 ====" + e.getMessage()); return false; } finally { if (null != is) { try { is.close(); } catch (IOException e) { throw new RuntimeException(e); } } if (null != os) { try { os.close(); } catch (IOException e) { throw new RuntimeException(e); } } } return true; } }
复制