获取 HTML 文件的路径、Word 模板的路径,以及 HTML 中所含图片路径的前缀
/**
 * Demo entry point that converts a local HTML file into a Word document built on a
 * {@code .docx} template.
 *
 * <p>All input and output locations are hard-coded in {@link #main(String[])}.
 */
public class Html2Word {

    /**
     * Walks the top-level elements of the HTML body and renders each one into the template
     * document, then appends a page break and writes the result to disk.
     *
     * <p>{@code table}, {@code ul} and {@code span} elements reuse the document's last paragraph;
     * every other element gets a fresh paragraph. A {@code span} only sets the spacing after that
     * paragraph and is not rendered. The loop stops as soon as the converter reports, through
     * {@code returnFlag[0]}, that a table has been rendered.
     *
     * @param args unused
     * @throws Exception if reading the inputs, converting, or writing the output fails
     */
    public static void main(String[] args) throws Exception {
        String html2WordTemplatePath = "D:\\test\\test\\html2word\\html to word template (1).docx";
        Document htmlDocument = Jsoup.parse(new File("D:\\test\\test\\html2word\\test-1.htm"));
        // Prefix that the converter prepends to <img src> values found in the HTML.
        String filePathPre = "D:\\test\\test\\html2word\\";

        // try-with-resources: the template stream and the document are released even when the
        // conversion throws half-way through.
        try (java.io.InputStream templateStream = Files.newInputStream(Paths.get(html2WordTemplatePath));
             NiceXWPFDocument document = new NiceXWPFDocument(templateStream)) {
            // Single-element array used as an out-parameter by the converter.
            boolean[] returnFlag = {false};

            for (Node node : htmlDocument.body().childNodes()) {
                if (!(node instanceof Element)) {
                    continue;
                }
                if (StringUtils.isEmpty(node.toString().trim())) {
                    continue;
                }

                String tagName = node.nodeName();
                boolean isSpan = tagName.equalsIgnoreCase("span");

                XWPFParagraph paragraph;
                if (tagName.equalsIgnoreCase("table") || tagName.equalsIgnoreCase("ul") || isSpan) {
                    paragraph = document.getLastParagraph();
                } else {
                    paragraph = document.createParagraph();
                }

                if (isSpan) {
                    paragraph.setSpacingAfter(200);
                    continue;
                }

                // The converter class defined alongside this demo is Html2WordUtils.
                Html2WordUtils.parseHtmlToWord(node, document, paragraph, returnFlag, filePathPre);
                if (returnFlag[0]) {
                    break;
                }
            }

            document.getLastParagraph().createRun().addBreak(BreakType.PAGE);

            // Write the result to the output file.
            // NOTE(review): there is no separator after "html2word", so the file is created next to
            // that folder (html2word<timestamp>.docx) rather than inside it — confirm this is intended.
            String outPath = "D:\\test\\test\\html2word" + System.currentTimeMillis() + ".docx";
            try (OutputStream outputStream1 = Files.newOutputStream(Paths.get(outPath))) {
                document.write(outputStream1);
            }
        }
    }
}
读取 HTML 中的换行、颜色等信息,并将其绘制到 Word 文档中
public class Html2WordUtils {
/**
 * Recursively renders an HTML node into a Word paragraph.
 *
 * <p>Children are visited first. A {@code table} node is then handed to
 * {@code parseTableToWord} and {@code returnFlag[0]} is set to {@code true}. Any other node that
 * has children is finished at that point. For leaf nodes:
 * <ul>
 *   <li>a {@code #text} node whose parent is not a {@code td} becomes a run (Times New Roman,
 *       size 10) in {@code xwpfParagraph}, styled from its parent tag (bold, red colour,
 *       sub/superscript); text belonging to a {@code ul > li} item is written to a new bulleted
 *       paragraph created on {@code doc} instead;</li>
 *   <li>text under a {@code font} whose {@code color} is set to anything other than
 *       {@code #ff0000} is skipped;</li>
 *   <li>{@code h1}-{@code h5}, {@code li}, {@code img} and {@code br} leaves add a carriage
 *       return to {@code xwpfParagraph}.</li>
 * </ul>
 *
 * @param node          HTML node to render
 * @param doc           Word document being built
 * @param xwpfParagraph paragraph that receives runs and line breaks
 * @param returnFlag    single-element out-parameter; element 0 is set to {@code true} after a
 *                      table has been rendered
 * @param filePathPre   prefix prepended to image {@code src} values when a table is rendered
 * @throws Exception if rendering a table fails
 */
public static void parseHtmlToWord(Node node, NiceXWPFDocument doc, XWPFParagraph xwpfParagraph, boolean[] returnFlag, String filePathPre) throws Exception {
    List<Node> nodes = node.childNodes();
    if (CollectionUtils.isNotEmpty(nodes)) {
        for (Node childNode : nodes) {
            parseHtmlToWord(childNode, doc, xwpfParagraph, returnFlag, filePathPre);
        }
    }
    // Handle the table tag.
    // NOTE(review): the table's descendants have already been visited by the loop above, so text
    // nested in inline tags inside a cell is also emitted into xwpfParagraph — confirm intended.
    if ("table".equalsIgnoreCase(node.nodeName())) {
        parseTableToWord(doc, node, xwpfParagraph, filePathPre);
        returnFlag[0] = true;
        return;
    }
    // Only leaf nodes produce output below.
    if (CollectionUtils.isNotEmpty(node.childNodes())) {
        return;
    }

    // NOTE(review): for text nodes this is the node's HTML serialization, so entities such as
    // "&amp;" presumably reach Word still escaped — verify against jsoup's TextNode#toString.
    String nodeValue = node.toString();
    Node parent = node.parent();
    boolean boldFlag = false;
    String color = "";
    boolean subFlag = false;
    boolean supFlag = false;
    boolean ulFlag = false;
    boolean tableFlag = false;

    if (null != parent) {
        // May be null for a detached subtree; isTag() tolerates that instead of throwing.
        Node grandParent = parent.parent();
        if (isTag(parent, "strong") || isTag(parent, "b")) {
            boldFlag = true;
        } else if (isTag(parent, "font")) {
            boldFlag = isTag(grandParent, "strong") || isTag(grandParent, "b");
            String fontColor = parent.attr("color");
            if (StringUtils.isNotEmpty(fontColor)) {
                // Only red text is kept; any other explicit colour drops the node entirely.
                if (!Objects.equals("#ff0000", fontColor)) {
                    return;
                }
                color = fontColor.substring(1);
            }
            ulFlag = isTag(grandParent, "li") && isTag(grandParent.parent(), "ul");
        } else if (isTag(parent, "sub")) {
            subFlag = true;
        } else if (isTag(parent, "sup")) {
            supFlag = true;
        } else if (isTag(parent, "li")) {
            ulFlag = isTag(grandParent, "ul");
        } else if (isTag(parent, "td")) {
            tableFlag = true;
        }
    }

    if (node.nodeName().equalsIgnoreCase("br")) {
        Node preNode = node.previousSibling();
        if (null != preNode && isTag(preNode.parentNode(), "font")) {
            // NOTE(review): the tag check is made on the sibling's parent but the colour is read
            // from the sibling itself; one of the two is probably not what was meant — confirm.
            String siblingColor = preNode.attr("color");
            if (StringUtils.isNotEmpty(siblingColor) && !Objects.equals("#ff0000", siblingColor)) {
                return;
            }
        }
    }

    if ("#text".equalsIgnoreCase(node.nodeName()) && !tableFlag && !nodeValue.contains("<")) {
        XWPFRun run = xwpfParagraph.createRun();
        run.setFontFamily("Times New Roman");
        run.setFontSize(10);
        if (boldFlag) {
            run.setBold(true);
        }
        if (StringUtils.isNotEmpty(color)) {
            run.setColor(color);
        }
        if (supFlag) {
            run.setSubscript(VerticalAlign.SUPERSCRIPT);
        }
        if (subFlag) {
            run.setSubscript(VerticalAlign.SUBSCRIPT);
        }
        if (ulFlag && StringUtils.isNotEmpty(nodeValue.trim())) {
            // List items get their own paragraph: a bullet run, a tab, then the item text.
            XWPFParagraph paragraph = doc.createParagraph();
            paragraph.setIndentFromLeft(0);
            paragraph.setFirstLineIndent(0);
            paragraph.setIndentationLeftChars(125);
            XWPFRun run1 = paragraph.createRun();
            run1.setFontFamily("宋体");
            run1.setFontSize(8);
            run1.setText("● ");
            run1.addTab();
            XWPFRun run2 = paragraph.createRun();
            run2.setText(nodeValue.trim());
            run2.setFontFamily("宋体");
            run2.setFontSize(10);
        }
        if (StringUtils.isNotEmpty(nodeValue) && !ulFlag) {
            run.setText(nodeValue.trim());
        }
    }

    boolean enabledBreak = ReUtil.isMatch("(h[12345]|li|img|br)", node.nodeName().toLowerCase());
    if (enabledBreak) {
        XWPFRun run = xwpfParagraph.createRun();
        run.addCarriageReturn();
    }
}

/** Returns whether {@code candidate} is non-null and its node name equals {@code tag}, ignoring case. */
private static boolean isTag(Node candidate, String tag) {
    return candidate != null && tag.equalsIgnoreCase(candidate.nodeName());
}
/**
 * Renders an HTML table node as a borderless Word table inserted at the given paragraph's cursor.
 *
 * <p>The table HTML is first passed through {@code simplifyTable}. One Word row is used per
 * {@code tr} and one cell per {@code td}. {@code img} children of a cell are added as pictures
 * whose width is the image's share of the row's total image width, scaled to the usable page
 * width; for each {@code #text} child of a cell, the cell content is rendered through
 * {@code parseHtmlToWordTable}.
 *
 * @param doc         Word document that receives the table
 * @param node        the HTML {@code table} node
 * @param paragraph   paragraph whose cursor marks the insertion point
 * @param filePathPre prefix prepended to each image {@code src} to build the file path
 * @throws Exception if an image cannot be read or added, or a width/height attribute of an
 *                   image is not a number
 */
private static void parseTableToWord(NiceXWPFDocument doc, Node node, XWPFParagraph paragraph, String filePathPre) throws Exception {
    // Simplify the table HTML.
    String string = node.toString();
    org.jsoup.nodes.Document tableDoc = Jsoup.parse(Objects.requireNonNull(simplifyTable(string)));
    Elements trList = tableDoc.getElementsByTag("tr");

    // Read the page margins.
    BigInteger right = (BigInteger) doc.getDocument().getBody().getSectPr().getPgMar().getRight();
    BigInteger left = (BigInteger) doc.getDocument().getBody().getSectPr().getPgMar().getLeft();
    // Usable width of the page: 21 minus both margins divided by 567.
    // NOTE(review): presumably 21 cm is the A4 page width and 567 the twips per centimetre — confirm.
    double wordWorkAreaWidth = 21 - ((double) (right.intValue() + left.intValue()) / 567);

    // Create the table.
    XWPFTable xwpfTable = doc.insertNewTbl(paragraph.getCTP().newCursor());
    if (null == xwpfTable) {
        return;
    }
    // Set the style.
    xwpfTable.setWidth("100%");

    // Write the rows and cells.
    for (int row = 0; row < trList.size(); row++) {
        XWPFTableRow tableRow = xwpfTable.getRow(row);
        if (null == tableRow) {
            tableRow = xwpfTable.createRow();
        }
        Element trElement = trList.get(row);
        Elements tds = trElement.getElementsByTag("td");

        // First pass: sum the declared widths of every image in the row so that each picture
        // can later be sized proportionally.
        double widthTotal = 0.0;
        for (int col = 0; col < tds.size(); col++) {
            for (Node tdNode : tds.get(col).childNodes()) {
                if ("img".equalsIgnoreCase(tdNode.nodeName())) {
                    String width = tdNode.attr("width");
                    if (NumberUtils.isNumeric(width.trim())) {
                        widthTotal = widthTotal + Double.parseDouble(width.trim());
                    }
                }
            }
        }

        // Second pass: fill the cells.
        for (int col = 0; col < tds.size(); col++) {
            XWPFTableCell tableCell = tableRow.getCell(col);
            if (null == tableCell) {
                tableCell = tableRow.createCell();
            }
            // Remove all four cell borders.
            CTTcPr tcPr = tableCell.getCTTc().isSetTcPr() ? tableCell.getCTTc().getTcPr() : tableCell.getCTTc().addNewTcPr();
            CTTcBorders ctTcBorders = tcPr.addNewTcBorders();
            ctTcBorders.addNewLeft().setVal(STBorder.NIL);
            ctTcBorders.addNewRight().setVal(STBorder.NIL);
            ctTcBorders.addNewTop().setVal(STBorder.NIL);
            ctTcBorders.addNewBottom().setVal(STBorder.NIL);

            Element colElement = tds.get(col);
            List<Node> nodes = colElement.childNodes();
            for (Node tdNode : nodes) {
                if ("img".equalsIgnoreCase(tdNode.nodeName())) {
                    String src = tdNode.attr("src");
                    String width = tdNode.attr("width");
                    String height = tdNode.attr("height");
                    // NOTE(review): as written both replacements look like no-ops; possibly HTML
                    // entities were decoded when this code was published — confirm the intent.
                    src = src.replaceAll(" ", " ").replaceAll("&","&");
                    String picturePath = filePathPre + src;
                    // try-with-resources so the image file handle is released even if adding the
                    // picture (or parsing its dimensions) throws.
                    try (InputStream inputStream = Files.newInputStream(Paths.get(picturePath))) {
                        XWPFRun xwpfRun = tableCell.getParagraphs().get(0).createRun();
                        double picWidth = wordWorkAreaWidth * (Double.parseDouble(width.trim()) / widthTotal);
                        // Keep the image's own aspect ratio.
                        double picHeight = picWidth * Double.parseDouble(height.trim()) / Double.parseDouble(width.trim());
                        xwpfRun.addPicture(inputStream, Document.PICTURE_TYPE_PNG, src,
                                (int) (picWidth * Units.EMU_PER_CENTIMETER), (int) (picHeight * Units.EMU_PER_CENTIMETER));
                    }
                } else if ("#text".equalsIgnoreCase(tdNode.nodeName())) {
                    // NOTE(review): the whole cell element is rendered once per text child, so a
                    // cell with several text nodes is presumably rendered repeatedly — confirm.
                    parseHtmlToWordTable(colElement, doc, tableCell.getParagraphs().get(0));
                }
            }
        }
    }
}
/**
 * Recursively renders the content of a table cell into the cell's paragraph.
 *
 * <p>Children are visited first; {@code table} nodes and nodes with children produce no output
 * of their own. A {@code #text} leaf becomes a run (Times New Roman, size 10) styled from its
 * parent tag: bold for {@code strong}/{@code b}, the {@code font} colour, sub/superscript.
 * {@code h1}-{@code h5}, {@code li}, {@code img} and {@code br} leaves add a carriage return.
 *
 * @param node          node inside the cell to render
 * @param doc           Word document being built (used to create paragraphs for list items)
 * @param xwpfParagraph paragraph of the cell that receives runs and line breaks
 */
private static void parseHtmlToWordTable(Node node, NiceXWPFDocument doc, XWPFParagraph xwpfParagraph) {
    List<Node> nodes = node.childNodes();
    if (CollectionUtils.isNotEmpty(nodes)) {
        for (Node childNode : nodes) {
            parseHtmlToWordTable(childNode, doc, xwpfParagraph);
        }
    }
    // Nested tables and non-leaf nodes produce no output themselves.
    if ("table".equalsIgnoreCase(node.nodeName()) || CollectionUtils.isNotEmpty(node.childNodes())) {
        return;
    }

    String nodeValue = node.toString();
    Node parent = node.parent();
    boolean boldFlag = false;
    String color = "";
    boolean subFlag = false;
    boolean supFlag = false;
    boolean ulFlag = false;

    if (null != parent) {
        // May be null for a detached subtree; cellNodeIs() tolerates that instead of throwing.
        Node grandParent = parent.parent();
        if (cellNodeIs(parent, "strong") || cellNodeIs(parent, "b")) {
            boldFlag = true;
        } else if (cellNodeIs(parent, "font")) {
            boldFlag = cellNodeIs(grandParent, "strong") || cellNodeIs(grandParent, "b");
            String fontColor = parent.attr("color");
            if (StringUtils.isNotEmpty(fontColor)) {
                color = toWordHexColor(fontColor);
            }
            ulFlag = cellNodeIs(grandParent, "li") && cellNodeIs(grandParent.parent(), "ul");
        } else if (cellNodeIs(parent, "sub")) {
            subFlag = true;
        } else if (cellNodeIs(parent, "sup")) {
            supFlag = true;
        } else if (cellNodeIs(parent, "li")) {
            ulFlag = cellNodeIs(grandParent, "ul");
        }
    }

    if ("#text".equalsIgnoreCase(node.nodeName()) && !nodeValue.contains("<")) {
        XWPFRun run = xwpfParagraph.createRun();
        run.setFontFamily("Times New Roman");
        run.setFontSize(10);
        if (boldFlag) {
            run.setBold(true);
        }
        if (StringUtils.isNotEmpty(color)) {
            run.setColor(color);
        }
        if (supFlag) {
            run.setSubscript(VerticalAlign.SUPERSCRIPT);
        }
        if (subFlag) {
            run.setSubscript(VerticalAlign.SUBSCRIPT);
        }
        if (ulFlag && StringUtils.isNotEmpty(nodeValue.trim())) {
            // NOTE(review): the paragraph is created on the document, not on the table cell, so
            // list items presumably end up outside the table — confirm this is intended.
            XWPFParagraph paragraph = doc.createParagraph();
            paragraph.setIndentFromLeft(0);
            paragraph.setFirstLineIndent(0);
            paragraph.setIndentationLeftChars(125);
            XWPFRun run1 = paragraph.createRun();
            run1.setFontFamily("Times New Roman");
            run1.setFontSize(8);
            run1.setText("●");
            run1.addTab();
            XWPFRun run2 = paragraph.createRun();
            run2.setText(nodeValue.trim());
            run2.setFontFamily("Times New Roman");
            run2.setFontSize(10);
        }
        if (StringUtils.isNotEmpty(nodeValue) && !ulFlag) {
            run.setText(nodeValue.trim());
        }
    }

    // NOTE(review): the pattern starts with an empty alternative, which only matches an empty
    // node name; it looks like a leftover — confirm before removing it.
    boolean enabledBreak = ReUtil.isMatch("(|h[12345]|li|img|br)", node.nodeName().toLowerCase());
    if (enabledBreak) {
        XWPFRun run = xwpfParagraph.createRun();
        run.addCarriageReturn();
    }
}

/** Returns whether {@code candidate} is non-null and its node name equals {@code tag}, ignoring case. */
private static boolean cellNodeIs(Node candidate, String tag) {
    return candidate != null && tag.equalsIgnoreCase(candidate.nodeName());
}

/**
 * Converts an HTML {@code color} attribute into the six-digit hex form passed to the run.
 * A leading {@code #} is optional; anything that is not exactly six hex digits (named colours,
 * short {@code #rgb} forms, junk) yields an empty string so that no colour is applied.
 */
private static String toWordHexColor(String htmlColor) {
    String hex = htmlColor.trim();
    if (hex.startsWith("#")) {
        hex = hex.substring(1);
    }
    return ReUtil.isMatch("[0-9a-fA-F]{6}", hex) ? hex : "";
}
/**
 * Flattens an HTML table so that every row carries one {@code td} per grid column.
 *
 * <p>{@code class} attributes are removed from rows and cells. A cell with {@code colspan=n} is
 * followed by {@code n - 1} copies of itself (without the {@code colspan} attribute); a cell
 * with {@code rowspan=n} is copied (without the {@code rowspan} attribute) into the following
 * {@code n - 1} rows. Span values that are not integers are treated as 1, and rows that have
 * fewer cells than the first row are skipped for the columns they lack.
 *
 * @param tableContent HTML of the table
 * @return the outer HTML of the first {@code table} element after flattening; {@code null} when
 *         {@code tableContent} is empty; an empty string when no {@code tr} or no
 *         {@code table} element is found
 */
public static String simplifyTable(String tableContent) {
    if (StringUtils.isEmpty(tableContent)) {
        return null;
    }
    org.jsoup.nodes.Document tableDoc = Jsoup.parse(tableContent);
    Elements trElements = tableDoc.getElementsByTag("tr");

    // Expand colspan.
    for (Element trElement : trElements) {
        // Strip styling.
        trElement.removeAttr("class");
        // Iterate over a copy because siblings are inserted while looping.
        List<Element> tdEleList = covertElements2List(trElement.getElementsByTag("td"));
        for (Element curTdElement : tdEleList) {
            // Strip styling.
            curTdElement.removeAttr("class");
            String colspanValStr = curTdElement.attr("colspan");
            if (StringUtils.isEmpty(colspanValStr)) {
                continue;
            }
            Element ele = curTdElement.clone();
            ele.removeAttr("colspan");
            // Serialize once; the same markup is inserted for every extra column.
            String copyHtml = ele.outerHtml();
            int colspanVal = parseSpanCount(colspanValStr);
            for (int k = 0; k < colspanVal - 1; k++) {
                curTdElement.after(copyHtml);
            }
        }
    }

    // Expand rowspan.
    List<Element> trEleList = covertElements2List(trElements);
    Element firstTrEle = trElements.first();
    if (null == firstTrEle) {
        return "";
    }
    // The first row (after colspan expansion) defines the number of columns.
    int tdCount = firstTrEle.getElementsByTag("td").size();

    // Walk the table column by column, row by row.
    for (int i = 0; i < tdCount; i++) {
        // Child index at which a copied cell is inserted into a following row.
        int indexNum = (i == 0) ? 0 : i + 1;
        for (Element trElement : trEleList) {
            // Re-query on every visit: earlier iterations may have inserted cells into this row.
            List<Element> tdElementList = covertElements2List(trElement.getElementsByTag("td"));
            if (i >= tdElementList.size()) {
                // Ragged row: it has no cell in this column, so there is nothing to expand.
                continue;
            }
            Node curTdNode = tdElementList.get(i);
            String rowspanValStr = curTdNode.attr("rowspan");
            if (StringUtils.isEmpty(rowspanValStr)) {
                continue;
            }
            Node cNode = curTdNode.clone();
            cNode.removeAttr("rowspan");
            Element nextTrElement = trElement.nextElementSibling();
            int rowspanVal = parseSpanCount(rowspanValStr);
            for (int j = 0; j < rowspanVal - 1; j++) {
                if (j > 0 && null != nextTrElement) {
                    nextTrElement = nextTrElement.nextElementSibling();
                }
                if (null == nextTrElement) {
                    // The span runs past the last row; no further rows to fill.
                    break;
                }
                Node tempNode = cNode.clone();
                if (indexNum == tdCount) {
                    nextTrElement.appendChild(tempNode);
                } else {
                    List<Node> nodeList = new ArrayList<Node>();
                    nodeList.add(tempNode);
                    nextTrElement.insertChildren(indexNum, nodeList);
                }
            }
        }
    }

    Element tableEle = tableDoc.getElementsByTag("table").first();
    if (null == tableEle) {
        return "";
    }
    return tableEle.outerHtml();
}

/** Parses a colspan/rowspan attribute value, treating anything that is not an integer as 1. */
private static int parseSpanCount(String rawSpan) {
    try {
        return Integer.parseInt(rawSpan.trim());
    } catch (NumberFormatException ignored) {
        // An unparsable span means "no span", so the cell occupies a single slot.
        return 1;
    }
}
/**
 * Copies the given elements into a new, independent list so that callers can iterate over it
 * while changing the underlying document.
 *
 * @param curElements elements to copy
 * @return a new mutable list holding the same elements in the same order
 */
private static List<Element> covertElements2List(Elements curElements) {
    List<Element> snapshot = new ArrayList<>(curElements.size());
    snapshot.addAll(curElements);
    return snapshot;
}
}