方案一:
pdf2htmlex
package com.realize.controller;
import cn.hutool.http.HttpUtil;
import com.alibaba.fastjson2.JSONObject;
import com.realize.util.MsgUtil;
import com.realize.util.OssUtil;
import com.realize.util.PdfConvertUtil;
import com.realize.util.StreamGobbler;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
@RestController
@Slf4j
public class ParserController {
/**
 * Trivial GET endpoint mapped to {@code /test} that always answers with a fixed string.
 *
 * @return the literal {@code "test"}
 */
@GetMapping("/test")
public String test() {
    final String reply = "test";
    return reply;
}
// @PostMapping("/parseHtml")
// public JSONObject parseHtml(@ModelAttribute("htmlUrl") String htmlUrl) {
// try (Playwright playwright = Playwright.create()) {
// Browser browser = playwright.chromium().launch(new BrowserType.LaunchOptions().setHeadless(true));
// Page page = browser.newPage();
// String filePath = "/mnt/temp/html/" + RandomUtil.randomString(10) + ".html";
// Local-development variant of the path on the previous line. It belongs to the disabled
// parseHtml endpoint, so it stays commented out with the rest of that method; left active it
// would declare an instance field that references RandomUtil, which this file does not import.
// String filePath = "/Users/sunyechen/IdeaProjects/realize-nacos/bin/" + RandomUtil.randomString(10) + ".html";
// HttpUtil.downloadFile(htmlUrl, filePath);
// page.navigate("file:" + filePath);
// page.evaluate("var imgList=document.getElementsByTagName('img');" +
// "for(var i=0;i<imgList.length;i++){" +
// "var src=imgList[i].getAttribute('src');" +
// "imgList[i].setAttribute('src','https://realizedongmi.oss-cn-shanghai.aliyuncs.com/a-filings/test/'+src);" +
// "}");
// JSONObject result = new JSONObject();
// result.put("html", page.innerHTML("css=body"));
// result.put("css", page.innerHTML("css=style"));
// result.put("txt", page.innerText("css=body").trim().replaceAll("\n", ""));
// page.close();
// browser.close();
// return result;
// } catch (Exception e) {
// e.printStackTrace();
// }
// return null;
// }
/**
 * Converts every file directly under {@code /root/pdf} whose name ends in {@code .pdf}
 * (case-insensitive) by running the pdf2htmlEX Docker image once per file.
 *
 * <p>The folder is mounted into the container as {@code /pdf} and used as the working directory,
 * so only the bare file name is passed to the converter.
 *
 * @return {@code "ok"} after all files have been attempted (a non-zero exit code is only logged);
 *         {@code null} if the folder cannot be listed, or if starting or waiting for a process
 *         fails
 */
@GetMapping("/batchConvertPdf")
public String batchConvertPdf() {
    String folderName = "/root/pdf";
    File[] files = new File(folderName).listFiles();
    if (files == null) {
        // listFiles() yields null when the path is not a directory or an I/O error occurs.
        log.error("无法读取目录:{}", folderName);
        return null;
    }
    for (File file : files) {
        String fileName = file.getName();
        if (!fileName.toLowerCase().endsWith(".pdf")) {
            continue;
        }
        // One list element per argument: Runtime.exec(String) would split the command on
        // whitespace, so a file name containing a space would be broken into several arguments.
        ProcessBuilder builder = new ProcessBuilder(
                "docker", "run", "-i", "--rm",
                "-v", "/root/pdf:/pdf",
                "-w", "/pdf",
                "docker.io/pdf2htmlex/pdf2htmlex:0.18.8.rc2-master-20200820-ubuntu-20.04-x86_64",
                "--font-size-multiplier", "1",
                "--zoom", "1.3",
                fileName);
        try {
            log.info("exec:{}", String.join(" ", builder.command()));
            Process process = builder.start();
            // Consume the child's standard error stream (StreamGobbler is a project helper;
            // presumably it reads the stream on its own thread — confirm in its source).
            StreamGobbler errorGobbler = new StreamGobbler(process.getErrorStream(), "ERROR");
            errorGobbler.start();
            // Consume the child's standard output stream in the same way.
            StreamGobbler outGobbler = new StreamGobbler(process.getInputStream(), "OUTPUT");
            outGobbler.start();
            // waitFor() already returns the exit value, so a separate exitValue() call is redundant.
            int exitCode = process.waitFor();
            if (exitCode == 0) {
                log.info("转换成功:{}", fileName);
            } else {
                log.warn("转换失败:{}", fileName);
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the calling thread can still observe the interruption.
            Thread.currentThread().interrupt();
            log.error("转换被中断:{}", fileName, e);
            return null;
        } catch (Exception e) {
            // Pass the exception as the trailing throwable argument and name the file that failed.
            log.error("转换异常:{}", fileName, e);
            return null;
        }
    }
    return "ok";
}
/**
 * Downloads a PDF, converts it with {@code PdfConvertUtil.convertPdf}, applies fixed text
 * substitutions to the generated HTML file, and uploads the working folder via
 * {@code OssUtil.batchFileUploadOssUrl}.
 *
 * <p>The local base file name is the part of {@code ossKey} between its last {@code /} and its
 * last {@code .}; the upload prefix is everything up to and including the last {@code /}.
 *
 * @param pdfUrl URL the PDF is downloaded from
 * @param ossKey object key used to derive the local file name and the upload prefix
 * @return a JSON object whose {@code code} is {@code 0} when the HTML was rewritten successfully
 *         and {@code -1} when {@code ossKey} is unusable, the conversion reports failure, or the
 *         HTML file cannot be read or written
 * @throws Exception if the download, the conversion helper, or the upload throws
 */
@PostMapping("/convertAndParsePdf")
public JSONObject convertAndParsePdf(@ModelAttribute("pdfUrl") String pdfUrl, @ModelAttribute("ossKey") String ossKey) throws Exception {
    log.info("接收到转换请求{},{}", pdfUrl, ossKey);
    String fileFolder = "/mnt_real/pdf/";
    JSONObject result = new JSONObject();

    // Reject keys that would make substring() throw: null, or no '.' after the last '/'.
    int slashIndex = ossKey == null ? -1 : ossKey.lastIndexOf('/');
    int dotIndex = ossKey == null ? -1 : ossKey.lastIndexOf('.');
    if (ossKey == null || dotIndex <= slashIndex) {
        log.warn("ossKey不合法,无法解析文件名:{}", ossKey);
        result.put("code", -1);
        return result;
    }

    String tempFileName = ossKey.substring(slashIndex + 1, dotIndex);
    String pdfFilePath = fileFolder + tempFileName + ".pdf";
    String htmlFilePath = fileFolder + tempFileName + ".html";
    HttpUtil.downloadFile(pdfUrl, pdfFilePath);
    log.info("pdf文件下载成功{},{}", pdfUrl, ossKey);

    // Disabled: extract the PDF body text with PdfBox.
    // String pdfText = PdfBoxUtil.getPdfText(pdfFilePath);
    // result.put("pdfText", pdfText);
    // log.info("pdfbox正文解析成功{},{}", pdfUrl, ossKey);

    // Boolean.TRUE.equals avoids a NullPointerException should the helper return a null Boolean.
    if (!Boolean.TRUE.equals(PdfConvertUtil.convertPdf(tempFileName + ".pdf"))) {
        MsgUtil.sendDingTalkMsg(pdfUrl);
        result.put("code", -1);
        return result;
    }

    try {
        Charset utf8 = Charset.forName("UTF-8");
        List<String> allLines = Files.readAllLines(Paths.get(htmlFilePath), utf8);
        String content = rewriteGeneratedHtml(String.join("\n", allLines));
        // Write back with the same charset used for reading (FileWriter would use the platform
        // default); Files.write truncates the existing file and closes it even on failure.
        Files.write(Paths.get(htmlFilePath), content.getBytes(utf8));
        log.info("html文件处理完成{},{}", pdfUrl, ossKey);
        result.put("code", 0);
    } catch (IOException e) {
        log.error("html文件处理失败{},{}", pdfUrl, ossKey, e);
        result.put("code", -1);
    } finally {
        // Upload runs whether or not the rewrite succeeded, as before.
        // NOTE(review): the whole shared folder is handed to the uploader; concurrent requests
        // may see each other's files — confirm against OssUtil.batchFileUploadOssUrl.
        String ossPath = ossKey.substring(0, slashIndex + 1);
        OssUtil.batchFileUploadOssUrl(fileFolder, ossPath);
        log.info("文件上传成功,完整链接:https://oss.imvib.com/{}", ossKey.replace(".html", ".pdf"));
    }
    return result;
}

/**
 * Applies the fixed literal substitutions to converter output: {@code github} becomes
 * {@code zzz}, {@code pdf2htmlEX} becomes {@code tg}, and a search script tag is appended after
 * the UTF-8 {@code <meta>} tag. Uses {@code String.replace} because every pattern is a literal.
 */
private static String rewriteGeneratedHtml(String html) {
    return html
            .replace("github", "zzz")
            .replace("pdf2htmlEX", "tg")
            .replace("<meta charset=\"utf-8\"/>", "<meta charset=\"utf-8\"/><script src=\"https://oss.imvib.com/a-filings/test/test/search.js\" type=\"text/javascript\" charset=\"utf-8\"></script> ");
}
/**
 * Reads the complete content of a file into memory.
 *
 * @param file the file to read
 * @return a new array holding every byte of the file
 * @throws IOException if the file cannot be opened or read
 */
private static byte[] readAllBytes(File file) throws IOException {
    // A single InputStream.read(byte[]) call may return before the buffer is full, and casting
    // file.length() to int truncates for files over 2 GB; Files.readAllBytes reads until EOF.
    return Files.readAllBytes(file.toPath());
}
/**
 * Developer scratch entry point. The live code reads one hard-coded local HTML file twice —
 * once with {@code Files.readAllLines} and once line by line — printing the content each time
 * and logging start/end markers. Earlier experiments are kept below as commented-out code.
 *
 * @param args unused
 * @throws Exception if the hard-coded file cannot be read
 */
public static void main(String[] args) throws Exception {
    // String htmlFilePath = "/Users/sunyechen/doc/test/矩阵股份:长江证券承销保荐有限公司关于矩阵纵横设计股份有限公司使用募集资金置换预先投入募投项目及已支付发行费用的自筹资金的核查意见.html";
    // File htmlFile = new File(htmlFilePath);
    // Document html = Jsoup.parse(htmlFile);
    // The next three lines belong to the disabled Jsoup experiment above; left active they
    // reference the undefined variable html and the unimported type Element.
    // Element script = html.select("script").first();
    // String sourceScript = script.html();
    // script.html(sourceScript + PdfConvertUtil.addScript);
    // FileOutputStream fos = new FileOutputStream(htmlFilePath.replace(".html", "_.html"), false);
    // OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
    // osw.write(html.outerHtml());
    // osw.close();
    // try (Playwright playwright = Playwright.create()) {
    // Browser browser = playwright.chromium().launch(new BrowserType.LaunchOptions().setHeadless(true));
    // BrowserContext context = browser.newContext(new Browser.NewContextOptions());
    // Page page = browser.newPage();
    // String htmlUrl = "https://realizedongmi.oss-cn-shanghai.aliyuncs.com/a-filings/test/2023-01-03 1 发行人及保荐机构关于二轮审核问询函的回复(修订稿)_易瑞生物.htm";
    // HttpUtil.downloadFile(htmlUrl, "/Users/sunyechen/IdeaProjects/realize-nacos/bin/1.html");
    // page.navigate("file:/Users/sunyechen/IdeaProjects/realize-nacos/bin/1.html");
    // System.out.println(page.innerHTML("body"));
    // }
    // System.out.println(URLDecoder.decode("https://oss.imvib.com/a-filings%2Foriginal%2F000586%2F2023-04-07+%E5%B9%B4%E5%BA%A6%E5%85%B3%E8%81%94%E6%96%B9%E8%B5%84%E9%87%91%E5%8D%A0%E7%94%A8%E4%B8%93%E9%A1%B9%E5%AE%A1%E8%AE%A1%E6%8A%A5%E5%91%8A.PDF", "UTF-8"));
    // try {
    // File file = new File("/Users/sunyechen/doc/test/矩阵股份:长江证券承销保荐有限公司关于矩阵纵横设计股份有限公司使用募集资金置换预先投入募投项目及已支付发行费用的自筹资金的核查意见.html");
    // BufferedReader reader = new BufferedReader(new FileReader(file));
    // String line = "", oldContent = "";
    // while ((line = reader.readLine()) != null) {
    // oldContent += line + "\n";
    // }
    // reader.close();
    // String newContent = oldContent.replaceAll("<meta charset=\"utf-8\"/>", "<meta charset=\"utf-8\"/><script src=\"https://oss.imvib.com/a-filings/test/test/search.js\" type=\"text/javascript\" charset=\"utf-8\"></script> ");
    // FileWriter writer = new FileWriter(new File("/Users/sunyechen/doc/test/矩阵股份:长江证券承销保荐有限公司关于矩阵纵横设计股份有限公司使用募集资金置换预先投入募投项目及已支付发行费用的自筹资金的核查意见_1.html"));
    // writer.write(newContent);
    // writer.close();
    // System.out.println("File updated successfully.");
    // } catch (IOException e) {
    // e.printStackTrace();
    // }
    // log.info("start");
    // String htmlFilePath = "/Users/sunyechen/sfit/7745c98a5ba34525937bce19519c0b1e.html";
    // try {
    // File file = new File(htmlFilePath);
    // BufferedReader reader = new BufferedReader(new FileReader(file));
    // String line = "", oldContent = "";
    // while ((line = reader.readLine()) != null) {
    // oldContent += line + "\n";
    // }
    // reader.close();
    // String newContent = oldContent.replaceAll("github", "zzz").replaceAll("pdf2htmlEX", "tanqiuhuashigou").replaceAll("<meta charset=\"utf-8\"/>", "<meta charset=\"utf-8\"/><script src=\"https://oss.imvib.com/a-filings/test/test/search.js\" type=\"text/javascript\" charset=\"utf-8\"></script> ");
    // String newHtmlFilePath = htmlFilePath.replace(".html", "_.html");
    // FileWriter writer = new FileWriter(newHtmlFilePath);
    // writer.write(newContent);
    // writer.close();
    // log.info("html文件处理完成{},{}");
    //
    // } catch (IOException e) {
    // e.printStackTrace();
    // }
    log.info("start");
    // String ossKey = "ann/688249/2023/4/688249_20230412_9XYK/688249_20230412_9XYK.html";
    // String ossPath = ossKey.substring(0, ossKey.lastIndexOf("/") + 1);
    // OssUtil.batchFileUploadOssUrl("/Users/sunyechen/sfit/test/", ossPath);
    // File[] fileList = new File("/Users/sunyechen/sfit/test/").listFiles();
    // for (int i = 0; i < fileList.length; i++) {
    // OssUtil.fileUploadOssUrl(fileList[i], ossPath + fileList[i].getName());
    // fileList[i].delete();
    // }
    String htmlFilePath = "/Users/sunyechen/sfit/test/600499_20230415_EVH7.html";
    Charset utf8 = Charset.forName("UTF-8");

    // First pass: read all lines at once and join them with '\n'.
    List<String> allLines = Files.readAllLines(Paths.get(htmlFilePath), utf8);
    String content = String.join("\n", allLines);
    System.out.println(content);

    log.info("start");
    // Second pass: read line by line. The reader is opened as UTF-8 (FileReader would use the
    // platform charset), is closed by try-with-resources even on failure, and the text is
    // accumulated in a StringBuilder rather than by repeated String concatenation.
    StringBuilder oldContent = new StringBuilder();
    try (BufferedReader reader = Files.newBufferedReader(Paths.get(htmlFilePath), utf8)) {
        String line;
        while ((line = reader.readLine()) != null) {
            oldContent.append(line).append("\n");
        }
    }
    System.out.println(oldContent);
    log.info("end");
}
}
方案二:
kkFileView-4.0.0
kkFileView - 在线文件预览
方案三:
wkhtmltox-0.12.6-1.centos7.x86_64.rpm
wkhtmltopdf