HtmlSucker 提供两种正文提取算法(下文示例使用其中的 TEXT_DENSITY_EXTRACTOR,即基于文本密度的提取)。
该库目前还处于非常简单的阶段,但是可用。
导入 Maven 依赖:
<dependency>
<groupId>net.oschina.htmlsucker</groupId>
<artifactId>HtmlSucker</artifactId>
<version>0.0.2</version>
</dependency>
工具类:
/**
 * Helpers for extracting an article from a web page with HtmlSucker,
 * normalising the URLs found in the extracted HTML, and exporting HTML
 * content (plus an optional attachment list) as a PDF.
 */
public class HtmlUtil {
    private static final Logger LOG = LoggerFactory.getLogger(HtmlUtil.class);

    /**
     * Service used to upload inline images. May be {@code null}, in which
     * case inline images are left untouched.
     */
    private final RemoteFileService fileService;

    /**
     * Creates a utility without an upload service; inline images found in the
     * extracted content are kept as they are.
     */
    public HtmlUtil() {
        this(null);
    }

    /**
     * Creates a utility that uploads inline images through the given service.
     *
     * @param fileService upload service; may be {@code null} to disable uploads
     */
    public HtmlUtil(RemoteFileService fileService) {
        this.fileService = fileService;
    }

    /**
     * Extracts the article at {@code url} with the text-density extractor and
     * returns its title, publish date, author and cleaned-up content.
     *
     * @param url address of the page to extract
     * @return a JSON object with the keys {@code title}, {@code publishDate},
     *         {@code author} and {@code content}; empty when the page could
     *         not be fetched or parsed (the failure is logged)
     */
    public JSONObject htmlSucker(String url) {
        JSONObject jsonObject = new JSONObject();
        try {
            // NOTE(review): 30000 is presumably a timeout in milliseconds - confirm against HtmlSucker.
            Article article = HtmlSucker.select(HtmlSucker.TEXT_DENSITY_EXTRACTOR).parse(url, 30000);
            String content = dealContentHtml(article.getContent(), url);
            jsonObject.set("title", article.getTitle());
            jsonObject.set("publishDate", article.getDate());
            jsonObject.set("author", article.getAuthor());
            jsonObject.set("content", content);
        } catch (IOException e) {
            // Keep the best-effort contract (empty result) but log the cause with its stack trace.
            LOG.error("Failed to extract article from {}", url, e);
        }
        return jsonObject;
    }

    /**
     * Normalises the extracted HTML: emits well-formed (closed) tags, copies
     * {@code data-src} into {@code src}, and rewrites {@code src}/{@code href}
     * values relative to the page URL.
     *
     * @param content extracted HTML fragment
     * @param url     address of the page the fragment came from
     * @return the inner HTML of the normalised document body
     */
    private String dealContentHtml(String content, String url) {
        Document document = Jsoup.parse(content);
        // Let jsoup normalise the markup and emit closed tags.
        document.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
        document.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

        URI uri = null;
        try {
            uri = new URI(url);
        } catch (URISyntaxException e) {
            // Relative links cannot be resolved without a base; they are left as they are.
            LOG.error("Cannot parse page URL {}; relative links are left unresolved", url, e);
        }

        // Promote data-src (lazy-loading) addresses to src.
        Elements dataSrcElems = document.getElementsByAttribute("data-src");
        for (Element dataSrcElem : dataSrcElems) {
            String dataSrc = dataSrcElem.attr("data-src");
            // attr() returns "" rather than null; do not overwrite src with an empty value.
            if (!dataSrc.isEmpty()) {
                dataSrcElem.attr("src", dataSrc);
            }
        }

        // Rewrite src addresses.
        Elements srcElems = document.getElementsByAttribute("src");
        dealElementUrl(srcElems, "src", uri);

        // Rewrite href addresses.
        Elements links = document.getElementsByAttribute("href");
        dealElementUrl(links, "href", uri);

        return document.body().html();
    }

    /**
     * Rewrites the given attribute on every element so that scheme-relative
     * and root-relative values carry the scheme/host (and port) of the page.
     *
     * @param elements elements carrying the attribute
     * @param attr     attribute name, {@code "src"} or {@code "href"}
     * @param uri      page URI used as the base; may be {@code null}, in which
     *                 case relative values are left unchanged
     */
    private void dealElementUrl(Elements elements, String attr, URI uri) {
        String scheme = uri == null ? null : uri.getScheme();
        String host = uri == null ? null : uri.getHost();
        String origin = null;
        if (scheme != null && host != null) {
            int port = uri.getPort();
            // Keep an explicit port so pages served on a non-default port still resolve.
            origin = scheme + "://" + host + (port == -1 ? "" : ":" + port);
        }

        for (Element element : elements) {
            if ("src".equals(attr)) {
                // Works around images the front end cannot reference directly (referrer checks).
                element.attr("referrerpolicy", "no-referrer");
            }
            String val = element.attr(attr);
            if (val.startsWith("//")) {
                if (scheme != null) {
                    element.attr(attr, scheme + ":" + val);
                }
            } else if (val.startsWith("/")) {
                if (origin != null) {
                    element.attr(attr, origin + val);
                }
            } else if (val.startsWith(":")) {
                if (scheme != null) {
                    element.attr(attr, scheme + val);
                }
            } else if (val.startsWith("data://")) {
                // NOTE(review): real data URIs start with "data:" (no "//"), and ImgUtil.toImage
                // presumably expects bare Base64 - confirm the intended input format.
                String uploadUrl = uploadInlineImage(val);
                if (uploadUrl != null) {
                    element.attr(attr, uploadUrl);
                }
            }
        }
    }

    /**
     * Converts an inline image to JPEG and uploads it through the file service.
     *
     * @param val inline image value taken from the attribute
     * @return the URL of the uploaded file, or {@code null} when no upload
     *         service is configured or the conversion/upload failed
     */
    private String uploadInlineImage(String val) {
        if (fileService == null) {
            return null;
        }
        try {
            BufferedImage bufferedImage = ImgUtil.toImage(val);
            if (bufferedImage == null) {
                LOG.warn("Inline image could not be decoded; leaving it unchanged");
                return null;
            }
            // Encode the image into memory as JPEG.
            ByteArrayOutputStream os = new ByteArrayOutputStream();
            // ImageIO.write returns false when no suitable writer exists; do not upload an empty file.
            if (!ImageIO.write(bufferedImage, "jpg", os)) {
                LOG.warn("No JPEG writer could encode the inline image; leaving it unchanged");
                return null;
            }
            // Wrap the bytes as a multipart file for the upload service.
            InputStream inputStream = new ByteArrayInputStream(os.toByteArray());
            MultipartFile multipartFile = new MockMultipartFile("file", "file.jpg", "image/jpeg", inputStream);
            // Upload the image.
            R<SysFile> uploadResult = fileService.upload(multipartFile);
            if (uploadResult != null && uploadResult.getCode() == HttpStatus.SUCCESS) {
                SysFile sysFile = uploadResult.getData();
                if (sysFile != null) {
                    return sysFile.getUrl();
                }
            }
            LOG.warn("Inline image upload did not succeed; leaving it unchanged");
        } catch (IOException e) {
            LOG.error("Failed to upload inline image", e);
        }
        return null;
    }

    /**
     * Writes a PDF to the HTTP response containing the titled HTML content
     * followed by a linked list of attachments.
     *
     * @param response   response whose output stream receives the PDF
     * @param title      heading placed above the content; skipped when empty
     * @param content    HTML body; the content section is skipped when empty
     * @param appendices attachments to list; may be {@code null} or empty
     * @throws IOException if the response output stream cannot be obtained
     */
    public static void exportPdf(HttpServletResponse response,
                                 String title, String content,
                                 List<KsAppendix> appendices) throws IOException {
        com.spire.doc.Document document = new com.spire.doc.Document();
        try {
            if (!StringUtils.isEmpty(content)) {
                Section section = document.addSection();
                StringBuilder contentBuilder = new StringBuilder();
                // Avoid rendering a literal "null" heading when no title is supplied.
                if (!StringUtils.isEmpty(title)) {
                    contentBuilder.append("<h2>");
                    contentBuilder.append(title);
                    contentBuilder.append("</h2>");
                }
                contentBuilder.append(content);
                section.addParagraph().appendHTML(contentBuilder.toString());
            }
            if (appendices != null && !appendices.isEmpty()) {
                Section section = document.addSection();
                StringBuilder appendicesBuilder = new StringBuilder();
                appendicesBuilder.append("<div>");
                appendicesBuilder.append("<p style='font-size: 22px;'>文件列表</p>");
                for (KsAppendix ksAppendix : appendices) {
                    String fileName = ksAppendix.getFileName();
                    String fileUrl = ksAppendix.getFileUrl();
                    appendicesBuilder.append("<p style='color:red; text-decoration:underline;'><a href='");
                    appendicesBuilder.append(fileUrl);
                    appendicesBuilder.append("'>");
                    appendicesBuilder.append(fileName);
                    appendicesBuilder.append("</a></p>");
                }
                appendicesBuilder.append("</div>");
                section.addParagraph().appendHTML(appendicesBuilder.toString());
            }
            document.saveToFile(response.getOutputStream(), FileFormat.PDF);
        } finally {
            // Release the document even when rendering or writing fails.
            document.dispose();
        }
    }
}
使用:
·public ResponseResult getArticleFromUrl(@RequestParam(name = "url") String url) {
HtmlUtil htmlUtil = new HtmlUtil(fileService);
JSONObject jsonObject = htmlUtil.htmlSucker(url);
return ResponseResult.success(jsonObject);
}
其中 fileService 可以去掉(前提是 HtmlUtil 提供无参构造函数,并且不需要上传内嵌图片),此时调用改为:
HtmlUtil htmlUtil = new HtmlUtil();