最近做了一个 HTML 解析的小项目,用的是 NekoHTML:解析静态的 HTML 页面,提取需要的信息,转换成 JSON 对象并放到一个文件中。
思路是:先使用 NekoHTML 快速地把 HTML 中指定标签(如 table)里的信息提取出来,然后存入一个临时生成的 HTML 页面(如 temp.html),再重新解析成 Node 对象,这样就可以根据结构获得指定 Node 下的内容了。
核心代码如下:
/**
 * Converts a static HTML page into a {@link CustomerRecord}.
 *
 * <p>The conversion runs in two passes:
 * <ol>
 *   <li>The file is parsed through a NekoHTML filter chain that accepts only
 *       {@code table}, {@code tr} and {@code td} elements and removes
 *       {@code title}; the filtered markup is captured as a string and all
 *       whitespace is stripped from it.</li>
 *   <li>The stripped markup is written to a temporary HTML file next to the
 *       input, parsed into a DOM, and every {@code TR} element is inspected.
 *       Because whitespace was stripped, row texts are compared against
 *       whitespace-free labels such as {@code "ZoneundRecordsbearbeiten"}.</li>
 * </ol>
 *
 * <p>The temporary file gets a unique name and is removed even when parsing
 * fails, so an existing {@code temp.html} in the input directory is never
 * overwritten or deleted.
 *
 * @param filePath path of the HTML file to convert; also passed to the parser
 *                 as the system id of the input source
 * @return a record holding the file name, the domain (if found) and the lists
 *         of SOA, A, MX and NS entries (empty when no matching row exists)
 * @throws Exception if parsing or temporary-file I/O fails, or if a matching
 *                   row does not have the expected nested table/cell layout
 */
public static CustomerRecord convertFileToObj(String filePath) throws Exception {
    CustomerRecord cr = new CustomerRecord();
    List<SOARec> soaList = new ArrayList<SOARec>();
    List<ARec> aList = new ArrayList<ARec>();
    List<MXRec> mxList = new ArrayList<MXRec>();
    List<NSRec> nxList = new ArrayList<NSRec>();

    File file = new File(filePath);
    cr.setFileName(file.getName());

    // Pass 1: reduce the page to its table skeleton.
    ElementRemover remover = new ElementRemover();
    remover.acceptElement("table", null);
    remover.acceptElement("td", null);
    remover.acceptElement("tr", null);
    remover.removeElement("title");
    StringWriter filteredDescription = new StringWriter();
    org.cyberneko.html.filters.Writer writer =
            new org.cyberneko.html.filters.Writer(filteredDescription, null);
    XMLDocumentFilter[] filters = { remover, writer };
    XMLParserConfiguration parser = new HTMLConfiguration();
    parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
    XMLInputSource source = new XMLInputSource(null, filePath, null);
    parser.parse(source);

    // Strip every whitespace character so row texts can be matched exactly.
    String description =
            Pattern.compile("\\s+").matcher(filteredDescription.toString()).replaceAll("");

    // Pass 2: round-trip through a uniquely named temp file and read it as a DOM.
    // getAbsoluteFile() guarantees a parent directory even for a bare file name.
    File temp = File.createTempFile("temp", ".html", file.getAbsoluteFile().getParentFile());
    try {
        Writer out = new FileWriter(temp, false);
        try {
            out.write(description);
        } finally {
            out.close();
        }

        DOMParser parser2 = new DOMParser();
        parser2.parse(temp.getPath());
        Document document = parser2.getDocument();

        // Index of the "ZoneundRecordsbearbeiten" row. It starts at 0, so row 2
        // is still probed for the domain when the marker row never appears.
        int markerRow = 0;
        NodeList nodeList = XPathAPI.selectNodeList(document, "//TR");
        for (int i = 0; i < nodeList.getLength(); i++) {
            Node node = nodeList.item(i);
            String trContent = node.getTextContent();

            // Domain: taken from the second cell of the row two after the marker row.
            if (trContent.equals("ZoneundRecordsbearbeiten")) {
                markerRow = i;
            }
            if (i == (markerRow + 2) && trContent.contains("Domain")) {
                cr.setDomain(cellText(node, 1));
            }

            // In each record table the loop starts at index 1, skipping the
            // first nested row (presumably a header row).
            if (trContent.startsWith("SOARecord")) {
                NodeList rows = nestedRecordRows(node);
                for (int j = 1; j < rows.getLength(); j++) {
                    Node row = rows.item(j);
                    SOARec soa = new SOARec();
                    soa.setDomain(cellText(row, 0));
                    soa.setSeriennummer(cellText(row, 1));
                    soa.setEmail(cellText(row, 2));
                    soa.setPrimaryDNS(cellText(row, 3));
                    soaList.add(soa);
                }
            }
            if (trContent.startsWith("ARecords")) {
                NodeList rows = nestedRecordRows(node);
                for (int j = 1; j < rows.getLength(); j++) {
                    Node row = rows.item(j);
                    ARec ar = new ARec();
                    ar.setHost(cellText(row, 0));
                    ar.setIp(cellText(row, 1));
                    ar.setTtl(cellText(row, 2));
                    aList.add(ar);
                }
            }
            if (trContent.startsWith("MXRecords")) {
                NodeList rows = nestedRecordRows(node);
                for (int j = 1; j < rows.getLength(); j++) {
                    Node row = rows.item(j);
                    MXRec mx = new MXRec();
                    mx.setHost(cellText(row, 0));
                    mx.setMailExchanger(cellText(row, 1));
                    mx.setTtl(cellText(row, 2));
                    mx.setPreference(cellText(row, 3));
                    mxList.add(mx);
                }
            }
            if (trContent.startsWith("NSRecords")) {
                NodeList rows = nestedRecordRows(node);
                for (int j = 1; j < rows.getLength(); j++) {
                    Node row = rows.item(j);
                    NSRec ns = new NSRec();
                    ns.setHost(cellText(row, 0));
                    ns.setNameserver(cellText(row, 1));
                    ns.setTtl(cellText(row, 2));
                    nxList.add(ns);
                }
            }
        }
    } finally {
        // Always clean up; fall back to deletion at JVM exit if the file is still locked.
        if (!temp.delete()) {
            temp.deleteOnExit();
        }
    }

    cr.setaRecList(aList);
    cr.setMxRecList(mxList);
    cr.setSoaRecList(soaList);
    cr.setNxRecList(nxList);
    return cr;
}

/** Returns the children of the first child of the given row's second child (the nested record rows). */
private static NodeList nestedRecordRows(Node tr) {
    return tr.getChildNodes().item(1).getChildNodes().item(0).getChildNodes();
}

/** Returns the text content of the child at {@code index} of {@code row}. */
private static String cellText(Node row, int index) {
    return row.getChildNodes().item(index).getTextContent();
}