当前位置: 首页 > 工具软件 > NekoHTML > 使用案例 >

Java NekoHTML:一个使用 NekoHTML 解析 HTML 的例子

陶璞
2023-12-01

import java.io.PrintWriter;

import org.cyberneko.html.parsers.DOMParser;

import org.w3c.dom.Document;

import org.w3c.dom.NamedNodeMap;

import org.w3c.dom.Node;

import org.w3c.dom.NodeList;

/**

*

* @author Administrator

*

*/

public class TestHTMLDOM {

public static void main(String[] argv) throws Exception {

DOMParser parser = new DOMParser();

System.setProperty("http.proxyHost", "88.148.41.44");

System.setProperty("http.proxyPort", "51966");

parser.setFeature("http://xml.org/sax/features/namespaces", false);

parser.parse("c:/blog.htm");//也可以是如:http://itindex.net/blog

//writeDoc(parser.getDocument().getDocumentElement());

//print(parser.getDocument(), "");

printDOMTree(parser.getDocument());

PrintWriter w = new PrintWriter(System.out);

}

private String indent = " "; // 缩进的标志

private String lineSeparator = "\n"; // 换行的标志

/** Prints the specified node, then prints all of its children. */

/**

* 打印xml/html文档

*/

public static void printDOMTree(Node node) {

int type = node.getNodeType();

switch (type) {

// print the document element

case Node.DOCUMENT_NODE: {

System.out.println("<?xml version=\"1.0\" ?>");

printDOMTree(((Document) node).getDocumentElement());

break;

}

// print element with attributes

case Node.ELEMENT_NODE: {

System.out.print("

System.out.print(node.getNodeName());

NamedNodeMap attrs = node.getAttributes();

for (int i = 0; i < attrs.getLength(); i++) {

Node attr = attrs.item(i);

System.out.print(" " + attr.getNodeName() + "=\""

+ attr.getNodeValue() + "\"");

}

System.out.println(">");

NodeList children = node.getChildNodes();

if (children != null) {

int len = children.getLength();

for (int i = 0; i < len; i++)

printDOMTree(children.item(i));

}

break;

}

// handle entity reference nodes

case Node.ENTITY_REFERENCE_NODE: {

System.out.print("&");

System.out.print(node.getNodeName());

System.out.print(";");

break;

}

// print cdata sections

case Node.CDATA_SECTION_NODE: {

System.out.print("

System.out.print(node.getNodeValue());

System.out.print("]]>");

break;

}

// print text

case Node.TEXT_NODE: {

System.out.print(node.getNodeValue());

break;

}

// print processing instruction

case Node.PROCESSING_INSTRUCTION_NODE: {

System.out.print("");

System.out.print(node.getNodeName());

String data = node.getNodeValue();

{

System.out.print(" ");

System.out.print(data);

}

System.out.print("?>");

break;

}

}

if (type == Node.ELEMENT_NODE) {

System.out.println();

System.out.print("");

System.out.print(node.getNodeName());

System.out.print(">");

}

}

/**

* 输出经过修整干净的html文档,fix up and clean html

*

* @param node

*/

private static void writeDoc(Node node) {

short type = node.getNodeType();

switch (type) {

case Node.ELEMENT_NODE: {

String name = "

NamedNodeMap attrs = node.getAttributes();

if (attrs != null) {

int length = attrs.getLength();

for (int i = 0; i < length; i++) {

Node attr = attrs.item(i);

name += " " + attr.getNodeName();

name += "=\"" + attr.getNodeValue() + "\"";

}

}

name += ">";

System.out.println(name);

NodeList children = node.getChildNodes();

if (children != null) {

int length = children.getLength();

for (int i = 0; i < length; i++)

writeDoc(children.item(i));

}

System.out.println("" + node.getNodeName() + ">");

break;

}

case Node.TEXT_NODE: {

System.out.println(node.getNodeValue());

break;

}

}

}

/**

* 抽取html文档里的文本Text

*

* @param node

* @param indent

*/

public static void print(Node node, String indent) {

// System.out.println(indent+node.getClass().getName());

if (node.getNodeValue() != null) {

if ("".equals(node.getNodeValue().trim())) {

} else {

System.out.print(indent);

System.out.println(node.getNodeValue());

}

}

Node child = node.getFirstChild();

while (child != null) {

print(child, indent + " ");

child = child.getNextSibling();

}

}

}

 类似资料: