// Demo: traverse an HTML DOM and strip unwanted nodes and attributes.
//author: thrillerzw
/**
 * Demo utility that parses an HTML document with the NekoHTML {@code DOMParser}
 * and re-serializes the resulting DOM as simplified markup.
 *
 * <p>During serialization some elements are dropped, some are renamed, and the
 * attributes of some elements are removed or filtered; the rules are driven by
 * the public arrays declared below.
 */
public class DomUtils {

    /** Elements that are dropped together with their subtree, e.g. {@code <base target="_blank">}. */
    public static String[] delNodes = { "SCRIPT", "STYLE", "OBJECT", "EMBED", "INPUT", "SELECT", "IFRAME", "LINK", "BASE", "TITLE" };

    /** Elements (after renaming) that are written without any attributes. */
    public static String[] clearAttrsNodes = { "BODY", "HTML", "STRONG", "EM", "OL", "UL", "DL", "DT", "DD", "LI", "SUB", "SUP", "BR", "SPAN", "H1", "H2" };

    /** Elements that are written as {@code P}. */
    public static String[] transPNodes = { "CAPTION", "DIV" };

    /** Elements that are written as {@code H2}. */
    public static String[] transH2Nodes = { "H3", "H4", "H5", "H6" };

    /** Lower-case names of the only attributes kept on {@code IMG}. */
    public static String[] imgAttrs = { "src", "style" };

    /** CSS properties kept inside the {@code style} attribute of a {@code P}. */
    private static final String[] P_STYLE_PROPERTIES = { "text-align" };

    /** CSS properties kept inside the {@code style} attribute of an {@code IMG}. */
    private static final String[] IMG_STYLE_PROPERTIES = { "float", "width", "height" };

    /**
     * Parses the HTML resource identified by {@code str} into a DOM document.
     *
     * <p>{@code str} is handed to {@code DOMParser.parse(String)} unchanged, i.e. it
     * is a system identifier (for example a URL), not a string of markup.
     *
     * @param str system identifier of the document to parse
     * @return the parsed document, or {@code null} if parsing failed (the
     *         exception stack trace is printed in that case)
     */
    public static Document parse(String str) {
        DOMParser parser = new DOMParser();
        try {
            // If one feature is rejected the remaining ones are skipped, but
            // parsing is still attempted below with whatever was configured.
            parser.setFeature(
                    "http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
                    true);
            parser.setFeature(
                    "http://cyberneko.org/html/features/balance-tags/document-fragment",
                    false);
            parser.setFeature(
                    "http://cyberneko.org/html/features/scanner/script/strip-comment-delims",
                    true);
            parser.setFeature(
                    "http://cyberneko.org/html/features/scanner/script/strip-cdata-delims",
                    true);
            parser.setFeature(
                    "http://cyberneko.org/html/features/scanner/style/strip-comment-delims",
                    true);
            parser.setFeature(
                    "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims",
                    true);
            parser.setFeature(
                    "http://cyberneko.org/html/features/scanner/notify-builtin-refs",
                    true);
            parser.setFeature(
                    "http://apache.org/xml/features/scanner/notify-char-refs",
                    true);
            parser.setFeature(
                    "http://apache.org/xml/features/scanner/notify-builtin-refs",
                    true);
        } catch (SAXNotRecognizedException e1) {
            e1.printStackTrace();
        } catch (SAXNotSupportedException e1) {
            e1.printStackTrace();
        }
        try {
            // Default encoding used for the page.
            parser.setProperty(
                    "http://cyberneko.org/html/properties/default-encoding",
                    "utf-8");
            parser.parse(str);
            return parser.getDocument();
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /** Returns whether {@code nodeName} is listed in {@link #delNodes}. */
    private static boolean isDelNode(String nodeName) {
        return Arrays.asList(delNodes).contains(nodeName);
    }

    /** Returns whether {@code nodeName} is listed in {@link #clearAttrsNodes}. */
    private static boolean isClearAttrsNode(String nodeName) {
        return Arrays.asList(clearAttrsNodes).contains(nodeName);
    }

    /** Returns whether {@code nodeName} is listed in {@link #transPNodes}. */
    private static boolean isTransPNodes(String nodeName) {
        return Arrays.asList(transPNodes).contains(nodeName);
    }

    /** Returns whether {@code nodeName} is listed in {@link #transH2Nodes}. */
    private static boolean isTransH2Nodes(String nodeName) {
        return Arrays.asList(transH2Nodes).contains(nodeName);
    }

    /** Returns whether the lower-cased attribute name is listed in {@link #imgAttrs}. */
    private static boolean isImgAttrs(String nodeName) {
        return Arrays.asList(imgAttrs).contains(nodeName.toLowerCase());
    }

    /**
     * Returns whether an element is omitted from the output together with its
     * subtree: either it is listed in {@link #delNodes}, or its name contains a
     * colon and does not start with {@code "P:"}.
     */
    private static boolean isSkippedElement(String nodeName) {
        return isDelNode(nodeName)
                || (nodeName.contains(":") && !nodeName.startsWith("P:"));
    }

    /**
     * Maps a DOM element name to the tag name that is written. The same mapping
     * is used for the opening and the closing tag so they always agree.
     */
    private static String mapNodeName(String nodeName) {
        if (isTransPNodes(nodeName) || nodeName.startsWith("P:")) {
            return "P";
        }
        if ("B".equals(nodeName)) {
            return "STRONG";
        }
        if (isTransH2Nodes(nodeName)) {
            return "H2";
        }
        if ("H2".equals(nodeName)) {
            return "H1";
        }
        if ("NOBR".equals(nodeName)) {
            return "SPAN";
        }
        return nodeName;
    }

    /**
     * Serializes {@code node} and its descendants into {@code sb}.
     *
     * <p>Only document, element, entity-reference and text nodes produce output;
     * every other node type (CDATA sections, comments, ...) is ignored.
     *
     * @param sb   buffer receiving the markup
     * @param node node to serialize
     * @param path prefix prepended to every {@code IMG} {@code src} value
     */
    private static void toHTML(StringBuilder sb, Node node, String path) {
        switch (node.getNodeType()) {
        case Node.DOCUMENT_NODE:
            appendChildren(sb, node, path);
            break;
        case Node.ELEMENT_NODE:
            appendElement(sb, node, path);
            break;
        case Node.ENTITY_REFERENCE_NODE:
            // Written back as a reference; its children are not visited.
            sb.append("&").append(node.getNodeName()).append(";");
            break;
        case Node.TEXT_NODE:
            appendText(sb, node);
            break;
        default:
            break;
        }
    }

    /** Serializes every child of {@code parent} in document order. */
    private static void appendChildren(StringBuilder sb, Node parent, String path) {
        NodeList children = parent.getChildNodes();
        if (children == null) {
            return;
        }
        int len = children.getLength();
        for (int i = 0; i < len; i++) {
            toHTML(sb, children.item(i), path);
        }
    }

    /** Serializes one element: opening tag, filtered attributes, children, closing tag. */
    private static void appendElement(StringBuilder sb, Node node, String path) {
        // Element names are expected in upper case here.
        String originalName = node.getNodeName();
        if (isSkippedElement(originalName)) {
            return;
        }
        String tagName = mapNodeName(originalName);
        sb.append("<").append(tagName);
        // The attribute rules are keyed on the mapped name (e.g. DIV follows the P rules).
        if (!isClearAttrsNode(tagName)) {
            appendAttributes(sb, tagName, node.getAttributes(), path);
        }
        if (!node.hasChildNodes()) {
            // NOTE(review): childless elements are always written self-closed,
            // including non-void ones such as P or SPAN - confirm that the
            // consumer of this markup accepts that form.
            sb.append("/>");
            return;
        }
        sb.append(">");
        appendChildren(sb, node, path);
        sb.append("</").append(tagName).append(">");
    }

    /**
     * Writes the attributes of one element, applying the per-element rules:
     * <ul>
     * <li>{@code A}: the {@code style} attribute is dropped;</li>
     * <li>{@code P}: only {@code id} and {@code style} are kept, and
     * {@code style} is reduced to its {@code text-align} declaration (the
     * attribute is omitted when there is none);</li>
     * <li>{@code IMG}: only attributes listed in {@link #imgAttrs} are kept,
     * {@code src} is prefixed with {@code path}, and {@code style} is reduced to
     * {@code float}, {@code width} and {@code height};</li>
     * <li>any other element keeps all of its attributes.</li>
     * </ul>
     */
    private static void appendAttributes(StringBuilder sb, String tagName,
            NamedNodeMap attrs, String path) {
        if (attrs == null) {
            return;
        }
        boolean anchor = "A".equals(tagName);
        boolean paragraph = "P".equals(tagName);
        boolean image = "IMG".equals(tagName);
        for (int i = 0; i < attrs.getLength(); i++) {
            Node attr = attrs.item(i);
            String key = attr.getNodeName();
            String value = attr.getNodeValue();
            if (value == null) {
                value = "";
            }
            boolean style = "style".equalsIgnoreCase(key);
            if (anchor && style) {
                continue;
            }
            if (paragraph) {
                // Drop the attribute only when it is neither id nor style.
                if (!style && !"id".equalsIgnoreCase(key)) {
                    continue;
                }
                if (style) {
                    value = keepCssProperties(value, P_STYLE_PROPERTIES);
                    if (value.length() == 0) {
                        continue;
                    }
                }
            }
            if (image) {
                if (!isImgAttrs(key)) {
                    continue;
                }
                if ("src".equalsIgnoreCase(key)) {
                    value = path + value;
                }
                if (style) {
                    value = keepCssProperties(value, IMG_STYLE_PROPERTIES);
                }
            }
            sb.append(' ').append(key).append("=\"")
                    .append(escapeAttribute(value)).append('"');
        }
    }

    /**
     * Extracts selected declarations from an inline style value.
     *
     * <p>For each property name the text from its first occurrence up to the
     * next {@code ';'} (or the end of the value when there is none) is copied,
     * followed by a {@code ';'}. Matching is a plain case-sensitive substring
     * search, so a name also matches inside a longer property name.
     *
     * @param style      raw value of a {@code style} attribute
     * @param properties property names to keep, in output order
     * @return the kept declarations, or an empty string if none were found
     */
    private static String keepCssProperties(String style, String[] properties) {
        StringBuilder kept = new StringBuilder();
        for (int i = 0; i < properties.length; i++) {
            int start = style.indexOf(properties[i]);
            if (start == -1) {
                continue;
            }
            int end = style.indexOf(';', start);
            if (end == -1) {
                end = style.length();
            }
            kept.append(style.substring(start, end)).append(';');
        }
        return kept.toString();
    }

    /**
     * Escapes {@code &}, {@code "}, {@code <} and {@code >} so that the value can
     * be placed inside a double-quoted attribute without terminating it.
     */
    private static String escapeAttribute(String value) {
        StringBuilder out = new StringBuilder(value.length());
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            switch (c) {
            case '&':
                out.append("&amp;");
                break;
            case '"':
                out.append("&quot;");
                break;
            case '<':
                out.append("&lt;");
                break;
            case '>':
                out.append("&gt;");
                break;
            default:
                out.append(c);
                break;
            }
        }
        return out.toString();
    }

    /**
     * Writes the value of a text node. When the trimmed text starts with
     * {@code if(navigator}, the MSIE user-agent test prefix and every
     * {@code '}'} are removed from it first.
     */
    private static void appendText(StringBuilder sb, Node node) {
        String value = node.getNodeValue();
        if (value.trim().startsWith("if(navigator")) {
            value = value.replace("if(navigator.userAgent.indexOf('MSIE')<0) {", "").replace("}", "");
        }
        sb.append(value);
    }

    /**
     * Serializes a node to an HTML string.
     *
     * @param node node to serialize; must not be {@code null}
     * @param path prefix prepended to every {@code IMG} {@code src} value;
     *             {@code null} is treated as an empty prefix
     * @return html string
     */
    public static String toHTML(Node node, String path) {
        StringBuilder sb = new StringBuilder();
        toHTML(sb, node, path == null ? "" : path);
        return sb.toString();
    }

    /**
     * Demo entry point: parses a fixed URL and prints the simplified markup.
     * (The original demo alternatively passed the path of a local .htm file.)
     */
    public static void main(String[] args) throws IOException {
        // Parse into a DOM tree.
        Node node = DomUtils.parse("http://www.baidu.com");
        if (node == null) {
            // parse() has already printed the reason.
            System.err.println("Document could not be parsed.");
            return;
        }
        System.out.println(DomUtils.toHTML(node, ""));
    }
}