当前位置: 首页 > 工具软件 > htmlparser2 > 使用案例 >

htmlparser(2)

董谦
2023-12-01
[b]上接 1

else if (node instanceof TextNode) {

stringText = node.toPlainTextString();

if ( "" .equals( title ))

continue ;

stringText = stringText.replaceAll( "[ \t\n\f\r   ]+" , " " );

stringText = TextHtml.html2text (stringText.trim());



if (! "" .equals(stringText)) {

body .append(stringText);

body .append( " " );

}

} else if (node instanceof TagNode) {

TagNode tagNode = (TagNode) node;

String name = ((TagNode) node).getTagName();

if (name.equals( "TITLE" ) && !tagNode.isEndTag()) {

node = lexer.nextNode();

stringText = node.toPlainTextString().trim();

if (! "" .equals(stringText)) {

title = stringText;

}

} else if (name.equals( "META" )) {

String contentCharSet = tagNode.getAttribute( "CONTENT" );

// System.out.println("contentCharset="+contentCharSet);

int b = contentCharSet.toLowerCase().indexOf( "charset" );

if (b > -1) {

String newCharSet = getCharset (contentCharSet);

// System.out.println("newCharSet=" + newCharSet);

if (!charSet.equals(newCharSet)) {

tryAgain = true ;

charSet = newCharSet;

// System.out.println("charSet=" + charSet);

// System.out.println("newCharSet=" + newCharSet);

break ;

}

}

}

}

}



/** 如果在 Meta 信息中检测到新的字符编码,则需要按照 meta 信息中的编码再次解析网页。 **/

if (tryAgain) {

body = new StringBuffer();



try {

uc = (HttpURLConnection) uc.getURL().openConnection();

lexer = new Lexer( new Page(uc.getInputStream(), charSet));

} catch (Exception e) {

e.printStackTrace();

}



lexer.setNodeFactory( new PrototypicalNodeFactory());



while ( null != (node = lexer.nextNode())) {

if (node instanceof TextNode) {

stringText = node.toPlainTextString();

if ( "" .equals( title ))

continue ;

stringText = stringText.replaceAll( "[ \t\n\f\r   ]+" , " " );

stringText = TextHtml.html2text (stringText.trim());

if (! "" .equals(stringText)) {

body .append(stringText);

body .append( " " );

}

}

}

}

}



/**
 * Resolves a charset name to its canonical name.
 *
 * <p>The name is looked up with {@link java.nio.charset.Charset#forName(String)} and the
 * canonical name reported by {@link java.nio.charset.Charset#name()} is returned. When the
 * lookup fails, a diagnostic line is printed to standard output and {@code _default} is
 * returned instead.
 *
 * @param name the charset name to resolve; per the original note, the META tag value after
 *     it has been processed by {@code getCharset}
 * @param _default the charset name to fall back to when {@code name} cannot be resolved
 * @return the canonical name for {@code name}, or {@code _default} if {@code name} is
 *     {@code null}, syntactically illegal, or not supported by this JVM
 */
public static String findCharset(String name, String _default) {
    try {
        // Direct call: no reflection is needed to reach Charset.forName / Charset.name.
        return java.nio.charset.Charset.forName(name).name();
    } catch (IllegalArgumentException iae) {
        // Charset.forName signals a null name with IllegalArgumentException; its subclasses
        // IllegalCharsetNameException and UnsupportedCharsetException cover bad/unknown names.
        System.out.println("unable to determine cannonical charset name for "
                + name + " - using " + _default);
        return _default;
    }
}



未完,接3 [/b]
 类似资料: