htmlparser(2)
董谦
2023-12-01
[b]上接 1
else if (node instanceof TextNode) {
stringText = node.toPlainTextString();
if ( "" .equals( title ))
continue ;
stringText = stringText.replaceAll( "[ \t\n\f\r ]+" , " " );
stringText = TextHtml.html2text (stringText.trim());
if (! "" .equals(stringText)) {
body .append(stringText);
body .append( " " );
}
} else if (node instanceof TagNode) {
TagNode tagNode = (TagNode) node;
String name = ((TagNode) node).getTagName();
if (name.equals( "TITLE" ) && !tagNode.isEndTag()) {
node = lexer.nextNode();
stringText = node.toPlainTextString().trim();
if (! "" .equals(stringText)) {
title = stringText;
}
} else if (name.equals( "META" )) {
String contentCharSet = tagNode.getAttribute( "CONTENT" );
// System.out.println("contentCharset="+contentCharSet);
int b = contentCharSet.toLowerCase().indexOf( "charset" );
if (b > -1) {
String newCharSet = getCharset (contentCharSet);
// System.out.println("newCharSet=" + newCharSet);
if (!charSet.equals(newCharSet)) {
tryAgain = true ;
charSet = newCharSet;
// System.out.println("charSet=" + charSet);
// System.out.println("newCharSet=" + newCharSet);
break ;
}
}
}
}
}
/** 如果在 Meta 信息中检测到新的字符编码,则需要按照 meta 信息中的编码再次解析网页。 **/
if (tryAgain) {
body = new StringBuffer();
try {
uc = (HttpURLConnection) uc.getURL().openConnection();
lexer = new Lexer( new Page(uc.getInputStream(), charSet));
} catch (Exception e) {
e.printStackTrace();
}
lexer.setNodeFactory( new PrototypicalNodeFactory());
while ( null != (node = lexer.nextNode())) {
if (node instanceof TextNode) {
stringText = node.toPlainTextString();
if ( "" .equals( title ))
continue ;
stringText = stringText.replaceAll( "[ \t\n\f\r ]+" , " " );
stringText = TextHtml.html2text (stringText.trim());
if (! "" .equals(stringText)) {
body .append(stringText);
body .append( " " );
}
}
}
}
}
/**
 * Resolves a charset label to the canonical charset name known to the JVM.
 *
 * <p>The label is looked up with
 * {@link java.nio.charset.Charset#forName(String)}, so aliases are mapped to
 * their canonical name (for example {@code "latin1"} becomes
 * {@code "ISO-8859-1"}). When the lookup fails, a diagnostic line is written
 * to standard output and the supplied default is returned instead.
 *
 * @param name the charset label to resolve; according to the original note
 *     this is the META tag value after processing by {@code getCharset}
 * @param _default the charset name to fall back on when {@code name} is
 *     {@code null}, syntactically illegal, or not supported by this JVM
 * @return the canonical name of the charset identified by {@code name}, or
 *     {@code _default} when the label cannot be resolved
 */
public static String findCharset(String name, String _default) {
    try {
        // Direct call instead of reflection: java.nio.charset.Charset is
        // already referenced statically, so it is always on the class path.
        return java.nio.charset.Charset.forName(name).name();
    } catch (IllegalArgumentException e) {
        // Covers IllegalCharsetNameException, UnsupportedCharsetException
        // and the plain IllegalArgumentException thrown for a null name.
        System.out.println("unable to determine cannonical charset name for "
                + name + " - using " + _default);
        return _default;
    }
}
未完,接3 [/b]