KeyWord: HotSAX Java解析html
补充:原来不需要HotSAX也可以解析,真是蛋疼,罪过,罪过...
HotSAX对中文文章的支持较差
以下源码需要HotSAX的支持。HotSAX是GPL协议。
下载HotSAX: http://hotsax.sourceforge.net/
下载的包是源码,没有打过包,图方便的话把HotSAX整个目录复制到你的工程下。另外HotSAX又依赖于hotsax.jar,该文件位于下载文件的lib目录中
以下是源码,其作用是解析一串html字符串,并且查找其中的指定文本,并将包含这些文本的节点路径输出。
package t1;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
/**
 * Demo that parses an HTML string with SAX and prints the element path of
 * every text node containing a given keyword.
 * <p>
 * The demo was first written against the HotSAX parser
 * (http://hotsax.sourceforge.net/). It now relies on the default
 * {@link XMLReader}, so HotSAX is not needed to run it.
 *
 * @author TaoPeng
 */
public class HtmlParserDemo {

    /**
     * Runs the demo on a small hard-coded document. Any failure is reported
     * by printing its stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String keyWord = "中国";
        final String html = "<html><head><title>|中国|</title></head><body><div id=\"firstDiv\">中国</div></body></html>";
        try {
            HtmlParserDemo demo = new HtmlParserDemo();
            demo.test1(html, keyWord);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Searches {@code html} for {@code keyWord} and prints, one per line, the
     * path of each element whose text contains it.
     *
     * @param html    the markup to parse
     * @param keyWord the text to look for
     * @throws IOException  if reading the input fails
     * @throws SAXException if the reader cannot be created or parsing fails
     */
    public void test1(String html, String keyWord) throws IOException, SAXException {
        MyContentHandler handler = new MyContentHandler();
        handler.setKeyword(keyWord);

        // The no-argument factory call picks the default reader. To parse with
        // HotSAX instead, pass "hotsax.html.sax.SaxParser" to createXMLReader.
        XMLReader reader = XMLReaderFactory.createXMLReader();
        reader.setContentHandler(handler);
        reader.parse(new InputSource(new StringReader(html)));

        List<String> paths = handler.getTagPath();
        for (int i = 0; i < paths.size(); i++) {
            System.out.println(paths.get(i));
        }
    }
}
/**
 * SAX {@link ContentHandler} that records the element path of every chunk of
 * character data containing a configured keyword.
 * <p>
 * A path lists the open elements from outermost to innermost, joined by
 * {@code " > "}. An element that has a non-empty {@code id} attribute is
 * rendered as {@code name(#id)}, e.g. {@code html > body > div(#firstDiv)}.
 * <p>
 * NOTE(review): each {@code characters} call is examined on its own. The SAX
 * API allows a parser to deliver one run of text in several calls, so a
 * keyword that straddles two calls is not detected.
 */
class MyContentHandler implements ContentHandler {

    /** Keyword to search for; while {@code null}, nothing is matched. */
    private String keyword;

    /** Paths collected so far, in the order the matching text was reported. */
    private final List<String> tagPath = new ArrayList<String>(10);

    /** Labels of the currently open elements, outermost first. */
    private final Stack<String> tagStack = new Stack<String>();

    /**
     * @return the keyword being searched for, possibly {@code null}
     */
    public String getKeyword() {
        return keyword;
    }

    /**
     * @param keyword the text to look for in character data
     */
    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    /**
     * @return the live list of paths recorded so far (not a copy)
     */
    public List<String> getTagPath() {
        return tagPath;
    }

    @Override
    public void setDocumentLocator(Locator locator) {
        // Location information is not used.
    }

    @Override
    public void startDocument() throws SAXException {
        // Nothing to initialise.
    }

    @Override
    public void endDocument() throws SAXException {
        // Nothing to finalise.
    }

    @Override
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
        // Namespace prefix mappings are not used.
    }

    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
        // Namespace prefix mappings are not used.
    }

    /**
     * Pushes a label for the opened element onto the element stack.
     *
     * @param uri       namespace URI (unused)
     * @param localName local element name; may be empty when the reader does
     *                  not perform namespace processing
     * @param qName     qualified element name, used when {@code localName}
     *                  is empty
     * @param atts      the element's attributes; only {@code id} is read
     */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes atts) throws SAXException {
        // Readers without namespace processing report an empty localName;
        // fall back to qName so the path still names the element.
        String tag = (localName != null && localName.length() > 0) ? localName : qName;
        String id = (atts == null) ? null : atts.getValue("id");
        if (id != null && id.length() > 0) {
            tag = tag + "(#" + id + ")";
        }
        tagStack.push(tag);
    }

    /**
     * Pops the label of the element that was just closed.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        tagStack.pop();
    }

    /**
     * Records the current element path when the reported text contains the
     * keyword.
     *
     * @param ch     buffer holding the text; only the slice described by
     *               {@code start} and {@code length} belongs to this event
     * @param start  index of the first character of the slice
     * @param length number of characters in the slice
     */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        if (keyword == null || length <= 0) {
            return;
        }
        // Only the [start, start + length) slice is this event's text; the
        // rest of the buffer may hold unrelated parts of the document.
        String text = new String(ch, start, length);
        if (text.indexOf(keyword) < 0) {
            return;
        }
        StringBuilder path = new StringBuilder();
        for (String tag : tagStack) {
            if (path.length() > 0) {
                path.append(" > ");
            }
            path.append(tag);
        }
        tagPath.add(path.toString());
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        // Ignorable whitespace cannot contain the keyword of interest.
    }

    @Override
    public void processingInstruction(String target, String data)
            throws SAXException {
        // Processing instructions are not searched.
    }

    @Override
    public void skippedEntity(String name) throws SAXException {
        // Skipped entities are not searched.
    }
}