当前位置: 首页 > 工具软件 > NekoHTML > 使用案例 >

Java HttpClient 读取 HTML 文件：HttpClient + NekoHTML 解析 HTML

白坚壁
2023-12-01

httpclient+nekohtml 解析HTML

import org.cyberneko.html.parsers.DOMParser;

import org.xml.sax.InputSource;

import org.xml.sax.SAXException;

import org.w3c.dom.Document;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.HttpException;

import org.apache.commons.httpclient.HttpStatus;

import org.apache.commons.httpclient.methods.GetMethod;

import java.io.InputStream;

import java.io.IOException;

/**
 * Downloads an HTML page over HTTP with Commons HttpClient and parses it into
 * a W3C DOM {@link Document} using the NekoHTML {@link DOMParser}.
 *
 * <p>Instances hold only configuration (timeouts and an optional proxy). A new
 * {@link HttpClient} is created for every {@link #getDocument(String)} call.
 */
public class Html2XML {

    /** Connection-establishment timeout handed to the connection manager. */
    private int connectionTimeout = 5000;

    /** Socket read timeout handed to the connection manager. */
    private int soTimeout = 12000;

    /** Proxy host name; {@code null} means no proxy is configured. */
    private String proxyHost = null;

    /** Proxy port; only used when {@link #proxyHost} is non-null. */
    private int proxyPort;

    /**
     * Configures the HTTP proxy used by subsequent {@link #getDocument(String)} calls.
     *
     * @param proxyHost proxy host name, or {@code null} to connect directly
     * @param proxyPort proxy port; ignored when {@code proxyHost} is {@code null}
     */
    public void setProxy(String proxyHost, int proxyPort) {
        this.proxyHost = proxyHost;
        this.proxyPort = proxyPort;
    }

    /**
     * Sets the connection timeout passed to the connection manager.
     *
     * @param connectionTimeout value forwarded to {@code setConnectionTimeout}
     */
    public void setConnectionTimeout(int connectionTimeout) {
        this.connectionTimeout = connectionTimeout;
    }

    /**
     * Sets the socket timeout passed to the connection manager.
     *
     * @param soTimeout value forwarded to {@code setSoTimeout}
     */
    public void setSoTimeout(int soTimeout) {
        this.soTimeout = soTimeout;
    }

    /**
     * Fetches {@code url} with an HTTP GET and parses the response body as HTML.
     *
     * <p>Any failure (non-200 status, I/O error, parse error, or a response without a
     * body) is reported by printing the stack trace where applicable and returning
     * {@code null}; no checked exception escapes this method.
     *
     * @param url the address of the page to download
     * @return the parsed DOM document, or {@code null} if the page could not be
     *         fetched or parsed
     */
    public Document getDocument(String url) {
        HttpClient client = new HttpClient();
        if (proxyHost != null) {
            client.getHostConfiguration().setProxy(proxyHost, proxyPort);
        }
        client.getHttpConnectionManager().getParams().setConnectionTimeout(connectionTimeout);
        client.getHttpConnectionManager().getParams().setSoTimeout(soTimeout);

        GetMethod method = new GetMethod(url);
        // NOTE(review): Content-Type describes a request body, which a GET does not carry;
        // an Accept / Accept-Charset header may have been intended - confirm before changing.
        method.addRequestHeader("Content-Type", "text/html; charset=utf-8");

        try {
            int statusCode = client.executeMethod(method);
            if (statusCode != HttpStatus.SC_OK) {
                throw new HttpException("HttpStatusCode : " + statusCode);
            }

            InputStream is = method.getResponseBodyAsStream();
            if (is == null) {
                // No response body available: nothing to parse, report failure like other errors.
                return null;
            }

            DOMParser parser = new DOMParser();
            // Encoding the parser falls back to when the document does not declare one.
            parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "utf-8");
            parser.parse(new InputSource(is));
            return parser.getDocument();
        } catch (HttpException he) {
            he.printStackTrace();
        } catch (IOException ie) {
            ie.printStackTrace();
        } catch (SAXException se) {
            se.printStackTrace();
        } finally {
            // Always hand the connection back, on success and on every failure path;
            // the DOM is fully built by the time parse() returns, so this is safe.
            method.releaseConnection();
        }
        return null;
    }
}

 类似资料: