当前位置: 首页 > 工具软件 > Ex-Crawler > 使用案例 >

对爬虫技术的理解--Crawler

简宏义
2023-12-01
爬虫的基本思路如下:
根据URL获取相应页面的HTML代码
利用正则匹配或者Jsoup等库解析html代码,提取需要的内容
将获取的内容持久化到数据库中
处理好中文字符的编码问题,可以采用多线程提高效率
测试通过的demo:
实体bean:
/**
 * Per-crawler statistics bean: number of pages processed, number of
 * outgoing links discovered, and total size (in bytes) of extracted text.
 *
 * <p>Not thread-safe; intended to be owned by a single crawler instance
 * and read only after the crawl has finished.
 */
public class CrawlStat {

    // --- total links discovered ---
    private long totalLinks;

    // --- pages successfully processed ---
    private int totalProcessedPages;

    // --- cumulative extracted text size in bytes ---
    private long totalTextSize;

    /** @return total number of outgoing links seen so far */
    public long getTotalLinks() {
        return totalLinks;
    }

    /** Adds {@code count} newly discovered links to the running total. */
    public void incTotalLinks(int count) {
        totalLinks = totalLinks + count;
    }

    public void setTotalLinks(long totalLinks) {
        this.totalLinks = totalLinks;
    }

    /** @return number of pages processed so far */
    public int getTotalProcessedPages() {
        return totalProcessedPages;
    }

    /** Counts one more processed page. */
    public void incProcessedPages() {
        totalProcessedPages = totalProcessedPages + 1;
    }

    public void setTotalProcessedPages(int totalProcessedPages) {
        this.totalProcessedPages = totalProcessedPages;
    }

    /** @return cumulative extracted-text size in bytes */
    public long getTotalTextSize() {
        return totalTextSize;
    }

    /** Adds {@code count} bytes of extracted text to the running total. */
    public void incTotalTextSize(int count) {
        totalTextSize = totalTextSize + count;
    }

    public void setTotalTextSize(long totalTextSize) {
        this.totalTextSize = totalTextSize;
    }
}
具体的实现方法:
public class MyCrawlers extends WebCrawler{
/** 爬取数据保存文件路径 */
private final static String CVS_PATH = "data/crawl/ziroom.csv";
/** 爬取匹配原则 */
private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g|ico"
+ "|png|tiff?|mid|mp2|mp3|mp4" + "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");
private final static String LINK_PATH = "data/crawl/link.csv";//爬取link文件路径
private static final Logger logger = LoggerFactory.getLogger(MyCrawler.class);
private final static String URL_PREFIX = "http://sh.ziroom.com/z/nl";
private final File csv;
private final File csv2;
private CsvWriter cw;
private CsvWriter cw2;
CrawlStat myCrawlStat;
public MyCrawler()throws Exception{
myCrawlStat = new CrawlStat();
csv = new File(CVS_PATH);
csv2 = new File(LINK_PATH);
if(csv.isFile()){
csv.delete();
}
if(csv2.isFile()){
csv2.delete();
}
cw = new CsvWriter(CVS_PATH,',', Charset.forName("GBK"));
String[] headerCw = {"图片","价格","地址","说明"};
cw.writeRecord(headerCw);
cw.close();
}
public void dumpMyData(){
final int id = getMyId();
logger.info("Crawler {} > Processed Pages: {}",id,myCrawlStat.getTotalProcessedPages());
logger.info("Crawler {} > Total Links Found: {}",id,myCrawlStat.getTotalLinks());
logger.info("Crawler {} > Total Text Size: {}",id,myCrawlStat.getTotalTextSize());
}
@Override
public Object getMyLocalData(){
return myCrawlStat;
}
@Override
public void onBeforeExit(){
dumpMyData();
}
public boolean shouldVisit(Page referringPage, WebURL url){
final String href = url.getURL().toLowerCase();
if(FILTERS.matcher(href).matches() ||!href.startsWith(URL_PREFIX)){
return false;
}
return true;
}
@Override
public void visit(Page page){

final String url = page.getWebURL().getURL();
logger.info("====爬取路径===="+url);
myCrawlStat.incProcessedPages();
if(page.getParseData() instanceof HtmlParseData){
try{
final HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
final List<WebURL> links = htmlParseData.getOutgoingUrls();
// linkToCsv(links);
myCrawlStat.incTotalLinks(links.size());
myCrawlStat.incTotalTextSize(htmlParseData.getText().getBytes("GBK").length);
final String html = htmlParseData.getHtml();
final Document doc = Jsoup.parse(html);
final Elements contents = doc.select("li[class=clearfix]");
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csv, true), "GBK"), 1024);
cw = new CsvWriter(out, ',');
for(final Element c : contents) {
final String img = c.select(".img img").first().attr("src");//图片
//地址
final Element txt = c.select("div[class=txt]").first();
final String arr1 = txt.select("h3 a").first().text();
final String arr2 = txt.select("h4 a").first().text();
final String arr3 = txt.select("div[class=detail]").first().text();
final String arr = arr1.concat(arr1 + ",").concat(arr2 + ",").concat(arr3);
final String rank = txt.select("p").first().text();//说明
final String pirce = c.select("p[class=price]").first().text();//价格
System.out.println("返回数据:" + img+"|"+arr+"|"+rank+"|"+pirce);
String[] content = {img,pirce,arr,rank};
/* cw.write(img);
cw.write(pirce);
cw.write(new String(arr.getBytes("utf-8"),"GBK"));
cw.write(rank);*/
cw.writeRecord(content);
}
cw.flush();
cw.close();
}catch (Exception ex){
ex.printStackTrace();}
}
}
private void linkToCsv(List<WebURL> links)throws Exception{
cw2 = new CsvWriter(LINK_PATH,',', Charset.forName("GBK"));
String[] header = {"请求路径"};
cw2.writeRecord(header);
for(final WebURL webURL:links){
String[]contentUrl = {webURL.getURL()};
cw2.writeRecord(contentUrl);
}
cw2.close();
}
}
测试的main函数:
/**
 * Entry point: configures crawler4j, launches the crawl with two crawler
 * threads, then aggregates and prints the per-crawler statistics.
 */
public static void main(String[] args) {
    final String crawlStorageFolder = "data/crawl/root";
    final int numberOfCrawlers = 2;
    final CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    // Be polite: wait 1 second between requests to the same host.
    config.setPolitenessDelay(1000);
    config.setIncludeBinaryContentInCrawling(false);
    config.setMaxPagesToFetch(50);
    final PageFetcher pageFetcher = new PageFetcher(config);
    final RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    final RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    try {
        final CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
        controller.addSeed("http://johnhany.net/");
        // Blocks until all crawler threads have finished.
        controller.start(MyCrawler.class, numberOfCrawlers);
        final List<Object> crawlersLocalData = controller.getCrawlersLocalData();
        long totalLinks = 0;
        long totalTextSize = 0;
        int totalProcessedPages = 0;
        for (final Object localData : crawlersLocalData) {
            final CrawlStat stat = (CrawlStat) localData;
            totalLinks += stat.getTotalLinks();
            totalTextSize += stat.getTotalTextSize();
            totalProcessedPages += stat.getTotalProcessedPages();
        }
        System.out.println("Aggregated Statistics:");
        // BUG FIX: the original mixed SLF4J "{}" placeholders into plain
        // System.out concatenation, printing literal "{}" markers (and used
        // print without a newline on the first line).
        System.out.println("\tProcessed Pages: " + totalProcessedPages);
        System.out.println("\tTotal Links found: " + totalLinks);
        System.out.println("\tTotal Text Size: " + totalTextSize);
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
 类似资料: