
WebCollector crawler framework usage example

庄博厚
2023-12-01

The target site is the Chinese tendering and bidding portal bidcenter.com.cn. The flow: a scheduled task triggers the service layer, which looks up the keywords to monitor for the site; the crawler then fetches that day's listings for those keywords, adds every matching article to a list, and returns the list for saving once the crawl finishes. (A sketch of the scheduled trigger follows the service code at the end.)


package gov.zb.data.webcollector.tender;

import gov.zb.data.entity.monitor.Monitor;
import gov.zb.data.enums.GeneralEnums;
import gov.zb.data.enums.GeneralEnums.WebsiteType;
import gov.zb.data.util.DateUtils;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;


public class BidcenterCralwer extends BreadthCrawler {
    private WebsiteType website_Type;
    private String monitorFrom;
    private List<Monitor> listMonitor = new ArrayList<Monitor>();
    private String nowTime;
    private String[] keys;
    private Monitor monitor;

    public BidcenterCralwer(String crawlPath, boolean autoParse, String[] keys) {
        super(crawlPath, autoParse);
        /* Seed the crawler with the first N result pages for each keyword */
        for (String keyword : keys) { // the site supports global search, so the keyword is spliced directly into the URL
            String eStr = "";
            try {
                eStr = URLEncoder.encode(keyword, "utf-8"); // URL-encode the keyword
            } catch (UnsupportedEncodingException e) {
                System.out.println("Encoding error: " + e);
            }
            for (int i = 1; i <= 1; i++) { // raise the loop bound to crawl more result pages
                this.addSeed(new CrawlDatum("http://search.bidcenter.com.cn/search?keywords=" + eStr + "&type=1&page=" + i).meta("pageNum", i + ""));
            }
        }

        this.addRegex("http://www.bidcenter.com.cn/news-\\d+-1.html"); // article pages
        /* Do not crawl URLs containing '#' (a leading '-' marks a negative rule in WebCollector) */
        this.addRegex("-.*#.*");
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        SimpleDateFormat sDate = new SimpleDateFormat("yyyy-MM-dd");
        nowTime = sDate.format(DateUtils.minus()); // current date minus one day, formatted as yyyy-MM-dd
        try {
            String url = page.getUrl();
            System.out.println(url + "," + page.matchUrl("http://www.bidcenter.com.cn/news-\\d+-1.html"));
            /* Is this an article page? */
            if (page.matchUrl("http://www.bidcenter.com.cn/news-\\d+-1.html")) {
                String time = page.select("table[class=table_content] tr + tr td[width=213] span[class=d2]").first().text(); // article date
                String title = page.select("h1[class=content_title2]").text(); // title
                String content = page.select("div[class=zdynr]").first().text(); // body text
                // keep the first sentence as a summary and strip footnote markers like 【1】
                String Jcontent = content.substring(0, content.indexOf("。") + 1).replaceAll("【\\d】", "");
                for (String keyword : keys) {
                    if ((-1 != title.indexOf(keyword) || -1 != content.indexOf(keyword)) && nowTime.equals(time)) {
                        System.out.println(title + "," + keyword);
                        monitor = new Monitor();
                        monitor.setMonitorUrl(url);           // URL
                        monitor.setMonitorTitle(title);       // title
                        monitor.setMonitorContent(Jcontent);  // summary
                        monitor.setKeyword(keyword);          // matched keyword
                        monitor.setMonitorFrom(monitorFrom);  // source website
                        monitor.setWebsiteType(website_Type); // website type
                        monitor.setCreateDate(new Date());
                        monitor.setDelState(GeneralEnums.DelState.正常); // "normal", i.e. not deleted
                        listMonitor.add(monitor);
                        break;
                    }
                }
            }
        } catch (Exception e) {
            System.out.println(e);
        }
    }

    public List<Monitor> doItCralwer(String[] keywords, BidcenterCralwer crawler, WebsiteType websiteType, String from) {
        keys = keywords;
        website_Type = websiteType; // website type
        monitorFrom = from;         // which website the data comes from
        try {
            if (null != crawler) {
                crawler.setThreads(40); // crawl with 40 threads
                crawler.setTopN(5000);  // cap the number of pages fetched per depth level
                crawler.start(2);       // blocks until a breadth-first crawl of depth 2 finishes
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return this.listMonitor;
    }
}
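
The helper DateUtils.minus() called in visit() comes from the project's own util package and isn't shown. Going by the inline comment (current date minus one day), a minimal sketch of an equivalent, offered as an assumption about its behavior rather than the project's actual code:

import java.util.Calendar;
import java.util.Date;

public class DateUtils {
    // Hypothetical reconstruction: returns the current date minus one day,
    // matching the comment in visit(); the real project helper is not shown.
    public static Date minus() {
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DAY_OF_MONTH, -1);
        return cal.getTime();
    }
}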

    //service layer
    @Transactional(propagation = Propagation.REQUIRED)
    public Boolean BidcenterCralwer() {
        website = websiteService.getById("xxxxxxxxxxxid"); // look up the website record by id
        // split the configured keyword string into an array
        String[] keys = website.getWebsiteKey().split("、");
        // initialize the crawler
        BidcenterCralwer crawler = new BidcenterCralwer("bidcenterCralwer", true, keys);
        List<Monitor> listMonitor = crawler.doItCralwer(keys, crawler, website.getWebsiteType(), website.getWebsiteName());
        if (null != listMonitor && listMonitor.size() > 0) {
            for (Monitor monitor : listMonitor) {
                saveMonitor(monitor);
            }
            return true;
        }
        return false;
    }
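
The scheduled task that triggers this service method is mentioned in the intro but never shown. A minimal sketch, assuming the project uses Spring (the @Transactional annotation above suggests it) with @EnableScheduling configured; the MonitorCrawlJob class, the monitorService bean, and the cron expression are illustrative assumptions, not from the original:

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

//scheduler (hypothetical sketch)
@Component
public class MonitorCrawlJob {

    @Autowired
    private MonitorService monitorService; // hypothetical bean exposing the BidcenterCralwer() service method

    // illustrative schedule: run the crawl once a day at 06:00
    @Scheduled(cron = "0 0 6 * * ?")
    public void crawlBidcenter() {
        Boolean saved = monitorService.BidcenterCralwer();
        System.out.println("bidcenter crawl finished, new data saved: " + saved);
    }
}

Note that crawler.start(2) blocks until the crawl finishes, so the whole fetch-filter-save cycle runs inside this one scheduled invocation.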




