Target site: the China Procurement & Bidding Network (bidcenter.com.cn). The flow: a scheduled task triggers the service layer, which looks up the keywords to monitor for the site; the crawler then fetches that day's articles by keyword, adds the matching ones to a list, and returns the list to be saved (a sketch of the scheduled trigger follows the service code at the end).
package gov.zb.data.webcollector.tender;
import gov.zb.data.entity.monitor.Monitor;
import gov.zb.data.enums.GeneralEnums;
import gov.zb.data.enums.GeneralEnums.WebsiteType;
import gov.zb.data.util.DateUtils;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
public class BidcenterCralwer extends BreadthCrawler {
    private WebsiteType website_Type;
    private String monitorFrom;
    // collected hits; synchronized because visit() runs on many worker threads
    private List<Monitor> listMonitor = Collections.synchronizedList(new ArrayList<Monitor>());
    private String[] keys;
    public BidcenterCralwer(String crawlPath, boolean autoParse, String[] keys) {
        super(crawlPath, autoParse);
        this.keys = keys; // keep the keywords for visit()
        /* seed the first N search-result pages per keyword */
        for (String keyword : keys) { // the site supports global search, so the keyword goes straight into the URL
            String eStr = "";
            try {
                eStr = URLEncoder.encode(keyword, "utf-8"); // URL-encode the keyword
            } catch (UnsupportedEncodingException e) {
                System.out.println("encoding failed = " + e);
            }
            for (int i = 1; i <= 1; i++) { // only page 1 for now; raise the bound to crawl more result pages
                this.addSeed(new CrawlDatum("http://search.bidcenter.com.cn/search?keywords=" + eStr + "&type=1&page=" + i).meta("pageNum", i + ""));
            }
        }
        this.addRegex("http://www.bidcenter.com.cn/news-\\d+-1.html"); // article detail pages
        /* do not crawl URLs containing '#' (the leading '-' marks an exclusion rule) */
        this.addRegex("-.*#.*");
    }
    @Override
    public void visit(Page page, CrawlDatums next) {
        SimpleDateFormat sDate = new SimpleDateFormat("yyyy-MM-dd");
        String nowTime = sDate.format(DateUtils.minus()); // current date minus one day, formatted
        try {
            String url = page.getUrl();
            System.out.println(url + "," + page.matchUrl("http://www.bidcenter.com.cn/news-\\d+-1.html")); // debug output
            /* is this an article detail page? */
            if (page.matchUrl("http://www.bidcenter.com.cn/news-\\d+-1.html")) {
                String time = page.select("table[class=table_content] tr + tr td[width=213] span[class=d2]").first().text(); // publish date
                String title = page.select("h1[class=content_title2]").text(); // title
                String content = page.select("div[class=zdynr]").first().text(); // body text
                // first sentence of the body, with footnote markers like 【1】 stripped
                String summary = content.substring(0, content.indexOf("。") + 1).replaceAll("【\\d】", "");
                for (String keyword : keys) {
                    if ((title.contains(keyword) || content.contains(keyword)) && nowTime.equals(time)) {
                        System.out.println(title + "," + keyword);
                        Monitor monitor = new Monitor();
                        monitor.setMonitorUrl(url); // url
                        monitor.setMonitorTitle(title); // title
                        monitor.setMonitorContent(summary); // short description
                        monitor.setKeyword(keyword); // matched keyword
                        monitor.setMonitorFrom(monitorFrom); // data source
                        monitor.setWebsiteType(website_Type);
                        monitor.setCreateDate(new Date());
                        monitor.setDelState(GeneralEnums.DelState.正常); // 正常 = normal / not deleted
                        listMonitor.add(monitor);
                        break;
                    }
                }
            }
        } catch (Exception e) {
            System.out.println(e);
        }
    }
    public List<Monitor> doItCralwer(String[] keywords, BidcenterCralwer crawler, WebsiteType websiteType, String from) {
        keys = keywords;
        website_Type = websiteType; // website type
        monitorFrom = from; // which site the data comes from
        try {
            if (null != crawler) {
                crawler.setThreads(40);
                crawler.setTopN(5000);
                crawler.start(2); // crawl to depth 2
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return this.listMonitor;
    }
}
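The class depends on a project-internal DateUtils.minus(), which per the comment yields the current date minus one day. A minimal sketch of such a helper, assuming exactly that behavior (the project's real version may differ):

import java.util.Calendar;
import java.util.Date;

public class DateUtils {
    // hypothetical stand-in: returns the current date minus one day
    public static Date minus() {
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DAY_OF_MONTH, -1);
        return cal.getTime();
    }
}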
// Service layer: loads the site's keywords, runs the crawler, and saves any hits.
@Transactional(propagation = Propagation.REQUIRED)
public Boolean BidcenterCralwer() {
    website = websiteService.getById("xxxxxxxxxxxid"); // look up the site record by id
    // split the stored keyword string ("、"-separated) into an array
    String[] keys = website.getWebsiteKey().split("、");
    // initialize the crawler (crawl path, auto-parse, keywords)
    BidcenterCralwer crawler = new BidcenterCralwer("bidcenterCralwer", true, keys);
    List<Monitor> listMonitor = crawler.doItCralwer(keys, crawler, website.getWebsiteType(), website.getWebsiteName());
    if (null != listMonitor && listMonitor.size() > 0) {
        for (Monitor monitor : listMonitor) {
            saveMonitor(monitor);
        }
        return true;
    }
    return false;
}
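As described at the top, this service method is kicked off by a scheduled task. The trigger itself isn't shown in the original; a minimal sketch using Spring's @Scheduled, assuming a Spring scheduler drives the job (MonitorService and the cron expression are illustrative):

import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

@Component
public class BidcenterCrawlerJob {

    private final MonitorService monitorService; // hypothetical service owning the BidcenterCralwer() method above

    public BidcenterCrawlerJob(MonitorService monitorService) {
        this.monitorService = monitorService;
    }

    // run once a day at 06:00; adjust the cron to taste
    @Scheduled(cron = "0 0 6 * * ?")
    public void run() {
        monitorService.BidcenterCralwer();
    }
}

This assumes @EnableScheduling is set on a configuration class somewhere in the project.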