https://github.com/CrawlScript/WebCollector
A Java crawler. The code below is based on WebCollector and can crawl data that is rendered by JavaScript; note that sites with anti-scraping protection in place may still return no data.
Maven dependencies (pom.xml). The exclusion drops the htmlunit-core-js that selenium-java 3.4.0 would pull in and replaces it with the newer 2.33 release of HtmlUnit's JavaScript engine:
<dependency>
    <groupId>cn.edu.hfut.dmic.webcollector</groupId>
    <artifactId>WebCollector</artifactId>
    <version>2.73-alpha</version>
</dependency>
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-java</artifactId>
    <version>3.4.0</version>
    <exclusions>
        <exclusion>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit-core-js</artifactId>
        </exclusion>
    </exclusions>
</dependency>
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit-core-js</artifactId>
    <version>2.33</version>
</dependency>
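To verify that only the 2.33 engine ends up on the classpath, the Maven dependency tree can be filtered for it (a quick check, not required for the crawl itself):

mvn dependency:tree -Dincludes=net.sourceforge.htmlunit:htmlunit-core-js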
Code:
package com.marketing.test;

import java.util.List;

import cn.edu.hfut.dmic.webcollector.crawldb.DBManager;
import cn.edu.hfut.dmic.webcollector.crawler.Crawler;
import cn.edu.hfut.dmic.webcollector.fetcher.Executor;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.RocksDBManager;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
/**
 * This demo shows how to use WebCollector with Selenium's HtmlUnitDriver
 * to crawl data generated by JavaScript.
 *
 * @author hu
 */
public class DemoSeleniumCrawler {

    static {
        // Silence HtmlUnit's verbose log output
        Logger logger = Logger.getLogger("com.gargoylesoftware.htmlunit");
        logger.setLevel(Level.OFF);
    }

    public static void main(String[] args) throws Exception {
        Executor executor = new Executor() {
            @Override
            public void execute(CrawlDatum datum, CrawlDatums next) throws Exception {
                HtmlUnitDriver driver = new HtmlUnitDriver();
                // Enable HtmlUnit's JavaScript engine so dynamically generated content is rendered
                driver.setJavascriptEnabled(true);
                try {
                    driver.get(datum.url());
                    // Print the page title to confirm the page loaded
                    System.out.println(driver.getTitle());
                    // "#detail" targets the element with id="detail"; a bare "detail" would be a
                    // tag selector. Adjust the selector to the structure of the target page.
                    List<WebElement> elementList = driver.findElementsByCssSelector("#detail");
                    System.out.println("==============" + elementList.size());
                    for (WebElement element : elementList) {
                        System.out.println("title:" + element.getText());
                    }
                } finally {
                    // Always release the browser session
                    driver.quit();
                }
            }
        };

        // Create a RocksDB-backed DBManager
        DBManager manager = new RocksDBManager("crawl");
        // A Crawler requires a DBManager and an Executor
        Crawler crawler = new Crawler(manager, executor);
        crawler.addSeed("https://item.jd.com/100008348542.html");
        // Crawl to depth 1 (the seed only)
        crawler.start(1);
    }
}
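HtmlUnit's JavaScript engine cannot render every modern page, which is part of why protected sites return no data. A rough alternative is to drive headless Chrome through the same Executor interface; selenium-java already bundles the Chrome bindings. A minimal sketch, assuming a locally installed chromedriver binary (the path, the DemoChromeCrawler class name, the "crawl_chrome" table name, and the ".detail" selector are all placeholders, not from the original code):

package com.marketing.test;

import java.util.List;

import cn.edu.hfut.dmic.webcollector.crawldb.DBManager;
import cn.edu.hfut.dmic.webcollector.crawler.Crawler;
import cn.edu.hfut.dmic.webcollector.fetcher.Executor;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.RocksDBManager;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

public class DemoChromeCrawler {

    public static void main(String[] args) throws Exception {
        // Placeholder path; point this at a locally installed chromedriver binary
        System.setProperty("webdriver.chrome.driver", "/path/to/chromedriver");

        Executor executor = new Executor() {
            @Override
            public void execute(CrawlDatum datum, CrawlDatums next) throws Exception {
                ChromeOptions options = new ChromeOptions();
                // Run Chrome without a visible window
                options.addArguments("--headless", "--disable-gpu");
                WebDriver driver = new ChromeDriver(options);
                try {
                    driver.get(datum.url());
                    // Illustrative selector; adjust to the target page's structure
                    List<WebElement> elements = driver.findElements(By.cssSelector(".detail"));
                    for (WebElement element : elements) {
                        System.out.println("title:" + element.getText());
                    }
                } finally {
                    // Always release the browser session
                    driver.quit();
                }
            }
        };

        DBManager manager = new RocksDBManager("crawl_chrome");
        Crawler crawler = new Crawler(manager, executor);
        crawler.addSeed("https://item.jd.com/100008348542.html");
        crawler.start(1);
    }
}

Headless mode needs Chrome 59 or newer. Sites that fingerprint automation may still block this, but Chrome handles far more JavaScript than HtmlUnit does.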