Java Crawler(1)HTMLUnit
祁均
2023-12-01
Java Crawler(1)HTMLUnit
Add a few dependencies to pom.xml
+ <dependency>
+ <groupId>net.sourceforge.htmlunit</groupId>
+ <artifactId>htmlunit</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>net.sourceforge.htmlunit</groupId>
+ <artifactId>htmlunit-core-js</artifactId>
+ <version>2.27</version>
+ </dependency>
+ <dependency>
+ <groupId>net.sourceforge.htmlunit</groupId>
+ <artifactId>neko-htmlunit</artifactId>
+ <version>2.27</version>
+ </dependency>
+ <dependency>
+ <groupId>org.w3c.css</groupId>
+ <artifactId>sac</artifactId>
+ <version>1.3</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>xalan</groupId>
+ <artifactId>xalan</artifactId>
+ <version>2.7.2</version>
+ </dependency>
+ <dependency>
+ <groupId>net.sourceforge.cssparser</groupId>
+ <artifactId>cssparser</artifactId>
+ <version>0.9.23</version>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty.websocket</groupId>
+ <artifactId>websocket-client</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>xerces</groupId>
+ <artifactId>xercesImpl</artifactId>
+ <version>2.11.0</version>
+ </dependency>
The simplest test class: CrawlerTest.java
package com.sillycat.jobsmonitorapi.service;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.List;
import org.junit.Test;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.DomNode;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSpan;
public class CrawlerTest {
@Test
public void testCrawlWalmart() throws FailingHttpStatusCodeException, MalformedURLException, IOException {
try (final WebClient webClient = new WebClient(BrowserVersion.INTERNET_EXPLORER)) {
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.getOptions().setRedirectEnabled(true);
webClient.getOptions().setTimeout(30000);
webClient.setJavaScriptTimeout(30000);
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
HtmlPage page = webClient.getPage("https://www.walmart.com/search/?grid=false&page=2&query=computer#searchProductResult");
webClient.waitForBackgroundJavaScript(10000);
//String htmlContent = page.asXml();
//File htmlFile = new File("/Users/carl/Downloads/products.html");
//PrintWriter pw = new PrintWriter(htmlFile);
//pw.print(htmlContent);
//pw.close();
HtmlDivision div = page.getFirstByXPath("//div[@class='search-result-listview-items']");
Iterator<DomElement> itDivs = div.getChildElements().iterator();
int count = 0;
for(;itDivs.hasNext();){
count++;
DomElement helloDiv = itDivs.next();
//helloDiv.click();
System.out.println("Print==== " + count + " " + helloDiv.asText());
}
//List<?> links = page.getByXPath("//a[@class='product-title-link']");
//for(int i = 0;i<links.size();i++){
//System.out.println("Link===========" + (i+1) + " " + links.get(i));
//}
//List<HtmlAnchor> anchors = page.getAnchors();
//int count = 0;
//for(int i = 0;i<anchors.size();i++){
//HtmlAnchor anchor = anchors.get(i);
//String url = anchor.getHrefAttribute();
//if(url.startsWith("/ip")){
//count++;
//System.out.println("URL ============== " + count + " " + url);
//}
//}
// Detail Page
//HtmlPage page = webClient.getPage("https://www.walmart.com/ip/HP-15-ay041wm-15-6-Silver-Fusion-Laptop-Touch-Screen-Windows-10-Intel-Core-i3-6100U-Processor-8GB-Memory-1TB-Hard-Drive/51397784");
//HtmlSpan priceSpan = page.getFirstByXPath("//span[@class='Price-characteristic']");
//System.out.println("Price========" + priceSpan.getTextContent());
}
}
}
References: