Implement a webpage crawler to crawl the webpages of http://www.wikipedia.org/.
To simplify the question, let's use the url in place of the webpage content.
Your crawler should call HtmlHelper.parseUrls(url) to get all urls from the webpage at a given url,
and it should only crawl pages hosted on wikipedia.org, visiting each url at most once.
Example 1
Input:
"http://www.wikipedia.org/": ["http://www.wikipedia.org/help/"]
"http://www.wikipedia.org/help/": []
Output: ["http://www.wikipedia.org/", "http://www.wikipedia.org/help/"]
Approach: it's just BFS with a queue, except it has to be written with multiple threads; I copied this answer.
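Before the multi-threaded version below, here is a minimal single-threaded BFS sketch of the same idea (a queue plus a visited set). It assumes only the HtmlHelper.parseUrls helper that the problem provides; the method name crawlerSingleThread and the host check are illustrative.

// Minimal single-threaded BFS sketch; assumes the provided HtmlHelper.parseUrls
// plus imports of java.util.* and java.net.*.
public List<String> crawlerSingleThread(String url) {
    List<String> result = new ArrayList<>();
    Set<String> visited = new HashSet<>();
    Queue<String> queue = new LinkedList<>();
    visited.add(url);
    queue.offer(url);
    while (!queue.isEmpty()) {
        String cur = queue.poll();
        result.add(cur);
        for (String next : HtmlHelper.parseUrls(cur)) {
            try {
                // Only follow links that stay on wikipedia.org.
                if (!new URL(next).getHost().endsWith("wikipedia.org")) {
                    continue;
                }
            } catch (MalformedURLException e) {
                continue;
            }
            if (visited.add(next)) {  // add() returns false if the url was already seen
                queue.offer(next);
            }
        }
    }
    return result;
}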
/**
 * public class HtmlHelper {
 *     public static List<String> parseUrls(String url);
 *     // Get all urls from a webpage of given url.
 * }
 */
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.*;
import java.util.concurrent.locks.*;
import java.net.*;
public class Solution {
    // Fixed-size pool of worker threads that fetch pages in parallel.
    ExecutorService pool = Executors.newFixedThreadPool(4);
    // Number of crawl tasks that have been submitted but not yet finished.
    AtomicLong numTasks = new AtomicLong(0);
    // Protects result and visited, which are not thread-safe on their own.
    Lock lock = new ReentrantLock();
    List<String> result = new ArrayList<String>();
    Set<String> visited = new HashSet<>();

    private class CrawlTask implements Runnable {
        String url;

        public CrawlTask(String url) {
            this.url = url;
        }

        @Override
        public void run() {
            try {
                for (String neighbor : HtmlHelper.parseUrls(url)) {
                    URL neighborURL = new URL(neighbor);
                    // Only crawl pages that stay on wikipedia.org.
                    if (!neighborURL.getHost().endsWith("wikipedia.org")) {
                        continue;
                    }
                    lock.lock();
                    try {
                        if (!visited.contains(neighbor)) {
                            visited.add(neighbor);
                            result.add(neighbor);
                            // Count the task before submitting it, so the main
                            // thread never sees the counter drop to 0 too early.
                            numTasks.incrementAndGet();
                            pool.execute(new CrawlTask(neighbor));
                        }
                    } finally {
                        lock.unlock();
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // This task is done; let the main thread's wait loop make progress.
                numTasks.decrementAndGet();
            }
        }
    }

    /**
     * @param url: a url of root page
     * @return: all urls
     */
    public List<String> crawler(String url) {
        visited.add(url);
        result.add(url);
        numTasks.incrementAndGet();
        pool.execute(new CrawlTask(url));
        try {
            // Busy-wait until every submitted task has finished.
            while (numTasks.get() != 0) {
                Thread.sleep(10);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        pool.shutdown();
        return result;
    }
}
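The AtomicLong counter plus the Thread.sleep polling loop acts as a simple barrier; a CountDownLatch would not fit here because the total number of tasks is not known up front. A hedged usage sketch, assuming one crawl per Solution instance since crawler() shuts the pool down at the end:

// Hypothetical usage: the root url is the one from the problem statement.
List<String> allUrls = new Solution().crawler("http://www.wikipedia.org/");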