搜索引擎-heritrix
卢子民
2023-12-01
有哪位哥们知道为什么我用Heritrix抓取页面(在一个网站中),却只有一个线程工作.
package com.lantao.bookuu.frontierschedule;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.frontier.QueueAssignmentPolicy;
public class MyQueueAssignmentPolicy extends QueueAssignmentPolicy {
@Override
public String getClassKey(CrawlController controller, CandidateURI cauri) {
String uri = cauri.getUURI().toString();
long hash = ELFHash(uri);
String a = Long.toString(hash % 100);
return a;
}
public long ELFHash(String str)
{
long hash = 0;
long x = 0;
for(int i = 0; i < str.length(); i++)
{
hash = (hash << 4) + str.charAt(i);
if((x = hash & 0xF0000000L) != 0)
{
hash ^= (x >> 24);
hash &= ~x;
}
}
return hash & 0x7FFFFFFF;
}
}