Berkeley DB 是一个嵌入式数据库,适合管理海量的(最大256TB)、结构简单的数据。BDB以键值对(key/value)的形式存储和管理数据。键可以重复,数据值可以是任意类型。BDB的底层由B+树或其他算法实现,我用的jar包是B+树实现的版本。
Berkeley DB是用Environment对象来管理数据库,一个Environment可以管理多个database。每个database都存储键值对,而序列化到磁盘上是通过catalog实现的。BDB的操作是在内存和磁盘上的,最终BDB的存取结果集在程序中使用是通过容器实现的(数据库在程序中的视图)。
所以BDB的使用分为5步:打开Environment、打开Database、打开Catalog、创建键/值绑定和StoredMap视图、使用完毕后依次关闭:
// Step 1: open the Environment
private Environment environment;
// The environment is configured through an EnvironmentConfig
EnvironmentConfig environmentConfig=new EnvironmentConfig();
environmentConfig.setTransactional(true);
environmentConfig.setAllowCreate(true);
// homeDirectory is the directory in which the database files are stored
environment=new Environment(new File(homeDirectory),environmentConfig);
protected StoredClassCatalog catalog;// class catalog used by the serial bindings
protected Database database;// database holding the actual records
private static final String CLASS_CATALOG="java_class_catalog";// name of the catalog database
protected Database catalogDatabase;// database in which the catalog is stored
// Step 2: open the data Database
DatabaseConfig dbConfig=new DatabaseConfig();// database configuration
dbConfig.setTransactional(true);
// Must be set on dbConfig (catalogDBConfig is not declared until step 3),
// otherwise the "URL" database cannot be created on first use.
dbConfig.setAllowCreate(true);
dbConfig.setSortedDuplicates(false);// do not store duplicate keys
database=environment.openDatabase(null, "URL", dbConfig);// database holding the actual records
// Step 3: open the Catalog
DatabaseConfig catalogDBConfig=new DatabaseConfig();// catalog database configuration
catalogDBConfig.setTransactional(true);
catalogDBConfig.setAllowCreate(true);
catalogDatabase=environment.openDatabase(null, CLASS_CATALOG, catalogDBConfig);
// StoredClassCatalog keeps the catalog in catalogDatabase and returns a handle to operate on it
catalog=new StoredClassCatalog(catalogDatabase);
// Step 4: create the bindings and the map view
// key binding
EntryBinding<Integer> keyBinding=new SerialBinding<Integer>(catalog,Integer.class);
// value binding
SerialBinding<Url> valueBinding=new SerialBinding<Url>(catalog,Url.class);
StoredMap<Integer,Url> urlMap;// container (view) through which the records are read and written
urlMap=new StoredMap<Integer,Url>(database,keyBinding,valueBinding,true);
// Step 5: close everything once finished
database.close();
catalog.close();
environment.close();
我在爬虫项目中用BDB作为内存数据库来保存未访问的URL。StoredMap的元素存储无序,StoredSortedMap则按键排序后存储,但两者都不是队列的顺序。故本例以整数为主键存储,并记录队列头和队列尾对应的整数值来模拟队列。StoredMap实现了Map接口,可以使用其所有函数,例如:get()、remove()、put()等。
java代码如下:
//BDBFrontier.java
import java.io.File;
import java.io.FileNotFoundException;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
/**
 * Base class for a crawler frontier stored in Berkeley DB Java Edition.
 *
 * <p>The constructor opens a transactional environment in the given directory,
 * a class-catalog database (needed by serial bindings) and a data database
 * named {@code "URL"}. Subclasses build a typed view on top of
 * {@link #database} and implement the three storage primitives.
 */
public abstract class BDBFrontier {
    /** Name of the database in which the class catalog is kept. */
    private static final String CLASS_CATALOG = "java_class_catalog";

    private Environment environment;
    protected StoredClassCatalog catalog;
    protected Database database;
    protected Database catalogDatabase;

    /**
     * Opens (creating if necessary) the environment, the catalog and the data database.
     *
     * <p>If any step fails, the handles opened so far are released before the
     * exception is propagated, so a failed construction does not leave the
     * environment directory locked.
     *
     * @param homeDirectory directory in which the database files are stored
     * @throws DatabaseException if the environment or a database cannot be opened
     * @throws FileNotFoundException declared for callers; kept for interface compatibility
     */
    public BDBFrontier(String homeDirectory) throws DatabaseException, FileNotFoundException {
        System.out.println("Opening environment in: " + homeDirectory);
        boolean opened = false;
        try {
            // Open Environment
            EnvironmentConfig environmentConfig = new EnvironmentConfig();
            environmentConfig.setTransactional(true);
            environmentConfig.setAllowCreate(true);
            environment = new Environment(new File(homeDirectory), environmentConfig);

            // Open Catalog
            DatabaseConfig catalogDBConfig = new DatabaseConfig();
            catalogDBConfig.setTransactional(true);
            catalogDBConfig.setAllowCreate(true);
            catalogDatabase = environment.openDatabase(null, CLASS_CATALOG, catalogDBConfig);
            catalog = new StoredClassCatalog(catalogDatabase);

            // Open Database
            DatabaseConfig dbConfig = new DatabaseConfig();
            dbConfig.setTransactional(true);
            dbConfig.setAllowCreate(true);
            database = environment.openDatabase(null, "URL", dbConfig);

            opened = true;
        } finally {
            if (!opened) {
                releaseAfterFailedOpen();
            }
        }
    }

    /**
     * Closes the data database, the catalog and the environment, in that order.
     *
     * <p>Each handle is closed even if closing an earlier one throws. The catalog
     * database is not closed separately: {@code StoredClassCatalog.close()} is
     * expected to close the database it wraps (NOTE(review): confirm against the
     * JE version in use).
     *
     * @throws DatabaseException if closing any handle fails
     */
    public void close() throws DatabaseException {
        try {
            database.close();
        } finally {
            try {
                catalog.close();
            } finally {
                environment.close();
            }
        }
    }

    /** Stores {@code value} under {@code key}; the return value is defined by the subclass. */
    protected abstract Object put(Object key, Object value);

    /** Looks up the value stored under {@code key}. */
    protected abstract Object get(Object key);

    /** Removes the entry stored under {@code key}. */
    protected abstract Object delete(Object key);

    /** Best-effort release of whatever the constructor managed to open before it failed. */
    private void releaseAfterFailedOpen() {
        try {
            if (database != null) {
                database.close();
            }
        } catch (Exception ignored) {
            // Ignored on purpose: the exception that aborted the constructor is the one to report.
        }
        try {
            if (catalog != null) {
                catalog.close();
            } else if (catalogDatabase != null) {
                // The catalog wrapper was never created, so close its database directly.
                catalogDatabase.close();
            }
        } catch (Exception ignored) {
            // Ignored on purpose, see above.
        }
        try {
            if (environment != null) {
                environment.close();
            }
        } catch (Exception ignored) {
            // Ignored on purpose, see above.
        }
    }
}
//Frontier.java
import java.io.FileNotFoundException;
import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.collections.StoredMap;
import com.sleepycat.je.DatabaseException;
/**
 * FIFO queue of not-yet-visited URLs, stored in Berkeley DB.
 *
 * <p>Entries are kept in a {@link StoredMap} keyed by consecutive integers.
 * {@code head} is the key of the next entry to hand out and {@code tail} is the
 * key the next inserted entry will receive, so the map is used as a queue.
 */
public class Frontier extends BDBFrontier implements UrlFrontier {
    private StoredMap<Integer, Url> urlMap = null; // a database view
    private int head; // key of the entry at the front of the queue
    private int tail; // key that the next enqueued entry will get

    /**
     * Opens the frontier stored in {@code homeDirectory} and creates a writable map view on it.
     *
     * <p>The queue bounds are recovered from the keys already present, because
     * the database persists on disk between runs.
     *
     * @param homeDirectory directory in which the database files are stored
     * @throws DatabaseException if the environment or a database cannot be opened
     * @throws FileNotFoundException declared by the superclass constructor
     */
    public Frontier(String homeDirectory) throws DatabaseException, FileNotFoundException {
        super(homeDirectory);
        EntryBinding<Integer> keyBinding = new SerialBinding<Integer>(catalog, Integer.class);
        SerialBinding<Url> valueBinding = new SerialBinding<Url>(catalog, Url.class);
        // Create the view; the last argument makes it writable.
        urlMap = new StoredMap<Integer, Url>(database, keyBinding, valueBinding, true);
        restoreQueueBounds();
    }

    /**
     * Sets {@code head} to the smallest stored key and {@code tail} to one past the
     * largest, or both to 0 when the map is empty.
     *
     * <p>Without this, reopening a non-empty database would make {@link #getNext()}
     * look up key 0 (possibly absent) and {@link #putUrl(Url)} overwrite old entries.
     */
    private void restoreQueueBounds() {
        head = 0;
        tail = 0;
        boolean first = true;
        for (Integer storedKey : urlMap.keySet()) {
            int key = storedKey.intValue();
            if (first) {
                head = key;
                tail = key + 1;
                first = false;
            } else {
                if (key < head) {
                    head = key;
                }
                if (key >= tail) {
                    tail = key + 1;
                }
            }
        }
    }

    /**
     * Removes and returns the URL at the front of the queue.
     *
     * @return the next URL, or {@code null} if the frontier is empty
     */
    @Override
    public Url getNext() throws Exception {
        if (urlMap.isEmpty()) {
            return null;
        }
        Url result = urlMap.get(head);
        delete(head++);
        return result;
    }

    /**
     * Appends {@code url} to the end of the queue.
     *
     * @return {@code true} once the URL has been stored
     */
    @Override
    public boolean putUrl(Url url) throws Exception {
        // Map.put returns the PREVIOUS value (null for a fresh key), so its result
        // must not be used as a success flag; a failure surfaces as an exception.
        put(tail, url);
        tail++; // advance only after the write succeeded, so a failed put leaves no gap
        return true;
    }

    /** Stores {@code value} under {@code key} and returns the value previously stored there, if any. */
    @Override
    protected Object put(Object key, Object value) {
        return urlMap.put((Integer) key, (Url) value);
    }

    /** Returns the URL stored under {@code key}, or {@code null} if there is none. */
    @Override
    protected Object get(Object key) {
        return urlMap.get(key);
    }

    /** Removes the entry stored under {@code key} and returns the removed value, if any. */
    @Override
    protected Object delete(Object key) {
        return urlMap.remove(key);
    }

    /** Returns whether the frontier currently holds no URLs. */
    public boolean isEmpty() {
        return urlMap.isEmpty();
    }

    /** Returns whether an equal URL is already stored in the frontier. */
    public boolean contains(Url url) {
        return urlMap.containsValue(url);
    }

    /** Small demo: enqueue three URLs, dequeue them in order, then close the store. */
    public static void main(String[] args) {
        Frontier frontier = null;
        try {
            frontier = new Frontier("D:\\workspace\\db");

            Url first = new Url();
            first.setOriUrl("http://www.163.com");
            frontier.putUrl(first);

            Url second = new Url();
            second.setOriUrl("http://www.164.com");
            frontier.putUrl(second);

            Url third = new Url();
            third.setOriUrl("http://www.165.com");
            frontier.putUrl(third);

            System.out.println(frontier.getNext().getOriUrl());
            System.out.println(frontier.getNext().getOriUrl());
            System.out.println(frontier.getNext().getOriUrl());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close in finally so the environment is released even when an operation above fails.
            if (frontier != null) {
                try {
                    frontier.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
参考文献
自己动手写网络爬虫
嵌入式数据库系统Berkeley DB
Berkeley DB 使用经验总结