BloomFilter——大规模数据处理利器:http://www.cnblogs.com/heaad/archive/2011/01/02/1924195.html
一、Java版的BloomFilter
JAVA Bitset应用总结:http://blog.csdn.net/originalintention/article/details/8224831
Bitset对象在内存中所占的大小等于:(指定大小/8) Bytes。
package cn.edu.xjtu.nhpcc.jenva.datastructure;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.BitSet;
public class BloomFilter {
/* BitSet初始分配2^24个bit */
private static final int DEFAULT_SIZE =1<<24; //each filter is 2^24 bits, equal 4MB
/* 不同哈希函数的种子,一般应取质数 */
private static final int[] seeds = new int[] { 5, 7, 11, 13, 31, 37, 61 };
private BitSet bitSet = new BitSet(DEFAULT_SIZE);
/* 哈希函数对象 */
private SimpleHash[] func =new SimpleHash[seeds.length];
public BloomFilter()
{
for (int i =0; i < seeds.length; i++)
{
func[i] =new SimpleHash(DEFAULT_SIZE, seeds[i]);
}
}
/**
* @param args
* @throws IOException
* @throws ClassNotFoundException
*/
public static void main(String[] args) throws IOException, ClassNotFoundException {
// TODO Auto-generated method stub
BloomFilter filter = new BloomFilter();
filter.add("shengeng");
filter.add("hujun");
filter.add("Hello");
filter.add("大众点评");
String fileName = "ourbitset";
filter.persistBitSet(fileName);
filter.bitSet = null;
filter.loadPersistedBitSet(fileName);
boolean b1 = filter.contains("shengeng");
boolean b2 = filter.contains("hello");
boolean b3 = filter.contains("大众点评");
boolean b4 = filter.contains("百度");
System.out.println(b1);
System.out.println(b2);
System.out.println(b3);
System.out.println(b4);
}
// 将字符串标记到bits中
public void add(String value)
{
for (SimpleHash f : func)
{
bitSet.set(f.hash(value), true);
}
}
//判断字符串是否已经被bits标记
public boolean contains(String value)
{
if (value ==null)
{
return false;
}
boolean ret =true;
for (SimpleHash f : func)
{
ret = ret && bitSet.get(f.hash(value));
}
return ret;
}
/* 哈希函数类 */
public static class SimpleHash
{
private int cap;
private int seed;
public SimpleHash(int cap, int seed)
{
this.cap = cap;
this.seed = seed;
}
//hash函数,采用简单的加权和hash
public int hash(String value)
{
int result =0;
int len = value.length();
for (int i =0; i < len; i++)
{
result = seed * result + value.charAt(i);
}
return (cap -1) & result;
}
}
/*
* This function is for persistence of bitset.
*/
public void persistBitSet(String fileName) throws IOException{
File file = new File(fileName);
FileOutputStream fileOutputStream = new FileOutputStream(file);
ObjectOutputStream objectOutputStream = new ObjectOutputStream(fileOutputStream);
objectOutputStream.writeObject(this.bitSet);
objectOutputStream.close();
}
/*
* This function is to load persisted bitset.
*/
public void loadPersistedBitSet(String fileName) throws IOException, ClassNotFoundException{
File file = new File(fileName);
FileInputStream fileInputStream = new FileInputStream(file);
ObjectInputStream objectInputStream = new ObjectInputStream(fileInputStream);
BitSet bitSet = (BitSet) objectInputStream.readObject();
this.bitSet = bitSet;
}
}
关于程序需要注意:
1. Bitset对象在内存中所占的大小等于:(指定大小/8) Bytes。本例中“指定大小”为DEFAULT_SIZE 。这个写了一个程序验证过了。
2. Java中一个对象占用了多少内存,只和它直接所有的变量以及构造函数里的变量有关系,与该类内部其它方法有多少变量没有关系。
二、Python下的BloomFilter
zhxue@xzh3:~/xzhprog/python$ sudo aptitude install -y python-pip
1. 使用pybloomfilter module(未成功)
zhxue@xzh3:~/xzhprog/python$ sudo pip install -y pybloomfiltermmap
zhxue@xzh3:~/xzhprog/python$ sudo apt-get install --reinstall wamerican //测试时所需要的一个文件
执行时,碰到下列问题:
zhxue@xzh3:~/xzhprog/python$ python testBloomFilter.py
Traceback (most recent call last):
File "testBloomFilter.py", line 1, in <module>
from pybloomfilter import BloomFilter
ImportError: No module named pybloomfilter
zhxue@xzh3:~/xzhprog/python$ vi testBloomFilter.py
于是:
zhxue@xzh3:~/xzhprog/python$ sudo pip install pybloomfilter
Downloading/unpacking pybloomfilter
Downloading pybloomfilter-1.0.tar.gz
Running setup.py egg_info for package pybloomfilter
Installing collected packages: pybloomfilter
Running setup.py install for pybloomfilter
Successfully installed pybloomfilter
Cleaning up...
zhxue@xzh3:~/xzhprog/python$
接着如下问题:
zhxue@xzh3:~/xzhprog/python$ python testBloomFilter.py
[-]calg library: http://c-algorithms.sourceforge.net
待续。。。。。
参考文献:
http://blog.csdn.net/pirage/article/details/8878846
http://axiak.github.io/pybloomfiltermmap/index.html#install
2. 直接编程(验证通过)
zhxue@xzh3:~/xzhprog/python$ sudo apt-get install -y python-dev
zhxue@xzh3:~/xzhprog/python$ sudo pip install bitarray
zhxue@xzh3:~/xzhprog/python$ sudo pip install mmh3
zhxue@xzh3:~/xzhprog/python$ sudo apt-get install --reinstall wamerican //测试时所需要的一个文件
from bitarray import bitarray
import mmh3
class BloomFilter:
def __init__(self, size, hash_count):
self.size = size
self.hash_count = hash_count
self.bit_array = bitarray(size)
self.bit_array.setall(0)
def add(self, string):
for seed in xrange(self.hash_count):
result = mmh3.hash(string, seed) % self.size
self.bit_array[result] = 1
def lookup(self, string):
for seed in xrange(self.hash_count):
result = mmh3.hash(string, seed) % self.size
if self.bit_array[result] == 0:
return "Nope"
return "Probably"
bf = BloomFilter(500000, 7)
lines = open("/usr/share/dict/american-english").read().splitlines()
for line in lines:
bf.add(line)
print bf.lookup("google")
print bf.lookup("Max")
print bf.lookup("mice")
print bf.lookup("3")
运行结果:
zhxue@xzh3:~/xzhprog/python$ python myBloomFilter.py
Nope
Probably
Probably
Nope
参考文献:
http://maxburstein.com/blog/creating-a-simple-bloom-filter/