1. SimHash Algorithm
package com.xxxx.checkandbigdataquery.utils;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import java.io.IOException;
import java.nio.CharBuffer;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Set;
/**
* A basic SimHash implementation.
*/
public class SimHash {
public static final int HASH_SIZE = 64;
// The original declared "HASH_RANGE = 2 ^ HASH_SIZE", but '^' is XOR in
// Java (the expression evaluates to 66, not 2^64), and 2^64 does not fit
// in a signed long anyway; the unused constant has been dropped.
public static MurmurHash hasher = new MurmurHash();
// length of the fixed-size character ngrams (shingles) used below
private static final int FIXED_CGRAM_LENGTH = 4;
/**
* Uses shortcuts to obtain a speed-optimized simhash calculation.
*
* @param s input string
* @return 64-bit simhash of the input string
*/
public static long computeOptimizedSimHashForString(String s) {
return computeOptimizedSimHashForString(CharBuffer.wrap(s));
}
public static long computeOptimizedSimHashForString(CharBuffer s) {
LongSet shingles = new LongOpenHashSet(Math.min(s.length(), 100000));
int length = s.length();
for (int i = 0; i < length - FIXED_CGRAM_LENGTH + 1; i++) {
// pack a 4-character ngram into a single long (4 x 16 bits)
long shingle = s.charAt(i);
shingle <<= 16;
shingle |= s.charAt(i + 1);
shingle <<= 16;
shingle |= s.charAt(i + 2);
shingle <<= 16;
shingle |= s.charAt(i + 3);
shingles.add(shingle);
}
int v[] = new int[HASH_SIZE];
byte longAsBytes[] = new byte[8];
for (long shingle : shingles) {
longAsBytes[0] = (byte) (shingle >> 56);
longAsBytes[1] = (byte) (shingle >> 48);
longAsBytes[2] = (byte) (shingle >> 40);
longAsBytes[3] = (byte) (shingle >> 32);
longAsBytes[4] = (byte) (shingle >> 24);
longAsBytes[5] = (byte) (shingle >> 16);
longAsBytes[6] = (byte) (shingle >> 8);
longAsBytes[7] = (byte) (shingle);
long longHash = FPGenerator.std64.fp(longAsBytes, 0, 8);
for (int i = 0; i < HASH_SIZE; ++i) {
boolean bitSet = ((longHash >> i) & 1L) == 1L;
v[i] += (bitSet) ? 1 : -1;
}
}
long simhash = 0;
for (int i = 0; i < HASH_SIZE; ++i) {
if (v[i] > 0) {
simhash |= (1L << i);
}
}
return simhash;
}
// despite the name, this variant takes a precomputed set of shingles
public static long computeSimHashFromString(Set<String> shingles) {
int v[] = new int[HASH_SIZE];
// accumulate per-bit votes over the shingle set
for (String shingle : shingles) {
byte[] bytes = shingle.getBytes();
long longHash = FPGenerator.std64.fp(bytes, 0, bytes.length);
// long hash1 = hasher.hash(bytes, bytes.length, 0);
// long hash2 = hasher.hash(bytes, bytes.length, (int)hash1);
// long longHash = (hash1 << 32) | hash2;
for (int i = 0; i < HASH_SIZE; ++i) {
boolean bitSet = ((longHash >> i) & 1L) == 1L;
v[i] += (bitSet) ? 1 : -1;
}
}
long simhash = 0;
for (int i = 0; i < HASH_SIZE; ++i) {
if (v[i] > 0) {
simhash |= (1L << i);
}
}
return simhash;
}
public static int hammingDistance(long hash1, long hash2) {
long bits = hash1 ^ hash2;
int count = 0;
while (bits != 0) {
bits &= bits - 1;
++count;
}
return count;
}
/** Rotates the 64-bit hash value left by one bit. */
public static long rotate(long hashValue) {
// Java masks long shift distances to six bits, so the original
// "(hashValue << 1) | (hashValue >>> -1)" already meant ">>> 63";
// Long.rotateLeft states the intent directly.
return Long.rotateLeft(hashValue, 1);
}
public static void main(String[] args) {
if (args.length < 2) {
System.err.println("Usage: SimHash <fileA> <fileB>");
return;
}
try {
// e.g. /Users/rana/academia.edu_01.html and /Users/rana/academia.edu_02.html
// Files.readAllBytes reads each file completely and closes it; the
// original single InputStream.read() call could return before filling
// the buffer and never closed the streams.
String string1 = new String(Files.readAllBytes(Paths.get(args[0])));
String string2 = new String(Files.readAllBytes(Paths.get(args[1])));
long timeStart = System.currentTimeMillis();
long simhash1 = computeSimHashFromString(Shingle.shingles(string1));
long timeEnd = System.currentTimeMillis();
System.out.println("Old Calc for Document A Took:"
+ (timeEnd - timeStart));
timeStart = System.currentTimeMillis();
long simhash2 = computeSimHashFromString(Shingle.shingles(string2));
timeEnd = System.currentTimeMillis();
System.out.println("Old Calc for Document B Took:"
+ (timeEnd - timeStart));
timeStart = System.currentTimeMillis();
long simhash3 = computeOptimizedSimHashForString(string1);
timeEnd = System.currentTimeMillis();
System.out.println("New Calc for Document A Took:"
+ (timeEnd - timeStart));
timeStart = System.currentTimeMillis();
long simhash4 = computeOptimizedSimHashForString(string2);
timeEnd = System.currentTimeMillis();
System.out.println("New Calc for Document B Took:"
+ (timeEnd - timeStart));
int hammingDistance = hammingDistance(simhash1, simhash2);
int hammingDistance2 = hammingDistance(simhash3, simhash4);
System.out.println("hammingdistance Doc (A) to Doc(B) OldWay:"
+ hammingDistance);
System.out.println("hammingdistance Doc (A) to Doc(B) NewWay:"
+ hammingDistance2);
} catch (IOException e) {
e.printStackTrace();
}
}
}
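A minimal usage sketch, not part of the original class: compute 64-bit simhashes for two strings and compare them with hammingDistance. The demo class and the threshold of 3 bits are illustrative assumptions (3 is a commonly quoted cutoff for 64-bit simhashes), and the class is assumed to sit in the same package as SimHash.

package com.xxxx.checkandbigdataquery.utils;

public class SimHashDemo {
    public static void main(String[] args) {
        String a = "the quick brown fox jumps over the lazy dog";
        String b = "the quick brown fox jumped over the lazy dog";
        long hashA = SimHash.computeOptimizedSimHashForString(a);
        long hashB = SimHash.computeOptimizedSimHashForString(b);
        int distance = SimHash.hammingDistance(hashA, hashB);
        // near-duplicates differ in only a few of the 64 bits;
        // 3 is an illustrative, commonly used threshold
        System.out.println("distance=" + distance
                + " nearDuplicate=" + (distance <= 3));
    }
}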
2. Calculating Bandwidth Utilization
package com.xxxx.checkandbigdataquery.utils;
/**
* Bandwidth utilization calculation helper
*/
public class BandwidthUtils {
public static class BandwidthStats {
public double bitsPerSecond = 0;
public double bytesPerSecond = 0;
public double scaledBytesPerSecond = 0;
public String scaledBytesUnits;
public double scaledBitsPerSecond = 0;
public String scaledBitsUnits;
}
public static class BandwidthHistory {
private static final int SPEED_HISTORY_SIZE = 20;
private static final int SPEED_SAMPLE_MIN = 150; // ms: minimum sample window
private static final int STALL_START_TIME = 5000; // ms: quiet time before declaring a stall
private static String byte_units[] = { "B/s", "KB/s", "MB/s", "GB/s" };
private static String bit_units[] = { "b/s", "Kb/s", "Mb/s", "Gb/s" };
int pos;
int times[] = new int[SPEED_HISTORY_SIZE];
int bytes[] = new int[SPEED_HISTORY_SIZE];
int total_time;
int total_bytes;
int recent_bytes;
long recent_start;
boolean stalled = false;
private void reset() {
pos = 0;
recent_bytes = 0;
// clear the whole ring buffer: the original zeroed only slot 0, so
// stale samples would be subtracted again once the buffer wrapped
java.util.Arrays.fill(times, 0);
java.util.Arrays.fill(bytes, 0);
total_time = 0;
total_bytes = 0;
}
public void update(int bytesUploaded) {
if (recent_start == 0)
recent_start = System.currentTimeMillis();
long currTime = System.currentTimeMillis();
int recentAge = (int) (currTime - recent_start);
recent_bytes += bytesUploaded;
if (recentAge < SPEED_SAMPLE_MIN) {
return;
}
if (bytesUploaded == 0) {
if (recentAge >= STALL_START_TIME) {
stalled = true;
reset();
}
return;
} else {
if (stalled) {
stalled = false;
recentAge = 1;
}
total_time -= times[pos];
total_bytes -= bytes[pos];
times[pos] = recentAge;
bytes[pos] = recent_bytes;
total_time += recentAge;
total_bytes += recent_bytes;
recent_start = currTime;
recent_bytes = 0;
if (++pos == SPEED_HISTORY_SIZE)
pos = 0;
}
}
public void calcSpeed(BandwidthStats statsOut) {
int downloadAmount = total_bytes + recent_bytes;
int downloadTime = total_time;
if (recent_start != 0 && !stalled) {
downloadTime += (int) (System.currentTimeMillis() - recent_start);
}
if (downloadAmount != 0 && downloadTime != 0) {
statsOut.bytesPerSecond = (double) downloadAmount
/ ((double) downloadTime / 1000.0);
statsOut.bitsPerSecond = statsOut.bytesPerSecond * 8.0;
if (statsOut.bytesPerSecond < 1024.0) {
statsOut.scaledBytesPerSecond = statsOut.bytesPerSecond;
statsOut.scaledBytesUnits = byte_units[0];
} else if (statsOut.bytesPerSecond < 1024.0 * 1024.0) {
statsOut.scaledBytesPerSecond = statsOut.bytesPerSecond / 1024.0;
statsOut.scaledBytesUnits = byte_units[1];
} else if (statsOut.bytesPerSecond < 1024.0 * 1024.0 * 1024.0) {
statsOut.scaledBytesPerSecond = statsOut.bytesPerSecond
/ (1024.0 * 1024.0);
statsOut.scaledBytesUnits = byte_units[2];
} else {
statsOut.scaledBytesPerSecond = statsOut.bytesPerSecond
/ (1024.0 * 1024.0 * 1024.0);
statsOut.scaledBytesUnits = byte_units[3];
}
if (statsOut.bitsPerSecond < 1024.0) {
statsOut.scaledBitsPerSecond = statsOut.bitsPerSecond;
statsOut.scaledBitsUnits = bit_units[0];
} else if (statsOut.bitsPerSecond < 1024.0 * 1024.0) {
statsOut.scaledBitsPerSecond = statsOut.bitsPerSecond / 1024.0;
statsOut.scaledBitsUnits = bit_units[1];
} else if (statsOut.bitsPerSecond < 1024.0 * 1024.0 * 1024.0) {
statsOut.scaledBitsPerSecond = statsOut.bitsPerSecond
/ (1024.0 * 1024.0);
statsOut.scaledBitsUnits = bit_units[2];
} else {
statsOut.scaledBitsPerSecond = statsOut.bitsPerSecond
/ (1024.0 * 1024.0 * 1024.0);
statsOut.scaledBitsUnits = bit_units[3];
}
}
}
}
public static class RateLimiter {
// desired rate limit ...
private int _desiredBandwidthBytes;
// history object to collect stats ...
private BandwidthHistory _history = new BandwidthHistory();
// window start time ...
private long _windowStartTime;
private int _bytesAccumulated;
public RateLimiter(int maxBitsPerSecond) {
_desiredBandwidthBytes = maxBitsPerSecond / 8;
}
/**
* checkRateLimit
*
* Takes the number of bytes available and, based on the desired rate
* limit, returns the adjusted number of bytes that may be transferred.
*
* @return adjusted byte budget for the current one-second window
*/
public int checkRateLimit(int byteAvailable) {
long currentTime = System.currentTimeMillis();
// start a new window on the first call, or once the current
// one-second window has elapsed
if (_windowStartTime == 0 || (currentTime - _windowStartTime) >= 1000) {
_windowStartTime = currentTime;
_bytesAccumulated = 0;
}
// now calculate the bandwidth available; clamped at zero so a caller
// that overshot its budget never sees a negative byte count
return Math.max(0, Math.min(_desiredBandwidthBytes - _bytesAccumulated,
byteAvailable));
}
public void updateStats(int bytesInOrOut) {
_bytesAccumulated += bytesInOrOut;
_history.update(bytesInOrOut);
}
public void getStats(BandwidthStats bandwidthStats) {
_history.calcSpeed(bandwidthStats);
}
}
}
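A hedged sketch of how RateLimiter appears intended to be driven (the demo class and the I/O step are placeholders, not from the original source): ask checkRateLimit for the byte budget, transfer at most that many bytes, then feed the actual count back through updateStats so the history can report throughput.

package com.xxxx.checkandbigdataquery.utils;

public class RateLimiterDemo {
    public static void main(String[] args) {
        // illustrative: cap transfers at roughly 1 Mbit/s
        BandwidthUtils.RateLimiter limiter = new BandwidthUtils.RateLimiter(1000000);
        BandwidthUtils.BandwidthStats stats = new BandwidthUtils.BandwidthStats();
        byte[] buffer = new byte[8192];
        int allowed = limiter.checkRateLimit(buffer.length);
        if (allowed > 0) {
            int transferred = allowed; // placeholder for the real read/write result
            limiter.updateStats(transferred);
        }
        limiter.getStats(stats);
        if (stats.scaledBitsUnits != null) {
            System.out.printf("%.1f %s%n", stats.scaledBitsPerSecond,
                    stats.scaledBitsUnits);
        }
    }
}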
3. Bloom Filter
package com.xxxx.checkandbigdataquery.utils;
/**
* The following calculations are taken from:
* http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html
* "Bloom Filters - the math"
*
* This class's static methods are meant to facilitate the use of the Bloom
* Filter class by helping to choose correct values of 'bits per element' and
* 'number of hash functions, k'.
*
* Authors: Avinash Lakshman (alakshman@facebook.com) and Prashant Malik
* (pmalik@facebook.com)
*/
public class BloomCalculations {
private static final int maxBuckets = 15;
private static final int minBuckets = 2;
private static final int minK = 1;
private static final int maxK = 8;
private static final int[] optKPerBuckets = new int[] {
1, // dummy K for 0 buckets per element
1, // dummy K for 1 bucket per element
1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 8, 8, 8, 8 };
/**
* In the following table, the row 'i' shows false positive rates if i buckets
* per element are used. Column 'j' shows false positive rates if j hash
* functions are used. The first row is 'i=0', the first column is 'j=0'. Each
* cell (i,j) gives the false positive rate obtained by using i buckets per
* element and j hash functions.
*/
static final double[][] probs = new double[][] {
{ 1.0 }, // dummy row representing 0 buckets per element
{ 1.0, 1.0 }, // dummy row representing 1 buckets per element
{ 1.0, 0.393, 0.400 },
{ 1.0, 0.283, 0.237, 0.253 },
{ 1.0, 0.221, 0.155, 0.147, 0.160 },
{ 1.0, 0.181, 0.109, 0.092, 0.092, 0.101 }, // 5
{ 1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638 },
{ 1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364 },
{ 1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229 },
{ 1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145 }, // 9
{ 1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846 },
{ 1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509 },
{ 1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314 },
{ 1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217,
0.00199 },
{ 1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146,
0.00129 },
{ 1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852 } // 15
}; // the first column is a dummy column representing K=0
/**
* Given the number of buckets that can be used per element, return the
* optimal number of hash functions in order to minimize the false positive
* rate.
*
* @param bucketsPerElement
* @return The number of hash functions that minimize the false positive rate.
*/
public static int computeBestK(int bucketsPerElement) {
assert bucketsPerElement >= 0;
if (bucketsPerElement >= optKPerBuckets.length)
return optKPerBuckets[optKPerBuckets.length - 1];
return optKPerBuckets[bucketsPerElement];
}
/**
* A wrapper class that holds two key parameters for a Bloom Filter: the
* number of hash functions used, and the number of buckets per element used.
*/
public static final class BloomSpecification {
final int K; // number of hash functions.
final int bucketsPerElement;
public BloomSpecification(int k, int bucketsPerElement) {
K = k;
this.bucketsPerElement = bucketsPerElement;
}
}
/**
* Given a maximum tolerable false positive probability, compute a Bloom
* specification which will give less than the specified false positive rate,
* but minimize the number of buckets per element and the number of hash
* functions used. Because bandwidth (and therefore total bitvector size) is
* considered more expensive than computing power, preference is given to
* minimizing buckets per element rather than the number of hash functions.
*
* @param maxFalsePosProb
* The maximum tolerable false positive rate.
* @return A Bloom Specification which would result in a false positive rate
* less than specified by the function call.
*/
public static BloomSpecification computeBucketsAndK(double maxFalsePosProb) {
// Handle the trivial cases
if (maxFalsePosProb >= probs[minBuckets][minK]) {
return new BloomSpecification(2, optKPerBuckets[2]);
}
if (maxFalsePosProb < probs[maxBuckets][maxK]) {
return new BloomSpecification(maxK, maxBuckets);
}
// First find the minimal required number of buckets:
int bucketsPerElement = 2;
int K = optKPerBuckets[2];
while (probs[bucketsPerElement][K] > maxFalsePosProb) {
bucketsPerElement++;
K = optKPerBuckets[bucketsPerElement];
}
// Now that the number of buckets is sufficient, see if we can relax K
// without losing too much precision.
while (probs[bucketsPerElement][K - 1] <= maxFalsePosProb) {
K--;
}
return new BloomSpecification(K, bucketsPerElement);
}
/**
* Max elements you can put in a Bloom filter of a particular size with a specified error rate
* @param nbits total bit size of the bloom filter
* @param errorRate the false positive rate we're targeting
* @param hashCount how many hash functions we're using
*/
public static long calcMaxElements(long nbits, double errorRate, int hashCount) {
return (long) (-nbits * 1.0 / hashCount * Math.log(1 - Math.exp(Math.log(errorRate) / hashCount)));
}
/**
* Get error rate for a given bloom filter based on size and hash count
* @param numElements how many elements we're storing in the bloom filter
* @param nbits total bitsize of the bloom filter
* @param hashCount how many hashes we're using
*/
public static double calcErrorRate(long numElements, long nbits, int hashCount) {
return Math.exp(Math.log(1 - Math.exp(-hashCount * numElements * 1.0 / nbits)) * hashCount);
}
public static void usage(int exitCode) {
String error = "Usage: \n" +
"calcErrorRate numElements nbits hashCount\n" +
"calcMaxElements nbits errorRate hashCount";
System.err.println(error);
System.exit(exitCode);
}
public static void main(String[] args) {
if (args.length < 4) {
usage(1);
}
if (args[0].equals("calcErrorRate")) {
long numElements = Long.parseLong(args[1]);
long nbits = Long.parseLong(args[2]);
int hashCount = Integer.parseInt(args[3]);
System.out.println(calcErrorRate(numElements, nbits, hashCount));
} else if (args[0].equals("calcMaxElements")) {
long nbits = Long.parseLong(args[1]);
double errorRate = Double.parseDouble(args[2]);
int hashCount = Integer.parseInt(args[3]);
System.out.println(calcMaxElements(nbits, errorRate, hashCount));
} else {
usage(1);
}
}
}
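For orientation, the tables above approximate the standard closed-form false positive rate p ≈ (1 − e^(−kn/m))^k for n elements stored in m bits with k hash functions; calcErrorRate evaluates exactly this expression. A small sketch tying the two entry points together (the demo class and values are illustrative, and it is assumed to live in the same package, since BloomSpecification's fields are package-private):

package com.xxxx.checkandbigdataquery.utils;

public class BloomCalculationsDemo {
    public static void main(String[] args) {
        // choose a specification for a ~1% false positive target
        BloomCalculations.BloomSpecification spec =
                BloomCalculations.computeBucketsAndK(0.01);
        System.out.println("bucketsPerElement=" + spec.bucketsPerElement
                + " K=" + spec.K);
        // cross-check against the closed form for, say, 10 million elements
        long n = 10000000L;
        long nbits = n * spec.bucketsPerElement;
        System.out.println("errorRate="
                + BloomCalculations.calcErrorRate(n, nbits, spec.K));
    }
}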
4. MurmurHash for Hash-Based Lookup
package com.xxxx.checkandbigdataquery.utils;
/**
* This is a very fast, non-cryptographic hash suitable for general hash-based
* lookup. See http://murmurhash.googlepages.com/ for more details.
*
* <p>
* The C version of MurmurHash 2.0 found at that site was ported to Java by
* Andrzej Bialecki (ab at getopt org).
* </p>
*/
public class MurmurHash {
static final int m = 0x5bd1e995;
static final int r = 24;
public static final int hashInt(int k, int seed) {
int k1 = k * m;
int k2 = k1 ^ (k1 >>> r);
int k3 = k2 * m;
int h1 = seed * m;
return h1 ^ k3;
}
public static final int hashLong(long k, int seed) {
// hash the high word, then chain the result into the low word; the
// unsigned shift replaces the original "& Integer.MAX_VALUE", which
// silently discarded the top bit of the key
int h = hashInt((int) (k >>> 32), seed);
return hashInt((int) k, h);
}
public static final int hashFloat(float f, int seed) {
return hashInt(Float.floatToIntBits(f), seed);
}
public static final int hashDouble(double d, int seed) {
return hashLong(Double.doubleToLongBits(d), seed);
}
public static final int hashBoolean(boolean b, int seed) {
return hashInt((b) ? 1 : 0, seed);
}
public static final int hashString(String s, int seed) {
// note: getBytes() uses the platform default charset, so hashes are
// only stable across JVMs configured with the same default
byte[] bytes = s.getBytes();
return hash(bytes, bytes.length, seed);
}
public static final int hash(byte[] data, int length, int seed) {
return hash(data, 0, length, seed);
}
public static final int hash(byte[] data, int offset, int length, int seed) {
int h = seed ^ length;
int len_4 = length >> 2;
for (int i = 0; i < len_4; i++) {
int i_4 = i << 2;
int k = data[offset + i_4 + 3];
k = k << 8;
k = k | (data[offset + i_4 + 2] & 0xff);
k = k << 8;
k = k | (data[offset + i_4 + 1] & 0xff);
k = k << 8;
k = k | (data[offset + i_4 + 0] & 0xff);
k *= m;
k ^= k >>> r;
k *= m;
h *= m;
h ^= k;
}
// avoid calculating modulo
int len_m = len_4 << 2;
int left = length - len_m;
if (left != 0) {
// mask each trailing byte to its unsigned value so bytes >= 0x80 do
// not sign-extend (matching the unsigned-char tail handling of the
// C reference implementation, as the main loop above already does)
if (left >= 3) {
h ^= (data[offset + length - 3] & 0xff) << 16;
}
if (left >= 2) {
h ^= (data[offset + length - 2] & 0xff) << 8;
}
if (left >= 1) {
h ^= (data[offset + length - 1] & 0xff);
}
h *= m;
}
h ^= h >>> 13;
h *= m;
h ^= h >>> 15;
return h;
}
}
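A quick sketch of the hash-based lookup use the class comment describes (the demo class, seed, and table size are arbitrary illustrative choices): hash the key bytes once with a fixed seed, then reduce into a power-of-two table with a bit mask instead of a modulo.

package com.xxxx.checkandbigdataquery.utils;

import java.nio.charset.StandardCharsets;

public class MurmurHashDemo {
    public static void main(String[] args) {
        // hash the key bytes once, with a fixed (arbitrary) seed
        byte[] key = "user:42".getBytes(StandardCharsets.UTF_8);
        int h = MurmurHash.hash(key, key.length, 0x9747b28c);
        // a power-of-two table size lets a bit mask replace '% tableSize'
        int tableSize = 1 << 16;
        int bucket = h & (tableSize - 1);
        System.out.println("bucket=" + bucket);
    }
}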