1. SimHash Algorithm
package com.xxxx.checkandbigdataquery.utils;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import java.io.IOException;
import java.nio.CharBuffer;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Set;
/**
* A basic SimHash implementation.
*/
public class SimHash {
public static final int HASH_SIZE = 64;
// The original declared "HASH_RANGE = 2 ^ HASH_SIZE", but '^' is XOR in
// Java (the expression evaluates to 66, not 2^64), and 2^64 does not fit
// in a signed long anyway; the unused constant has been dropped.
public static MurmurHash hasher = new MurmurHash();
// length of the fixed-size character ngrams (shingles) used below
private static final int FIXED_CGRAM_LENGTH = 4;
/**
* Uses shortcuts to obtain a speed-optimized simhash calculation.
*
* @param s input string
* @return 64-bit simhash of the input string
*/
public static long computeOptimizedSimHashForString(String s) {
return computeOptimizedSimHashForString(CharBuffer.wrap(s));
}
public static long computeOptimizedSimHashForString(CharBuffer s) {
LongSet shingles = new LongOpenHashSet(Math.min(s.length(), 100000));
int length = s.length();
for (int i = 0; i < length - FIXED_CGRAM_LENGTH + 1; i++) {
// pack a 4-character ngram into a single long (4 x 16 bits)
long shingle = s.charAt(i);
shingle <<= 16;
shingle |= s.charAt(i + 1);
shingle <<= 16;
shingle |= s.charAt(i + 2);
shingle <<= 16;
shingle |= s.charAt(i + 3);
shingles.add(shingle);
}
int v[] = new int[HASH_SIZE];
byte longAsBytes[] = new byte[8];
for (long shingle : shingles) {
longAsBytes[0] = (byte) (shingle >> 56);
longAsBytes[1] = (byte) (shingle >> 48);
longAsBytes[2] = (byte) (shingle >> 40);
longAsBytes[3] = (byte) (shingle >> 32);
longAsBytes[4] = (byte) (shingle >> 24);
longAsBytes[5] = (byte) (shingle >> 16);
longAsBytes[6] = (byte) (shingle >> 8);
longAsBytes[7] = (byte) (shingle);
long longHash = FPGenerator.std64.fp(longAsBytes, 0, 8);
for (int i = 0; i < HASH_SIZE; ++i) {
boolean bitSet = ((longHash >> i) & 1L) == 1L;
v[i] += (bitSet) ? 1 : -1;
}
}
long simhash = 0;
for (int i = 0; i < HASH_SIZE; ++i) {
if (v[i] > 0) {
simhash |= (1L << i);
}
}
return simhash;
}
// despite the name, this variant takes a precomputed set of shingles
public static long computeSimHashFromString(Set<String> shingles) {
int v[] = new int[HASH_SIZE];
// accumulate per-bit votes over the shingle set
for (String shingle : shingles) {
byte[] bytes = shingle.getBytes();
long longHash = FPGenerator.std64.fp(bytes, 0, bytes.length);
// long hash1 = hasher.hash(bytes, bytes.length, 0);
// long hash2 = hasher.hash(bytes, bytes.length, (int)hash1);
// long longHash = (hash1 << 32) | hash2;
for (int i = 0; i < HASH_SIZE; ++i) {
boolean bitSet = ((longHash >> i) & 1L) == 1L;
v[i] += (bitSet) ? 1 : -1;
}
}
long simhash = 0;
for (int i = 0; i < HASH_SIZE; ++i) {
if (v[i] > 0) {
simhash |= (1L << i);
}
}
return simhash;
}
public static int hammingDistance(long hash1, long hash2) {
long bits = hash1 ^ hash2;
int count = 0;
while (bits != 0) {
bits &= bits - 1;
++count;
}
return count;
}
/** Rotates the 64-bit hash value left by one bit. */
public static long rotate(long hashValue) {
// Java masks long shift distances to six bits, so the original
// "(hashValue << 1) | (hashValue >>> -1)" already meant ">>> 63";
// Long.rotateLeft states the intent directly.
return Long.rotateLeft(hashValue, 1);
}
public static void main(String[] args) {
if (args.length < 2) {
System.err.println("Usage: SimHash <fileA> <fileB>");
return;
}
try {
// e.g. /Users/rana/academia.edu_01.html and /Users/rana/academia.edu_02.html
// Files.readAllBytes reads each file completely and closes it; the
// original single InputStream.read() call could return before filling
// the buffer and never closed the streams.
String string1 = new String(Files.readAllBytes(Paths.get(args[0])));
String string2 = new String(Files.readAllBytes(Paths.get(args[1])));
long timeStart = System.currentTimeMillis();
long simhash1 = computeSimHashFromString(Shingle.shingles(string1));
long timeEnd = System.currentTimeMillis();
System.out.println("Old Calc for Document A Took:"
+ (timeEnd - timeStart));
timeStart = System.currentTimeMillis();
long simhash2 = computeSimHashFromString(Shingle.shingles(string2));
timeEnd = System.currentTimeMillis();
System.out.println("Old Calc for Document B Took:"
+ (timeEnd - timeStart));
timeStart = System.currentTimeMillis();
long simhash3 = computeOptimizedSimHashForString(string1);
timeEnd = System.currentTimeMillis();
System.out.println("New Calc for Document A Took:"
+ (timeEnd - timeStart));
timeStart = System.currentTimeMillis();
long simhash4 = computeOptimizedSimHashForString(string2);
timeEnd = System.currentTimeMillis();
System.out.println("New Calc for Document B Took:"
+ (timeEnd - timeStart));
int hammingDistance = hammingDistance(simhash1, simhash2);
int hammingDistance2 = hammingDistance(simhash3, simhash4);
System.out.println("hammingdistance Doc (A) to Doc(B) OldWay:"
+ hammingDistance);
System.out.println("hammingdistance Doc (A) to Doc(B) NewWay:"
+ hammingDistance2);
} catch (IOException e) {
e.printStackTrace();
}
}
}
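A minimal usage sketch, not part of the original class: compute 64-bit simhashes for two strings and compare them with hammingDistance. The demo class and the threshold of 3 bits are illustrative assumptions (3 is a commonly quoted cutoff for 64-bit simhashes), and the class is assumed to sit in the same package as SimHash.

package com.xxxx.checkandbigdataquery.utils;

public class SimHashDemo {
    public static void main(String[] args) {
        String a = "the quick brown fox jumps over the lazy dog";
        String b = "the quick brown fox jumped over the lazy dog";
        long hashA = SimHash.computeOptimizedSimHashForString(a);
        long hashB = SimHash.computeOptimizedSimHashForString(b);
        int distance = SimHash.hammingDistance(hashA, hashB);
        // near-duplicates differ in only a few of the 64 bits;
        // 3 is an illustrative, commonly used threshold
        System.out.println("distance=" + distance
                + " nearDuplicate=" + (distance <= 3));
    }
}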
2. Calculating Bandwidth Utilization
package com.xxxx.checkandbigdataquery.utils;
/**
* Bandwidth utilization calculation helper
*/
public class BandwidthUtils {
public static class BandwidthStats {
public double bitsPerSecond = 0;
public double bytesPerSecond = 0;
public double scaledBytesPerSecond = 0;
public String scaledBytesUnits;
public double scaledBitsPerSecond = 0;
public String scaledBitsUnits;
}
public static class BandwidthHistory {
private static final int SPEED_HISTORY_SIZE = 20;
private static final int SPEED_SAMPLE_MIN = 150; // ms: minimum sample window
private static final int STALL_START_TIME = 5000; // ms: quiet time before declaring a stall
private static String byte_units[] = { "B/s", "KB/s", "MB/s", "GB/s" };
private static String bit_units[] = { "b/s", "Kb/s", "Mb/s", "Gb/s" };
int pos;
int times[] = new int[SPEED_HISTORY_SIZE];
int bytes[] = new int[SPEED_HISTORY_SIZE];
int total_time;
int total_bytes;
int recent_bytes;
long recent_start;
boolean stalled = false;
private void reset() {
pos = 0;
recent_bytes = 0;
// clear the whole ring buffer: the original zeroed only slot 0, so
// stale samples would be subtracted again once the buffer wrapped
java.util.Arrays.fill(times, 0);
java.util.Arrays.fill(bytes, 0);
total_time = 0;
total_bytes = 0;
}
public void update(int bytesUploaded) {
if (recent_start == 0)
recent_start = System.currentTimeMillis();
long currTime = System.currentTimeMillis();
int recentAge = (int) (currTime - recent_start);
recent_bytes += bytesUploaded;
if (recentAge < SPEED_SAMPLE_MIN) {
return;
}
if (bytesUploaded == 0) {
if (recentAge >= STALL_START_TIME) {
stalled = true;
reset();
}
return;
} else {
if (stalled) {
stalled = false;
recentAge = 1;
}
total_time -= times[pos];
total_bytes -= bytes[pos];
times[pos] = recentAge;
bytes[pos] = recent_bytes;
total_time += recentAge;
total_bytes += recent_bytes;
recent_start = currTime;
recent_bytes = 0;
if (++pos == SPEED_HISTORY_SIZE)
pos = 0;
}
}
public void calcSpeed(BandwidthStats statsOut) {
int downloadAmount = total_bytes + recent_bytes;
int downloadTime = total_time;
if (recent_start != 0 && !stalled) {
downloadTime += (int) (System.currentTimeMillis() - recent_start);
}
if (downloadAmount != 0 && downloadTime != 0) {
statsOut.bytesPerSecond = (double) downloadAmount
/ ((double) downloadTime / 1000.0);
statsOut.bitsPerSecond = statsOut.bytesPerSecond * 8.0;
if (statsOut.bytesPerSecond < 1024.0) {
statsOut.scaledBytesPerSecond = statsOut.bytesPerSecond;
statsOut.scaledBytesUnits = byte_units[0];
} else if (statsOut.bytesPerSecond < 1024.0 * 1024.0) {
statsOut.scaledBytesPerSecond = statsOut.bytesPerSecond / 1024.0;
statsOut.scaledBytesUnits = byte_units[1];
} else if (statsOut.bytesPerSecond < 1024.0 * 1024.0 * 1024.0) {
statsOut.scaledBytesPerSecond = statsOut.bytesPerSecond
/ (1024.0 * 1024.0);
statsOut.scaledBytesUnits = byte_units[2];
} else {
statsOut.scaledBytesPerSecond = statsOut.bytesPerSecond
/ (1024.0 * 1024.0 * 1024.0);
statsOut.scaledBytesUnits = byte_units[3];
}
if (statsOut.bitsPerSecond < 1024.0) {
statsOut.scaledBitsPerSecond = statsOut.bitsPerSecond;
statsOut.scaledBitsUnits = bit_units[0];
} else if (statsOut.bitsPerSecond < 1024.0 * 1024.0) {
statsOut.scaledBitsPerSecond = statsOut.bitsPerSecond / 1024.0;
statsOut.scaledBitsUnits = bit_units[1];
} else if (statsOut.bitsPerSecond < 1024.0 * 1024.0 * 1024.0) {
statsOut.scaledBitsPerSecond = statsOut.bitsPerSecond
/ (1024.0 * 1024.0);
statsOut.scaledBitsUnits = bit_units[2];
} else {
statsOut.scaledBitsPerSecond = statsOut.bitsPerSecond
/ (1024.0 * 1024.0 * 1024.0);
statsOut.scaledBitsUnits = bit_units[3];
}
}
}
}
public static class RateLimiter {
// desired rate limit ...
private int _desiredBandwidthBytes;
// history object to collect stats ...
private BandwidthHistory _history = new BandwidthHistory();
// window start time ...
private long _windowStartTime;
private int _bytesAccumulated;
public RateLimiter(int maxBitsPerSecond) {
_desiredBandwidthBytes = maxBitsPerSecond / 8;
}
/**
* checkRateLimit
*
* Takes the number of bytes available and, based on the desired rate
* limit, returns the adjusted number of bytes that may be transferred.
*
* @return adjusted byte budget for the current one-second window
*/
public int checkRateLimit(int byteAvailable) {
long currentTime = System.currentTimeMillis();
// start a new window on the first call, or once the current
// one-second window has elapsed
if (_windowStartTime == 0 || (currentTime - _windowStartTime) >= 1000) {
_windowStartTime = currentTime;
_bytesAccumulated = 0;
}
// now calculate the bandwidth available; clamped at zero so a caller
// that overshot its budget never sees a negative byte count
return Math.max(0, Math.min(_desiredBandwidthBytes - _bytesAccumulated,
byteAvailable));
}
public void updateStats(int bytesInOrOut) {
_bytesAccumulated += bytesInOrOut;
_history.update(bytesInOrOut);
}
public void getStats(BandwidthStats bandwidthStats) {
_history.calcSpeed(bandwidthStats);
}
}
}
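A hedged sketch of how RateLimiter appears intended to be driven (the demo class and the I/O step are placeholders, not from the original source): ask checkRateLimit for the byte budget, transfer at most that many bytes, then feed the actual count back through updateStats so the history can report throughput.

package com.xxxx.checkandbigdataquery.utils;

public class RateLimiterDemo {
    public static void main(String[] args) {
        // illustrative: cap transfers at roughly 1 Mbit/s
        BandwidthUtils.RateLimiter limiter = new BandwidthUtils.RateLimiter(1000000);
        BandwidthUtils.BandwidthStats stats = new BandwidthUtils.BandwidthStats();
        byte[] buffer = new byte[8192];
        int allowed = limiter.checkRateLimit(buffer.length);
        if (allowed > 0) {
            int transferred = allowed; // placeholder for the real read/write result
            limiter.updateStats(transferred);
        }
        limiter.getStats(stats);
        if (stats.scaledBitsUnits != null) {
            System.out.printf("%.1f %s%n", stats.scaledBitsPerSecond,
                    stats.scaledBitsUnits);
        }
    }
}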
3. Bloom Filter
package com.xxxx.checkandbigdataquery.utils;
/**
* The following calculations are taken from:
* http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html
* "Bloom Filters - the math"
*
* This class's static methods are meant to facilitate the use of the Bloom
* Filter class by helping to choose correct values of 'bits per element' and
* 'number of hash functions, k'.
*
* Authors: Avinash Lakshman (alakshman@facebook.com) and Prashant Malik
* (pmalik@facebook.com)
*/
public class BloomCalculations {
private static final int maxBuckets = 15;
private static final int minBuckets = 2;
private static final int minK = 1;
private static final int maxK = 8;
private static final int[] optKPerBuckets = new int[] {
1, // dummy K for 0 buckets per element
1, // dummy K for 1 bucket per element
1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 8, 8, 8, 8 };
/**
* In the following table, the row 'i' shows false positive rates if i buckets
* per element are used. Column 'j' shows false positive rates if j hash
* functions are used. The first row is 'i=0', the first column is 'j=0'. Each
* cell (i,j) gives the false positive rate obtained by using i buckets per
* element and j hash functions.
*/
static final double[][] probs = new double[][] {
{ 1.0 }, // dummy row representing 0 buckets per element
{ 1.0, 1.0 }, // dummy row representing 1 buckets per element
{ 1.0, 0.393, 0.400 },
{ 1.0, 0.283, 0.237, 0.253 },
{ 1.0, 0.221, 0.155, 0.147, 0.160 },
{ 1.0, 0.181, 0.109, 0.092, 0.092, 0.101 }, // 5
{ 1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638 },
{ 1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364 },
{ 1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229 },
{ 1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145 }, // 9
{ 1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846 },
{ 1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509 },
{ 1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314 },
{ 1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217,
0.00199 },
{ 1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146,
0.00129 },
{ 1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852 } // 15
}; // the first column is a dummy column representing K=0
/**
* Given the number of buckets that can be used per element, return the
* optimal number of hash functions in order to minimize the false positive
* rate.
*
* @param bucketsPerElement
* @return The number of hash functions that minimize the false positive rate.
*/
public static int computeBestK(int bucketsPerElement) {
assert bucketsPerElement >= 0;
if (bucketsPerElement >= optKPerBuckets.length)
return optKPerBuckets[optKPerBuckets.length - 1];
return optKPerBuckets[bucketsPerElement];
}
/**
* A wrapper class that holds two key parameters for a Bloom Filter: the
* number of hash functions used, and the number of buckets per element used.
*/
public static final class BloomSpecification {
final int K; // number of hash functions.
final int bucketsPerElement;
public BloomSpecification(int k, int bucketsPerElement) {
K = k;
this.bucketsPerElement = bucketsPerElement;
}
}
/**
* Given a maximum tolerable false positive probability, compute a Bloom
* specification which will give less than the specified false positive rate,
* but minimize the number of buckets per element and the number of hash
* functions used. Because bandwidth (and therefore total bitvector size) is
* considered more expensive than computing power, preference is given to
* minimizing buckets per element rather than the number of hash functions.
*
* @param maxFalsePosProb
* The maximum tolerable false positive rate.
* @return A Bloom Specification which would result in a false positive rate
* less than specified by the function call.
*/
public static BloomSpecification computeBucketsAndK(double maxFalsePosProb) {
// Handle the trivial cases
if (maxFalsePosProb >= probs[minBuckets][minK]) {
return new BloomSpecification(2, optKPerBuckets[2]);
}
if (maxFalsePosProb < probs[maxBuckets][maxK]) {
return new BloomSpecification(maxK, maxBuckets);
}
// First find the minimal required number of buckets:
int bucketsPerElement = 2;
int K = optKPerBuckets[2];
while (probs[bucketsPerElement][K] > maxFalsePosProb) {
bucketsPerElement++;
K = optKPerBuckets[bucketsPerElement];
}
// Now that the number of buckets is sufficient, see if we can relax K
// without losing too much precision.
while (probs[bucketsPerElement][K - 1] <= maxFalsePosProb) {
K--;
}
return new BloomSpecification(K, bucketsPerElement);
}
/**
* Max elements you can put in a Bloom filter of a particular size with a specified error rate
* @param nbits total bit size of the bloom filter
* @param errorRate the false positive rate we're targeting
* @param hashCount how many hash functions we're using
*/
public static long calcMaxElements(long nbits, double errorRate, int hashCount) {
return (long) (-nbits * 1.0 / hashCount * Math.log(1 - Math.exp(Math.log(errorRate) / hashCount)));
}
/**
* Get error rate for a given bloom filter based on size and hash count
* @param numElements how many elements we're storing in the bloom filter
* @param nbits total bitsize of the bloom filter
* @param hashCount how many hashes we're using
*/
public static double calcErrorRate(long numElements, long nbits, int hashCount) {
return Math.exp(Math.log(1 - Math.exp(-hashCount * numElements * 1.0 / nbits)) * hashCount);
}
public static void usage(int exitCode) {
String error = "Usage: \n" +
"calcErrorRate numElements nbits hashCount\n" +
"calcMaxElements nbits errorRate hashCount";
System.err.println(error);
System.exit(exitCode);
}
public static void main(String[] args) {
if (args.length < 4) {
usage(1);
}
if (args[0].equals("calcErrorRate")) {
long numElements = Long.parseLong(args[1]);
long nbits = Long.parseLong(args[2]);
int hashCount = Integer.parseInt(args[3]);
System.out.println(calcErrorRate(numElements, nbits, hashCount));
} else if (args[0].equals("calcMaxElements")) {
long nbits = Long.parseLong(args[1]);
double errorRate = Double.parseDouble(args[2]);
int hashCount = Integer.parseInt(args[3]);
System.out.println(calcMaxElements(nbits, errorRate, hashCount));
} else {
usage(1);
}
}
}
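For orientation, the tables above approximate the standard closed-form false positive rate p ≈ (1 − e^(−kn/m))^k for n elements stored in m bits with k hash functions; calcErrorRate evaluates exactly this expression. A small sketch tying the two entry points together (the demo class and values are illustrative, and it is assumed to live in the same package, since BloomSpecification's fields are package-private):

package com.xxxx.checkandbigdataquery.utils;

public class BloomCalculationsDemo {
    public static void main(String[] args) {
        // choose a specification for a ~1% false positive target
        BloomCalculations.BloomSpecification spec =
                BloomCalculations.computeBucketsAndK(0.01);
        System.out.println("bucketsPerElement=" + spec.bucketsPerElement
                + " K=" + spec.K);
        // cross-check against the closed form for, say, 10 million elements
        long n = 10000000L;
        long nbits = n * spec.bucketsPerElement;
        System.out.println("errorRate="
                + BloomCalculations.calcErrorRate(n, nbits, spec.K));
    }
}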
4. MurmurHash for Hash-Based Lookup
package com.xxxx.checkandbigdataquery.utils;
/**
* This is a very fast, non-cryptographic hash suitable for general hash-based
* lookup. See http://murmurhash.googlepages.com/ for more details.
*
* <p>
* The C version of MurmurHash 2.0 found at that site was ported to Java by
* Andrzej Bialecki (ab at getopt org).
* </p>
*/
public class MurmurHash {
static final int m = 0x5bd1e995;
static final int r = 24;
public static final int hashInt(int k, int seed) {
int k1 = k * m;
int k2 = k1 ^ (k1 >>> r);
int k3 = k2 * m;
int h1 = seed * m;
return h1 ^ k3;
}
public static final int hashLong(long k, int seed) {
// hash the high word, then chain the result into the low word; the
// unsigned shift replaces the original "& Integer.MAX_VALUE", which
// silently discarded the top bit of the key
int h = hashInt((int) (k >>> 32), seed);
return hashInt((int) k, h);
}
public static final int hashFloat(float f, int seed) {
return hashInt(Float.floatToIntBits(f), seed);
}
public static final int hashDouble(double d, int seed) {
return hashLong(Double.doubleToLongBits(d), seed);
}
public static final int hashBoolean(boolean b, int seed) {
return hashInt((b) ? 1 : 0, seed);
}
public static final int hashString(String s, int seed) {
// note: getBytes() uses the platform default charset, so hashes are
// only stable across JVMs configured with the same default
byte[] bytes = s.getBytes();
return hash(bytes, bytes.length, seed);
}
public static final int hash(byte[] data, int length, int seed) {
return hash(data, 0, length, seed);
}
public static final int hash(byte[] data, int offset, int length, int seed) {
int h = seed ^ length;
int len_4 = length >> 2;
for (int i = 0; i < len_4; i++) {
int i_4 = i << 2;
int k = data[offset + i_4 + 3];
k = k << 8;
k = k | (data[offset + i_4 + 2] & 0xff);
k = k << 8;
k = k | (data[offset + i_4 + 1] & 0xff);
k = k << 8;
k = k | (data[offset + i_4 + 0] & 0xff);
k *= m;
k ^= k >>> r;
k *= m;
h *= m;
h ^= k;
}
// avoid calculating modulo
int len_m = len_4 << 2;
int left = length - len_m;
if (left != 0) {
// mask each trailing byte to its unsigned value so bytes >= 0x80 do
// not sign-extend (matching the unsigned-char tail handling of the
// C reference implementation, as the main loop above already does)
if (left >= 3) {
h ^= (data[offset + length - 3] & 0xff) << 16;
}
if (left >= 2) {
h ^= (data[offset + length - 2] & 0xff) << 8;
}
if (left >= 1) {
h ^= (data[offset + length - 1] & 0xff);
}
h *= m;
}
h ^= h >>> 13;
h *= m;
h ^= h >>> 15;
return h;
}
}
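A quick sketch of the hash-based lookup use the class comment describes (the demo class, seed, and table size are arbitrary illustrative choices): hash the key bytes once with a fixed seed, then reduce into a power-of-two table with a bit mask instead of a modulo.

package com.xxxx.checkandbigdataquery.utils;

import java.nio.charset.StandardCharsets;

public class MurmurHashDemo {
    public static void main(String[] args) {
        // hash the key bytes once, with a fixed (arbitrary) seed
        byte[] key = "user:42".getBytes(StandardCharsets.UTF_8);
        int h = MurmurHash.hash(key, key.length, 0x9747b28c);
        // a power-of-two table size lets a bit mask replace '% tableSize'
        int tableSize = 1 << 16;
        int bucket = h & (tableSize - 1);
        System.out.println("bucket=" + bucket);
    }
}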