当前位置: 首页 > 工具软件 > simhash-java > 使用案例 >

网页查重-simhash算法的java实现

松和安
2023-12-01

网页查重-simhash算法的java实现

在上一篇文章中,我们简单介绍了simhash算法,而在实际将它应用到网页查重中,我们首先需要分词算法将网页传来的数据流按照权重分开,但是由于对于中文和英文混杂的分词并不熟悉,我就十分偷懒的仅对英文进行分词,中文基本没管,简单的将所有的中文隔开,直接把所有含中文的字符串隔开,下面是java实现:
package test;
import java.math.BigInteger;
import java.util.*; 
import java.util.regex.*;

public class mysimhash {
	
	//通过正则将一些常用的在文本查重中无用的标签去掉
	private static String sp="=|\\.|。|,|\"|'|\\[|\\]|!|\\!|\\?|/|\n|\r| |<html|/html|<h1|</h1>|<h2|/h2|<h3|/h3|<ul|/ul|<br|/br|<head|/head|<li|/li|<div|/div|<a|/a|<p|/p|<body|/body|"
			+ "-_-|\\)|\\(|,|\\{|\\}|;|:|--|<|>|<script|/scirpt|<title|/title|<pre|</pre>|<meta|</meta>|<header|</header>|<span|/span|_|#|&|%|-";
	
	private String tokens;  //文本
	
	public BigInteger intSimHash; //没有长度限制的数
    
    public String strSimHash;
    
    private int hashbits = 64;
    
    private int max=100000;
    
    public mysimhash(String tokens,int hashbits){
    	this.tokens=tokens;
        this.hashbits = hashbits;
        this.intSimHash = this.simhash();
    }
    //用map来保存键值和权重。由于map键值唯一的特性,方便管理
	public Map<String, Integer> countString(){
		
		//先切割字符串,去掉一些分隔符
    	String[] st=this.tokens.split(sp);
    	String[] s=new String[max];
		int k=0;
		
		//去掉因为正则切割而造成的空格,并且将含中文的字符串分开
		for(int i=0;i<st.length;i++){
			if (st[i].length()<=0){
			}
			else
			{
				if(isChinese(st[i])==1){
					for(int j=0;j<st[i].length();j++){
						//System.out.println(String.valueOf(st[i].charAt(j)));
						s[k]=String.valueOf(st[i].charAt(j));
						k++;
					}
				}
				else{
					s[k]=st[i];
					k++;
				}
			}
		}
		
		Map<String, Integer> hash_Data = new HashMap<String, Integer>();
		for(int i=0;i<k;i++){
			String word=s[i];
			if(hash_Data.containsKey(word)){
				int t=hash_Data.get(word).intValue()+1;
				hash_Data.put(word, new Integer(t));
			}
			else{
				hash_Data.put(word, new Integer(1));
			 }
		 }
		 return hash_Data;
    }
    
	//判断是否为中文
	public int isChinese(String str) {
        String regEx1 = "[\\u4e00-\\u9fa5]+";
        String regEx2 = "[\\uFF00-\\uFFEF]+";
        String regEx3 = "[\\u2E80-\\u2EFF]+";
        String regEx4 = "[\\u3000-\\u303F]+";
        String regEx5 = "[\\u31C0-\\u31EF]+";
        Pattern p1 = Pattern.compile(regEx1);
        Pattern p2 = Pattern.compile(regEx2);
        Pattern p3 = Pattern.compile(regEx3);
        Pattern p4 = Pattern.compile(regEx4);
        Pattern p5 = Pattern.compile(regEx5);
        Matcher m1 = p1.matcher(str);
        Matcher m2 = p2.matcher(str);
        Matcher m3 = p3.matcher(str);
        Matcher m4 = p4.matcher(str);
        Matcher m5 = p5.matcher(str);
        if (m1.find() || m2.find() || m3.find() || m4.find() || m5.find())
            return 1;
        else
            return 0;
    }
	
	//主函数
    public BigInteger simhash(){
    	int[] v = new int[this.hashbits];
    	Map<String, Integer> hash_Data=countString();
    	Iterator<String> iter = hash_Data.keySet().iterator();
    	while(iter.hasNext()){
    		String word=(String) iter.next();
    		
    		//计算hash值,并且按照64位的每一位进行计算,结果保存在v内
    		BigInteger t = this.hash(word);
    		for(int j=0;j<this.hashbits;j++){
    			BigInteger bitmask = new BigInteger("1").shiftLeft(j);
    			if (t.and(bitmask).signum() != 0) {
                    v[j] += hash_Data.get(word);
                } 
    			else {
                    v[j] -= hash_Data.get(word);
                }
    		}
    	}
    	
    	//生成fingerprint
    	BigInteger fingerprint = new BigInteger("0");
    	StringBuffer simHashBuffer = new StringBuffer();
    	for (int i = 0; i < this.hashbits; i++) {
            if (v[i] >= 0) {
                fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i));
                simHashBuffer.append("1");
            }else{
                simHashBuffer.append("0");
            }
        }
    	this.strSimHash = simHashBuffer.toString();
        return fingerprint;
    }
    
    //计算每个字符的hash值
    private BigInteger hash(String source) {
    	if (source == null || source.length() == 0) {
            return new BigInteger("0");
        } else {
            char[] sourceArray = source.toCharArray();
            BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7);
            BigInteger m = new BigInteger("1000003");
            BigInteger mask = new BigInteger("2").pow(this.hashbits).subtract(
                    new BigInteger("1"));
            for (char item : sourceArray) {
                BigInteger temp = BigInteger.valueOf((long) item);
                x = x.multiply(m).xor(temp).and(mask);
            }
            x = x.xor(new BigInteger(String.valueOf(source.length())));
            if (x.equals(new BigInteger("-1"))) {
                x = new BigInteger("-2");
            }
            return x;
        }
    }
    
    //求两字符串之间的距离
    public int getDistance(String str1, String str2) {  
        int distance;  
        if (str1.length() != str2.length()) {  
            distance = -1;  
        } else {  
            distance = 0;  
            for (int i = 0; i < str1.length(); i++) {  
                if (str1.charAt(i) != str2.charAt(i)) {  
                    distance++;  
                }  
            }  
        }  
        return distance;  
    }
}


参考资料: 局部敏感hash函数
 类似资料: