java 基数估值

来源:互联网 发布:手机怎么淘宝购物 编辑:程序博客网 时间:2024/04/29 14:59

通过基数估值的方法来得到大量数据中重复的列。



算法步骤:随机生成大量数据,利用 MurmurHash 得到 32 位的 hash 值,再按 2 的 10 次方(即 1024)个桶进行分桶计算。

package test;

import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;

/**
 * Command-line entry point: hashes random samples and prints how many fall
 * into each hash bucket.
 */
class jishuguzhi {

    /**
     * Runs the bucket count.
     *
     * @param args optional; when present, {@code args[0]} is the number of
     *             random samples to draw (parsed with {@link Long#parseLong})
     * @throws NumberFormatException if {@code args[0]} is not a valid long
     */
    public static void main(String args[]) {
        engine en = new engine();
        if (args != null && args.length > 0) {
            en.jisuan(Long.parseLong(args[0]));
        } else {
            en.jisuan();
        }
    }
}

/**
 * Draws random longs, hashes their decimal text with {@link MurmurHash#hash32(byte[], int, int)}
 * and counts how many hashes fall into each of {@code 2^BUCKET_BITS} buckets.
 */
class engine {

    /** Number of leading hash bits that select a bucket (2^10 = 1024 buckets). */
    static final int BUCKET_BITS = 10;

    /** Sample count used by {@link #jisuan()} when the caller does not supply one. */
    static final long DEFAULT_SAMPLES = 1000000L;

    /** Seed passed to the 32-bit hash for every sample. */
    private static final int HASH_SEED = 33333333;

    /** Explicit charset so the hashed bytes do not depend on the platform default. */
    private static final Charset SAMPLE_CHARSET = Charset.forName("UTF-8");

    /** Bucket index mapped to the number of samples seen in that bucket. Accumulates across calls. */
    HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();

    MurmurHash m = new MurmurHash();

    /**
     * Hashes the first {@code length} bytes of {@code data}.
     *
     * @param data   bytes to hash
     * @param length number of leading bytes of {@code data} to hash
     * @param seed   hash seed
     * @return the 32-bit hash, widened (sign-extended) to {@code long}
     */
    public long gethash(byte[] data, int length, int seed) {
        return m.hash32(data, length, seed);
    }

    /**
     * Maps a 32-bit hash to its bucket index.
     *
     * The unsigned shift keeps exactly the top {@link #BUCKET_BITS} bits of the
     * fixed-width 32-bit value, so leading zeros and negative hashes are handled
     * uniformly (unlike slicing a binary string, which has no leading zeros).
     *
     * @param hash 32-bit hash value
     * @return bucket index in {@code [0, 2^BUCKET_BITS)}
     */
    static int bucketOf(int hash) {
        return hash >>> (Integer.SIZE - BUCKET_BITS);
    }

    /** Runs {@link #jisuan(long)} with {@link #DEFAULT_SAMPLES} samples. */
    public void jisuan() {
        jisuan(DEFAULT_SAMPLES);
    }

    /**
     * Draws {@code samples} random longs, adds each one's bucket to {@link #map},
     * then prints every {@code bucket→count} pair currently in the map.
     *
     * @param samples number of random values to draw; zero or negative draws none
     */
    public void jisuan(long samples) {
        // One generator for the whole run instead of a new one per sample.
        Random random = new Random();

        for (long i = 0; i < samples; i++) {
            byte[] data = String.valueOf(random.nextLong()).getBytes(SAMPLE_CHARSET);
            // gethash only widens an int, so narrowing back loses nothing.
            int hash = (int) gethash(data, data.length, HASH_SEED);

            // Use the same numeric key for lookup and store.
            Integer bucket = Integer.valueOf(bucketOf(hash));
            Integer seen = map.get(bucket);
            int updated = (seen == null) ? 1 : seen.intValue() + 1;
            map.put(bucket, Integer.valueOf(updated));
        }

        // Print the per-bucket counts.
        for (Map.Entry<Integer, Integer> entry : map.entrySet()) {
            System.out.println(entry.getKey() + "→" + entry.getValue());
        }
    }
}

/**
 * 32-bit and 64-bit Murmur-style hash functions over byte arrays.
 *
 * NOTE(review): the mixing constants match those commonly published for
 * MurmurHash2 / MurmurHash64A; verify against a reference implementation if
 * bit-exact compatibility with another system matters.
 */
final class MurmurHash {

    public MurmurHash() {
    }

    /**
     * Converts a string to bytes by writing each {@code char} as two bytes,
     * low byte first, without applying any character encoding.
     *
     * @param str string to convert
     * @return array of {@code 2 * str.length()} bytes
     */
    protected byte[] toBytesWithoutEncoding(String str) {
        int len = str.length();
        int pos = 0;
        byte[] buf = new byte[len << 1];
        for (int i = 0; i < len; i++) {
            char c = str.charAt(i);
            buf[pos++] = (byte) (c & 0xFF);
            buf[pos++] = (byte) (c >> 8);
        }
        return buf;
    }

    /**
     * Hashes a string's raw UTF-16 code units with the default 32-bit seed.
     *
     * @param str string to hash
     * @return 32-bit hash of the string
     */
    public int hashcode(String str) {
        byte[] bytes = toBytesWithoutEncoding(str);
        return hash32(bytes, bytes.length);
    }

    /**
     * Generates a 32-bit hash from the first {@code length} bytes of the array.
     *
     * @param data   byte array to hash
     * @param length number of leading bytes to hash
     * @param seed   initial seed value
     * @return 32-bit hash of the given bytes
     */
    public int hash32(final byte[] data, int length, int seed) {
        // 'm' and 'r' are mixing constants generated offline.
        // They're not really 'magic', they just happen to work well.
        final int m = 0x5bd1e995;
        final int r = 24;

        // Initialize the hash from the seed and the input length.
        int h = seed ^ length;
        int length4 = length / 4;

        // Mix four bytes at a time, read as a little-endian int.
        for (int i = 0; i < length4; i++) {
            final int i4 = i * 4;
            int k = (data[i4 + 0] & 0xff)
                    + ((data[i4 + 1] & 0xff) << 8)
                    + ((data[i4 + 2] & 0xff) << 16)
                    + ((data[i4 + 3] & 0xff) << 24);
            k *= m;
            k ^= k >>> r;
            k *= m;
            h *= m;
            h ^= k;
        }

        // Handle the last few bytes of the input array.
        // The cases fall through on purpose so every remaining byte is mixed in.
        switch (length % 4) {
            case 3:
                h ^= (data[(length & ~3) + 2] & 0xff) << 16;
            case 2:
                h ^= (data[(length & ~3) + 1] & 0xff) << 8;
            case 1:
                h ^= (data[length & ~3] & 0xff);
                h *= m;
        }

        // Final avalanche.
        h ^= h >>> 13;
        h *= m;
        h ^= h >>> 15;

        return h;
    }

    /**
     * Generates a 32-bit hash with the default seed {@code 0x9747b28c}.
     *
     * @param data   byte array to hash
     * @param length number of leading bytes to hash
     * @return 32-bit hash of the given bytes
     */
    public int hash32(final byte[] data, int length) {
        return hash32(data, length, 0x9747b28c);
    }

    /**
     * Generates a 64-bit hash from the first {@code length} bytes of the array.
     *
     * @param data   byte array to hash
     * @param length number of leading bytes to hash
     * @param seed   initial seed value (treated as unsigned 32-bit)
     * @return 64-bit hash of the given bytes
     */
    public long hash64(final byte[] data, int length, int seed) {
        final long m = 0xc6a4a7935bd1e995L;
        final int r = 47;

        long h = (seed & 0xffffffffL) ^ (length * m);

        int length8 = length / 8;

        // Mix eight bytes at a time, read as a little-endian long.
        for (int i = 0; i < length8; i++) {
            final int i8 = i * 8;
            long k = ((long) data[i8 + 0] & 0xff)
                    + (((long) data[i8 + 1] & 0xff) << 8)
                    + (((long) data[i8 + 2] & 0xff) << 16)
                    + (((long) data[i8 + 3] & 0xff) << 24)
                    + (((long) data[i8 + 4] & 0xff) << 32)
                    + (((long) data[i8 + 5] & 0xff) << 40)
                    + (((long) data[i8 + 6] & 0xff) << 48)
                    + (((long) data[i8 + 7] & 0xff) << 56);

            k *= m;
            k ^= k >>> r;
            k *= m;

            h ^= k;
            h *= m;
        }

        // Remaining 1..7 bytes; the cases fall through on purpose.
        switch (length % 8) {
            case 7:
                h ^= (long) (data[(length & ~7) + 6] & 0xff) << 48;
            case 6:
                h ^= (long) (data[(length & ~7) + 5] & 0xff) << 40;
            case 5:
                h ^= (long) (data[(length & ~7) + 4] & 0xff) << 32;
            case 4:
                h ^= (long) (data[(length & ~7) + 3] & 0xff) << 24;
            case 3:
                h ^= (long) (data[(length & ~7) + 2] & 0xff) << 16;
            case 2:
                h ^= (long) (data[(length & ~7) + 1] & 0xff) << 8;
            case 1:
                h ^= (long) (data[length & ~7] & 0xff);
                h *= m;
        }

        // Final avalanche.
        h ^= h >>> r;
        h *= m;
        h ^= h >>> r;

        return h;
    }

    /**
     * Generates a 64-bit hash with the default seed {@code 0xe17a1465}.
     *
     * @param data   byte array to hash
     * @param length number of leading bytes to hash
     * @return 64-bit hash of the given bytes
     */
    public long hash64(final byte[] data, int length) {
        return hash64(data, length, 0xe17a1465);
    }
}

0 0
原创粉丝点击