关于Hash的总结

来源：互联网发布：360网络连接配置有问题编辑：程序博客网时间：2024/05/21 10:13

hash函数

hash函数常用的是mod素数，或者使用乘法策略（取乘积的某些位）。这些策略直接影响到hash table的resize：如果是mod素数的话，表的大小只能按素数来递增；如果是乘法方法，表的大小只能以2^p来递增。

参考文献[1][2]等给出了很多常用的字符串hash函数，但更有价值的还包括下面这两个：

murmur hash[3][4]，其中[3]无法直接访问，不过本文的例子中有具体的实现；更多关于murmur hash的信息见[8]。

city hash见参考文献[5]

hash table的构建

hash table的构建通常使用链表（list）来解决冲突的问题，C++ tr1中就是这样的。tr1的实现中使用了基于策略（policy-based）的设计方法，详情见参考文献[6]，目前没有仔细看。这个参考文献是在浏览源码（/usr/include/c++/4.4/tr1_impl/hashtable）时，在头注释中发现的。

resize问题

resize的策略包括：

策略1：

1）全部重新copy一遍，

策略2：

1）resize时，使用2个hash table：insert时，只向新的hash table中insert，同时将old hash table中的r个数据放入新的table；查找时，2个hash table同时查询

2）如果old hash table数据已经全部移动完毕，删除old hash table

上面两个策略的详细信息见参考文献[7]

bloomFilter

说到hash需要提一下bloomfilter，它通过hash实现，好的hash函数可以使bloomfilter具有很好的性能。它的一个使用方法，就是如果要查询数据库，可以在数据库前加一个bloomfilter，如果没在bloomfilter中，就不用查询数据库了，因为bloomfilter返回false是不会有错误的。cityhash和murmurhash来实现bloomfilter应该是非常好的选择。

consistent hash（一致性hash）

consistent hash主要用于分布式系统，当增加或删除一台主机时，不会造成严重的抖动，因为它的策略只会导致相邻的一台主机rehash，因此影响比较小。参考文献[9]讲的也挺清楚的，值得参考。

C++中的hashtable

C++ tr1中的hashtable实现是unordered_map，本文的例子和参考文献[10]中都有简单的演示。我也看了部分unordered_map的实现，其内部是用hash_table实现的。参考文献[10]所列的参考文献给出了更多有价值的参考信息。

本文的计算性能比较

代码如下：

#include "basictypes.h"
#include <string>
#include <vector>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include "cityhash/include/city.h"
#include <tr1/unordered_map>
#include <map>

// Micro-benchmark comparing several string hash functions and the insert /
// lookup cost of a hash map (keyed with CityHash64) against std::map.
//
// The fixed-width integer typedefs (uint8 / uint32 / uint64) are expected to
// come from "basictypes.h", and CityHash64 from the cityhash header.

// Seed passed to both Murmur variants in the benchmark.
const uint32 kFingerPrintSeed = 19820125;

// 64-bit MurmurHash2 variant for 64-bit platforms.
//
// key  : pointer to the bytes to hash (no alignment requirement).
// len  : number of bytes at `key`; must be >= 0.
// seed : value mixed into the initial state.
// Returns the 64-bit hash of the buffer.
uint64 MurmurHash64A(const void* key, int len, uint32 seed) {
  const uint64 m = 0xc6a4a7935bd1e995;
  const int r = 47;

  uint64 h = seed ^ (len * m);

  const uint8* data = static_cast<const uint8*>(key);
  const uint8* end = data + (len / 8) * 8;

  // Consume the input eight bytes at a time. memcpy is used instead of
  // dereferencing a casted uint64 pointer so that unaligned buffers are safe.
  while (data != end) {
    uint64 k;
    memcpy(&k, data, sizeof(k));
    data += sizeof(k);

    k *= m;
    k ^= k >> r;
    k *= m;

    h ^= k;
    h *= m;
  }

  // Mix in the remaining 0..7 bytes; every case intentionally falls through.
  switch (len & 7) {
    case 7: h ^= static_cast<uint64>(data[6]) << 48;  // fall through
    case 6: h ^= static_cast<uint64>(data[5]) << 40;  // fall through
    case 5: h ^= static_cast<uint64>(data[4]) << 32;  // fall through
    case 4: h ^= static_cast<uint64>(data[3]) << 24;  // fall through
    case 3: h ^= static_cast<uint64>(data[2]) << 16;  // fall through
    case 2: h ^= static_cast<uint64>(data[1]) << 8;   // fall through
    case 1: h ^= static_cast<uint64>(data[0]);
            h *= m;
  }

  // Final avalanche.
  h ^= h >> r;
  h *= m;
  h ^= h >> r;
  return h;
}

// 32-bit MurmurHash2 variant.
//
// key  : pointer to the bytes to hash (no alignment requirement).
// len  : number of bytes at `key`; must be >= 0.
// seed : value mixed into the initial state.
// Returns the 32-bit hash of the buffer.
//
// NOTE(review): the reference MurmurHash2 starts from `seed ^ len`; this
// version starts from `seed ^ (len * m)`. Kept as-is so results do not change.
uint32 MurmurHash32A(const void* key, int len, uint32 seed) {
  const uint32 m = 0x5bd1e995;
  const int r = 24;

  uint32 h = seed ^ (len * m);

  const uint8* data = static_cast<const uint8*>(key);

  // Consume the input four bytes at a time (memcpy: safe for unaligned data).
  while (len >= 4) {
    uint32 k;
    memcpy(&k, data, sizeof(k));

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    data += sizeof(k);
    len -= 4;
  }

  // Handle the last few bytes of the input array; cases fall through.
  switch (len) {
    case 3: h ^= static_cast<uint32>(data[2]) << 16;  // fall through
    case 2: h ^= static_cast<uint32>(data[1]) << 8;   // fall through
    case 1: h ^= static_cast<uint32>(data[0]);
            h *= m;
  }

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.
  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;
  return h;
}

// All of the classic string hashes below take a NUL-terminated string and
// return a value with the top bit cleared (masked with 0x7FFFFFFF).

/* A Simple Hash Function: hash = 31 * hash + byte, bytes read as unsigned. */
unsigned int simple_hash(const char *str) {
  unsigned int hash = 0;
  for (const unsigned char *p = reinterpret_cast<const unsigned char *>(str);
       *p; ++p) {
    hash = 31 * hash + *p;
  }
  return (hash & 0x7FFFFFFF);
}

/* RS Hash Function: multiplicative hash whose multiplier changes per byte. */
unsigned int RS_hash(const char *str) {
  unsigned int b = 378551;
  unsigned int a = 63689;
  unsigned int hash = 0;
  while (*str) {
    hash = hash * a + (*str++);
    a *= b;
  }
  return (hash & 0x7FFFFFFF);
}

/* JS Hash Function: shift/add/xor mixing of each byte. */
unsigned int JS_hash(const char *str) {
  unsigned int hash = 1315423911;
  while (*str) {
    hash ^= ((hash << 5) + (*str++) + (hash >> 2));
  }
  return (hash & 0x7FFFFFFF);
}

/* P. J. Weinberger Hash Function */
unsigned int PJW_hash(const char *str) {
  const unsigned int BitsInUnsignedInt =
      static_cast<unsigned int>(sizeof(unsigned int) * 8);
  const unsigned int ThreeQuarters =
      static_cast<unsigned int>((BitsInUnsignedInt * 3) / 4);
  const unsigned int OneEighth =
      static_cast<unsigned int>(BitsInUnsignedInt / 8);
  // Mask selecting the top OneEighth bits of an unsigned int.
  const unsigned int HighBits =
      static_cast<unsigned int>(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
  unsigned int hash = 0;
  unsigned int test = 0;
  while (*str) {
    hash = (hash << OneEighth) + (*str++);
    // Fold any bits that reached the top back into the lower part.
    if ((test = hash & HighBits) != 0) {
      hash = ((hash ^ (test >> ThreeQuarters)) & (~HighBits));
    }
  }
  return (hash & 0x7FFFFFFF);
}

/* ELF Hash Function */
unsigned int ELF_hash(const char *str) {
  unsigned int hash = 0;
  unsigned int x = 0;
  while (*str) {
    hash = (hash << 4) + (*str++);
    // Fold the top nibble back in and clear it.
    if ((x = hash & 0xF0000000L) != 0) {
      hash ^= (x >> 24);
      hash &= ~x;
    }
  }
  return (hash & 0x7FFFFFFF);
}

/* BKDR Hash Function */
unsigned int BKDR_hash(const char *str) {
  unsigned int seed = 131;  // 31 131 1313 13131 131313 etc..
  unsigned int hash = 0;
  while (*str) {
    hash = hash * seed + (*str++);
  }
  return (hash & 0x7FFFFFFF);
}

/* SDBM Hash Function */
unsigned int SDBM_hash(const char *str) {
  unsigned int hash = 0;
  while (*str) {
    hash = (*str++) + (hash << 6) + (hash << 16) - hash;
  }
  return (hash & 0x7FFFFFFF);
}

/* DJB Hash Function */
unsigned int DJB_hash(const char *str) {
  unsigned int hash = 5381;
  while (*str) {
    hash += (hash << 5) + (*str++);
  }
  return (hash & 0x7FFFFFFF);
}

/* AP Hash Function: alternates between two mixing formulas per byte. */
unsigned int AP_hash(const char *str) {
  unsigned int hash = 0;
  for (int i = 0; *str; i++) {
    if ((i & 1) == 0) {
      hash ^= ((hash << 7) ^ (*str++) ^ (hash >> 3));
    } else {
      hash ^= (~((hash << 11) ^ (*str++) ^ (hash >> 5)));
    }
  }
  return (hash & 0x7FFFFFFF);
}

/* CRC Hash Function
 *
 * Despite the name, the code below computes a 16-bit ones'-complement
 * checksum: it sums the string as native-endian 16-bit words, folds the
 * carries back into the low 16 bits and returns the complement. The result
 * therefore always fits in 16 bits.
 */
unsigned int CRC_hash(const char *str) {
  unsigned int nleft = strlen(str);
  unsigned long long sum = 0;
  const char *p = str;
  unsigned short int answer = 0;

  // Add sequential 16-bit words to the accumulator. memcpy avoids reading
  // through a misaligned unsigned short pointer.
  while (nleft > 1) {
    unsigned short int word;
    memcpy(&word, p, sizeof(word));
    sum += word;
    p += sizeof(word);
    nleft -= 2;
  }

  // Mop up an odd byte, if necessary: it becomes the first byte of an
  // otherwise zero 16-bit word.
  if (1 == nleft) {
    memcpy(&answer, p, 1);
    sum += answer;
  }

  // Add back carry outs from the top bits to the low 16 bits.
  sum = (sum >> 16) + (sum & 0xFFFF);
  /* add carry */
  sum += (sum >> 16);
  /* truncate to 16 bits */
  answer = ~sum;
  return (answer & 0xFFFFFFFF);
}

// Converts |value| to a base-36 string using digits 0-9a-z.
//
// The digits are emitted least-significant first (the string is not
// reversed); the benchmark only needs distinct-looking keys. Negative input
// is converted by magnitude, without a sign.
//
// The base is the number of digit characters (36). Using sizeof of the digit
// table would count its terminating NUL and could append '\0' to the result,
// which makes c_str()-based hashes see a shorter key than size()-based ones.
std::string Itoa(int value) {
  static const char kDigits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
  const unsigned int kBase = sizeof(kDigits) - 1;

  // Negate in unsigned arithmetic so the most negative int is well-defined.
  unsigned int magnitude = value < 0 ? 0u - static_cast<unsigned int>(value)
                                     : static_cast<unsigned int>(value);
  std::string res;
  do {
    res += kDigits[magnitude % kBase];
    magnitude /= kBase;
  } while (magnitude > 0);
  return res;
}

// Returns a microsecond timestamp suitable for computing elapsed time as
// `GetTime() - start`.
//
// The value is measured from the first call to GetTime() rather than from the
// epoch: epoch microseconds do not fit in an int, so truncating them makes
// the subtraction unreliable. The result stays in range for roughly the first
// 35 minutes after the first call. Not thread-safe (unsynchronized static).
int GetTime() {
  static long long base_us = -1;
  timeval tv;
  gettimeofday(&tv, NULL);
  const long long now_us =
      static_cast<long long>(tv.tv_sec) * 1000000LL + tv.tv_usec;
  if (base_us < 0) {
    base_us = now_us;
  }
  return static_cast<int>(now_us - base_us);
}

// Hash functor for string keys, backed by CityHash64.
class StringHash {
 public:
  uint64 operator()(const std::string& s) const {
    return CityHash64(s.c_str(), s.size());
    // return MurmurHash64A(s.c_str(), s.size(), kFingerPrintSeed) % (unsigned int) 0xFFFFFFFF;
  }
};

// Equality functor for string keys.
class StringEqual {
 public:
  bool operator()(const std::string& left, const std::string& right) const {
    return left == right;
  }
};

// Generates kDataSize random keys, then times each hash function over all
// keys, followed by insert and lookup on a CityHash-keyed unordered_map and
// on std::map. Every timing is printed in microseconds.
int main(int argc, char** argv) {
  typedef std::tr1::unordered_map<std::string, int, StringHash, StringEqual>
      CityMap;
  typedef std::map<std::string, int> TreeMap;

  const int kDataSize = 1000000;

  // Build the key set: each key is ten base-36 encoded random numbers.
  std::vector<std::string> data;
  data.reserve(kDataSize);
  std::string content;
  for (int i = 0; i < kDataSize; ++i) {
    content = "";
    for (int j = 0; j < 10; ++j) {
      content += Itoa(rand());
    }
    data.push_back(content);
  }

  // Every hash result is folded into this accumulator so the compiler cannot
  // discard the timed loops as dead code.
  uint64 checksum = 0;

  // murmur test
  int start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    checksum += MurmurHash64A(data[i].c_str(),
                              static_cast<int>(data[i].size()),
                              kFingerPrintSeed);
  }
  printf("murmur64: %d\n", GetTime() - start);

  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    checksum += MurmurHash32A(data[i].c_str(),
                              static_cast<int>(data[i].size()),
                              kFingerPrintSeed);
  }
  printf("murmur32:%d\n", GetTime() - start);

  // simple hash
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    checksum += simple_hash(data[i].c_str());
  }
  printf("simple hash:%d\n", GetTime() - start);

  // bkdr hash
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    checksum += BKDR_hash(data[i].c_str());
  }
  printf("bkdr hash:%d\n", GetTime() - start);

  // AP hash
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    checksum += AP_hash(data[i].c_str());
  }
  printf("AP hash:%d\n", GetTime() - start);

  // City hash
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    checksum += CityHash64(data[i].c_str(), data[i].size());
  }
  printf("city hash:%d\n", GetTime() - start);

  // City hash insert
  CityMap my_map_city;
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    my_map_city[data[i]] = i;
  }
  printf("city hash insert:%d\n", GetTime() - start);

  // map insert
  TreeMap my_map_tree;
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    my_map_tree[data[i]] = i;
  }
  printf("tree map insert:%d\n", GetTime() - start);

  // City hash search. find() is used so that a lookup can never insert.
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    CityMap::const_iterator it = my_map_city.find(data[i]);
    if (it != my_map_city.end()) {
      checksum += it->second;
    }
  }
  printf("city hash search:%d\n", GetTime() - start);

  // map search
  start = GetTime();
  for (int i = 0; i < kDataSize; ++i) {
    TreeMap::const_iterator it = my_map_tree.find(data[i]);
    if (it != my_map_tree.end()) {
      checksum += it->second;
    }
  }
  printf("tree map search:%d\n", GetTime() - start);

  // Publish the accumulator through a volatile so it counts as observable.
  volatile uint64 sink = checksum;
  (void)sink;
  (void)argc;
  (void)argv;
  return 0;
}

参考文献

[1]http://blog.csdn.net/liuben/article/details/5050697

[2]http://www.cnblogs.com/atlantis13579/archive/2010/02/06/1664792.html

[3]http://sites.google.com/site/murmurhash/

[4]http://blog.csdn.net/wisage/article/details/7104866

[5]http://code.google.com/p/cityhash/

[6]http://gcc.gnu.org/onlinedocs/libstdc++/ext/pb_ds/index.html

[7]http://en.wikipedia.org/wiki/Hash_table

[8]http://en.wikipedia.org/wiki/MurmurHash

[9]http://hi.baidu.com/fdwm_lx/blog/item/f670e73582c8411d90ef3950.html

[10]http://www.cnblogs.com/Frandy/archive/2011/07/26/Hash_map_Unordered_map.html