glib库 hash表实现分析

来源：互联网发布：ubuntu 安装区间编辑：程序博客网时间：2024/05/29 16:29

Hash Table的原理

哈希表的目的简单来说是为了实现存储多个key=>value关系（注意，此处是单项推导，不支持反向查找），一个比较简单的模型实现是用一个数组来存储这些关系，但是在插入数据时，并不在index最小的数组位置插入，而是直接通过函数算出这个key-value应该存储的位置，这样可以避免查找时遍历查找。

一个比较简单的实现方法是这样：

typedef struct{    T_Val value;    T_Pos next;}T_Item;struct _HashTable{    T_Item items[];    T_Pos hash_list_Headers[];}T_HashTable;

将具有相同hash值的条目组成一个链表，用T_Item::next域表示链表的下一个的位置，当插入的时候，首先计算出hash值，以该值作为索引，通过T_HashTable::hash_list_headers，找到该哈希链表的地一个位置T_Pos，然后遍历该链表，找到需要的key对应的value。

这种方法有什么缺点呢？
1. 链表在物理内存中不连续，造成cache命中减小
2. 如果用单项链表，则删除节点十分麻烦
3. 效率很低

一群牛人们在glib库中重新写了hash算法的模板，本文在此分析一下他的运行过程

大体算法：

  gpointer        *keys;  guint           *hashes;  gpointer        *values;

这三个数组长度相同，下标为x的三个元素 key[x] hashes[x] values[x]，共同描述这同一个key-value关系。
不会出现当x,y,z任意两个不想等时，key[x] hashes[y] values[z]，共同描述一个key-value关系。当插入一个
key-value关系时，伪代码如下：

find(key){    hash_val = get_hash_by_key( key);    for ( i = hash_val; HASH_IS_USED( hashes[i] ); i++ )    {        if ( HASH_VAL_TOMB == hash[i] )            continue;        if ( hash[i] == hash_val && keys[i] == key)            return values[i];   }}

首先计算该key的hash值，以该哈希值x为起始数组位置，找到hashes[x]与keys[x]同时匹配时，返回value

先来看一下最重要的数据结构，这个_GHashTable数据结构写在c文件中，对外不暴露细节，在glib.h文件中被
typedef为 GHashTable

struct GHashTable{  gint             size;  gint             mod;  guint            mask;  gint             nnodes;  gint             noccupied;  /* nnodes + tombstones */  gpointer        *keys;  guint           *hashes;  gpointer        *values;  GHashFunc        hash_func;  GEqualFunc       key_equal_func;  gint             ref_count;#ifndef G_DISABLE_ASSERT  /*   * Tracks the structure of the hash table, not its contents: is only   * incremented when a node is added or removed (is not incremented   * when the key or data of a node is modified).   */  int              version;#endif  GDestroyNotify   key_destroy_func;  GDestroyNotify   value_destroy_func;};

GHashTable *g_hash_table_new (GHashFunc  hash_func,                  GEqualFunc key_equal_func){  return g_hash_table_new_full (hash_func, key_equal_func, NULL, NULL);}GHashTable *g_hash_table_new_full (GHashFunc      hash_func,                       GEqualFunc     key_equal_func,                       GDestroyNotify key_destroy_func,                       GDestroyNotify value_destroy_func){  GHashTable *hash_table;  hash_table = g_slice_new (GHashTable);  /*g_slice_new和new是一样的*/  g_hash_table_set_shift (hash_table, HASH_TABLE_MIN_SHIFT);  /*这个函数很有意思，下文着重分析*/  hash_table->nnodes             = 0;  /*这个哈希表中，实际被真正占用的有多少个*/  hash_table->noccupied          = 0;  /*noccupied  = nnodes + tombs    tomb 坟墓，也就是被删除了的元素！    */  hash_table->hash_func          = hash_func ? hash_func : g_direct_hash;  hash_table->key_equal_func     = key_equal_func;  hash_table->ref_count          = 1;#ifndef G_DISABLE_ASSERT  hash_table->version            = 0;#endif  hash_table->key_destroy_func   = key_destroy_func;  hash_table->value_destroy_func = value_destroy_func;  hash_table->keys               = g_new0 (gpointer, hash_table->size);  hash_table->values             = hash_table->keys;  /*初始化时，将keys和values指向同一段内存，这是为了防止有key与value一直相等的情况，这样做可以节省内存*/  hash_table->hashes             = g_new0 (guint, hash_table->size);/*g_new0就是new*/  return hash_table;}

插入过程

gbooleang_hash_table_insert (GHashTable *hash_table,                     gpointer    key,                     gpointer    value){  return g_hash_table_insert_internal (hash_table, key, value, FALSE);}static gbooleang_hash_table_insert_internal (GHashTable *hash_table,                              gpointer    key,                              gpointer    value,                              gboolean    keep_new_key){  guint key_hash;  guint node_index;  g_return_val_if_fail (hash_table != NULL, FALSE);  /*hash_table==NULL 直接退出*/  node_index = g_hash_table_lookup_node (hash_table, key, &key_hash);  /*这个函数要么找到命中的node_index，如果没找到，则返回一个坟墓或者空项目的指针  */  return g_hash_table_insert_node (hash_table, node_index, key_hash, key, value, keep_new_key, FALSE);}static inline guintg_hash_table_lookup_node (GHashTable    *hash_table,                          gconstpointer  key,                          guint         *hash_return){  guint node_index;  guint node_hash;  guint hash_value;  guint first_tombstone = 0;  gboolean have_tombstone = FALSE;  guint step = 0;  g_assert (hash_table->ref_count > 0);  /*这里计算哈希值之后，又对哈希值做了处理，使之不能大于等于2*/  /*原因在于 0 表示该hash数组元素没有使用，1表示是坟墓*/  hash_value = hash_table->hash_func (key);  if (G_UNLIKELY (!HASH_IS_REAL (hash_value)))    hash_value = 2;  *hash_return = hash_value;  node_index = hash_value % hash_table->mod;  node_hash = hash_table->hashes[node_index];  /*tomb的hash为1 算是已使用，  正常的hash >= 2  未使用hash = 0*/  /*这里不会产生死循环，因为在要满之前，就会扩充内存，所以总会保证能够有一个空的哈希元素的  g_hash_table_resize*/  while (!HASH_IS_UNUSED (node_hash))    {      if (node_hash == hash_value)        {        /*哈希命中，则比对key*/          gpointer node_key = hash_table->keys[node_index];          if (hash_table->key_equal_func)            {              if (hash_table->key_equal_func (node_key, key))                return node_index;            }          else if (node_key == key)            {              return node_index;            }        }      else if (HASH_IS_TOMBSTONE (node_hash) && !have_tombstone)        {        /*如果是一个坟墓，则把下标记录下来，这样这个下标就可以作为插入的时候参考用*/          first_tombstone = node_index;          have_tombstone = TRUE;        }     /*只要不return 或者遇到没有使用的hash，则一直循环*/      step++;      node_index += step;      node_index &= hash_table->mask;      node_hash = hash_table->hashes[node_index];    }  /*走到这里发现，一直持续循环最后还是没命中，node_index*/  if (have_tombstone)    return first_tombstone;  return node_index;}

g_hash_table_lookup (GHashTable    *hash_table,                     gconstpointer  key){  guint node_index;  guint node_hash;  g_return_val_if_fail (hash_table != NULL, NULL); /*这里查询key又调用了这个函数*/  node_index = g_hash_table_lookup_node (hash_table, key, &node_hash);  return HASH_IS_REAL (hash_table->hashes[node_index])    ? hash_table->values[node_index]    : NULL;}

0 0