redis 源代码之数据结构(3)--hash表实现

来源：互联网发布：淘宝客服基本培训内容编辑：程序博客网时间：2024/05/18 20:37

hash表应用范围很广，实现一个hash表有两个重要因素：1. hash函数的选择，很多研究人员都给出了性能卓越的函数；2. 冲突的解决，最常见的是链表法（拉链法），此外还有开放定址法等方法。redis的hash表（在dict.c、dict.h中）用的hash函数是Thomas Wang's 32 bit Mix Function和MurmurHash2。整个hash实现相当精致，它最大的特色在于可以自动扩容，从而解决负载因子过大产生的问题。整个redis hash的内存布局如下：

redis hash的结构体定义如下

/* One node of a bucket's collision chain: a key, its value, and a link to
 * the next node stored in the same bucket. */
typedef struct dictEntry {
    void *key;
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
    } v;                       /* value: a pointer or a 64-bit integer */
    struct dictEntry *next;    /* next entry in the same bucket */
} dictEntry;

/* The basic callbacks used to operate on a hash table. The remarks below are
 * translated from the original author's notes; the callbacks' implementations
 * are not part of this excerpt. */
typedef struct dictType {
    unsigned int (*hashFunction)(const void *key);      /* #1: compute a hash value from a key */
    void *(*keyDup)(void *privdata, const void *key);   /* #2: store (duplicate) a key */
    void *(*valDup)(void *privdata, const void *obj);   /* #3: store (duplicate) a value */
    int (*keyCompare)(void *privdata, const void *key1, const void *key2); /* #4: compare two keys */
    void (*keyDestructor)(void *privdata, void *key);   /* #5: release a key's contents */
    void (*valDestructor)(void *privdata, void *obj);   /* #6: release a value */
} dictType;

/* This is our hash table structure. Every dictionary has two of this as we
 * implement incremental rehashing, for the old to the new table. */
typedef struct dictht {
    dictEntry **table;         /* array of bucket heads */
    unsigned long size;        /* number of buckets (per the original note, always a power of two) */
    unsigned long sizemask;    /* per the original note, size - 1; a bucket index is obtained
                                * as (hash & sizemask), a bitwise AND rather than a modulo */
    unsigned long used;        /* number of entries stored in the table (dictRehash adjusts it
                                * once per moved entry), not the number of occupied buckets */
} dictht;

typedef struct dict {
    dictType *type;
    void *privdata;
    dictht ht[2];              /* two tables: per the original notes, new elements go into ht[0]
                                * until an expansion is triggered by the load factor
                                * (elements / buckets; the notes cite dict_force_resize_ratio = 5);
                                * the expansion policy itself is not shown in this excerpt */
    int rehashidx;             /* rehashing not in progress if rehashidx == -1 */
    int iterators;             /* number of iterators currently running; per the original note,
                                * rehashing is not allowed while iterators > 0 */
} dict;

/* If safe is set to 1 this is a safe iterator, that means, you can call
 * dictAdd, dictFind, and other functions against the dictionary even while
 * iterating. Otherwise it is a non safe iterator, and only dictNext()
 * should be called while iterating. */
typedef struct dictIterator {
    dict *d;
    int table, index, safe;
    dictEntry *entry, *nextEntry;
} dictIterator;

1,hash表的创建

/*
 * Create a new hash table.
 *
 * type        - callback table handed to _dictInit.
 * privDataPtr - opaque pointer handed to _dictInit. The original author notes
 *               that its purpose was not yet clear to them; presumably it is
 *               the privdata argument passed to the dictType callbacks
 *               (TODO confirm against _dictInit).
 *
 * Returns the newly allocated dict.
 */
dict *dictCreate(dictType *type,
        void *privDataPtr)
{
    /* Per the original note, zmalloc is Redis's wrapper around malloc
     * (backed by the jemalloc library). */
    dict *d = zmalloc(sizeof(*d));

    /* Per the original note, _dictInit initialises the fields of the dict
     * and calls _dictReset to initialise ht[0] and ht[1]. */
    _dictInit(d,type,privDataPtr);
    return d;
}

2,向hash表添加元素

创建hash表的时候，只分配了dict结构体本身，并没有为hash表的桶数组（table）申请内存空间；当增加一个key的时候，才会真正分配hash表的内存。

/*
 * Add a key/value pair to the hash table.
 *
 * The entry for the key is obtained from dictAddRaw; when that yields no
 * entry the call fails with DICT_ERR, otherwise the value is stored in the
 * entry and DICT_OK is returned.
 */
int dictAdd(dict *d, void *key, void *val)
{
    dictEntry *slot = dictAddRaw(d, key);

    if (slot == NULL) {
        return DICT_ERR;
    }

    dictSetVal(d, slot, val);
    return DICT_OK;
}

增加key的函数调用链：dictAdd -> dictAddRaw -> _dictKeyIndex（这个函数主要获取该key在桶中的位置，如果该key已经存在，则返回-1）-> dictSetKey

若正在处于rehash中，则在ht[1]表中插入key，否则只在ht[0]中插入key。

/* Abridged excerpt of _dictKeyIndex: local declarations, the body of the
 * lookup loop and the end of the function are elided by the article. */
static int _dictKeyIndex(dict *d, const void *key){
    /* ... (elided in this excerpt) */

    /* Memory for the hash buckets is allocated here. */
    if (_dictExpandIfNeeded(d) == DICT_ERR)
        return -1;

    /* Compute the hash of the key, used to locate its bucket. */
    h = dictHashKey(d, key);
    for (table = 0; table <= 1; table++) {
       /* Check the key to make sure there is no duplicate; if there is one,
        * return -1 directly. (loop body elided in this excerpt) */
}

_dictKeyIndex会调用_dictExpandIfNeeded进行扩容，_dictExpandIfNeeded内部调用dictExpand。dictExpand会声明一个dictht变量：如果ht[0]的table为NULL，就用该变量初始化ht[0]；否则就用它初始化ht[1]，并将rehashidx设置为0，为rehash做准备。

3，redis的hash表实现rehash

/*
 * Perform up to n steps of incremental rehashing.
 *
 * Each step skips any empty buckets at rehashidx, then moves every entry of
 * the next non-empty ht[0] bucket into ht[1], re-computing each entry's
 * bucket index with ht[1]'s sizemask.
 *
 * Returns 0 if no rehash is in progress, or if ht[0] was found empty at the
 * start of a step: in that case the old bucket array is freed, ht[1] is
 * copied into ht[0], ht[1] is reset and rehashidx is set to -1.
 * Returns 1 otherwise. Because the emptiness check runs at the start of a
 * step, the call that moves the last entries still returns 1; the following
 * call performs the table hand-over and returns 0.
 */
int dictRehash(dict *d, int n) {
    if (!dictIsRehashing(d)) return 0;

    while(n--) {
        dictEntry *de, *nextde;

        /* Check if we already rehashed the whole table... */
        if (d->ht[0].used == 0) {
            zfree(d->ht[0].table);
            d->ht[0] = d->ht[1];
            _dictReset(&d->ht[1]);
            d->rehashidx = -1;
            return 0;
        }

        /* Note that rehashidx can't overflow as we are sure there are more
         * elements because ht[0].used != 0 */
        assert(d->ht[0].size > (unsigned)d->rehashidx);
        /* Skip empty buckets. */
        while(d->ht[0].table[d->rehashidx] == NULL) d->rehashidx++;
        /* First entry of the bucket to migrate. */
        de = d->ht[0].table[d->rehashidx];
        /* Move all the keys in this bucket from the old to the new hash HT */
        while(de) {
            unsigned int h;

            /* Save the successor before de->next is overwritten below. */
            nextde = de->next;
            /* Get the index in the new hash table */
            h = dictHashKey(d, de->key) & d->ht[1].sizemask;
            /* Push the entry at the head of the destination bucket. */
            de->next = d->ht[1].table[h];
            d->ht[1].table[h] = de;
            d->ht[0].used--;
            d->ht[1].used++;
            de = nextde;
        }
        /* Clear the head of the migrated bucket. */
        d->ht[0].table[d->rehashidx] = NULL;
        d->rehashidx++;
    }
    return 1;
}

真正rehash的过程并不是一次就完成的。如果对一百万个key一次性进行rehash，会导致整个服务卡在rehash的过程上，导致局部过热；因此，作者采用渐进式的rehash，也就是将rehash的操作平摊到dictAddRaw（增加key）、dictGenericDelete（删除key）、dictFind（找到key）、dictGetRandomKey（随机获得一个key）这些操作中，使rehash更加平滑。