HashTable—闭散列与开链法

来源：互联网发布：php从入门到精通视频编辑：程序博客网时间：2024/05/20 19:46

哈希表，又称散列表，是搜索方法之一，其特点为根据关键字（key）直接访问在内存中的位置

直接定址法
举一个例子，现在有如下一组字符

// Sample input for the direct-addressing example.
// String literals are const; binding them to plain `char*` is ill-formed since C++11.
const char* arr[] = { "hashtable" };

接着定义一个大小为256的数组Hash，由于是字符型char，这些字符一定可以在这个数组中找到一个对应的位置进行插入；我们将这个表称为哈希表，搜索时直接将字符自身作为下标便能找到其所存位置；
而根据key的使用方式又分为两种方法：直接使用和间接使用。上述方法便是直接使用key，我们称之为直接定址法；间接使用的方法中最为常用的是除留余数法，当然，间接法还有很多，如平方取中法，随机数法等；
除留余数法

就如其名，这种方法是将key对表的大小取模，从而得到一个小于这个表大小的index，再对应插入，即：Hash（key）=key%size。那么当我们有这样一组数据：

    // Sample keys whose remainders modulo 10 are all different (7, 5, 8, 6).
    int arr1[] = {17, 10005, 108, 1006};

我们开一个大小为10的数组，就完全可以进行存储；之前的一篇博客中我们介绍到了map与set，其底层为RBTree，而在STL源代码中，还存在unordered_map与unordered_set，区别就在于这两个容器的底层是用哈希表实现的，那么我们在实现哈希表时，便也使用K,V格式，便于unordered_map的使用；
但是在写之前，我们还有一个重要的问题需要考虑，假设我们的表大小为10，而存在一组数据如下：

    // Sample keys that collide modulo 10: 89, 9 and 49 all leave remainder 9.
    int arr1[] = {89, 18, 58, 9, 49};

这时，89和9还有49模10都是9，这种情况叫做哈希冲突，指的便是不同的数经过同一个函数处理后映射到相同的位置。而解决这种情况，有两种方法，首先使用第一种方法：
闭散列式
当我们遇到哈希冲突时，检测其下一个位置是否有数据，没有则放置：
即：放置数据的位置为（Hash（key）+i）%size（i=0,1,2……），到达表尾后回绕到表头继续探测

这种方法称为线性探测，当然这种方法缺陷也很明显：数据集中；因此又出现了二次探测，即（Hash（key）+i²）%size（i=0,1,2……）

这种方法使数据更为分散，对于哈希表来说，数据越分散效率越高，但相对来说，这样代码更加复杂，并且当冲突过多，i值太大时，位置并不好找，各有优缺；
而哈希表的大小虽然无特殊规定，但在SGI STL中还是使用了素数作为表格大小

// Table-size schedule: 28 ascending primes, each roughly twice the previous one.
const int _PrimeSize = 28;

static const unsigned long _PrimeList[_PrimeSize] =
{
    53UL,         97UL,         193UL,        389UL,
    769UL,        1543UL,       3079UL,       6151UL,
    12289UL,      24593UL,      49157UL,      98317UL,
    196613UL,     393241UL,     786433UL,     1572869UL,
    3145739UL,    6291469UL,    12582917UL,   25165843UL,
    50331653UL,   100663319UL,  201326611UL,  402653189UL,
    805306457UL,  1610612741UL, 3221225473UL, 4294967291UL
};

那么万事俱备我们便可以着手写一个闭散列式的哈希表了
哈希表节点

// State of one slot in the closed-hashing table.
enum Status
{
    EXIST,   // slot currently holds an entry
    DELETE,  // slot held an entry that was removed; lookups must keep probing past it
    EMPTY,   // initial state given to every node by the HashNode constructor
};

// One slot of the closed-hashing table: a key, its value and the slot state.
template<class K, class V>
struct HashNode
{
    K      _key;
    V      _value;
    Status _status;

    // Builds a slot holding (key, value); the slot always starts out EMPTY.
    HashNode(const K& key = K(), const V& value = V())
        : _key(key), _value(value), _status(EMPTY)
    {
    }
};

其中枚举变量status是为了使我们找节点时，避免由于节点删除使我们找不到后面的节点：

这里写图片描述

在删除9后，若不标注为删除状态，则49我们便找不到了；
HashTable类

template<class K, class V,class HashFunc=__HashFunc<K>>class HashTable{    typedef HashNode<K, V>  Node;public:    HashTable(size_t size)        :_size(0)    {        assert(size > 0);        _tables.resize(size);    }    ~HashTable()    {}    pair<Node*,bool> Insert(const K& key,const V& value )    {        CheckCapacity();        //线性探测        size_t index = HashFunction(key);        while (_tables[index]._status == EXIST)        {            if (_tables[index]._key == key)            {                return make_pair(&_tables[index], false);            }            ++index;            if (index == _tables.size())                index = 0;        }        _tables[index]._key = key;        _tables[index]._value = value;        _tables[index]._status = EXIST;        ++_size;        return make_pair(&_tables[index], true);    }    Node* Find(const K& key)    {        size_t index = HashFunction(key);        while (_tables[index]._status != EMPTY)        {            if (_tables[index]._key == key)            {                if (_tables[index]._status != DELETE)                    return &_tables[index];            }            else                return NULL;            ++index;            if (index == _tables.size())                index = 0;        }        return NULL;    }    bool Remove(const K& key,const V& value)    {        size_t index = HashFunction(key);        while (_tables[index]._status == EXIST)        {            if (_tables[index]._key == key)            {                _tables[index]._status = DELETE;                return true;            }            ++index;            if (index == _tables.size())                index = 0;        }        return false;    }protected:    size_t HashFunction(const K& key)//求得模值    {        HashFunc k;        size_t size = k(key);        return size%_tables.size();    }    void Swap(HashTable<K, V, HashFunc>& tmp)//为扩容中交换节点提供交换（现代写法）    {        swap(_tables, tmp._tables);        swap(_size, tmp._size);    }    
void CheckCapacity()//当负载因子大于等于0.7时，取下一个素数作为新的表格大小    {        if (_size * 10 / _tables.size() >= 7)        {            size_t NewSize = GetNextPrime(_tables.size());            HashTable<K, V, HashFunc> tmp(NewSize);            for (size_t i = 0; i < _tables.size(); i++)            {                if (_tables[i]._status == EXIST)                {                    tmp.Insert(_tables[i]._key, _tables[i]._value);                }            }            Swap(tmp);            return;        }    }    size_t GetNextPrime(size_t num)//素数表返回表格大小    {        static size_t index = 0;        const int _PrimeSize = 28;        static const unsigned long _PrimeList[_PrimeSize] =        {            53ul, 97ul, 193ul, 389ul, 769ul,            1543ul, 3079ul, 6151ul, 12289ul, 24593ul,            49157ul, 98317ul, 196613ul, 393241ul, 786433ul,            1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,            50331653ul, 100663319ul, 201326611ul, 402653189ul,            805306457ul, 1610612741ul, 3221225473ul, 4294967291ul        };        for (size_t i = 0; i<_PrimeSize; ++i)        {            if (_PrimeList[i]>num)            {                return _PrimeList[i];            }        }        return _PrimeList[27];    }protected:    vector<Node> _tables;    size_t _size;};

实现的接口共有Insert,Remove,Find;

其中需要解释有两点

1.模板参数最后一项__HashFunc，是为了使这个表可以存任意类型，当数据为string时，不能进行%表格大小，因此做一个仿函数，并在此处将string 模板特化，这样使用string也可以直接使用

// Default hash functor: converts the key itself to size_t.
// Suitable for key types that convert to an integer.
template<class K>
struct __HashFunc
{
    size_t operator()(const K& key) const
    {
        return static_cast<size_t>(key);
    }
};

// Specialization for string keys, which cannot be used with % directly:
// the characters are folded into an integer with the BKDR string hash.
template<>
struct __HashFunc<std::string>
{
    size_t operator()(const std::string& s) const
    {
        return BKDR_Hash(s.c_str());
    }

    // BKDR hash of a NUL-terminated string (multiplier 131); the result is
    // masked to 31 bits.
    static size_t BKDR_Hash(const char* str)
    {
        unsigned int seed = 131;
        unsigned int hash = 0;
        while (*str)
        {
            hash = hash * seed + (*str++);
        }
        return (hash & 0x7FFFFFFF);
    }
};

2.扩容中写到的负载因子，这个概念是由于哈希表本身存在一个悖论：所存数据越多浪费空间越少，但效率越低；数据存的越少，效率就越高，但浪费空间就越多，因此将存储数据个数/表格大小的值称为负载因子，并将其保持在0.7~0.9时，对于效率和空间同时而言最优。

当然还有第二种方法，通过一种顺序表加链表的方法，即在哈希表中每个节点不再只存储数据，而是存储一个指向一个节点的指针，这样在遇到哈希冲突时可以通过节点一直向下存储或访问，这种方法称为：
开链法
这里写图片描述
当然在这种情况下，显然之前的负载因子就不再适用于判断，我们定义此时的负载因子保持在1最好，而如果单个节点下链的节点过多（哈希冲突过多），可以选择在这一单个节点下挂红黑树，从而提高访问效率
哈希桶（开链法）代码

#pragma once
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>
using namespace std;  // kept only so existing includers that rely on it still build

// Separate-chaining ("hash bucket") hash table. It lives in its own namespace
// so its names do not clash with the closed-hashing implementation.
namespace Bucket
{
    template<class K, class V>
    class HashTable;

    // Singly linked chain node holding one key/value pair.
    template<class K, class V>
    struct HashNode
    {
        std::pair<K, V> _kv;
        HashNode<K, V>* _next;

        HashNode(const std::pair<K, V>& kv)
            : _kv(kv)
            , _next(nullptr)
        {}
    };

    // Forward iterator over every node of a HashTable.
    // Ref / Ptr select the reference and pointer types returned by * and ->.
    template<class K, class V, class Ref, class Ptr>
    struct HashTableIterator
    {
        typedef HashNode<K, V> Node;
        typedef HashTableIterator<K, V, Ref, Ptr> Self;

        Node* _node;            // current node; nullptr marks the end position
        HashTable<K, V>* _ht;   // table being traversed (needed to hop between buckets)

        HashTableIterator(Node* node, HashTable<K, V>* ht)
            : _node(node)
            , _ht(ht)
        {}

        Ref operator*()
        {
            return _node->_kv;
        }

        Ptr operator->()
        {
            return &(operator*());
        }

        // Pre-increment; advancing the end iterator leaves it at the end.
        Self& operator++()
        {
            if (_node)
                _node = Next(_node);
            return *this;
        }

        // Post-increment.
        Self operator++(int)
        {
            Self before(*this);
            ++(*this);
            return before;
        }

        bool operator!=(const Self& s) const
        {
            return _node != s._node;
        }

        bool operator==(const Self& s) const
        {
            return _node == s._node;
        }

        // Returns the node following `node`: the rest of its chain first, then
        // the head of the next non-empty bucket; nullptr when none is left.
        Node* Next(Node* node)
        {
            if (node->_next)
                return node->_next;

            size_t index = _ht->HashFunc(node->_kv.first) + 1;
            for (; index < _ht->_tables.size(); ++index)
            {
                if (_ht->_tables[index])
                    return _ht->_tables[index];
            }
            return nullptr;
        }
    };

    // Hash table resolving collisions with one linked list per bucket.
    // K must support `key % bucket_count` and operator==.
    template<class K, class V>
    class HashTable
    {
        typedef HashNode<K, V> Node;

        // Iterators walk the bucket array and call HashFunc, both protected.
        template<class, class, class, class>
        friend struct HashTableIterator;

    public:
        typedef HashTableIterator<K, V, std::pair<K, V>&, std::pair<K, V>*> Iterator;
        typedef HashTableIterator<K, V, const std::pair<K, V>&, const std::pair<K, V>*> ConstIterator;

        HashTable()
            : _size(0)
        {}

        // Creates a table with `size` initially empty buckets.
        HashTable(size_t size)
            : _size(0)
        {
            _tables.resize(size);
        }

        // Deep copy: the table owns its nodes, so sharing them would make both
        // destructors delete the same memory.
        HashTable(const HashTable& other)
            : _tables(other._tables.size())
            , _size(0)
        {
            try
            {
                for (size_t index = 0; index < other._tables.size(); ++index)
                {
                    for (const Node* cur = other._tables[index]; cur; cur = cur->_next)
                    {
                        Node* copy = new Node(cur->_kv);
                        copy->_next = _tables[index];
                        _tables[index] = copy;
                        ++_size;
                    }
                }
            }
            catch (...)
            {
                Clear();    // the destructor will not run for a half-built object
                throw;
            }
        }

        // Copy-and-swap assignment.
        HashTable& operator=(HashTable other)
        {
            Swap(other);
            return *this;
        }

        ~HashTable()
        {
            Clear();
        }

        // Inserts kv. Returns {node, true} for a new key and
        // {existing node, false} when the key is already stored.
        std::pair<Node*, bool> Insert(const std::pair<K, V>& kv)
        {
            if (Node* existing = Find(kv.first))
                return std::make_pair(existing, false);

            CheckCapacity();    // also allocates the first buckets of an empty table

            const size_t index = HashFunc(kv.first);
            Node* fresh = new Node(kv);
            fresh->_next = _tables[index];      // push at the head of the chain
            _tables[index] = fresh;
            ++_size;
            return std::make_pair(fresh, true);
        }

        // Returns the node stored under `key`, or nullptr when absent.
        Node* Find(const K& key)
        {
            if (_tables.empty())
                return nullptr;                 // avoids `key % 0`

            for (Node* cur = _tables[HashFunc(key)]; cur; cur = cur->_next)
            {
                if (cur->_kv.first == key)
                    return cur;
            }
            return nullptr;
        }

        // Removes the entry whose key equals kv.first (kv.second is not
        // compared). Returns true when an entry was removed.
        bool Remove(const std::pair<K, V>& kv)
        {
            if (_tables.empty())
                return false;                   // avoids `key % 0`

            const size_t index = HashFunc(kv.first);
            Node* prev = nullptr;
            for (Node* cur = _tables[index]; cur; prev = cur, cur = cur->_next)
            {
                if (cur->_kv.first == kv.first)
                {
                    if (prev == nullptr)
                        _tables[index] = cur->_next;
                    else
                        prev->_next = cur->_next;
                    delete cur;
                    --_size;
                    return true;
                }
            }
            return false;
        }

        // Iterator to the first node, or End() when the table holds nothing.
        Iterator Begin()
        {
            for (size_t index = 0; index < _tables.size(); ++index)
            {
                if (_tables[index])
                    return Iterator(_tables[index], this);
            }
            return End();
        }

        // Past-the-end iterator.
        Iterator End()
        {
            return Iterator(static_cast<Node*>(nullptr), this);
        }

    protected:
        // Returns the first prime in the schedule greater than `num`, or the
        // largest prime of the schedule when none is greater.
        size_t GetNextPrime(size_t num)
        {
            const size_t _PrimeSize = 28;
            static const unsigned long _PrimeList[_PrimeSize] =
            {
                53ul, 97ul, 193ul, 389ul, 769ul,
                1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
                49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
                1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
                50331653ul, 100663319ul, 201326611ul, 402653189ul,
                805306457ul, 1610612741ul, 3221225473ul, 4294967291ul
            };
            for (size_t i = 0; i < _PrimeSize; ++i)
            {
                if (_PrimeList[i] > num)
                    return _PrimeList[i];
            }
            return _PrimeList[_PrimeSize - 1];
        }

        // Bucket index of `key` in the current table; the table must not be empty.
        size_t HashFunc(const K& key)
        {
            return key % _tables.size();
        }

        // Grows the bucket array once the element count equals the bucket count
        // (load factor 1). Existing nodes are relinked into the new buckets, so
        // nothing is allocated per element and node addresses stay valid.
        void CheckCapacity()
        {
            if (_size != _tables.size())
                return;

            const size_t newSize = GetNextPrime(_tables.size());
            if (newSize <= _tables.size())
                return;                         // already at the largest size

            std::vector<Node*> grown(newSize);  // value-initialized: all null
            for (size_t index = 0; index < _tables.size(); ++index)
            {
                Node* cur = _tables[index];
                while (cur)
                {
                    Node* following = cur->_next;
                    const size_t slot = cur->_kv.first % newSize;
                    cur->_next = grown[slot];
                    grown[slot] = cur;
                    cur = following;
                }
            }
            _tables.swap(grown);
        }

        // Exchanges the contents of two tables. The argument is taken by
        // reference: swapping with a by-value copy would leave two tables
        // owning (and later deleting) the same nodes.
        void Swap(HashTable<K, V>& tmp)
        {
            _tables.swap(tmp._tables);
            std::swap(_size, tmp._size);
        }

        // Deletes every node and leaves all buckets empty.
        void Clear()
        {
            for (size_t index = 0; index < _tables.size(); ++index)
            {
                Node* cur = _tables[index];
                while (cur)
                {
                    Node* doomed = cur;
                    cur = cur->_next;
                    delete doomed;
                }
                _tables[index] = nullptr;
            }
            _size = 0;
        }

    protected:
        std::vector<Node*> _tables;  // bucket heads
        size_t _size;                // number of stored elements
    };
}  // namespace Bucket

0 0