Huffman codes

来源:互联网 发布:淘宝网如何做推广 编辑:程序博客网 时间:2024/06/04 00:32

参考维基百科与《算法导论》

霍夫曼编码(Huffman Coding)是一种编码方式,是一种用于无损数据压缩的熵编码(权编码)算法,也称"哈夫曼编码"或"赫夫曼编码"。该编码由David A. Huffman于1952年在麻省理工攻读博士时发明,并发表于《一种构建极小多余编码的方法》(A Method for the Construction of Minimum-Redundancy Codes)一文。

计算机数据处理中,霍夫曼编码使用变长编码表对源符号(如文件中的一个字母)进行编码,其中变长编码表是通过一种评估来源符号出现机率的方法得到的,出现机率高的字母使用较短的编码,反之出现机率低的则使用较长的编码,这便使编码之后的字符串的平均长度、期望值降低,从而达到无损压缩数据的目的。

例如,在英文中,e的出现机率最高,而z的出现概率则最低。当利用霍夫曼编码对一篇英文进行压缩时,e极有可能用一个比特来表示,而z则可能花去25比特(不是26)。用普通的表示方法时,每个英文字母均占用一个字节(byte),即8比特。二者相比,e使用了一般编码的1/8的长度,z则使用了3倍多。倘若我们能实现对于英文中各个字母出现概率的较准确的估算,就可以大幅度提高无损压缩的比例。

霍夫曼树又称最优二叉树,是一种带权路径长度最短的二叉树。所谓树的带权路径长度,就是树中所有的叶结点的权值乘上其到根结点的路径长度(若根结点为0层,叶结点到根结点的路径长度为叶结点的层数)。树的路径长度是从树根到每一结点的路径长度之和;树的带权路径长度记为 $WPL = W_1 L_1 + W_2 L_2 + W_3 L_3 + \cdots + W_n L_n$,其中 n 个权值 $W_i$($i = 1, 2, \ldots, n$)构成一棵有 n 个叶结点的二叉树,相应的叶结点的路径长度为 $L_i$($i = 1, 2, \ldots, n$)。可以证明霍夫曼树的WPL是最小的。

霍夫曼编码可以有效地压缩数据:通常可以节省20%~90%的空间,具体压缩率依赖于数据的特征。变长编码(variable-length code)赋予高频字符短码字,赋予低频字符长码字,这样可以达到比定长编码好得多的压缩率。前缀码(prefix code)即没有任何码字是其他码字的前缀的编码。

霍夫曼设计了一个贪心算法来构造最优前缀码,被称为霍夫曼编码(Huffman code)。

赫夫曼编码的实现 

在实现中,我们假设C是一个含n个字符的集合,而其中每个字符c∈C都是一个对象,其属性c.freq给出了字符的出现频率。算法自底向上地构建出对应最优编码的二叉树T。它从|C|个叶子结点开始,执行|C|-1次合并操作创建出最终的二叉树。算法使用一个以属性freq为关键字的最小优先队列Q,以识别两个最低频率的对象并将其合并。当合并两个对象时,得到的新对象的频率设置为原来两个对象的频率之和。


代码实现1:

//=============================================================
// Huffman coding (2014/8/18)
// The min-priority queue is a binary min-heap stored in an array.
//=============================================================
#include <stdio.h>
#include <stdlib.h>

#include <iostream>
#include <map>
#include <vector>

// One input symbol together with its occurrence count.
typedef struct hmap {
    char c;
    int freq;
} HMap;

// Huffman tree node. Leaves carry a symbol in `c`; internal nodes carry
// '\0' in `c` and the sum of their children's frequencies in `freq`.
typedef struct hnode {
    char c;
    int freq;
    struct hnode *left;
    struct hnode *right;
} HNode, *HTree;

// Index arithmetic for a binary heap laid out in a 0-based array.
#define LEFT(i)   (2 * (i) + 1)
#define RIGHT(i)  (2 * (i) + 2)
#define PARENT(i) (((i) - 1) / 2)

//=============================================================
// Release a whole (sub)tree. Accepts NULL.
//=============================================================
void FreeTree(HTree root)
{
    if (root == NULL)
        return;
    FreeTree(root->left);
    FreeTree(root->right);
    free(root);
}

//=============================================================
// Sift the element at index i down until the min-heap property
// holds again for the first n entries of arr.
// Both subtrees of i must already be valid heaps.
//=============================================================
void Heapify(HNode **arr, int n, int i)
{
    int j = i;
    while (LEFT(j) < n) {
        // Pick the smaller of the (one or two) children.
        int smallest = LEFT(j);
        if (RIGHT(j) < n && arr[RIGHT(j)]->freq < arr[smallest]->freq)
            smallest = RIGHT(j);

        // Parent already not larger than both children: done.
        if (arr[j]->freq <= arr[smallest]->freq)
            break;

        HNode *tmp = arr[j];
        arr[j] = arr[smallest];
        arr[smallest] = tmp;
        j = smallest;
    }
}

//=============================================================
// Turn the first n entries of arr into a min-heap (bottom-up).
//=============================================================
void BuildHeap(HNode **arr, int n)
{
    for (int i = n / 2 - 1; i >= 0; i--) {
        Heapify(arr, n, i);
    }
}

//=============================================================
// Remove and return the node with the smallest freq from a heap
// holding n entries. The caller must treat the heap as holding
// n - 1 entries afterwards. Returns NULL for an empty heap.
//=============================================================
HNode *ExtractMin(HNode **arr, int n)
{
    if (n <= 0)
        return NULL;

    HNode *min = arr[0];
    arr[0] = arr[n - 1];
    arr[n - 1] = NULL;
    Heapify(arr, n - 1, 0);
    return min;
}

//=============================================================
// Insert x into a heap that currently holds n entries; arr must
// have room for at least n + 1 pointers. x is placed at index n
// and sifted up by shifting larger parents down.
//=============================================================
void MinHeapInsert(HNode **arr, int n, HNode *x)
{
    while (n > 0 && arr[PARENT(n)]->freq > x->freq) {
        arr[n] = arr[PARENT(n)];
        n = PARENT(n);
    }
    arr[n] = x;
}

//=============================================================
// Free every subtree still referenced by the first n slots of Q
// and then Q itself. Used on the error paths of Huffman().
//=============================================================
static void ReleaseQueue(HNode **Q, int n)
{
    for (int i = 0; i < n; i++)
        FreeTree(Q[i]);
    free(Q);
}

//=========================================================
// Build the Huffman tree for the n symbols in C.
// Returns the root (release it with FreeTree), or NULL when the
// input is empty or an allocation fails.
//=========================================================
HTree Huffman(HMap *C, int n)
{
    if (C == NULL || n <= 0)
        return NULL;

    HNode **Q = (HNode **)malloc(sizeof(HNode *) * n);
    if (!Q) {
        printf("Q malloc error\n");
        return NULL;
    }
    for (int i = 0; i < n; i++)
        Q[i] = NULL;

    // One leaf per input symbol.
    for (int i = 0; i < n; i++) {
        HNode *p = (HNode *)malloc(sizeof(HNode));
        if (!p) {
            printf("p malloc error\n");
            ReleaseQueue(Q, n);
            return NULL;
        }
        p->c = C[i].c;
        p->freq = C[i].freq;
        p->left = p->right = NULL;
        Q[i] = p;
    }

    BuildHeap(Q, n);

    // Repeatedly merge the two least frequent subtrees. `size` is the
    // number of entries in the heap at the start of each round.
    for (int size = n; size > 1; size--) {
        HNode *z = (HNode *)malloc(sizeof(HNode));
        if (!z) {
            printf("z malloc error\n");
            ReleaseQueue(Q, size);
            return NULL;
        }
        HNode *x = ExtractMin(Q, size);
        HNode *y = ExtractMin(Q, size - 1);
        z->c = '\0';  // internal nodes carry no symbol
        z->left = x;
        z->right = y;
        z->freq = x->freq + y->freq;
        MinHeapInsert(Q, size - 2, z);
    }

    HTree root = ExtractMin(Q, 1);
    free(Q);  // the tree owns the nodes; only the heap array goes away
    return root;
}

HMap C[] = {
    {'a', 35}, {'b', 13}, {'c', 12},
    {'d', 16}, {'e', 9}, {'f', 5}, {'g', 10}
};

using namespace std;

typedef vector<int> Huff_code;   // bit sequence (0/1) of one symbol
map<char, Huff_code> Huff_Dic;   // symbol -> Huffman code

//=============================================================
// Walk the tree and record the code of every leaf in Huff_Dic.
// curcode is the path prefix leading to root (0 = left, 1 = right);
// it is restored to its incoming value before the function returns.
//=============================================================
void Huffman_Coding(HTree root, Huff_code& curcode)
{
    if (root == NULL)
        return;

    if (root->left == NULL && root->right == NULL) {
        Huff_code& code = Huff_Dic[root->c];
        code = curcode;
        // A tree consisting of a single leaf would otherwise produce an
        // empty, undecodable code; give the lone symbol one bit.
        if (code.empty())
            code.push_back(0);
        return;
    }

    curcode.push_back(0);
    Huffman_Coding(root->left, curcode);
    curcode.back() = 1;
    Huffman_Coding(root->right, curcode);
    curcode.pop_back();
}

int main()
{
    int n = sizeof(C) / sizeof(C[0]);
    HTree root = Huffman(C, n);
    if (root == NULL)
        return 1;

    Huff_code prefix;
    Huffman_Coding(root, prefix);

    // Print every symbol followed by its Huffman code.
    for (map<char, Huff_code>::iterator it = Huff_Dic.begin(); it != Huff_Dic.end(); ++it) {
        cout << it->first << '\t';
        for (vector<int>::iterator vit = it->second.begin(); vit != it->second.end(); ++vit) {
            cout << *vit;
        }
        cout << '\n';
    }

    FreeTree(root);
    return 0;
}

代码实现2:

//=============================================================
// Huffman coding (2014/8/18)
// The min-priority queue is a plain array scanned linearly.
//=============================================================
#include <stdio.h>
#include <stdlib.h>

#include <iostream>
#include <map>
#include <vector>

// One input symbol together with its occurrence count.
typedef struct hmap {
    char c;
    int freq;
} HMap;

// Huffman tree node. Leaves carry a symbol in `c`; internal nodes carry
// '\0' in `c` and the sum of their children's frequencies in `freq`.
typedef struct hnode {
    char c;
    int freq;
    struct hnode *left;
    struct hnode *right;
} HNode, *HTree;

// Queue with 2n-1 slots: one per node the finished tree will contain.
// visited[i] marks that node[i] has already been extracted.
typedef struct qnode {
    HNode **node;
    bool *visited;
} QNode, *QUEUE;

//=========================================================
// Release a whole (sub)tree. Accepts NULL.
//=========================================================
void FreeTree(HTree root)
{
    if (root == NULL)
        return;
    FreeTree(root->left);
    FreeTree(root->right);
    free(root);
}

//=========================================================
// Release a queue created by CreateQueue(n). When freeNodes is
// true every node stored in the queue is freed as well (each
// node is freed individually, not as a subtree, because all
// nodes of the tree live in the queue's slots).
//=========================================================
void DestroyQueue(QUEUE Q, int n, bool freeNodes)
{
    if (Q == NULL)
        return;
    if (freeNodes && Q->node != NULL) {
        for (int i = 0; i < 2 * n - 1; i++)
            free(Q->node[i]);
    }
    free(Q->node);
    free(Q->visited);
    free(Q);
}

//=========================================================
// Create an empty queue able to hold the 2n-1 nodes of a
// Huffman tree over n symbols. Returns NULL when n <= 0 or an
// allocation fails.
//=========================================================
QUEUE CreateQueue(int n)
{
    if (n <= 0)
        return NULL;

    int slots = 2 * n - 1;

    QUEUE Q = (QNode *)malloc(sizeof(QNode));
    if (!Q) {
        printf("Q malloc error\n");
        return NULL;
    }

    Q->node = (HNode **)malloc(sizeof(HNode *) * slots);
    if (!Q->node) {
        free(Q);
        printf("Q->node malloc error\n");
        return NULL;
    }
    for (int i = 0; i < slots; i++) {
        Q->node[i] = NULL;
    }

    Q->visited = (bool *)malloc(sizeof(bool) * slots);
    if (!Q->visited) {
        // Release the member before the struct that holds the pointer.
        free(Q->node);
        free(Q);
        printf("Q->visited malloc error\n");
        return NULL;
    }
    for (int i = 0; i < slots; i++) {
        Q->visited[i] = false;
    }

    return Q;
}

//=========================================================
// Return the not-yet-extracted node with the smallest freq and
// mark it as extracted. Ties go to the lowest slot index.
// Returns NULL when nothing is left to extract.
//=========================================================
HNode *ExtractMin(QUEUE Q, int n)
{
    if (Q == NULL)
        return NULL;

    int index = -1;  // slot of the best candidate found so far
    for (int i = 0; i < 2 * n - 1; i++) {
        if (Q->node[i] == NULL || Q->visited[i])
            continue;
        // Compare against the current candidate rather than a fixed
        // sentinel so that arbitrarily large frequencies are handled.
        if (index == -1 || Q->node[i]->freq < Q->node[index]->freq)
            index = i;
    }

    if (index == -1)
        return NULL;

    Q->visited[index] = true;
    return Q->node[index];
}

//=========================================================
// Store node in the first free slot of the queue. Does nothing
// if every one of the 2n-1 slots is already occupied.
//=========================================================
void QueueInsert(QUEUE Q, int n, HNode *node)
{
    for (int index = 0; index < 2 * n - 1; index++) {
        if (Q->node[index] == NULL) {
            Q->node[index] = node;
            break;
        }
    }
}

//=========================================================
// Build the Huffman tree for the n symbols in C.
// Returns the root (release it with FreeTree), or NULL when the
// input is empty or an allocation fails.
//=========================================================
HTree Huffman(HMap *C, int n)
{
    if (C == NULL || n <= 0)
        return NULL;

    QUEUE Q = CreateQueue(n);
    if (Q == NULL)
        return NULL;

    // One leaf per input symbol.
    for (int i = 0; i < n; i++) {
        HNode *p = (HNode *)malloc(sizeof(HNode));
        if (!p) {
            printf("p malloc error\n");
            DestroyQueue(Q, n, true);
            return NULL;
        }
        p->c = C[i].c;
        p->freq = C[i].freq;
        p->left = p->right = NULL;
        Q->node[i] = p;
    }

    // n - 1 merges of the two least frequent remaining nodes.
    for (int i = 0; i < n - 1; i++) {
        HNode *z = (HNode *)malloc(sizeof(HNode));
        if (!z) {
            printf("z malloc error\n");
            DestroyQueue(Q, n, true);
            return NULL;
        }
        HNode *x = ExtractMin(Q, n);
        HNode *y = ExtractMin(Q, n);
        z->c = '\0';  // internal nodes carry no symbol
        z->left = x;
        z->right = y;
        z->freq = x->freq + y->freq;
        QueueInsert(Q, n, z);
    }

    HTree root = ExtractMin(Q, n);
    DestroyQueue(Q, n, false);  // nodes now belong to the tree
    return root;
}

HMap C[] = {
    {'a', 35}, {'b', 13}, {'c', 12},
    {'d', 16}, {'e', 9}, {'f', 5}, {'g', 10}
};

using namespace std;

typedef vector<int> Huff_code;   // bit sequence (0/1) of one symbol
map<char, Huff_code> Huff_Dic;   // symbol -> Huffman code

//=============================================================
// Walk the tree and record the code of every leaf in Huff_Dic.
// curcode is the path prefix leading to root (0 = left, 1 = right);
// it is restored to its incoming value before the function returns.
//=============================================================
void Huffman_Coding(HTree root, Huff_code& curcode)
{
    if (root == NULL)
        return;

    if (root->left == NULL && root->right == NULL) {
        Huff_code& code = Huff_Dic[root->c];
        code = curcode;
        // A tree consisting of a single leaf would otherwise produce an
        // empty, undecodable code; give the lone symbol one bit.
        if (code.empty())
            code.push_back(0);
        return;
    }

    curcode.push_back(0);
    Huffman_Coding(root->left, curcode);
    curcode.back() = 1;
    Huffman_Coding(root->right, curcode);
    curcode.pop_back();
}

int main()
{
    int n = sizeof(C) / sizeof(C[0]);
    HTree root = Huffman(C, n);
    if (root == NULL)
        return 1;

    Huff_code prefix;
    Huffman_Coding(root, prefix);

    // Print every symbol followed by its Huffman code.
    for (map<char, Huff_code>::iterator it = Huff_Dic.begin(); it != Huff_Dic.end(); ++it) {
        cout << it->first << '\t';
        for (vector<int>::iterator vit = it->second.begin(); vit != it->second.end(); ++vit) {
            cout << *vit;
        }
        cout << '\n';
    }

    FreeTree(root);
    return 0;
}

代码实现3(C++)

引用以妹子的,链接:http://blog.csdn.net/abcjennifer/article/details/8020695

/************************************************************************/
/* File Name:   Huffman.cpp
 * Function:    Lossless compression (Huffman code table construction)
 * Author:      Sophia Zhang
 * Create Time: 2012-9-26 10:40
 * Last Modify: 2012-9-26 12:10
 */
/************************************************************************/
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <map>
#include <queue>
#include <string>
#include <vector>

using namespace std;

const int NChar = 8;               // bits used to describe one symbol
const int Nsymbols = 1 << NChar;   // 256 distinct symbol values

typedef vector<bool> Huff_code;    // bit sequence of one symbol
map<char, Huff_code> Huff_Dic;     // symbol -> Huffman code

/************************************************************************/
/* Huffman tree node.
 *   left / right : child subtrees, owned by this node
 *   ch           : symbol stored in a leaf ('\0' for internal nodes)
 *   weight       : frequency of the symbol, or sum of the children
 */
/************************************************************************/
class HTree
{
public:
    HTree* left;
    HTree* right;
    char ch;
    int weight;

    HTree() { left = right = NULL; weight = 0; ch = '\0'; }
    HTree(HTree* l, HTree* r, int w, char c) { left = l; right = r; weight = w; ch = c; }

    // Deleting a node deletes the whole subtree below it.
    ~HTree() { delete left; delete right; }

    bool Isleaf() const { return !left && !right; }

private:
    // The destructor owns the children, so a shallow copy would lead to
    // a double delete. Copying is therefore declared but not defined.
    HTree(const HTree&);
    HTree& operator=(const HTree&);
};

/************************************************************************/
/* Ordering for the priority queue: the queue stores pointers, so the
 * comparison has to look through them. "Greater weight" as the ordering
 * makes the lightest tree come out of the queue first.
 */
/************************************************************************/
class Compare_tree
{
public:
    bool operator () (HTree* t1, HTree* t2) const
    {
        return t1->weight > t2->weight;
    }
};

/************************************************************************/
/* Build the Huffman tree with a priority queue.
 *   frequency : array of Nsymbols counts, indexed by symbol value
 * Returns the root (caller deletes it), or NULL when frequency is NULL
 * or every count is zero.
 */
/************************************************************************/
HTree* BuildTree(int *frequency)
{
    if (frequency == NULL)
        return NULL;

    priority_queue<HTree*, vector<HTree*>, Compare_tree> QTree;

    // One leaf for every symbol that actually occurs.
    for (int i = 0; i < Nsymbols; i++)
    {
        if (frequency[i])
            QTree.push(new HTree(NULL, NULL, frequency[i], static_cast<char>(i)));
    }

    // top() on an empty queue is undefined; nothing to build.
    if (QTree.empty())
        return NULL;

    // Merge the two lightest trees until a single one is left.
    while (QTree.size() > 1)
    {
        HTree* lc = QTree.top();
        QTree.pop();
        HTree* rc = QTree.top();
        QTree.pop();
        HTree* parent = new HTree(lc, rc, lc->weight + rc->weight, '\0');
        QTree.push(parent);
    }

    return QTree.top();
}

/************************************************************************/
/* Walk the tree and record the code of every leaf in Huff_Dic.
 * curcode is the path prefix leading to root (false = left, true =
 * right); it is restored to its incoming value before returning.
 */
/************************************************************************/
void Huffman_Coding(HTree* root, Huff_code& curcode)
{
    if (root == NULL)
        return;

    if (root->Isleaf())
    {
        Huff_code& code = Huff_Dic[root->ch];
        code = curcode;
        // A tree consisting of a single leaf would otherwise produce an
        // empty, undecodable code; give the lone symbol one bit.
        if (code.empty())
            code.push_back(false);
        return;
    }

    curcode.push_back(false);
    Huffman_Coding(root->left, curcode);
    curcode.back() = true;
    Huffman_Coding(root->right, curcode);
    curcode.pop_back();
}

int main()
{
    int freq[Nsymbols] = {0};
    const char *str = "this is the string need to be compressed";

    // Count how often each character occurs. Index through unsigned char
    // so that characters above 127 cannot produce a negative index.
    for (; *str != '\0'; ++str)
        freq[static_cast<unsigned char>(*str)]++;

    HTree* r = BuildTree(freq);
    if (r == NULL)
        return 0;

    Huff_code prefix;
    Huffman_Coding(r, prefix);

    for (map<char, Huff_code>::iterator it = Huff_Dic.begin(); it != Huff_Dic.end(); ++it)
    {
        cout << it->first << '\t';
        std::copy(it->second.begin(), it->second.end(), std::ostream_iterator<bool>(cout));
        cout << '\n';
    }

    delete r;
    return 0;
}



0 0