Huffman编解码

来源：互联网发布：做非标刀具软件编辑：程序博客网时间：2024/05/29 16:21

一，实验原理

1，Huffman编码原理：

1）统计：将每个符号出现的概率进行统计，并且从小到大排序。

2）合并：将出现概率最小的两个符号概率进行合并，反映在二叉树上：将两个树叶节点合并得到一个父节点。重复此步骤直到根节点。

3）编码：二叉树按照左0右1的规则编码，遍历整棵树，然后自根节点向下到每个树叶节点，可以得到每个树叶节点的编码。

2，基本数据格式定义：

节点和码字的定义见代码。

/*
 * A node of the Huffman tree.
 *
 * Leaves carry a symbol; internal nodes carry two child pointers.
 * The two payloads share storage through the anonymous union, so
 * isLeaf must be consulted before reading either of them.
 */
typedef struct huffman_node_tag
{
    /* 1 if this node is a leaf, 0 if it is an internal node. */
    unsigned char isLeaf;

    /* Number of occurrences represented by this node
       (used in place of a probability). */
    unsigned long count;

    /* Parent node; every node except the root has one. */
    struct huffman_node_tag *parent;

    union
    {
        /* Internal node: the children reached by a 0 bit (left)
           and by a 1 bit (right). */
        struct
        {
            struct huffman_node_tag *zero, *one;
        };

        /* Leaf node: the byte value this leaf stands for. */
        unsigned char symbol;
    };
} huffman_node;

/*
 * One entry of the code table: the bit string assigned to a symbol.
 */
typedef struct huffman_code_tag
{
    /* The length of this code in bits. */
    unsigned long numbits;

    /* The bits that make up this code. The first
       bit is at position 0 in bits[0]. The second
       bit is at position 1 in bits[0]. The eighth
       bit is at position 7 in bits[0]. The ninth
       bit is at position 0 in bits[1]. */
    unsigned char *bits;
} huffman_code;

二，实验流程及代码分析

1，Huffman编码流程

1）读入文件

2）第一次扫描文件，统计文件中各个字符出现的概率

3）建立码树

4）将码表写入文件

5）第二次扫描文件，对源文件进行编码并输出

2，代码分析

1）主函数操作：读取文件

intmain(int argc, char** argv){char memory = 0;//指示是否操作内存数据char compress = 1;//此处表示编码过程，若为0则表示为解码过程int opt;const char *file_in = NULL, *file_out = NULL;FILE *in = stdin;FILE *out = stdout;/* Get the command line arguments. */while((opt = getopt(argc, argv, "i:o:cdhvm")) != -1)//读取命令行参数，此处利用getopt函数读取，最后一个参数是单个字符。{switch(opt){case 'i'://inputfile_in = optarg;break;case 'o'://outputfile_out = optarg;break;case 'c'://codecompress = 1;break;case 'd'://decodecompress = 0;break;case 'h'://输出参数用法的说明usage(stdout);return 0;case 'v'://输出版本号的信息version(stdout);return 0;case 'm'://对内存解码memory = 1;break;default:usage(stderr);return 1;}}/* If an input file is given then open it. */if(file_in)//读取输入文件{in = fopen(file_in, "rb");if(!in){fprintf(stderr,"Can't open input file '%s': %s\n",file_in, strerror(errno));return 1;}}/* If an output file is given then create it. */if(file_out)//读取输出文件{out = fopen(file_out, "wb");if(!out){fprintf(stderr,"Can't open output file '%s': %s\n",file_out, strerror(errno));return 1;}}if(memory)//对内存数据进行编解码操作{return compress ?memory_encode_file(in, out) : memory_decode_file(in, out);}return compress ?huffman_encode_file(in, out) : huffman_decode_file(in, out);}

2）对文件编码：编码函数->huffman_encode_file()

inthuffman_encode_file(FILE *in, FILE *out){SymbolFrequencies sf;SymbolEncoder *se;huffman_node *root = NULL;int rc;unsigned int symbol_count;/* Get the frequency of each symbol in the input file. */symbol_count = get_symbol_frequencies(&sf, in);//第一遍扫描，统计字节出现的频率（再文件中，符号用字节表示）/* Build an optimal table from the symbolCount. */se = calculate_huffman_codes(&sf);//建立码树，得到码表root = sf[0];//表示根节点为sf[0]/* Scan the file again and, using the table   previously built, encode it into the output file. */rewind(in);//回到文件头，为第二次扫描做准备rc = write_code_table(out, se, symbol_count);//再输出文件中写入码表if(rc == 0)rc = do_file_encode(in, out, se);//根据码表对文件的字节进行编码/* Free the Huffman tree. */free_huffman_tree(root);free_encoder(se);return rc;}

2.1）统计字节出现概率的函数：get_symbol_frequencies(),上面编码函数涉及到的第一步

/*
 * get_symbol_frequencies reads in until EOF and counts how often
 * each byte value occurs. A leaf node is created in pSF the first
 * time a byte value is seen; its count is incremented on every
 * occurrence.
 *
 * Returns the total number of bytes read.
 */
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    int c;
    unsigned int total_count = 0;

    /* Set all frequencies to 0. */
    init_frequencies(pSF);

    /* Count the frequency of each symbol in the input file. */
    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = (unsigned char)c;

        /* First occurrence of this byte value: create its leaf. */
        /* NOTE(review): the result of new_leaf_node is used
           unchecked; there is no error channel in this function. */
        if(!(*pSF)[uc])
            (*pSF)[uc] = new_leaf_node(uc);

        ++(*pSF)[uc]->count;
        ++total_count;
    }

    return total_count;
}

新建节点函数如下：

static huffman_node*new_leaf_node(unsigned char symbol){huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));//开空间p->isLeaf = 1;//初始化为叶节点p->symbol = symbol;p->count = 0;p->parent = 0;return p;}

2.2）建立码树

/*
 * calculate_huffman_codes turns pSF into an array
 * with a single entry that is the root of the
 * huffman tree. The return value is a SymbolEncoder,
 * which is an array of huffman codes index by symbol value.
 *
 * When pSF holds no symbols at all (empty input) no tree is built
 * and the returned table has no entries.
 *
 * Returns NULL if memory could not be allocated.
 */
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

#if 0
    printf("BEFORE SORT\n");
    print_freqs(pSF);
#endif

    /* Sort the symbol frequency array by ascending frequency, so
       the entry with the smallest count ends up at index 0
       (SFComp moves NULL entries to the end). */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

#if 0
    printf("AFTER SORT\n");
    print_freqs(pSF);
#endif

    /* Get the number of symbols actually present: not every
       possible byte value has to occur in the input. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /*
     * Construct a Huffman tree. This code is based
     * on the algorithm given in Managing Gigabytes
     * by Ian Witten et al, 2nd edition, page 34.
     * Note that this implementation uses a simple
     * count instead of probability.
     *
     * The bound is written as i + 1 < n rather than i < n - 1:
     * n is unsigned, so n - 1 would wrap around when n is 0.
     */
    for(i = 0; i + 1 < n; ++i)
    {
        huffman_node *merged;

        /* Set m1 and m2 to the two subsets of least probability. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /* Replace m1 and m2 with a set {m1, m2} whose probability
         * is the sum of that of m1 and m2. */
        merged = new_nonleaf_node(m1->count + m2->count, m1, m2);
        if(merged == NULL)
            return NULL;
        m1->parent = merged;
        m2->parent = merged;
        (*pSF)[0] = merged;
        (*pSF)[1] = NULL;

        /* Put newSet into the correct count position in pSF. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Build the SymbolEncoder array from the tree. */
    pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));
    if(pSE == NULL)
        return NULL;
    memset(pSE, 0, sizeof(SymbolEncoder));

    /* Walk from the root down to every leaf and create the code
       for the symbol stored there. */
    build_symbol_encoder((*pSF)[0], pSE);
    return pSE;
}

qsort中用到的自定义排序顺序：

/* * When used by qsort, SFComp sorts the array so that * the symbol with the lowest frequency is first. Any * NULL entries will be sorted to the end of the list. */static intSFComp(const void *p1, const void *p2)//将节点按照出现概率由小到大排序{const huffman_node *hn1 = *(const huffman_node**)p1;const huffman_node *hn2 = *(const huffman_node**)p2;//定义两个自定义的节点，用于比较，节点的定义见后一个代码块/* Sort all NULLs to the end. */if(hn1 == NULL && hn2 == NULL)//两个节点都空，返回0return 0;if(hn1 == NULL)//第一节点空，第二节点比第一节点大，返回1return 1;if(hn2 == NULL)//第二节点空，第一节点比第二节点大，返回-1return -1;if(hn1->count > hn2->count)//都不空，则比较count数值，1>2则返回1，1<2则返回-1return 1;else if(hn1->count < hn2->count)return -1;return 0;}

自定义的建立内部节点函数：

/*
 * new_nonleaf_node allocates an internal (non-leaf) node with the
 * given count and children and no parent. The children's parent
 * pointers are not touched; the caller sets them.
 *
 * Returns the new node, or NULL if memory could not be allocated.
 */
static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
    huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));

    if(p == NULL)
        return NULL;

    p->isLeaf = 0;
    p->count = count;
    p->zero = zero;
    p->one = one;
    p->parent = NULL;
    return p;
}

2.3）生成码字

生成码字的过程如下：首先从根节点开始遍历，找到树叶节点，对于每一个树叶节点，都是从底层逐层向上回到根节点，在这个过程中依次编码。这里必须提到一点：码字长度可能超过8位（例如9位），而我们用来存储码字的数组为unsigned char类型，一个unsigned char元素只有8位，所以存储时需要跨字节：以9位码字为例，低8位存储在bits[0]中，剩下的最高一位存储在bits[1]的最低位。本程序中，首先使用生成码字的函数new_code()，此时生成的是从下层到上层的码字，也就是倒序的码字；随后再用倒序排列函数reverse_bits()得到正序的码字。

生成码字：

/*
 * build_symbol_encoder builds a SymbolEncoder by walking
 * down to the leaves of the Huffman tree and then,
 * for each leaf, determines its code.
 *
 * subtree may be NULL, in which case nothing is done.
 * pSF is the SymbolEncoder table to fill, indexed by symbol value.
 */
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
    if(subtree == NULL)
        return;

    if(subtree->isLeaf)
    {
        /* A leaf: create the code for its symbol. */
        (*pSF)[subtree->symbol] = new_code(subtree);
    }
    else
    {
        /* An internal node: visit the zero (left) child first,
           then the one (right) child. */
        build_symbol_encoder(subtree->zero, pSF);
        build_symbol_encoder(subtree->one, pSF);
    }
}

新建码字的函数：

/* * new_code builds a huffman_code from a leaf in * a Huffman tree. */static huffman_code*new_code(const huffman_node* leaf){/* Build the huffman code by walking up to * the root node and then reversing the bits, * since the Huffman code is calculated by * walking down the tree. */unsigned long numbits = 0;//定义码字位数numbits，同时它也表示树从上到下的第几层，bits表示存码字的数组unsigned char* bits = NULL;huffman_code *p;while(leaf && leaf->parent)//由下到上，不是根节点时进入循环{huffman_node *parent = leaf->parent;//得到该节点父节点，由码字位数可以得到码字的位置和码字的字节数unsigned char cur_bit = (unsigned char)(numbits % 8);unsigned long cur_byte = numbits / 8;/* If we need another byte to hold the code,   then allocate it. */if(cur_bit == 0)//一旦当前比特位=0，则表示超过8位到了高8位范围{size_t newSize = cur_byte + 1;bits = (char*)realloc(bits, newSize);//新建字节保存高位的码字bits[newSize - 1] = 0;//新增加的字节初始化为0 /* Initialize the new byte. */}/* If a one must be added then or it in. If a zero * must be added then do nothing, since the byte * was initialized to zero. */if(leaf == parent->one)//如果是右节点，左0右1，把当前字节设为1bits[cur_byte] |= 1 << cur_bit;//和1做或操作使得当前位=1++numbits;//码字位数+1leaf = parent;//为了串接，将leaf赋parent}if(bits)reverse_bits(bits, numbits);//倒序函数进行码字倒序p = (huffman_code*)malloc(sizeof(huffman_code));//输出码字p->numbits = numbits;p->bits = bits;return p;}

倒序码字的函数：

/*
 * reverse_bits reverses, in place, the order of the first numbits
 * bits stored in bits (bit i lives at position i % 8 of bits[i / 8]).
 *
 * After the call, bit i holds what bit numbits - 1 - i held before.
 * Unused high bits of the last byte are cleared.
 *
 * The reversal swaps bits pairwise from both ends, so no scratch
 * buffer is needed.
 */
static void
reverse_bits(unsigned char* bits, unsigned long numbits)
{
    unsigned long lo, hi;

    if(numbits == 0)
        return;

    for(lo = 0, hi = numbits - 1; lo < hi; ++lo, --hi)
    {
        unsigned char lo_bit = (unsigned char)((bits[lo / 8] >> (lo % 8)) & 1);
        unsigned char hi_bit = (unsigned char)((bits[hi / 8] >> (hi % 8)) & 1);

        /* Swapping two differing bits is the same as flipping both. */
        if(lo_bit != hi_bit)
        {
            bits[lo / 8] ^= (unsigned char)(1u << (lo % 8));
            bits[hi / 8] ^= (unsigned char)(1u << (hi % 8));
        }
    }

    /* Clear the padding bits above the code in the last byte. */
    if(numbits % 8)
        bits[(numbits - 1) / 8] &= (unsigned char)((1u << (numbits % 8)) - 1);
}

判断字节数的函数和取出bits中第i位的函数：

/*
 * numbytes_from_numbits returns the number of whole bytes needed
 * to store numbits bits (numbits / 8, rounded up).
 */
static unsigned long
numbytes_from_numbits(unsigned long numbits)
{
    return numbits / 8 + (numbits % 8 ? 1 : 0);
}

/*
 * get_bit returns the ith bit in the bits array
 * in the 0th position of the return value.
 *
 * Bit i is stored at position i % 8 of byte bits[i / 8].
 */
static unsigned char
get_bit(unsigned char* bits, unsigned long i)
{
    return (bits[i / 8] >> i % 8) & 1;
}

2.4）写入码表，对文件编码

/*
 * Write the huffman code table. The format is:
 * 4 byte code count in network byte order.
 * 4 byte number of bytes encoded
 *   (if you decode the data, you should get this number of bytes)
 * code1
 * ...
 * codeN, where N is the count read at the beginning of the file.
 * Each codeI has the following format:
 * 1 byte symbol, 1 byte code bit length, code bytes.
 * Each entry has numbytes_from_numbits code bytes.
 * The last byte of each code may have extra bits, if the number of
 * bits in the code is not a multiple of 8.
 *
 * Returns 0 on success and 1 if any write fails.
 */
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    unsigned long i, count = 0;
    unsigned int net_count;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network byte order.
       The value is held in an unsigned int, the same type
       read_code_table reads it back into, so that the field has
       the same width on both sides. An unsigned long may be wider
       and would shift every field that follows. */
    net_count = htonl((unsigned int)count);
    if(fwrite(&net_count, sizeof(net_count), 1, out) != 1)
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbol_count = htonl(symbol_count);
    if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;

            /* Write the 1 byte symbol. */
            if(fputc((unsigned char)i, out) == EOF)
                return 1;

            /* Write the 1 byte code bit length. */
            if(fputc((unsigned char)p->numbits, out) == EOF)
                return 1;

            /* Write the code bytes. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(fwrite(p->bits, 1, numbytes, out) != numbytes)
                return 1;
        }
    }

    return 0;
}

对文件进行编码：

/*
 * do_file_encode reads in byte by byte, looks up the code of each
 * byte in se and appends the code bits to out. Bits are packed
 * starting at bit 0 of each output byte.
 *
 * Returns 0 on success, 1 if a byte has no entry in se or a write
 * to out fails.
 */
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char curbyte = 0;   /* output byte being assembled */
    unsigned char curbit = 0;    /* next free bit position in curbyte */
    int c;

    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = (unsigned char)c;
        huffman_code *code = (*se)[uc];
        unsigned long i;

        /* A symbol without a code cannot be encoded. */
        if(code == NULL)
            return 1;

        for(i = 0; i < code->numbits; ++i)
        {
            /* Add the current bit to curbyte. */
            curbyte |= get_bit(code->bits, i) << curbit;

            /* If this byte is filled up then write it
             * out and reset the curbit and curbyte. */
            if(++curbit == 8)
            {
                if(fputc(curbyte, out) == EOF)
                    return 1;
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /*
     * If there is data in curbyte that has not been
     * output yet, which means that the last encoded
     * character did not fall on a byte boundary,
     * then output it. The unused high bits are zero.
     */
    if(curbit > 0)
    {
        if(fputc(curbyte, out) == EOF)
            return 1;
    }

    return 0;
}

3）对文件解码

解码流程：同样要先读入文件，再读取码表，建立码树，最后根据码树解码。

3.1）对文件进行解码，首先读取码表

/* * read_code_table builds a Huffman tree from the code * in the in file. This function returns NULL on error. * The returned value should be freed with free_huffman_tree. */static huffman_node*read_code_table(FILE* in, unsigned int *pDataBytes){huffman_node *root = new_nonleaf_node(0, NULL, NULL);unsigned int count;/* Read the number of entries.   (it is stored in network byte order). */if(fread(&count, sizeof(count), 1, in) != 1)//读取文件{free_huffman_tree(root);return NULL;}count = ntohl(count);/* Read the number of data bytes this encoding represents. */if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1){free_huffman_tree(root);return NULL;}*pDataBytes = ntohl(*pDataBytes);/* Read the entries. */while(count-- > 0)//依次读取码表中的每一项：符号，码长，码字{int c;unsigned int curbit;unsigned char symbol;unsigned char numbits;unsigned char numbytes;unsigned char *bytes;huffman_node *p = root;if((c = fgetc(in)) == EOF){free_huffman_tree(root);//一次读一字节，第一字节symbolreturn NULL;}symbol = (unsigned char)c;if((c = fgetc(in)) == EOF){free_huffman_tree(root);//第二字节码长numbitsreturn NULL;}numbits = (unsigned char)c;numbytes = (unsigned char)numbytes_from_numbits(numbits);//计算一个码需多少字节bytes = (unsigned char*)malloc(numbytes);//开空间if(fread(bytes, 1, numbytes, in) != numbytes)//读numbytes个字节得到码字{free(bytes);free_huffman_tree(root);return NULL;}/* * Add the entry to the Huffman tree. The value * of the current bit is used switch between * zero and one child nodes in the tree. New nodes * are added as needed in the tree. */for(curbit = 0; curbit < numbits; ++curbit)//读完码表的三种数据后就可以建立码树了{if(get_bit(bytes, curbit))//当前位为1，则建立右节点{if(p->one == NULL){//如果是最后一位，就建立树叶节点，否则就当做一个父节点，后续建立他的子节点p->one = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->one->parent = p;}p = p->one;}else//否则建立左0节点{if(p->zero == NULL){p->zero = curbit == (unsigned char)(numbits - 1)? 
new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->zero->parent = p;}p = p->zero;}}free(bytes);}return root;}

3.2）解码过程

inthuffman_decode_file(FILE *in, FILE *out)//解码函数{huffman_node *root, *p;int c;unsigned int data_count;/* Read the Huffman code table. */root = read_code_table(in, &data_count);//读入码表建立码树（利用上面的函数）if(!root)return 1;/* Decode the file. */p = root;while(data_count > 0 && (c = fgetc(in)) != EOF)//循环读入字节，依次解码{unsigned char byte = (unsigned char)c;unsigned char mask = 1;//mask控制实现读入码字的每一位while(data_count > 0 && mask){p = byte & mask ? p->one : p->zero;mask <<= 1;//左移1位读下一字节if(p->isLeaf)//如果到了最底层的树叶节点，则输出叶节点中的符号，再回到根节点。data_count表示没解码的符号数-1{fputc(p->symbol, out);p = root;--data_count;}}}free_huffman_tree(root);return 0;}

三，实验结果及总结

1、实验结果：

选用了10种不同格式的文件：

得到huffman编码后的.huff文件以及将频率分布结果写入excel表，统计了十种不同文件压缩前后的相关信息如下表所示：

以test10为例，概率统计的表格如下图所示，可以看出，出现概率大的符号码长短，这是Huffman可变长编码的特点，旨在尽量缩短平均码长。

10种格式的频率分布表如下图所示：

2、实验总结

设置调试参数时要格外注意：

另就实验中遇到的问题做如下总结：

1）在计算平均码长时，在excel表格中应注意，不能单纯使用average函数，而应该用概率*码长再加和。否则会导致平均码长比真实值偏大，导致从数据上看，huffman编码并不能很好地使平均码长趋于下界（信源熵）

2）报错1：转换到COFF期间失败：文件无效或损坏

解决方法：项目属性->清单工具->输入与输出->嵌入清单->是改成否

此解决方法在每一次新建工程的时候都需做出改变。

3）报错2：PDB格式不兼容（可能是由于用vs2010打开了vc++的工程导致，只要按照如下方法解决再run即可）

解决方法：生成->清理解决方案或者重新生成解决方案


                                                     0        0           	
					
					   huffman编解码源代码
	  	   【数据压缩】Huffman编解码
	  	   Huffman编解码
	  	   Huffman编解码
	  	   Huffman编解码
	  	   Huffman编解码
	  	   Huffman编解码完全注释
	  	   Huffman编解码完全注释
	  	   实验3-huffman编解码
	  	   Huffman编解码实现文本压缩
	  	   Huffman 编解码--这回是正常树~
	  	   C++实现Huffman的编解码
	  	   实验三—Huffman编解码
	  	   数据压缩实验三：Huffman编解码
	  	   基于matlab的huffman编解码
	  	   huffman编解码实现（C语言实现版本）
	  	   huffman编解码算法实验与压缩效率分析
	  	   Huffman 编解码算法实现与压缩效率分析
	     		  
	  	   Windows下MySQL控制台不能删除数据库
	  	   数据库-通配符
	  	   基于CNN的Sementic Segmentation实用资源总结
	  	   Kubernetes编排工具-helm源码分析（Tiller中status命令处理流程）
	  	   验证码识别之旅(一)
	  	   Huffman编解码
	  	   Sublime_text3调色板的使用
	  	   第1章、安装和运行Lift
	  	   Android jdk1.8的使用配置并解决android jack编译乱码
	  	   融云、环信dlopen failed: library "libsqlite.so" not found
	  	   Numpy基础  --数组和矢量计算 利用Python进行数据分析读书笔记
	  	   QT5.2中文乱码解决问题
	  	   迷宫城堡   【求SCC 个数】
	  	   （13.1.2）PMBOK之二：五大过程组及其涉及的输入、输出、工具技术