Huffman编解码

来源：互联网发布：网络直播开场白编辑：程序博客网时间：2024/06/05 02:08

一.实验原理

1.Huffman编码

(1) Huffman Coding(霍夫曼编码)是一种无失真编码的编码方式，Huffman编码是可变字长编码(VLC)的一种。

(2) Huffman编码基于信源的概率统计模型，它的基本思路是：出现概率大的信源符号编短码，出现概率小的信源符号编长码，从而使平均码长最小。

(3)在程序实现中常使用一种叫做树的数据结构实现Huffman编码，由它编出的码是即时码。

2.Huffman编码方法

(1)将文件以ASCII字符流的形式读入，统计每个符号的发生频率；

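(2)将所有符号按出现频率（概率）从小到大排序；

(3)取出频率最小的两个节点，合并为一个新节点，新节点的频率为二者之和，再将新节点放回序列并重新排序；

(4)重复3，直到最后得到和为1的根节点；

(5)从根节点出发，将每个节点的两个分支分别标记为0和1，从根节点到叶节点的路径即为该符号的Huffman码字。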

3.Huffman编码的数据结构

(1)Huffman节点结构

/*
 * A node of the Huffman tree.
 *
 * A leaf carries one source symbol; an internal node carries pointers to
 * its two children. Both payloads share storage through the anonymous
 * union, so isLeaf must be checked before reading either of them.
 */
typedef struct huffman_node_tag
{
    unsigned char isLeaf;              /* 1 if this node is a leaf, 0 otherwise */
    unsigned long count;               /* weight: summed count of the symbols under this node */
    struct huffman_node_tag *parent;   /* parent node; null for the root */

    union
    {
        struct
        {
            /* internal node only: children followed on a 0 bit and on a 1 bit */
            struct huffman_node_tag *zero, *one;
        };
        unsigned char symbol;          /* leaf only: the one-byte source symbol */
    };
} huffman_node;

(2)Huffman码结构

/*
 * The code word assigned to one symbol.
 *
 * Bits are packed least-significant-bit first: code bit 0 is bit 0 of
 * bits[0], code bit 7 is bit 7 of bits[0], code bit 8 is bit 0 of bits[1],
 * and so on.
 */
typedef struct huffman_code_tag
{
    unsigned long numbits;   /* length of this code in bits */
    unsigned char *bits;     /* packed bit string holding the code */
} huffman_code;

二.实验流程

1.Huffman编码流程

2.Huffman解码流程

三.主要代码分析

该程序包括两个工程，“huff_run”为主工程，包括“huffcode.c”文件；“Huff_code”为库工程，包括“huffman.c”文件。

读入待编码的原文件（huffcode.c）

intmain(int argc, char** argv){char memory = 0;     //memory = 1表示对内存数据进行操作，memory = 0表示对文件数据进行操作char compress = 1;   //compress = 1表示编码，compress = 0表示解码int opt;const char *file_in = NULL, *file_out = NULL;//step1:add by yzhang for huffman statisticsconst char *file_out_table = NULL;//end by yzhangFILE *in = stdin;FILE *out = stdout;//step1:add by yzhang for huffman statisticsFILE * outTable = NULL;   //用于统计数据表格输出//end by yzhang/* Get the command line arguments. */while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1)   //获取命令行参数{switch(opt){case 'i':  //i:输入文件file_in = optarg; break;case 'o':  //o:输出文件file_out = optarg;break;case 'c':  //c:进行编码compress = 1;break;case 'd':  //d:进行解码compress = 0;break;case 'h':  //h:输出参数用法说明usage(stdout);return 0;case 'v':  //v:输出版本号信息version(stdout);return 0;case 'm':  //m:内存数据操作memory = 1;break;// by yzhang for huffman statisticscase 't':  //t:输出统计数据表格file_out_table = optarg;break;//end by yzhangdefault:usage(stderr);return 1;}}/* If an input file is given then open it. */if(file_in)   //读取输入文件{in = fopen(file_in, "rb");if(!in){fprintf(stderr,"Can't open input file '%s': %s\n",file_in, strerror(errno));return 1;}}/* If an output file is given then create it. */if(file_out)   //读取输出文件{out = fopen(file_out, "wb");if(!out){fprintf(stderr,"Can't open output file '%s': %s\n",file_out, strerror(errno));return 1;}}//by yzhang for huffman statisticsif(file_out_table){outTable = fopen(file_out_table, "w");if(!outTable){fprintf(stderr,"Can't open output file '%s': %s\n",file_out_table, strerror(errno));return 1;}}//end by yzhangif(memory)   //对内存数据进行编解码操作{return compress ?memory_encode_file(in, out) : memory_decode_file(in, out);}if(compress)  //对文件进行编解码操作huffman_encode_file(in, out,outTable);//step1:changed by yzhang from huffman_encode_file(in, out) to huffman_encode_file(in, out,outTable)elsehuffman_decode_file(in, out);if(in)fclose(in);if(out)fclose(out);if(outTable)fclose(outTable);return 0;}

使用库函数中的getopt解析命令行参数，命令行参数可设置为 “-i test1.doc -o test1.huff -c -t test1.txt”

Huffman文件编码函数（huffman.c）

定义一个256个元素的指针数组，用以保存256个信源符号的频率，其下标对应相应字符的ASCII码。

#define MAX_SYMBOLS 256typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];   //信源符号数组typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];       //码字数组，用于保存码表inthuffman_encode_file(FILE *in, FILE *out, FILE *out_Table)  //step1:changed by yzhang for huffman statistics from (FILE *in, FILE *out) to (FILE *in, FILE *out, FILE *out_Table){SymbolFrequencies sf;SymbolEncoder *se;huffman_node *root = NULL;int rc;unsigned int symbol_count;    //step2:add by yzhang for huffman statisticshuffman_stat hs;//end by yzhang/* 第一遍扫描，得到文件中各符号出现的频率 */symbol_count = get_symbol_frequencies(&sf, in); //演示扫描完一遍文件后，SF指针数组的每个元素的构成//step3:add by yzhang for huffman statistics,...  get the frequency of each symbol     huffST_getSymFrequencies(&sf,&hs,symbol_count);    //end by yzhang/* Build an optimal table from the symbolCount. */se = calculate_huffman_codes(&sf);    //根据得到的符号频率建立Huffman树和码表root = sf[0];     //sf[0]为根节点    //step3:add by yzhang for huffman statistics... output the statistics to filehuffST_getcodeword(se, &hs);output_huffman_statistics(&hs,out_Table);//end by yzhang/* 第二次扫描文件，使用构建的码表将其编码到输出文件中。 */rewind(in);    //回到文件头rc = write_code_table(out, se, symbol_count);   //在输出文件中写入码表if(rc == 0)rc = do_file_encode(in, out, se);           //根据码表进行编码/* 释放Huffman树 */free_huffman_tree(root);free_encoder(se);return rc;}

第一次扫描，统计文件中各个字符出现频率

/*
 * Count symbol occurrences in a memory buffer.
 *
 * pSF       frequency table to fill; a leaf node is created the first
 *           time a byte value is seen
 * bufin     input bytes
 * bufinlen  number of bytes in bufin
 *
 * Returns the total number of bytes counted.
 */
static unsigned int
get_symbol_frequencies_from_memory(SymbolFrequencies *pSF,
                                   const unsigned char *bufin,
                                   unsigned int bufinlen)
{
    unsigned int i;
    unsigned int total_count = 0;

    /* Set all frequencies to 0 (presumably leaves every slot NULL, which
     * the first-occurrence test below relies on). */
    init_frequencies(pSF);

    /* Count the frequency of each symbol in the input buffer. */
    for(i = 0; i < bufinlen; ++i)
    {
        unsigned char uc = bufin[i];

        if(!(*pSF)[uc])
            (*pSF)[uc] = new_leaf_node(uc);   /* first occurrence */
        ++(*pSF)[uc]->count;
        ++total_count;
    }

    return total_count;
}

new_leaf_node()函数:

static huffman_node*new_leaf_node(unsigned char symbol)   //新建一个叶节点，并初始化{huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));   //给该叶节点分配空间p->isLeaf = 1;        //叶节点p->symbol = symbol;   //该叶节点存储的信源符号p->count = 0;         //信源符号数为0p->parent = 0;        //该叶节点的父节点为空return p;}

建立Huffman树并计算符号对应的Huffman码字(huffman.c)

1.按频率从小到大排序并建立Huffman树

/*
 * Build the Huffman tree from the symbol counts and derive the code table.
 *
 * pSF  frequency table; it is sorted and merged in place, and on return
 *      (*pSF)[0] is the root of the tree (NULL if no symbol occurred)
 *
 * Returns a newly allocated code table with one entry per occurring
 * symbol, or NULL if the table cannot be allocated.
 */
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

#if 1
    printf("BEFORE SORT\n");
    print_freqs(pSF);
#endif

    /* Sort the symbol frequency array by ascending frequency;
     * SFComp places the NULL (unused) slots at the end. */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

#if 1
    printf("AFTER SORT\n");
    print_freqs(pSF);
#endif

    /* Get the number of symbols: count the leading non-NULL slots. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /*
     * Construct a Huffman tree. This code is based
     * on the algorithm given in Managing Gigabytes
     * by Ian Witten et al, 2nd edition, page 34.
     * Note that this implementation uses a simple
     * count instead of probability.
     *
     * n symbols need n - 1 merges. The bound is written as i + 1 < n so
     * that n == 0 (empty input) performs no merge instead of wrapping
     * the unsigned n - 1 around.
     */
    for(i = 0; i + 1 < n; ++i)
    {
        /* The two lightest nodes are at the front of the sorted array. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /* Replace them with one internal node whose weight is their sum
         * and which becomes the parent of both. */
        (*pSF)[0] = m1->parent = m2->parent =
            new_nonleaf_node(m1->count + m2->count, m1, m2);
        (*pSF)[1] = NULL;

        /* Re-sort the first n slots; the freed slot moves to the end. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Walk the finished tree to compute the code of every symbol. */
    pSE = malloc(sizeof(SymbolEncoder));
    if(!pSE)
        return NULL;
    memset(pSE, 0, sizeof(SymbolEncoder));
    build_symbol_encoder((*pSF)[0], pSE);
    return pSE;
}

qsort是C标准库（stdlib.h）提供的排序函数，其参数依次为：数组的起始地址、数组的元素个数、每个元素的大小（字节数）、比较函数的指针。比较函数SFComp如下：

static intSFComp(const void *p1, const void *p2){const huffman_node *hn1 = *(const huffman_node**)p1;const huffman_node *hn2 = *(const huffman_node**)p2;/* 将所有NULL排到列表末尾 */if(hn1 == NULL && hn2 == NULL)return 0;        //若两节点都为空，则返回0，相等if(hn1 == NULL)return 1;        //若第一个节点为空，则返回1，hn2在前hn1在后if(hn2 == NULL)return -1;       //若第二个节点为空，则返回-1，hn1在前hn2在后if(hn1->count > hn2->count)     //若两个节点都不为空，则进行比较并返回值（由小到大排序）return 1;else if(hn1->count < hn2->count)return -1;return 0;}

new_nonleaf_node()函数:

/*
 * Allocate and initialise an internal (non-leaf) node.
 *
 * count  weight of the node
 * zero   child followed on a 0 bit (may be NULL)
 * one    child followed on a 1 bit (may be NULL)
 *
 * The parent pointers of the children are not touched here; the caller
 * links them. Returns the new node, or NULL if the allocation fails.
 */
static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
    huffman_node *p = malloc(sizeof *p);

    if(!p)
        return NULL;

    p->isLeaf = 0;
    p->count = count;
    p->zero = zero;
    p->one = one;
    p->parent = NULL;
    return p;
}

2.遍历递归Huffman树，对存在的每个字符计算码字

static voidbuild_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF){if(subtree == NULL)    //若子树为空，则编码结束return;if(subtree->isLeaf)    //若为叶节点，则产生码字(*pSF)[subtree->symbol] = new_code(subtree);else                   //若非叶节点{build_symbol_encoder(subtree->zero, pSF);   //遍历左子树（递归）build_symbol_encoder(subtree->one, pSF);    //遍历右子数}}//从Huffman树中构建一个码字，因为霍夫曼编码是从上往下顺序进行，所以通过从子节点从下往上走到根节点然后反转位来构建huffman代码//采用向上回溯方法static huffman_code*new_code(const huffman_node* leaf){/* Build the huffman code by walking up to * the root node and then reversing the bits, * since the Huffman code is calculated by * walking down the tree. */unsigned long numbits = 0;    //码长，单位：位unsigned char* bits = NULL;   //指向码字的指针huffman_code *p;while(leaf && leaf->parent)   //判断节点和父节点是否存在。leaf为NULL时，不编码；parent为NULL时，到达树根，不编码{huffman_node *parent = leaf->parent;unsigned char cur_bit = (unsigned char)(numbits % 8);   //确定所编bit在当前byte中的位置unsigned long cur_byte = numbits / 8;                   //当前是第几个byte/* 码字长度超过一个字节则再分配一个字节 */if(cur_bit == 0){size_t newSize = cur_byte + 1;bits = (char*)realloc(bits, newSize);/*realloc()函数先判断当前的指针是否有足够的连续空间，如果有，扩大bits指向的地址，并且将bits返回，如果空间不够，先按照newsize指定的大小分配空间，将原有数据从头到尾拷贝到新分配的内存区域，而后释放原来bits所指内存区域(指针是自动释放，不需要使用free)，同时返回新分配的内存区域的首地址。*/bits[newSize - 1] = 0; //初始化新分配的1字节}/* If a one must be added then or it in. If a zero * must be added then do nothing, since the byte * was initialized to zero. 
*/if(leaf == parent->one)      bits[cur_byte] |= 1 << cur_bit;   //若为右子节点，则左移1至当前byte的当前位，再将其与bits[cur_byte]进行或操作；若为左子节点，则不改变数值（初始化为0）++numbits;            //码长+1leaf = parent;        //将下一节点移至父节点处}if(bits)          //码字逆序（编码从叶到根，码字从根到叶）reverse_bits(bits, numbits);p = (huffman_code*)malloc(sizeof(huffman_code));p->numbits = numbits;    //码长赋给节点的numbitsp->bits = bits;          //码字付给节点的bitsreturn p;}//码字逆序static voidreverse_bits(unsigned char* bits, unsigned long numbits){unsigned long numbytes = numbytes_from_numbits(numbits);  //存储码字需要的字节数，numbits除8后上取整得到unsigned char *tmp =    (unsigned char*)alloca(numbytes);        //alloca 适用于堆栈上的空间，可自动释放unsigned long curbit;long curbyte = 0;      //记录要反转的二进制码所在的的数组下标memset(tmp, 0, numbytes);for(curbit = 0; curbit < numbits; ++curbit){unsigned int bitpos = curbit % 8;   //判断当前位在字节中的位数 if(curbit > 0 && curbit % 8 == 0)++curbyte;           //若bit数超出8，则字节数加1 tmp[curbyte] |= (get_bit(bits, numbits - curbit - 1) << bitpos);   //从后往前取码字中的每一位，移位到所在字节的正确位置 }memcpy(bits, tmp, numbytes);}/* 化bit为byte处理 */static unsigned longnumbytes_from_numbits(unsigned long numbits){return numbits / 8 + (numbits % 8 ? 1 : 0);     //确定字节数，利用取整+是否有余数来确定(8位一字节)}/*返回位数返回值的第 i/8 个字节的第 i%8 位*/static unsigned charget_bit(unsigned char* bits, unsigned long i){return (bits[i / 8] >> i % 8) & 1;   //第i位在第 i/8 字节的第 i%8 位，把这一位移到字节最低位，和 0000 0001 做与，只留下这一位}3.将Huffman码表写入文件static intwrite_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count){unsigned long i, count = 0;/* Determine the number of entries in se. */for(i = 0; i < MAX_SYMBOLS; ++i)   // 确定文件中码字的实际种类，保存在se中{if((*se)[i])++count;}/* Write the number of entries in network byte order. */i = htonl(count);    //在网络传输中，采用big-endian序，对于0x0A0B0C0D ，传输顺序就是0A 0B 0C 0D ，//因此big-endian作为network byte order，little-endian作为host byte order。//little-endian的优势在于unsigned char/short/int/long类型转换时，存储位置无需改变if(fwrite(&i, sizeof(i), 1, out) != 1)return 1;/* Write the number of bytes that will be encoded. 
*/symbol_count = htonl(symbol_count);     //写入要被编码的字节种类的个数if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)return 1;/* 写入码表 */for(i = 0; i < MAX_SYMBOLS; ++i){huffman_code *p = (*se)[i];if(p){unsigned int numbytes;   //字节数/* 写入1字节符号 */fputc((unsigned char)i, out);/* 写入1字节码长 */fputc(p->numbits, out);/* 写入numbytes字节的码字 */numbytes = numbytes_from_numbits(p->numbits);if(fwrite(p->bits, 1, numbytes, out) != numbytes)return 1;}}return 0;}

二次扫描文件，对文件查表进行Huffman编码

/*
 * Encode every byte of `in` with the code table `se` and write the
 * packed bit stream to `out`.
 *
 * Output bits are packed least-significant-bit first. A final, partly
 * filled byte is written with its unused high bits left at 0.
 * Returns 0.
 */
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char curbyte = 0;   /* output byte being assembled */
    unsigned char curbit = 0;    /* next free bit position in curbyte */
    int c;

    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = (unsigned char)c;
        huffman_code *code = (*se)[uc];   /* table lookup */
        unsigned long i;

        for(i = 0; i < code->numbits; ++i)
        {
            /* Add the current bit to the output byte. */
            curbyte |= get_bit(code->bits, i) << curbit;

            /* If the byte is full, write it and start a new one. */
            if(++curbit == 8)
            {
                fputc(curbyte, out);
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /* Flush the bits that have not filled a whole byte yet; without
     * this the tail of the encoded stream would be lost. */
    if(curbit > 0)
        fputc(curbyte, out);

    return 0;
}

输出统计结果

typedef struct huffman_statistics_result{float freq[256];    //记录每个信源符号出现的频次unsigned long numbits[256];unsigned char bits[256][100];   //用来存放码字，规定每个码字的最大长度为100}huffman_stat;int huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st,int total_count){int i,count =0;for(i = 0; i < MAX_SYMBOLS; ++i){if((*SF)[i]){st->freq[i]=(float)(*SF)[i]->count/total_count;count+=(*SF)[i]->count;}else {st->freq[i]= 0;}}if(count==total_count)return 1;elsereturn 0;}int huffST_getcodeword(SymbolEncoder *se, huffman_stat *st){unsigned long i,j;for(i = 0; i < MAX_SYMBOLS; ++i){huffman_code *p = (*se)[i];if(p){unsigned int numbytes;            st->numbits[i] = p->numbits;numbytes = numbytes_from_numbits(p->numbits);for (j=0;j<numbytes;j++)    st->bits[i][j] = p->bits[j];}elsest->numbits[i] =0;}return 0;}void output_huffman_statistics(huffman_stat *st,FILE *out_Table){int i,j;unsigned char c;fprintf(out_Table,"symbol\t   freq\t   codelength\t   code\n");for(i = 0; i < MAX_SYMBOLS; ++i){fprintf(out_Table,"%d\t   ",i);fprintf(out_Table,"%f\t   ",st->freq[i]);fprintf(out_Table,"%d\t    ",st->numbits[i]);if(st->numbits[i]){for(j = 0; j < st->numbits[i]; ++j){c =get_bit(st->bits[i], j);fprintf(out_Table,"%d",c);}}fprintf(out_Table,"\n");}}

输出的Huffman编码文件的存储示意图：

Huffman解码：读取码表并重建Huffman树

static huffman_node*read_code_table(FILE* in, unsigned int *pDataBytes){huffman_node *root = new_nonleaf_node(0, NULL, NULL);unsigned int count;/* Read the number of entries.   (it is stored in network byte order). */if(fread(&count, sizeof(count), 1, in) != 1)   // 读取码表中的符号数{free_huffman_tree(root);return NULL;}count = ntohl(count);/* Read the number of data bytes this encoding represents. */if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1)    //读取此编码表示的数据字节数 {free_huffman_tree(root);return NULL;}*pDataBytes = ntohl(*pDataBytes);/* Read the entries. */while(count-- > 0)   //读取码表{int c;unsigned int curbit;unsigned char symbol;    //符号unsigned char numbits;   //码长unsigned char numbytes;  //字节数unsigned char *bytes;huffman_node *p = root;if((c = fgetc(in)) == EOF)   //逐字节读入{free_huffman_tree(root);return NULL;}symbol = (unsigned char)c;if((c = fgetc(in)) == EOF){free_huffman_tree(root);return NULL;}numbits = (unsigned char)c;numbytes = (unsigned char)numbytes_from_numbits(numbits);  //计算保存一个码长所需要的字节数bytes = (unsigned char*)malloc(numbytes);                  //分配相应的空间if(fread(bytes, 1, numbytes, in) != numbytes)              //读取码字{free(bytes);free_huffman_tree(root);return NULL;}/* * Add the entry to the Huffman tree. The value * of the current bit is used switch between * zero and one child nodes in the tree. New nodes * are added as needed in the tree. *///根据码表重建Huffman树，由根节点至叶节点  for(curbit = 0; curbit < numbits; ++curbit){if(get_bit(bytes, curbit))  {if(p->one == NULL)  //若当前读取位为1，则建立右子节点{/* 判断是否是当前码字的最后一位，若是则新建叶结点，若不是则新建非叶结点 */p->one = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->one->parent = p;  //设置右子节点的父节点 }p = p->one;}else                   //若当前读取位为0，则建立左子节点{if(p->zero == NULL){p->zero = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->zero->parent = p;}p = p->zero;}}free(bytes);}return root;      //返回Huffman树根结点，才能进行之后的遍历}

读取Huffman码字，根据Huffman树进行解码输出

inthuffman_decode_file(FILE *in, FILE *out){huffman_node *root, *p;int c;unsigned int data_count;/* Read the Huffman code table. */root = read_code_table(in, &data_count);   //读取码表if(!root)return 1;   //Huffman树建立失败/* 文件解码 */p = root;while(data_count > 0 && (c = fgetc(in)) != EOF)  // data_count >0 ：逻辑上仍有数据；(c = fgetc(in)) != EOF)：文件中仍有数据 {unsigned char byte = (unsigned char)c; //1bit的码字unsigned char mask = 1;                // mask用于逐位读出码字while(data_count > 0 && mask)         {p = byte & mask ? p->one : p->zero;  //走huffman树，若当前字节为0则走左子树，否则走右子树mask <<= 1;                          //mask每个循环进行左移if(p->isLeaf)                     //走到叶结点，解码完毕 {fputc(p->symbol, out);        //输出叶节点中存储的符号p = root;                     //转向根节点，进行下一个码字的解码--data_count;                 //将没解码的码字数-1}}}free_huffman_tree(root);            // 所有码字均已解码输出，文件解码完毕 return 0;}

四.实验结果

0 0