Huffman编解码

来源:互联网 发布:做非标刀具软件 编辑:程序博客网 时间:2024/05/29 16:21

一,实验原理

1,Huffman编码原理:

1)统计:将每个符号出现的概率进行统计,并且从小到大排序。

2)合并:将出现概率最小的两个符号概率进行合并,反映在二叉树上:将两个树叶节点合并得到一个父节点。重复此步骤直到根节点。

3)编码:二叉树按照左0右1的规则编码,遍历整棵树,然后自根节点向下到每个树叶节点,可以得到每个树叶节点的编码。

2,基本数据格式定义:

节点和码字的定义见代码。

/*
 * A node of the Huffman tree.
 *
 * A leaf carries the byte value it stands for in `symbol`; an internal
 * node carries its two children in `zero` / `one` (the branch taken for
 * a 0 bit and for a 1 bit respectively). The two layouts share storage,
 * so `isLeaf` must be consulted before touching either of them.
 */
typedef struct huffman_node_tag
{
    /* 1 when this node is a leaf, 0 when it is an internal node. */
    unsigned char isLeaf;

    /* Occurrence count used as the node's weight when building the tree. */
    unsigned long count;

    /* Parent node; the constructors in this file initialise it to 0. */
    struct huffman_node_tag *parent;

    union
    {
        /* Internal node: children for bit 0 and bit 1. */
        struct
        {
            struct huffman_node_tag *zero;
            struct huffman_node_tag *one;
        };

        /* Leaf node: the byte value represented by this leaf. */
        unsigned char symbol;
    };
} huffman_node;

/*
 * One entry of the code table: the bit string assigned to a symbol.
 */
typedef struct huffman_code_tag
{
    /* The length of this code in bits. */
    unsigned long numbits;

    /*
     * The bits that make up this code, least significant bit first
     * within each byte:
     *   bit 0 -> position 0 of bits[0]
     *   bit 1 -> position 1 of bits[0]
     *   bit 7 -> position 7 of bits[0]
     *   bit 8 -> position 0 of bits[1]
     */
    unsigned char *bits;
} huffman_code;

二,实验流程及代码分析

1,Huffman编码流程

1)读入文件

2)第一次扫描文件,统计文件中各个字符出现的概率

3)建立码树

4)将码表写入文件

5)第二次扫描文件,对源文件进行编码并输出

2,代码分析

1)主函数操作:读取文件

intmain(int argc, char** argv){char memory = 0;//指示是否操作内存数据char compress = 1;//此处表示编码过程,若为0则表示为解码过程int opt;const char *file_in = NULL, *file_out = NULL;FILE *in = stdin;FILE *out = stdout;/* Get the command line arguments. */while((opt = getopt(argc, argv, "i:o:cdhvm")) != -1)//读取命令行参数,此处利用getopt函数读取,最后一个参数是单个字符。{switch(opt){case 'i'://inputfile_in = optarg;break;case 'o'://outputfile_out = optarg;break;case 'c'://codecompress = 1;break;case 'd'://decodecompress = 0;break;case 'h'://输出参数用法的说明usage(stdout);return 0;case 'v'://输出版本号的信息version(stdout);return 0;case 'm'://对内存解码memory = 1;break;default:usage(stderr);return 1;}}/* If an input file is given then open it. */if(file_in)//读取输入文件{in = fopen(file_in, "rb");if(!in){fprintf(stderr,"Can't open input file '%s': %s\n",file_in, strerror(errno));return 1;}}/* If an output file is given then create it. */if(file_out)//读取输出文件{out = fopen(file_out, "wb");if(!out){fprintf(stderr,"Can't open output file '%s': %s\n",file_out, strerror(errno));return 1;}}if(memory)//对内存数据进行编解码操作{return compress ?memory_encode_file(in, out) : memory_decode_file(in, out);}return compress ?huffman_encode_file(in, out) : huffman_decode_file(in, out);}
2)对文件编码:编码函数->huffman_encode_file()

inthuffman_encode_file(FILE *in, FILE *out){SymbolFrequencies sf;SymbolEncoder *se;huffman_node *root = NULL;int rc;unsigned int symbol_count;/* Get the frequency of each symbol in the input file. */symbol_count = get_symbol_frequencies(&sf, in);//第一遍扫描,统计字节出现的频率(再文件中,符号用字节表示)/* Build an optimal table from the symbolCount. */se = calculate_huffman_codes(&sf);//建立码树,得到码表root = sf[0];//表示根节点为sf[0]/* Scan the file again and, using the table   previously built, encode it into the output file. */rewind(in);//回到文件头,为第二次扫描做准备rc = write_code_table(out, se, symbol_count);//再输出文件中写入码表if(rc == 0)rc = do_file_encode(in, out, se);//根据码表对文件的字节进行编码/* Free the Huffman tree. */free_huffman_tree(root);free_encoder(se);return rc;}
2.1)统计字节出现概率的函数:get_symbol_frequencies(),上面编码函数涉及到的第一步

/*
 * Count the occurrences of every byte value read from `in`.
 *
 * pSF is first reset with init_frequencies(); a leaf node is then
 * created for each byte value the first time it is seen, and its
 * `count` is incremented once per occurrence.
 *
 * Returns the total number of bytes read.
 */
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    unsigned int seen = 0;
    int ch;

    /* Set all frequencies to 0. */
    init_frequencies(pSF);

    for(ch = fgetc(in); ch != EOF; ch = fgetc(in))
    {
        unsigned char sym = (unsigned char)ch;
        huffman_node **slot = &(*pSF)[sym];

        /* First sighting of this byte value: give it a leaf. */
        if(*slot == NULL)
        {
            *slot = new_leaf_node(sym);
        }

        (*slot)->count += 1;
        seen += 1;
    }

    return seen;
}
新建节点函数如下:
static huffman_node*new_leaf_node(unsigned char symbol){huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));//开空间p->isLeaf = 1;//初始化为叶节点p->symbol = symbol;p->count = 0;p->parent = 0;return p;}
2.2)建立码树

/*
 * calculate_huffman_codes turns pSF into an array
 * with a single entry that is the root of the
 * huffman tree. The return value is a SymbolEncoder,
 * which is an array of huffman codes index by symbol value.
 *
 * When no symbol is present (empty input) no tree is built, slot 0 of
 * pSF stays NULL and the returned encoder contains no codes.
 *
 * Returns NULL if the encoder cannot be allocated.
 */
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

    /*
     * Sort the symbol frequency array by ascending frequency.
     * SFComp puts NULL (unused) entries at the end.
     */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

    /* Get the number of symbols that actually occur. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /*
     * Construct a Huffman tree. This code is based
     * on the algorithm given in Managing Gigabytes
     * by Ian Witten et al, 2nd edition, page 34.
     * Note that this implementation uses a simple
     * count instead of probability.
     *
     * n symbols need n - 1 merges. The bound is written as
     * `i + 1 < n` because n is unsigned: `n - 1` would wrap around
     * for n == 0 and the loop would dereference NULL entries.
     */
    for(i = 0; i + 1 < n; ++i)
    {
        /* Set m1 and m2 to the two subsets of least probability. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /* Replace m1 and m2 with a set {m1, m2} whose probability
         * is the sum of that of m1 and m2. */
        (*pSF)[0] = m1->parent = m2->parent =
            new_nonleaf_node(m1->count + m2->count, m1, m2);
        (*pSF)[1] = NULL;

        /* Put newSet into the correct count position in pSF. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Build the SymbolEncoder array from the tree. */
    pSE = malloc(sizeof(SymbolEncoder));
    if(pSE == NULL)
        return NULL;
    memset(pSE, 0, sizeof(SymbolEncoder));
    build_symbol_encoder((*pSF)[0], pSE);

    return pSE;
}
qsort中用到的自定义排序顺序:

/* * When used by qsort, SFComp sorts the array so that * the symbol with the lowest frequency is first. Any * NULL entries will be sorted to the end of the list. */static intSFComp(const void *p1, const void *p2)//将节点按照出现概率由小到大排序{const huffman_node *hn1 = *(const huffman_node**)p1;const huffman_node *hn2 = *(const huffman_node**)p2;//定义两个自定义的节点,用于比较,节点的定义见后一个代码块/* Sort all NULLs to the end. */if(hn1 == NULL && hn2 == NULL)//两个节点都空,返回0return 0;if(hn1 == NULL)//第一节点空,第二节点比第一节点大,返回1return 1;if(hn2 == NULL)//第二节点空,第一节点比第二节点大,返回-1return -1;if(hn1->count > hn2->count)//都不空,则比较count数值,1>2则返回1,1<2则返回-1return 1;else if(hn1->count < hn2->count)return -1;return 0;}
自定义的建立内部节点函数:

/*
 * Allocate an internal (non-leaf) node with weight `count` whose
 * 0-branch is `zero` and whose 1-branch is `one`. The new node has no
 * parent; the children's `parent` pointers are left untouched.
 * The caller owns the returned node.
 */
static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
    huffman_node *node = malloc(sizeof *node);

    node->parent = NULL;
    node->one = one;
    node->zero = zero;
    node->count = count;
    node->isLeaf = 0;

    return node;
}

2.3)生成码字

生成码字的过程如下:首先从根节点开始遍历,找到树叶节点,对于每一个树叶节点,都是从底层逐层向上回到根节点,在这个过程中依次编码。这里必须提到一点:码字的长度可能超过8位(例如某个码字有9位),而我们用来存储码字的数组为unsigned char类型,一个unsigned char的元素只有8位,所以存储时要按字节拆分:以9位码字为例,低8位存储在bits[0]中,剩下的最高一位存储在bits[1]的最低一位。本程序中,首先使用生成码字的函数new_code();此时生成的是从下层到上层的码字,也就是倒序的码字;随后再用倒序排列函数reverse_bits()得到正序的码字。

生成码字:

/*
 * build_symbol_encoder fills in a SymbolEncoder by visiting every leaf
 * below `subtree` (0-branch before 1-branch) and storing the code
 * computed for that leaf at the index of the leaf's symbol.
 * A NULL subtree is ignored.
 */
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
    if(!subtree)
    {
        return;
    }

    if(!subtree->isLeaf)
    {
        /* Internal node: descend into both children. */
        build_symbol_encoder(subtree->zero, pSF);
        build_symbol_encoder(subtree->one, pSF);
        return;
    }

    /* Leaf: derive its code from the path back to the root. */
    (*pSF)[subtree->symbol] = new_code(subtree);
}

新建码字的函数:

/* * new_code builds a huffman_code from a leaf in * a Huffman tree. */static huffman_code*new_code(const huffman_node* leaf){/* Build the huffman code by walking up to * the root node and then reversing the bits, * since the Huffman code is calculated by * walking down the tree. */unsigned long numbits = 0;//定义码字位数numbits,同时它也表示树从上到下的第几层,bits表示存码字的数组unsigned char* bits = NULL;huffman_code *p;while(leaf && leaf->parent)//由下到上,不是根节点时进入循环{huffman_node *parent = leaf->parent;//得到该节点父节点,由码字位数可以得到码字的位置和码字的字节数unsigned char cur_bit = (unsigned char)(numbits % 8);unsigned long cur_byte = numbits / 8;/* If we need another byte to hold the code,   then allocate it. */if(cur_bit == 0)//一旦当前比特位=0,则表示超过8位到了高8位范围{size_t newSize = cur_byte + 1;bits = (char*)realloc(bits, newSize);//新建字节保存高位的码字bits[newSize - 1] = 0;//新增加的字节初始化为0 /* Initialize the new byte. */}/* If a one must be added then or it in. If a zero * must be added then do nothing, since the byte * was initialized to zero. */if(leaf == parent->one)//如果是右节点,左0右1,把当前字节设为1bits[cur_byte] |= 1 << cur_bit;//和1做或操作使得当前位=1++numbits;//码字位数+1leaf = parent;//为了串接,将leaf赋parent}if(bits)reverse_bits(bits, numbits);//倒序函数进行码字倒序p = (huffman_code*)malloc(sizeof(huffman_code));//输出码字p->numbits = numbits;p->bits = bits;return p;}

倒序码字的函数:

/*
 * Reverse, in place, the order of the first `numbits` bits stored in
 * `bits`.
 *
 * Bits are numbered least significant first within each byte: bit i
 * lives at position i % 8 of bits[i / 8]. After the call bit i holds
 * the value bit (numbits - 1 - i) had before.
 *
 * Unused padding bits in the last byte (positions >= numbits) are
 * cleared. Bytes beyond the last one that holds a code bit are not
 * touched. numbits == 0 is a no-op.
 *
 * The reversal swaps bit pairs from both ends towards the middle, so
 * no scratch buffer is needed.
 */
static void
reverse_bits(unsigned char* bits, unsigned long numbits)
{
    unsigned long lo, hi;

    if(numbits == 0)
        return;

    for(lo = 0, hi = numbits - 1; lo < hi; ++lo, --hi)
    {
        unsigned char lo_mask = (unsigned char)(1u << (lo % 8));
        unsigned char hi_mask = (unsigned char)(1u << (hi % 8));
        int lo_set = (bits[lo / 8] & lo_mask) != 0;
        int hi_set = (bits[hi / 8] & hi_mask) != 0;

        /* Swapping two bits only changes anything when they differ,
         * in which case both simply flip. */
        if(lo_set != hi_set)
        {
            bits[lo / 8] ^= lo_mask;
            bits[hi / 8] ^= hi_mask;
        }
    }

    /* Zero the padding above the last valid bit of the final byte. */
    if(numbits % 8)
        bits[numbits / 8] &= (unsigned char)((1u << (numbits % 8)) - 1u);
}

判断字节数的函数和取出bits中第i位的函数:

/*
 * numbytes_from_numbits returns how many whole bytes are needed to
 * hold `numbits` bits (numbits / 8, rounded up).
 */
static unsigned long
numbytes_from_numbits(unsigned long numbits)
{
    unsigned long whole = numbits / 8;
    unsigned long partial = (numbits % 8 != 0);

    return whole + partial;
}

/*
 * get_bit returns the ith bit in the bits array
 * in the 0th position of the return value.
 *
 * Bit i is taken from position i % 8 of byte i / 8.
 */
static unsigned char
get_bit(unsigned char* bits, unsigned long i)
{
    unsigned long byte_index = i / 8;
    unsigned long bit_index = i % 8;

    return (bits[byte_index] >> bit_index) & 1;
}

2.4)写入码表,对文件编码

/*
 * Write the huffman code table. The format is:
 * 4 byte code count in network byte order.
 * 4 byte number of bytes encoded
 *   (if you decode the data, you should get this number of bytes)
 * code1
 * ...
 * codeN, where N is the count read at the begginning of the file.
 * Each codeI has the following format:
 * 1 byte symbol, 1 byte code bit length, code bytes.
 * Each entry has numbytes_from_numbits code bytes.
 * The last byte of each code may have extra bits, if the number of
 * bits in the code is not a multiple of 8.
 *
 * Both header fields are written through an `unsigned int`, the same
 * type read_code_table reads them into (4 bytes on the usual
 * platforms). Writing them through an `unsigned long` would emit
 * 8 bytes on LP64 systems and shift everything that follows.
 *
 * Returns 0 on success and 1 if any write fails.
 */
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    unsigned long i;
    unsigned int count = 0;
    unsigned int net_value;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network byte order. */
    net_value = htonl(count);
    if(fwrite(&net_value, sizeof(net_value), 1, out) != 1)
        return 1;

    /* Write the number of bytes that will be encoded. */
    net_value = htonl(symbol_count);
    if(fwrite(&net_value, sizeof(net_value), 1, out) != 1)
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        unsigned int numbytes;

        if(!p)
            continue;

        /* Write the 1 byte symbol. */
        if(fputc((unsigned char)i, out) == EOF)
            return 1;

        /* Write the 1 byte code bit length. */
        if(fputc((unsigned char)p->numbits, out) == EOF)
            return 1;

        /* Write the code bytes (none for a zero-length code). */
        numbytes = (unsigned int)numbytes_from_numbits(p->numbits);
        if(numbytes > 0 && fwrite(p->bits, 1, numbytes, out) != numbytes)
            return 1;
    }

    return 0;
}

对文件进行编码:

/*
 * Encode every byte read from `in` with the code table `se` and write
 * the resulting bit stream to `out`.
 *
 * Code bits are packed least significant bit first; a byte is written
 * each time 8 bits have been collected, so codes of consecutive symbols
 * share bytes. A final, partially filled byte is written with its
 * unused high bits set to zero.
 *
 * Returns 0 on success. Returns 1 if a byte is read for which `se` has
 * no code (the input differs from the one the table was built from) or
 * if writing to `out` fails.
 */
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char curbyte = 0;  /* output byte being assembled */
    unsigned char curbit = 0;   /* next free bit position in curbyte */
    int c;

    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = (unsigned char)c;
        huffman_code *code = (*se)[uc];
        unsigned long i;

        /* No code for this byte value: refuse instead of crashing. */
        if(code == NULL)
            return 1;

        for(i = 0; i < code->numbits; ++i)
        {
            /* Add the current bit to curbyte. */
            curbyte |= (unsigned char)(get_bit(code->bits, i) << curbit);

            /* If this byte is filled up then write it
             * out and reset the curbit and curbyte. */
            if(++curbit == 8)
            {
                if(fputc(curbyte, out) == EOF)
                    return 1;
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /*
     * If there is data in curbyte that has not been
     * output yet, which means that the last encoded
     * character did not fall on a byte boundary,
     * then output it.
     */
    if(curbit > 0 && fputc(curbyte, out) == EOF)
        return 1;

    return 0;
}

3)对文件解码

解码流程:同样要先读入文件,再读取码表,建立码树,最后根据码树解码。

3.1)对文件进行解码,首先读取码表

/* * read_code_table builds a Huffman tree from the code * in the in file. This function returns NULL on error. * The returned value should be freed with free_huffman_tree. */static huffman_node*read_code_table(FILE* in, unsigned int *pDataBytes){huffman_node *root = new_nonleaf_node(0, NULL, NULL);unsigned int count;/* Read the number of entries.   (it is stored in network byte order). */if(fread(&count, sizeof(count), 1, in) != 1)//读取文件{free_huffman_tree(root);return NULL;}count = ntohl(count);/* Read the number of data bytes this encoding represents. */if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1){free_huffman_tree(root);return NULL;}*pDataBytes = ntohl(*pDataBytes);/* Read the entries. */while(count-- > 0)//依次读取码表中的每一项:符号,码长,码字{int c;unsigned int curbit;unsigned char symbol;unsigned char numbits;unsigned char numbytes;unsigned char *bytes;huffman_node *p = root;if((c = fgetc(in)) == EOF){free_huffman_tree(root);//一次读一字节,第一字节symbolreturn NULL;}symbol = (unsigned char)c;if((c = fgetc(in)) == EOF){free_huffman_tree(root);//第二字节码长numbitsreturn NULL;}numbits = (unsigned char)c;numbytes = (unsigned char)numbytes_from_numbits(numbits);//计算一个码需多少字节bytes = (unsigned char*)malloc(numbytes);//开空间if(fread(bytes, 1, numbytes, in) != numbytes)//读numbytes个字节得到码字{free(bytes);free_huffman_tree(root);return NULL;}/* * Add the entry to the Huffman tree. The value * of the current bit is used switch between * zero and one child nodes in the tree. New nodes * are added as needed in the tree. */for(curbit = 0; curbit < numbits; ++curbit)//读完码表的三种数据后就可以建立码树了{if(get_bit(bytes, curbit))//当前位为1,则建立右节点{if(p->one == NULL){//如果是最后一位,就建立树叶节点,否则就当做一个父节点,后续建立他的子节点p->one = curbit == (unsigned char)(numbits - 1)? new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->one->parent = p;}p = p->one;}else//否则建立左0节点{if(p->zero == NULL){p->zero = curbit == (unsigned char)(numbits - 1)? 
new_leaf_node(symbol): new_nonleaf_node(0, NULL, NULL);p->zero->parent = p;}p = p->zero;}}free(bytes);}return root;}

3.2)解码过程

inthuffman_decode_file(FILE *in, FILE *out)//解码函数{huffman_node *root, *p;int c;unsigned int data_count;/* Read the Huffman code table. */root = read_code_table(in, &data_count);//读入码表建立码树(利用上面的函数)if(!root)return 1;/* Decode the file. */p = root;while(data_count > 0 && (c = fgetc(in)) != EOF)//循环读入字节,依次解码{unsigned char byte = (unsigned char)c;unsigned char mask = 1;//mask控制实现读入码字的每一位while(data_count > 0 && mask){p = byte & mask ? p->one : p->zero;mask <<= 1;//左移1位读下一字节if(p->isLeaf)//如果到了最底层的树叶节点,则输出叶节点中的符号,再回到根节点。data_count表示没解码的符号数-1{fputc(p->symbol, out);p = root;--data_count;}}}free_huffman_tree(root);return 0;}

三,实验结果及总结

1、实验结果:

选用了10种不同格式的文件:


得到huffman编码后的.huff文件以及将频率分布结果写入excel表,统计了十种不同文件压缩前后的相关信息如下表所示:


以test10为例,概率统计的表格如下图所示,可以看出,出现概率大的符号码长短,这是Huffman可变长编码的特点,旨在尽量缩短平均码长。


10种格式的频率分布表如下图所示:

  

  

  

  

  

2、实验总结

设置调试参数时要格外注意:



另就实验中遇到的问题做如下总结:

1)在计算平均码长时,在excel表格中应注意,不能单纯使用average函数,而应该用概率*码长再加和。否则会导致平均码长比真实值偏大,导致从数据上看,huffman编码并不能很好地使平均码长趋于下界(信源熵)

2)报错1:转换到COFF期间失败:文件无效或损坏

解决方法:项目属性->清单工具->输入与输出->嵌入清单->是改成否

此解决方法在每一次新建工程的时候都需做出改变。

3)报错2:PDB格式不兼容(可能是由于用vs2010打开了vc++的工程导致,只要按照如下方法解决再run即可)

解决方法:生成->清理解决方案 或者 重新生成解决方案




                                             
0 0
原创粉丝点击