实验三 Huffman编解码算法实现与压缩效率分析
来源:互联网 发布:淘宝客怎样设置佣金 编辑:程序博客网 时间:2024/05/24 05:49
一、Huffman编解码原理
1. Huffman编码
对原始文件进行Huffman编码,首先需要解决以下几点问题:
- 文件符号的概率分布情况是怎样的?
- Huffman树是如何建立的?
- 建立起Huffman树后,又是怎样读出符号对应码字的?
这三个问题在程序中的实现思路如下图:
将待编码文件里的数据参照已形成的Huffman码表一一进行转换,就可以得到编码后的文件了。
2. Huffman解码
Huffman解码是查表+翻译的过程。读取随接收文件传来的码表后,再逐位读取文件实际数据,对照码表进行翻译即可。
二、程序实现
流程中最关键的对Huffman树的操作在程序中主要通过两个结构体实现:Huffman_node和Huffman_code。
建立的二叉树上每个节点都以Huffman_node类型存在。节点之间的主要关系有父子、兄弟,Huffman_node中定义了指向父节点的指针*parent和指向孩子的指针*zero, *one来表述节点与节点之间的关系。除此之外,还有节点本身的属性:isLeaf、count、symbol。
而编码码字定义为了Huffman_code,本身属性包括码字占用的比特数和码字本身。
具体程序如下,部分理解在注释中给出。
Huffcode.c
/*
 *  huffcode - Encode/Decode files using Huffman encoding.
 *  http://huffman.sourceforge.net
 *  Copyright (C) 2003  Douglas Ryan Richardson; Gauss Interprise, Inc
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "huffman.h"
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <assert.h>

#ifdef WIN32
#include <malloc.h>
extern int getopt(int, char**, char*);
extern char* optarg;
#else
#include <unistd.h>
#endif

static int memory_encode_file(FILE *in, FILE *out);
static int memory_decode_file(FILE *in, FILE *out);

/* Print the program name, version and copyright notice to out. */
static void
version(FILE *out)
{
	fputs("huffcode 0.3\n"
	      "Copyright (C) 2003 Douglas Ryan Richardson"
	      "; Gauss Interprise, Inc\n",
	      out);
}

/*
 * Print the command line help to out.
 *
 * The help text is ONE string literal built by adjacent-literal
 * concatenation; there must be no comma between the pieces, otherwise
 * fputs() receives too many arguments and the file does not compile.
 */
static void
usage(FILE* out)
{
	fputs("Usage: huffcode [-i<input file>] [-o<output file>] [-d|-c]\n"
	      "-i - input file (default is standard input)\n"
	      "-o - output file (default is standard output)\n"
	      "-d - decompress\n"
	      "-c - compress (default)\n"
	      "-m - read file into memory, compress, then write to file (not default)\n"
	      /* step1: by yzhang, for huffman statistics */
	      "-t - output huffman statistics\n",
	      out);
}

/*
 * Program entry point: parse the options, open the requested streams,
 * run the selected operation and close the streams again.
 *
 * Returns 0 on success and 1 on any failure (bad option, a file that
 * cannot be opened, or a failing encode/decode).
 */
int
main(int argc, char** argv)
{
	char memory = 0;
	char compress = 1;
	int opt;
	int rc;
	const char *file_in = NULL, *file_out = NULL;
	/* step1: add by yzhang for huffman statistics */
	const char *file_out_table = NULL;
	FILE *in = stdin;
	FILE *out = stdout;
	FILE *outTable = NULL;

	/* Get the command line arguments. */
	while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1)
	{
		switch(opt)
		{
		case 'i':
			file_in = optarg;
			break;
		case 'o':
			file_out = optarg;
			break;
		case 'c':
			compress = 1;	/* compress */
			break;
		case 'd':
			compress = 0;	/* decompress */
			break;
		case 'h':
			usage(stdout);
			return 0;
		case 'v':
			version(stdout);
			return 0;
		case 'm':
			memory = 1;
			break;
		/* by yzhang for huffman statistics */
		case 't':
			file_out_table = optarg;
			break;
		default:
			usage(stderr);
			return 1;
		}
	}

	/* If an input file is given then open it. */
	if(file_in)
	{
		in = fopen(file_in, "rb");
		if(!in)
		{
			fprintf(stderr,
				"Can't open input file '%s': %s\n",
				file_in, strerror(errno));
			return 1;
		}
	}

	/* If an output file is given then create it. */
	if(file_out)
	{
		out = fopen(file_out, "wb");
		if(!out)
		{
			fprintf(stderr,
				"Can't open output file '%s': %s\n",
				file_out, strerror(errno));
			return 1;
		}
	}

	/* by yzhang for huffman statistics: the table is a text file. */
	if(file_out_table)
	{
		outTable = fopen(file_out_table, "w");
		if(!outTable)
		{
			fprintf(stderr,
				"Can't open output file '%s': %s\n",
				file_out_table, strerror(errno));
			return 1;
		}
	}

	/*
	 * Run the selected operation. Every path falls through to the
	 * common cleanup below so that the streams are always flushed and
	 * closed and the operation's result becomes the exit status.
	 */
	if(memory)
		rc = compress ? memory_encode_file(in, out)
			      : memory_decode_file(in, out);
	else if(compress)
		/* step1: changed by yzhang, the statistics stream is passed along */
		rc = huffman_encode_file(in, out, outTable);
	else
		rc = huffman_decode_file(in, out);

	if(in)
		fclose(in);
	/* fclose flushes buffered output, so a failure here is a write error. */
	if(out && fclose(out) != 0)
		rc = 1;
	if(outTable && fclose(outTable) != 0)
		rc = 1;

	return rc ? 1 : 0;
}

/*
 * Read the remainder of in into a freshly allocated buffer.
 *
 * On success returns 0, stores the buffer in *pbuf (owned by the
 * caller, release with free()) and the number of bytes read in *plen.
 * Returns 1 on allocation failure, on a read error, or when the stream
 * was already at end-of-file before the first read.
 */
static int
read_whole_stream(FILE *in, unsigned char **pbuf, unsigned int *plen)
{
	const unsigned int inc = 1024;
	unsigned char *buf = NULL;
	unsigned int len = 0, cur = 0;

	while(!feof(in))
	{
		unsigned char *tmp;

		/* Grow by one chunk, keeping the old block valid on failure. */
		len += inc;
		tmp = (unsigned char*)realloc(buf, len);
		if(!tmp)
		{
			free(buf);
			return 1;
		}
		buf = tmp;

		cur += (unsigned int)fread(buf + cur, 1, inc, in);

		/* A read error never sets EOF; without this the loop would spin. */
		if(ferror(in))
		{
			free(buf);
			return 1;
		}
	}

	if(!buf)
		return 1;

	*pbuf = buf;
	*plen = cur;
	return 0;
}

/*
 * Compress in to out by loading the whole input into memory first.
 * Returns 0 on success, 1 on failure.
 */
static int
memory_encode_file(FILE *in, FILE *out)
{
	unsigned char *buf = NULL, *bufout = NULL;
	unsigned int cur = 0, bufoutlen = 0;
	int rc;

	assert(in && out);

	/* Read the file into memory. */
	if(read_whole_stream(in, &buf, &cur))
		return 1;

	/* Encode the memory. */
	rc = huffman_encode_memory(buf, cur, &bufout, &bufoutlen);
	free(buf);
	if(rc)
		return 1;

	/* Write the memory to the file. */
	rc = fwrite(bufout, 1, bufoutlen, out) != bufoutlen;
	free(bufout);
	return rc;
}

/*
 * Decompress in to out by loading the whole input into memory first.
 * Returns 0 on success, 1 on failure.
 */
static int
memory_decode_file(FILE *in, FILE *out)
{
	unsigned char *buf = NULL, *bufout = NULL;
	unsigned int cur = 0, bufoutlen = 0;
	int rc;

	assert(in && out);

	/* Read the file into memory. */
	if(read_whole_stream(in, &buf, &cur))
		return 1;

	/* Decode the memory. */
	rc = huffman_decode_memory(buf, cur, &bufout, &bufoutlen);
	free(buf);
	if(rc)
		return 1;

	/* Write the memory to the file. */
	rc = fwrite(bufout, 1, bufoutlen, out) != bufoutlen;
	free(bufout);
	return rc;
}
Huffman.c
/* * huffman - Encode/Decode files using Huffman encoding. * http://huffman.sourceforge.net * Copyright (C) 2003 Douglas Ryan Richardson; Gauss Interprise, Inc * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <assert.h>#include "huffman.h"#ifdef WIN32#include <winsock2.h>#include <malloc.h>#define alloca _alloca#else#include <netinet/in.h>#endiftypedef struct huffman_node_tag{ unsigned char isLeaf; unsigned long count; struct huffman_node_tag *parent; union { struct { struct huffman_node_tag *zero, *one; }; unsigned char symbol; };} huffman_node;typedef struct huffman_code_tag{ /* The length of this code in bits. */ unsigned long numbits; /* The bits that make up this code. The first bit is at position 0 in bits[0]. The second bit is at position 1 in bits[0]. The eighth bit is at position 7 in bits[0]. The ninth bit is at position 0 in bits[1]. 
*/ unsigned char *bits;} huffman_code;//step2:add by yzhang for huffman statistics//存放信源符号的信息:符号频率、比特数、符号码字typedef struct huffman_statistics_result{ float freq[256]; unsigned long numbits[256]; unsigned char bits[256][100];}huffman_stat;/*huffman_stat *init_huffstatistics(){ huffman_stat *p; int i; p = (huffman_stat*)malloc(sizeof(huffman_stat)); p->freq = (float *)malloc(sizeof(float)*256 ); p->numbits = (unsigned long *) malloc(sizeof(unsigned long)*256); for (i=0 ; i<256;i++) p->bits[i] = (unsigned char *)malloc(sizeof(unsigned char)*100); return p;}*///end by yzhang//将bit数转换为其对应的byte数,不能被8整除的部分要多分配一整个byte给它static unsigned longnumbytes_from_numbits(unsigned long numbits){ return numbits / 8 + (numbits % 8 ? 1 : 0);}/* * get_bit returns the ith bit in the bits array * in the 0th position of the return value. */static unsigned charget_bit(unsigned char* bits, unsigned long i){ return (bits[i / 8] >> i % 8) & 1;}//由于程序中从二叉树形成码字的过程是从叶到根的,所以需要bit反转函数来获得顺序正确的码字,同时以byte为单位对其进行规范化//例:传入倒序码字为010111011,通过bit反转函数变为00000001 10111010static voidreverse_bits(unsigned char* bits, unsigned long numbits){ unsigned long numbytes = numbytes_from_numbits(numbits); unsigned char *tmp = (unsigned char*)alloca(numbytes);//alloca与malloc功能相似,但alloca会自动释放申请的空间 unsigned long curbit; long curbyte = 0; memset(tmp, 0, numbytes);//将tmp指向空间的前numbytes个字节内容全部置0 for(curbit = 0; curbit < numbits; ++curbit) { unsigned int bitpos = curbit % 8; //如果一个byte写满了,就跳到下一个byte继续写 if(curbit > 0 && curbit % 8 == 0) ++curbyte; //通过get_bit函数从传入的bits里获得当前操作的比特结果,用移位运算将其移动到在一个byte里对应的位置 //由于tmp的指向操作是以byte为单位的,这里只能通过按位取或(|=)来把bit一个一个写到tmp指向的空间里去 //bit反转是靠numbits-curbit-1实现的 tmp[curbyte] |= (get_bit(bits, numbits - curbit - 1) << bitpos); } memcpy(bits, tmp, numbytes);//把反转后的tmp写回到bits里}/* * new_code builds a huffman_code from a leaf in * a Huffman tree. 
*/static huffman_code*new_code(const huffman_node* leaf){ /* Build the huffman code by walking up to * the root node and then reversing the bits, * since the Huffman code is calculated by * walking down the tree. */ unsigned long numbits = 0; unsigned char* bits = NULL; huffman_code *p; //此段while循环的目的是从传入的叶结点开始向上进行寻根,得到该叶结点对应的码字 while(leaf && leaf->parent) { huffman_node *parent = leaf->parent; unsigned char cur_bit = (unsigned char)(numbits % 8); unsigned long cur_byte = numbits / 8; /* If we need another byte to hold the code, then allocate it. */ if(cur_bit == 0) { size_t newSize = cur_byte + 1; bits = (unsigned char*)realloc(bits, newSize);//把bits所占的空间大小调整为newSize个字节 bits[newSize - 1] = 0; /* Initialize the new byte. */ } /* If a one must be added then or it in. If a zero * must be added then do nothing, since the byte * was initialized to zero. */ if(leaf == parent->one)//如果叶结点的地址等于该叶结点的爹妈的1孩子地址,则进行对应的移位操作 bits[cur_byte] |= 1 << cur_bit; ++numbits; leaf = parent; } if(bits) reverse_bits(bits, numbits); p = (huffman_code*)malloc(sizeof(huffman_code)); p->numbits = numbits; p->bits = bits; return p;//p里包含了编完的码字、码字长度}#define MAX_SYMBOLS 256typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];//传入符号,建立其对应的叶结点,设置参数static huffman_node*new_leaf_node(unsigned char symbol){ huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node)); p->isLeaf = 1; p->symbol = symbol; p->count = 0; p->parent = 0; return p;}//建立一个非叶结点,并将它的0、1孩子地址设置为传入的0、1结点地址static huffman_node*new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one){ huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node)); p->isLeaf = 0; p->count = count; p->zero = zero; p->one = one; p->parent = 0; return p;}static voidfree_huffman_tree(huffman_node *subtree){ if(subtree == NULL) return; if(!subtree->isLeaf) { free_huffman_tree(subtree->zero); free_huffman_tree(subtree->one); } free(subtree);}static voidfree_code(huffman_code* p){ 
free(p->bits); free(p);}static voidfree_encoder(SymbolEncoder *pSE){ unsigned long i; for(i = 0; i < MAX_SYMBOLS; ++i) { huffman_code *p = (*pSE)[i]; if(p) free_code(p); } free(pSE);}static voidinit_frequencies(SymbolFrequencies *pSF){ memset(*pSF, 0, sizeof(SymbolFrequencies));#if 0 unsigned int i; for(i = 0; i < MAX_SYMBOLS; ++i) { unsigned char uc = (unsigned char)i; (*pSF)[i] = new_leaf_node(uc); }#endif}typedef struct buf_cache_tag{ unsigned char *cache; unsigned int cache_len; unsigned int cache_cur; unsigned char **pbufout; unsigned int *pbufoutlen;} buf_cache;static int init_cache(buf_cache* pc, unsigned int cache_size, unsigned char **pbufout, unsigned int *pbufoutlen){ assert(pc && pbufout && pbufoutlen); if(!pbufout || !pbufoutlen) return 1; pc->cache = (unsigned char*)malloc(cache_size); pc->cache_len = cache_size; pc->cache_cur = 0; pc->pbufout = pbufout; *pbufout = NULL; pc->pbufoutlen = pbufoutlen; *pbufoutlen = 0; return pc->cache ? 0 : 1;}static void free_cache(buf_cache* pc){ assert(pc); if(pc->cache) { free(pc->cache); pc->cache = NULL; }}static int flush_cache(buf_cache* pc){ assert(pc); if(pc->cache_cur > 0) { unsigned int newlen = pc->cache_cur + *pc->pbufoutlen; unsigned char* tmp = realloc(*pc->pbufout, newlen); if(!tmp) return 1; memcpy(tmp + *pc->pbufoutlen, pc->cache, pc->cache_cur); *pc->pbufout = tmp; *pc->pbufoutlen = newlen; pc->cache_cur = 0; } return 0;}static int write_cache(buf_cache* pc, const void *to_write, unsigned int to_write_len){ unsigned char* tmp; assert(pc && to_write); assert(pc->cache_len >= pc->cache_cur); /* If trying to write more than the cache will hold * flush the cache and allocate enough space immediately, * that is, don't use the cache. 
*/ if(to_write_len > pc->cache_len - pc->cache_cur) { unsigned int newlen; flush_cache(pc); newlen = *pc->pbufoutlen + to_write_len; tmp = realloc(*pc->pbufout, newlen); if(!tmp) return 1; memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len); *pc->pbufout = tmp; *pc->pbufoutlen = newlen; } else { /* Write the data to the cache. */ memcpy(pc->cache + pc->cache_cur, to_write, to_write_len); pc->cache_cur += to_write_len; } return 0;}//为信源符号建立叶结点,统计次数static unsigned intget_symbol_frequencies(SymbolFrequencies *pSF, FILE *in){ int c; unsigned int total_count = 0; /* Set all frequencies to 0. */ init_frequencies(pSF); /* Count the frequency of each symbol in the input file. */ while((c = fgetc(in)) != EOF) { unsigned char uc = c; if(!(*pSF)[uc])//如果第一次遇到这个符号,则新建该符号的叶结点 (*pSF)[uc] = new_leaf_node(uc); ++(*pSF)[uc]->count;//对所有符号出现的次数分别进行计数 ++total_count; } return total_count;}static unsigned intget_symbol_frequencies_from_memory(SymbolFrequencies *pSF, const unsigned char *bufin, unsigned int bufinlen){ unsigned int i; unsigned int total_count = 0; /* Set all frequencies to 0. */ init_frequencies(pSF); /* Count the frequency of each symbol in the input file. */ for(i = 0; i < bufinlen; ++i) { unsigned char uc = bufin[i]; if(!(*pSF)[uc]) (*pSF)[uc] = new_leaf_node(uc); ++(*pSF)[uc]->count; ++total_count; } return total_count;}/* * When used by qsort, SFComp sorts the array so that * the symbol with the lowest frequency is first. Any * NULL entries will be sorted to the end of the list. */static intSFComp(const void *p1, const void *p2){ const huffman_node *hn1 = *(const huffman_node**)p1; const huffman_node *hn2 = *(const huffman_node**)p2; /* Sort all NULLs to the end. 
*/ if(hn1 == NULL && hn2 == NULL) return 0; if(hn1 == NULL) return 1; if(hn2 == NULL) return -1; if(hn1->count > hn2->count) return 1; else if(hn1->count < hn2->count) return -1; return 0;}#if 1static voidprint_freqs(SymbolFrequencies * pSF){ size_t i; for(i = 0; i < MAX_SYMBOLS; ++i) { if((*pSF)[i]) printf("%d, %ld\n", (*pSF)[i]->symbol, (*pSF)[i]->count); else printf("NULL\n"); }}#endif/* * build_symbol_encoder builds a SymbolEncoder by walking * down to the leaves of the Huffman tree and then, * for each leaf, determines its code. */static voidbuild_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF){ if(subtree == NULL) return; //如果传入的结点是叶结点,对其进行编码并存放在对应的指针指向的空间里;如果不是,用递归方法不断调用自身传入该结点的左、右孩子,直到叶结点 if(subtree->isLeaf) (*pSF)[subtree->symbol] = new_code(subtree); else { //递归 build_symbol_encoder(subtree->zero, pSF); build_symbol_encoder(subtree->one, pSF); }}/* * calculate_huffman_codes turns pSF into an array * with a single entry that is the root of the * huffman tree. The return value is a SymbolEncoder, * which is an array of huffman codes index by symbol value. */static SymbolEncoder*calculate_huffman_codes(SymbolFrequencies * pSF){ unsigned int i = 0; unsigned int n = 0; huffman_node *m1 = NULL, *m2 = NULL; SymbolEncoder *pSE = NULL;#if 1 printf("BEFORE SORT\n"); print_freqs(pSF); //演示堆栈的使用#endif /* Sort the symbol frequency array by ascending frequency. */ //qsort是自带的快速排序函数,参数为待排序数组的首地址(*pSF),排序元素数量(MAX_SYMBOLS),每个元素的长度(sizeof((*pSF)[0])),自定义的比较函数(SFComp,返回1则前〉后,-1则后〉前) qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp); //讲解SFComp函数的作用,断点在调试程序里的作用#if 1 printf("AFTER SORT\n"); print_freqs(pSF);#endif /* Get the number of symbols. */ for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n) ; /* * Construct a Huffman tree. This code is based * on the algorithm given in Managing Gigabytes * by Ian Witten et al, 2nd edition, page 34. * Note that this implementation uses a simple * count instead of probability. 
*/ for(i = 0; i < n - 1; ++i) { /* Set m1 and m2 to the two subsets of least probability. */ m1 = (*pSF)[0]; m2 = (*pSF)[1]; /* Replace m1 and m2 with a set {m1, m2} whose probability * is the sum of that of m1 and m2. */ (*pSF)[0] = m1->parent = m2->parent = new_nonleaf_node(m1->count + m2->count, m1, m2); (*pSF)[1] = NULL; /* Put newSet into the correct count position in pSF. */ qsort((*pSF), n, sizeof((*pSF)[0]), SFComp); } /* Build the SymbolEncoder array from the tree. */ pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder)); memset(pSE, 0, sizeof(SymbolEncoder)); build_symbol_encoder((*pSF)[0], pSE); return pSE;}/* * Write the huffman code table. The format is: * 4 byte code count in network byte order. * 4 byte number of bytes encoded * (if you decode the data, you should get this number of bytes) * code1 * ... * codeN, where N is the count read at the begginning of the file. * Each codeI has the following format: * 1 byte symbol, 1 byte code bit length, code bytes. * Each entry has numbytes_from_numbits code bytes. * The last byte of each code may have extra bits, if the number of * bits in the code is not a multiple of 8. */static intwrite_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count){ unsigned long i, count = 0; /* Determine the number of entries in se. */ for(i = 0; i < MAX_SYMBOLS; ++i) { if((*se)[i]) ++count; } /* Write the number of entries in network byte order. */ i = htonl(count); //在网络传输中,采用big-endian序,对于0x0A0B0C0D ,传输顺序就是0A 0B 0C 0D , //因此big-endian作为network byte order,little-endian作为host byte order。 //little-endian的优势在于unsigned char/short/int/long类型转换时,存储位置无需改变 if(fwrite(&i, sizeof(i), 1, out) != 1) return 1; /* Write the number of bytes that will be encoded. */ symbol_count = htonl(symbol_count); if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1) return 1; /* Write the entries. */ for(i = 0; i < MAX_SYMBOLS; ++i) { huffman_code *p = (*se)[i]; if(p) { unsigned int numbytes; /* Write the 1 byte symbol. 
*/ fputc((unsigned char)i, out); /* Write the 1 byte code bit length. */ fputc(p->numbits, out); /* Write the code bytes. */ numbytes = numbytes_from_numbits(p->numbits); if(fwrite(p->bits, 1, numbytes, out) != numbytes) return 1; } } return 0;}/* * Allocates memory and sets *pbufout to point to it. The memory * contains the code table. */static intwrite_code_table_to_memory(buf_cache *pc, SymbolEncoder *se, unsigned int symbol_count){ unsigned long i, count = 0; /* Determine the number of entries in se. */ for(i = 0; i < MAX_SYMBOLS; ++i) { if((*se)[i]) ++count; } /* Write the number of entries in network byte order. */ i = htonl(count); if(write_cache(pc, &i, sizeof(i))) return 1; /* Write the number of bytes that will be encoded. */ symbol_count = htonl(symbol_count); if(write_cache(pc, &symbol_count, sizeof(symbol_count))) return 1; /* Write the entries. */ for(i = 0; i < MAX_SYMBOLS; ++i) { huffman_code *p = (*se)[i]; if(p) { unsigned int numbytes; /* The value of i is < MAX_SYMBOLS (256), so it can be stored in an unsigned char. */ unsigned char uc = (unsigned char)i; /* Write the 1 byte symbol. */ if(write_cache(pc, &uc, sizeof(uc))) return 1; /* Write the 1 byte code bit length. */ uc = (unsigned char)p->numbits; if(write_cache(pc, &uc, sizeof(uc))) return 1; /* Write the code bytes. */ numbytes = numbytes_from_numbits(p->numbits); if(write_cache(pc, p->bits, numbytes)) return 1; } } return 0;}/* * read_code_table builds a Huffman tree from the code * in the in file. This function returns NULL on error. * The returned value should be freed with free_huffman_tree. */static huffman_node*read_code_table(FILE* in, unsigned int *pDataBytes){ //在解码端重建huffman树 huffman_node *root = new_nonleaf_node(0, NULL, NULL); unsigned int count; /* Read the number of entries. (it is stored in network byte order). 
*/ if(fread(&count, sizeof(count), 1, in) != 1) { free_huffman_tree(root); return NULL; } count = ntohl(count);//将一个无符号长整形数从网络字节顺序转换为主机字节顺序 /* Read the number of data bytes this encoding represents. */ if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1) { free_huffman_tree(root); return NULL; } *pDataBytes = ntohl(*pDataBytes); /* Read the entries. */ while(count-- > 0) { int c; unsigned int curbit; unsigned char symbol; unsigned char numbits; unsigned char numbytes; unsigned char *bytes; huffman_node *p = root; if((c = fgetc(in)) == EOF)//读取符号并判断 { free_huffman_tree(root); return NULL; } symbol = (unsigned char)c; if((c = fgetc(in)) == EOF)//读取字符长度并判断 { free_huffman_tree(root); return NULL; } numbits = (unsigned char)c; numbytes = (unsigned char)numbytes_from_numbits(numbits); bytes = (unsigned char*)malloc(numbytes); if(fread(bytes, 1, numbytes, in) != numbytes) { free(bytes); free_huffman_tree(root); return NULL; } /* * Add the entry to the Huffman tree. The value * of the current bit is used switch between * zero and one child nodes in the tree. New nodes * are added as needed in the tree. */ for(curbit = 0; curbit < numbits; ++curbit) { if(get_bit(bytes, curbit)) { if(p->one == NULL) { p->one = curbit == (unsigned char)(numbits - 1) ? new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL); p->one->parent = p; } p = p->one; } else { if(p->zero == NULL) { p->zero = curbit == (unsigned char)(numbits - 1) ? 
new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL); p->zero->parent = p; } p = p->zero; } } free(bytes); } return root;}static intmemread(const unsigned char* buf, unsigned int buflen, unsigned int *pindex, void* bufout, unsigned int readlen){ assert(buf && pindex && bufout); assert(buflen >= *pindex); if(buflen < *pindex) return 1; if(readlen + *pindex >= buflen) return 1; memcpy(bufout, buf + *pindex, readlen); *pindex += readlen; return 0;}static huffman_node*read_code_table_from_memory(const unsigned char* bufin, unsigned int bufinlen, unsigned int *pindex, unsigned int *pDataBytes){ huffman_node *root = new_nonleaf_node(0, NULL, NULL); unsigned int count; /* Read the number of entries. (it is stored in network byte order). */ if(memread(bufin, bufinlen, pindex, &count, sizeof(count))) { free_huffman_tree(root); return NULL; } count = ntohl(count); /* Read the number of data bytes this encoding represents. */ if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes))) { free_huffman_tree(root); return NULL; } *pDataBytes = ntohl(*pDataBytes); /* Read the entries. */ while(count-- > 0) { unsigned int curbit; unsigned char symbol; unsigned char numbits; unsigned char numbytes; unsigned char *bytes; huffman_node *p = root; if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol))) { free_huffman_tree(root); return NULL; } if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits))) { free_huffman_tree(root); return NULL; } numbytes = (unsigned char)numbytes_from_numbits(numbits); bytes = (unsigned char*)malloc(numbytes); if(memread(bufin, bufinlen, pindex, bytes, numbytes)) { free(bytes); free_huffman_tree(root); return NULL; } /* * Add the entry to the Huffman tree. The value * of the current bit is used switch between * zero and one child nodes in the tree. New nodes * are added as needed in the tree. 
*/ for(curbit = 0; curbit < numbits; ++curbit) { if(get_bit(bytes, curbit)) { if(p->one == NULL) { p->one = curbit == (unsigned char)(numbits - 1) ? new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL); p->one->parent = p; } p = p->one; } else { if(p->zero == NULL) { p->zero = curbit == (unsigned char)(numbits - 1) ? new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL); p->zero->parent = p; } p = p->zero; } } free(bytes); } return root;}static intdo_file_encode(FILE* in, FILE* out, SymbolEncoder *se){ unsigned char curbyte = 0; unsigned char curbit = 0; int c; while((c = fgetc(in)) != EOF) { unsigned char uc = (unsigned char)c; huffman_code *code = (*se)[uc]; unsigned long i; for(i = 0; i < code->numbits; ++i) { /* Add the current bit to curbyte. */ curbyte |= get_bit(code->bits, i) << curbit; /* If this byte is filled up then write it * out and reset the curbit and curbyte. */ if(++curbit == 8) { fputc(curbyte, out); curbyte = 0; curbit = 0; } } } /* * If there is data in curbyte that has not been * output yet, which means that the last encoded * character did not fall on a byte boundary, * then output it. */ if(curbit > 0)//写最后一个符号没写满8bit的情况 fputc(curbyte, out); return 0;}static intdo_memory_encode(buf_cache *pc, const unsigned char* bufin, unsigned int bufinlen, SymbolEncoder *se){ unsigned char curbyte = 0; unsigned char curbit = 0; unsigned int i; for(i = 0; i < bufinlen; ++i) { unsigned char uc = bufin[i]; huffman_code *code = (*se)[uc]; unsigned long i; for(i = 0; i < code->numbits; ++i) { /* Add the current bit to curbyte. */ curbyte |= get_bit(code->bits, i) << curbit; /* If this byte is filled up then write it * out and reset the curbit and curbyte. */ if(++curbit == 8) { if(write_cache(pc, &curbyte, sizeof(curbyte))) return 1; curbyte = 0; curbit = 0; } } } /* * If there is data in curbyte that has not been * output yet, which means that the last encoded * character did not fall on a byte boundary, * then output it. */ return curbit > 0 ? 
write_cache(pc, &curbyte, sizeof(curbyte)) : 0;}//step3:add by yzhang for huffman statisticsint huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st,int total_count){ int i,count =0; for(i = 0; i < MAX_SYMBOLS; ++i) { if((*SF)[i]) { st->freq[i]=(float)(*SF)[i]->count/total_count; count+=(*SF)[i]->count; } else { st->freq[i]= 0; } } if(count==total_count) return 1; else return 0;}int huffST_getcodeword(SymbolEncoder *se, huffman_stat *st){ unsigned long i,j; for(i = 0; i < MAX_SYMBOLS; ++i) { huffman_code *p = (*se)[i]; if(p) { unsigned int numbytes; st->numbits[i] = p->numbits; numbytes = numbytes_from_numbits(p->numbits); for (j=0;j<numbytes;j++) st->bits[i][j] = p->bits[j]; } else st->numbits[i] =0; } return 0;}void output_huffman_statistics(huffman_stat *st,FILE *out_Table){ int i,j; unsigned char c; fprintf(out_Table,"symbol\t freq\t codelength\t code\n"); for(i = 0; i < MAX_SYMBOLS; ++i) { fprintf(out_Table,"%d\t ",i); fprintf(out_Table,"%f\t ",st->freq[i]); fprintf(out_Table,"%d\t ",st->numbits[i]); if(st->numbits[i]) { for(j = 0; j < st->numbits[i]; ++j) { c =get_bit(st->bits[i], j); fprintf(out_Table,"%d",c); } } fprintf(out_Table,"\n"); }}//end by yzhang/* * huffman_encode_file huffman encodes in to out. */inthuffman_encode_file(FILE *in, FILE *out, FILE *out_Table) //step1:changed by yzhang for huffman statistics from (FILE *in, FILE *out) to (FILE *in, FILE *out, FILE *out_Table){ SymbolFrequencies sf; SymbolEncoder *se; huffman_node *root = NULL; int rc; unsigned int symbol_count; //step2:add by yzhang for huffman statistics huffman_stat hs; //end by yzhang /* Get the frequency of each symbol in the input file. */ symbol_count = get_symbol_frequencies(&sf, in); //演示扫描完一遍文件后,SF指针数组的每个元素的构成 //step3:add by yzhang for huffman statistics,... get the frequency of each symbol huffST_getSymFrequencies(&sf,&hs,symbol_count); //end by yzhang /* Build an optimal table from the symbolCount. 
*/ se = calculate_huffman_codes(&sf); root = sf[0]; //step3:add by yzhang for huffman statistics... output the statistics to file huffST_getcodeword(se, &hs); output_huffman_statistics(&hs,out_Table); //end by yzhang /* Scan the file again and, using the table previously built, encode it into the output file. */ rewind(in); rc = write_code_table(out, se, symbol_count); if(rc == 0) rc = do_file_encode(in, out, se); /* Free the Huffman tree. */ free_huffman_tree(root); free_encoder(se); return rc;}inthuffman_decode_file(FILE *in, FILE *out){ huffman_node *root, *p; int c; unsigned int data_count; /* Read the Huffman code table. */ root = read_code_table(in, &data_count); if(!root) return 1; /* Decode the file. */ p = root; while(data_count > 0 && (c = fgetc(in)) != EOF) { unsigned char byte = (unsigned char)c; unsigned char mask = 1; while(data_count > 0 && mask) { p = byte & mask ? p->one : p->zero; mask <<= 1; if(p->isLeaf) { fputc(p->symbol, out); p = root; --data_count; } } } free_huffman_tree(root); return 0;}#define CACHE_SIZE 1024int huffman_encode_memory(const unsigned char *bufin, unsigned int bufinlen, unsigned char **pbufout, unsigned int *pbufoutlen){ SymbolFrequencies sf; SymbolEncoder *se; huffman_node *root = NULL; int rc; unsigned int symbol_count; buf_cache cache; /* Ensure the arguments are valid. */ if(!pbufout || !pbufoutlen) return 1; if(init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen)) return 1; /* Get the frequency of each symbol in the input memory. */ symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen); /* Build an optimal table from the symbolCount. */ se = calculate_huffman_codes(&sf); root = sf[0]; /* Scan the memory again and, using the table previously built, encode it into the output memory. */ rc = write_code_table_to_memory(&cache, se, symbol_count); if(rc == 0) rc = do_memory_encode(&cache, bufin, bufinlen, se); /* Flush the cache. */ flush_cache(&cache); /* Free the Huffman tree. 
*/ free_huffman_tree(root); free_encoder(se); free_cache(&cache); return rc;}int huffman_decode_memory(const unsigned char *bufin, unsigned int bufinlen, unsigned char **pbufout, unsigned int *pbufoutlen){ huffman_node *root, *p; unsigned int data_count; unsigned int i = 0; unsigned char *buf; unsigned int bufcur = 0; /* Ensure the arguments are valid. */ if(!pbufout || !pbufoutlen) return 1; /* Read the Huffman code table. */ root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count); if(!root) return 1; buf = (unsigned char*)malloc(data_count); /* Decode the memory. */ p = root; for(; i < bufinlen && data_count > 0; ++i) { unsigned char byte = bufin[i]; unsigned char mask = 1; while(data_count > 0 && mask) { p = byte & mask ? p->one : p->zero; mask <<= 1; if(p->isLeaf) { buf[bufcur++] = p->symbol; p = root; --data_count; } } } free_huffman_tree(root); *pbufout = buf; *pbufoutlen = bufcur; return 0;}
三、结果分析
实验选取了10种文件类型进行Huffman编码,分别为bmp、doc、exe、pdf、png、ppt、rar、wav、xls、yuv。对编码后的文件进行分析,得到以下结果图表:
可以看到,进行Huffman编码后,大多数文件都变小了,压缩比在1到4之间。但也有rar这样经过编码后不小反大的文件。
再观察每个文件的字符概率分布情况:
对比联合图表可以发现,压缩比是由概率分布决定的。相比于实验选用的bmp、doc等字符概率比较集中的文件,字符概率分布平均分散的文件(如rar、png、pdf),压缩比更小,信源熵更大。
阅读全文
0 0
- 数据压缩实验三 Huffman编解码算法实现与压缩效率分析
- 《数据压缩》实验报告三·Huffman编解码算法实现与压缩效率分析
- 数据压缩原理 实验三 Huffman编解码算法实现与压缩效率分析
- 实验三Huffman编解码算法实现与压缩效率分析
- 实验三 Huffman编解码算法实现与压缩效率分析
- 数据压缩 实验三 Huffman编解码算法实现与压缩效率分析
- 数据压缩 实验三 Huffman编解码算法实现与压缩效率分析
- huffman编解码算法实验与压缩效率分析
- Huffman 编解码算法实现与压缩效率分析
- 数据压缩试验三 Huffman 编解码算法实现与压缩效率分析
- 数据压缩原理实验3_Huffman编解码算法实现与压缩效率分析
- 实验三—Huffman编解码
- 数据压缩实验三:Huffman编解码
- 数据压缩实验三:用c语言实现Huffman编码和压缩效率分析
- Huffman编解码实现文本压缩
- 实验3-huffman编解码
- 数据压缩实验三--Huffman编解码及压缩率的比较
- 实验三Huffman编码与解码
- iOS应用程序内购/内付费
- 深入理解 Java 虚拟机--早期(编译期)优化
- Ubuntu编译安装LLVM
- 学习淘淘商城第九十四课(js跨域分析)
- 多线程之互斥锁(By C++)与多进程
- 实验三 Huffman编解码算法实现与压缩效率分析
- 人脸识别--活体检测(眨眼检测)
- Centos 配置hostonly网络 使用winscp与服务器主机互联
- es6-symbol
- 【POJ2888】Magic Bracelet-Burnside引理+数论+DP矩阵优化
- java 转发和重定向
- RISC-V学习第一篇
- Getting Started with the LLVM System
- iOS应用内支付(IAP)详解