数据压缩原理实验3_实验报告

来源:互联网 发布:深圳淘宝摄影 编辑:程序博客网 时间:2024/05/18 13:26

一、实验原理
1、本实验中Huffman编码算法
(1)将文件以ASCII字符流的形式读入,统计每个符号的发生频率;
(2)将所有文件中出现过的字符按照频率从小到大的顺序排列;
(3)每一次选出最小的两个值,作为二叉树的两个叶子节点,将和作为它们的根节点, 这两个叶子节点不再参与比较,新的根节点参与比较;
(4)重复3,直到最后得到和为1的根节点;
(5)将形成的二叉树的左节点标0,右节点标1,把从最上面的根节点到最下面的叶子节点途中遇到的0、1序列串起来,得到了各个字符的编码表示。
2、Huffman编码的数据结构设计,在程序实现中使用一种叫做二叉树的数据结构实现Huffman编码。
(1)哈夫曼节点结构

/*
 * Node of the Huffman tree.
 *
 * A node is either a leaf carrying one symbol or an internal node carrying
 * two child pointers; the two cases share storage through the anonymous
 * union, so `symbol` must only be read when isLeaf is non-zero and
 * `zero`/`one` only when it is zero.
 */
typedef struct huffman_node_tag
{
    unsigned char isLeaf;               /* non-zero when this node is a leaf */
    unsigned long count;                /* weighted sum of the symbols this node represents */
    struct huffman_node_tag *parent;    /* parent node pointer */
    union
    {
        struct
        {
            /* child pointers for the 0 branch and the 1 branch */
            struct huffman_node_tag *zero, *one;
        };
        unsigned char symbol;           /* symbol represented by a leaf */
    };
} huffman_node;

(2)哈夫曼码结构

/* Code word assigned to one symbol. */
typedef struct huffman_code_tag
{
    unsigned long numbits;  /* number of bits used by this code */
    unsigned char *bits;    /* pointer to the bit string of this code */
} huffman_code;

二、实验步骤
编码流程
1.huffman编码流程
(1)读入源文件

    char memory = 0; //memory为1表示对内存编码    char compress = 1;//compress为1表示压缩,为0是解压    int opt;    //add by zhn    const char *file_in = NULL, *file_out = NULL;    const char *file_table=NULL;    FILE *in = stdin;//标准输入    FILE *out = stdout;//标准输出    //add by zhn    FILE *table;//输出码表    while((opt = getopt(argc, argv, "i:o:t:cdhvm")) != -1)//对argc,argv的解析,单个字符后跟一个冒号表示后面必须接参数    {        switch(opt)        {        case 'i':            file_in = optarg;            break;        case 'o':            file_out = optarg;            break;        //add by zhn        case 't':            file_table = optarg;            break;        case 'c':            compress = 1;            break;        case 'd':            compress = 0;            break;        case 'h':            usage(stdout);            return 0;        case 'v':            version(stdout);            return 0;        case 'm':            memory = 1;            break;        default:            usage(stderr);            return 1;        }    }

(2)第一次扫描,统计文件中各个字符出现频率

#define MAX_SYMBOLS 256//共有256个字符typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];static voidinit_frequencies(SymbolFrequencies *pSF){    memset(*pSF, 0, sizeof(SymbolFrequencies));}static huffman_node*new_leaf_node(unsigned char symbol)//新建叶子结点父节点为0{    huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));    p->isLeaf = 1;    p->symbol = symbol;    p->count = 0;    p->parent = 0;    return p;}static unsigned intget_symbol_frequencies(SymbolFrequencies *pSF, FILE *in){    int c;    unsigned int total_count = 0;    /* Set all frequencies to 0. */    init_frequencies(pSF);//初始化256个节点    /* Count the frequency of each symbol in the input file. */    while((c = fgetc(in)) != EOF)//每次取一个字符    {        unsigned char uc = c;        if(!(*pSF)[uc])//如果该字符以前没有出现则建立新的叶子节点            (*pSF)[uc] = new_leaf_node(uc);        ++(*pSF)[uc]->count;//统计频率        ++total_count;    }    return total_count;}

(3)按频率从小到大排序并建立Huffman树

static intSFComp(const void *p1, const void *p2){    const huffman_node *hn1 = *(const huffman_node**)p1;    const huffman_node *hn2 = *(const huffman_node**)p2;    /* Sort all NULLs to the end. */    if(hn1 == NULL && hn2 == NULL)        return 0;    if(hn1 == NULL)        return 1;    if(hn2 == NULL)        return -1;    if(hn1->count > hn2->count)//qsort为1时排为elem2,elem2;-1时排为elem1,elem2;        return 1;    else if(hn1->count < hn2->count)        return -1;    return 0;}//以频率从从小到大排序,并且count为0时不参与排序static voidbuild_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF){    if(subtree == NULL)        return;    if(subtree->isLeaf)    {        (*pSF)[subtree->symbol] = new_code(subtree);    }    else    {        build_symbol_encoder(subtree->zero, pSF);        build_symbol_encoder(subtree->one, pSF);    }}//先遍历右节点,再遍历左节点,直到为叶子结点才分配码字static SymbolEncoder*calculate_huffman_codes(SymbolFrequencies * pSF){    unsigned int i = 0;    unsigned int n = 0;    huffman_node *m1 = NULL, *m2 = NULL;    SymbolEncoder *pSE = NULL;    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);//qsort以SFComp为函数进行升序排序,并交换地址,    /* 得到非零字符的个数 */    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)        ;    for(i = 0; i < n - 1; ++i)    {        /* Set m1 and m2 to the two subsets of least probability. */            m1 = (*pSF)[0];        m2 = (*pSF)[1];        /* Replace m1 and m2 with a set {m1, m2} whose probability         * is the sum of that of m1 and m2. */        (*pSF)[0] = m1->parent = m2->parent =            new_nonleaf_node(m1->count + m2->count, m1, m2);        (*pSF)[1] = NULL;        /* Put newSet into the correct count position in pSF. */        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);    }    /* Build the SymbolEncoder array from the tree. */    pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));    memset(pSE, 0, sizeof(SymbolEncoder));    build_symbol_encoder((*pSF)[0], pSE);    return pSE;}

(4)将码表和其他信息写入输出文件

/*
 * Write the code table header and entries to `out`.
 *
 * Layout: entry count, then symbol_count, each as an unsigned int converted
 * with htonl(); then for every symbol that has a code: 1 byte symbol,
 * 1 byte code length in bits, followed by the code bytes.
 *
 * The entry count is written as an unsigned int, not an unsigned long, so
 * that its size matches what read_code_table() reads back on platforms where
 * the two types differ in width.
 *
 * Returns 0 on success, 1 if any write fails.
 */
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    unsigned long i;
    unsigned int count = 0;
    unsigned int be_count;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network byte order. */
    be_count = htonl(count);
    if(fwrite(&be_count, sizeof(be_count), 1, out) != 1)
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbol_count = htonl(symbol_count);
    if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;

            /* Write the 1 byte symbol. */
            if(fputc((unsigned char)i, out) == EOF)
                return 1;
            /* Write the 1 byte code bit length. */
            if(fputc((unsigned char)p->numbits, out) == EOF)
                return 1;
            /* Write the code bytes. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(fwrite(p->bits, 1, numbytes, out) != numbytes)
                return 1;
        }
    }

    return 0;
}

(5)第二次扫描文件,对文件查表进行Huffman编码

/* Return bit `i` of the bit string `bits`; bit 0 is the least significant
 * bit of bits[0]. */
static unsigned char
get_bit(unsigned char* bits, unsigned long i)
{
    return (bits[i / 8] >> (i % 8)) & 1;
}

/*
 * Second pass over the input: replace every byte read from `in` by its code
 * from `se` and write the packed bit stream to `out`.
 *
 * Returns 0 on success, 1 if a byte has no code in `se` or a write fails.
 */
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char curbyte = 0;
    unsigned char curbit = 0;
    int c;

    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = (unsigned char)c;
        huffman_code *code = (*se)[uc];
        unsigned long i;

        /* A byte without a table entry cannot be encoded. */
        if(code == NULL)
            return 1;

        for(i = 0; i < code->numbits; ++i)
        {
            /* Add the current bit to curbyte. */
            curbyte |= get_bit(code->bits, i) << curbit;

            /* If this byte is filled up then write it
             * out and reset the curbit and curbyte. */
            if(++curbit == 8)
            {
                if(fputc(curbyte, out) == EOF)
                    return 1;
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /*
     * If there is data in curbyte that has not been
     * output yet, which means that the last encoded
     * character did not fall on a byte boundary,
     * then output it.
     */
    if(curbit > 0 && fputc(curbyte, out) == EOF)
        return 1;

    return 0;
}

(6)释放码树

/* Free `subtree` and every node below it. NULL is accepted and ignored. */
static void
free_huffman_tree(huffman_node *subtree)
{
    if(subtree == NULL)
        return;

    /* Only internal nodes hold child pointers. */
    if(!subtree->isLeaf)
    {
        free_huffman_tree(subtree->zero);
        free_huffman_tree(subtree->one);
    }

    free(subtree);
}

/* Free every code stored in *pSE and then the table itself. NULL is accepted
 * and ignored, like in free_huffman_tree(). */
static void
free_encoder(SymbolEncoder *pSE)
{
    unsigned long i;

    if(pSE == NULL)
        return;

    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*pSE)[i];
        if(p)
            free_code(p);
    }

    free(pSE);
}

2.huffman解码流程
(1)读入解码文件,读取码树

static huffman_node*read_code_table(FILE* in, unsigned int *pDataBytes){    huffman_node *root = new_nonleaf_node(0, NULL, NULL);//建立根节点    unsigned int count;    if(fread(&count, sizeof(count), 1, in) != 1)//读取符号数,如果读取失败则不解码直接返回    {        free_huffman_tree(root);        return NULL;    }    count = ntohl(count);    if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1)//读取总的解码出来的文件字节数    {        free_huffman_tree(root);        return NULL;    }    *pDataBytes = ntohl(*pDataBytes);    /* Read the entries. */    while(count-- > 0)//文件指向码表,第一项为信源符号,第二项为码字长度,码字    {        int c;        unsigned int curbit;        unsigned char symbol;        unsigned char numbits;        unsigned char numbytes;        unsigned char *bytes;        huffman_node *p = root;        if((c = fgetc(in)) == EOF)//信源符号        {            free_huffman_tree(root);            return NULL;        }        symbol = (unsigned char)c;        if((c = fgetc(in)) == EOF)//码长        {            free_huffman_tree(root);            return NULL;        }        numbits = (unsigned char)c;        numbytes = (unsigned char)numbytes_from_numbits(numbits);//码长转化为字节大小        bytes = (unsigned char*)malloc(numbytes);        if(fread(bytes, 1, numbytes, in) != numbytes)        {            free(bytes);            free_huffman_tree(root);            return NULL;        }        for(curbit = 0; curbit < numbits; ++curbit)        {            if(get_bit(bytes, curbit))//如果当前码字为1,则建立一个右节点            {                if(p->one == NULL)//如果右节点不存在,则新建右节点                {                    p->one = curbit == (unsigned char)(numbits - 1)//如果是当前的比特位到达了最后则建立一个叶子节点,没有则建立一个中间节点                        ? 
new_leaf_node(symbol)                        : new_nonleaf_node(0, NULL, NULL);                    p->one->parent = p;                }                p = p->one;//把当前节点当成中间节点,以便建立后面节点            }            else//当前码字为0,则建立左节点            {                if(p->zero == NULL)                {                    p->zero = curbit == (unsigned char)(numbits - 1)                        ? new_leaf_node(symbol)                        : new_nonleaf_node(0, NULL, NULL);                    p->zero->parent = p;                }                p = p->zero;            }        }        free(bytes);    }    return root;}

(2)解码文件

inthuffman_decode_file(FILE *in, FILE *out){    huffman_node *root, *p;    int c;    unsigned int data_count;    /* Read the Huffman code table. */    root = read_code_table(in, &data_count);    if(!root)        return 1;    /* Decode the file. */    p = root;    while(data_count > 0 && (c = fgetc(in)) != EOF)//解码没完成时条件成立    {        unsigned char byte = (unsigned char)c;//当前符号        unsigned char mask = 1;        while(data_count > 0 && mask)        {            p = byte & mask ? p->one : p->zero;//从根节点按码字遍历            mask <<= 1;            if(p->isLeaf)//遍历到叶子结点时输出信源符号,并且重新返回根节点            {                fputc(p->symbol, out);                p = root;                --data_count;            }        }    }    free_huffman_tree(root);    return 0;}

(3)输出码表

/*
 * Write a tab-separated, human-readable code table to `out`: one header line,
 * then one line per symbol that has a code, holding the symbol value, its
 * probability, the code length in bits and the code as a string of 0/1
 * digits. Always returns 0.
 *
 * NOTE(review): this reads p->count, which is not a member of the
 * huffman_code definition shown in this document; confirm that the struct
 * used by the build carries a count field.
 */
int write_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    int i;
    unsigned long j;

    /* Header line. */
    fprintf(out,"字符\t概率\t长度\t码字\n");

    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            /* Relative frequency of the symbol. */
            float chance=p->count/(double)symbol_count;

            fprintf(out,"%d\t",i);              /* symbol value */
            fprintf(out,"%f\t",chance);         /* probability */
            fprintf(out,"%lu\t",p->numbits);    /* code length; numbits is unsigned long */

            /* Print the code one bit at a time. */
            for(j=0;j<p->numbits;j++)
            {
                unsigned char c=get_bit(p->bits,j);
                fprintf(out,"%d",c);
            }
            fprintf(out,"\t\n");
        }
    }

    return 0;
}

三、实验结果
总表

test1
test1

test2
2

test3
这里写图片描述

test4
这里写图片描述

test5
这里写图片描述

test6
这里写图片描述

test7
这里写图片描述

test8
这里写图片描述

test9
这里写图片描述

test10
这里写图片描述

四、实验结论
1、各文件类型压缩编码后的平均码长和信源熵大小基本相同,平均码长无限逼近信源熵,可以验证无失真编码的平均码长界限定理。
2、huffman编码对于概率分布不均匀,数据较大的信源压缩效果好,而对于符号分布接近等概,数据较小的压缩效果则比较差。

原创粉丝点击