赫夫曼编码\译码

来源:互联网 发布:影视后期编辑软件 编辑:程序博客网 时间:2024/05/02 00:12

赫夫曼编码

通过赫夫曼编码可以节省存储空间,在计算机科学中有广泛的应用。本文程序生成的文件也得到了有效的压缩,其中用到了大量的位操作。这些操作用C语言写多少有点不方便。
以下是hfmTree.h的内容,这是个公共的头文件,其余源文件都需要包含它。
/*
 * hfmTree.h - shared declarations for the Huffman tools (Init.c, Encod.c,
 * Decod.c). Every source file of the project includes this header.
 *
 * Both structures are written to and read from the "hfmTree" file as raw
 * bytes, so their field order and types must not change.
 */
#ifndef HFMTREE_H
#define HFMTREE_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* A node of the Huffman tree. */
struct Node {
    char ch;                /* symbol stored in a leaf */
    int weight;             /* weight of the symbol, or sum of the children's weights */
    /*
     * Byte offsets inside the "hfmTree" file: of this node, of its left child
     * and of its right child (0 means "no child"). Keeping them in every
     * in-memory node wastes some space, but avoids a second structure that
     * would exist only for the on-disk form.
     */
    int filepoint, lf, rf;
    struct Node *lchild;    /* left subtree (bit 0), NULL in a leaf */
    struct Node *rchild;    /* right subtree (bit 1), NULL in a leaf */
};

/* One entry of the code table used while encoding. */
struct Code {
    char ch;                /* symbol */
    unsigned int code;      /* code bits, right-aligned in the low `length` bits */
    int length;             /* number of significant bits in `code` */
};

#endif /* HFMTREE_H */

以下是Init.c中的内容,这个文件主要是根据输入的数据建立赫夫曼树,并保存在hfmTree文件中,编码和译码时都要用到hfmTree文件。
#include "hfmTree.h"

/*
 * Init.c - reads (symbol, weight) pairs from standard input, builds the
 * Huffman tree, derives the code table and stores both in the file "hfmTree".
 *
 * Layout of "hfmTree" as produced by writetofile():
 *   offset 0           int n             number of symbols
 *   offset 4           int code_offset   byte offset of the code table
 *   offset 8           struct Node ...   tree nodes in pre-order, root first
 *   offset code_offset struct Code[n]    code table
 * The fixed offsets 4 and 8 assume a 4-byte int, exactly as the readers do.
 */

enum {
    CODE_OFFSET_POS = 4,        /* where the code-table offset is stored */
    FIRST_NODE_POS = 8,         /* where the root node is stored */
    MAX_CODE_BITS = 32,         /* struct Code keeps the bits in one unsigned int */
    MAX_WEIGHT = 0x7fffffff     /* merged weights saturate here */
};

int n = 0;              /* number of symbols (leaves of the tree) */
int filepoint = 0;      /* next free byte offset while laying out the file */
struct Code *code;      /* code table with n entries */
struct Node *root;      /* root of the Huffman tree */

int input();
int showCode();
int toCode(struct Node *root, int ncode, int length);
int writetofile(const char *filename);
static int writefilepoint(struct Node *root, FILE *fout);
static int makefilepoint(struct Node *root);
int deltree(struct Node *tree);
static void discard_line(void);

/*
 * Reads the symbols, builds the tree and the code table, writes "hfmTree"
 * and prints the codes. Returns 0 on success and 1 on any failure.
 */
int main() {
    int status = 1;

    if (input() != 0) {
        return 1;
    }

    /* calloc leaves every length at 0, which toCode() treats as "slot free". */
    code = calloc((size_t)n, sizeof *code);
    if (code == NULL) {
        printf("out of memory!\n");
        deltree(root);
        return 1;
    }

    if (toCode(root, 0, 0) == 0 && writetofile("hfmTree") == 0) {
        showCode();
        status = 0;
    }

    free(code);
    deltree(root);
    return status;
}

/* Prints every symbol followed by its code as a string of '0'/'1'. Returns 0. */
int showCode() {
    int i;
    int j;

    for (i = 0; i < n; i++) {
        printf("%c:", code[i].ch);
        for (j = code[i].length - 1; j >= 0; j--) {
            /* 1u keeps the shift well defined for j == 31. */
            putchar((code[i].code & (1u << j)) ? '1' : '0');
        }
        printf("\n");
    }
    return 0;
}

/*
 * Fills the global code table from the tree.
 *
 * root   - subtree to process
 * ncode  - code bits accumulated on the way down (left = 0, right = 1)
 * length - number of bits accumulated so far
 *
 * Leaves are stored in the first free slot of `code`, i.e. in left-to-right
 * order. Returns 0 on success, -1 when a code does not fit into
 * MAX_CODE_BITS bits or the table is already full.
 */
int toCode(struct Node *root, int ncode, int length) {
    int i;

    if (root == NULL) {
        return -1;
    }

    if (root->lchild != NULL && root->rchild != NULL) {
        /* Shift as unsigned: shifting a bit into the sign of an int is undefined. */
        unsigned int prefix = (unsigned int)ncode << 1;

        if (toCode(root->lchild, (int)prefix, length + 1) != 0) {
            return -1;
        }
        return toCode(root->rchild, (int)(prefix | 1u), length + 1);
    }

    /* A tree with a single symbol would get an empty code; give it one bit. */
    if (length == 0) {
        length = 1;
    }
    if (length > MAX_CODE_BITS) {
        printf("code too long!\n");
        return -1;
    }

    for (i = 0; i < n && code[i].length != 0; i++) {
        /* look for the first unused slot */
    }
    if (i == n) {
        return -1;
    }

    code[i].ch = root->ch;
    code[i].code = (unsigned int)ncode;
    code[i].length = length;
    return 0;
}

/*
 * Writes the symbol count, the tree and the code table to `filename`
 * (layout described at the top of this file).
 * Returns 0 on success, -1 when there is nothing to write or on an I/O error.
 */
int writetofile(const char *filename) {
    FILE *fout;
    int ok;

    if (root == NULL || code == NULL || n <= 0) {
        return -1;
    }

    fout = fopen(filename, "wb");
    if (fout == NULL) {
        printf("file open error!\n");
        return -1;
    }

    ok = fwrite(&n, sizeof(int), 1, fout) == 1;

    /* Assign file offsets first, then write each node at its offset. */
    filepoint = FIRST_NODE_POS;
    makefilepoint(root);
    if (writefilepoint(root, fout) != 0) {
        ok = 0;
    }

    /* After the layout pass, filepoint is the first byte behind the last node. */
    ok = ok
        && fseek(fout, CODE_OFFSET_POS, SEEK_SET) == 0
        && fwrite(&filepoint, sizeof(int), 1, fout) == 1
        && fseek(fout, filepoint, SEEK_SET) == 0
        && fwrite(code, sizeof(struct Code), (size_t)n, fout) == (size_t)n;

    if (fclose(fout) != 0) {
        ok = 0;
    }
    if (!ok) {
        printf("file write error!\n");
        return -1;
    }
    return 0;
}

/* Writes every node of the subtree at the offset stored in its filepoint field. */
static int writefilepoint(struct Node *root, FILE *fout) {
    if (root == NULL) {
        return 0;
    }
    if (fseek(fout, root->filepoint, SEEK_SET) != 0
        || fwrite(root, sizeof(struct Node), 1, fout) != 1) {
        return -1;
    }
    if (writefilepoint(root->lchild, fout) != 0) {
        return -1;
    }
    return writefilepoint(root->rchild, fout);
}

/*
 * Assigns file offsets in pre-order, advancing the global `filepoint`.
 * lf/rf receive the offsets of the children, or 0 when a child is missing.
 */
static int makefilepoint(struct Node *root) {
    if (root == NULL) {
        return 0;
    }

    root->filepoint = filepoint;
    filepoint += (int)sizeof(struct Node);

    if (root->lchild != NULL) {
        root->lf = filepoint;
        makefilepoint(root->lchild);
    } else {
        root->lf = 0;
    }

    if (root->rchild != NULL) {
        root->rf = filepoint;
        makefilepoint(root->rchild);
    } else {
        root->rf = 0;
    }
    return 0;
}

/* Frees a whole subtree; a NULL tree is accepted. Returns 0. */
int deltree(struct Node *tree) {
    if (tree != NULL) {
        deltree(tree->lchild);
        deltree(tree->rchild);
        free(tree);
    }
    return 0;
}

/* Skips the rest of the current input line, including the newline. */
static void discard_line(void) {
    int c;

    do {
        c = getchar();
    } while (c != '\n' && c != EOF);
}

/*
 * Reads n and then n lines "<symbol> <weight>" from standard input and builds
 * the Huffman tree in the global `root`.
 *
 * Each round merges the two lightest trees; on equal weights the tree with the
 * lower table index wins, so a given input always produces the same tree.
 * Returns 0 on success. On invalid input or allocation failure everything is
 * released, n is reset to 0, root to NULL, and -1 is returned.
 */
int input() {
    struct Node **table = NULL;
    struct Node *temp;
    int remaining;
    int i;

    root = NULL;

    printf("Input:n=");
    if (scanf("%d", &n) != 1 || n <= 0) {
        printf("invalid input!\n");
        n = 0;
        return -1;
    }
    /* Drop the rest of the line so the next %c does not read the newline. */
    discard_line();

    table = calloc((size_t)n, sizeof *table);
    if (table == NULL) {
        goto fail;
    }

    printf("example(they are in different line):w 34\ne 56\n");
    for (i = 0; i < n; i++) {
        /* calloc keeps the bytes later written to disk deterministic. */
        temp = calloc(1, sizeof *temp);
        if (temp == NULL) {
            goto fail;
        }
        if (scanf("%c %d", &temp->ch, &temp->weight) != 2 || temp->weight < 0) {
            free(temp);
            goto fail;
        }
        discard_line();
        temp->lchild = NULL;
        temp->rchild = NULL;
        table[i] = temp;
    }

    /* Turn the list of leaves into one tree. */
    for (remaining = n; remaining > 1; remaining--) {
        int sm1 = -1;   /* index of the lightest tree */
        int sm2 = -1;   /* index of the second lightest tree */

        for (i = 0; i < n; i++) {
            if (table[i] == NULL) {
                continue;
            }
            if (sm1 < 0 || table[i]->weight < table[sm1]->weight) {
                sm1 = i;
            }
        }
        for (i = 0; i < n; i++) {
            if (table[i] == NULL || i == sm1) {
                continue;
            }
            if (sm2 < 0 || table[i]->weight < table[sm2]->weight) {
                sm2 = i;
            }
        }

        temp = calloc(1, sizeof *temp);
        if (temp == NULL) {
            goto fail;
        }
        /* Saturate instead of overflowing a signed int. */
        if (table[sm1]->weight > MAX_WEIGHT - table[sm2]->weight) {
            temp->weight = MAX_WEIGHT;
        } else {
            temp->weight = table[sm1]->weight + table[sm2]->weight;
        }
        temp->lchild = table[sm1];
        temp->rchild = table[sm2];
        table[sm2] = NULL;
        table[sm1] = temp;
    }

    /* Exactly one entry is left: the root. */
    for (i = 0; i < n; i++) {
        if (table[i] != NULL) {
            root = table[i];
        }
    }
    free(table);
    return 0;

fail:
    printf("invalid input!\n");
    if (table != NULL) {
        for (i = 0; i < n; i++) {
            deltree(table[i]);
        }
        free(table);
    }
    n = 0;
    root = NULL;
    return -1;
}
以下是Encod.c的内容,这个文件主要是根据建立的赫夫曼树进行编码,要读的文件是A.txt,写出的文件是B.dat。当数据量较大时,通过试验10个阿拉伯数字的编码,B.dat的大小约为A.txt一半。
#include "hfmTree.h"

/*
 * Encod.c - encodes the text file A.txt into B.dat using the code table
 * stored in "hfmTree".
 *
 * Layout of B.dat:
 *   offset 0 : int           number of valid bits in the LAST data word (0..32)
 *   offset 4 : unsigned int  data words; code bits are packed starting at the
 *                            most significant bit of each word
 * Words are written in the machine's native byte order; on Intel (little
 * endian) the four bytes of each word therefore look reversed in a hex dump.
 */

enum { WORD_BITS = 32 };    /* bits per data word, also the longest usable code */

int n = 0;              /* number of entries in the code table */
struct Code *code;      /* code table loaded by readcode() */

int Encoding(char c);
int Encodtofile(const char *fnin, const char *fnout);
int showCode();
int readcode(const char *filename);
int deltree(struct Node *tree);

/* Loads the code table, prints it and encodes A.txt into B.dat. */
int main() {
    int status;

    if (readcode("hfmTree") != 0) {
        return 1;
    }
    showCode();
    status = Encodtofile("A.txt", "B.dat");
    free(code);
    return status == 0 ? 0 : 1;
}

/*
 * Looks up a symbol in the code table.
 * Returns its index (0 .. n-1), or n when the symbol has no code.
 */
int Encoding(char c) {
    int i = 0;

    while (i < n && code[i].ch != c) {
        i++;
    }
    return i;
}

/*
 * Encodes the text file `fnin` into the binary file `fnout`.
 *
 * Symbols without a code are reported with "Unknown Code!" and skipped.
 * Returns 0 on success, -1 when a file cannot be opened or an I/O error occurs.
 */
int Encodtofile(const char *fnin, const char *fnout) {
    FILE *fin = fopen(fnin, "r");
    FILE *fout = fopen(fnout, "wb");
    unsigned int buf = 0;   /* word being filled; valid bits are right-aligned */
    int length = 0;         /* number of valid bits currently in buf */
    int ok;
    int c;

    if (fin == NULL || fout == NULL) {
        printf("file open error!\n");
        if (fin != NULL) {
            fclose(fin);
        }
        if (fout != NULL) {
            fclose(fout);
        }
        return -1;
    }

    /* Reserve the header; the real value is only known at the end. */
    ok = fwrite(&length, sizeof(int), 1, fout) == 1;

    /* Test the fgetc() result itself: feof() becomes true only after a failed read. */
    while (ok && (c = fgetc(fin)) != EOF) {
        int i = Encoding((char)c);
        unsigned int bits;
        int len;

        if (i == n) {
            printf("Unknown Code!\n");
            continue;
        }
        bits = code[i].code;
        len = code[i].length;
        if (len == 0) {
            continue;   /* an empty code contributes no bits */
        }

        if (length + len > WORD_BITS) {
            /* The code straddles a word boundary: split it. */
            int spill = length + len - WORD_BITS;   /* bits for the next word, 1..32 */
            int fit = len - spill;                  /* bits still fitting, 0..31 */

            if (fit > 0) {
                buf = (buf << fit) | (bits >> spill);
            }
            ok = fwrite(&buf, sizeof buf, 1, fout) == 1;
            /* Keep only the low `spill` bits; a shift by 32 would be undefined. */
            buf = (spill == WORD_BITS) ? bits : (bits & ((1u << spill) - 1u));
            length = spill;
        } else {
            /* len == 32 implies an empty buffer; avoid shifting by the full width. */
            buf = (len == WORD_BITS) ? bits : ((buf << len) | bits);
            length += len;
        }
    }

    /* Left-align the bits of the last, possibly partial, word. */
    if (length > 0 && length < WORD_BITS) {
        buf <<= (WORD_BITS - length);
    }
    ok = ok
        && fwrite(&buf, sizeof buf, 1, fout) == 1
        && fseek(fout, 0, SEEK_SET) == 0
        && fwrite(&length, sizeof(int), 1, fout) == 1;

    if (ferror(fin)) {
        ok = 0;
    }
    fclose(fin);
    if (fclose(fout) != 0) {
        ok = 0;
    }
    if (!ok) {
        printf("file write error!\n");
        return -1;
    }
    return 0;
}

/* Prints every symbol followed by its code as a string of '0'/'1'. Returns 0. */
int showCode() {
    int i;
    int j;

    for (i = 0; i < n; i++) {
        printf("%c:", code[i].ch);
        for (j = code[i].length - 1; j >= 0; j--) {
            /* 1u keeps the shift well defined for j == 31. */
            putchar((code[i].code & (1u << j)) ? '1' : '0');
        }
        printf("\n");
    }
    return 0;
}

/*
 * Loads the code table from `filename` into the globals `n` and `code`.
 * The file starts with the entry count and the byte offset of the table.
 * Returns 0 on success; on failure prints a message, leaves the globals
 * untouched and returns -1.
 */
int readcode(const char *filename) {
    FILE *fin = fopen(filename, "rb");
    struct Code *table = NULL;
    int count = 0;
    int offset = 0;
    int i;

    if (fin == NULL) {
        printf("file open error!\n");
        return -1;
    }

    if (fread(&count, sizeof(int), 1, fin) != 1
        || fread(&offset, sizeof(int), 1, fin) != 1
        || count <= 0 || offset < 0) {
        goto bad;
    }

    table = calloc((size_t)count, sizeof *table);
    if (table == NULL) {
        goto bad;
    }
    if (fseek(fin, offset, SEEK_SET) != 0
        || fread(table, sizeof(struct Code), (size_t)count, fin) != (size_t)count) {
        goto bad;
    }

    /* Reject lengths the bit packer cannot handle. */
    for (i = 0; i < count; i++) {
        if (table[i].length < 0 || table[i].length > WORD_BITS) {
            goto bad;
        }
    }

    fclose(fin);
    n = count;
    code = table;
    return 0;

bad:
    printf("file format error!\n");
    free(table);
    fclose(fin);
    return -1;
}

以下是Decod.c的内容,用于译码。将上一步生成的文件B.dat译码成C.txt,通过比较C.txt和A.txt的内容,前面基本一致,在结尾处没能处理好,出现了差异。
#include "hfmTree.h"

/*
 * Decod.c - decodes B.dat into C.txt using the Huffman tree stored in "hfmTree".
 *
 * B.dat starts with an int holding the number of valid bits in the last data
 * word, followed by data words whose bits are consumed from the most
 * significant bit downwards.
 */

enum {
    FIRST_NODE_POS = 8,     /* file offset of the root node inside "hfmTree" */
    WORD_BITS = 32          /* bits per data word */
};

int n = 0;              /* number of symbols, read from "hfmTree" */
struct Node *root;      /* root of the tree, NULL when loading failed */

int Dcoding(int temp, struct Node *root, int dep);
int Dcodingfromfile(const char *fnin, const char *fnout);
int Dcodingfromfile2(const char *fnin, const char *fnout);
int readtree(const char *filename);
int readnode(FILE *fin, struct Node *root, int seek);
int deltree(struct Node *tree);

/* Loads the tree and decodes B.dat into C.txt. Returns 0 on success, 1 otherwise. */
int main() {
    int status;

    if (readtree("hfmTree") != 0) {
        return 1;
    }
    status = Dcodingfromfile2("B.dat", "C.txt");
    deltree(root);
    root = NULL;
    return status == 0 ? 0 : 1;
}

/*
 * Decodes the bit stream in `fnin` and writes the symbols to `fnout`.
 *
 * The function keeps a 64-bit window whose top `avail` bits are the next
 * undecoded bits. One word is always read ahead, so the last word is known
 * when it is loaded and only its valid bits (given by the header) are used.
 * Returns 0 on success, -1 when a file cannot be opened, the data is
 * malformed or an I/O error occurs.
 */
int Dcodingfromfile2(const char *fnin, const char *fnout) {
    FILE *fin;
    FILE *fout;
    unsigned long long window = 0;  /* undecoded bits, left-aligned */
    unsigned int ahead = 0;         /* word read ahead but not yet in the window */
    int have_ahead;
    int avail = 0;                  /* number of valid bits in window */
    int lastbits = 0;               /* valid bits in the last word (file header) */
    int status = -1;

    fin = fopen(fnin, "rb");
    if (fin == NULL) {
        printf("file open error!\n");
        return -1;
    }
    fout = fopen(fnout, "wb");
    if (fout == NULL) {
        printf("file open error!\n");
        fclose(fin);
        return -1;
    }

    if (fread(&lastbits, sizeof(int), 1, fin) != 1
        || lastbits < 0 || lastbits > WORD_BITS) {
        printf("file format error!\n");
        goto done;
    }
    have_ahead = fread(&ahead, sizeof ahead, 1, fin) == 1;

    for (;;) {
        int ret;
        int depth;

        /* Refill so that a complete code (at most 32 bits) is in the window. */
        while (avail <= WORD_BITS && have_ahead) {
            unsigned int word = ahead;
            int wbits;

            have_ahead = fread(&ahead, sizeof ahead, 1, fin) == 1;
            wbits = have_ahead ? WORD_BITS : lastbits;
            if (wbits == 0) {
                word = 0;
            } else if (wbits < WORD_BITS) {
                word &= 0xffffffffu << (WORD_BITS - wbits);   /* drop padding bits */
            }
            window |= (unsigned long long)word << (WORD_BITS - avail);
            avail += wbits;
        }
        if (avail == 0) {
            status = 0;     /* every bit has been decoded */
            break;
        }

        ret = Dcoding((int)(unsigned int)(window >> WORD_BITS), root, 0);
        if (ret < 0) {
            printf("file format error!\n");
            break;
        }
        depth = (ret >> 16) & 0xffff;
        if (depth == 0) {
            depth = 1;      /* the tree is a single leaf: one bit per symbol */
        }
        if (depth > avail) {
            status = 0;     /* leftover bits do not form a complete code */
            break;
        }

        if (fputc(ret & 0xff, fout) == EOF) {
            printf("file write error!\n");
            break;
        }
        window <<= depth;
        avail -= depth;
    }

done:
    fclose(fin);
    if (fclose(fout) != 0) {
        status = -1;
    }
    return status;
}

/*
 * Older entry point kept for compatibility. Its own implementation never
 * decoded the tail of the stream, so it now forwards to Dcodingfromfile2().
 */
int Dcodingfromfile(const char *fnin, const char *fnout) {
    return Dcodingfromfile2(fnin, fnout);
}

/*
 * Decodes one symbol from the 32 bits in `temp`, most significant bit first.
 *
 * temp - bits to decode
 * root - node to start from
 * dep  - depth of `root`; pass 0 for the tree root
 *
 * Returns the symbol in the low 8 bits and the depth of the reached leaf in
 * the high 16 bits, or -1 when the tree is malformed or no leaf is reached
 * within 32 bits.
 */
int Dcoding(int temp, struct Node *root, int dep) {
    /* Work on an unsigned copy: left-shifting a negative int is undefined. */
    unsigned int bits = (unsigned int)temp;
    int used = 0;

    if (root == NULL) {
        return -1;
    }
    while (root->lchild != NULL || root->rchild != NULL) {
        if (used == WORD_BITS) {
            return -1;
        }
        root = (bits & 0x80000000u) ? root->rchild : root->lchild;
        if (root == NULL) {
            return -1;
        }
        bits <<= 1;
        used++;
        dep++;
    }
    /* unsigned char keeps a symbol >= 0x80 from corrupting the depth field. */
    return (int)(unsigned char)root->ch | (dep << 16);
}

/*
 * Loads the Huffman tree from `filename` into the global `root`; the symbol
 * count goes into `n`. Returns 0 on success. On failure prints a message,
 * frees whatever was loaded, sets root to NULL and returns -1.
 */
int readtree(const char *filename) {
    FILE *fin = fopen(filename, "rb");
    int status = -1;

    root = NULL;
    if (fin == NULL) {
        printf("file open error!\n");
        return -1;
    }

    root = malloc(sizeof(struct Node));
    if (root != NULL
        && fread(&n, sizeof(int), 1, fin) == 1
        && readnode(fin, root, FIRST_NODE_POS) == 0) {
        status = 0;
    } else {
        printf("file format error!\n");
        if (root != NULL) {
            /* Make the node safe for deltree() even if it was never read. */
            root->lchild = NULL;
            root->rchild = NULL;
            readnode(fin, root, -1);
        }
        deltree(root);
        root = NULL;
    }

    fclose(fin);
    return status;
}

/*
 * Reads the node stored at byte offset `seek` into `root` and then,
 * recursively, its children at the offsets found in lf/rf (0 = no child).
 *
 * The child pointers stored in the file are stale and are always replaced.
 * A child offset must lie behind its parent, which matches the pre-order
 * layout and stops a corrupt file from causing endless recursion.
 * Returns 0 on success, -1 on failure; even then the subtree under `root`
 * is safe to pass to deltree().
 */
int readnode(FILE *fin, struct Node *root, int seek) {
    int status = 0;

    if (seek < 0
        || fseek(fin, seek, SEEK_SET) != 0
        || fread(root, sizeof(struct Node), 1, fin) != 1) {
        root->lf = 0;
        root->rf = 0;
        root->lchild = NULL;
        root->rchild = NULL;
        return -1;
    }
    root->lchild = NULL;
    root->rchild = NULL;

    if (root->lf != 0) {
        if (root->lf <= seek) {
            return -1;
        }
        root->lchild = malloc(sizeof(struct Node));
        if (root->lchild == NULL) {
            return -1;
        }
        if (readnode(fin, root->lchild, root->lf) != 0) {
            status = -1;
        }
    }

    if (root->rf != 0) {
        if (root->rf <= seek) {
            return -1;
        }
        root->rchild = malloc(sizeof(struct Node));
        if (root->rchild == NULL) {
            return -1;
        }
        if (readnode(fin, root->rchild, root->rf) != 0) {
            status = -1;
        }
    }
    return status;
}

/* Frees a whole subtree; a NULL tree is accepted. Returns 0. */
int deltree(struct Node *tree) {
    if (tree != NULL) {
        deltree(tree->lchild);
        deltree(tree->rchild);
        free(tree);
    }
    return 0;
}

以上就是所有程序,执行的先后顺序是Init.c->Encod.c->Decod.c,其中要求预先写好A.txt,文件出现的字符应该在Init.c时输入。对于未知的编码,程序直接忽略,由于有一个文件结束符,文件末尾总会有一个未知的编码,好在这并不影响测试结果。
B.dat是二进制文件,以四个字节为一个单位,由于Intel是小端模式,用二进制查看器查看时需要注意字节顺序。

0 0