Word2Vec里实现Huffman树
来源:互联网 发布:杨树基因组数据库 编辑:程序博客网 时间:2024/06/04 19:25
word2vec里是拿数组来实现Huffman树的,效率很高;在学校里经常见到的是用递归、迭代的方式实现Huffman树,这对于处理大量叶子节点的问题不是一个最佳方法。
数组法:
/*
 * Array-based Huffman tree construction in the style of word2vec.
 *
 * Input (read from "input.txt"): the vocabulary size followed by one
 * occurrence count per word.  The counts are sorted in descending order,
 * a Huffman tree is built over them using three flat arrays, and the
 * resulting binary code of every word is printed.
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Maximum supported Huffman code length (a macro definition takes no ';'). */
#define MAX_CODE_LENGTH 40

struct vocab_word {
    long long cn;   /* occurrence count of the word */
    int *point;     /* node indices on the path root -> leaf (codelen + 1 entries) */
    char *word;     /* unused by this program */
    char *code;     /* Huffman code bits, root-side bit first */
    char codelen;   /* number of valid entries in code */
};

/* vocab_size is the number of words; the rest are shared scratch counters. */
long long vocab_size, a, b, k, min1, min2, i;
struct vocab_word *vocab;

/*
 * qsort comparator over long long values that yields descending order.
 *
 * The result is derived from comparisons instead of a subtraction so that
 * large differences cannot be truncated (or overflow) when narrowed to int.
 */
int VocabCompare(const void *a, const void *b)
{
    long long lhs = *(const long long *)a;
    long long rhs = *(const long long *)b;

    return (rhs > lhs) - (rhs < lhs);
}

/*
 * Reads the counts, builds the Huffman tree and prints every word's code.
 *
 * Returns 0 on success and 1 when the input file is missing or malformed,
 * an allocation fails, or a code would be longer than MAX_CODE_LENGTH.
 */
int main()
{
    int status = 1;
    long long *count = NULL;        /* weight of every tree node */
    long long *binary = NULL;       /* bit assigned to the edge above each node */
    long long *parent_node = NULL;  /* index of each node's parent */
    long long point[MAX_CODE_LENGTH];
    char code[MAX_CODE_LENGTH];
    long long pos1, pos2, node_count;

    if (freopen("input.txt", "r", stdin) == NULL) {
        perror("input.txt");
        return 1;
    }
    if (scanf("%lld", &vocab_size) != 1 || vocab_size < 1) {
        fprintf(stderr, "input.txt: expected a positive vocabulary size\n");
        return 1;
    }
    /* Guarantees that 2 * vocab_size - 1 nodes fit in both long long and size_t. */
    if ((unsigned long long)vocab_size > SIZE_MAX / (2 * sizeof(long long))) {
        fprintf(stderr, "input.txt: vocabulary size %lld is too large\n", vocab_size);
        return 1;
    }
    node_count = vocab_size * 2 - 1;

    vocab = calloc((size_t)vocab_size, sizeof *vocab);
    if (vocab == NULL) {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }
    for (i = 0; i < vocab_size; ++i) {
        vocab[i].code = calloc(MAX_CODE_LENGTH, sizeof *vocab[i].code);
        /* One extra slot: the path holds the root plus one node per code bit. */
        vocab[i].point = calloc(MAX_CODE_LENGTH + 1, sizeof *vocab[i].point);
        if (vocab[i].code == NULL || vocab[i].point == NULL) {
            fprintf(stderr, "out of memory\n");
            goto cleanup;
        }
    }

    count = calloc((size_t)node_count, sizeof *count);
    binary = calloc((size_t)node_count, sizeof *binary);
    parent_node = calloc((size_t)node_count, sizeof *parent_node);
    if (count == NULL || binary == NULL || parent_node == NULL) {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }

    for (i = 0; i < vocab_size; ++i) {
        if (scanf("%lld", &count[i]) != 1) {
            fprintf(stderr, "input.txt: expected %lld counts, got %lld\n", vocab_size, i);
            goto cleanup;
        }
    }
    /* Internal nodes start with a huge sentinel weight until they are created. */
    for (i = vocab_size; i < node_count; ++i) {
        count[i] = (long long)1e15;
    }

    /* Sort the leaf counts in descending order. */
    qsort(count, (size_t)vocab_size, sizeof *count, VocabCompare);
    for (i = 0; i < vocab_size; ++i) {
        vocab[i].cn = count[i];
    }

    /*
     * pos1 walks the sorted leaves from the smallest one downwards, pos2 walks
     * the internal nodes in creation order.  Each of the vocab_size - 1 rounds
     * picks the two smallest remaining nodes and merges them into a new node.
     */
    pos1 = vocab_size - 1;
    pos2 = vocab_size;
    for (a = 0; a < vocab_size - 1; ++a) {
        if (pos1 >= 0 && count[pos1] < count[pos2]) {
            min1 = pos1--;
        } else {
            min1 = pos2++;
        }
        if (pos1 >= 0 && count[pos1] < count[pos2]) {
            min2 = pos1--;
        } else {
            min2 = pos2++;
        }
        count[vocab_size + a] = count[min1] + count[min2];
        parent_node[min1] = vocab_size + a;
        parent_node[min2] = vocab_size + a;
        binary[min2] = 1; /* smallest child keeps bit 0, second smallest gets 1 */
    }

    /* Walk from every leaf up to the root (index 2 * vocab_size - 2). */
    for (a = 0; a < vocab_size; ++a) {
        b = a;
        k = 0;
        while (1) {
            if (k >= MAX_CODE_LENGTH) {
                fprintf(stderr, "Huffman code of word %lld exceeds %d bits\n",
                        a, MAX_CODE_LENGTH);
                goto cleanup;
            }
            code[k] = (char)binary[b];
            point[k] = b;
            k++;
            b = parent_node[b];
            if (b == vocab_size * 2 - 2) {
                break;
            }
        }
        vocab[a].codelen = (char)k; /* length of the Huffman code */
        vocab[a].point[0] = (int)(vocab_size * 2 - 2);
        /* The walk collected bits leaf-first; store them root-first. */
        for (b = 0; b < k; ++b) {
            vocab[a].code[k - b - 1] = code[b];
            vocab[a].point[k - b] = (int)point[b];
        }
    }

    /* Output. */
    for (a = 0; a < vocab_size; ++a) {
        printf("vocab[%lld].cn=%lld\n", a, vocab[a].cn);
        printf("code: ");
        for (i = 0; i < vocab[a].codelen; ++i) {
            printf("%d ", vocab[a].code[i]);
        }
        printf("\n");
    }
    status = 0;

cleanup:
    free(count);
    free(binary);
    free(parent_node);
    if (vocab != NULL) {
        for (i = 0; i < vocab_size; ++i) {
            free(vocab[i].code);
            free(vocab[i].point);
        }
        free(vocab);
        vocab = NULL;
    }
    return status;
}
迭代法:
/*
 * Pointer-based Huffman tree: builds the tree from n integer weights read
 * from "input.txt", prints it in generalized-list form, prints its weighted
 * path length and prints the Huffman code of every leaf.
 */
#include <stdio.h>
#include <stdlib.h>

typedef int ElemType;

struct BTreeNode {
    ElemType data;
    struct BTreeNode *left;
    struct BTreeNode *right;
};

/* Releases every node of the tree rooted at node (NULL is accepted). */
static void FreeBTree(struct BTreeNode *node)
{
    if (node == NULL) {
        return;
    }
    FreeBTree(node->left);
    FreeBTree(node->right);
    free(node);
}

/*
 * 1. Prints a binary tree of int elements in generalized-list format,
 *    e.g. 10(4,6(2,4)).  This is a modified pre-order traversal.
 */
void PrintBTree_int(struct BTreeNode *BT)
{
    if (BT == NULL) {
        return;
    }
    printf("%d", BT->data); /* value of the root */
    if (BT->left != NULL || BT->right != NULL) {
        printf("(");
        PrintBTree_int(BT->left); /* left subtree */
        if (BT->right != NULL) {
            printf(",");
        }
        PrintBTree_int(BT->right); /* right subtree */
        printf(")");
    }
}

/*
 * 2. Builds a Huffman tree from the n weights in a and returns its root.
 *
 * Returns NULL when n <= 0 or when an allocation fails; for n == 1 the
 * single leaf is returned.  The caller owns the returned tree.
 */
struct BTreeNode *CreateHuffman(ElemType a[], int n)
{
    struct BTreeNode **b;
    struct BTreeNode *q;

    if (n <= 0) {
        return NULL;
    }

    /* b is the forest: one pointer per tree, NULL once a tree was merged away. */
    b = malloc((size_t)n * sizeof *b);
    if (b == NULL) {
        return NULL;
    }
    for (int i = 0; i < n; ++i) {
        b[i] = malloc(sizeof *b[i]);
        if (b[i] == NULL) {
            while (i > 0) {
                free(b[--i]);
            }
            free(b);
            return NULL;
        }
        b[i]->data = a[i];
        b[i]->left = b[i]->right = NULL;
    }

    q = b[0]; /* result when there is only one weight */
    for (int i = 1; i < n; ++i) { /* n - 1 merges build the tree */
        /* k1: index of the lightest root in the forest, k2: second lightest. */
        int k1 = -1, k2 = -1;

        /* Start with k1 on the first remaining tree and k2 on the second. */
        for (int j = 0; j < n; ++j) {
            if (b[j] != NULL && k1 == -1) {
                k1 = j;
                continue;
            }
            if (b[j] != NULL) {
                k2 = j;
                break;
            }
        }
        for (int j = k2; j < n; ++j) {
            if (b[j] != NULL) {
                if (b[j]->data < b[k1]->data) {
                    k2 = k1;
                    k1 = j;
                } else if (b[j]->data < b[k2]->data) {
                    k2 = j;
                }
            }
        }

        /* Merge the two lightest trees under a new root q. */
        q = malloc(sizeof *q);
        if (q == NULL) {
            for (int j = 0; j < n; ++j) {
                FreeBTree(b[j]);
            }
            free(b);
            return NULL;
        }
        q->data = b[k1]->data + b[k2]->data;
        q->left = b[k1];
        q->right = b[k2];
        b[k1] = q;    /* the merged tree takes slot k1 */
        b[k2] = NULL; /* slot k2 is now empty */
    }
    free(b); /* only the pointer array is released, not the tree */
    return q;
}

/*
 * 3. Returns the weighted path length of the tree rooted at FBT.
 *    len is the depth of FBT and must be 0 for the initial call.
 */
ElemType WeightPathLength(struct BTreeNode *FBT, int len)
{
    if (FBT == NULL) { /* empty tree */
        return 0;
    }
    if (FBT->left == NULL && FBT->right == NULL) { /* leaf */
        return FBT->data * len;
    }
    /* Internal node: sum over both subtrees, one level deeper. */
    return WeightPathLength(FBT->left, len + 1) + WeightPathLength(FBT->right, len + 1);
}

/* Number of edges on the longest downward path from node (0 for NULL or a leaf). */
static int TreeHeight(const struct BTreeNode *node)
{
    int lh, rh;

    if (node == NULL || (node->left == NULL && node->right == NULL)) {
        return 0;
    }
    lh = TreeHeight(node->left);
    rh = TreeHeight(node->right);
    return 1 + (lh > rh ? lh : rh);
}

/* Prints the code of every leaf below node; path[0..len-1] holds the bits so far. */
static void EmitCodes(const struct BTreeNode *node, int len, int *path)
{
    if (node == NULL) {
        return;
    }
    if (node->left == NULL && node->right == NULL) {
        printf("节点权值为%d的编码", node->data);
        for (int i = 0; i < len; ++i) {
            printf("%d", path[i]);
        }
        printf("\n");
        return;
    }
    /* Record the branch bit (0 = left, 1 = right) before descending. */
    path[len] = 0;
    EmitCodes(node->left, len + 1, path);
    path[len] = 1;
    EmitCodes(node->right, len + 1, path);
}

/*
 * 4. Prints the Huffman code of every leaf of the tree rooted at FBT.
 *    len is the depth of FBT and must be 0 for the initial call.
 *
 * The bit buffer is sized from the actual tree height, so trees of any
 * depth are handled.
 */
void HuffmanCoding(struct BTreeNode *FBT, int len)
{
    size_t slots;
    int *path;

    if (FBT == NULL || len < 0) {
        return;
    }
    slots = (size_t)len + (size_t)TreeHeight(FBT);
    path = calloc(slots > 0 ? slots : 1, sizeof *path);
    if (path == NULL) {
        fprintf(stderr, "out of memory\n");
        return;
    }
    EmitCodes(FBT, len, path);
    free(path);
}

/*
 * Entry point: reads n (> 1) and n weights from "input.txt", then prints
 * the tree, its weighted path length and the code of every leaf.
 * Returns 0 on success and 1 on missing/malformed input or allocation failure.
 */
int main()
{
    int n;
    ElemType *a;
    struct BTreeNode *fbt;

    if (freopen("input.txt", "r", stdin) == NULL) {
        perror("input.txt");
        return 1;
    }

    /* Keep reading until a leaf count greater than 1 is found. */
    while (1) {
        if (scanf("%d", &n) != 1) {
            /* Stop on EOF or garbage instead of looping forever. */
            fprintf(stderr, "input.txt: expected a leaf count greater than 1\n");
            return 1;
        }
        printf("n:%d\n", n);
        if (n > 1) {
            break;
        } else {
            printf("重输n值:");
        }
    }

    a = malloc((size_t)n * sizeof *a);
    if (a == NULL) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    for (int i = 0; i < n; ++i) {
        if (scanf("%d", &a[i]) != 1) {
            fprintf(stderr, "input.txt: expected %d weights, got %d\n", n, i);
            free(a);
            return 1;
        }
        printf("a[%d]=%d\n", i, a[i]);
    }

    fbt = CreateHuffman(a, n);
    if (fbt == NULL) {
        fprintf(stderr, "out of memory\n");
        free(a);
        return 1;
    }

    printf("广义表形式的哈夫曼树:");
    PrintBTree_int(fbt);
    printf("\n");
    printf("哈夫曼树的带权路径长度:");
    printf("%d\n", WeightPathLength(fbt, 0));
    printf("树中每个叶子结点的哈夫曼编码:\n");
    HuffmanCoding(fbt, 0);

    FreeBTree(fbt);
    free(a);
    return 0;
}
0 0
- Word2Vec里实现Huffman树
- 自己动手写word2vec (三):构建Huffman树
- huffman树在word2vec中的应用原理
- Word2vec基础介绍(三):构建Huffman树
- Huffman树的实现
- 实现huffman树
- Huffman树实现
- java实现Huffman树
- 实现Huffman树
- Huffman树的实现
- Huffman树的实现
- Huffman树与MinHeap实现
- 优先队列实现Huffman树
- Huffman树及其编码实现
- Huffman树的简单实现
- Huffman树及JAVA实现
- 【C++】Huffman树的实现
- Huffman树与Huffman编码(C语言实现)
- 欢迎使用CSDN-markdown编辑器
- php Recess framework入门
- 当你在浏览器中输入Google.com并且按下回车之后发生了什么?
- 一个老鸟眼中“IT民工”的发展方向
- 程序员专用经典语录—看完笑一阵可以,千万不要死循环哦!
- Word2Vec里实现Huffman树
- K-SVD简述——字典学习,稀疏编码,MOD与之对比(附代码)
- HTML学习笔记--制作表格
- Android获取本地通讯录发生变化的联系人
- OC语言基础知识
- Error_9_fatal error LNK1112: module machine type 'X86' conflicts with target machine type 'x64'
- 1032. Sharing (25)
- 说说
- html头文件设置常用之<meta>设置缓存