计算文章中每个词的权重值-信息熵及代码实现

来源:互联网 发布:centos 拼音不出中文 编辑:程序博客网 时间:2024/06/01 15:15

计算出每个词的信息熵可以用来作为词的权重,信息熵公式是(以词 W 的左邻词为例):

$$H_{left}(W) = -\sum_{i} p_i \log p_i$$

其中 W 代表该词,$p_i$ 代表 W 左侧(或右侧)第 i 个不同邻词出现的概率,即该邻词的出现次数占 W 总出现次数的比例。

比如现在某篇文章中出现了两次 A W C, 一次B W D

那么W的左侧信息熵为:

$$H_{left}(W) = -\frac{2}{3}\log\frac{2}{3} - \frac{1}{3}\log\frac{1}{3}$$
2/3 表示左邻词 A 在 W 出现的 3 次中出现了 2 次,概率为 2/3;B 只出现了 1 次,概率为 1/3。

W右侧的信息熵也是一样的。如果是A W C, B W C

那么W右侧的信息熵就是0,因为此时右邻词只有 C 一种,概率为 1,而 $-1\cdot\log(1) = 0$。

对所有的词计算左右信息熵,如果某个词的左右信息熵都很大,那这个词就很可能是关键词。

以下是我用java实现的计算信息熵的代码。

输入是分好的词,输出是词和对应的信息熵值。

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

/**
 * Computes, for every distinct word of a pre-segmented sentence, the sum of
 * its left-neighbor and right-neighbor Shannon entropies (natural log).
 * Words whose combined entropy is high tend to be good keyword candidates.
 *
 * <p>Input: an array of already-segmented words. Output: one line per
 * distinct word, formatted {@code word:entropy}, printed to stdout in
 * first-occurrence order.
 */
public class GetWordEntropy {

    public static void main(String[] args) {
        // BUGFIX: CalculateWordEntropy expects pre-segmented (tab-separated)
        // words. The original sample string contained no tabs, so split()
        // returned a single element and the triple-building loop crashed with
        // ArrayIndexOutOfBoundsException.
        String sentence = "今天\t下雨\t北京\t鲁迅\t北京\t上海\t下雨";
        CalculateWordEntropy(sentence.split("\t"));
    }

    /**
     * Prints {@code word:entropy} for each distinct word, where the entropy
     * is the sum of the left- and right-neighbor entropies.
     *
     * @param words the segmented sentence; may be empty (prints nothing)
     */
    public static void CalculateWordEntropy(String[] words) {
        int length = words.length;
        // Build (left, word, right) triples. Sentence boundaries are marked
        // with the placeholder string "null" and counted as a neighbor type,
        // matching the original behavior. BUGFIX: the original read
        // words[i + 1] unconditionally when i == 0, crashing on single-word
        // input; the boundary checks below handle length == 1 correctly.
        List<String[]> triples = new ArrayList<String[]>();
        for (int i = 0; i < length; i++) {
            String left = (i == 0) ? "null" : words[i - 1];
            String right = (i == length - 1) ? "null" : words[i + 1];
            triples.add(new String[] { left, words[i], right });
        }
        // Distinct words, first-occurrence order preserved.
        List<String> distinct = new ArrayList<String>();
        for (String w : words) {
            if (!distinct.contains(w)) {
                distinct.add(w);
            }
        }
        for (String word : distinct) {
            // Collect the left and right neighbors of every occurrence.
            List<String> lefts = new ArrayList<String>();
            List<String> rights = new ArrayList<String>();
            for (String[] t : triples) {
                if (t[1].equals(word)) {
                    lefts.add(t[0]);
                    rights.add(t[2]);
                }
            }
            // BUGFIX: the original initialized the occurrence counter to 1
            // instead of 0, inflating every probability's denominator (the
            // article's own example got 2/4, 1/4 instead of 2/3, 1/3). Here
            // the denominator is simply the number of collected neighbors.
            System.out.println(word + ":" + (entropy(lefts) + entropy(rights)));
        }
    }

    /**
     * Shannon entropy (natural log) of the empirical distribution of the
     * given strings: {@code -sum(p_i * ln(p_i))} over distinct values.
     *
     * @param neighbors observed neighbor words; may be empty (returns 0)
     * @return the entropy, always {@code >= 0}
     */
    private static double entropy(List<String> neighbors) {
        int total = neighbors.size();
        if (total == 0) {
            return 0.0;
        }
        // Frequency of each distinct neighbor (HashMap replaces the original
        // O(n^2) contains()-based dedup + recount).
        Map<String, Integer> freq = new HashMap<String, Integer>();
        for (String s : neighbors) {
            Integer c = freq.get(s);
            freq.put(s, c == null ? 1 : c + 1);
        }
        double h = 0.0;
        for (int count : freq.values()) {
            double p = (double) count / total;
            h -= p * Math.log(p);
        }
        return h;
    }
}


0 0
原创粉丝点击