词语语义度计算

来源：互联网发布：全国青少年犯罪数据编辑：程序博客网时间：2024/04/27 13:46

Recently I have been studying techniques for measuring the similarity between two words, because word-similarity computation is used in a wide range of areas,

for instance information retrieval, information extraction, text classification and so on. This post therefore covers some basic and simple topics about

word similarity, and I attach the relevant code, written in Java, to this blog post. You are of course welcome to re-implement the algorithm

in a different programming language.

package com.iberdecor.words.model;

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * One sememe ("primitive") of the sememe hierarchy read from {@code dict/WHOLE.DAT}.
 *
 * <p>The loader expects every data line to consist of three whitespace-separated
 * fields: {@code <id> <english>|<chinese> <parentId>}. A primitive whose parent id
 * equals its own id is treated as a root (see {@link #isTop()}).
 *
 * @author Trancy
 * @version 1.0
 */
public class Primitive {
    /** Every loaded primitive, keyed by its numeric id. */
    public static Map<Integer, Primitive> ALLPRIMITIVES = new HashMap<Integer, Primitive>();

    /** Maps both the English and the Chinese name of a primitive to its id. */
    public static Map<String, Integer> PRIMITIVESID = new HashMap<String, Integer>();

    /** Name of this primitive (the loader stores the Chinese name here). */
    private String primitive;

    /** Id number of this primitive. */
    private int id;

    /** Id number of the parent primitive; equal to {@link #id} for a root. */
    private int parentId;

    /**
     * Creates a new Primitive object.
     *
     * @param id        id of the primitive
     * @param primitive name of the primitive
     * @param parentId  id of the parent primitive (same as {@code id} for a root)
     */
    public Primitive(int id, String primitive, int parentId) {
        this.id = id;
        this.parentId = parentId;
        this.primitive = primitive;
    }

    /*
     * Loads the primitive file when the class is initialised. A failure is reported
     * on standard output / standard error and leaves the maps with whatever was
     * loaded before the offending line.
     */
    static {
        String line = null;
        BufferedReader reader = null;

        try {
            reader = new BufferedReader(new FileReader("dict/WHOLE.DAT"));

            while ((line = reader.readLine()) != null) {
                line = line.trim().replaceAll("\\s+", " ");
                if (line.length() == 0) {
                    // A blank line carries no record; skipping it keeps one stray
                    // empty line from aborting the whole load.
                    continue;
                }

                String[] strs = line.split(" ");
                int id = Integer.parseInt(strs[0]);

                // Second field has the form english|chinese.
                String[] words = strs[1].split("\\|");
                String english = words[0];
                String chinese = words[1];

                int parentId = Integer.parseInt(strs[2]);
                ALLPRIMITIVES.put(id, new Primitive(id, chinese, parentId));
                PRIMITIVESID.put(chinese, id);
                PRIMITIVESID.put(english, id);
            }
        } catch (Exception e) {
            // Print the line being processed so a malformed record can be located.
            System.out.println(line);
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (java.io.IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }

    /**
     * @return the name of this primitive
     */
    public String getPrimitive() {
        return primitive;
    }

    /**
     * @return the id of this primitive
     */
    public int getId() {
        return id;
    }

    /**
     * @return the id of the parent primitive
     */
    public int getParentId() {
        return parentId;
    }

    /**
     * @return {@code true} when this primitive is its own parent, i.e. a root
     */
    public boolean isTop() {
        return id == parentId;
    }

    /**
     * Returns the ids on the path from a primitive up to the top of its hierarchy.
     * The first element is the id of the primitive itself, followed by the id of
     * each successive parent.
     *
     * <p>The walk stops early when a parent id is not present in
     * {@link #ALLPRIMITIVES} (the dangling id is still included in the result) or
     * when a parent id has already been visited, so inconsistent data can neither
     * raise a {@code NullPointerException} nor loop forever.
     *
     * @param primitive English or Chinese name of the primitive
     * @return the id path, or an empty list when the name is unknown
     */
    public static List<Integer> getParents(String primitive) {
        List<Integer> list = new ArrayList<Integer>();

        Integer id = PRIMITIVESID.get(primitive);
        if (id == null) {
            return list;
        }

        list.add(id);
        Primitive current = ALLPRIMITIVES.get(id);
        while (current != null && !current.isTop()) {
            Integer parentId = current.getParentId();
            if (list.contains(parentId)) {
                // Cyclic parent chain in the data: stop instead of spinning.
                break;
            }
            list.add(parentId);
            current = ALLPRIMITIVES.get(parentId);
        }

        return list;
    }

    /**
     * Tells whether the dictionary of primitives contains the given name.
     *
     * @param primitive English or Chinese name to look up
     * @return {@code true} if the name is a known primitive
     */
    public static boolean isPrimitive(String primitive) {
        return PRIMITIVESID.containsKey(primitive);
    }

    /**
     * Empty entry point; running it only triggers the static dictionary load.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
    }
}

package com.iberdecor.words.model;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * A single glossary entry: the surface word, its type tag and the primitives
 * that describe its meaning.
 *
 * @author Trancy
 * @version 1.0
 */
public class Word {
    private String word;
    private String type;

    /** The primary concept. */
    private String firstPrimitive;

    /** Other concepts. */
    private List<String> otherPrimitives = new ArrayList<String>();

    /**
     * Non-empty exactly when the word is a structural (function) word. Holds the
     * primitives of that function word; some function words have no Chinese gloss.
     */
    private List<String> structruralWords = new ArrayList<String>();

    /**
     * Relational primitives of the word. Key: the relation; value: the list of
     * basic primitives or (specific words) attached to it.
     */
    private Map<String, List<String>> relationalPrimitives = new HashMap<String, List<String>>();

    /**
     * Relation-symbol primitives of the word. Key: the relation symbol; value: the
     * basic primitives or (specific words) that belong to that symbol.
     */
    private Map<String, List<String>> relationSimbolPrimitives = new HashMap<String, List<String>>();

    /**
     * Appends {@code value} to the list stored under {@code key}, creating and
     * registering that list first when the key has none yet.
     */
    private static void append(Map<String, List<String>> target, String key, String value) {
        List<String> bucket = target.get(key);
        if (bucket == null) {
            bucket = new ArrayList<String>();
            target.put(key, bucket);
        }
        bucket.add(value);
    }

    /**
     * @return the surface form of the word
     */
    public String getWord() {
        return this.word;
    }

    /**
     * @return {@code true} when at least one structural word has been recorded
     */
    public boolean isStructruralWord() {
        return this.structruralWords.size() > 0;
    }

    /**
     * @param word the surface form of the word
     */
    public void setWord(String word) {
        this.word = word;
    }

    /**
     * @return the type tag of the word
     */
    public String getType() {
        return this.type;
    }

    /**
     * @param type the type tag of the word
     */
    public void setType(String type) {
        this.type = type;
    }

    /**
     * @return the primary concept
     */
    public String getFirstPrimitive() {
        return this.firstPrimitive;
    }

    /**
     * @param firstPrimitive the primary concept
     */
    public void setFirstPrimitive(String firstPrimitive) {
        this.firstPrimitive = firstPrimitive;
    }

    /**
     * @return the live list of other concepts (not a copy)
     */
    public List<String> getOtherPrimitives() {
        return this.otherPrimitives;
    }

    /**
     * @param otherPrimitives list that replaces the current other concepts
     */
    public void setOtherPrimitives(List<String> otherPrimitives) {
        this.otherPrimitives = otherPrimitives;
    }

    /**
     * @param otherPrimitive concept appended to the other concepts
     */
    public void addOtherPrimitive(String otherPrimitive) {
        otherPrimitives.add(otherPrimitive);
    }

    /**
     * @return the live list of structural words (not a copy)
     */
    public List<String> getStructruralWords() {
        return this.structruralWords;
    }

    /**
     * @param structruralWords list that replaces the current structural words
     */
    public void setStructruralWords(List<String> structruralWords) {
        this.structruralWords = structruralWords;
    }

    /**
     * @param structruralWord entry appended to the structural words
     */
    public void addStructruralWord(String structruralWord) {
        structruralWords.add(structruralWord);
    }

    /**
     * Records {@code value} under the relation {@code key}.
     *
     * @param key   the relation
     * @param value primitive or specific word attached to the relation
     */
    public void addRelationalPrimitive(String key, String value) {
        append(relationalPrimitives, key, value);
    }

    /**
     * Records {@code value} under the relation symbol {@code key}.
     *
     * @param key   the relation symbol
     * @param value primitive or specific word attached to the symbol
     */
    public void addRelationSimbolPrimitive(String key, String value) {
        append(relationSimbolPrimitives, key, value);
    }

    /**
     * @return the live map of relational primitives
     */
    public Map<String, List<String>> getRelationalPrimitives() {
        return this.relationalPrimitives;
    }

    /**
     * @return the live map of relation-symbol primitives
     */
    public Map<String, List<String>> getRelationSimbolPrimitives() {
        return this.relationSimbolPrimitives;
    }
}

package com.iberdecor.words.model;

/**
 * Word-category tags.
 *
 * <p>NOTE(review): these look like the part-of-speech tags that appear as the
 * second field of a glossary line, but nothing visible here references this
 * enum ({@code Word} stores its type as a plain {@code String}) - confirm before
 * relying on that. The declaration order is preserved because it determines
 * {@code ordinal()}.
 */
public enum WordType {
    PREFIX, PREP, ECHO, EXPR, SUFFIX, PUNC,
    N, ADV, CLAS, COOR, CONJ, V,
    STRU, PP, P, ADJ, PRON, AUX, NUM
}

/*
* Copyright (C) 2008 SKLSDE(State Key Laboratory of Software Development and Environment, Beihang University)., All Rights Reserved.
*/
package com.iberdecor.util;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import com.iberdecor.words.model.Primitive;
import com.iberdecor.words.model.Word;

/**
 * Computes the semantic similarity of two words from their primitive (sememe)
 * descriptions loaded from {@code dict/glossary.dat}.
 *
 * <p>The glossary is loaded once when the class is initialised. All state is held
 * in static fields; the class is not designed for concurrent modification.
 *
 * @author Trancy
 * @version 1.0
 */
public class WordSimilarity {
    /** Every entry of the glossary, keyed by surface word; one word may have several senses. */
    private static final Map<String, List<Word>> ALLWORDS = new TreeMap<String, List<Word>>();

    /** Smoothing parameter of the primitive similarity: sim(p1,p2) = alpha/(d+alpha). */
    private static final double alpha = 1.6;

    /** Content-word similarity: weight of the first (basic) primitive. */
    private static final double beta1 = 0.5;

    /** Content-word similarity: weight of the other primitives. */
    private static final double beta2 = 0.2;

    /** Content-word similarity: weight of the relational primitives. */
    private static final double beta3 = 0.17;

    /** Content-word similarity: weight of the relation-symbol primitives. */
    private static final double beta4 = 0.13;

    /**
     * Similarity between a specific word and a primitive is always this small
     * constant. (Two specific words score 1 when equal, otherwise 0.)
     */
    private static final double gamma = 0.2;

    /** Similarity between any non-empty value and an empty value. */
    private static final double delta = 0.2;

    /** Default distance between two primitives that share no ancestor. */
    private static final int DEFAULT_PRIMITIVE_DIS = 20;

    /** Logical symbols (declared for completeness; not used by the code below). */
    private static final String LOGICAL_SYMBOL = ",~^";

    /** Relation symbols that may prefix a primitive. */
    private static final String RELATIONAL_SYMBOL = "#%$*+&@?!";

    /** Special symbol that opens a structural (function) word. */
    private static final String SPECIAL_SYMBOL = "{";

    /* Load the glossary as soon as the class is initialised. */
    static {
        loadGlossary();
    }

    /**
     * Loads {@code dict/glossary.dat} into the in-memory word table.
     *
     * <p>Each line is expected to look like
     * {@code <word> <type> <description>}, for example
     * {@code 阿布扎比 N place|地方,capital|国都,ProperName|专,(the United Arab Emirates|阿拉伯联合酋长国)}.
     * Blank lines are skipped. Any other failure is reported together with the
     * offending line and stops the load, keeping what was read so far.
     */
    public static void loadGlossary() {
        String line = null;
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader("dict/glossary.dat"));
            while ((line = reader.readLine()) != null) {
                line = line.trim().replaceAll("\\s+", " ");
                if (line.length() == 0) {
                    continue;
                }

                // The description itself may contain spaces, so split off only the
                // first two fields and keep the remainder intact.
                String[] strs = line.split(" ", 3);
                String word = strs[0];
                String type = strs[1];
                String related = strs[2];

                Word w = new Word();
                w.setWord(word);
                w.setType(type);
                parseDetail(related, w);

                addWord(w);
            }
        } catch (Exception e) {
            System.out.println("Error line: " + line);
            e.printStackTrace();
        } finally {
            // reader stays null when the file could not be opened; closing it
            // unconditionally would throw from the static initializer.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Parses the concept description of a glossary line and stores the result in
     * {@code word}.
     *
     * <p>The description is a comma-separated list. Each part is one of:
     * a specific word in parentheses ({@code (Bahrain|巴林)}), a relational
     * primitive ({@code content=fact|事情}; every plain primitive after it belongs
     * to the same relation), a primitive prefixed by a relation symbol, a
     * structural word starting with <code>{</code>, or a plain primitive
     * {@code english|chinese}. The first plain primitive becomes the word's
     * primary concept. When a part has no Chinese half, its English half is used.
     *
     * @param related the description field of the glossary line
     * @param word    the word that receives the parsed primitives
     */
    public static void parseDetail(String related, Word word) {
        String[] parts = related.split(",");
        boolean isFirst = true;
        boolean isRelational = false;
        boolean isSimbol = false;
        String relationalPrimitiveKey = null;
        String simbolKey = null;

        for (int i = 0; i < parts.length; i++) {
            String part = parts[i];

            // A specific word is wrapped in parentheses: (Bahrain|巴林)
            if (part.startsWith("(")) {
                part = stripClosing(part.substring(1));
            }
            if (part.length() == 0) {
                continue;
            }

            // Relational primitive; the plain primitives that follow belong to it.
            if (part.contains("=")) {
                String[] keyValue = part.split("=");
                if (keyValue.length < 2) {
                    continue;
                }
                isRelational = true;
                relationalPrimitiveKey = keyValue[0];
                String[] halves = keyValue[1].split("\\|");
                String value = stripClosing(halves.length > 1 ? halves[1] : keyValue[1]);
                word.addRelationalPrimitive(relationalPrimitiveKey, value);
                continue;
            }

            String[] strs = part.split("\\|");
            if (strs.length == 0) {
                continue;
            }

            // The first character decides whether this is a primitive or another relation.
            int type = getPrimitiveType(strs[0]);

            // English half without its leading symbol and without a closing bracket.
            String english = stripClosing(type == 0 ? strs[0] : strs[0].substring(1));

            // The Chinese half is looked up per part; some structural words have
            // none, and a value must never leak over from the previous part.
            String chinese = (strs.length > 1) ? stripClosing(strs[1]) : null;
            String meaning = (chinese != null) ? chinese : english;

            if (type == 0) {
                if (isRelational) {
                    word.addRelationalPrimitive(relationalPrimitiveKey, meaning);
                } else if (isSimbol) {
                    word.addRelationSimbolPrimitive(simbolKey, meaning);
                } else if (isFirst) {
                    // By default the first interpretation is the primary meaning.
                    word.setFirstPrimitive(meaning);
                    isFirst = false;
                } else {
                    word.addOtherPrimitive(meaning);
                }
            } else if (type == 1) {
                isSimbol = true;
                isRelational = false;
                simbolKey = Character.toString(strs[0].charAt(0));
                word.addRelationSimbolPrimitive(simbolKey, meaning);
            } else {
                // Structural word.
                word.addStructruralWord(meaning);
            }
        }
    }

    /** Removes one trailing {@code )} or <code>}</code> from {@code text}, if present. */
    private static String stripClosing(String text) {
        if (text.endsWith(")") || text.endsWith("}")) {
            return text.substring(0, text.length() - 1);
        }
        return text;
    }

    /**
     * Determines the category of a primitive from the first character of its
     * English half.
     *
     * <ul>
     * <li>0 - plain primitive</li>
     * <li>1 - relation-symbol primitive</li>
     * <li>2 - special (structural word)</li>
     * </ul>
     *
     * @param str the English half of a description part
     * @return 0, 1 or 2 as listed above; 0 for a {@code null} or empty string
     */
    public static int getPrimitiveType(String str) {
        if (str == null || str.length() == 0) {
            return 0;
        }
        String first = Character.toString(str.charAt(0));
        if (RELATIONAL_SYMBOL.contains(first)) {
            return 1;
        }
        if (SPECIAL_SYMBOL.contains(first)) {
            return 2;
        }
        return 0;
    }

    /**
     * Computes the similarity of two words as the maximum over all pairs of their
     * senses.
     *
     * @param word1 first word
     * @param word2 second word
     * @return the similarity, or 0.0 (after reporting on standard error) when
     *         either word is {@code null} or not in the glossary
     */
    public static double simWord(String word1, String word2) {
        List<Word> list1 = (word1 == null) ? null : ALLWORDS.get(word1);
        List<Word> list2 = (word2 == null) ? null : ALLWORDS.get(word2);
        if (list1 == null || list2 == null) {
            System.err.println("输入的词没有被收录");
            return 0.0;
        }

        double max = 0;
        for (Word w1 : list1) {
            for (Word w2 : list2) {
                max = Math.max(max, simWord(w1, w2));
            }
        }
        return max;
    }

    /**
     * Computes the similarity of two word senses.
     *
     * <p>A structural word and a content word always score 0. Two structural
     * words are compared through their structural-word lists. Two content words
     * are scored as
     * {@code beta1*s1 + beta2*s1*s2 + beta3*s1*s2*s3 + beta4*s1*s2*s3*s4} where
     * s1..s4 are the similarities of the first primitives, the other primitives,
     * the relational primitives and the relation-symbol primitives.
     *
     * @param w1 first sense
     * @param w2 second sense
     * @return the similarity of the two senses
     */
    public static double simWord(Word w1, Word w2) {
        boolean structural1 = w1.isStructruralWord();
        boolean structural2 = w2.isStructruralWord();

        if (structural1 != structural2) {
            System.out.println("虚词和实词不能进行比较" + "[ The practicle is not allowed comparing with notional word ]");
            return 0;
        }
        if (structural1) {
            return simList(w1.getStructruralWords(), w2.getStructruralWords());
        }

        double sim1 = simPrimitive(w1.getFirstPrimitive(), w2.getFirstPrimitive());
        double sim2 = simList(w1.getOtherPrimitives(), w2.getOtherPrimitives());
        double sim3 = simMap(w1.getRelationalPrimitives(), w2.getRelationalPrimitives());
        double sim4 = simMap(w1.getRelationSimbolPrimitives(), w2.getRelationSimbolPrimitives());

        double product = sim1;
        double sum = beta1 * product;
        product *= sim2;
        sum += beta2 * product;
        product *= sim3;
        sum += beta3 * product;
        product *= sim4;
        sum += beta4 * product;
        return sum;
    }

    /**
     * Similarity of two relation maps. Keys present in both maps are compared
     * through {@link #simList}; every key present in only one map contributes
     * {@code delta}. The result is the average over the distinct keys.
     *
     * @param map1 first map
     * @param map2 second map
     * @return 1 when both maps are empty, otherwise the averaged similarity
     */
    public static double simMap(Map<String, List<String>> map1, Map<String, List<String>> map2) {
        if (map1.isEmpty() && map2.isEmpty()) {
            return 1;
        }
        int total = map1.size() + map2.size();
        double sim = 0;
        int count = 0;
        for (Map.Entry<String, List<String>> entry : map1.entrySet()) {
            List<String> other = map2.get(entry.getKey());
            if (map2.containsKey(entry.getKey())) {
                sim += simList(entry.getValue(), other);
                count++;
            }
        }
        return (sim + delta * (total - 2 * count)) / (total - count);
    }

    /**
     * Similarity of two collections of primitives / specific words.
     *
     * <p>Elements are paired greedily: the most similar remaining pair is taken
     * and removed until the shorter collection is exhausted. Every element left
     * unpaired in the longer collection contributes {@code delta}, and the total
     * is averaged over the size of the longer collection, so the result stays
     * within [0, 1].
     *
     * <p>The arguments are not modified; the pairing works on private copies,
     * because callers pass the live lists owned by {@code Word} objects.
     *
     * @param list1 first collection
     * @param list2 second collection
     * @return 1 when both collections are empty, otherwise the averaged similarity
     */
    public static double simList(List<String> list1, List<String> list2) {
        if (list1.isEmpty() && list2.isEmpty()) {
            return 1;
        }
        List<String> left = new ArrayList<String>(list1);
        List<String> right = new ArrayList<String>(list2);

        int big = Math.max(left.size(), right.size());
        int pairs = Math.min(left.size(), right.size());
        double sum = 0;

        for (int count = 0; count < pairs; count++) {
            // Reset the choice each round: when no pair scores above 0 the first
            // remaining elements are paired instead of reusing stale indices.
            int bestLeft = 0;
            int bestRight = 0;
            double max = 0;
            for (int i = 0; i < left.size(); i++) {
                for (int j = 0; j < right.size(); j++) {
                    double sim = innerSimWord(left.get(i), right.get(j));
                    if (sim > max) {
                        bestLeft = i;
                        bestRight = j;
                        max = sim;
                    }
                }
            }
            sum += max;
            left.remove(bestLeft);
            right.remove(bestRight);
        }
        return (sum + delta * (big - pairs)) / big;
    }

    /**
     * Compares two description elements, each of which may be a primitive or a
     * specific word.
     *
     * @param sourceWord first element
     * @param targetWord second element
     * @return the primitive similarity for two primitives; 1 or 0 for two
     *         specific words (equal or not); {@code gamma} for a mixed pair
     */
    private static double innerSimWord(String sourceWord, String targetWord) {
        boolean isPrimitiveFirst = Primitive.isPrimitive(sourceWord);
        boolean isPrimitiveSecond = Primitive.isPrimitive(targetWord);

        if (isPrimitiveFirst && isPrimitiveSecond) {
            return simPrimitive(sourceWord, targetWord);
        }
        if (!isPrimitiveFirst && !isPrimitiveSecond) {
            boolean same = (sourceWord == null) ? targetWord == null : sourceWord.equals(targetWord);
            return same ? 1 : 0;
        }
        // One primitive and one specific word.
        return gamma;
    }

    /**
     * Similarity of two primitives: {@code alpha / (distance + alpha)}.
     *
     * @param primitive1 first primitive
     * @param primitive2 second primitive
     * @return a value in (0, 1]; 1 when the distance is 0
     */
    public static double simPrimitive(String primitive1, String primitive2) {
        int dis = disPrimitive(primitive1, primitive2);
        return alpha / (dis + alpha);
    }

    /**
     * Distance between two primitives in the primitive hierarchy: the number of
     * steps from {@code primitive1} up to the nearest common ancestor plus the
     * number of steps from {@code primitive2} up to that ancestor.
     *
     * @param primitive1 first primitive
     * @param primitive2 second primitive
     * @return the non-negative path length, or {@code DEFAULT_PRIMITIVE_DIS} (20)
     *         when the two primitives share no ancestor or either is unknown
     */
    public static int disPrimitive(String primitive1, String primitive2) {
        List<Integer> sourceList = Primitive.getParents(primitive1);
        List<Integer> targetList = Primitive.getParents(primitive2);
        for (int i = 0; i < sourceList.size(); i++) {
            int index = targetList.indexOf(sourceList.get(i));
            if (index >= 0) {
                return index + i;
            }
        }
        return DEFAULT_PRIMITIVE_DIS;
    }

    /**
     * Adds one sense to the word table, creating the sense list for a word seen
     * for the first time.
     *
     * @param word the sense to register under {@code word.getWord()}
     */
    public static void addWord(Word word) {
        List<Word> list = ALLWORDS.get(word.getWord());
        if (list == null) {
            list = new ArrayList<Word>();
            ALLWORDS.put(word.getWord(), list);
        }
        list.add(word);
    }

    /**
     * Small demo: prints the similarity of two sample words.
     *
     * @param args ignored
     */
    public static void main(String[] args) throws Exception {
        double simval = simWord("黩武", "联合国");
        System.out.println(simval);
    }
}

0 0