segment-based 解析

来源:互联网 发布:java正则表达式 可选 编辑:程序博客网 时间:2024/04/30 09:25
此代码有错误：判断预测的 segment 是否在 goldSegment 中时，仅仅比较了字符串是否相等，需要改进。
List<InputStructure> input = new ArrayList<InputStructure>();List<OutputStructure> output = new ArrayList<OutputStructure>();Stemmer stem = new Stemmer();CorpusProcessing corpus = new CorpusProcessing();//加载InputStructure和OutputStructuretry{FileReader frX = new FileReader("E:\\multiword-trigger\\merge\\train.txt");BufferedReader brX = new BufferedReader(frX);FileReader frY = new FileReader("E:\\multiword-trigger\\merge\\goldSent.txt");BufferedReader brY = new BufferedReader(frY);String sentX, sentY;while((sentX = brX.readLine()) != null && (sentY = brY.readLine()) != null){InputStructure XStructure = new InputStructure();List<String> tokens = corpus.getToken(sentX);XStructure.sentStructure = corpus.getToken(sentX);input.add(XStructure);OutputStructure YStructure = new OutputStructure();//对 YStructure 中的 segment 进行赋值String[] goldSeg= sentY.split(" ");for(int yi = 0; yi < goldSeg.length; yi++){if(goldSeg[yi].startsWith("@")){YStructure.goldSegment.add(goldSeg[yi].substring(1));}else{YStructure.goldSegment.add(goldSeg[yi]);}}//对 YStructure 中 goldFeature 进行赋值List<List<String>> result = corpus.getSentInfo(sentX);List<String> sentPos = result.get(1);List<String> sentLemma = result.get(2);for(int endId = 0; endId < tokens.size(); endId++){int maxLength = 5;if(endId + 1 >= maxLength){for(int segLength = 1; segLength <= maxLength; segLength++){String preSeg = "", xtokenPos="", xtokenLemma="", xtokenStem= "";// 两行 * 号之间的 for 循环就是为了得到 segment//************************************for(int k = 1; k <= segLength; k++){if(k == 1){preSeg = tokens.get(endId - k + 1);xtokenPos = sentPos.get(endId - k + 1);xtokenLemma = sentLemma.get(endId - k + 1);xtokenStem = stem.tokenStemmer(tokens.get(endId - k + 1) + " ");}else{preSeg = tokens.get(endId - k + 1) + " " + preSeg;xtokenPos = sentPos.get(endId - k + 1) + " " + xtokenPos;xtokenLemma = sentLemma.get(endId - k + 1) + " " + xtokenLemma;xtokenStem = stem.tokenStemmer(tokens.get(endId - k + 1) + " ") + " " + 
xtokenStem;}}//************************************//判断得到的 xtoken 是不是标准的 triggerfor(int segId = 0; segId < endId; segId++){String[] segType = goldSeg[segId].split("__");String seg = segType[0];String type = segType[1];if(preSeg.equals(seg)){//计算 seg#type 的 goldFeatureMap<String, Double> goldFeature = new HashMap<String, Double>();goldFeature.put("f1=" + preSeg + "#" + type, 1.0);goldFeature.put("f2=" + xtokenPos + "#" + type, 1.0);goldFeature.put("f3=" + xtokenLemma + "#" + type, 1.0);goldFeature.put("f4=" + xtokenStem + "#" + type, 1.0);YStructure.sentGoldFeature.add(goldFeature);}else{//计算 seg#non 的 goldFeatureMap<String, Double> goldFeature = new HashMap<String, Double>();goldFeature.put("f1=" + preSeg + "#non", 1.0);goldFeature.put("f2=" + xtokenPos + "#non", 1.0);goldFeature.put("f3=" + xtokenLemma + "#non", 1.0);goldFeature.put("f4=" + xtokenStem + "#non", 1.0);YStructure.sentGoldFeature.add(goldFeature);}}//内部代码逻辑有错误}}else{                        maxLength = endId+1;for(int segLength = 1; segLength <= maxLength; segLength++){String preSeg = "", xtokenPos="", xtokenLemma="", xtokenStem= "";// 两行 * 号之间的 for 循环就是为了得到 segment//************************************for(int k = 1; k <= segLength; k++){if(k == 1){preSeg = tokens.get(endId - k + 1);xtokenPos = sentPos.get(endId - k + 1);xtokenLemma = sentLemma.get(endId - k + 1);xtokenStem = stem.tokenStemmer(tokens.get(endId - k + 1) + " ");}else{preSeg = tokens.get(endId - k + 1) + " " + preSeg;xtokenPos = sentPos.get(endId - k + 1) + " " + xtokenPos;xtokenLemma = sentLemma.get(endId - k + 1) + " " + xtokenLemma;xtokenStem = stem.tokenStemmer(tokens.get(endId - k + 1) + " ") + " " + xtokenStem;}}//************************************//判断得到的 xtoken 是不是标准的 triggerboolean equal = false;String triType = "";for(int segId = 0; segId <= endId; segId++){String[] segType = goldSeg[segId].split("__");String seg = segType[0];String type = segType[1];triType = type;if(preSeg.equals(seg)){equal = true;break;}else{equal = 
false;}}//&&&&&&&&&&&&if(equal == true){//计算 seg#type 的 goldFeatureMap<String, Double> goldFeature = new HashMap<String, Double>();goldFeature.put("f1=" + preSeg + "#" + triType, 1.0);goldFeature.put("f2=" + xtokenPos + "#" + triType, 1.0);goldFeature.put("f3=" + xtokenLemma + "#" + triType, 1.0);goldFeature.put("f4=" + xtokenStem + "#" + triType, 1.0);YStructure.sentGoldFeature.add(goldFeature);}else{//计算 seg#non 的 goldFeatureMap<String, Double> goldFeature = new HashMap<String, Double>();goldFeature.put("f1=" + preSeg + "#non", 1.0);goldFeature.put("f2=" + xtokenPos + "#non", 1.0);goldFeature.put("f3=" + xtokenLemma + "#non", 1.0);goldFeature.put("f4=" + xtokenStem + "#non", 1.0);YStructure.sentGoldFeature.add(goldFeature);}//************}}   }      output.add(YStructure);}//对句子进行循环的结尾brY.close();frY.close();brX.close();frX.close();}catch(IOException io){io.printStackTrace();}

0 0
原创粉丝点击