英语分词
来源:互联网 发布:linux x264文件 编辑:程序博客网 时间:2024/05/09 06:19
package com.triggerprotein;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Converts character-offset annotations into token-index annotations.
 *
 * <p>Every input file is read line by line:
 * <ul>
 *   <li>an empty line resets the current sentence and is copied as an empty line;</li>
 *   <li>a line starting with {@code #} is a protein annotation
 *       (e.g. {@code # T6 Protein S5 75 79 ERK2});</li>
 *   <li>a line starting with {@code @} is a trigger annotation
 *       (e.g. {@code @ T120 Gene_expression S4 90 97 produce});</li>
 *   <li>a line starting with {@code %} is an event line and is dropped;</li>
 *   <li>any other line is a sentence: it is tokenized with {@link SentTokenizer} and the
 *       tokenized form is written out.</li>
 * </ul>
 *
 * <p>The original and the tokenized sentence are stored character by character in two maps
 * keyed by position. An annotation is located by walking both maps in parallel: the only
 * tolerated difference is a space that exists in the tokenized sentence but not in the
 * original. Counting spaces yields the token index of the annotated text.
 */
public class TriggerLocation {

    /** Directory read when no first command-line argument is given. */
    private static final String DEFAULT_INPUT_DIR =
            "E:\\2009_BioEvent\\devel\\BioNLP_2009_devel_triggerInfo";

    /** Directory written when no second command-line argument is given. */
    private static final String DEFAULT_OUTPUT_DIR =
            "E:\\2009_BioEvent\\devel\\BioNLP_2009_devel_NewTriggerInfo\\";

    private static final Character SPACE = Character.valueOf(' ');

    /**
     * Processes every regular file of the input directory.
     *
     * @param args optional: {@code args[0]} overrides the input directory and {@code args[1]}
     *             the output directory; with no arguments the built-in paths are used
     */
    public static void main(String[] args) {
        String inputDir = args != null && args.length > 0 ? args[0] : DEFAULT_INPUT_DIR;
        String outputDir = args != null && args.length > 1 ? args[1] : DEFAULT_OUTPUT_DIR;

        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] files = new File(inputDir).listFiles();
        if (files == null) {
            System.err.println("Cannot list input directory: " + inputDir);
            return;
        }

        SentTokenizer sentT = new SentTokenizer();
        try {
            for (File input : files) {
                if (!input.isFile()) {
                    continue; // sub-directories cannot be opened with a FileReader
                }
                processFile(input, new File(outputDir, input.getName()), sentT);
            }
        } catch (IOException io) {
            io.printStackTrace();
        }
    }

    /**
     * Rewrites one annotation file; reader and writer are closed even when processing fails.
     */
    private static void processFile(File input, File output, SentTokenizer sentT)
            throws IOException {
        String fileName = input.getName();
        try (BufferedReader br = new BufferedReader(new FileReader(input));
                BufferedWriter bw = new BufferedWriter(new FileWriter(output))) {
            Map<Integer, Character> oldSentMap = new HashMap<Integer, Character>();
            Map<Integer, Character> newSentMap = new HashMap<Integer, Character>();
            String sent;
            while ((sent = br.readLine()) != null) {
                if (sent.length() == 0) {
                    oldSentMap.clear();
                    newSentMap.clear();
                    bw.newLine();
                    bw.flush();
                } else if (sent.startsWith("#")) {
                    relocateProtein(sent, fileName, oldSentMap, newSentMap, sentT, bw);
                } else if (sent.startsWith("@")) {
                    relocateTrigger(sent, fileName, oldSentMap, newSentMap, sentT, bw);
                } else if (sent.startsWith("%")) {
                    // Event lines are intentionally not copied to the output.
                } else {
                    // Reset first so a shorter sentence never inherits stale characters
                    // from a longer one that was not followed by a blank line.
                    oldSentMap.clear();
                    newSentMap.clear();
                    fill(oldSentMap, sent);
                    String newSent = sentT.sentTokenizer(sent);
                    fill(newSentMap, newSent);
                    bw.write(newSent);
                    bw.newLine();
                    bw.flush();
                }
            }
        }
    }

    /**
     * Handles a protein line such as {@code # T6 Protein S5 75 79 ERK2}: fields 4 and 5 are
     * the character span [start, end) in the original sentence, fields 6.. are the text.
     * Writes {@code # <firstToken> <lastToken> <tokenized text>} when the span is found.
     */
    private static void relocateProtein(String sent, String fileName,
            Map<Integer, Character> oldSentMap, Map<Integer, Character> newSentMap,
            SentTokenizer sentT, BufferedWriter bw) throws IOException {
        String[] pro = sent.split(" ");
        int proStart = Integer.parseInt(pro[4]);
        int proEnd = Integer.parseInt(pro[5]);
        String protein = joinFrom(pro, 6);

        int insertedOutside = 0; // spaces added by the tokenizer while outside the span
        int originalOutside = 0; // spaces of the original sentence while outside the span
        int insertedInside = 0;  // spaces added by the tokenizer while inside the span
        int originalInside = 0;  // spaces of the original sentence while inside the span
        StringBuilder span = new StringBuilder();

        // The cursor only advances when the original character has been consumed. An inserted
        // space is skipped by bumping the offset and re-reading the same position, which also
        // works when that position is the last character of the sentence.
        int proX = 0;
        while (proX < oldSentMap.size()) {
            Character oldMapChar = oldSentMap.get(proX);
            Character newMapChar = newSentMap.get(proX + insertedOutside + insertedInside);
            boolean same = oldMapChar.equals(newMapChar);
            boolean inSpan = proX >= proStart && proX < proEnd;

            // newMapChar is null once the tokenized sentence is exhausted; SPACE.equals(null)
            // is simply false, so that case is reported below instead of throwing.
            if (!same && SPACE.equals(newMapChar)) {
                if (inSpan) {
                    insertedInside += 1;
                } else {
                    insertedOutside += 1;
                }
                continue;
            }

            if (inSpan) {
                if (same) {
                    span.append(oldMapChar);
                    if (SPACE.equals(oldMapChar)) {
                        originalInside += 1;
                    } else if (proX == proEnd - 1 && span.toString().equals(protein)) {
                        String newProtein = sentT.sentTokenizer(protein);
                        int proIndexEnd =
                                insertedOutside + originalOutside + insertedInside + originalInside;
                        Character before = proStart > 0 ? oldSentMap.get(proStart - 1) : null;
                        boolean gluedToPrefix = before != null
                                && (before.equals('-') || before.equals('(') || before.equals('/'));
                        if (gluedToPrefix) {
                            // The space separating the prefix character from the protein was
                            // counted inside the span, so the first token index is one higher.
                            int proIndexStart = insertedOutside + originalOutside + 1;
                            if (countTokens(newProtein) == (proIndexEnd - proIndexStart + 1)) {
                                bw.write("# " + proIndexStart + " " + proIndexEnd + " " + newProtein);
                                bw.newLine();
                                bw.flush();
                            } else {
                                System.out.println(fileName + " %%%");
                                System.out.println(sent + " %%% ");
                                System.out.println(protein + " " + proIndexStart + " "
                                        + proIndexEnd + " " + newProtein);
                                System.out.println("计算蛋白质单词个数的时候出错");
                            }
                        } else {
                            int proIndexStart = insertedOutside + originalOutside;
                            bw.write("# " + proIndexStart + " " + proIndexEnd + " " + newProtein);
                            bw.newLine();
                            bw.flush();
                        }
                        return;
                    }
                } else {
                    System.out.println(fileName);
                    System.out.println(sent + "***");
                    System.out.println("处理蛋白质时字符出错");
                }
            } else if (same) {
                if (SPACE.equals(oldMapChar)) {
                    originalOutside += 1;
                }
            } else {
                System.out.println(fileName);
                System.out.println(sent);
                System.out.println(proX + " " + oldMapChar + "***" + newMapChar + "不相等");
            }
            proX++;
        }
    }

    /**
     * Handles a trigger line such as {@code @ T120 Gene_expression S4 90 97 produce}: field 2
     * is the trigger type, fields 4 and 5 the character span [start, end), fields 6.. the text.
     * Writes {@code @ <type> <firstToken> <lastToken> <tokenized text>} when the span is found.
     */
    private static void relocateTrigger(String sent, String fileName,
            Map<Integer, Character> oldSentMap, Map<Integer, Character> newSentMap,
            SentTokenizer sentT, BufferedWriter bw) throws IOException {
        String[] tri = sent.split(" ");
        String triType = tri[2];
        int triStart = Integer.parseInt(tri[4]);
        int triEnd = Integer.parseInt(tri[5]);
        String trigger = joinFrom(tri, 6);

        int insertedOutside = 0; // spaces added by the tokenizer while outside the span
        int originalOutside = 0; // spaces of the original sentence while outside the span
        int insertedInside = 0;  // spaces added by the tokenizer while inside the span
        int originalInside = 0;  // spaces of the original sentence while inside the span
        StringBuilder span = new StringBuilder();

        // Same cursor discipline as for proteins: advance only after consuming an original
        // character, re-read the position after skipping an inserted space.
        int triX = 0;
        while (triX < oldSentMap.size()) {
            Character oldMapChar = oldSentMap.get(triX);
            Character newMapChar = newSentMap.get(triX + insertedOutside + insertedInside);
            boolean same = oldMapChar.equals(newMapChar);
            boolean inSpan = triX >= triStart && triX < triEnd;

            if (!same && SPACE.equals(newMapChar)) {
                if (inSpan) {
                    insertedInside += 1;
                } else {
                    insertedOutside += 1;
                }
                continue;
            }

            if (inSpan) {
                if (same) {
                    span.append(oldMapChar);
                    if (SPACE.equals(oldMapChar)) {
                        originalInside += 1;
                    } else if (triX == triEnd - 1 && span.toString().equals(trigger)) {
                        String newTrigger = sentT.sentTokenizer(trigger);
                        int triIndexEnd =
                                insertedOutside + originalOutside + insertedInside + originalInside;
                        Character before = triStart > 0 ? oldSentMap.get(triStart - 1) : null;
                        boolean gluedToPrefix = before != null
                                && (before.equals('-') || before.equals('(') || before.equals('/')
                                        || before.equals('[')
                                        || Character.isDigit(before.charValue()));
                        if (gluedToPrefix) {
                            // The separating space was counted inside the span (see proteins).
                            int triIndexStart = insertedOutside + originalOutside + 1;
                            bw.write("@ " + triType + " " + triIndexStart + " " + triIndexEnd
                                    + " " + newTrigger);
                            bw.newLine();
                            bw.flush();
                        } else {
                            int triIndexStart = insertedOutside + originalOutside;
                            if (countTokens(newTrigger) == (triIndexEnd - triIndexStart + 1)) {
                                bw.write("@ " + triType + " " + triIndexStart + " " + triIndexEnd
                                        + " " + newTrigger);
                                bw.newLine();
                                bw.flush();
                            } else {
                                System.out.println(fileName + " *** ");
                                System.out.println(sent + " *** ");
                                System.out.println(trigger + " " + newTrigger + " "
                                        + triIndexStart + " " + triIndexEnd);
                                System.out.println("计算的trigger词长度有问题");
                            }
                        }
                        return;
                    }
                } else {
                    System.out.println(fileName);
                    System.out.println(sent + "&&&");
                    System.out.println("处理触发词时出错");
                }
            } else if (same) {
                if (SPACE.equals(oldMapChar)) {
                    originalOutside += 1;
                }
            } else {
                System.out.println(oldMapChar + "***" + newMapChar + "不相等");
            }
            triX++;
        }
    }

    /** Stores each character of {@code text} under its position. */
    private static void fill(Map<Integer, Character> target, String text) {
        for (int charId = 0; charId < text.length(); charId++) {
            target.put(charId, text.charAt(charId));
        }
    }

    /** Joins {@code parts[from..]} with single spaces (the annotation text after the offsets). */
    private static String joinFrom(String[] parts, int from) {
        StringBuilder joined = new StringBuilder();
        for (int j = from; j < parts.length; j++) {
            if (j > from) {
                joined.append(' ');
            }
            joined.append(parts[j]);
        }
        return joined.toString();
    }

    /** Number of space-separated pieces in a tokenized string; 1 when it contains no space. */
    private static int countTokens(String tokenized) {
        return tokenized.indexOf(" ") != -1 ? tokenized.split(" ").length : 1;
    }
}
上面的程序基本没有问题。其原理是:把分词前句子的每个字符按下标存入一个 Map,把分词后句子的每个字符也按下标存入另一个 Map,然后逐字符比对两个 Map,根据单词对应的字符确定该单词在分词后句子中的位置。
package com.triggerprotein;

import java.util.List;
import java.util.StringTokenizer;

import cmu.arktweetnlp.Twokenize;

/**
 * Refines the token list produced by {@code Twokenize.tokenizeRawTweetText} by additionally
 * separating {@code /}, {@code -}, {@code +}, {@code (} and {@code )} from the surrounding text.
 *
 * <p>The result is built by inserting spaces only: every character of a token must survive,
 * because the caller aligns the original and the tokenized sentence character by character.
 */
public class SentTokenizer {

    /**
     * Tokenizes a sentence and returns the tokens as one space-separated string.
     *
     * @param sent the raw sentence
     * @return the tokenized sentence; depending on the last token it may end with a space
     */
    public String sentTokenizer(String sent) {
        StringBuilder out = new StringBuilder();
        List<String> tokenList = Twokenize.tokenizeRawTweetText(sent);
        for (int i = 0; i < tokenList.size(); i++) {
            String token = tokenList.get(i);
            boolean hasSlash = token.indexOf("/") != -1;

            if (hasSlash && !token.startsWith("/") && token.endsWith("/")) {
                // "abc/" -> "abc / " (only the trailing slash is detached)
                out.append(token.substring(0, token.length() - 1)).append(" / ");
            } else if (hasSlash && token.startsWith("/") && !token.endsWith("/")) {
                // "/abc" -> "/ abc ", additionally spacing out dashes in the remainder
                String rest = token.substring(1);
                out.append("/ ");
                if (rest.indexOf("-") != -1) {
                    out.append(rest.replace("-", " - ")).append(" ");
                } else {
                    out.append(rest).append(" ");
                }
            } else if (hasSlash && !token.endsWith("/")) {
                // Slashes strictly inside the token: handle every piece separately.
                String[] pieces = token.split("/");
                for (int p = 0; p < pieces.length; p++) {
                    if (p == pieces.length - 1) {
                        appendLastSlashPiece(out, pieces[p]);
                    } else {
                        appendInnerSlashPiece(out, pieces[p]);
                    }
                }
            } else if (token.indexOf("-") != -1 && !token.startsWith("-") && !token.endsWith("-")) {
                // Dashes strictly inside the token.
                String[] parts = token.split("-");
                for (int p = 0; p < parts.length; p++) {
                    if (p == parts.length - 1) {
                        out.append(parts[p]).append(" ");
                    } else if (parts[p].endsWith(")")) {
                        // Space out the parentheses of a part such as "(a)" or "a)".
                        out.append(parts[p].replace("(", " ( ").replace(")", " ) ")).append("- ");
                    } else {
                        out.append(parts[p]).append(" ").append("- ");
                    }
                }
            } else if (token.startsWith("-") && !token.equals("-")) {
                out.append("- ").append(token.substring(1)).append(" ");
            } else if (token.endsWith("-") && !token.equals("-")) {
                String stem = token.substring(0, token.length() - 1);
                // Limit -1 keeps trailing empty strings, so a stem that itself ends with the
                // operator ("a+-", "a--") does not lose that operator character.
                if (stem.indexOf("+") != -1) {
                    appendJoined(out, stem.split("\\+", -1), "+");
                } else if (stem.indexOf("-") != -1) {
                    appendJoined(out, stem.split("-", -1), "-");
                } else {
                    out.append(stem).append(" ");
                }
                out.append("- ");
            } else if (token.startsWith("+") && !token.equals("+")) {
                out.append("+ ").append(token.substring(1)).append(" ");
            } else if (token.endsWith("+") && !token.equals("+")) {
                String stem = token.substring(0, token.length() - 1);
                if (stem.indexOf("+") != -1) {
                    // Limit -1 for the same reason as above ("a++").
                    appendJoined(out, stem.split("\\+", -1), "+");
                } else {
                    out.append(stem).append(" ");
                }
                out.append("+ ");
            } else if (token.indexOf("+") != -1 && !token.startsWith("+") && !token.endsWith("+")) {
                appendJoined(out, token.split("\\+"), "+");
            } else if (token.indexOf(")") != -1 && !token.equals(")") && token.endsWith(")")) {
                out.append(token.substring(0, token.length() - 1)).append(" ) ");
            } else if (token.indexOf(")") != -1 && !token.equals(")") && token.startsWith(")")) {
                out.append(") ").append(token.substring(1)).append(" ");
            } else if (token.indexOf("(") != -1 && !token.endsWith("(") && token.indexOf(")") == -1) {
                out.append(token.replace("(", " ( ")).append(" ");
            } else if (i == tokenList.size() - 1) {
                // A plain final token gets no trailing space.
                out.append(token);
            } else {
                out.append(token).append(" ");
            }
        }
        return out.toString();
    }

    /** Appends the final piece of a slash-separated token; no slash follows it. */
    private static void appendLastSlashPiece(StringBuilder out, String piece) {
        boolean hasDash = piece.indexOf("-") != -1;
        boolean hasOpen = piece.indexOf("(") != -1;
        boolean hasClose = piece.indexOf(")") != -1;

        if (hasDash && !hasOpen && !hasClose) {
            if (piece.equals("-")) {
                out.append("- ");
            } else {
                out.append(piece.replace("-", " - ")).append(" ");
            }
        } else if (!hasDash && !hasOpen && !hasClose) {
            out.append(piece).append(" ");
        } else if (hasOpen && hasClose) {
            appendParenTokens(out, piece);
        } else if (hasOpen) {
            out.append(piece.replace("(", " ( "));
            if (!piece.endsWith("(")) {
                out.append(" ");
            }
        } else if (!piece.endsWith(")")) {
            // Contains ")" somewhere before the end.
            String spaced = piece.replace(")", " )");
            if (spaced.indexOf(")-") != -1) {
                out.append(spaced.replace("-", " - "));
            } else {
                out.append(spaced).append(" ");
            }
            out.append(" ");
        } else if (piece.equals("+)")) {
            out.append("+ ) ");
        } else if (piece.equals("-)")) {
            out.append("- ) ");
        } else {
            out.append(piece).append(" ");
        }
    }

    /** Appends a non-final piece of a slash-separated token followed by the slash itself. */
    private static void appendInnerSlashPiece(StringBuilder out, String piece) {
        boolean hasDash = piece.indexOf("-") != -1;
        boolean hasOpen = piece.indexOf("(") != -1;
        boolean hasClose = piece.indexOf(")") != -1;

        if (hasDash && !hasOpen && !hasClose) {
            if (piece.startsWith("-")) {
                out.append(piece.replace("-", "- ")).append("/ ");
            } else {
                out.append(piece.replace("-", " - ")).append(" / ");
            }
        } else if (!hasDash && !hasOpen && !hasClose) {
            if (piece.endsWith("+")) {
                out.append(piece.replace("+", " + ")).append("/ ");
            } else {
                out.append(piece).append(" ").append("/ ");
            }
        } else if (hasOpen && hasClose) {
            appendParenTokens(out, piece);
            out.append("/ ");
        } else if (hasClose && piece.endsWith(")") && !hasOpen) {
            String spaced = piece.replace(")", " ) ");
            if (spaced.indexOf("-") != -1) {
                // NOTE(review): this appends the piece with only its dashes spaced out, so the
                // ")" stays attached and no space precedes the "/". Kept as in the original;
                // confirm whether spacing the parenthesis as well was intended.
                out.append(piece.replace("-", " - "));
            } else {
                out.append(spaced);
            }
            out.append("/ ");
        } else if (piece.endsWith("+")) {
            out.append(piece.replace("+", " + ")).append("/ ");
        } else if (piece.endsWith("-")) {
            out.append(piece.replace("-", " - ")).append("/ ");
        } else {
            // Remaining shapes (e.g. "a(b" or "a)b") used to be dropped together with their
            // slash; keep them verbatim so no character of the sentence is lost.
            out.append(piece).append(" ").append("/ ");
        }
    }

    /** Appends the piece split at "(" and ")", keeping the parentheses as separate items. */
    private static void appendParenTokens(StringBuilder out, String piece) {
        StringTokenizer strToke = new StringTokenizer(piece, "()", true); // delimiters are returned too
        while (strToke.hasMoreElements()) {
            out.append(strToke.nextToken()).append(" ");
        }
    }

    /** Appends every part followed by a space, with {@code operator + " "} between parts. */
    private static void appendJoined(StringBuilder out, String[] parts, String operator) {
        for (int p = 0; p < parts.length; p++) {
            out.append(parts[p]).append(" ");
            if (p < parts.length - 1) {
                out.append(operator).append(" ");
            }
        }
    }
}
上面的程序对推特分词工具(Twokenize)的分词结果做进一步的后处理。
0 0
- 英语分词
- 英语学习--chap6 分词
- 英语句子分词
- 逻辑英语——动词和分词-----名词
- 分词
- 分词
- 分词
- 分词
- 分词
- 分词
- 分词
- 对于2-gram 条件下对英语文本的分词处理
- 英语
- 英语
- 英语
- 英语
- 英语
- 英语
- LeetCode | Sqrt (x)
- ubuntu日常使用常用快捷键(待补充)
- hibernate n+1问题
- redis翻译_redis lua脚本
- params.equals(""),!params.isEmpty(),params!=null的区别
- 英语分词
- 动软代码生成器 常用函数
- 网络图片加载的第三方类:EGOImageView的使用方法及注意事项
- 多态机制
- 黑马程序员——学习日记之--Java网络编程小结
- 渣渣ACM日记——278-排队(NYOJ)
- 长轮询解决方案
- 毕业季致青春
- leetcode--Implement Stack using Queues