英语分词

来源：互联网发布：linux x264文件编辑：程序博客网时间：2024/05/09 06:19

package com.triggerprotein;import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.File;import java.io.FileReader;import java.io.FileWriter;import java.io.IOException;import java.util.HashMap;import java.util.Map;public class TriggerLocation {public static void main(String[] args) {SentTokenizer sentT = new SentTokenizer();try{File file = new File("E:\\2009_BioEvent\\devel\\BioNLP_2009_devel_triggerInfo");File[] files = file.listFiles();for(int fileId = 0; fileId < files.length; fileId++){String fileName = files[fileId].getName();FileReader fr = new FileReader(files[fileId]);BufferedReader br = new BufferedReader(fr);FileWriter fw = new FileWriter("E:\\2009_BioEvent\\devel\\BioNLP_2009_devel_NewTriggerInfo\\" + fileName);BufferedWriter bw = new BufferedWriter(fw);Map<Integer,Character> oldSentMap = new HashMap<Integer,Character>();Map<Integer,Character> newSentMap = new HashMap<Integer,Character>();String sent;while((sent = br.readLine())!= null){//对句子进行处理if(sent.length() == 0)    {  oldSentMap.clear();newSentMap.clear();bw.newLine();bw.flush();    }else if(sent.startsWith("#")) //蛋白质{    //# T6 Protein S5 75 79 ERK2        String[] pro = sent.split(" ");        int proStart = Integer.parseInt(pro[4]);    int proEnd = Integer.parseInt(pro[5]);        StringBuffer proB = new StringBuffer();    for(int j = 6; j < pro.length; j++){if(j == pro.length - 1){proB.append(pro[j]);}else{proB.append(pro[j] + " ");}}String protein = proB.toString();int num1 = 0,num2 = 0,num3 = 0, num4 = 0;String oldMapStr ="", newMapStr ="";int mark = 0;for(int proX = 0; proX < oldSentMap.size(); proX++){if(mark == 1){proX = proX - 1;mark = 0;}Character oldMapChar = oldSentMap.get(proX);Character newMapChar = newSentMap.get(proX+num1+num3);if(proX >= proStart && proX < proEnd){if(oldMapChar.equals(newMapChar)&& oldMapChar.equals(' ')){    num4 +=1;oldMapStr += oldSentMap.get(proX);    newMapStr += newSentMap.get(proX +num1+num3);}else if(oldMapChar.equals(newMapChar)&& 
!oldMapChar.equals(' ')){oldMapStr += oldSentMap.get(proX);    newMapStr += newSentMap.get(proX +num1+num3);    if(proX == proEnd-1)    {    if(oldMapStr.equals(newMapStr) && oldMapStr.equals(protein))    {        String newProtein = sentT.sentTokenizer(protein);        int numm = 0;    if(newProtein.indexOf(" ") != -1)    {    String[] strr =newProtein.split(" ");    numm = strr.length;            }else     {    numm =1;    }    if(proStart > 0 && (oldSentMap.get(proStart-1).equals('-') || oldSentMap.get(proStart-1).equals('(')|| oldSentMap.get(proStart-1).equals('/')))    {    int proIndexStart = num1 + num2 + 1;    int proIndexEnd = num1 + num2 + num3 + num4;    if(numm == (proIndexEnd - proIndexStart +1))    {    bw.write("# " + proIndexStart + " " + proIndexEnd + " " + newProtein);    bw.newLine();    bw.flush();        break;    }else    {    System.out.println(fileName +" %%%");    System.out.println(sent+" %%% ");    System.out.println(protein + " " + proIndexStart + " " + proIndexEnd + " " + newProtein);    System.out.println("计算蛋白质单词个数的时候出错");    break;    }    }else    {    int proIndexStart = num1 + num2;    int proIndexEnd = num1 + num2 + num3 + num4;        bw.write("# " + proIndexStart + " " + proIndexEnd + " " + newProtein);    bw.newLine();    bw.flush();    break;        }    }    }    }else if(!oldMapChar.equals(newMapChar) && newMapChar.equals(' ')){num3 +=1;mark = 1;}else{System.out.println(fileName);System.out.println(sent + "***");System.out.println("处理蛋白质时字符出错");}    }else if(!oldMapChar.equals(newMapChar) && newMapChar.equals(' ')){num1 += 1;mark = 1;}else if(oldMapChar.equals(newMapChar) && oldMapChar.equals(' ')){num2 += 1;}else if(oldMapChar.equals(newMapChar) && !oldMapChar.equals(' ')){}else {System.out.println(fileName);System.out.println(sent);System.out.println(proX +"   " + oldMapChar +"***"+ newMapChar+"不相等");}}}else if(sent.startsWith("@")) //trigger{//@ T120 Gene_expression S4 90 97 produceString[] tri = sent.split(" ");String 
triType = tri[2];int triStart = Integer.parseInt(tri[4]);int triEnd = Integer.parseInt(tri[5]);StringBuffer triB = new StringBuffer();    for(int j = 6; j < tri.length; j++){if(j == tri.length - 1){triB.append(tri[j]);}else{triB.append(tri[j] + " ");}}String trigger = triB.toString();int num1 = 0, num2 = 0, num3 = 0, num4 = 0;String oldMapStr ="", newMapStr ="";int label = 0;for(int triX = 0; triX < oldSentMap.size(); triX++){if(label == 1){triX = triX - 1;label = 0;}Character oldMapChar = oldSentMap.get(triX);Character newMapChar = newSentMap.get(triX+num1+num3);if(triX >= triStart && triX < triEnd){if(oldMapChar.equals(newMapChar) && oldMapChar.equals(' ')){num4 += 1;oldMapStr += oldSentMap.get(triX);    newMapStr += newSentMap.get(triX +num1+num3);}else if(oldMapChar.equals(newMapChar) && !oldMapChar.equals(' ')){oldMapStr += oldSentMap.get(triX);    newMapStr += newSentMap.get(triX +num1+num3);    if(triX == triEnd-1)    {    if(oldMapStr.equals(newMapStr)&& oldMapStr.equals(trigger))    {        String newTrigger = sentT.sentTokenizer(trigger);    int numm = 0;    if(newTrigger.indexOf(" ") != -1)    {    String[] strr =newTrigger.split(" ");    numm = strr.length;    }else     {    numm =1;    }    if(triStart > 0 && (oldSentMap.get(triStart-1).equals('-')||oldSentMap.get(triStart-1).equals('(')||oldSentMap.get(triStart-1).equals('/') || oldSentMap.get(triStart-1).equals('-') || oldSentMap.get(triStart-1).equals('[') || Character.isDigit(oldSentMap.get(triStart-1))))    {    int triIndexStart = num1 + num2 + 1;    int triIndexEnd = num1 + num2 + num3 + num4;        bw.write("@ " + triType + " " +  + triIndexStart + " " + triIndexEnd + " " + newTrigger);                                            bw.newLine();                                            bw.flush();    break;    }else    {    int triIndexStart = num1 + num2;    int triIndexEnd = num1 + num2 + num3 + num4;    if(numm == (triIndexEnd - triIndexStart +1))    {    bw.write("@ "+ triType + " " + 
triIndexStart + " " + triIndexEnd  + " " + newTrigger);                                            bw.newLine();                                            bw.flush();    break;    }else    {    System.out.println(fileName +" *** ");    System.out.println(sent + " *** ");    System.out.println(trigger + " " + newTrigger + " "+ triIndexStart + " " + triIndexEnd);    System.out.println("计算的trigger词长度有问题");    break;    }    }                                            }    }}else if(!oldMapChar.equals(newMapChar) && newMapChar.equals(' ')){num3 += 1;    label = 1;}else{System.out.println(fileName);System.out.println(sent + "&&&");System.out.println("处理触发词时出错");}}else if(!oldMapChar.equals(newMapChar) && newMapChar.equals(' ')){num1 += 1;label = 1;}else if(oldMapChar.equals(newMapChar) && oldMapChar.equals(' ')){num2 += 1;}else if(oldMapChar.equals(newMapChar) && !oldMapChar.equals(' ')){}else {System.out.println(oldMapChar +"***"+ newMapChar+"不相等");}}}else if(sent.startsWith("%"))//event{}else  //sentence{char[] oldSentChar = sent.toCharArray();for(int charId = 0; charId<oldSentChar.length; charId++){oldSentMap.put(charId, oldSentChar[charId]);}String newSent = sentT.sentTokenizer(sent);char[] newSentChar = newSent.toCharArray();for(int chId = 0; chId<newSentChar.length; chId++){newSentMap.put(chId, newSentChar[chId]);}bw.write(newSent);bw.newLine();bw.flush();}}}}catch(IOException io){io.printStackTrace();}}}

上面的这个程序基本没有问题。其原理是：把分词前句子的每个字符按下标存入一个 Map，把分词后句子的每个字符也按下标存入另一个 Map，然后逐字符比对两个 Map，根据单词对应的字符找到该单词在分词后句子中的位置。

package com.triggerprotein;import java.util.List;import java.util.StringTokenizer;import cmu.arktweetnlp.Twokenize;public class SentTokenizer {public String sentTokenizer(String sent) //返回句子{String result = "";StringBuffer sentBuff = new StringBuffer();List<String> tokenList = Twokenize.tokenizeRawTweetText(sent);for(int i = 0; i < tokenList.size(); i++){String str11 = tokenList.get(i);if(tokenList.get(i).indexOf("/") != -1 && !tokenList.get(i).startsWith("/")&& tokenList.get(i).endsWith("/")){String stri = tokenList.get(i).substring(0, tokenList.get(i).length()-1);sentBuff.append(stri + " / ");}else if(tokenList.get(i).indexOf("/") != -1 && tokenList.get(i).startsWith("/")&& !tokenList.get(i).endsWith("/")){   String st = tokenList.get(i).substring(1);   sentBuff.append("/ ");   if(st.indexOf("-") != -1)   {   sentBuff.append(st.replace("-", " - "));   sentBuff.append(" ");   }else   {   sentBuff.append(st + " ");   }}else if(tokenList.get(i).indexOf("/") != -1 && !tokenList.get(i).endsWith("/")) {String[] string = tokenList.get(i).split("/");for(int p = 0; p < string.length; p++){if(p == string.length - 1){if(string[p].indexOf("-")!= -1 && string[p].indexOf("(") == -1 && string[p].indexOf(")") == -1){if(string[p].equals("-")){sentBuff.append(string[p].replace("-", "- "));}else{sentBuff.append(string[p].replace("-", " - "));sentBuff.append(" ");}}else if(string[p].indexOf("-") == -1 && string[p].indexOf("(") == -1 && string[p].indexOf(")") == -1){sentBuff.append(string[p] + " ");}else if(string[p].indexOf("(") != -1 && string[p].indexOf(")") != -1){StringTokenizer strToke = new StringTokenizer(string[p], "()", true);// 打印分隔符while (strToke.hasMoreElements()) {sentBuff.append(strToke.nextToken()+" ");}}else if(string[p].indexOf("(") != -1 && string[p].indexOf(")") == -1){    if(!string[p].endsWith("("))    {    sentBuff.append(string[p].replace("(", " ( "));    sentBuff.append(" ");    }else     {    sentBuff.append(string[p].replace("(", " ( "));    }    }else 
if(string[p].indexOf(")") != -1 && !string[p].endsWith(")")){String strr = string[p].replace(")", " )");if(strr.indexOf(")-") != -1){sentBuff.append(strr.replace("-", " - "));}else{sentBuff.append(strr);sentBuff.append(" ");}sentBuff.append(" ");}else if(string[p].equals("+)")){sentBuff.append("+ ) ");}else if(string[p].equals("-)")){sentBuff.append("- ) ");}else{sentBuff.append(string[p]+" ");}}else{if(string[p].indexOf("-")!= -1 && string[p].indexOf("(") == -1 && string[p].indexOf(")") == -1){if(string[p].startsWith("-")){sentBuff.append(string[p].replace("-", "- "));sentBuff.append("/ ");}else{sentBuff.append(string[p].replace("-", " - "));sentBuff.append(" / ");}}else if(string[p].indexOf("-") == -1 && string[p].indexOf("(") == -1 && string[p].indexOf(")") == -1){if(string[p].endsWith("+")){sentBuff.append(string[p].replace("+", " + "));sentBuff.append("/ ");}else{sentBuff.append(string[p] + " ");sentBuff.append("/ ");}}else if(string[p].indexOf("(") != -1 && string[p].indexOf(")") != -1){StringTokenizer strToke = new StringTokenizer(string[p], "()", true);// 打印分隔符while (strToke.hasMoreElements()) {sentBuff.append(strToke.nextToken()+" ");}sentBuff.append("/ ");}else if(string[p].indexOf(")") != -1 && string[p].endsWith(")")&& string[p].indexOf("(") == -1){String st = string[p].replace(")", " ) ");if(st.indexOf("-") != -1){sentBuff.append(string[p].replace("-", " - "));}else {sentBuff.append(st);}sentBuff.append("/ ");}else if(string[p].endsWith("+") || string[p].endsWith("-")){   if(string[p].endsWith("+"))    {sentBuff.append(string[p].replace("+", " + "));sentBuff.append("/ ");    }else if(string[p].endsWith("-"))    {    sentBuff.append(string[p].replace("-", " - "));    sentBuff.append("/ ");    }   }}}}else if((tokenList.get(i).indexOf("-")) != -1 && !tokenList.get(i).startsWith("-") && !tokenList.get(i).endsWith("-")){String[] str = tokenList.get(i).split("-");for(int p = 0; p < str.length; p++){if(p == str.length - 1){sentBuff.append(str[p] + " 
");}else{if(str[p].endsWith(")")){if(str[p].indexOf("(")!= -1){String rr = str[p].replace("(", " ( ");sentBuff.append(rr.replace(")", " ) "));sentBuff.append("- ");}else{sentBuff.append(str[p].replace(")", " ) "));sentBuff.append("- ");}}else{sentBuff.append(str[p] + " ");sentBuff.append("- ");}}}}else if(tokenList.get(i).startsWith("-") && !tokenList.get(i).equals("-")){sentBuff.append("- " + tokenList.get(i).substring(1)+" ");}else if(tokenList.get(i).endsWith("-") && !tokenList.get(i).equals("-")){String preStr = tokenList.get(i).substring(0, tokenList.get(i).length()-1);if(preStr.indexOf("+") != -1){String[] plusSplit = preStr.split("\\+");for(int p = 0; p < plusSplit.length; p++){if(p == plusSplit.length - 1){sentBuff.append(plusSplit[p] + " ");}else{sentBuff.append(plusSplit[p] + " ");sentBuff.append("+ ");}}}else if(preStr.indexOf("-") != -1){String[] plusSplit = preStr.split("-");for(int p = 0; p < plusSplit.length; p++){if(p == plusSplit.length - 1){sentBuff.append(plusSplit[p] + " ");}else{sentBuff.append(plusSplit[p] + " ");sentBuff.append("- ");}}}else{sentBuff.append(preStr + " ");}sentBuff.append("- ");}else if(tokenList.get(i).startsWith("+") && !tokenList.get(i).equals("+")){sentBuff.append("+ " + tokenList.get(i).substring(1)+ " ");}else if(tokenList.get(i).endsWith("+") && !tokenList.get(i).equals("+")){String plusStr = tokenList.get(i).substring(0, tokenList.get(i).length()-1);if(plusStr.indexOf("+") != -1){String[] plusSplit = plusStr.split("\\+");for(int p = 0; p < plusSplit.length; p++){if(p == plusSplit.length - 1){sentBuff.append(plusSplit[p] + " ");}else{sentBuff.append(plusSplit[p] + " ");sentBuff.append("+ ");}}}else{sentBuff.append(plusStr + " ");}sentBuff.append("+ ");}else if((tokenList.get(i).indexOf("+") != -1) && !tokenList.get(i).startsWith("+") && !tokenList.get(i).endsWith("+")){String[] str = tokenList.get(i).split("\\+");for(int p = 0; p < str.length; p++){if(p == str.length - 1){sentBuff.append(str[p] + " 
");}else{sentBuff.append(str[p] + " ");sentBuff.append("+ ");}}}else if(tokenList.get(i).indexOf(")") != -1 && !tokenList.get(i).equals(")") && tokenList.get(i).endsWith(")")){String plusStr = tokenList.get(i).substring(0, tokenList.get(i).length()-1);sentBuff.append(plusStr +" ) ");}else if(tokenList.get(i).indexOf(")") != -1 && !tokenList.get(i).equals(")") && tokenList.get(i).startsWith(")")){String sufixStr = tokenList.get(i).substring(1);sentBuff.append(") " + sufixStr+" "); }else if(tokenList.get(i).indexOf("(") != -1 && !tokenList.get(i).endsWith("(") && tokenList.get(i).indexOf(")") == -1){sentBuff.append(tokenList.get(i).replace("(", " ( "));sentBuff.append(" ");}else if(i==tokenList.size()-1){sentBuff.append(tokenList.get(i));}else{sentBuff.append(tokenList.get(i) + " ");}    }result = sentBuff.toString();return result;}}

上面的程序对推特分词工具（Twokenize）的分词结果做进一步的后处理：把仍然与“/”“-”“+”“(”“)”连在一起的词再拆开，并用空格分隔。

0 0