中文分词之过滤候选词以及去重

来源:互联网 发布:sql编程基础 编辑:程序博客网 时间:2024/05/10 00:27

中文分词之过滤候选词以及去重

一、      过滤单字词、特殊符号

代码实现:

/** * 过滤特殊词(单字动词,特殊符号) * @param inputPath * @param outPath * @throws Exception  */public static void filterWordFile2File(String inputFileStr,String outFileStr) throws Exception{File inputFile = new File(inputFileStr);File outFile = new File(outFileStr);if(!inputFile.exists()){throw new Exception("can not read the file");}if(!outFile.exists()){outFile.createNewFile();}FileWriter fw=null;BufferedReader br=null;try { br= new BufferedReader(new FileReader(inputFile));String temp ="";StringBuilder sb  = new StringBuilder();while((temp = br.readLine()) !=null){String[] temps = temp.split(" ");if(temps.length == 1){ // 单词的情况String[] spiltsWords = temps[0].split(ConstantString.slash);if(spiltsWords.length ==1){ // "/x"的情况continue;}else if(spiltsWords.length ==2){// "要/v" 或者 "民主/v"的情况if(spiltsWords[0].length() ==1 ){//"要/v" 的情况continue ;} }}sb.append(temp+ConstantString.WIN_NextLine);}//while fw = new FileWriter(outFile);fw.write(sb.toString());} catch (Exception e) {e.printStackTrace();}finally{try {if(br != null){br.close();}if(fw != null){fw.close();}} catch (Exception e2) {e2.printStackTrace();}}}/** * 过滤特殊词(单字动词,特殊符号) * @param inputPath * @param outPath * @throws Exception  */public static void filterWordDir2Dir(String inputFileDir,String outFileDir) throws Exception{File inputDir = new File(inputFileDir);File outputDir = new File(outFileDir);if(!inputDir.exists()){throw new Exception("can not read the inputDir");}if(!outputDir.exists()){outputDir.mkdirs();}try {ArrayList<String> paths = new StringUtil().getAllPath(inputFileDir);for(String path : paths){String name = StringUtil.getNameFromPath(path);String outputFileStr = outFileDir+ConstantString.slash+name+ConstantString.postText;filterWordFile2File(path,outputFileStr);}} catch (Exception e) {e.printStackTrace();}}

结果:



过滤后:



二、      候选词去重

思路: 用 list.contains() 来判断


/** * 利用list.contains(temp)去除txt中重复的候选词 * @param inputPath * @param outputPath */public static void delRepWords(String inputPath,String outputPath){//try {List<String> list = new ArrayList<String>();String result = "";File file = new File(inputPath);InputStream is = new FileInputStream(file);InputStreamReader isr = new InputStreamReader(is,"gb2312");BufferedReader br = new BufferedReader(isr);String temp = "";while((temp=br.readLine()) != null){if(!list.contains(temp)){list.add(temp);result += temp+ConstantString.WIN_NextLine;}}br.close();StringUtil.String2File(result, outputPath);} catch (Exception e) {e.printStackTrace();}}

去重前:


去重后:





0 0