使用Lucene词频统计与d3.cloud展示的中文英文词云系统
来源:互联网 发布:xlsx 导入 java 编辑:程序博客网 时间:2024/05/20 00:12
用Lucene 4.7对文本进行词频统计,使用的分词器为IKAnalyzer,对中文的支持较低。
中文词频统计仅包含中文,英文词频统计仅包含英文。可在中文解析器与英文解析器中进行修改。
在获取到排序好的词频后,可使用d3.cloud在web中展示出来。
Github地址:github.com/panzejia/WebGuide
package cn.iflin.project.participle;import java.io.File;import java.io.IOException;import java.util.ArrayList;import java.util.Collections;import java.util.Comparator;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.FieldType;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.DocsEnum;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.Terms;import org.apache.lucene.index.TermsEnum;import org.apache.lucene.search.DocIdSetIterator;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.BytesRef;import org.apache.lucene.util.Version;import org.wltea.analyzer.lucene.IKAnalyzer;import cn.iflin.project.model.WordModel;import cn.iflin.project.participle.englishwords.EnglishParser;import cn.iflin.project.participle.wordcloud.ChineseParser;/**主要进行对中文与英语进行词频计算*/public class WordsParticiple { /** * 判断文件夹是否存在之后调用索引计算词频 */ private static ArrayList<WordModel> checkFile(String text, String articleId) { ArrayList<WordModel> wordList = new ArrayList<WordModel>(); return wordList; } private static void addDoc(IndexWriter w, String text) throws IOException { Document doc = new Document(); FieldType ft = new FieldType(); ft.setIndexed(true);// 存储 ft.setStored(true);// 索引 ft.setStoreTermVectors(true); ft.setTokenized(true); ft.setStoreTermVectorPositions(true);// 存储位置 ft.setStoreTermVectorOffsets(true);// 存储偏移量 doc.add(new Field("text", text, ft)); w.addDocument(doc); } /** * 删除文件 * * @param dir * @return */ private static boolean deleteDir(File dir) { if (dir.isDirectory()) { String[] children = dir.list(); for (int i = 0; i < children.length; i++) { boolean success = deleteDir(new File(dir, children[i])); if (!success) { return false; } } } // 目录此时为空,可以删除 return dir.delete(); } 
/** * 计算英文词频 * * @param text * @return * @throws IOException */ public static ArrayList<WordModel> getTF(String text, String articleId, String tag) throws IOException { ArrayList<WordModel> wordList = new ArrayList<WordModel>(); File file = new File("C:\\Spider\\WordCloud_Lucene\\" + articleId); WordsParticiple.deleteDir(file); Analyzer analyzer = new IKAnalyzer(true);// 智能分词模式,如果构造函数参数为false,那么使用最细粒度分词。 IndexWriterConfig configfile = new IndexWriterConfig(Version.LUCENE_47, analyzer);// 创建索引的配置信息 Directory fileindex; fileindex = FSDirectory.open(file); IndexWriter filew = new IndexWriter(fileindex, configfile); try { WordsParticiple.addDoc(filew, text); } finally { // 统一释放内存 filew.close(); } try { IndexReader reader = DirectoryReader.open(fileindex); for (int i = 0; i < reader.numDocs(); i++) { int docId = i; Terms terms = reader.getTermVector(docId, "text"); if (terms == null) continue; TermsEnum termsEnum = terms.iterator(null); BytesRef thisTerm = null; while ((thisTerm = termsEnum.next()) != null) { String termText = thisTerm.utf8ToString(); DocsEnum docsEnum = termsEnum.docs(null, null); while ((docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if(tag.equals("noLevel")){ WordModel wm = new WordModel(); wm.setWord(termText); wm.setWordFrequency(docsEnum.freq()); wordList.add(wm); } else if (EnglishParser.checkEnglishWord(termText, tag)) { WordModel wm = new WordModel(); wm.setWord(termText); wm.setWordFrequency(docsEnum.freq()); wordList.add(wm); } } } } reader.close(); fileindex.close(); } catch (Exception e) { e.printStackTrace(); } return wordList; } /** * 计算中文词频 * * @param text * @return * @throws IOException */ public static ArrayList<WordModel> getTF(String text, String articleId) throws IOException { ArrayList<WordModel> wordList = new ArrayList<WordModel>(); File file = new File("C:\\Spider\\WordCloud_Lucene\\" + articleId); WordsParticiple.deleteDir(file); Analyzer analyzer = new IKAnalyzer(true);// 智能分词模式,如果构造函数参数为false,那么使用最细粒度分词。 
IndexWriterConfig configfile = new IndexWriterConfig(Version.LUCENE_47, analyzer);// 创建索引的配置信息 Directory fileindex; fileindex = FSDirectory.open(file); IndexWriter filew = new IndexWriter(fileindex, configfile); try { WordsParticiple.addDoc(filew, text); } finally { // 统一释放内存 filew.close(); } try { IndexReader reader = DirectoryReader.open(fileindex); for (int i = 0; i < reader.numDocs(); i++) { int docId = i; Terms terms = reader.getTermVector(docId, "text"); if (terms == null) continue; TermsEnum termsEnum = terms.iterator(null); BytesRef thisTerm = null; while ((thisTerm = termsEnum.next()) != null) { String termText = thisTerm.utf8ToString(); DocsEnum docsEnum = termsEnum.docs(null, null); while ((docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (ChineseParser.isChinese(termText) && termText.length() >= 2) { WordModel wm = new WordModel(); wm.setWord(termText); wm.setWordFrequency(docsEnum.freq()); wordList.add(wm); } } } } reader.close(); fileindex.close(); } catch (Exception e) { e.printStackTrace(); } return wordList; }}
接下来对英语进行词频排序
package cn.iflin.project.participle.englishwords;import java.io.File;import java.io.IOException;import java.util.ArrayList;import java.util.Collections;import java.util.Comparator;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.DocsEnum;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.Terms;import org.apache.lucene.index.TermsEnum;import org.apache.lucene.search.DocIdSetIterator;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.BytesRef;import org.apache.lucene.util.Version;import org.wltea.analyzer.lucene.IKAnalyzer;import cn.iflin.project.model.WordModel;import cn.iflin.project.participle.WordsParticiple;import cn.iflin.project.participle.wordcloud.CalculateChineseFrequency;public class CalculateEnglishFrequency extends WordsParticiple { /** * 返回排序后的词频 * * @param text 需分词内容 * @param articleId 分词内容 属性 (数据库or用户自定义(temp)) * @param tag English:siji、liuji、kaoyan;Chinese:chinese * @return */ public static ArrayList<WordModel> getWordFre(String text, String articleId, String tag) { ArrayList<WordModel> wordList = new ArrayList<WordModel>(); try { //过滤掉常见标点符号 text = EnglishParser.delPunctuation(text); wordList = getTF(text, articleId, tag); } catch (IOException e) { e.printStackTrace(); } // 定义排序规则 class SortByFre implements Comparator { public int compare(Object o1, Object o2) { WordModel s1 = (WordModel) o1; WordModel s2 = (WordModel) o2; return s2.getWordFrequency().compareTo(s1.getWordFrequency()); } } Collections.sort(wordList, new SortByFre()); return wordList; }}
最后是对中文词频进行排序
package cn.iflin.project.participle.wordcloud;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStreamReader;import java.util.ArrayList;import java.util.Collections;import java.util.Comparator;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.FieldType;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.DocsEnum;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.Terms;import org.apache.lucene.index.TermsEnum;import org.apache.lucene.search.DocIdSetIterator;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.BytesRef;import org.apache.lucene.util.Version;import org.wltea.analyzer.lucene.IKAnalyzer;import cn.iflin.project.model.WordModel;import cn.iflin.project.participle.WordsParticiple;import cn.iflin.project.participle.englishwords.EnglishParser;/** * 计算词频 * * @author Jaypan * */public class CalculateChineseFrequency extends WordsParticiple{ /** * 返回排序后的词频 * * @param text 需分词内容 * @param articleId 分词内容 属性 (数据库or用户自定义(temp)) * @param tag English:siji、liuji、kaoyan;Chinese:chinese * @return */ public static ArrayList<WordModel> getWordFre(String text, String articleId) { ArrayList<WordModel> wordList = new ArrayList<WordModel>(); try { wordList = getTF(text, articleId); } catch (IOException e) { e.printStackTrace(); } // 定义排序规则 class SortByFre implements Comparator { public int compare(Object o1, Object o2) { WordModel s1 = (WordModel) o1; WordModel s2 = (WordModel) o2; return s2.getWordFrequency().compareTo(s1.getWordFrequency()); } } Collections.sort(wordList, new SortByFre()); return wordList; }}
英语解析器
package cn.iflin.project.participle.englishwords;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.InputStreamReader;import java.util.ArrayList;import java.util.regex.Matcher;import java.util.regex.Pattern;public class EnglishParser {// public static void main(String[] args) {// testReg("B. Closing offices on holidays.");// } public static String delPunctuation(String text) { // 先去掉标点,再合并空格 Pattern p = Pattern.compile("[(.|,|\"|\\?|!|:;')]");// 这边增加所有的符号,例如要加一个'则变成[(.|,|\"|\\?|!|:|')],如果是特殊符号要加转换 Matcher m = p.matcher(text);// 这为要整理的字符串 String first = m.replaceAll(""); p = Pattern.compile(" {2,}"); m = p.matcher(first); String second = m.replaceAll(""); return second; } //判断单词是否属于某一分类 public static boolean checkEnglishWord(String checkWord, String englishClass) { ArrayList<String> words = getEnglishWords(englishClass); for (String word : words) { if (word.equals(checkWord)) { return true; } } return false; } /** * 获取单词列表 * * @param sourceName * 单词级别 * @return 单词列表 */ public static ArrayList<String> getEnglishWords(String sourceName) { ArrayList<String> words = new ArrayList<String>(); String filePath = "C:\\Spider\\EnglishWords\\" + sourceName + ".txt"; try { String encoding = "GBK"; File file = new File(filePath); if (file.isFile() && file.exists()) { InputStreamReader read = new InputStreamReader(new FileInputStream(file), encoding); BufferedReader br = new BufferedReader(read); String lineText = null; while ((lineText = br.readLine()) != null) { words.add(lineText); } read.close(); } else { System.out.println("找不到指定文件"); } } catch (Exception e) { System.out.println("读取文件出错"); e.printStackTrace(); } return words; }}
中文解析器
package cn.iflin.project.participle.wordcloud;import java.util.regex.Matcher;import java.util.regex.Pattern;public class ChineseParser { // 判断一个字符是否是中文 private static boolean isChinese(char c) { return c >= 0x4E00 && c <= 0x9FA5;// 根据字节码判断 } // 判断一个字符串是否含有中文 public static boolean isChinese(String str) { if (str == null) return false; for (char c : str.toCharArray()) { if (isChinese(c)) return true;// 有一个中文字符就返回 } return false; } //将一句话转换成数组 public static String[] changeList(String s) { String[] data =new String[20]; Pattern pattern = Pattern.compile("[\u4e00-\u9fa5]*"); Matcher matcher = pattern.matcher(s); int i =0; while (matcher.find()){ if(matcher.group().equals("")){ continue; } data[i]=matcher.group(); i++; } return data; }}
阅读全文
0 0
- 使用Lucene词频统计与d3.cloud展示的中文英文词云系统
- 使用storm统计英文版<<圣经>>的词频
- 基于LUCENE的java词频统计
- 用ruby统计英文文章的词频
- 统计一个英文文本的单词词频
- 英文文本词频统计
- java 英文词频统计
- 中文分词与词频统计实例
- Python jieba 中文分词与词频统计
- Hadoop中文词频统计
- 词频统计系统
- c++ 统计英文文本中每个单词的词频并且按照词频对每行排序
- c++ 统计英文文本中每个单词的词频并且按照词频对每行排序
- 使用python对中文文档进行词频统计
- 毕业设计-基于深度神经网络的语音关键词检出系统-使用python脚本作词频统计-TIMIT
- 毕业设计-基于深度神经网络的语音关键词检出系统-使用python脚本作词频统计-Librispeech
- C++ 对一段英文进行词频统计
- C++ 对一段英文进行词频统计
- 2017 09 03 小结
- 17.9.3日报
- ajax 的过程
- ArrayList遍历时不能写
- HDU
- 使用Lucene词频统计与d3.cloud展示的中文英文词云系统
- 数据库(3)---基本操作
- Kotlin 实践项目(密码本)
- Spring和它的AOP
- 首发博客,记录心情
- qml如何进行动态翻译
- 2017年8月18日提高组T2 队伍统计
- 文本三剑客之sed
- 大量数据如何排序