使用Lucene词频统计与d3.cloud展示的中文英文词云系统
来源:互联网 发布:xlsx 导入 java 编辑:程序博客网 时间:2024/05/20 00:12
用Lucene 4.7对文本进行词频统计,使用的分词器为IKAnalyzer,对中文的支持较低。
中文词频统计仅包含中文,英文词频统计仅包含英文。可在中文解析器与英文解析器中进行修改。
在获取到排序好的词频后,可使用d3.cloud在web中展示出来。
Github地址:github.com/panzejia/WebGuide
package cn.iflin.project.participle;import java.io.File;import java.io.IOException;import java.util.ArrayList;import java.util.Collections;import java.util.Comparator;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.FieldType;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.DocsEnum;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.Terms;import org.apache.lucene.index.TermsEnum;import org.apache.lucene.search.DocIdSetIterator;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.BytesRef;import org.apache.lucene.util.Version;import org.wltea.analyzer.lucene.IKAnalyzer;import cn.iflin.project.model.WordModel;import cn.iflin.project.participle.englishwords.EnglishParser;import cn.iflin.project.participle.wordcloud.ChineseParser;/**主要进行对中文与英语进行词频计算*/public class WordsParticiple { /** * 判断文件夹是否存在之后调用索引计算词频 */ private static ArrayList<WordModel> checkFile(String text, String articleId) { ArrayList<WordModel> wordList = new ArrayList<WordModel>(); return wordList; } private static void addDoc(IndexWriter w, String text) throws IOException { Document doc = new Document(); FieldType ft = new FieldType(); ft.setIndexed(true);// 存储 ft.setStored(true);// 索引 ft.setStoreTermVectors(true); ft.setTokenized(true); ft.setStoreTermVectorPositions(true);// 存储位置 ft.setStoreTermVectorOffsets(true);// 存储偏移量 doc.add(new Field("text", text, ft)); w.addDocument(doc); } /** * 删除文件 * * @param dir * @return */ private static boolean deleteDir(File dir) { if (dir.isDirectory()) { String[] children = dir.list(); for (int i = 0; i < children.length; i++) { boolean success = deleteDir(new File(dir, children[i])); if (!success) { return false; } } } // 目录此时为空,可以删除 return dir.delete(); } 
/** * 计算英文词频 * * @param text * @return * @throws IOException */ public static ArrayList<WordModel> getTF(String text, String articleId, String tag) throws IOException { ArrayList<WordModel> wordList = new ArrayList<WordModel>(); File file = new File("C:\\Spider\\WordCloud_Lucene\\" + articleId); WordsParticiple.deleteDir(file); Analyzer analyzer = new IKAnalyzer(true);// 智能分词模式,如果构造函数参数为false,那么使用最细粒度分词。 IndexWriterConfig configfile = new IndexWriterConfig(Version.LUCENE_47, analyzer);// 创建索引的配置信息 Directory fileindex; fileindex = FSDirectory.open(file); IndexWriter filew = new IndexWriter(fileindex, configfile); try { WordsParticiple.addDoc(filew, text); } finally { // 统一释放内存 filew.close(); } try { IndexReader reader = DirectoryReader.open(fileindex); for (int i = 0; i < reader.numDocs(); i++) { int docId = i; Terms terms = reader.getTermVector(docId, "text"); if (terms == null) continue; TermsEnum termsEnum = terms.iterator(null); BytesRef thisTerm = null; while ((thisTerm = termsEnum.next()) != null) { String termText = thisTerm.utf8ToString(); DocsEnum docsEnum = termsEnum.docs(null, null); while ((docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if(tag.equals("noLevel")){ WordModel wm = new WordModel(); wm.setWord(termText); wm.setWordFrequency(docsEnum.freq()); wordList.add(wm); } else if (EnglishParser.checkEnglishWord(termText, tag)) { WordModel wm = new WordModel(); wm.setWord(termText); wm.setWordFrequency(docsEnum.freq()); wordList.add(wm); } } } } reader.close(); fileindex.close(); } catch (Exception e) { e.printStackTrace(); } return wordList; } /** * 计算中文词频 * * @param text * @return * @throws IOException */ public static ArrayList<WordModel> getTF(String text, String articleId) throws IOException { ArrayList<WordModel> wordList = new ArrayList<WordModel>(); File file = new File("C:\\Spider\\WordCloud_Lucene\\" + articleId); WordsParticiple.deleteDir(file); Analyzer analyzer = new IKAnalyzer(true);// 智能分词模式,如果构造函数参数为false,那么使用最细粒度分词。 
IndexWriterConfig configfile = new IndexWriterConfig(Version.LUCENE_47, analyzer);// 创建索引的配置信息 Directory fileindex; fileindex = FSDirectory.open(file); IndexWriter filew = new IndexWriter(fileindex, configfile); try { WordsParticiple.addDoc(filew, text); } finally { // 统一释放内存 filew.close(); } try { IndexReader reader = DirectoryReader.open(fileindex); for (int i = 0; i < reader.numDocs(); i++) { int docId = i; Terms terms = reader.getTermVector(docId, "text"); if (terms == null) continue; TermsEnum termsEnum = terms.iterator(null); BytesRef thisTerm = null; while ((thisTerm = termsEnum.next()) != null) { String termText = thisTerm.utf8ToString(); DocsEnum docsEnum = termsEnum.docs(null, null); while ((docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (ChineseParser.isChinese(termText) && termText.length() >= 2) { WordModel wm = new WordModel(); wm.setWord(termText); wm.setWordFrequency(docsEnum.freq()); wordList.add(wm); } } } } reader.close(); fileindex.close(); } catch (Exception e) { e.printStackTrace(); } return wordList; }}
接下来对英语进行词频排序
package cn.iflin.project.participle.englishwords;import java.io.File;import java.io.IOException;import java.util.ArrayList;import java.util.Collections;import java.util.Comparator;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.DocsEnum;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.Terms;import org.apache.lucene.index.TermsEnum;import org.apache.lucene.search.DocIdSetIterator;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.BytesRef;import org.apache.lucene.util.Version;import org.wltea.analyzer.lucene.IKAnalyzer;import cn.iflin.project.model.WordModel;import cn.iflin.project.participle.WordsParticiple;import cn.iflin.project.participle.wordcloud.CalculateChineseFrequency;public class CalculateEnglishFrequency extends WordsParticiple { /** * 返回排序后的词频 * * @param text 需分词内容 * @param articleId 分词内容 属性 (数据库or用户自定义(temp)) * @param tag English:siji、liuji、kaoyan;Chinese:chinese * @return */ public static ArrayList<WordModel> getWordFre(String text, String articleId, String tag) { ArrayList<WordModel> wordList = new ArrayList<WordModel>(); try { //过滤掉常见标点符号 text = EnglishParser.delPunctuation(text); wordList = getTF(text, articleId, tag); } catch (IOException e) { e.printStackTrace(); } // 定义排序规则 class SortByFre implements Comparator { public int compare(Object o1, Object o2) { WordModel s1 = (WordModel) o1; WordModel s2 = (WordModel) o2; return s2.getWordFrequency().compareTo(s1.getWordFrequency()); } } Collections.sort(wordList, new SortByFre()); return wordList; }}
最后是对中文词频进行排序
package cn.iflin.project.participle.wordcloud;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStreamReader;import java.util.ArrayList;import java.util.Collections;import java.util.Comparator;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.FieldType;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.DocsEnum;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.Terms;import org.apache.lucene.index.TermsEnum;import org.apache.lucene.search.DocIdSetIterator;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.BytesRef;import org.apache.lucene.util.Version;import org.wltea.analyzer.lucene.IKAnalyzer;import cn.iflin.project.model.WordModel;import cn.iflin.project.participle.WordsParticiple;import cn.iflin.project.participle.englishwords.EnglishParser;/** * 计算词频 * * @author Jaypan * */public class CalculateChineseFrequency extends WordsParticiple{ /** * 返回排序后的词频 * * @param text 需分词内容 * @param articleId 分词内容 属性 (数据库or用户自定义(temp)) * @param tag English:siji、liuji、kaoyan;Chinese:chinese * @return */ public static ArrayList<WordModel> getWordFre(String text, String articleId) { ArrayList<WordModel> wordList = new ArrayList<WordModel>(); try { wordList = getTF(text, articleId); } catch (IOException e) { e.printStackTrace(); } // 定义排序规则 class SortByFre implements Comparator { public int compare(Object o1, Object o2) { WordModel s1 = (WordModel) o1; WordModel s2 = (WordModel) o2; return s2.getWordFrequency().compareTo(s1.getWordFrequency()); } } Collections.sort(wordList, new SortByFre()); return wordList; }}
英语解析器
package cn.iflin.project.participle.englishwords;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.InputStreamReader;import java.util.ArrayList;import java.util.regex.Matcher;import java.util.regex.Pattern;public class EnglishParser {// public static void main(String[] args) {// testReg("B. Closing offices on holidays.");// } public static String delPunctuation(String text) { // 先去掉标点,再合并空格 Pattern p = Pattern.compile("[(.|,|\"|\\?|!|:;')]");// 这边增加所有的符号,例如要加一个'则变成[(.|,|\"|\\?|!|:|')],如果是特殊符号要加转换 Matcher m = p.matcher(text);// 这为要整理的字符串 String first = m.replaceAll(""); p = Pattern.compile(" {2,}"); m = p.matcher(first); String second = m.replaceAll(""); return second; } //判断单词是否属于某一分类 public static boolean checkEnglishWord(String checkWord, String englishClass) { ArrayList<String> words = getEnglishWords(englishClass); for (String word : words) { if (word.equals(checkWord)) { return true; } } return false; } /** * 获取单词列表 * * @param sourceName * 单词级别 * @return 单词列表 */ public static ArrayList<String> getEnglishWords(String sourceName) { ArrayList<String> words = new ArrayList<String>(); String filePath = "C:\\Spider\\EnglishWords\\" + sourceName + ".txt"; try { String encoding = "GBK"; File file = new File(filePath); if (file.isFile() && file.exists()) { InputStreamReader read = new InputStreamReader(new FileInputStream(file), encoding); BufferedReader br = new BufferedReader(read); String lineText = null; while ((lineText = br.readLine()) != null) { words.add(lineText); } read.close(); } else { System.out.println("找不到指定文件"); } } catch (Exception e) { System.out.println("读取文件出错"); e.printStackTrace(); } return words; }}
中文解析器
package cn.iflin.project.participle.wordcloud;import java.util.regex.Matcher;import java.util.regex.Pattern;public class ChineseParser { // 判断一个字符是否是中文 private static boolean isChinese(char c) { return c >= 0x4E00 && c <= 0x9FA5;// 根据字节码判断 } // 判断一个字符串是否含有中文 public static boolean isChinese(String str) { if (str == null) return false; for (char c : str.toCharArray()) { if (isChinese(c)) return true;// 有一个中文字符就返回 } return false; } //将一句话转换成数组 public static String[] changeList(String s) { String[] data =new String[20]; Pattern pattern = Pattern.compile("[\u4e00-\u9fa5]*"); Matcher matcher = pattern.matcher(s); int i =0; while (matcher.find()){ if(matcher.group().equals("")){ continue; } data[i]=matcher.group(); i++; } return data; }}
阅读全文
0 0
- 使用Lucene词频统计与d3.cloud展示的中文英文词云系统
- 使用storm统计英文版<<圣经>>的词频
- 基于LUCENE的java词频统计
- 用ruby统计英文文章的词频
- 统计一个英文文本的单词词频
- 英文文本词频统计
- java 英文词频统计
- 中文分词与词频统计实例
- Python jieba 中文分词与词频统计
- Hadoop中文词频统计
- 词频统计系统
- c++ 统计英文文本中每个单词的词频并且按照词频对每行排序
- c++ 统计英文文本中每个单词的词频并且按照词频对每行排序
- 使用python对中文文档进行词频统计
- 毕业设计-基于深度神经网络的语音关键词检出系统-使用python脚本作词频统计-TIMIT
- 毕业设计-基于深度神经网络的语音关键词检出系统-使用python脚本作词频统计-Librispeech
- C++ 对一段英文进行词频统计
- C++ 对一段英文进行词频统计
- 2017 09 03 小结
- 17.9.3日报
- ajax 的过程
- ArrayList遍历时不能写
- HDU
- 使用Lucene词频统计与d3.cloud展示的中文英文词云系统
- 数据库(3)---基本操作
- Kotlin 实践项目(密码本)
- Spring和它的AOP
- 首发博客,记录心情
- qml如何进行动态翻译
- 2017年8月18日提高组T2 队伍统计
- 文本三剑客之sed
- 大量数据如何排序