Computing TF-IDF in Java with the IK analyzer



The Doc class, which represents a single document

package pojo;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

/**
 * A document: holds the raw fields extracted from the corpus plus the
 * token list and token counts produced by IK segmentation.
 *
 * @version v0.1, 17/09/04
 */
public class Doc {

    /** whether this document has already been segmented */
    private boolean isSeg = false;

    /** title */
    private String title;

    /** url */
    private String url;

    /** document content */
    private String content;

    /** document number */
    private String docno;

    /** token list produced by segmentation */
    private List<String> segs;

    /** token -> occurrence count */
    private Map<String, Integer> segMap;

    public Doc() {}

    /**
     * @param url     document url
     * @param docno   document number
     * @param title   document title
     * @param content document content
     */
    public Doc(String url, String docno, String title, String content) {
        this.title   = title;
        this.url     = url;
        this.content = content;
        this.docno   = docno;
    }

    /**
     * Segments the document content with the IK analyzer (smart mode),
     * filling the token list and the token-count map.
     */
    public void seg() {
        if (isSeg()) {    // skip documents that were already segmented
            System.out.println("already segmented");
            return;
        }

        Reader      reader      = new StringReader(content);
        IKSegmenter ikSegmenter = new IKSegmenter(reader, true);
        Lexeme      lexeme      = null;

        segs   = new ArrayList<String>();
        segMap = new HashMap<String, Integer>();

        try {
            while ((lexeme = ikSegmenter.next()) != null) {
                String text = lexeme.getLexemeText();

                System.out.println(text);    // debug output of each token
                segs.add(text);

                if (segMap.containsKey(text)) {
                    segMap.put(text, segMap.get(text) + 1);
                } else {
                    segMap.put(text, 1);
                }
            }

            isSeg = true;
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    public String toString() {
        return "Doc{" +
                "isSeg=" + isSeg +
                ", title='" + title + '\'' +
                ", url='" + url + '\'' +
                ", content='" + content + '\'' +
                ", docno='" + docno + '\'' +
                ", segs=" + segs +
                ", segMap=" + segMap +
                '}';
    }

    public String getContent() {
        return content;
    }

    public String getDocno() {
        return docno;
    }

    /** @return whether this document has already been segmented */
    public boolean isSeg() {
        return isSeg;
    }

    public Map<String, Integer> getSegMap() {
        return segMap;
    }

    public List<String> getSegs() {
        return segs;
    }

    public String getTitle() {
        return title;
    }

    public String getUrl() {
        return url;
    }
}
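To check the segmentation in isolation, a minimal sketch like the following works; it assumes the IK Analyzer jar is on the classpath with its default dictionary, and the DocDemo class, url, docno, and sample text are all placeholders, not part of the original code:

package pojo;

/**
 * Hypothetical smoke test for Doc.seg(); all field values are made up.
 */
public class DocDemo {
    public static void main(String[] args) {
        Doc doc = new Doc("http://example.com", "demo-001", "demo title",
                          "中华人民共和国成立于1949年");

        doc.seg();                               // segment the content with IK
        System.out.println(doc.getSegs());       // token list
        System.out.println(doc.getSegMap());     // token -> count
    }
}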

The TfIdfUtil class, which computes TF-IDF

package util;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import pojo.Doc;

/**
 * Computes TF-IDF over a document collection.
 * Given a corpus file (here, the Sogou Chinese news open corpus),
 * it extracts each document's fields, segments the content,
 * computes tf, idf, and tf-idf for every term,
 * and writes the results to both the console and a target file.
 *
 * @version v0.1, 17/09/04
 * @author  Kiwi Liu
 */
public class TfIdfUtil {

    /** the document collection */
    private static List<Doc> docList = new ArrayList<Doc>();

    /** term frequencies per document: docno -> (term -> tf) */
    private static Map<String, Map<String, Double>> docSetTfMap = new HashMap<String, Map<String, Double>>();

    /** document frequency of each term in the collection */
    private static Map<String, Integer> docSetDcMap = new HashMap<String, Integer>();

    /** inverse document frequency of each term in the collection */
    private static Map<String, Double> docSetIdfMap = new HashMap<String, Double>();

    /**
     * Builds the document collection from the corpus file, then segments
     * and counts terms for each document. Regular expressions extract the
     * title, url, docno, and content, and strip the surrounding tags.
     * Note: the Sogou corpus is commonly GBK-encoded; if your platform
     * default charset differs, you may need an InputStreamReader with an
     * explicit charset instead of FileReader.
     *
     * @param corpusPath path to the corpus file
     */
    private static void createDocSet(String corpusPath) {
        StringBuffer corpusBuffer = new StringBuffer();    // corpus buffer
        String       line         = null;

        try {
            // read the whole corpus into memory
            BufferedReader bufferedReader = new BufferedReader(new FileReader(new File(corpusPath)));

            while ((line = bufferedReader.readLine()) != null) {
                corpusBuffer.append(line);
            }
            bufferedReader.close();

            // preprocessing: extract each document
            Pattern patternDoc = Pattern.compile("<doc>.*?</doc>");
            Matcher matcherDoc = patternDoc.matcher(corpusBuffer.toString());

            while (matcherDoc.find()) {
                String title   = null;
                String url     = null;
                String docno   = null;
                String content = null;
                String doc     = matcherDoc.group();

                // extract the title (reluctant .*? so the match stops at the first closing tag)
                Pattern patternTitle = Pattern.compile("<contenttitle>.*?</contenttitle>");
                Matcher matcherTitle = patternTitle.matcher(doc);

                if (matcherTitle.find()) {
                    title = matcherTitle.group();

                    // strip the tags
                    Pattern patternTag = Pattern.compile("<.*?>");
                    Matcher matcherTag = patternTag.matcher(title);

                    title = matcherTag.replaceAll(" ");
                    System.out.println("title: " + title);
                }

                // extract the url
                Pattern patternUrl = Pattern.compile("<url>.*?</url>");
                Matcher matcherUrl = patternUrl.matcher(doc);

                if (matcherUrl.find()) {
                    url = matcherUrl.group();

                    Pattern patternTag = Pattern.compile("<.*?>");
                    Matcher matcherTag = patternTag.matcher(url);

                    url = matcherTag.replaceAll("");
                    System.out.println("url: " + url);
                }

                // extract the document number
                Pattern patternDocno = Pattern.compile("<docno>.*?</docno>");
                Matcher matcherDocno = patternDocno.matcher(doc);

                if (matcherDocno.find()) {
                    docno = matcherDocno.group();

                    Pattern patternTag = Pattern.compile("<.*?>");
                    Matcher matcherTag = patternTag.matcher(docno);

                    docno = matcherTag.replaceAll("");
                    System.out.println("docno: " + docno);
                }

                // extract the content
                Pattern patternContent = Pattern.compile("<content>.*?</content>");
                Matcher matcherContent = patternContent.matcher(doc);

                if (matcherContent.find()) {
                    content = matcherContent.group();

                    Pattern patternTag = Pattern.compile("<.*?>");
                    Matcher matcherTag = patternTag.matcher(content);

                    content = matcherTag.replaceAll("");
                    System.out.println("content: " + content);
                }

                // create the document and segment it
                Doc d = new Doc(url, docno, title, content);

                d.seg();
                docList.add(d);
            }    // while
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes the inverse document frequency of every term in the collection.
     */
    private static void idf() {
        // count, for each term, the number of documents containing it
        for (Doc doc : docList) {
            for (String seg : doc.getSegMap().keySet()) {
                if (docSetDcMap.containsKey(seg)) {
                    docSetDcMap.put(seg, docSetDcMap.get(seg) + 1);
                } else {
                    docSetDcMap.put(seg, 1);
                }
            }
        }

        Set<String> segSet = docSetDcMap.keySet();

        // idf = log(size(doc set) / docs(d, w))
        for (String seg : segSet) {
            docSetIdfMap.put(seg, Math.log((double) docList.size() / (double) docSetDcMap.get(seg)));
        }
    }

    /**
     * Writes the results to the console and the target file.
     *
     * @param dstPath path to the target file
     */
    private static void print(String dstPath) {
        try {
            BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(new File(dstPath)));

            for (Doc doc : docList) {
                System.out.println(doc);
                bufferedWriter.write(doc.toString());
                bufferedWriter.newLine();

                for (String seg : doc.getSegMap().keySet()) {
                    double tf      = docSetTfMap.get(doc.getDocno()).get(seg);
                    double idf     = docSetIdfMap.get(seg);
                    String segInfo = seg + " tf: " + tf + " idf: " + idf + " tfidf: " + tf * idf;

                    System.out.println(segInfo);
                    bufferedWriter.write(segInfo);
                    bufferedWriter.newLine();
                }
            }

            bufferedWriter.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes the term frequency of every term in every document:
     * tf = count(w, d) / size(d)
     */
    private static void tf() {
        for (Doc doc : docList) {
            Map<String, Double> docTf = new HashMap<String, Double>();    // term frequencies of one document

            for (String seg : doc.getSegMap().keySet()) {
                // tf = count(w, d) / size(d)
                docTf.put(seg, (double) doc.getSegMap().get(seg) / (double) doc.getSegs().size());
            }

            docSetTfMap.put(doc.getDocno(), docTf);
        }
    }

    /**
     * Computes tf-idf for the whole corpus.
     *
     * @param corpusPath path to the corpus file
     * @param dstPath    path to the target file
     */
    public static void tfidf(String corpusPath, String dstPath) {
        createDocSet(corpusPath);
        tf();
        idf();
        print(dstPath);
    }
}
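To make the numbers concrete: with tf = count(w, d) / size(d) and idf = ln(N / docs(w)), a term occurring 3 times in a 100-token document and appearing in 10 of 1000 documents scores tf = 0.03, idf = ln(100) ≈ 4.605, so tf-idf ≈ 0.138. A driver that wires everything together could look like the sketch below; the TfIdfDemo class and both file paths are placeholders, not part of the original code:

package util;

/**
 * Hypothetical driver for TfIdfUtil; point the paths at a local copy
 * of the Sogou news corpus and a writable output file.
 */
public class TfIdfDemo {
    public static void main(String[] args) {
        String corpusPath = "data/news_tensite_xml.smarty.dat";    // placeholder corpus path
        String dstPath    = "data/tfidf_result.txt";               // placeholder output path

        // parses the corpus, segments every document, then computes
        // tf, idf, and tf-idf and writes the results to dstPath
        TfIdfUtil.tfidf(corpusPath, dstPath);
    }
}

Note that TfIdfUtil keeps all of its state in static fields, so tfidf should be called once per run; calling it again without clearing the maps would mix results from both corpora.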

