基于朴素贝叶斯的垃圾邮件检测
来源:互联网 发布:已知矩阵 计算A5次α 编辑:程序博客网 时间:2024/05/01 20:08
package cn.zhf.test;

import java.io.*;
import java.util.*;

/**
 * Naive-Bayes spam detector for Chinese e-mail text.
 *
 * <p>Training text and the mail to classify are split into words with a
 * dictionary-based forward maximum-matching segmenter. Per-word frequencies in
 * the spam and legitimate corpora are turned into per-word spam probabilities,
 * which are then combined into a single spam probability for the mail.
 *
 * <p>All files are read with the GBK encoding. Read failures are reported on
 * {@code System.err} and whatever was read up to that point is used.
 */
public class SpamMailDetection {
    public static final String BASE_PATH = "C:\\Users\\zhf\\Desktop\\mail";
    /** Spam training corpus. */
    public static final String SPAM_PATH = BASE_PATH + "\\train_illegal.txt";
    /** Legitimate-mail training corpus. */
    public static final String OK_PATH = BASE_PATH + "\\train_legal.txt";
    /** The mail to classify. */
    public static final String EMAIL_PATH = BASE_PATH + "\\to_judge.txt";
    /** Dictionary used for word segmentation. */
    public static final String DICT_PATH = BASE_PATH + "\\dict.txt";

    /** Longest dictionary word (in chars) the segmenter will try to match. */
    private static final int MAX_WORD_LENGTH = 6;
    /** Encoding of the corpus, mail and dictionary files. */
    private static final String FILE_ENCODING = "gbk";

    public static void main(String[] args) {
        SpamMailDetection smc = new SpamMailDetection();
        // <word, frequency of word in the legitimate corpus>
        Map<String, Double> okmap = smc.createMailMap(OK_PATH);
        // <word, frequency of word in the spam corpus>
        Map<String, Double> spammap = smc.createMailMap(SPAM_PATH);
        Map<String, Double> ratemap = smc.createSpamProbabilityMap(spammap, okmap);
        double probability = smc.judgeMail(EMAIL_PATH, ratemap);
        // judgeMail returns the probability of SPAM, so > 0.5 means spam.
        if (probability > 0.5) {
            System.out.println("It's a spam mail.");
        } else {
            System.out.println("It's an ok mail.");
        }
    }

    /**
     * Reads and segments the given mail and returns the probability that it is spam.
     *
     * @param emailPath path of the mail file
     * @param ratemap   per-word spam probabilities, as built by
     *                  {@link #createSpamProbabilityMap(Map, Map)}
     * @return spam probability in [0, 1]; 0.5 when the mail holds no known word
     */
    public double judgeMail(String emailPath, Map<String, Double> ratemap) {
        return spamProbability(segment(readFile(emailPath)), ratemap);
    }

    /**
     * Combines the per-word spam probabilities of the given words:
     * P(spam | t1..tn) = (P1 x ... x Pn) / (P1 x ... x Pn + (1-P1) x ... x (1-Pn)).
     *
     * <p>The formula is evaluated as a sum of log-odds, which is algebraically
     * the same but does not underflow to 0/0 on long mails. Words missing from
     * {@code ratemap} are ignored; every occurrence of a known word counts.
     *
     * @param words   segmented words of the mail
     * @param ratemap per-word spam probabilities
     * @return spam probability in [0, 1]; 0.5 when there is no evidence or the
     *         evidence is contradictory (both a probability-1 and a probability-0 word)
     */
    public static double spamProbability(List<String> words, Map<String, Double> ratemap) {
        double logOdds = 0.0;
        for (String word : words) {
            Double p = ratemap.get(word);
            if (p != null) {
                logOdds += Math.log(p) - Math.log(1 - p);
            }
        }
        if (Double.isNaN(logOdds)) {
            // +inf and -inf evidence cancelled out (or a rate was invalid): undecided.
            return 0.5;
        }
        return 1.0 / (1.0 + Math.exp(-logOdds));
    }

    /**
     * Builds a {@code <word, relative frequency>} map from a corpus file.
     *
     * @param filePath path of the corpus file
     * @return for each distinct word, occurrences divided by the total word count
     */
    public Map<String, Double> createMailMap(String filePath) {
        return wordFrequencies(segment(readFile(filePath)));
    }

    /**
     * Computes the relative frequency of every distinct word in the list.
     *
     * @param words segmented words, duplicates included
     * @return for each distinct word, occurrences divided by {@code words.size()};
     *         empty when {@code words} is empty
     */
    public static Map<String, Double> wordFrequencies(List<String> words) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String word : words) {
            Integer seen = counts.get(word);
            counts.put(word, seen == null ? 1 : seen + 1);
        }
        Map<String, Double> frequencies = new HashMap<String, Double>();
        double total = words.size(); // double so the division below is not truncated
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            frequencies.put(entry.getKey(), entry.getValue() / total);
        }
        return frequencies;
    }

    /**
     * Builds the map {@code <word, P(spam | word)>} with
     * P(spam | t) = Pspam(t) / (Pok(t) + Pspam(t)).
     *
     * <p>Only words present in {@code spammap} get an entry. A word whose
     * combined frequency is not positive carries no evidence and is left out
     * instead of being stored as NaN.
     *
     * @param spammap word frequencies in the spam corpus
     * @param okmap   word frequencies in the legitimate corpus
     * @return per-word spam probabilities
     */
    public Map<String, Double> createSpamProbabilityMap(Map<String, Double> spammap,
            Map<String, Double> okmap) {
        Map<String, Double> probabilities = new HashMap<String, Double>();
        for (Map.Entry<String, Double> entry : spammap.entrySet()) {
            double spamRate = entry.getValue();
            Double okRate = okmap.get(entry.getKey());
            double allRate = okRate == null ? spamRate : spamRate + okRate;
            if (allRate > 0) {
                probabilities.put(entry.getKey(), spamRate / allRate);
            }
        }
        return probabilities;
    }

    /**
     * Segments Chinese text using the dictionary stored at {@link #DICT_PATH}.
     * The dictionary is loaded on every call.
     *
     * @param str text to segment
     * @return dictionary words found in {@code str}, in order of appearance
     */
    public List<String> segment(String str) {
        return segment(str, loadDict().keySet());
    }

    /**
     * Segments text by forward maximum matching: at each position the longest
     * dictionary word of at most 6 chars is taken; when nothing matches, one
     * char is skipped.
     *
     * @param str        text to segment
     * @param dictionary known words
     * @return dictionary words found in {@code str}, in order of appearance
     */
    public static List<String> segment(String str, Set<String> dictionary) {
        List<String> words = new ArrayList<String>();
        int len = str.length();
        int start = 0;
        while (start < len) {
            // Try the longest candidate first, shrinking until a word matches.
            int end = Math.min(start + MAX_WORD_LENGTH, len);
            while (end > start && !dictionary.contains(str.substring(start, end))) {
                end--;
            }
            if (end > start) {
                words.add(str.substring(start, end));
                start = end;
            } else {
                start++; // unknown char: skip it
            }
        }
        return words;
    }

    /**
     * Loads the dictionary file at {@link #DICT_PATH}. The word is the text
     * before the first tab of each line; lines with an empty word are skipped.
     *
     * @return map whose keys are the dictionary words (all values are 0)
     */
    public Map<String, Integer> loadDict() {
        Map<String, Integer> dict = new HashMap<String, Integer>();
        for (String line : readLines(DICT_PATH)) {
            int tab = line.indexOf('\t');
            String word = tab < 0 ? line : line.substring(0, tab);
            if (!word.isEmpty()) {
                dict.put(word, 0);
            }
        }
        return dict;
    }

    /**
     * Reads a whole file into one string. Line terminators are dropped, so
     * consecutive lines are joined without a separator.
     *
     * @param filePath path of the file
     * @return file content; empty when the file cannot be read
     */
    public String readFile(String filePath) {
        StringBuilder text = new StringBuilder();
        for (String line : readLines(filePath)) {
            text.append(line);
        }
        return text.toString();
    }

    /** Reads all lines of a GBK file; on failure reports it and returns the lines read so far. */
    private static List<String> readLines(String path) {
        List<String> lines = new ArrayList<String>();
        InputStream in = null;
        try {
            in = new FileInputStream(path);
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, FILE_ENCODING));
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } catch (IOException e) {
            System.err.println("Failed to read " + path);
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
        return lines;
    }
}
0 0
- 基于朴素贝叶斯的垃圾邮件检测
- 基于朴素贝叶斯的垃圾邮件检测
- 整理-分类-朴素贝叶斯分类-垃圾邮件检测
- 朴素贝叶斯 垃圾邮件检测 Naive Bayes Spam detection
- 朴素贝叶斯---过滤垃圾邮件
- 朴素贝叶斯过滤垃圾邮件
- 《机器学习实战》学习笔记:基于朴素贝叶斯的垃圾邮件过滤
- 自然语言处理一:基于朴素贝叶斯的语种检测
- 朴素贝叶斯算法----垃圾邮件识别
- R 朴素贝叶斯 垃圾邮件分类
- 朴素贝叶斯与垃圾邮件分类
- 基于朴素贝叶斯的中文垃圾邮件分类器
- 机器学习手记[3]---朴素贝叶斯识别垃圾邮件的应用
- Python实现基于朴素贝叶斯的垃圾邮件分类 标签: python朴素贝叶斯垃圾邮件分类 2016-04-20 15:09 2750人阅读 评论(1) 收藏 举报 分类: 机器学习(19) 听说
- 基于卷积神经网络(CNN)的中文垃圾邮件检测
- 基于卷积神经网络(CNN)的中文垃圾邮件检测
- 基于贝叶斯方法的垃圾邮件过滤工具
- 基于贝叶斯的垃圾邮件过滤器 JAVA
- 一步步学习SPD2010--第一章节--探索SPD2010(2)--使用SharePoint Designer完成普通任务
- C#中使用Win32和其他库
- Hadoop-2.3.0记录
- C++ list容器应用
- DB2的COALESCE函数
- 基于朴素贝叶斯的垃圾邮件检测
- 1023. 组个最小数 (20)
- 诊断oracle性能
- 控件命名规范
- 转:女大学生设计避孕套获风投青睐
- GIS学习资料
- C++中的static关键字
- 中国石化视频会议/即时通讯/监控应急指挥平台转让
- 动态设置网页的各种Meta