Mahout之bayes算法学习(三)
来源:互联网 发布:听课软件哪个好 编辑:程序博客网 时间:2024/05/01 18:19
本来只是简单的将 classify-20newsgroups.sh 中的linux 命令转换成java 代码;在已知 相关java 工具类的情况下 代码实现是很简单的。 但是由于 测试数据量太大,第一步序列化数据的时候就报错了;报错原因是 hadoop-eclipse-plugin 连接hadoop时,由于数据量过大,造成读取数据失败。说来也惭愧!!!这个bug我至今还没想到好的解决办法,困于自身苦无计策;也只能暂时搁置于此了。
有的时候,既然一时想不出好的方法,那就暂时跳过这个问题。 测试 其他工具类的方法,主函数代码如下:
// Step 1: serialize the raw text corpus into Hadoop SequenceFiles
mahout_seqdirectory();
// Step 2: vectorize the sequence files (TF-IDF sparse vectors)
mahout_seq2sparse();
// Step 3: split the vectors into a training set and a test set
mahout_split();
// Step 4: train the model; the boolean selects complete naive Bayes (-c flag)
mahout_trainnb(true);
// Step 5: evaluate on the holdout set; the boolean selects complete naive Bayes
mahout_testnb(true);
提示: 第一步序列化可能有问题;建议如果执行报错,可以先在 Linux 上用命令执行,生成下一步需要的数据文件夹。
将java 打包jar;放到服务器上面运行。
工具类方法实现如下:
// HDFS root directory for every input/intermediate/output path used by this workflow.
private static final String WORK_DIR = "hdfs://192.168.9.72:9000/tmp/mahout-work-java-sh";
/*
 * Java equivalent of:
 *   echo "Converting sequence files to vectors"
 *   ./bin/mahout seq2sparse \
 *     -i ${WORK_DIR}/20news-seq \
 *     -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf
 */
/**
 * Converts the 20news SequenceFiles into TF-IDF weighted sparse vectors.
 * Any previous output directory is deleted first; on failure the process
 * exits with code 2.
 */
public static void mahout_seq2sparse(){
try {
Configuration conf = new Configuration();
conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
String input = WORK_DIR+Path.SEPARATOR+"20news-seq";
String output = WORK_DIR+Path.SEPARATOR+"20news-vectors";
Path in = new Path(input);
Path out = new Path(output);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(in)){
if(fs.exists(out)){
// second argument: delete recursively
fs.delete(out, true);
}
SparseVectorsFromSequenceFiles svfsf = new SparseVectorsFromSequenceFiles();
String[] params = new String[]{"-i",input,"-o",output,"-lnorm","-nv","-wt","tfidf"};
ToolRunner.run(svfsf, params);
}
} catch (Exception e) {
e.printStackTrace();
System.out.println("序列化文件转换成向量失败!");
// BUG FIX: was System.out.println(2), which merely printed "2" and let the
// program continue. Every sibling step terminates with a distinct exit code
// on failure; exit with code 2 as clearly intended.
System.exit(2);
}
}
/*
 * Java equivalent of:
 *   echo "Creating sequence files from 20newsgroups data"
 *   ./bin/mahout seqdirectory \
 *     -i ${WORK_DIR}/20news-all \
 *     -o ${WORK_DIR}/20news-seq -ow
 */
/**
 * Turns the raw 20newsgroups text directory into Hadoop SequenceFiles,
 * removing any stale output directory first. Does nothing when the input
 * directory is absent; exits with code 1 on failure.
 */
public static void mahout_seqdirectory(){
try {
Configuration hadoopConf = new Configuration();
hadoopConf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
String srcDir = WORK_DIR + Path.SEPARATOR + "20news-all";
String dstDir = WORK_DIR + Path.SEPARATOR + "20news-seq";
FileSystem fs = FileSystem.get(hadoopConf);
Path srcPath = new Path(srcDir);
if (fs.exists(srcPath)) {
Path dstPath = new Path(dstDir);
if (fs.exists(dstPath)) {
// true = recursive delete of the stale output
fs.delete(dstPath, true);
}
ToolRunner.run(new SequenceFilesFromDirectory(),
new String[]{"-i", srcDir, "-o", dstDir});
}
} catch (Exception e) {
e.printStackTrace();
System.out.println("文件序列化失败!");
System.exit(1);
}
}
/*
 * Java equivalent of:
 *   echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
 *   ./bin/mahout split \
 *     -i ${WORK_DIR}/20news-vectors/tfidf-vectors \
 *     --trainingOutput ${WORK_DIR}/20news-train-vectors \
 *     --testOutput ${WORK_DIR}/20news-test-vectors \
 *     --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
 */
/**
 * Randomly splits the TF-IDF vectors into a training set and a holdout
 * (test) set, deleting any previous split outputs first. Exits with code 3
 * on failure.
 */
public static void mahout_split(){
try {
Configuration hadoopConf = new Configuration();
hadoopConf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
String vectorsDir = WORK_DIR+Path.SEPARATOR+"20news-vectors"+Path.SEPARATOR+"tfidf-vectors";
String trainDir = WORK_DIR+Path.SEPARATOR+"20news-train-vectors";
String testDir = WORK_DIR+Path.SEPARATOR+"20news-test-vectors";
FileSystem fs = FileSystem.get(hadoopConf);
if (fs.exists(new Path(vectorsDir))) {
// remove stale outputs from any previous run (recursive delete)
for (String stale : new String[]{trainDir, testDir}) {
Path stalePath = new Path(stale);
if (fs.exists(stalePath)) {
fs.delete(stalePath, true);
}
}
String[] args = new String[]{"-i", vectorsDir,
"--trainingOutput", trainDir, "--testOutput", testDir,
"--randomSelectionPct", "40", "--overwrite", "--sequenceFiles", "-xm", "sequential"};
ToolRunner.run(new SplitInput(), args);
}
} catch (Exception e) {
e.printStackTrace();
System.out.println("数据拆分成 训练数据 和 检测数据 失败!");
System.exit(3);
}
}
/*
 * Java equivalent of:
 *   echo "Training Naive Bayes model"
 *   ./bin/mahout trainnb \
 *     -i ${WORK_DIR}/20news-train-vectors -el \
 *     -o ${WORK_DIR}/model \
 *     -li ${WORK_DIR}/labelindex \
 *     -ow $c
 */
/**
 * Trains the naive Bayes model from the training vectors, deleting any
 * stale model/labelindex outputs first. Exits with code 3 on failure.
 *
 * @param completelyNB when true, adds the -c flag for complete naive Bayes
 */
public static void mahout_trainnb(boolean completelyNB){
try {
Configuration hadoopConf = new Configuration();
hadoopConf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
String trainDir = WORK_DIR + Path.SEPARATOR + "20news-train-vectors";
String modelDir = WORK_DIR + Path.SEPARATOR + "model";
String labelFile = WORK_DIR + Path.SEPARATOR + "labelindex";
FileSystem fs = FileSystem.get(hadoopConf);
if (fs.exists(new Path(trainDir))) {
// clear outputs from any previous run (recursive delete)
for (String stale : new String[]{modelDir, labelFile}) {
Path stalePath = new Path(stale);
if (fs.exists(stalePath)) {
fs.delete(stalePath, true);
}
}
String[] args = completelyNB
? new String[]{"-i", trainDir, "-el", "-o", modelDir, "-li", labelFile, "-ow", "-c"}
: new String[]{"-i", trainDir, "-el", "-o", modelDir, "-li", labelFile, "-ow"};
ToolRunner.run(new TrainNaiveBayesJob(), args);
}
} catch (Exception e) {
e.printStackTrace();
System.out.println("生成训练模型失败!");
System.exit(3);
}
}
/*
 * Java equivalent of:
 *   echo "Testing on holdout set"
 *   ./bin/mahout testnb \
 *     -i ${WORK_DIR}/20news-test-vectors\
 *     -m ${WORK_DIR}/model \
 *     -l ${WORK_DIR}/labelindex \
 *     -ow -o ${WORK_DIR}/20news-testing $c
 */
/**
 * Evaluates the trained naive Bayes model on the holdout (test) vectors,
 * deleting any stale testing output first. Requires the test vectors, the
 * model, and the label index to all exist on HDFS.
 *
 * @param completelyNB when true, adds the -c flag for complete naive Bayes
 */
public static void mahout_testnb(boolean completelyNB){
try {
Configuration conf = new Configuration();
conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
String input = WORK_DIR+Path.SEPARATOR+"20news-test-vectors";
String model = WORK_DIR+Path.SEPARATOR+"model";
String labelindex = WORK_DIR+Path.SEPARATOR+"labelindex";
String output = WORK_DIR+Path.SEPARATOR+"20news-testing";
Path in = new Path(input);
Path modelIn = new Path(model);
Path labelIn = new Path(labelindex);
Path out = new Path(output);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(in) && fs.exists(modelIn)&& fs.exists(labelIn)){
if(fs.exists(out)){
// second argument: delete recursively
fs.delete(out, true);
}
TestNaiveBayesDriver tnbd = new TestNaiveBayesDriver();
String[] params =null;
if(completelyNB){
params = new String[]{"-i",input,"-m",model,"-l",labelindex,"-o",output,"-ow","-c"};
}else{
params = new String[]{"-i",input,"-m",model,"-l",labelindex,"-o",output,"-ow"};
}
ToolRunner.run(tnbd, params);
}
} catch (Exception e) {
e.printStackTrace();
// BUG FIX: message was copy-pasted from mahout_trainnb ("生成训练模型失败!",
// i.e. "training the model failed") and misreported which step failed.
System.out.println("检测测试数据失败!");
// NOTE(review): exit code 3 is also used by mahout_split/mahout_trainnb;
// kept unchanged in case external scripts depend on it, but a distinct
// code would make failures easier to diagnose.
System.exit(3);
}
}
- Mahout之bayes算法学习(三)
- Mahout之bayes算法学习(一)
- Mahout之bayes算法学习(二)
- Mahout之bayes算法学习(四)
- Mahout分类算法学习之实现Naive Bayes分类示例
- 机器学习算法之三:5分钟上手Bayes
- hadoop学习-mahout-Bayes分类算法示例程序
- hadoop学习-mahout-Bayes分类算法示例程序
- mahout测试naive Bayes算法
- 机器学习之bayes算法
- hadoop下mahout bayes(贝叶斯)算法研究(1)
- hadoop下mahout bayes(贝叶斯)算法研究(2)
- hadoop下mahout bayes(贝叶斯)算法研究(1)
- hadoop下mahout bayes(贝叶斯)算法研究(1)
- hadoop下mahout bayes(贝叶斯)算法研究(1)
- hadoop下mahout bayes(贝叶斯)算法研究(2)
- Mahout学习之Mahout算法分类
- mahout学习之推荐算法
- c++中字符串转字符
- 哲理故事
- margin:0 auto;不居中的原因
- 美丽,颜色和评定中的大理石和石英
- 哲理故事
- Mahout之bayes算法学习(三)
- poj2762 Going from u to v or from v to u? --- 缩点+拓扑
- oracle RAC启动和停止
- 哲理故事
- 数据元素 概论
- 数据库系统中事务的ACID原则
- 哲理故事
- Android新版NDK环境配置(免Cygwin)
- Windows & Linux 文件格式之迷 < VI下删除文本中的^M>