Mahout之bayes算法学习(三)

来源:互联网 发布:听课软件哪个好 编辑:程序博客网 时间:2024/05/01 18:19

本来只是简单的将 classify-20newsgroups.sh 中的linux 命令转换成java 代码;在已知 相关java 工具类的情况下 代码实现是很简单的。 但是由于 测试数据量太大,第一步序列化数据的时候就报错了;报错原因是 hadoop-eclipse-plugin 连接hadoop时,由于数据量过大,造成读取数据失败。说来也惭愧!!!这个bug我至今还没想到好的解决办法,困于自身苦无计策;也只能暂时搁置于此了。

有的时候,既然一时想不出好的方法,那就暂时跳过这个问题。 测试 其他工具类的方法,主函数代码如下:

// Step 1: serialize the raw text files into Hadoop SequenceFiles
mahout_seqdirectory();
// Step 2: vectorize the sequence files into TF-IDF sparse vectors
mahout_seq2sparse();

// Step 3: split the vector data into a training set and a test set
mahout_split();
// Step 4: train the model; the boolean selects complementary naive Bayes (the -c flag)
mahout_trainnb(true);
// Step 5: evaluate on the held-out test data; same boolean flag as training
mahout_testnb(true);

提示:第一步序列化可能有问题;如果执行报错,建议先在 Linux 上用命令行执行该步骤,生成下一步需要的数据文件夹。

将 Java 代码打包成 jar,放到服务器上运行。

工具类方法实现如下:

// HDFS base directory under which every step of the pipeline reads/writes its data.
private static final String WORK_DIR = "hdfs://192.168.9.72:9000/tmp/mahout-work-java-sh";

/*
 * Java equivalent of:
 *   echo "Converting sequence files to vectors"
 *   ./bin/mahout seq2sparse \
 *     -i ${WORK_DIR}/20news-seq \
 *     -o ${WORK_DIR}/20news-vectors  -lnorm -nv  -wt tfidf
 */

/**
 * Converts the SequenceFiles under {@code 20news-seq} into log-normalized,
 * named TF-IDF sparse vectors under {@code 20news-vectors}.
 * A pre-existing output directory is deleted first so the step can be rerun.
 * Exits the JVM with status 2 if the conversion fails.
 */
public static void mahout_seq2sparse() {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String input = WORK_DIR + Path.SEPARATOR + "20news-seq";
        String output = WORK_DIR + Path.SEPARATOR + "20news-vectors";

        Path in = new Path(input);
        Path out = new Path(output);

        FileSystem fs = FileSystem.get(conf);

        if (fs.exists(in)) {
            if (fs.exists(out)) {
                // second argument = recursive delete
                fs.delete(out, true);
            }

            SparseVectorsFromSequenceFiles svfsf = new SparseVectorsFromSequenceFiles();
            String[] params = new String[]{"-i", input, "-o", output, "-lnorm", "-nv", "-wt", "tfidf"};
            ToolRunner.run(svfsf, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("序列化文件转换成向量失败!");
        // BUG FIX: the original printed the exit code (System.out.println(2))
        // instead of terminating; use System.exit(2) like the sibling methods.
        System.exit(2);
    }
}



/*
 *   echo "Creating sequence files from 20newsgroups data"
 *   ./bin/mahout seqdirectory \
 *     -i ${WORK_DIR}/20news-all \
 *     -o ${WORK_DIR}/20news-seq -ow
 */

/**
 * Serializes the raw 20newsgroups text directory {@code 20news-all} into
 * Hadoop SequenceFiles under {@code 20news-seq}. A stale output directory is
 * removed before running. Exits the JVM with status 1 on failure.
 */
public static void mahout_seqdirectory() {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String srcDir = WORK_DIR + Path.SEPARATOR + "20news-all";
        String dstDir = WORK_DIR + Path.SEPARATOR + "20news-seq";

        FileSystem fs = FileSystem.get(conf);
        Path srcPath = new Path(srcDir);
        Path dstPath = new Path(dstDir);

        // Nothing to do unless the input directory exists.
        if (fs.exists(srcPath)) {
            // Clear any previous output (recursive delete).
            if (fs.exists(dstPath)) {
                fs.delete(dstPath, true);
            }
            ToolRunner.run(new SequenceFilesFromDirectory(),
                    new String[]{"-i", srcDir, "-o", dstDir});
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("文件序列化失败!");
        System.exit(1);
    }
}




/*
 *   echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
 *   ./bin/mahout split \
 *     -i ${WORK_DIR}/20news-vectors/tfidf-vectors \
 *     --trainingOutput ${WORK_DIR}/20news-train-vectors \
 *     --testOutput ${WORK_DIR}/20news-test-vectors  \
 *     --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
 */

/**
 * Randomly splits the TF-IDF vectors into a training set and a holdout
 * (test) set, running sequentially on SequenceFile input. Both output
 * directories are wiped beforehand. Exits the JVM with status 3 on failure.
 */
public static void mahout_split() {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String vectorsDir = WORK_DIR + Path.SEPARATOR + "20news-vectors" + Path.SEPARATOR + "tfidf-vectors";
        String trainDir = WORK_DIR + Path.SEPARATOR + "20news-train-vectors";
        String testDir = WORK_DIR + Path.SEPARATOR + "20news-test-vectors";

        FileSystem fs = FileSystem.get(conf);
        Path vectorsPath = new Path(vectorsDir);
        Path trainPath = new Path(trainDir);
        Path testPath = new Path(testDir);

        // Only split when the vectorized input actually exists.
        if (fs.exists(vectorsPath)) {
            // Remove stale outputs (recursive delete) before re-splitting.
            if (fs.exists(trainPath)) {
                fs.delete(trainPath, true);
            }
            if (fs.exists(testPath)) {
                fs.delete(testPath, true);
            }

            String[] args = {
                    "-i", vectorsDir,
                    "--trainingOutput", trainDir,
                    "--testOutput", testDir,
                    "--randomSelectionPct", "40",
                    "--overwrite", "--sequenceFiles",
                    "-xm", "sequential"
            };
            ToolRunner.run(new SplitInput(), args);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("数据拆分成 训练数据 和 检测数据 失败!");
        System.exit(3);
    }
}


/*
 *   echo "Training Naive Bayes model"
 *   ./bin/mahout trainnb \
 *     -i ${WORK_DIR}/20news-train-vectors -el \
 *     -o ${WORK_DIR}/model \
 *     -li ${WORK_DIR}/labelindex \
 *     -ow $c
 */

/**
 * Trains the naive Bayes model from the training vectors, extracting labels
 * (-el) and writing the model and label index; stale outputs are deleted
 * first. Exits the JVM with status 3 on failure.
 *
 * @param completelyNB when true, passes {@code -c} to train the
 *                     complementary variant of naive Bayes
 */
public static void mahout_trainnb(boolean completelyNB) {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String trainDir = WORK_DIR + Path.SEPARATOR + "20news-train-vectors";
        String modelDir = WORK_DIR + Path.SEPARATOR + "model";
        String labelIndexDir = WORK_DIR + Path.SEPARATOR + "labelindex";

        FileSystem fs = FileSystem.get(conf);
        Path trainPath = new Path(trainDir);
        Path modelPath = new Path(modelDir);
        Path labelPath = new Path(labelIndexDir);

        // Only train when the training vectors exist.
        if (fs.exists(trainPath)) {
            // Remove stale model/label-index outputs (recursive delete).
            if (fs.exists(modelPath)) {
                fs.delete(modelPath, true);
            }
            if (fs.exists(labelPath)) {
                fs.delete(labelPath, true);
            }

            // "-c" toggles complementary naive Bayes.
            String[] args = completelyNB
                    ? new String[]{"-i", trainDir, "-el", "-o", modelDir, "-li", labelIndexDir, "-ow", "-c"}
                    : new String[]{"-i", trainDir, "-el", "-o", modelDir, "-li", labelIndexDir, "-ow"};
            ToolRunner.run(new TrainNaiveBayesJob(), args);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("生成训练模型失败!");
        System.exit(3);
    }
}


/*
 *   echo "Testing on holdout set"
 *   ./bin/mahout testnb \
 *     -i ${WORK_DIR}/20news-test-vectors\
 *     -m ${WORK_DIR}/model \
 *     -l ${WORK_DIR}/labelindex \
 *     -ow -o ${WORK_DIR}/20news-testing $c
 */

/**
 * Evaluates the trained naive Bayes model on the holdout test vectors,
 * writing the classification results to {@code 20news-testing}. Runs only
 * when the test vectors, model and label index all exist; a stale output
 * directory is deleted first. Exits the JVM with status 3 on failure.
 * NOTE(review): exit code 3 is shared with mahout_split/mahout_trainnb;
 * a distinct code would make failures easier to tell apart.
 *
 * @param completelyNB when true, passes {@code -c} to test with the
 *                     complementary variant of naive Bayes
 */
public static void mahout_testnb(boolean completelyNB) {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String input = WORK_DIR + Path.SEPARATOR + "20news-test-vectors";
        String model = WORK_DIR + Path.SEPARATOR + "model";
        String labelindex = WORK_DIR + Path.SEPARATOR + "labelindex";
        String output = WORK_DIR + Path.SEPARATOR + "20news-testing";

        Path in = new Path(input);
        Path modelIn = new Path(model);
        Path labelIn = new Path(labelindex);
        Path out = new Path(output);

        FileSystem fs = FileSystem.get(conf);

        // All three inputs produced by the earlier steps must be present.
        if (fs.exists(in) && fs.exists(modelIn) && fs.exists(labelIn)) {
            if (fs.exists(out)) {
                // second argument = recursive delete
                fs.delete(out, true);
            }

            TestNaiveBayesDriver tnbd = new TestNaiveBayesDriver();
            String[] params;
            if (completelyNB) {
                params = new String[]{"-i", input, "-m", model, "-l", labelindex, "-o", output, "-ow", "-c"};
            } else {
                params = new String[]{"-i", input, "-m", model, "-l", labelindex, "-o", output, "-ow"};
            }
            ToolRunner.run(tnbd, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        // BUG FIX: the original message was copy-pasted from mahout_trainnb
        // ("生成训练模型失败!") and misreported a training failure; this method
        // tests the model, so report the testing step as the failure.
        System.out.println("检测测试数据失败!");
        System.exit(3);
    }
}

0 0