Learning Mahout Clustering Algorithms: Canopy (Part 1)


The tutorials I found online vary widely in quality, and many of them simply do not work. This post pulls the working pieces together.

1. First, download the test data

It is easiest to download it from CSDN.

After downloading it (on Ubuntu), be sure to rename the file so that its extension is .data; otherwise the later steps will fail at runtime.
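For concreteness, here is a minimal sketch of the rename and the upload to HDFS. The downloaded file name synthetic_control_data.txt and the HDFS directory input/ are my assumptions, not from the original post:

    # assumed file name; the only requirement is that the extension ends up as .data
    mv synthetic_control_data.txt synthetic_control.data
    # assumed HDFS layout: raw text goes under input/, step 2 will write its sequence file to testdata/
    hadoop fs -mkdir input
    hadoop fs -put synthetic_control.data input/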

2. Convert the test data into a sequence file. The code borrows from 《Mahout算法解析与案例实战》, but the version in the book would not run for me no matter what I tried, so I modified the conversion code slightly.

package canopy;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

/**
 * Transform text data into VectorWritable sequence-file data.
 * @author fansy
 */
public class Text2VectorWritable {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: Text2VectorWritable <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "Text2VectorWritable");
        job.setJarByClass(Text2VectorWritable.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setMapperClass(Text2VectorWritableMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(VectorWritable.class);
        job.setReducerClass(Text2VectorWritableReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(VectorWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        SequenceFileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Mapper: parse each whitespace-separated line into a Mahout vector.
     */
    public static class Text2VectorWritableMapper
            extends Mapper<LongWritable, Text, LongWritable, VectorWritable> {
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // split the line on one or more whitespace characters
            String[] str = value.toString().split("\\s{1,}");
            Vector vector = new RandomAccessSparseVector(str.length);
            for (int i = 0; i < str.length; i++) {
                vector.set(i, Double.parseDouble(str[i]));
            }
            VectorWritable va = new VectorWritable(vector);
            context.write(key, va);
        }
    }

    /**
     * Reducer: do nothing but pass the vectors through.
     * @author fansy
     */
    public static class Text2VectorWritableReducer
            extends Reducer<LongWritable, VectorWritable, LongWritable, VectorWritable> {
        public void reduce(LongWritable key, Iterable<VectorWritable> values, Context context)
                throws IOException, InterruptedException {
            for (VectorWritable v : values) {
                context.write(key, v);
            }
        }
    }
}
Package the class above into a jar and run it against the test data. This step is straightforward: it is just an ordinary Hadoop job, and the result is a sequence file named part-r-00000.
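A hedged sketch of that invocation (the jar name canopy.jar is an assumption; the input path matches the upload above, and the output directory is set to testdata so that the job produces the testdata/part-r-00000 file used in step 3):

    hadoop jar canopy.jar canopy.Text2VectorWritable input/synthetic_control.data testdata

After the job finishes, hadoop fs -ls testdata should list part-r-00000.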

3. Run Canopy clustering on the sequence file with the following command

mahout canopy --input testdata/part-r-00000 --output output/canopy --distanceMeasure org.apache.mahout.common.distance.EuclideanDistanceMeasure --t1 80 --t2 55 --t3 80 --t4 55 --clustering
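To inspect the result, Mahout's clusterdump utility can be used. This is only a sketch: the exact name of the clusters directory (clusters-0 vs. clusters-0-final) and the presence of the clusteredPoints directory (written because of --clustering) depend on the Mahout version, so check with hadoop fs -ls output/canopy first:

    mahout clusterdump --input output/canopy/clusters-0-final --pointsDir output/canopy/clusteredPoints --output canopy-result.txt

canopy-result.txt on the local file system will then contain each canopy center together with the points assigned to it.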