mapreduce代码整理之简单的kmeans聚类
来源:互联网 发布:金融书籍知乎 编辑:程序博客网 时间:2024/06/07 13:31
我们的目标是将如下坐标点分成两类,从数据分布上可以很清楚地看到它们确实能明确地分为两类。要分类的数据在 kmeans.txt 里,初始中心在 cluster.center.conf.txt 里,把这两个文件都上传到 HDFS 中。
Utils.java
import java.io.IOException;import java.util.ArrayList;import java.util.List;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FSDataInputStream;import org.apache.hadoop.fs.FSDataOutputStream;import org.apache.hadoop.fs.FileStatus;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IOUtils;import org.apache.hadoop.io.Text;import org.apache.hadoop.util.LineReader;public class Utils { //读取中心文件的数据 public static ArrayList<ArrayList<Double>> getCentersFromHDFS(String centersPath,boolean isDirectory) throws IOException{ ArrayList<ArrayList<Double>> result = new ArrayList<ArrayList<Double>>(); Path path = new Path(centersPath); Configuration conf = new Configuration(); FileSystem fileSystem = path.getFileSystem(conf); if(isDirectory){ FileStatus[] listFile = fileSystem.listStatus(path); for (int i = 0; i < listFile.length; i++) { result.addAll(getCentersFromHDFS(listFile[i].getPath().toString(),false)); } return result; } FSDataInputStream fsis = fileSystem.open(path); LineReader lineReader = new LineReader(fsis, conf); Text line = new Text(); while(lineReader.readLine(line) > 0){ ArrayList<Double> tempList = textToArray(line); result.add(tempList); } lineReader.close(); return result; } //删掉文件 public static void deletePath(String pathStr) throws IOException{//删除以前的文件 Configuration conf = new Configuration(); Path path = new Path(pathStr); FileSystem hdfs = path.getFileSystem(conf); hdfs.delete(path ,true); } public static ArrayList<Double> textToArray(Text text){ //从一行中提取出两个double型 ArrayList<Double> list = new ArrayList<Double>(); String[] fileds = text.toString().split(","); for(String val:fileds){ System.out.println(val); } for(int i=0;i<fileds.length;i++){ list.add(Double.parseDouble(fileds[i])); } return list; } public static boolean compareCenters(String centerPath,String newPath) throws IOException{//查看两个中心是否没变化并返回一个布尔型 List<ArrayList<Double>> oldCenters = 
Utils.getCentersFromHDFS(centerPath,false); List<ArrayList<Double>> newCenters = Utils.getCentersFromHDFS(newPath,true); int size = oldCenters.size(); int fildSize = oldCenters.get(0).size(); double distance = 0; for(int i=0;i<size;i++){ for(int j=0;j<fildSize;j++){ double t1 = Math.abs(oldCenters.get(i).get(j)); double t2 = Math.abs(newCenters.get(i).get(j)); distance += Math.pow((t1 - t2) / (t1 + t2), 2); } } if(distance == 0.0){ //删掉新的中心文件以便最后依次归类输出 Utils.deletePath(newPath); return true; }else{ //先清空中心文件,将新的中心文件复制到中心文件中,再删掉中心文件 Configuration conf = new Configuration(); Path outPath = new Path(centerPath); FileSystem fileSystem = outPath.getFileSystem(conf); FSDataOutputStream overWrite = fileSystem.create(outPath,true); overWrite.writeChars(""); overWrite.close(); Path inPath = new Path(newPath); FileStatus[] listFiles = fileSystem.listStatus(inPath); for (int i = 0; i < listFiles.length; i++) { FSDataOutputStream out = fileSystem.create(outPath); FSDataInputStream in = fileSystem.open(listFiles[i].getPath()); IOUtils.copyBytes(in, out, 4096, true); } //删掉新的中心文件以便第二次任务运行输出 Utils.deletePath(newPath); } return false; }}
MapReduce.java
import java.io.IOException;import java.util.ArrayList;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import java.util.Arrays;import java.util.Iterator;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;public class MapReduce { public static class Map extends Mapper<LongWritable, Text, IntWritable, Text>{ //中心集合 ArrayList<ArrayList<Double>> centers = null; //用k个中心 int k = 0; //读取中心 protected void setup(Context context) throws IOException, InterruptedException { centers = Utils.getCentersFromHDFS(context.getConfiguration().get("centersPath"),false); k = centers.size(); } /** * 1.每次读取一条要分类的条记录与中心做对比,归类到对应的中心 * 2.以中心ID为key,中心包含的记录为value输出(例如: 1 0.2 。 1为聚类中心的ID,0.2为靠近聚类中心的某个值) */ protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { //读取一行数据 ArrayList<Double> fileds = Utils.textToArray(value); int sizeOfFileds = fileds.size(); double minDistance = 99999999; int centerIndex = 0; //依次取出k个中心点与当前读取的记录做计算 for(int i=0;i<k;i++){ double currentDistance = 0; for(int j=0;j<sizeOfFileds;j++){ double centerPoint = Math.abs(centers.get(i).get(j)); double filed = Math.abs(fileds.get(j)); currentDistance += Math.pow((centerPoint - filed) / (centerPoint + filed), 2); } //循环找出距离该记录最接近的中心点的ID if(currentDistance<minDistance){ minDistance = currentDistance; centerIndex = i; } } //以中心点为Key 将记录原样输出 context.write(new IntWritable(centerIndex+1), value); } } //利用reduce的归并功能以中心为Key将记录归并到一起 public static class Reduce extends Reducer<IntWritable, Text, Text, Text>{ /** * 1.Key为聚类中心的ID value为该中心的记录集合 * 2.计数所有记录元素的平均值,求出新的中心 */ protected void reduce(IntWritable key, Iterable<Text> value,Context context) throws 
IOException, InterruptedException { ArrayList<ArrayList<Double>> filedsList = new ArrayList<ArrayList<Double>>(); //依次读取记录集,每行为一个ArrayList<Double> for(Iterator<Text> it =value.iterator();it.hasNext();){ ArrayList<Double> tempList = Utils.textToArray(it.next()); filedsList.add(tempList); } //计算新的中心 //每行的元素个数 int filedSize = filedsList.get(0).size(); double[] avg = new double[filedSize]; for(int i=0;i<filedSize;i++){ //求没列的平均值 double sum = 0; int size = filedsList.size(); for(int j=0;j<size;j++){ sum += filedsList.get(j).get(i); } avg[i] = sum / size; } context.write(new Text("") , new Text(Arrays.toString(avg).replace("[", "").replace("]", ""))); } } @SuppressWarnings("deprecation") public static void run(String centerPath,String dataPath,String newCenterPath,boolean runReduce) throws IOException, ClassNotFoundException, InterruptedException{ Configuration conf = new Configuration(); conf.set("centersPath", centerPath); Job job = new Job(conf, "mykmeans"); job.setJarByClass(MapReduce.class); job.setMapperClass(Map.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(Text.class); if(runReduce){ //最后依次输出不许要reduce job.setReducerClass(Reduce.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); } FileInputFormat.addInputPath(job, new Path(dataPath)); FileOutputFormat.setOutputPath(job, new Path(newCenterPath)); System.out.println(job.waitForCompletion(true)); } public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException { String localhost = "back03"; String centerPath = "hdfs://"+localhost+":9000/user/root/in/cluster.center.conf.txt"; String dataPath = "hdfs://"+localhost+":9000/user/root/in/kmeans.txt"; String newCenterPath = "hdfs://"+localhost+":9000//user/root/out/kmeans"; int count = 0; while(true){ run(centerPath,dataPath,newCenterPath,true); System.out.println(" 第 " + ++count + " 次计算 "); if(Utils.compareCenters(centerPath,newCenterPath )){ 
run(centerPath,dataPath,newCenterPath,false); break; } } } }最后输出为
1114,1151100,1001105,1091108,1041110,11122,222,424,224,426,626,828,628,8210,1721,825,621,922,326.6,7.726.7,5.721.2,3.221.6,7.825.4,3.628.9,2.920.02,3.6825.6,4.9
可以看到很明确的分为了两类。
0 0
- mapreduce代码整理之简单的kmeans聚类
- MapReduce 运行Kmeans代码
- mapreduce代码整理之wordcount
- mapreduce代码整理之sort
- kmeans基于mapreduce的实现
- mapreduce代码整理之MyInputFormat(自定义InputFormat)
- Java实现简单的kmeans聚类
- 我的kmeans聚类 python代码
- MapReduce的Kmeans聚类算法
- MapReduce框架实现的kmeans算法
- Kmeans 的MapReduce实现原理
- mapreduce代码整理之mywritable(自定义writable)
- mapreduce代码整理之实现压缩输出减少网络传输
- sparkML之kmeans聚类
- 数据挖掘--kmeans聚类算法mapreduce实现代码<转>
- 数据挖掘--kmeans聚类算法mapreduce实现 代码
- 数据挖掘--kmeans聚类算法mapreduce实现 代码
- 我的 kmeans matlab 代码
- 线性代数mooc课(一)
- QT开发(三十五)——QT进程间通信
- QT开发(三十)——计算器实例开发
- QT开发(三十一)——NotePad实例开发
- QT开发教程版权声明
- mapreduce代码整理之简单的kmeans聚类
- QT开发(五十二)———QML语言
- QT开发(五十三)———QML基本元素
- QT开发(五十四)———QML组件
- QT开发(五十五)———Qt Quick Controls
- QT开发(六十一)———Json简介
- QT开发(六十二)———QT5解析Json文件
- QT开发(六十三)——QT事件机制分析
- QT开发(六十四)——QT样式表(一)