TotalSort全排序（抽样取中值）

来源：互联网　发布：电子商务系统代码java　编辑：程序博客网　时间：2024/05/18 18:03
package com.zhiyou.bd17.mr1014;import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapred.lib.InputSampler;import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;public class TotalSort {public static class TotalSortMap extends Mapper<LongWritable, Text, IntWritable, Text> {private String[] infos;private IntWritable oKey = new IntWritable();private Text oValue= new Text();@Overrideprotected void map(LongWritable key, Text value, Mapper<LongWritable, Text, IntWritable, Text>.Context context)throws IOException, InterruptedException {infos = value.toString().split("\\s");oKey.set(Integer.valueOf(infos[1]));oValue.set(infos[0]);context.write(oKey, oValue);}}public static class TotalSortReduce extends Reducer<IntWritable, Text, Text, IntWritable> {@Overrideprotected void reduce(IntWritable key, Iterable<Text> values,Reducer<IntWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {for (Text value : values) {context.write(value, key);}}}//比较public static class WritableDescComparetor extends IntWritable.Comparator{@Overridepublic int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {// - 号表示从大到小排序，没有表示从小到大排序return -super.compare(b1, s1, l1, b2, s2, l2);}}//定义全排序jobpublic static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {//定义抽样Configuration configuration = new Configuration();InputSampler.Sampler<IntWritable,Text> sampler = new 
InputSampler.RandomSampler(0.6, 5);//设置分区文件FileSystem hdFileSystem = FileSystem.get(configuration);//Path partitionFile = new Path("/bd17/totalsort/_partition");Path partitionFile = new Path("/bd17/output/partition");//设置后，全排序的partitioner程序就会读取这个分区文件来完成按顺序进行分区TotalOrderPartitioner.setPartitionFile(configuration, partitionFile);//设置jobJob job = Job.getInstance(configuration);job.setJarByClass(TotalSort.class);job.setJobName("全排序");job.setMapperClass(Mapper.class);job.setReducerClass(TotalSortReduce.class);job.setMapOutputKeyClass(IntWritable.class);job.setMapOutputValueClass(Text.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(IntWritable.class);//设置分区文件加入分布式缓冲中job.addCacheFile(partitionFile.toUri());//设置分区器job.setPartitionerClass(TotalOrderPartitioner.class);//设置reducer节点个数job.setNumReduceTasks(2);//如果要倒叙排序的，方法之一是指定job的setSortComparator类型job.setSortComparatorClass(WritableDescComparetor.class);Path inputPath = new Path("/bd17/output/desdump3");Path outputDir = new Path("/bd17/output/totalSort3");hdFileSystem.delete(outputDir,true);hdFileSystem.delete(partitionFile,true);//map端的输入会把文本文件读取成kv对时，按照分隔符把一行分成两部分，前面key后面value//如果分隔符不存在，则整行都是key，value 则为空，默认分隔符是\t,//手动指定分隔符参数：mapreduce，input.keyvaluelinerecordreader.key.value.separator//job.setInputFormatClass(KeyValueTextInputFormat.class); //以文本字符串的形式存储job.setInputFormatClass(SequenceFileInputFormat.class);//以kv的形式存储FileInputFormat.addInputPath(job, inputPath);FileOutputFormat.setOutputPath(job, outputDir);//将随机抽样写入分区文件InputSampler.writePartitionFile(job, sampler);//启动jobSystem.exit(job.waitForCompletion(true)?0:1);}}
阅读全文
0 0