hadoop_7: MapReduce Code


Hello World (WordCount)

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCount {
  // Mapper input types are <Object, Text>: Object is a generic wrapper that can
  // box several types (RPC parameters and return values use it too).
  // Output types are <Text, IntWritable>: Text is a UTF-8 string wrapper.
  public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    // throws IOException for stream errors
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      // StringTokenizer splits the input line into words
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) { // hasMoreTokens(): are there tokens left?
        // nextToken() returns the substring up to the next delimiter;
        // store the word in `word`
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // result records the word's frequency
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      // sum the values of the received <key, value-list>
      for (IntWritable val : values) {
        sum += val.get();
      }
      // store the frequency in result
      result.set(sum);
      // emit the result
      context.write(key, result);
    }
  }
}
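The class above has no driver. Below is a minimal sketch of a main method that would run it, using standard job wiring; the job name, argument positions, and the reuse of the reducer as a combiner are assumptions, not part of the original.

// extra imports needed by the driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// hypothetical main method, added inside WordCount
public static void main(String[] args) throws Exception {
  Job job = new Job(new Configuration());
  job.setJobName("WordCount");
  job.setJarByClass(WordCount.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(IntSumReducer.class); // assumption: the reducer doubles as a combiner
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  FileInputFormat.addInputPath(job, new Path(args[0]));   // assumed input path argument
  FileOutputFormat.setOutputPath(job, new Path(args[1])); // assumed output path argument
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}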

Data Deduplication

// Sample
file_1:
2006-6-9  a
2006-6-10 b
2006-6-11 c
2006-6-12 d
2006-6-13 a
2006-6-14 b
2006-6-15 c
2006-6-11 c
file_2:
2006-6-9  b
2006-6-10 a
2006-6-11 b
2006-6-12 d
2006-6-13 a
2006-6-14 c
2006-6-15 d
2006-6-11 c
Output:
2006-6-10 a
2006-6-10 b
2006-6-11 b
2006-6-11 c
2006-6-12 d
2006-6-13 a
2006-6-14 b
2006-6-14 c
2006-6-15 c
2006-6-15 d
2006-6-9  a
2006-6-9  b
  • Algorithm
    1. Route every record to the same reduce logic; the shuffle merges identical keys, so each distinct record reaches the reducer exactly once.
    2. In each <key, value> pair, set the value to empty; the record itself serves as the key.
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class Dedup {
  // map: copy the input value to the output key and emit it directly
  public static class Map extends Mapper<Object, Text, Text, Text> {
    private Text line = new Text();

    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      line = value;
      context.write(line, new Text(""));
    }
  }

  // reduce: copy the input key to the output key and emit it directly
  public static class Reduce extends Reducer<Text, Text, Text, Text> {
    public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      context.write(key, new Text(""));
    }
  }
}
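The driver for this job follows the same wiring as the WordCount sketch above, substituting Dedup.Map and Dedup.Reduce and setting the map output key and value classes to Text.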

Sorting

file_1:
2
32
654
32
15
756
65223
file_2:
5956
22
650
92
file_3:
26
54
6
Output:
1 2
2 6
3 15
4 22
5 26
6 32
7 32
8 54
9 92
10 650
11 654
12 756
13 5956
14 65223
  • Algorithm:
    Wrap each key in an IntWritable. MapReduce sorts keys automatically, but only within each reducer; the default partitioning sends keys to arbitrary reducers, so it cannot produce a global order. Build a custom Partitioner that assigns each reducer a contiguous key range, then let the default sort order the data within each reducer.

    1. Wrap each input integer in an IntWritable.
    2. Override the partitioner to keep the output globally ordered: divide the key space into ranges of width (maximum input value / number of partitions) and route each key to the range that contains it (see the worked example after this list).
    3. Each reducer then sorts its own range; a global line-number counter supplies the output key.
    4. By contrast, the default HashPartitioner produces balanced partitions but scatters each key range across reducers.
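A worked example of the bound computation in the partitioner below: with maxNumber = 65223 and numPartitions = 3, bound = 65223 / 3 + 1 = 21742, so keys in [0, 21742) go to partition 0, keys in [21742, 43484) to partition 1, and keys from 43484 up to the maximum to partition 2.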
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;

public class Sort {
  // map: convert the input value to an IntWritable and use it as the output key
  public static class Map extends Mapper<Object, Text, IntWritable, IntWritable> {
    private static IntWritable data = new IntWritable();

    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      data.set(Integer.parseInt(line));
      context.write(data, new IntWritable(1));
    }
  }

  // reduce: copy the input key to the output value; the number of elements in
  // the value-list decides how many times the key is emitted.
  // The global counter linenum supplies each key's rank.
  public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    private static IntWritable linenum = new IntWritable(1);

    public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      for (IntWritable val : values) {
        context.write(linenum, key);
        linenum = new IntWritable(linenum.get() + 1);
      }
    }
  }

  // Custom Partitioner: derives the partition ID from the maximum input value
  // and the number of partitions, so each reducer owns a contiguous key range.
  public static class Partition extends Partitioner<IntWritable, IntWritable> {
    public int getPartition(IntWritable key, IntWritable value, int numPartitions) {
      int maxNumber = 65223; // largest value in the input data
      int bound = maxNumber / numPartitions + 1;
      int keyNumber = key.get();
      for (int i = 1; i <= numPartitions; i++) {
        if (keyNumber < bound * i && keyNumber >= bound * (i - 1)) {
          return i - 1;
        }
      }
      return -1;
    }
  }
}

// in main:
// job.setPartitionerClass(Sort.Partition.class);

Total Order Sorting

  • Built-in Partitioner implementations: HashPartitioner and TotalOrderPartitioner
  • TotalOrderPartitioner chooses partition boundaries by sampling the input, which guards against data skew
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler.RandomSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalSort {
  public static void main(String[] args) throws Exception {
    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    // path of the partition file
    Path partitionFile = new Path(args[2]);
    int reducerNumber = Integer.parseInt(args[3]);

    // RandomSampler arguments: the probability that a record is selected,
    // the number of samples to take, and the maximum number of input
    // splits to read
    RandomSampler<Text, Text> sampler = new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10);

    Configuration conf = new Configuration();
    TotalOrderPartitioner.setPartitionFile(conf, partitionFile);

    Job job = new Job(conf);
    job.setJobName("TotalSort");
    job.setJarByClass(TotalSort.class);
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(reducerNumber);
    // use the total-order partitioner
    job.setPartitionerClass(TotalOrderPartitioner.class);
    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    outputPath.getFileSystem(conf).delete(outputPath, true);
    // write the partition file
    InputSampler.writePartitionFile(job, sampler);
    System.out.println(job.waitForCompletion(true) ? 0 : 1);
  }
}
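Note that TotalOrderPartitioner reads the boundary keys from the partition file produced by InputSampler.writePartitionFile and routes each record to the reducer whose key range contains it, so concatenating the reducer outputs in order yields a globally sorted result.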

Secondary Sort

http://www.linuxidc.com/Linux/2014-03/98498.htm

sort1    1
sort2    3
sort2    77
sort2    54
sort1    2
sort6    22
sort6    221
sort6    20
Output:
sort1 1,2
sort2 3,54,77
sort6 20,22,221
  • Flow

  • Map-side processing: combine the original key and value into a new composite key, producing records in the following format:

{[sort1,1],1}
{[sort2,3],3}
{[sort2,77],77}
{[sort2,54],54}
{[sort1,2],2}
{[sort6,22],22}
{[sort6,221],221}
{[sort6,20],20}
  • Partitioner: only records whose composite key shares the same first field are sent to the same reducer for grouping and merging.

After this step, the data streams look like this:
Partition1: {[sort1,1],1}, {[sort1,2],2}
Partition2: {[sort2,3],3}, {[sort2,77],77}, {[sort2,54],54}
Partition3: {[sort6,22],22}, {[sort6,221],221}, {[sort6,20],20}

  • A custom comparator then sorts the composite keys:
    {[sort1,1],1}
    {[sort1,2],2}
    {[sort2,3],3}
    {[sort2,54],54}
    {[sort2,77],77}
    {[sort6,20],20}
    {[sort6,22],22}
    {[sort6,221],221}

  • Reduce-side processing groups values by the first field of the composite key:
    {sort1,[1,2]}
    {sort2,[3,54,77]}
    {sort6,[20,22,221]}

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

// map code
public class SecondaryMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // emit the value (the whole input line) as the key
    context.write(value, NullWritable.get());
  }
}

// partitioner code
// during partitioning, distribute records by the first sort field
// (the first field of the composite key)
public class KeyPartitioner extends HashPartitioner<Text, NullWritable> {
  public int getPartition(Text key, NullWritable value, int numReduceTasks) {
    return (key.toString().split(" ")[0].hashCode() & Integer.MAX_VALUE) % numReduceTasks;
  }
}

// custom sort comparator for the composite keys
public class SortComparator extends WritableComparator {
  protected SortComparator() {
    super(Text.class, true);
  }

  public int compare(WritableComparable key1, WritableComparable key2) {
    String[] fields1 = key1.toString().split(" ");
    String[] fields2 = key2.toString().split(" ");
    // if the first sort fields are equal, compare the second sort fields numerically
    if (fields1[0].equals(fields2[0])) {
      return Integer.compare(Integer.parseInt(fields1[1]), Integer.parseInt(fields2[1]));
    }
    // otherwise order by the first sort field
    return fields1[0].compareTo(fields2[0]);
  }
}

// reduce code
// the number of reduce tasks is set to 1; records must be regrouped by the
// original key before reduce (see the grouping comparator sketch below)
public class SecondaryReducer extends Reducer<Text, NullWritable, NullWritable, Text> {
  protected void reduce(Text key, Iterable<NullWritable> values, Context context)
      throws IOException, InterruptedException {
    // the grouping comparator advances the key as the values iterate,
    // so each composite key is written in sorted order
    for (NullWritable value : values) {
      context.write(NullWritable.get(), key);
    }
  }
}
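The reducer comment above mentions regrouping before reduce, but the grouping comparator itself is not shown. Here is a minimal sketch, assuming the same space-separated composite key; the class name GroupComparator and the driver lines are illustrative, not from the original.

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Hypothetical grouping comparator: groups records by the first field of the
// composite key, so one reduce() call sees all records for that field
public class GroupComparator extends WritableComparator {
  protected GroupComparator() {
    super(Text.class, true);
  }

  public int compare(WritableComparable key1, WritableComparable key2) {
    // compare only the first field; keys that tie here land in the same group
    String field1 = key1.toString().split(" ")[0];
    String field2 = key2.toString().split(" ")[0];
    return field1.compareTo(field2);
  }
}

// illustrative driver wiring:
// job.setPartitionerClass(KeyPartitioner.class);
// job.setSortComparatorClass(SortComparator.class);
// job.setGroupingComparatorClass(GroupComparator.class);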

Join

// student_info:
J 0001
H 0002
B 0003
// student_class_info:
0001 chinese
0001 math
0002 music
0002 math
0003 physic
// output after the join:
J chinese
J math
H music
H math
B physic
  • Algorithm
    1. Map phase: read each record, key it by the join column, and tag it with a flag indicating which file it came from.
    2. Reduce phase: for each join key, take the Cartesian product of the records from the two files.
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// map function
public class JoinMapper extends Mapper<LongWritable, Text, Text, Text> {
  public static final String LEFT_FILENAME = "student_info.txt";
  public static final String RIGHT_FILENAME = "student_class_info.txt";
  public static final String LEFT_FILENAME_FLAG = "l";
  public static final String RIGHT_FILENAME_FLAG = "r";

  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    String filePath = ((FileSplit) context.getInputSplit()).getPath().toString();
    String fileFlag = null;
    String joinKey = null;
    String joinValue = null;

    // determine which file the record came from
    if (filePath.contains(LEFT_FILENAME)) {
      fileFlag = LEFT_FILENAME_FLAG;
      joinKey = value.toString().split("\t")[1];
      joinValue = value.toString().split("\t")[0];
    } else if (filePath.contains(RIGHT_FILENAME)) {
      fileFlag = RIGHT_FILENAME_FLAG;
      joinKey = value.toString().split("\t")[0];
      joinValue = value.toString().split("\t")[1];
    }
    // emit the pair, tagged with the file it came from
    context.write(new Text(joinKey), new Text(joinValue + "\t" + fileFlag));
  }
}

// reduce function
public class JoinReduce extends Reducer<Text, Text, Text, Text> {
  public static final String LEFT_FILENAME_FLAG = "l";
  public static final String RIGHT_FILENAME_FLAG = "r";

  protected void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    Iterator<Text> iterator = values.iterator();
    List<String> studentClassNames = new ArrayList<String>();
    String studentName = "";

    while (iterator.hasNext()) {
      String[] infos = iterator.next().toString().split("\t");
      // determine which file the record came from
      if (infos[1].equals(LEFT_FILENAME_FLAG)) {
        studentName = infos[0];
      } else if (infos[1].equals(RIGHT_FILENAME_FLAG)) {
        studentClassNames.add(infos[0]);
      }
    }
    // Cartesian product of the student name and the class list
    for (int i = 0; i < studentClassNames.size(); i++) {
      context.write(new Text(studentName), new Text(studentClassNames.get(i)));
    }
  }
}
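A minimal driver sketch for wiring both input files into this job: the mapper distinguishes them by path, so both can be added to one FileInputFormat. The class name JoinDriver and the argument layout are assumptions, not from the original.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver: both input files go through the same JoinMapper,
// which tells them apart by their paths
public class JoinDriver {
  public static void main(String[] args) throws Exception {
    Job job = new Job(new Configuration());
    job.setJarByClass(JoinDriver.class);
    job.setMapperClass(JoinMapper.class);
    job.setReducerClass(JoinReduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // add both input files; args[0] and args[1] are assumed paths
    FileInputFormat.addInputPath(job, new Path(args[0])); // student_info.txt
    FileInputFormat.addInputPath(job, new Path(args[1])); // student_class_info.txt
    FileOutputFormat.setOutputPath(job, new Path(args[2]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}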