第一个mapreduce

来源：互联网发布：ubuntu wily 源编辑：程序博客网时间：2024/06/05 22:33

Hadoop 环境搭好了，那么就试着写第一个 MapReduce 程序吧，以《Hadoop: The Definitive Guide》中按年统计最高温度的例子为例。温度文件的具体格式请参考该书。

原作者提供了温度文件的sample.txt，有5行记录作为测试，上传服务器。

代码和书中基本一致，只是书中的例子使用的是早期的 JobClient API，这里改为使用 Job。

1. 建立项目

在 Eclipse 中新建一个 Java project，引入 hadoop-core.jar。

建3个类mapper: MaxTemperatureMapper，reducer:MaxTemperatureReducer，主程序:MaxTemperature，然后打成jar包，上传到hadoop机器上即可。

在hadoop上运行：

hadoop jar ~/jars/maxTemperature.jar MaxTemperature ~/data/test/sample.txt output

附录：

MaxTemperatureMapper.java

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Mapper;

public class MaxTemperatureMapper extends Mapper<LongWritable,Text,Text,IntWritable>{

private static final int MISSING=9999;

public void map(LongWritable key,Text value,Context context)

throws IOException,InterruptedException{

String line=value.toString();

String year=line.substring(15,19);

int temperature;

if(line.charAt(87)=='+'){

temperature=Integer.parseInt(line.substring(87,92));

}else {

temperature=Integer.parseInt(line.substring(87,92));

}

String quality=line.substring(92,93);

//正则匹配

if(temperature!=MISSING && quality.matches("[01459]")){

context.write(new Text(year), new IntWritable(temperature));

}

MaxTemperatureReducer.java

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Reducer;

public class MaxTemperatureReducer extends Reducer<Text,IntWritable,Text,IntWritable>{

public void reduce(Text key,Iterable<IntWritable> values,Context context)

throws IOException,InterruptedException{

int maxTemp=Integer.MIN_VALUE;

for(IntWritable temp:values){

maxTemp=Math.max(maxTemp, temp.get());

}

context.write(key, new IntWritable(maxTemp));

}

MaxTemperature.java

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MaxTemperature {

public static void main(String[] args) throws Exception{

if(args.length!=2){

System.err.println("usage: MaxTemperature <input path> <output path>");

System.exit(-1);

}

long begainTime=System.currentTimeMillis();

Job job=new Job();

job.setJarByClass(MaxTemperature.class);

FileInputFormat.addInputPath(job, new Path(args[0]));

FileOutputFormat.setOutputPath( job, new Path(args[1]));

job.setMapperClass(MaxTemperatureMapper.class);

job.setReducerClass(MaxTemperatureReducer.class);

job.setOutputKeyClass(Text.class);

job.setOutputValueClass(IntWritable.class);

boolean status=job.waitForCompletion(true);

System.out.printf("runing time(ms) : %d",System.currentTimeMillis()-begainTime);

System.exit(status?0:1);

}