一个简单的hadoop job测试

来源：互联网发布：网络被劫持怎么处理编辑：程序博客网时间：2024/06/07 01:18

1：编写map类

package org.liufu.hadoop;

import java.io.IOException;
import java.net.InetAddress;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper that turns each fixed-width input line into a (year, temperature) pair.
 *
 * <p>The year is taken from characters [5, 9) of the line and the temperature from
 * characters [16, 18). The task also prints the host it runs on and its start time in
 * {@link #setup}, and the elapsed wall-clock time in {@link #cleanup}.
 */
public class Map extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Formats the task start timestamp as day-of-month plus 24-hour time. */
    private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    /** Moment at which {@link #setup} ran; used by {@link #cleanup} to compute elapsed time. */
    private Date startTime = null;

    /**
     * Parses one input line and emits the year as key and the temperature as value.
     *
     * @param keyIn   key of the input record (not used by this mapper)
     * @param valueIn the text of the input line
     * @param context sink for the mapper output
     * @throws StringIndexOutOfBoundsException if the line is shorter than 18 characters
     * @throws NumberFormatException if characters [16, 18) are not a valid integer
     */
    @Override
    protected void map(LongWritable keyIn, Text valueIn, Context context)
            throws IOException, InterruptedException {
        String valueLine = valueIn.toString();
        String year = valueLine.substring(5, 9);
        String wendu = valueLine.substring(16, 18);
        int wenduint = Integer.parseInt(wendu);

        // The context collects the mapper output. The key/value types written here must
        // match the last two type arguments of Mapper<LongWritable, Text, Text, IntWritable>.
        context.write(new Text(year), new IntWritable(wenduint));
    }

    /**
     * Records the start time and prints the local host name, its IP address and the
     * formatted start time.
     */
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        super.setup(context);
        InetAddress inetAddress = InetAddress.getLocalHost();
        startTime = new Date();
        // getHostAddress() yields the textual IP; getAddress() is a byte[] and would print
        // only the array's identity string.
        System.out.println(inetAddress.getHostName() + " : " + inetAddress.getHostAddress()
                + " 时间：" + sdf.format(startTime));
    }

    /**
     * Prints how long the task ran, in milliseconds, measured from {@link #setup}.
     */
    @Override
    protected void cleanup(Mapper<LongWritable, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        super.cleanup(context);
        // This is a duration, not a point in time, so it is printed as a plain number
        // rather than being run through the date formatter.
        long elapsedMillis = new Date().getTime() - startTime.getTime();
        System.out.println("时间：" + elapsedMillis + " ms");
    }
}

2：编写reduce类

package org.liufu.hadoop;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reducer that emits, for every key, the largest integer among that key's values.
 */
public class Red extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Scans all values of one key and writes the key together with the maximum value.
     * When there are no values, {@link Integer#MIN_VALUE} is written.
     *
     * @param keyIn   the key shared by all {@code values}
     * @param values  the integers grouped under {@code keyIn}
     * @param context sink for the reducer output
     */
    @Override
    protected void reduce(Text keyIn, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int largest = Integer.MIN_VALUE;
        for (IntWritable candidate : values) {
            int current = candidate.get();
            if (current > largest) {
                largest = current;
            }
        }

        // The types written to the context must match the output type arguments
        // declared on this class (Text key, IntWritable value).
        Text outKey = new Text(keyIn);
        IntWritable outValue = new IntWritable(largest);
        context.write(outKey, outValue);
    }
}

3：最后编写Job类。这个类中的Configuration对象非常重要，它里面包含了四个配置文件，这些配置文件定义了目标HDFS的很多属性。
////////////////////////////////////////////////////////////////////////////
java项目知识点～配置文件加载的顺序：
1.首先加载依赖包中的一些default.xml文件
2.加载插件中的一些plugin.xml文件
。。。
3.最后才加载项目中的src下的配置文件。这个也是为我们开发人员进行拓展的机制，注意：（配置文件的名字需要对应，否则无效）
///////////////////////////////////////////////////////////////////////////

package org.liufu.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that configures and submits the MapReduce job built from {@link Map} and
 * {@link Red}, reading {@code /mytmp/test.txt} and writing to {@code /mytmp/out}.
 */
public class JobForMapReduce {

    /**
     * Removes any previous output directory, submits the job and waits for it to finish.
     * The JVM exits with status 1 when the job reports failure.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Author's note: the configuration decides which file system is the default; without
        // site settings it is the local one rather than HDFS. The project already ships a
        // core-site.xml, so conf reads its properties from there. A file outside the project
        // could instead be loaded explicitly, e.g.
        // conf.addResource(new Path("/usr/mysoft/hadoop-2.7.1/etc/hadoop_pause/core-site.xml"));
        Configuration conf = new Configuration();

        FileSystem fs = FileSystem.get(conf);
        Path inputPath = new Path("/mytmp/test.txt");
        Path outPath = new Path("/mytmp/out");

        // Clear the output of an earlier run (recursively) before submitting.
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        Job job = Job.getInstance(conf);
        // Lets the job locate the jar containing this class so it can be shipped to the cluster.
        job.setJarByClass(JobForMapReduce.class);
        job.setJobName("testJob");

        // Input path: may be a single file or a directory (per the author, not recursive).
        FileInputFormat.addInputPath(job, inputPath);
        // Only one output path can be set, and it must not exist when the job starts.
        FileOutputFormat.setOutputPath(job, outPath);

        job.setMapperClass(Map.class);
        job.setReducerClass(Red.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // waitForCompletion returns false when the job fails; surface that through the
        // process exit status instead of silently returning as if it had succeeded.
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            System.exit(1);
        }
    }
}

总结：
1.java项目中的配置文件加载的顺序。
2.获取FileSystem对象时，需要把configuration对象传进去：
FileSystem fs = FileSystem.get(conf);
3.创建Job对象时，也需要把configuration对象传进去：
Job job = Job.getInstance(conf);
只有configuration对象的配置正确了，接下来的处理才能够正确的执行。

0 0