Hadoop mapreduce 入门示例详解

来源：互联网 | 发布：儿童dna数据库有用吗 | 编辑：程序博客网 | 时间：2024/06/06 12:41

WordCount：使用 MapReduce 编程统计词频

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper of the word-count job.
 *
 * <p>Each input value is treated as one line of text. The line is split on
 * whitespace and, for every non-empty token, the pair {@code (token, 1)} is
 * written to the context. The input key is not used.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable>{

    /** Constant count emitted for every occurrence of a word. */
    private static final LongWritable ONE = new LongWritable(1);

    /** Reused output key, so no new Text is allocated per token. */
    private final Text word = new Text();

    /**
     * Emits {@code (word, 1)} for every whitespace-separated word in the line.
     *
     * @param key     input key (ignored)
     * @param value   one line of input text
     * @param context sink for the {@code (word, 1)} pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Receive the input record as a plain string.
        String line = value.toString();

        // Split on runs of whitespace (spaces, tabs, ...) instead of a single
        // space, so repeated separators do not produce empty "words".
        for (String w : line.split("\\s+")) {
            // A leading separator or a blank line still yields one empty
            // token; it is not a word and must not be counted.
            if (w.isEmpty()) {
                continue;
            }
            // One occurrence seen: emit the word with a count of 1.
            word.set(w);
            context.write(word, ONE);
        }
    }
}

import java.io.IOException;import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reducer of the word-count job.
 *
 * <p>For each key it adds up all {@link LongWritable} values received for
 * that key and writes the key together with the total.
 */
public class WordRed extends Reducer<Text,LongWritable,Text,LongWritable>{

    /**
     * Writes {@code (key, sum of values)} to the context.
     *
     * @param key     the word being counted
     * @param values  the partial counts collected for {@code key}
     * @param context sink for the {@code (word, total)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable>values,Context context)
            throws IOException, InterruptedException {
        // Output the accumulated total for this word.
        context.write(key, new LongWritable(sum(values)));
    }

    /** Adds up the long payload of every element of {@code counts}. */
    private static long sum(Iterable<LongWritable> counts) {
        long total = 0;
        java.util.Iterator<LongWritable> it = counts.iterator();
        while (it.hasNext()) {
            total += it.next().get();
        }
        return total;
    }
}

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

public static void main(String[] args) throws Exception {

//加载配置文件

Configuration conf = new Configuration();

//构建job对象

Job job = Job.getInstance(conf);

//设置main方法所在的类

job.setJarByClass(WordCount.class);
//设置mapper相关属性

job.setMapperClass(WCMapper.class);
//map拆分部分
job.setMapOutputKeyClass(Text.class);

job.setMapOutputValueClass(LongWritable.class);

FileInputFormat.setInputPaths(job, new Path("hdfs://snow521:9000/user/snow/input0/words.txt")); //设置输入路径
//注意第一次这里报错说input path not extits words.txt 原因是因为拷贝的代码为new Path(/words.txt);表示已经写死，所以我改了下，结果编译正确，但是却没有找到
output0,查找后面代码发现 setOutputPath(job,new Path(/wcout));这个是在Hdfs的/目录下，于是去找，结果找到并输出结果。记住：input可以有多个，但是output只能有一个，也要记住，input output只是起的输出输入名字，你可以起不同的名字，不一定非要是input output ，如果同一个代码，编译两次就会出错，说output path 已经存在输出文件，此时，你可以删除，再编译。

//设置reducer相关属性

job.setReducerClass(WCReducer.class);
.//reduce 类来进行map产生中间结果合并，避免给网络数据传输产生压力
job.setOutputKeyClass(Text.class); //设置job输出中的key和value 数据类型，因为结果是《单词，个数》所以key值设置为Text类型，相当于Java中的的string类型，value设置为IntWritable相当于Java中的int类型。
job.setOutputValueClass(LongWritable.class);

FileOutputFormat.setOutputPath(job, new Path("hdfs://snow521:9000/user/snow/output6/wcout"));
//设置输出路径。

//提交

job.waitForCompletion(true);
}

}

希望对像我一样刚入门的菜鸟有所帮助。

阅读全文

0 0