A MapReduce Starter Example: MyWordCount

1. Dependencies in pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>com.jp</groupId>
	<artifactId>hadoop-mapreduce</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<hadoop.version>2.6.4</hadoop.version>
	</properties>

	<dependencies>
		<!-- The default (compile) scope is needed here: with scope "runtime"
		     these classes would be missing from the compile classpath and the
		     job code below would not build. -->
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-hdfs</artifactId>
			<version>${hadoop.version}</version>
		</dependency>

		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-common</artifactId>
			<version>${hadoop.version}</version>
		</dependency>

		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-client</artifactId>
			<version>${hadoop.version}</version>
		</dependency>
	</dependencies>
</project>
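
With the POM in place, the job jar can be built and submitted in the usual way. A minimal sketch, assuming the standard Maven layout and the paths that the test-data helper below uses:

mvn clean package
hadoop jar target/hadoop-mapreduce-0.0.1-SNAPSHOT.jar com.jp.hadoop.mapreduce.MyWordCount hdfs://master:9000/test/input/ hdfs://master:9000/test/output/

If you pass no arguments, the main method below generates its own test data on HDFS first.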


2. Logging configuration file log4j.xml (put it on the classpath, e.g. under src/main/resources)

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration PUBLIC
	"-//APACHE//DTD LOG4J 1.2//EN"
	"http://logging.apache.org/log4j/1.2/apidocs/org/apache/log4j/xml/doc-files/log4j.dtd">

<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
	<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
		<layout class="org.apache.log4j.PatternLayout">
			<param name="ConversionPattern" value="[%d{dd HH:mm:ss,SSS} %-5p] [%t] %c{2} - %m%n" />
		</layout>
	</appender>
	<logger name="com.jp">
		<level value="debug" />
	</logger>
	<root>
		<priority value="info" />
		<appender-ref ref="stdout" />
	</root>
</log4j:configuration>
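
The net effect: everything is logged to the console; the root logger runs at info, while the com.jp package is raised to debug, so the log.debug(...) calls in the mapper and reducer below are actually visible when the job runs.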

3. Source code
package com.jp.hadoop.mapreduce;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;

/**
 * Word count job.
 * @author jp
 */
public class MyWordCount {
	private static final Logger log = Logger.getLogger(MyWordCount.class); // logger
	private static final String JOB_NAME = "mywordcount"; // name of the job
	private static final String HDFS_PATH = "hdfs://master:9000/"; // base URI of the HDFS instance

	/**
	 * Map phase: split each line into words.
	 * @author jp
	 */
	public static class MyWordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {
		private static final IntWritable ONE = new IntWritable(1); // reusable Hadoop int constant: each word counts once

		/**
		 * Split operation.
		 * @param key byte offset of the first character of the line
		 * @param value the text of the line
		 * @param context the Hadoop mapper context
		 */
		@Override
		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// Text is Hadoop's String type, IntWritable is Hadoop's int type
			String[] words = value.toString().split("\\s+"); // split the line on whitespace ("\\s+" also swallows runs of blanks)
			for (String word : words) {
				log.debug("map ==> split out word: " + word);
				context.write(new Text(word), ONE); // hand (word, 1) to the framework, which groups it for the reducer
			}
		}
	}
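
As an aside, Hadoop mappers are usually written so that the output key object is reused instead of allocating a new Text per word; the framework serializes the pair as soon as context.write returns, so reuse is safe. A minimal sketch of that idiom (the class name ReusingMap is purely illustrative):

	public static class ReusingMap extends Mapper<LongWritable, Text, Text, IntWritable> {
		private static final IntWritable ONE = new IntWritable(1);
		private final Text word = new Text(); // reused across map() calls

		@Override
		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			for (String w : value.toString().split("\\s+")) {
				word.set(w);              // overwrite the buffer instead of new Text(w)
				context.write(word, ONE); // the pair is serialized immediately, so reuse is safe
			}
		}
	}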

	/**
	 * Reduce phase: sum the counts for each word.
	 * @author jp
	 */
	public static class MyWordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

		/**
		 * Merge operation.
		 * @param key a word produced by the map phase
		 * @param values the counts collected for that word
		 * @param context the Hadoop reducer context
		 */
		@Override
		public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

			int count = 0;
			for (IntWritable value : values) {
				count += value.get(); // sum the counts (each is 1 unless a combiner already pre-aggregated)
			}

			log.debug("reduce ==> word " + key + " occurs " + count + " times");
			context.write(key, new IntWritable(count)); // write the result to the job output on HDFS
		}
	}
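
Because reduce() sums value.get() instead of merely counting elements, the same class can also serve as a combiner that pre-aggregates the map output before the shuffle. If you want that, one extra line in the job setup of count() below is enough:

			job.setCombinerClass(MyWordCountReducer.class); // optional: local pre-aggregation on the map side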

	/**
	 * Configure and run the job.
	 * @param inout one or more input paths followed by the output path
	 */
	public void count(String... inout) {
		if (inout.length < 2) {
			log.error("job ==> wrong number of arguments, at least two are required!");
			throw new RuntimeException("wrong number of arguments, at least two are required!");
		}

		boolean flag = false; // did the job succeed?

		try {
			Configuration conf = new Configuration(); // Hadoop configuration instance
			Job job = Job.getInstance(conf, JOB_NAME); // Hadoop job instance

			job.setJarByClass(MyWordCount.class); // the class used to locate the job jar

			job.setMapperClass(MyWordCountMap.class); // map class of the job
			job.setMapOutputKeyClass(Text.class); // key type of the map output
			job.setMapOutputValueClass(IntWritable.class); // value type of the map output
			job.setReducerClass(MyWordCountReducer.class); // reduce class of the job
			job.setOutputKeyClass(Text.class); // key type of the final output
			job.setOutputValueClass(IntWritable.class); // value type of the final output

			Path[] inPaths = new Path[inout.length - 1]; // all paths that hold input data
			for (int i = 0; i < inout.length - 1; i++) {
				inPaths[i] = new Path(inout[i]);
			}

			Path outPath = new Path(inout[inout.length - 1]); // where the results are written

			FileInputFormat.setInputPaths(job, inPaths); // input of the job
			FileOutputFormat.setOutputPath(job, outPath); // output of the job (must not exist yet)

			flag = job.waitForCompletion(true); // submit the job and wait for it to finish

		} catch (IOException e) {
			log.error("job ==> failed to create the job!", e);
		} catch (Exception e) {
			log.error("job ==> job execution failed!", e);
		} finally {
			log.info("Hadoop job " + JOB_NAME + (flag ? " succeeded..." : " failed!"));
		}
	}

	/**
	 * Create test data.
	 * @return the input and output paths
	 */
	public String[] createTestData() {
		String[] args = new String[2];
		args[0] = HDFS_PATH + "test/input/"; // input path
		args[1] = HDFS_PATH + "test/output/"; // output path
		// mock data used for testing
		try {
			FileSystem fs = FileSystem.get(new URI(HDFS_PATH), new Configuration()); // file system instance
			Path inPath = new Path(args[0]); // as a Hadoop path object
			Path outPath = new Path(args[1]); // as a Hadoop path object
			// output directory: the job refuses to run if it already exists
			if (fs.exists(outPath)) {
				fs.delete(outPath, true); // delete it recursively
			}
			// input directory
			if (fs.exists(inPath)) {
				fs.delete(inPath, true); // delete it recursively
			}
			fs.mkdirs(inPath); // create the input directory
			FSDataOutputStream fsdou = fs.create(new Path(inPath, "input.txt")); // create the input file
			fsdou.write(("hello world hello hadoop\n"
					+ "hadoop common hadoop mapreduce hadoop yarn\n"
					+ "hi hdfs hi mapduce hi yarn").getBytes()); // write the test content
			fsdou.flush();
			fsdou.close();
		} catch (Exception e) {
			log.error("test data ==> failed to create the test data!", e);
		}
		return args;
	}
	/**
	 * Entry point.
	 * @param args the input and output paths
	 */
	public static void main(String[] args) {
		MyWordCount mwc = new MyWordCount();
		if (args.length < 2) {
			args = mwc.createTestData(); // no paths given: generate test data and use its paths
		}
		mwc.count(args);
	}

}


4. Run result
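
The result here was presumably a screenshot that did not survive extraction. Going by the test data written above, part-r-00000 under hdfs://master:9000/test/output/ should come out as follows (note that "mapduce" is a literal typo in the generated test data, so it is counted as its own word):

common	1
hadoop	4
hdfs	1
hello	2
hi	3
mapduce	1
mapreduce	1
world	1
yarn	2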

