Chapter 2.1 Hadoop: custom Writable with MRUnit
I could not find a matching mrunit jar through mvnrepository directly, but it can be downloaded from the MRUnit 1.1.0 release.
1 pom.xml
<properties>
    <mrunit.version>1.1.0</mrunit.version>
    <mockito.version>1.10.19</mockito.version>
    <hadoop.version>2.6.4</hadoop.version>
</properties>

<dependency>
    <groupId>org.apache.mrunit</groupId>
    <artifactId>mrunit</artifactId>
    <version>${mrunit.version}</version>
</dependency>
<dependency>
    <groupId>org.mockito</groupId>
    <artifactId>mockito-all</artifactId>
    <version>${mockito.version}</version>
</dependency>
<!-- Hadoop-related jars -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>jdk.tools</groupId>
    <artifactId>jdk.tools</artifactId>
    <version>${java.version}</version>
    <scope>system</scope>
    <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
</dependency>
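One note on the MRUnit dependency above: on Maven Central the mrunit 1.1.0 artifact is published with a classifier (hadoop1 for the old mapred API, hadoop2 for Hadoop 2.x), so if the plain dependency does not resolve, adding the classifier is usually what is missing. A sketch for the Hadoop 2.6.4 build used here:

<dependency>
    <groupId>org.apache.mrunit</groupId>
    <artifactId>mrunit</artifactId>
    <version>${mrunit.version}</version>
    <classifier>hadoop2</classifier>
    <scope>test</scope>
</dependency>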
2. Custom Writable
For how to use Hadoop's built-in Writable classes, see the companion article "MapReduce 单元测试工具 MRUnit 使用" (Using MRUnit, the MapReduce unit-testing tool). The class below relies on the BinaryComparable comparator.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Date;

import org.apache.hadoop.io.BinaryComparable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

import com.dzmsoft.framework.base.util.DateUtil;

/**
 * A custom Writable class; only a few key fields are included here.
 * @author dzm
 */
public class InviteInfoWritable extends BinaryComparable implements WritableComparable<BinaryComparable> {

    private String id;
    /** Statistics date */
    private Date countDate;
    /** User account */
    private String account;
    private String username;
    /** Province */
    private String provinceId;
    /** Count */
    private Integer count;
    private String departmentName;

    public String getId() { return id; }
    public void setId(String id) { this.id = id; }
    public Date getCountDate() { return countDate; }
    public void setCountDate(Date countDate) { this.countDate = countDate; }
    public String getAccount() { return account; }
    public void setAccount(String account) { this.account = account; }
    public String getUsername() { return username; }
    public void setUsername(String username) { this.username = username; }
    public String getProvinceId() { return provinceId; }
    public void setProvinceId(String provinceId) { this.provinceId = provinceId; }
    public Integer getCount() { return count; }
    public void setCount(Integer count) { this.count = count; }
    public String getDepartmentName() { return departmentName; }
    public void setDepartmentName(String departmentName) { this.departmentName = departmentName; }

    /**
     * Use this no-arg constructor when the object is used as a mapper key.
     */
    public InviteInfoWritable() {
        super();
        bytes = EMPTY_BYTES;
    }

    private static final byte[] EMPTY_BYTES = new byte[0];
    private byte[] bytes;
    private int length;

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(DateUtil.formatDate(this.getCountDate()));
        out.writeUTF(this.getAccount());
        out.writeUTF(this.getProvinceId());
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.setCountDate(DateUtil.parseDate(in.readUTF()));
        this.setAccount(in.readUTF());
        this.setProvinceId(in.readUTF());
        // When Hadoop compares keys it first creates two empty key objects, fills them
        // through readFields, and then compares them. The byte array therefore has to be
        // built here; otherwise the bytes are still empty at comparison time.
        ByteBuffer bb = Text.encode(countDate + account, true);
        bytes = bb.array();
        length = bb.limit();
    }

    @Override
    public int getLength() {
        return length;
    }

    @Override
    public byte[] getBytes() {
        return bytes;
    }
}
As you can see, org.apache.hadoop.io.BinaryComparable only declares the abstract methods used for byte-level comparison (getBytes() and getLength()), which is why readFields() above fills in the byte array and length by hand.
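For context, BinaryComparable's compareTo() delegates to a raw byte comparison over exactly those two methods. Roughly, paraphrasing the Hadoop 2.x source rather than quoting code from this project:

// Inside org.apache.hadoop.io.BinaryComparable (paraphrased):
// two keys are compared byte by byte using the bytes/length supplied by the subclass,
// so a key whose bytes were never populated always compares as empty.
public int compareTo(BinaryComparable other) {
    if (this == other) {
        return 0;
    }
    return WritableComparator.compareBytes(getBytes(), 0, getLength(),
                                           other.getBytes(), 0, other.getLength());
}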
3 Defining the MapReduce job
import java.io.Serializable;

public class Business<T> implements Serializable {

    private static final long serialVersionUID = 7056019679965982739L;

    /**
     * Business type.
     * Currently there is only one: userDataAcquisitionInfoUpload
     */
    private String business;

    private T params;

    public String getBusiness() { return business; }
    public void setBusiness(String business) { this.business = business; }
    public T getParams() { return params; }
    public void setParams(T params) { this.params = params; }
}
import java.io.Serializable;

public class InviteInfoParam implements Serializable {

    private static final long serialVersionUID = 6791671668489980464L;

    /** Username */
    private String username;
    /** Province ID */
    private String provinceId;
    /** Subscription time */
    private String subscribe_time;
    /** openId */
    private String openId;

    public String getUsername() { return username; }
    public void setUsername(String username) { this.username = username; }
    public String getProvinceId() { return provinceId; }
    public void setProvinceId(String provinceId) { this.provinceId = provinceId; }
    public String getSubscribe_time() { return subscribe_time; }
    public void setSubscribe_time(String subscribe_time) { this.subscribe_time = subscribe_time; }
    public String getOpenId() { return openId; }
    public void setOpenId(String openId) { this.openId = openId; }
}
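For reference, an input line that Gson maps onto these two DTOs looks like the one used by the unit test in section 4:

{"business":"wcnInviteInfoUpload","params":{"username":"wanghui","provinceId":"789","subscribe_time":"2017-02-10 00:04:28","openId":"ou7nSs1fX2IWh3iXBGYbTMVxDQy2"}}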
import java.io.IOException;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.Tool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.dzmsoft.dsj.hadoop.dto.Business;
import com.dzmsoft.dsj.hadoop.dto.InviteInfoParam;
import com.dzmsoft.dsj.hadoop.util.IntegerDefault0Adapter;
import com.dzmsoft.dsj.hadoop.writable.InviteInfoWritable;
import com.dzmsoft.framework.base.util.DateUtil;
import com.dzmsoft.framework.base.util.StringUtil;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.reflect.TypeToken;

public class InviteInfoCountJob extends Configured implements Tool {

    private static Logger logger = LoggerFactory.getLogger(InviteInfoCountJob.class);

    private static Gson gson;

    static {
        gson = new GsonBuilder().setDateFormat("yyyy-MM-dd HH:mm:ss")
                .registerTypeAdapter(Integer.class, new IntegerDefault0Adapter())
                .registerTypeAdapter(int.class, new IntegerDefault0Adapter())
                .create();
    }

    public static class InviteInfoCountMapper extends Mapper<LongWritable, Text, InviteInfoWritable, Text> {

        @Override
        public void map(LongWritable key, Text line, Context context) throws InterruptedException, IOException {
            logger.debug("input line:{}", line);
            Business<InviteInfoParam> item = gson.fromJson(line.toString(),
                    new TypeToken<Business<InviteInfoParam>>() {
                    }.getType());
            if (!StringUtil.isBlank(item.getParams().getUsername())) {
                InviteInfoWritable outkey = new InviteInfoWritable();
                outkey.setAccount(item.getParams().getUsername());
                outkey.setProvinceId(item.getParams().getProvinceId());
                // Keep only the date part (yyyy-MM-dd), dropping the time of day
                outkey.setCountDate(DateUtil.parseDate(item.getParams().getSubscribe_time().substring(0, 10)));
                context.write(outkey, new Text(item.getParams().getOpenId()));
            }
        }
    }

    public static class InviteInfoCountReducer extends Reducer<InviteInfoWritable, Text, InviteInfoWritable, Text> {

        // Data needed at runtime cannot be passed around through static variables;
        // it has to be put into the Configuration.

        /**
         * Count the number of distinct values per key.
         */
        @Override
        public void reduce(InviteInfoWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Every job runs in a new process, so something like crmSysUserList cannot be shared;
            // the only thing shared across jobs is the Configuration.
            // De-duplicate the values.
            Set<String> set = new HashSet<String>();
            for (Text value : values) {
                set.add(value.toString());
            }
            InviteInfoWritable bean = new InviteInfoWritable();
            bean.setCountDate(key.getCountDate()); // yyyyMMdd to yyyy-MM-dd
            bean.setAccount(key.getAccount());
            bean.setProvinceId(key.getProvinceId());
            bean.setId(StringUtil.getUuidString());
            bean.setCount(set.size());
            context.write(bean, null);
        }
    }

    public static void main(String[] args) throws Exception {
        // The recreate argument creates the table structure and is optional.
//        Configuration conf = MyConfiguration.createWithDB();
//        int res = ToolRunner.run(conf, new InviteInfoCountJob(), args);
//        System.exit(res);
    }

    public final int run(final String[] args)
            throws IOException, InterruptedException, ClassNotFoundException, SQLException {
//        Configuration conf = MyConfiguration.createWithDB();
        Configuration conf = super.getConf();
        String inputPath = null;
        Job job = Job.getInstance(conf, "dms_invite_info_count");
        job.setJarByClass(InviteInfoCountJob.class);
        job.setMapperClass(InviteInfoCountMapper.class);
        job.setMapOutputKeyClass(InviteInfoWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Set the input path; more than one can be added
        FileInputFormat.addInputPath(job, new Path(inputPath));
        // Recurse into subdirectories
        FileInputFormat.setInputDirRecursive(job, true);
        // Output goes to MySQL
        job.setReducerClass(InviteInfoCountReducer.class);
        job.setOutputFormatClass(DBOutputFormat.class);
        // The names that follow the table name are the column names of the target table
        DBOutputFormat.setOutput(job, "dms_invite_info_count",
                "id", "account", "username", "department_name", "province_id", "count", "count_date");
        job.setNumReduceTasks(7); // must be consistent with the number of columns
        boolean result = job.waitForCompletion(true);
        logger.info("job {} is {}!", job.getJobName(), result ? "success" : "failed");
        return result ? 0 : 1;
    }
}
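The Gson type adapter IntegerDefault0Adapter registered in the static block is not shown in the article. A minimal sketch of what such an adapter might look like, assuming its job is simply to fall back to 0 for blank or unparsable numeric fields (the class name and package come from the imports above; the body is an assumption):

package com.dzmsoft.dsj.hadoop.util;

import java.lang.reflect.Type;

import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import com.google.gson.JsonParseException;
import com.google.gson.JsonPrimitive;
import com.google.gson.JsonSerializationContext;
import com.google.gson.JsonSerializer;

/**
 * Hypothetical sketch: maps empty or unparsable numeric JSON values to 0 instead of
 * letting Gson throw an exception. The real implementation may differ.
 */
public class IntegerDefault0Adapter implements JsonSerializer<Integer>, JsonDeserializer<Integer> {

    @Override
    public Integer deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context)
            throws JsonParseException {
        try {
            // Treat an empty string in the JSON as 0
            if ("".equals(json.getAsString())) {
                return 0;
            }
        } catch (Exception ignore) {
            // fall through and try to parse as a number
        }
        try {
            return json.getAsInt();
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    @Override
    public JsonElement serialize(Integer src, Type typeOfSrc, JsonSerializationContext context) {
        return new JsonPrimitive(src == null ? 0 : src);
    }
}

Registering it for both Integer.class and int.class, as the static block does, keeps a blank numeric field in one uploaded JSON line from aborting the whole map task.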
4 MRUnit test case
import java.io.IOException;
import java.text.ParseException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Before;
import org.junit.Test;

import com.dzmsoft.dsj.hadoop.job.InviteInfoCountJob;
import com.dzmsoft.dsj.hadoop.writable.InviteInfoWritable;
import com.dzmsoft.framework.base.util.DateUtil;

public class JobMrUnitTest {

    Configuration conf;
    MapDriver<LongWritable, Text, InviteInfoWritable, Text> mapDriver;
    ReduceDriver<InviteInfoWritable, Text, InviteInfoWritable, Text> reduceDriver;
    MapReduceDriver<LongWritable, Text, InviteInfoWritable, Text, InviteInfoWritable, Text> mapReduceDriver;

    @Before
    public void init() {
        conf = new Configuration();
        InviteInfoCountJob.InviteInfoCountMapper mapper = new InviteInfoCountJob.InviteInfoCountMapper();
        InviteInfoCountJob.InviteInfoCountReducer reducer = new InviteInfoCountJob.InviteInfoCountReducer();
        mapDriver = MapDriver.newMapDriver(mapper);
        reduceDriver = ReduceDriver.newReduceDriver(reducer);
        mapReduceDriver = MapReduceDriver.newMapReduceDriver(mapper, reducer);
    }

    @Test
    public void test_mapper() throws IOException {
        String text = "{\"business\":\"wcnInviteInfoUpload\",\"params\":{\"username\":\"wanghui\",\"provinceId\":\"789\",\"subscribe_time\":\"2017-02-10 00:04:28\",\"openId\":\"ou7nSs1fX2IWh3iXBGYbTMVxDQy2\"}}";
        mapDriver.withInput(new LongWritable(), new Text(text));
        // The key groups by account and countDate; provinceId is carried along for later use
        InviteInfoWritable outkey = new InviteInfoWritable();
        outkey.setAccount("wanghui");
        outkey.setProvinceId("789");
        try {
            outkey.setCountDate(DateUtil.parseDate("2017-02-10", "yyyy-MM-dd"));
        } catch (ParseException e) {
            e.printStackTrace();
        }
        // Assert the expected output
        mapDriver.withOutput(outkey, new Text("ou7nSs1fX2IWh3iXBGYbTMVxDQy2"));
        // Run the mapper
        mapDriver.runTest();
    }
}
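The @Before method also prepares reduceDriver and mapReduceDriver, but the article only exercises the mapper. A possible reducer test, sketched here as an assumption rather than code from the original article, could use ReduceDriver.run() and assert on individual fields, since the reducer fills id with a random UUID and a full withOutput comparison would be brittle. It assumes the additional imports java.util.Arrays, java.util.List, org.apache.hadoop.mrunit.types.Pair and org.junit.Assert:

@Test
public void test_reducer() throws IOException, ParseException {
    // Build the reducer input: one key with a duplicated openId and one distinct openId
    InviteInfoWritable key = new InviteInfoWritable();
    key.setAccount("wanghui");
    key.setProvinceId("789");
    key.setCountDate(DateUtil.parseDate("2017-02-10", "yyyy-MM-dd"));

    List<Text> values = Arrays.asList(
            new Text("ou7nSs1fX2IWh3iXBGYbTMVxDQy2"),
            new Text("ou7nSs1fX2IWh3iXBGYbTMVxDQy2"),
            new Text("anotherOpenId"));

    reduceDriver.withInput(key, values);

    // run() returns the emitted pairs, so single fields can be asserted directly
    List<Pair<InviteInfoWritable, Text>> output = reduceDriver.run();
    Assert.assertEquals(1, output.size());
    // Duplicates are removed, so the distinct count should be 2
    Assert.assertEquals(Integer.valueOf(2), output.get(0).getFirst().getCount());
    Assert.assertEquals("wanghui", output.get(0).getFirst().getAccount());
}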