6. 将清洗后的数据写入到Hbase中（仅mapper和runner）

来源：互联网发布：电脑网络图标不见了编辑：程序博客网时间：2024/05/16 19:07

3. 编写mapper类和runner类

（1） Mapper

AnalyserLogDataMapper.java
— 自定义数据解析map类，对输入、输出、过滤掉的数据进行统计
- 首先辨别事件，根据事件处理数据
- 设计rowkey（crc编码），将数据put到hbase中去

package com.neu.etl.mr.ald;

import java.io.IOException;
import java.util.Map;
import java.util.zip.CRC32;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;

import com.neu.common.EventLogConstants;
import com.neu.common.EventLogConstants.EventEnum;
import com.neu.etl.util.LoggerUtil;

/**
 * Map-only log parser: turns each raw log line into an HBase {@link Put}.
 *
 * <p>Each input line is parsed with {@code LoggerUtil.handleLog}; lines that
 * cannot be parsed, carry an unsupported event, or lack a server time are
 * counted as filtered. The input / output / filtered counters are logged once
 * in {@link #cleanup(Context)}.
 *
 * @author neu
 */
public class AnalyserLogDataMapper extends Mapper<Object, Text, NullWritable, Put> {
    private final Logger logger = Logger.getLogger(AnalyserLogDataMapper.class);
    // Bookkeeping counters, only used to report how many records were read, written and dropped.
    private int inputRecords, filterRecords, outputRecords;
    private byte[] family = Bytes.toBytes(EventLogConstants.EVENT_LOGS_FAMILY_NAME);
    private CRC32 crc32 = new CRC32();

    /**
     * Parses one log line and, for supported events, emits a {@link Put}.
     *
     * <p>Any exception raised while handling the line is logged and the line
     * is counted as filtered; it does not fail the task.
     *
     * @param key     input key (unused)
     * @param value   one raw log line
     * @param context MapReduce context the Put is written to
     */
    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        this.inputRecords++;
        this.logger.debug("Analyse data of :" + value);

        try {
            // Parse the log line into key/value pairs.
            Map<String, String> clientInfo = LoggerUtil.handleLog(value.toString());

            // Drop lines that failed to parse.
            if (clientInfo.isEmpty()) {
                this.filterRecords++;
                return;
            }

            // Look up the event alias stored under the event-name column and map it to an enum.
            String eventAliasName = clientInfo.get(EventLogConstants.LOG_COLUMN_NAME_EVENT_NAME);
            EventEnum event = EventEnum.valueOfAlias(eventAliasName);

            // A switch on a null enum throws NullPointerException. Guard explicitly so an
            // unrecognised alias is reported as an unsupported event rather than as a
            // processing error with a stack trace.
            // NOTE(review): assumes valueOfAlias may return null for unknown aliases - confirm.
            if (event == null) {
                this.filterRecords++;
                this.logger.warn("该事件没法进行解析，事件名称为:" + eventAliasName);
                return;
            }

            switch (event) {
            case LAUNCH:
            case PAGEVIEW:
            case CHARGEREQUEST:
            case CHARGEREFUND:
            case CHARGESUCCESS:
            case EVENT:
                // Supported event: build and emit the Put.
                this.handleData(clientInfo, event, context);
                break;
            default:
                this.filterRecords++;
                this.logger.warn("该事件没法进行解析，事件名称为:" + eventAliasName);
            }
        } catch (Exception e) {
            this.filterRecords++;
            this.logger.error("处理数据发出异常，数据:" + value, e);
        }
    }

    /**
     * Logs the input, output and filtered record counters once the task has finished.
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        super.cleanup(context);
        logger.info("输入数据:" + this.inputRecords + "；输出数据:" + this.outputRecords + "；过滤数据:" + this.filterRecords);
    }

    /**
     * Builds the HBase row for one parsed log record and writes it to the context.
     *
     * <p>Records without a server time are counted as filtered. The user-agent
     * entry is removed from {@code clientInfo} before the remaining non-blank
     * key/value pairs are stored as columns of the event-log family.
     *
     * @param clientInfo parsed log fields; modified in place (user agent removed)
     * @param event      the event type of this record
     * @param context    MapReduce context the Put is written to
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    private void handleData(Map<String, String> clientInfo, EventEnum event, Context context)
            throws IOException, InterruptedException {
        String uuid = clientInfo.get(EventLogConstants.LOG_COLUMN_NAME_UUID);
        String memberId = clientInfo.get(EventLogConstants.LOG_COLUMN_NAME_MEMBER_ID);
        String serverTime = clientInfo.get(EventLogConstants.LOG_COLUMN_NAME_SERVER_TIME);

        // The server time is mandatory: it is the leading part of the rowkey.
        if (StringUtils.isBlank(serverTime)) {
            this.filterRecords++;
            return;
        }

        // The raw user-agent string is not stored.
        clientInfo.remove(EventLogConstants.LOG_COLUMN_NAME_USER_AGENT);

        // Rowkey layout: timestamp + "_" + crc(uuid + memberId + event alias).
        String rowkey = this.generateRowKey(uuid, memberId, event.alias, serverTime);
        Put put = new Put(Bytes.toBytes(rowkey));

        for (Map.Entry<String, String> entry : clientInfo.entrySet()) {
            if (StringUtils.isNotBlank(entry.getKey()) && StringUtils.isNotBlank(entry.getValue())) {
                put.add(family, Bytes.toBytes(entry.getKey()), Bytes.toBytes(entry.getValue()));
            }
        }

        context.write(NullWritable.get(), put);
        this.outputRecords++;
    }

    /**
     * Creates the rowkey {@code serverTime + "_" + crc}, where {@code crc} is the
     * CRC32 of the non-blank uuid, the non-blank member id and the event alias,
     * reduced modulo 100000000 to shorten the key.
     *
     * <p>The strings are hashed as UTF-8 bytes via {@link Bytes#toBytes(String)}
     * so that the rowkey does not depend on the JVM's default charset.
     *
     * @param uuid           user id; skipped when blank
     * @param memberId       member id; skipped when blank
     * @param eventAliasName event alias
     * @param serverTime     server timestamp used as the rowkey prefix
     * @return the generated rowkey
     */
    private String generateRowKey(String uuid, String memberId, String eventAliasName, String serverTime) {
        StringBuilder sb = new StringBuilder();
        sb.append(serverTime).append("_");

        // The CRC32 instance is reused across records, so it must be reset first.
        this.crc32.reset();
        if (StringUtils.isNotBlank(uuid)) {
            this.crc32.update(Bytes.toBytes(uuid));
        }
        if (StringUtils.isNotBlank(memberId)) {
            this.crc32.update(Bytes.toBytes(memberId));
        }
        this.crc32.update(Bytes.toBytes(eventAliasName));

        // Keep at most eight digits to shorten the rowkey.
        sb.append(this.crc32.getValue() % 100000000L);
        return sb.toString();
    }
}

（2）Runner

AnalyserLogDataRunner.java
- 除了正常的mapreduce hbase操作
- 特别注意两个：
- 1. 参数处理：输入处理哪一天的数据：yyyy-MM-dd。其中需要在TimeUtil类中设置时间转换方法、时间有效性方法（是否匹配正则表达式）。
- 2. job输入路径设置：根据时间设置路径

package com.neu.etl.mr.ald;

import java.io.File;
import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import com.neu.common.EventLogConstants;
import com.neu.common.GlobalConstants;
import com.neu.util.EJob;
import com.neu.util.TimeUtil;

/**
 * Driver for the log-parsing job: configures a map-only MapReduce job that
 * reads one day of logs and writes the parsed records to the HBase event-log table.
 *
 * <p>Usage: {@code AnalyserLogDataRunner -d yyyy-MM-dd}. When the date is
 * missing or invalid, yesterday is used.
 *
 * @author gerry
 */
public class AnalyserLogDataRunner implements Tool {
    private static final Logger logger = Logger.getLogger(AnalyserLogDataRunner.class);
    private Configuration conf = null;

    /**
     * Runs the job through {@link ToolRunner} and propagates a non-zero job
     * result as the process exit status, so that shell scripts and schedulers
     * can detect a failed run.
     *
     * @param args command-line arguments, see the class description
     */
    public static void main(String[] args) {
        int exitCode;
        try {
            exitCode = ToolRunner.run(new Configuration(), new AnalyserLogDataRunner(), args);
        } catch (Exception e) {
            logger.error("执行日志解析job异常", e);
            throw new RuntimeException(e);
        }
        if (exitCode != 0) {
            System.exit(exitCode);
        }
    }

    /**
     * Stores the configuration wrapped with the HBase settings.
     */
    @Override
    public void setConf(Configuration conf) {
        this.conf = HBaseConfiguration.create(conf);
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Builds and submits the job, then waits for it to finish.
     *
     * @param args command-line arguments ({@code -d yyyy-MM-dd})
     * @return 0 when the job succeeded, -1 otherwise
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = this.getConf();
        this.processArgs(conf, args); // resolve the running date

        Job job = Job.getInstance(conf, "analyser_logdata");

        // Only needed when submitting from a local machine to run on the cluster:
        // File jarFile = EJob.createTempJar("target/classes");
        // ((JobConf) job.getConfiguration()).setJar(jarFile.toString());

        job.setJarByClass(AnalyserLogDataRunner.class);
        job.setMapperClass(AnalyserLogDataMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Put.class);

        // Output (reducer-side) configuration.
        // 1. Running on the cluster from a jar: addDependencyJars must be true (the default).
        TableMapReduceUtil.initTableReducerJob(EventLogConstants.HBASE_NAME_EVENT_LOGS, null, job);
        // 2. Running locally: addDependencyJars must be false.
        // TableMapReduceUtil.initTableReducerJob(EventLogConstants.HBASE_NAME_EVENT_LOGS,
        // null, job, null, null, null, null, false);

        // Map-only job: the mapper's Puts go straight to the output format.
        job.setNumReduceTasks(0);

        this.setJobInputPaths(job);
        return job.waitForCompletion(true) ? 0 : -1;
    }

    /**
     * Reads the running date from the {@code -d} option and stores it in the
     * configuration under {@code GlobalConstants.RUNNING_DATE_PARAMES}.
     *
     * @param conf configuration that receives the date
     * @param args command-line arguments; the value after {@code -d} is the day to process
     */
    private void processArgs(Configuration conf, String[] args) {
        String date = null;
        for (int i = 0; i < args.length; i++) {
            if ("-d".equals(args[i])) {
                if (i + 1 < args.length) {
                    date = args[++i];
                    break;
                }
            }
        }

        // The date must be formatted as yyyy-MM-dd; otherwise fall back to yesterday.
        if (StringUtils.isBlank(date) || !TimeUtil.isValidateRunningDate(date)) {
            date = TimeUtil.getYesterday();
        }
        conf.set(GlobalConstants.RUNNING_DATE_PARAMES, date);
    }

    /**
     * Adds {@code /logs/MM/dd/} of the running date as the job's input path.
     *
     * @param job the job being configured
     * @throws RuntimeException if the path does not exist or the file system cannot be accessed
     */
    private void setJobInputPaths(Job job) {
        Configuration conf = job.getConfiguration();
        try {
            // FileSystem.get returns a shared, cached instance. It is deliberately not closed
            // here: closing it would invalidate it for every other user in this JVM.
            FileSystem fs = FileSystem.get(conf);
            String date = conf.get(GlobalConstants.RUNNING_DATE_PARAMES);

            // The date arrives as yyyy-MM-dd: convert it to a timestamp, then format
            // that timestamp as MM/dd/ to obtain the directory of that day.
            Path inputPath = new Path(
                    "/logs/" + TimeUtil.parseLong2String(TimeUtil.parseString2Long(date), "MM/dd/"));

            if (fs.exists(inputPath)) {
                FileInputFormat.addInputPath(job, inputPath);
            } else {
                throw new RuntimeException("文件不存在:" + inputPath);
            }
        } catch (IOException e) {
            throw new RuntimeException("设置job的mapreduce输入路径出现异常", e);
        }
    }
}

4. 添加环境变量文件，core-site.xml hbase-site.xml log4j.properties

根据不同的运行情况修改源码，并将修改后的源码放到项目代码中。

5. 添加pom编译代码，并进行测试

（1）本地运行测试：
需要注意的就是windows环境可能会导致出现access方法调用异常，需要修改nativeio这个java文件。
使用TableMapReduceUtil的时候如果出现异常：/htrace-core-2.04.jar from hdfs://**/htrace-core-2.04.jar is not a valid DFS filename. 就需要将addDependencyJars参数设置为false。
（2）本地提交job，集群运行测试：【不建议使用】

本地需要知道提交的job是需要提交到集群上的，所以需要指定两个参数mapreduce.framework.name和yarn.resourcemanager.address，value分别为yarn和hh:8032即可，但是可能会出现异常信息，此时需要将参数mapreduce.app-submission.cross-platform设置为true。
参数设置：core-site.xml 中修改
mapreduce.framework.name=yarn
yarn.resourcemanager.address=hh:8032
mapreduce.app-submission.cross-platform=true
异常：
1. Permission denied: user=gerry, access=EXECUTE, inode="/tmp":hadoop:supergroup:drwx------
解决方案：执行hdfs dfs -chmod -R 777 /tmp
2. Stack trace: ExitCodeException exitCode=1: /bin/bash: line 0: fg: no job control
解决方案：添加mapreduce.app-submission.cross-platform=true
3. ExitCodeException exitCode=1:
解决方案：hbase指定输出reducer的时候必须给定addDependencyJars参数为true。
4. Class com.neu.etl.mr.ald.AnalyserLogDataMapper not found
解决方案：引入EJob.java文件，然后在runner类中添加代码
File jarFile = EJob.createTempJar("target/classes");
((JobConf) job.getConfiguration()).setJar(jarFile.toString());
（3）集群提交&运行job测试：
需要在pom.xml对之前的解析userAgent的依赖做出一些配置。

[hadoop@hh jobs]$ hadoop jar neu_transformer-0.0.1.jar com.neu.etl.mr.ald.AnalyserLogDataRunner -d 2017-03-05

git 源代码托管地址：
https://github.com/LiZhongping/Offline-data-analysis-platform/tree/master/4.2neu_transformer

0 0