数据清洗(1. clean to HBase)

来源:互联网 发布:下载办公软件最新版 编辑:程序博客网 时间:2024/04/30 05:47
package com.rainbow.cleantohbase;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.zip.CRC32;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * ETL job that cleans tab-separated log lines and loads them into HBase.
 *
 * <p>Each run (re)creates an HBase table named after the current date
 * ({@code yyyyMMdd}) with a single column family {@code info}, reads the input
 * from {@code /data/<yyyyMMdd>} and writes one {@link Put} per valid line.
 * Invalid lines are dropped and counted in the {@code "clean err log"} counter
 * group.
 */
public class CMR implements Tool {

    /** Name of the only column family of the target table. */
    private static final String COLUMN_FAMILY = "info";

    /** Input directory prefix; the table name is appended to it. */
    private static final String INPUT_ROOT = "/data/";

    private Configuration conf = new Configuration();

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Stores an HBase-aware copy of the given configuration.
     *
     * <p>The three hard-coded settings below target a local run against the
     * {@code rainbow.com.cn} cluster; comment them out when the job is
     * submitted on the cluster itself.
     *
     * @param conf configuration to build on; when {@code null} the current
     *             configuration of this tool is used instead
     */
    @Override
    public void setConf(Configuration conf) {
        Configuration base = (conf != null) ? conf : this.conf;

        // Local execution only -- comment out these three lines for a cluster run.
        base.set("fs.defaultFS", "hdfs://rainbow.com.cn:8020");
        base.set("yarn.resourcemanager.hostname", "rainbow.com.cn");
        base.set("hbase.zookeeper.quorum", "rainbow.com.cn:2181");

        // Assign to the FIELD: the original assigned to the parameter, which
        // silently threw the HBase configuration away.
        this.conf = HBaseConfiguration.create(base);
    }

    /**
     * Mapper that validates one tab-separated line and turns it into a
     * {@link Put} against the {@code info} column family.
     */
    public static class CMRM extends Mapper<LongWritable, Text, NullWritable, Put> {

        /** Counter group under which every rejected or truncated line is reported. */
        private static final String COUNTER_GROUP = "clean err log";

        private static final int ID_INDEX = 0;
        private static final int URL_INDEX = 1;
        private static final int PROVINCE_ID_INDEX = 23;

        /** Column qualifiers, in the order the fields appear in an input line. */
        private static final String[] COLUMNS = {
            "id", "url", "referer", "keyword", "type", "guid",
            "pageId", "moduleId", "linkId", "attachedInfo", "sessionId",
            "trackerU", "trackerType", "ip", "trackerSrc", "cookie",
            "orderCode", "trackTime", "endUserId", "firstLink",
            "sessionViewNo", "productId", "curMerchantId", "provinceId",
            "cityId", "fee", "edmActivity", "edmEmail", "dmJobId",
            "ieVersion", "platform", "internalKeyword", "resultSum",
            "currentPage", "linkPosition", "buttonPosition"
        };

        private static final byte[] FAMILY = Bytes.toBytes(COLUMN_FAMILY);

        /** {@link #COLUMNS} encoded once instead of once per record. */
        private static final byte[][] QUALIFIERS = encodeQualifiers();

        private final CRC32 crc32 = new CRC32();

        private static byte[][] encodeQualifiers() {
            byte[][] encoded = new byte[COLUMNS.length][];
            for (int i = 0; i < COLUMNS.length; i++) {
                encoded[i] = Bytes.toBytes(COLUMNS[i]);
            }
            return encoded;
        }

        /**
         * Validates a line and emits it as a {@link Put}.
         *
         * <p>A line is dropped (and counted) when it is blank, when its id,
         * url or provinceId field is missing or blank, or when provinceId is
         * not an integer. Fields beyond the known columns are ignored and
         * counted.
         *
         * @param key     input key (unused)
         * @param value   one tab-separated input line
         * @param context task context used for counters and output
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            if (StringUtils.isBlank(line)) {
                reject(context, "line is null or blank");
                return;
            }

            // NOTE: split("\t") drops trailing empty fields, so a line may
            // legitimately yield fewer tokens than COLUMNS -- hence fieldAt().
            String[] fields = line.split("\t");

            String id = fieldAt(fields, ID_INDEX);
            if (StringUtils.isBlank(id)) {
                reject(context, "id is null");
                return;
            }
            if (StringUtils.isBlank(fieldAt(fields, URL_INDEX))) {
                reject(context, "url is blank or null");
                return;
            }
            String provinceId = fieldAt(fields, PROVINCE_ID_INDEX);
            if (StringUtils.isBlank(provinceId)) {
                reject(context, "provinceId is null");
                return;
            }
            try {
                Integer.parseInt(provinceId);
            } catch (NumberFormatException e) {
                // Previously only printed; the malformed record was still written.
                reject(context, "provinceId is not a number");
                return;
            }

            int columnCount = Math.min(fields.length, COLUMNS.length);
            if (fields.length > COLUMNS.length) {
                // Record is still written, minus the fields that have no column.
                context.getCounter(COUNTER_GROUP, "extra fields ignored").increment(1);
            }

            Put put = new Put(Bytes.toBytes(getRowkey(System.currentTimeMillis(), id)));
            for (int i = 0; i < columnCount; i++) {
                put.add(FAMILY, QUALIFIERS[i], Bytes.toBytes(fields[i]));
            }
            context.write(NullWritable.get(), put);
        }

        /** Increments the named counter in the error group. */
        private static void reject(Context context, String reason) {
            context.getCounter(COUNTER_GROUP, reason).increment(1);
        }

        /** Returns {@code fields[index]}, or {@code null} when the line is too short. */
        private static String fieldAt(String[] fields, int index) {
            return index < fields.length ? fields[index] : null;
        }

        /**
         * Builds the row key {@code <timestamp>_<crc32(s) % 100000000>}.
         *
         * <p>NOTE(review): two records with the same id processed within the
         * same millisecond produce the same row key -- confirm this is
         * acceptable for the data set.
         *
         * @param timestamp milliseconds used as the key prefix
         * @param s         value hashed into the key suffix; blank hashes to 0
         * @return the row key as a string
         */
        private String getRowkey(long timestamp, String s) {
            StringBuilder rowKey = new StringBuilder();
            rowKey.append(timestamp).append('_');
            this.crc32.reset();
            if (StringUtils.isNotBlank(s)) {
                this.crc32.update(Bytes.toBytes(s));
            }
            rowKey.append(this.crc32.getValue() % 100000000L);
            return rowKey.toString();
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args command-line arguments (unused)
     * @return 0 when the job completes successfully, 1 otherwise
     * @throws Exception if the table cannot be prepared or the job cannot be
     *                   set up or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        // Failures propagate: continuing with a half-configured (or null) job
        // would only fail later with a less useful error.
        Job job = Job.getInstance(this.conf);
        job.setJarByClass(CMR.class);

        job.setMapperClass(CMRM.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Put.class);

        String tableName = getTableNames(this.conf);
        TableMapReduceUtil.initTableReducerJob(tableName, null, job,
                null, null, null, null, false);

        FileInputFormat.addInputPath(job, new Path(INPUT_ROOT + tableName));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Drops and recreates the HBase table named after today's date.
     *
     * <p>WARNING: an existing table with that name is deleted, including its
     * data.
     *
     * @param c configuration used to connect to HBase
     * @return the table name, formatted as {@code yyyyMMdd}
     * @throws IOException if HBase cannot be reached or the table cannot be
     *                     dropped or created
     */
    private static String getTableNames(Configuration c) throws IOException {
        String tableName = new SimpleDateFormat("yyyyMMdd").format(new Date());
        byte[] tableNameBytes = Bytes.toBytes(tableName);

        HBaseAdmin admin = new HBaseAdmin(c);
        try {
            if (admin.tableExists(tableNameBytes)) {
                // A previous run may have died between disable and delete.
                if (admin.isTableEnabled(tableNameBytes)) {
                    admin.disableTable(tableNameBytes);
                }
                admin.deleteTable(tableNameBytes);
            }

            HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(tableNameBytes));
            desc.addFamily(new HColumnDescriptor(Bytes.toBytes(COLUMN_FAMILY)));
            admin.createTable(desc);
        } finally {
            admin.close();
        }
        return tableName;
    }

    /**
     * Command-line entry point; the process exit status mirrors the job result.
     *
     * @param args passed through to {@link ToolRunner}
     */
    public static void main(String[] args) {
        int exitCode;
        try {
            exitCode = ToolRunner.run(new CMR(), args);
        } catch (Exception e) {
            e.printStackTrace();
            exitCode = 1;
        }
        if (exitCode == 0) {
            System.out.println("ETL MapReduce  Job  has runned successfully!");
        }
        System.exit(exitCode);
    }
}


                                             
1 0
原创粉丝点击