MapReduce Multi-Path Output


Original post: http://blog.csdn.net/hblfyla/article/details/71710066


Lately I have frequently needed MapReduce jobs that clean data out to different output paths, so I wrote the following program. Without further ado, here is the code; it also generates the CREATE TABLE statements dynamically.


Maven configuration:

Dependencies:

<dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>jdk.tools</groupId>
      <artifactId>jdk.tools</artifactId>
      <version>1.8</version>
      <scope>system</scope>
      <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
    </dependency>
    <!-- hadoop start -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.3</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.7.3</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.7.3</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-common</artifactId>
      <version>2.7.3</version>
    </dependency>
    <!-- hadoop end -->
    <!-- commons-lang3 -->
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
      <version>3.5</version>
    </dependency>
    <!-- fastjson -->
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.29</version>
    </dependency>
    <!-- commons-configuration -->
    <dependency>
      <groupId>commons-configuration</groupId>
      <artifactId>commons-configuration</artifactId>
      <version>1.10</version>
    </dependency>
</dependencies>


Maven packaging:

<build>
    <finalName>xxxxxxx</finalName>
    <plugins>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <appendAssemblyId>false</appendAssemblyId>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
          <archive>
            <manifest>
              <mainClass>xxxxxxxx</mainClass>
            </manifest>
          </archive>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>assembly</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
</build>
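
With the assembly plugin bound to the package phase and appendAssemblyId set to false, a plain Maven build is enough to produce a single runnable fat jar named after finalName (xxxxxxx and the main class are the pom's placeholders):

mvn clean package
# result: target/xxxxxxx.jar, with all dependencies bundled and the manifest main class set

One caveat: newer versions of maven-assembly-plugin deprecate the assembly goal in favor of single; the configuration above works on the older versions.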


Java source files

JsonUtils:

package com.tinyv.mr;

import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;

import com.alibaba.fastjson.JSON;

/**
 * JSON array parsing utility.
 */
public class JsonUtils {

    private static Logger logger = Logger.getLogger(JsonUtils.class.getName());

    /**
     * Parses a JSON array into a map with two entries:
     *   "keys"   - the union of field names found in the array elements
     *   "values" - one "\001"-joined line per array element
     *
     * Note: "keys" is a sorted TreeSet while each value line keeps the JSON
     * field order, so the two only line up when the JSON fields happen to be
     * in alphabetical order.
     *
     * @param arrayValue the raw JSON array string
     * @param arrayName  the array field's name (used as the key for plain string arrays)
     * @return the keys/values map (empty when the input is blank)
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public static Map<String, Collection<String>> parseArray(String arrayValue, String arrayName) {
        Map<String, Collection<String>> map = new LinkedHashMap<String, Collection<String>>();
        List<String> listvalues = new LinkedList<String>();
        Set<String> listkeys = new TreeSet<String>();
        if (StringUtils.isNotBlank(arrayValue)) {
            if (arrayValue.equalsIgnoreCase("[]")) {
                // empty array: nothing to do
            } else if (arrayValue.startsWith("[") && !arrayValue.contains(":")) {
                // a plain string array such as ["a","b"]: key it by the array's own name
                listvalues = JSON.parseArray(arrayValue, String.class);
                listkeys.add(arrayName);
            } else {
                // an array of objects: collect the union of field names and join
                // each object's values with "\001"
                List<LinkedHashMap> arr = JSON.parseArray(arrayValue, LinkedHashMap.class);
                for (LinkedHashMap linkedHashMap : arr) {
                    Set<String> keySet = linkedHashMap.keySet();
                    StringBuffer sb = new StringBuffer();
                    for (String m : keySet) {
                        listkeys.add(m);
                        sb.append(linkedHashMap.get(m).toString() + "\001");
                    }
                    String line = sb.toString();
                    if (line.endsWith("\001")) {
                        line = line.substring(0, line.length() - 1); // drop the trailing delimiter
                    }
                    listvalues.add(line);
                }
            }
            map.put("keys", listkeys);
            map.put("values", listvalues);
        }
        return map;
    }
}
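
To make the contract concrete, here is a minimal, hypothetical usage sketch (the class JsonUtilsDemo and the sample JSON are invented for illustration; fastjson must be on the classpath):

package com.tinyv.mr;

import java.util.Collection;
import java.util.Map;

public class JsonUtilsDemo {
    public static void main(String[] args) {
        // An array of objects yields the union of field names under "keys"
        // and one "\001"-joined line per element under "values".
        String json = "[{\"org\":\"bankA\",\"count\":\"2\"},{\"org\":\"bankB\",\"count\":\"5\"}]";
        Map<String, Collection<String>> parsed = JsonUtils.parseArray(json, "register_orgs_statistics");
        System.out.println(parsed.get("keys"));   // [count, org]  (TreeSet, sorted)
        System.out.println(parsed.get("values")); // [bankA\0012, bankB\0015]  ("\001" is an unprintable delimiter)
        // A plain string array is keyed by the array name instead:
        System.out.println(JsonUtils.parseArray("[\"x\",\"y\"]", "tags")); // {keys=[tags], values=[x, y]}
    }
}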

The multi-path output class:

package com.tinyv.mr;

import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.log4j.Logger;

/**
 * Multi-path output: extracts each embedded JSON array into its own table directory.
 */
public class MulOutput {

    private static Logger logger = Logger.getLogger(MulOutput.class.getName());

    /** Map-only task: one output directory per array field. */
    public static class MapClass extends Mapper<LongWritable, Text, NullWritable, Text> {

        private MultipleOutputs<NullWritable, Text> mos;  // multi-path writer
        private Map<String, Set<String>> tableMap;        // array name -> union of its field names
        private Map<String, Integer> arrayIndexes;        // array name -> column index in the input line
        private int primaryKeyIndex;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // runs once before the first map() call
            super.setup(context);
            mos = new MultipleOutputs<NullWritable, Text>(context);
            tableMap = new LinkedHashMap<String, Set<String>>();
            arrayIndexes = new HashMap<String, Integer>();
            // parse the job parameters once here instead of on every record
            primaryKeyIndex = Integer.parseInt(context.getConfiguration().get("primary_key_index"));
            String parseArrays = context.getConfiguration().get("parseArrays");
            if (StringUtils.isNotBlank(parseArrays)) {
                for (String array : parseArrays.split(",")) {
                    String[] parts = array.split("-"); // format: "name-index"
                    arrayIndexes.put(parts[0], Integer.parseInt(parts[1]));
                }
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // 1. read one input line and split on the agreed "\001" delimiter
            String[] splited = value.toString().split("\001");
            // 2. extract the unique mongo id (the source table's primary key)
            String mongouniqueid = splited[primaryKeyIndex];
            if (StringUtils.isNotBlank(mongouniqueid)) {
                // 3. for every configured array field, parse it and write one line per element
                for (String mapkey : arrayIndexes.keySet()) {
                    int arrayIndex = arrayIndexes.get(mapkey); // the array field's column index
                    String arrayValue = splited[arrayIndex];   // the raw JSON array at that column
                    Map<String, Collection<String>> parsed = JsonUtils.parseArray(arrayValue, mapkey);
                    List<String> arrayValues = (List<String>) parsed.get("values");
                    Set<String> clKeys = (Set<String>) parsed.get("keys");
                    // accumulate the field names seen for this array (used later to build the DDL)
                    Set<String> set = tableMap.get(mapkey);
                    if (set == null) {
                        set = new TreeSet<String>();
                    }
                    if (clKeys != null) {
                        set.addAll(clKeys);
                    }
                    tableMap.put(mapkey, set);
                    if (arrayValues != null && !arrayValues.isEmpty()) {
                        for (String str : arrayValues) {
                            // write "<primary key>\001<element line>" into the array's own directory
                            mos.write(NullWritable.get(), new Text(mongouniqueid + "\001" + str), generateFileName(mapkey));
                        }
                    }
                }
            }
        }

        /** A trailing "/" makes MultipleOutputs place each array's files in their own subdirectory. */
        private String generateFileName(String tableName) {
            return tableName + "/";
        }

        /**
         * Runs once at the end (setup -> map -> cleanup): builds the CREATE TABLE /
         * LOAD DATA statements for every array seen and closes the writer.
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
            logger.info("cleanup: " + tableMap);
            String dt = context.getConfiguration().get("dt");
            String table_name = context.getConfiguration().get("table_name");
            String out_path = context.getConfiguration().get("out_path");
            String path = null;
            StringBuffer sbb = new StringBuffer();
            sbb.append("create database if not exists ods_tinyv_outer_array;");
            sbb.append("\n\n");
            for (Entry<String, Set<String>> entry : tableMap.entrySet()) {
                String arrayName = entry.getKey();          // the array's name
                Set<String> fieldNames = entry.getValue();  // its field names
                if (!fieldNames.isEmpty()) {
                    StringBuffer sb = new StringBuffer();
                    sb.append("create table if not exists ods_tinyv_outer_array." + table_name + "_");
                    sb.append(arrayName + "(" + "\n");
                    sb.append("primary_key string," + "\n");
                    for (String field : fieldNames) {
                        sb.append(field + " string" + "," + "\n");
                    }
                    String s = sb.toString().substring(0, sb.toString().length() - 2); // drop the trailing ",\n"
                    sbb.append(s);
                    sbb.append(")" + "\n");
                    sbb.append("partitioned by(dt string comment 'partition date') row format delimited fields terminated by '\\001';");
                    path = out_path + "/" + table_name + "/dt=" + dt + "/" + arrayName;
                    sbb.append("\n\n");
                    sbb.append("load data inpath '" + path + "' overwrite into table ods_tinyv_outer_array."
                            + table_name + "_" + arrayName + " partition(dt='" + dt + "');");
                    sbb.append("\n\n\n");
                }
            }
            // the generated DDL; writing it to a local file only makes sense in local mode,
            // since on a cluster it would land on the task node's disk, so it is logged instead
            logger.info("generated DDL:\n" + sbb.toString());
            // FileUtils.write(new File("table/" + table_name + ".sql"), sbb.toString());
            mos.close();
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length != 6) {
            logger.error("Wrong number of arguments. Usage: hadoop jar xxx.jar"
                    + " [HDFS input database dir] [HDFS output database dir] [table name] [primary key index]"
                    + " [array names and indexes to parse, e.g. a-0,b-1] [partition date]");
            logger.error("Demo: hadoop jar xxxx.jar hdfs://192.168.174.128:9000/apps/hive/warehouse/ods_tinyv_outer.db/online"
                    + " hdfs://192.168.174.128:9000/apps/hive/warehouse/ods_tinyv_outer.db/out weibo_miguan_anti_fraud 0"
                    + " register_orgs_statistics-29,user_searched_history_by_orgs-30,blacklist_details-36,"
                    + "idcard_applied_in_orgs-37,idcard_with_other_names-38,idcard_with_other_phones-39,"
                    + "phone_with_other_idcards-40,phone_applied_in_orgs-41,phone_with_other_names-42 2017-03-30");
            System.exit(1);
        }
        String in_path = args[0];            // HDFS input database root, e.g. hdfs://.../ods_tinyv_outer.db/in
        String out_path = args[1];           // HDFS output database root, e.g. hdfs://.../ods_tinyv_outer.db/out
        String table_name = args[2];         // table to parse, e.g. weibo_miguan_anti_fraud
        String primary_key_index = args[3];  // index of the primary key column, e.g. 0
        String parseArrays = args[4];        // arrays to parse, e.g. blacklist_details-01,idcard_applied_in_orgs-02
        String dt = args[5];                 // partition date, e.g. 2017-03-30

        Configuration conf = new Configuration();
        conf.set("in_path", in_path);
        conf.set("out_path", out_path);
        conf.set("table_name", table_name);
        conf.set("primary_key_index", primary_key_index);
        conf.set("parseArrays", parseArrays);
        conf.set("dt", dt);

        Job job = Job.getInstance(conf, MulOutput.class.getName());
        Path real_input_path = new Path(in_path + "/" + table_name + "/dt=" + dt);
        Path real_out_path = new Path(out_path + "/" + table_name + "/dt=" + dt);
        FileSystem fileSystem = real_out_path.getFileSystem(conf); // the FileSystem for the output path
        if (fileSystem.exists(real_out_path)) {
            fileSystem.delete(real_out_path, true); // recursive: clear any previous output, even if non-empty
        }
        FileInputFormat.setInputPaths(job, real_input_path);
        FileOutputFormat.setOutputPath(job, real_out_path);
        // LazyOutputFormat keeps Hadoop from creating empty part files in the job's root output dir
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
        job.setJarByClass(MulOutput.class);
        job.setMapperClass(MapClass.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(0); // map-only job
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
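
To make the end-to-end behavior concrete, here is a sketch of a run and its results, using the placeholders from the usage message above (host, paths, and field indexes are illustrative, and exact part-file names vary by task number):

hadoop jar target/xxxxxxx.jar hdfs://192.168.174.128:9000/apps/hive/warehouse/ods_tinyv_outer.db/online hdfs://192.168.174.128:9000/apps/hive/warehouse/ods_tinyv_outer.db/out weibo_miguan_anti_fraud 0 blacklist_details-36,idcard_applied_in_orgs-37 2017-03-30

Because generateFileName() returns the array name with a trailing "/", MultipleOutputs appends its -m-xxxxx suffix inside a per-array subdirectory under the job's output path:

.../out/weibo_miguan_anti_fraud/dt=2017-03-30/blacklist_details/-m-00000
.../out/weibo_miguan_anti_fraud/dt=2017-03-30/idcard_applied_in_orgs/-m-00000

and cleanup() logs one CREATE TABLE / LOAD DATA pair per array, along these lines:

create table if not exists ods_tinyv_outer_array.weibo_miguan_anti_fraud_blacklist_details(
primary_key string,
...one string column per field seen...
)
partitioned by(dt string comment 'partition date') row format delimited fields terminated by '\001';

load data inpath '.../out/weibo_miguan_anti_fraud/dt=2017-03-30/blacklist_details' overwrite into table ods_tinyv_outer_array.weibo_miguan_anti_fraud_blacklist_details partition(dt='2017-03-30');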


References:

Hadoop multi-file output: MultipleOutputFormat and MultipleOutputs in depth (part 1): https://www.iteblog.com/archives/842.html
Hadoop multi-file output: MultipleOutputFormat and MultipleOutputs in depth (part 2): https://www.iteblog.com/archives/848.html





