Multi-Input-Path MapReduce: Complete Code Walkthrough



This is a complete MapReduce implementation. It covers:

1. Extracting the partition field (from the input file path);

2. Collecting multiple input paths;

3. A helper method that, given a date and a number of days, rolls back day by day and returns the corresponding list of dates (a sketch of items 2 and 3 follows this list);

4. Reading the arguments passed in when the jar is executed;

5. Setting the relevant job configuration parameters.
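
A minimal, self-contained sketch of how items 2 and 3 work together: roll back day by day from a starting date and register each partition directory that actually exists as a job input path. The table prefix, starting date, and class name below are illustrative placeholders, not the job's real values; the complete job code follows.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

    import java.text.SimpleDateFormat;
    import java.util.Calendar;

    public class MultiInputPathSketch {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf, "multi-input-path sketch");
            FileSystem fs = FileSystem.get(conf);

            String curDate = "20170405";                      // illustrative starting date (yyyyMMdd)
            String prefix = "/warehouse/some_table/pt_date="; // illustrative partition prefix
            SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
            Calendar cal = Calendar.getInstance();
            cal.setTime(sdf.parse(curDate));

            // Walk 30 days backwards; only partitions that exist become input paths.
            for (int i = 0; i < 30; i++) {
                Path p = new Path(prefix + sdf.format(cal.getTime()));
                if (fs.exists(p)) {
                    FileInputFormat.addInputPath(job, p);
                    System.out.println("added input path: " + p);
                }
                cal.add(Calendar.DATE, -1); // step one day back
            }
        }
    }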


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.FileInputStream;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.Properties;

/**
 * User: leen
 * Date: 2017/4/5
 * Time: 19:19
 */
public class test_dwd_ec_prod_info_cm {

    public static class MyMapper extends Mapper<Object, Text, Text, Text> {

        /**
         * Extract the partition date from the input file path.
         * @param filePath full path of the input split
         * @return the yyyyMMdd string following "pt_date=", or null if it is not present
         */
        public String getDateFromPath(String filePath) {
            if (filePath == null) {
                return null;
            }
            int index = filePath.indexOf("pt_date=");
            // "pt_date=" is 8 characters long and is followed by an 8-character date.
            if (index <= 0 || index + 16 > filePath.length()) {
                return null;
            }
            return filePath.substring(index + 8, index + 16);
        }

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Get the path of the file this record comes from.
            InputSplit inputSplit = context.getInputSplit();
            String filePath = ((FileSplit) inputSplit).getPath().toString();

            String[] arr = value.toString().split("\t");
            if (filePath.contains("dwd_ec_prod_info") && arr.length >= 20) {
                String ec_prd_cd          = arr[0];  // e-commerce product code
                String ec_shop_cd         = arr[1];  // e-commerce shop code
                String ec_seller_name     = arr[2];  // e-commerce seller name
                String ec_cat_cd          = arr[3];  // product category ID
                String ec_brand_name      = arr[4];  // brand name
                String prd_price          = arr[5];  // product price
                String title              = arr[6];  // title
                String indu_cd            = arr[7];  // industry code
                String prd_type_cd        = arr[8];  // industry type code
                String brand_cd           = arr[9];  // brand code
                String prd_cd             = arr[10]; // product code
                String prd_comment_count  = arr[11]; // total number of comments
                String url                = arr[12]; // product URL
                String imgurl             = arr[13]; // image URL
                String market_time        = arr[14]; // launch date
                String prd_sale_cnt       = arr[15]; // cumulative sales over 30 days
                String today_prd_sale_cnt = arr[16]; // sales for the current day
                String prd_sale_income    = arr[17]; // sales revenue for the current day
                String domain             = arr[18]; // site domain
                String para_config        = arr[19]; // configuration parameters
                String pt_date = getDateFromPath(filePath); // partition date

                // Keep the record if at least one of the code fields is populated
                // and the product code does not contain a dot.
                if ((!indu_cd.equalsIgnoreCase("null") || !prd_type_cd.equalsIgnoreCase("null") ||
                        !brand_cd.equalsIgnoreCase("null") || !prd_cd.equalsIgnoreCase("null")) && !ec_prd_cd.contains(".")) {
                    context.write(new Text(domain + "\t" + ec_prd_cd),
                            new Text(ec_shop_cd + "\t" + ec_seller_name + "\t" + ec_cat_cd + "\t" + ec_brand_name + "\t"
                                    + prd_price + "\t" + title + "\t" + indu_cd + "\t" + prd_type_cd + "\t" + brand_cd + "\t"
                                    + prd_cd + "\t" + prd_comment_count + "\t" + url + "\t" + imgurl + "\t" + market_time + "\t"
                                    + prd_sale_cnt + "\t" + today_prd_sale_cnt + "\t" + prd_sale_income + "\t" + domain + "\t"
                                    + para_config + "\t" + pt_date));
                }
            }
        }
    }

    public static class MyReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            String[] arrkeys = key.toString().split("\t");
            if (arrkeys.length == 2) {
                // The map output key is "domain \t ec_prd_cd", so the product code is the second field.
                String ec_prd_cd = arrkeys[1];
                String latest_content = null, latest_date = null;
                // Keep only the value with the most recent partition date.
                for (Text val : values) {
                    String[] arrvalues = val.toString().split("\t");
                    String pt_date = arrvalues[arrvalues.length - 1];
                    if (latest_date == null || latest_date.compareTo(pt_date) < 0) {
                        latest_date = pt_date;
                        StringBuilder content = new StringBuilder(arrvalues[0]);
                        for (int i = 1; i < arrvalues.length; i++) {
                            content.append("\t").append(arrvalues[i]);
                        }
                        latest_content = content.toString();
                    }
                }
                if (latest_date != null && latest_content != null) {
                    context.write(new Text(ec_prd_cd), new Text(latest_content));
                }
            }
        }
    }

    /**
     * Build the list of input paths, keeping only the partitions that exist on HDFS.
     * @param selected_dates partition dates to look for
     * @param fs HDFS file system handle
     * @return the existing partition directories
     * @throws IOException
     */
    public static ArrayList<Path> GenerateInputPaths(ArrayList<String> selected_dates, FileSystem fs) throws IOException {
        ArrayList<Path> inputPaths = new ArrayList<Path>();
        for (String str : selected_dates) {
            String pathStr = "/group/user/tools/meta/hive-temp-table/tools.db/dwd_ec_prod_info/pt_date=" + str;
            Path path = new Path(pathStr);
            if (fs.exists(path)) {
                inputPaths.add(path);
                FileStatus[] fileStatuses = fs.listStatus(path);
                System.out.println(fileStatuses.length + "\t" + path);
            }
        }
        return inputPaths;
    }

    /**
     * Given a date and a number of days K, roll back day by day and return the corresponding list of dates.
     * @param cur_date starting date, formatted as yyyyMMdd
     * @param K number of days to include
     * @return the K dates from cur_date backwards, formatted as yyyyMMdd
     * @throws ParseException
     */
    public static ArrayList<String> getPeriodDates(String cur_date, Integer K) throws ParseException {
        ArrayList<String> dates = new ArrayList<String>();
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
        Date cur = sdf.parse(cur_date);
        for (int i = 0; i < K; i++) {
            Calendar cal = Calendar.getInstance();
            cal.setTime(cur);
            cal.add(Calendar.DATE, -i);
            dates.add(sdf.format(cal.getTime()));
        }
        return dates;
    }

    /**
     * Configure and run the job.
     * @param args command-line arguments: the target date and the path of a properties file
     * @throws Exception
     */
    public static void Process(String[] args) throws Exception {
        System.out.println("---------------Process-->");
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.out.println("Please supply two arguments: <date> <properties file>");
            return;
        }
        String cur_date = otherArgs[0], prop_filePath = otherArgs[1];
        ArrayList<String> selected_dates = getPeriodDates(cur_date, 30);

        Properties prop = new Properties();
        prop.load(new FileInputStream(prop_filePath)); // load the properties file
        // Only start requesting resources for reduce tasks once the maps are 100% complete (the default is 0.05).
        conf.setInt("mapreduce.job.reduce.slowstart.completedmaps", 1);
        System.out.println("mapreduce.job.reduce.slowstart.completedmaps = "
                + conf.getStrings("mapreduce.job.reduce.slowstart.completedmaps", "1")[0]);

        Job job = Job.getInstance(conf, "test_dwd_ec_prod_info_cm - chl");
        job.setJarByClass(test_dwd_ec_prod_info_cm.class);
        job.setNumReduceTasks(100);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileSystem fs = FileSystem.get(conf);
        ArrayList<Path> inputPaths = GenerateInputPaths(selected_dates, fs);
        for (Path path : inputPaths) {
            FileInputFormat.addInputPath(job, path);
        }

        Path outputPath = new Path("/group/user/tools/meta/hive-temp-table/chenhaolin.db/dwd_ec_prod_info_for_cm/pt_date=" + cur_date);
        // If the output directory already exists, move it to the trash before running the job.
        if (fs.exists(outputPath)) {
            Trash trash = new Trash(conf);
            trash.moveToTrash(outputPath);
        }
        System.out.println("---------------setOutputPath-->");
        FileOutputFormat.setOutputPath(job, outputPath);
        job.waitForCompletion(true);
        fs.close();
    }

    // Entry point (assumed; the original post does not show a main method): delegate to Process.
    public static void main(String[] args) throws Exception {
        Process(args);
    }
}
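
To run the job, package the class into a jar and submit it (for example with hadoop jar) with two arguments: the target date in yyyyMMdd form and the path of a local properties file. GenericOptionsParser consumes any generic Hadoop options first, and Process checks that exactly two arguments remain. Note that mapreduce.job.reduce.slowstart.completedmaps is a fraction between 0 and 1 (default 0.05); setting it to 1, as above, keeps reducers from requesting containers until every map has finished, which avoids idle reducers holding resources at the cost of not overlapping the shuffle with the map phase.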

