spark多文件输出

来源:互联网 发布:ubuntu查看gpu 编辑:程序博客网 时间:2024/06/05 22:49

1.因为spark是用hadoop的api进行输出的,MultipleOutputFormat是hadoop用于支持多文件输出的,所以自定义一个MultipleOutputFormat类

import java.io.IOException;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.mapred.JobConf;import org.apache.hadoop.mapred.RecordWriter;import org.apache.hadoop.mapred.TextOutputFormat;import org.apache.hadoop.mapred.lib.MultipleOutputFormat;import org.apache.hadoop.util.Progressable;public class MyMultipleOutput extends MultipleOutputFormat<String, String>{ private TextOutputFormat<String, String> output = null;@Overrideprotected String generateFileNameForKeyValue(String key, String value,String name) {String[] split = key.split(",");String device=split[0];String year=split[1].substring(0,4);String month=split[1].substring(5,7);String day=split[1].substring(8,10);/*name是reduce任务默认文件名,注意如果这里返回的文件名不追加name,*就会造成多个reduce获取到的文件名都是day,多个reduce写一个文件,文件内容只会有一个reduce输出的内容*/return device+"/"+year+"/"+month+"/"+day+"/"+name;}@Overrideprotected RecordWriter<String, String> getBaseRecordWriter(FileSystem fs,JobConf job, String name, Progressable arg3) throws IOException {if (output == null) {          output = new TextOutputFormat<String, String>();        }        return output.getRecordWriter(fs, job, name, arg3);}}

2.使用pair类型RDD的saveAsHadoopFile方法进行输出

// Write the pair RDD through the custom output format. The key/value classes
// must match MyMultipleOutput's type parameters <String, String>: the class's
// generateFileNameForKeyValue(String key, ...) calls key.split(","), so the
// key written out must be a String. Passing NullWritable.class here (as the
// original snippet did) contradicts that and would fail at write time.
deviceDateKeyPair.saveAsHadoopFile("d://multioutput/", String.class, String.class, MyMultipleOutput.class);




0 0