Saving a Spark RDD into different folders by key
1. First, the RDD must be in (key, value) form. In this example the key is createTimeStr, a date in yyyy-MM-dd form.
import com.alibaba.fastjson.JSON
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

// ds is assumed to be a DStream[(String, String)], e.g. from Kafka,
// where x._2 is the raw JSON record.
ds.map(x => {
  val jsonObject = JSON.parseObject(x._2)
  // createTimeStr looks like "2017-07-18 14:16:13"; keep only the date part
  val createTimeStr = jsonObject.getString("createTimeStr").split(" ")(0)
  (createTimeStr, x._2)
}).foreachRDD((rdd, time) => {
  if (!rdd.isEmpty()) {
    // saves to /tmp/weibo/dt=2017-07-18/part-0, etc.
    rdd.saveAsHadoopFile("hdfs://192.168.137.100:9000/tmp/weibo",
      classOf[String], classOf[String], classOf[RDDMultipleTextOutputFormat[_, _]])
    // prefix each output file with the batch time so later batches don't overwrite it
    val fileSystem: FileSystem = FileSystem.get(rdd.sparkContext.hadoopConfiguration)
    val arr: Array[FileStatus] = fileSystem.listStatus(new Path("hdfs://192.168.137.100:9000/tmp/weibo/"))
    for (fs <- arr if fs.getPath.getName != "_SUCCESS") {
      if (fs.isDirectory()) {
        for (cfs <- fileSystem.listStatus(fs.getPath)) {
          val parentName = cfs.getPath.getParent.getName
          // e.g. /tmp/weibo/dt=2017-07-18/part-0
          val newPath = new Path("hdfs://192.168.137.100:9000/tmp/weibo" +
            "/" + parentName + "/" + time.milliseconds + "-" + cfs.getPath.getName)
          fileSystem.rename(cfs.getPath, newPath)
        }
      }
    }
    // fileSystem.delete(new Path("hdfs://192.168.137.100:9000/tmp/weibo/"), true)
  }
})
The key call is rdd.saveAsHadoopFile(...) with the custom RDDMultipleTextOutputFormat in the snippet above.
The files are saved by key on HDFS under /tmp/weibo/dt=2017-07-18/part-m-00000. If there are several distinct keys, records go into the corresponding folders, e.g. /tmp/weibo/dt=2017-07-19/part-m-00000.
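The same approach also works outside Spark Streaming. Here is a minimal batch sketch under the same assumptions (the two sample records are made up for illustration; RDDMultipleTextOutputFormat is the class defined in step 2):

import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("partition-by-key"))
// (date, raw record) pairs; in the real job these come from parsing the JSON
val pairs = sc.parallelize(Seq(
  ("2017-07-18", """{"createTimeStr":"2017-07-18 14:16:13","name":"zhangsan"}"""),
  ("2017-07-19", """{"createTimeStr":"2017-07-19 09:01:02","name":"lisi"}""")
))
// each key lands in its own dt=<key> subdirectory
pairs.saveAsHadoopFile("hdfs://192.168.137.100:9000/tmp/weibo",
  classOf[String], classOf[String], classOf[RDDMultipleTextOutputFormat[_, _]])
sc.stop()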
2. The core piece is RDDMultipleTextOutputFormat:
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.mapred.{JobConf, RecordWriter}
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
import org.apache.hadoop.util.Progressable

class RDDMultipleTextOutputFormat[K, V] extends MultipleTextOutputFormat[K, V] {
  // Build the relative output path for each record: dt=<key>/<task file name>
  override def generateFileNameForKeyValue(key: K, value: V, name: String): String = {
    "dt=" + key + "/" + name
  }

  // Delegate the actual writing to MyTextOutputFormat (shown below),
  // which writes only the value
  override def getBaseRecordWriter(fs: FileSystem, job: JobConf,
                                   name: String, arg3: Progressable): RecordWriter[K, V] = {
    val theTextOutputFormat = new MyTextOutputFormat[K, V]()
    theTextOutputFormat.getRecordWriter(fs, job, name, arg3)
  }
}
The generateFileNameForKeyValue method produces the folder and file name; in this example it generates names like dt=2017-07-18/part-m-00000 (name is part-m-00000, key is 2017-07-18).
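To make the mapping concrete, a small sketch (the task file name "part-00000" is a stand-in for whatever Hadoop passes in; the Scala override above is public, so it can be called directly):

val fmt = new RDDMultipleTextOutputFormat[String, String]()
println(fmt.generateFileNameForKeyValue("2017-07-18", "ignored", "part-00000"))
// dt=2017-07-18/part-00000
println(fmt.generateFileNameForKeyValue("2017-07-19", "ignored", "part-00000"))
// dt=2017-07-19/part-00000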
Normally this alone would be enough: files are saved into different directories by key. But the file contents may not be what you want. Open one of the files and you will see:
2017-07-18 {"createTimeStr":"2017-07-18 14:16:13","name":"zhangsan"}
2017-07-18 {"createTimeStr":"2017-07-18 14:16:13","name":"zhangsan"}
Sometimes we don't want the leading date; after all, the original record is {"createTimeStr":"2017-07-18 14:16:13","name":"zhangsan"}, and we only want to route the raw records into different folders based on the createTimeStr field. This is where overriding the getBaseRecordWriter method comes in.
The original getBaseRecordWriter looks like this:
@InterfaceAudience.Public
@InterfaceStability.Stable
public class MultipleTextOutputFormat<K, V>
    extends MultipleOutputFormat<K, V> {

  private TextOutputFormat<K, V> theTextOutputFormat = null;

  @Override
  protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job,
      String name, Progressable arg3) throws IOException {
    if (theTextOutputFormat == null) {
      theTextOutputFormat = new TextOutputFormat<K, V>();
    }
    return theTextOutputFormat.getRecordWriter(fs, job, name, arg3);
  }
}
As you can see, it creates a TextOutputFormat and obtains a RecordWriter from it. Now let's look at the TextOutputFormat class:
public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> {

  // Static inner class LineRecordWriter implements RecordWriter.
  // This is the part we want to change.
  protected static class LineRecordWriter<K, V>
      implements RecordWriter<K, V> {
    private static final String utf8 = "UTF-8";
    private static final byte[] newline; // each record ends with \n
    static {
      try {
        newline = "\n".getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
      this.out = out;
      try {
        this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    public LineRecordWriter(DataOutputStream out) {
      this(out, "\t");
    }

    /**
     * Write the object to the byte stream, handling Text as a special
     * case.
     * @param o the object to print
     * @throws IOException if the write throws, we pass it on
     */
    private void writeObject(Object o) throws IOException {
      if (o instanceof Text) {
        Text to = (Text) o;
        out.write(to.getBytes(), 0, to.getLength());
      } else {
        out.write(o.toString().getBytes(utf8));
      }
    }

    // The core method: this is what writes key and value into the HDFS file
    public synchronized void write(K key, V value)
        throws IOException {
      boolean nullKey = key == null || key instanceof NullWritable;
      boolean nullValue = value == null || value instanceof NullWritable;
      if (nullKey && nullValue) { // skip the record if both key and value are null/NullWritable
        return;
      }
      if (!nullKey) {
        writeObject(key);             // write the key
      }
      if (!(nullKey || nullValue)) {
        out.write(keyValueSeparator); // write the separator, \t by default
      }
      if (!nullValue) {
        writeObject(value);           // write the value
      }
      out.write(newline);             // write the newline
    }

    public synchronized void close(Reporter reporter) throws IOException {
      out.close();
    }
  }

  public RecordWriter<K, V> getRecordWriter(FileSystem ignored,
                                            JobConf job,
                                            String name,
                                            Progressable progress)
      throws IOException {
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = job.get("mapreduce.output.textoutputformat.separator",
                                       "\t");
    if (!isCompressed) {
      Path file = FileOutputFormat.getTaskOutputPath(job, name);
      FileSystem fs = file.getFileSystem(job);
      FSDataOutputStream fileOut = fs.create(file, progress);
      return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
      Class<? extends CompressionCodec> codecClass =
          getOutputCompressorClass(job, GzipCodec.class);
      // create the named codec
      CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
      // build the filename including the extension
      Path file =
          FileOutputFormat.getTaskOutputPath(job,
              name + codec.getDefaultExtension());
      FileSystem fs = file.getFileSystem(job);
      FSDataOutputStream fileOut = fs.create(file, progress);
      return new LineRecordWriter<K, V>(new DataOutputStream
          (codec.createOutputStream(fileOut)),
          keyValueSeparator);
    }
  }
}
Modeled on TextOutputFormat, we create a new class MyTextOutputFormat that extends TextOutputFormat. Inside it, LineRecordWriter becomes MyLineRecordWriter with a rewritten write method, and getRecordWriter returns a MyLineRecordWriter:
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.ReflectionUtils;

public class MyTextOutputFormat<K, V> extends TextOutputFormat<K, V> {

  protected static class MyLineRecordWriter<K, V>
      implements RecordWriter<K, V> {
    private static final String utf8 = "UTF-8";
    private static final byte[] newline;
    static {
      try {
        newline = "\n".getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public MyLineRecordWriter(DataOutputStream out, String keyValueSeparator) {
      this.out = out;
      try {
        this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    public MyLineRecordWriter(DataOutputStream out) {
      this(out, "\t");
    }

    /**
     * Write the object to the byte stream, handling Text as a special
     * case.
     * @param o the object to print
     * @throws IOException if the write throws, we pass it on
     */
    private void writeObject(Object o) throws IOException {
      if (o instanceof Text) {
        Text to = (Text) o;
        out.write(to.getBytes(), 0, to.getLength());
      } else {
        out.write(o.toString().getBytes(utf8));
      }
    }

    public synchronized void write(K key, V value)
        throws IOException {
      boolean nullKey = key == null || key instanceof NullWritable;
      boolean nullValue = value == null || value instanceof NullWritable;
      if (nullKey && nullValue) {
        return;
      }
      // The key-writing block from LineRecordWriter is removed,
      // so only the raw value ends up in the file:
      // if (!nullKey) {
      //   writeObject(key);
      // }
      // if (!(nullKey || nullValue)) {
      //   out.write(keyValueSeparator);
      // }
      if (!nullValue) {
        writeObject(value);
      }
      out.write(newline);
    }

    public synchronized void close(Reporter reporter) throws IOException {
      out.close();
    }
  }

  public RecordWriter<K, V> getRecordWriter(FileSystem ignored,
                                            JobConf job,
                                            String name,
                                            Progressable progress)
      throws IOException {
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = job.get("mapreduce.output.textoutputformat.separator",
                                       "\t");
    if (!isCompressed) {
      Path file = FileOutputFormat.getTaskOutputPath(job, name);
      FileSystem fs = file.getFileSystem(job);
      FSDataOutputStream fileOut = fs.create(file, progress);
      return new MyLineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
      Class<? extends CompressionCodec> codecClass =
          getOutputCompressorClass(job, GzipCodec.class);
      // create the named codec
      CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
      // build the filename including the extension
      Path file =
          FileOutputFormat.getTaskOutputPath(job,
              name + codec.getDefaultExtension());
      FileSystem fs = file.getFileSystem(job);
      FSDataOutputStream fileOut = fs.create(file, progress);
      return new MyLineRecordWriter<K, V>(new DataOutputStream
          (codec.createOutputStream(fileOut)),
          keyValueSeparator);
    }
  }
}
Finally, in the RDDMultipleTextOutputFormat class, getBaseRecordWriter is overridden to use it:
override def getBaseRecordWriter(fs: FileSystem, job: JobConf,
                                 name: String, arg3: Progressable): RecordWriter[K, V] = {
  val theTextOutputFormat = new MyTextOutputFormat[K, V]()
  theTextOutputFormat.getRecordWriter(fs, job, name, arg3)
}
And with that, we're done: raw records are routed into per-key folders, with no key prefix in the file contents.
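As a quick sanity check, one can read back one of the dated folders; a hedged sketch, assuming sc is the active SparkContext and using the path and date from the example above:

// each line should now be a raw JSON record, with no date prefix
val lines = sc.textFile("hdfs://192.168.137.100:9000/tmp/weibo/dt=2017-07-18")
lines.take(5).foreach(println)
// {"createTimeStr":"2017-07-18 14:16:13","name":"zhangsan"}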