spark rdd根据key保存进不同的文件夹

来源:互联网 发布:内存优化级别禁用好吗 编辑:程序博客网 时间:2024/06/03 19:43
1、首先rdd必须是(key,value)形式。本例中是根据createTimeStr作为key. 这个值是yyyy-MM-dd形式

// Streaming job: extract the record's creation date (yyyy-MM-dd) as the key,
// then write each batch to HDFS partitioned by that key
// (one directory per date, e.g. /tmp/weibo/dt=2017-07-18/).
// NOTE(review): relies on `ds` (presumably a DStream of (_, jsonString)) and
// `sparkContext` being in scope — neither is shown in this snippet; confirm against caller.
val mrdd = ds.map(x => {
        val jsonObject = JSON.parseObject(x._2)
        // createTimeStr looks like "2017-07-18 14:16:13"; split on the space
        // and keep only the date portion as the partitioning key.
        val createTimeStr = jsonObject.getString("createTimeStr").split(" ")(0)
        println((createTimeStr, x._2))
        (createTimeStr, x._2)
      }).foreachRDD((rdd, time) => {

        if(!rdd.isEmpty()){
            // Produces files like /tmp/weibo/dt=2017-07-18/part-0 via the
            // custom RDDMultipleTextOutputFormat (keyed subdirectories).
            rdd.saveAsHadoopFile("hdfs://192.168.137.100:9000/tmp/weibo", classOf[String], classOf[String], classOf[RDDMultipleTextOutputFormat[_, _]])
            val fileSystem: FileSystem = FileSystem.get(sparkContext.hadoopConfiguration)
            val arr: Array[FileStatus] = fileSystem.listStatus(new Path("hdfs://192.168.137.100:9000/tmp/weibo/"))
            // Rename every part file under each dt=... directory, prefixing the
            // batch timestamp so later batches do not overwrite earlier output.
            for (fs <- arr if fs.getPath.getName != "_SUCCESS") {
              if(fs.isDirectory()){
                for(cfs <- fileSystem.listStatus(fs.getPath)){
                  val fileName = cfs.getPath.getName
                  val parentName = cfs.getPath.getParent.getName
                  // e.g. /tmp/weibo/dt=2017-07-18/<batchMillis>-part-0
                  val newPath = new Path("hdfs://192.168.137.100:9000/tmp/weibo" + "/" + parentName + "/" + time.milliseconds + "-" + cfs.getPath.getName)
                  fileSystem.rename(cfs.getPath, newPath)
                }
              }
            }
            //fileSystem.delete(new Path("hdfs://192.168.137.100:9000/tmp/weibo/"), true)
        }
    })
核心方法是上面的 saveAsHadoopFile 调用(配合自定义的 RDDMultipleTextOutputFormat)。
文件会根据key保存到hdfs上的/tmp/weibo/dt=2017-07-18/part-m-00000中。如果key有多种,就保存到其他文件夹中,比如/tmp/weibo/dt=2017-07-19/part-m-00000

2、核心代码是 RDDMultipleTextOutputFormat 类,定义如下:


class RDDMultipleTextOutputFormat[K, V] extends MultipleTextOutputFormat[K, V] {

  /** Routes each record into a per-key subdirectory: "dt=&lt;key&gt;/&lt;leaf file name&gt;". */
  override def generateFileNameForKeyValue(key: K, value: V, name: String): String =
    s"dt=$key/$name"

  /** Delegates record writing to MyTextOutputFormat, which writes only the value. */
  override def getBaseRecordWriter(fs: FileSystem, job: JobConf,
       name: String, arg3: Progressable): RecordWriter[K, V] = {
    val valueOnlyFormat = new MyTextOutputFormat[K, V]()
    valueOnlyFormat.getRecordWriter(fs, job, name, arg3)
  }
}

generateFileNameForKeyValue这个方法生成文件夹和文件名,本例中就是生成dt=2017-07-18/part-m-00000这种文件名。(name是part-m-00000,key是2017-07-18)
正常情况下,这样写就可以了。文件会根据key保存到不同的目录。但是文件内容可能不是想要的。
打开其中一个文件,内容如下

2017-07-18    {"createTimeStr":"2017-07-18 14:16:13","name":"zhangsan"}
2017-07-18    {"createTimeStr":"2017-07-18 14:16:13","name":"zhangsan"}

有时候我们可能不想要前面的日期,毕竟原始数据是{"createTimeStr":"2017-07-18 14:16:13","name":"zhangsan"}。我们只是想根据createTimeStr字段,把原始数据保存到不同的文件夹。
这个时候就需要重写 getBaseRecordWriter 方法。
原始的getBaseRecordWriter方法如下

@InterfaceAudience.Public
@InterfaceStability.Stable
public class MultipleTextOutputFormat<K, V>
    extends MultipleOutputFormat<K, V> {

  // Lazily-created delegate that performs the actual line-oriented writing.
  private TextOutputFormat<K, V> theTextOutputFormat = null;

  /**
   * Returns a RecordWriter backed by a single shared TextOutputFormat instance.
   * (Quoted verbatim from Hadoop's source for reference.)
   */
  @Override
  protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job,
      String name, Progressable arg3) throws IOException {
    if (theTextOutputFormat == null) {
      theTextOutputFormat = new TextOutputFormat<K, V>();
    }
    return theTextOutputFormat.getRecordWriter(fs, job, name, arg3);
  }
}

可以看到,是创建了TextOutputFormat对象,并从对象中得到RecordWriter对象。
我们再看一下TextOutputFormat类

public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> {
     // Static inner class LineRecordWriter implements RecordWriter.
     // This is the part we want to customize.
  protected static class LineRecordWriter<K, V>
    implements RecordWriter<K, V> {
    private static final String utf8 = "UTF-8";
    private static final byte[] newline;  // line terminator, the "\n" character
    static {
      try {
        newline = "\n".getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
      this.out = out;
      try {
        this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
      } catch (UnsupportedEncodingException uee) {
        throw new IllegalArgumentException("can't find " + utf8 + " encoding");
      }
    }

    public LineRecordWriter(DataOutputStream out) {
      this(out, "\t");
    }

    /**
     * Write the object to the byte stream, handling Text as a special
     * case.
     * @param o the object to print
     * @throws IOException if the write throws, we pass it on
     */
    private void writeObject(Object o) throws IOException {
      if (o instanceof Text) {
        Text to = (Text) o;
        out.write(to.getBytes(), 0, to.getLength());
      } else {
        out.write(o.toString().getBytes(utf8));
      }
    }
     // Core method: writes the key and value to the HDFS file.
    public synchronized void write(K key, V value)
      throws IOException {

      boolean nullKey = key == null || key instanceof NullWritable;
      boolean nullValue = value == null || value instanceof NullWritable;
      if (nullKey && nullValue) {// nothing to write if both key and value are null/NullWritable
        return;
      }
      if (!nullKey) {
        writeObject(key);// write the key
      }
      if (!(nullKey || nullValue)) {
        out.write(keyValueSeparator);// write the separator, "\t" by default
      }
      if (!nullValue) {
        writeObject(value); // write the value
      }
      out.write(newline); // write the line terminator
    }

    public synchronized void close(Reporter reporter) throws IOException {
      out.close();
    }
  }

  public RecordWriter<K, V> getRecordWriter(FileSystem ignored,
                                                  JobConf job,
                                                  String name,
                                                  Progressable progress)
    throws IOException {
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = job.get("mapreduce.output.textoutputformat.separator",
                                       "\t");
    if (!isCompressed) {
      Path file = FileOutputFormat.getTaskOutputPath(job, name);
      FileSystem fs = file.getFileSystem(job);
      FSDataOutputStream fileOut = fs.create(file, progress);
      return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
      Class<? extends CompressionCodec> codecClass =
        getOutputCompressorClass(job, GzipCodec.class);
      // create the named codec
      CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
      // build the filename including the extension
      Path file =
        FileOutputFormat.getTaskOutputPath(job,
                                           name + codec.getDefaultExtension());
      FileSystem fs = file.getFileSystem(job);
      FSDataOutputStream fileOut = fs.create(file, progress);
      return new LineRecordWriter<K, V>(new DataOutputStream
                                        (codec.createOutputStream(fileOut)),
                                        keyValueSeparator);
    }
  }
}

根据TextOutputFormat,我们新建一个类MyTextOutputFormat,继承TextOutputFormat,在MyTextOutputFormat中,把LineRecordWriter改成MyLineRecordWriter。并改写write方法。
在getRecordWriter中,得到MyLineRecordWriter方法

/**
 * TextOutputFormat variant whose record writer emits ONLY the value, never the
 * key or the key/value separator. Used so raw JSON records land in per-date
 * directories without a leading date column.
 */
public class MyTextOutputFormat<K, V> extends TextOutputFormat<K, V>{

       // Copy of TextOutputFormat.LineRecordWriter with the key-writing
       // statements deliberately commented out (see write() below).
       protected static class MyLineRecordWriter<K, V>
         implements RecordWriter<K, V> {
         private static final String utf8 = "UTF-8";
         private static final byte[] newline;
         static {
           try {
             newline = "\n".getBytes(utf8);
           } catch (UnsupportedEncodingException uee) {
             throw new IllegalArgumentException("can't find " + utf8 + " encoding");
           }
         }

         protected DataOutputStream out;
         private final byte[] keyValueSeparator;

         public MyLineRecordWriter(DataOutputStream out, String keyValueSeparator) {
           this.out = out;
           try {
             this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
           } catch (UnsupportedEncodingException uee) {
             throw new IllegalArgumentException("can't find " + utf8 + " encoding");
           }
         }

         public MyLineRecordWriter(DataOutputStream out) {
           this(out, "\t");
         }

         /**
          * Write the object to the byte stream, handling Text as a special
          * case.
          * @param o the object to print
          * @throws IOException if the write throws, we pass it on
          */
         private void writeObject(Object o) throws IOException {
           if (o instanceof Text) {
             Text to = (Text) o;
             out.write(to.getBytes(), 0, to.getLength());
           } else {
             out.write(o.toString().getBytes(utf8));
           }
         }

         // Writes only the value followed by a newline. The key and separator
         // writes are intentionally disabled below.
         // NOTE(review): when the key is non-null but the value is null, this
         // still writes a bare newline (an empty output line) — confirm that
         // is acceptable for the data being saved.
         public synchronized void write(K key, V value)
           throws IOException {

           boolean nullKey = key == null || key instanceof NullWritable;
           boolean nullValue = value == null || value instanceof NullWritable;
           if (nullKey && nullValue) {
             return;
           }
// Intentionally disabled: do not write the key or the separator,
// so the output file contains only the original value (raw JSON).
//         if (!nullKey) {
//           writeObject(key);
//         }
//         if (!(nullKey || nullValue)) {
//           out.write(keyValueSeparator);
//         }
           if (!nullValue) {
             writeObject(value);
           }
           out.write(newline);
         }

         public synchronized void close(Reporter reporter) throws IOException {
           out.close();
         }
       }

       // Identical to TextOutputFormat.getRecordWriter except that it returns
       // a MyLineRecordWriter (value-only) instead of a LineRecordWriter.
       public RecordWriter<K, V> getRecordWriter(FileSystem ignored,
                                                       JobConf job,
                                                       String name,
                                                       Progressable progress)
         throws IOException {
         boolean isCompressed = getCompressOutput(job);
         String keyValueSeparator = job.get("mapreduce.output.textoutputformat.separator",
                                            "\t");
         if (!isCompressed) {
           Path file = FileOutputFormat.getTaskOutputPath(job, name);
           FileSystem fs = file.getFileSystem(job);
           FSDataOutputStream fileOut = fs.create(file, progress);
           return new MyLineRecordWriter<K, V>(fileOut, keyValueSeparator);
         } else {
           Class<? extends CompressionCodec> codecClass =
             getOutputCompressorClass(job, GzipCodec.class);
           // create the named codec
           CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
           // build the filename including the extension
           Path file =
             FileOutputFormat.getTaskOutputPath(job,
                                                name + codec.getDefaultExtension());
           FileSystem fs = file.getFileSystem(job);
           FSDataOutputStream fileOut = fs.create(file, progress);
           return new MyLineRecordWriter<K, V>(new DataOutputStream
                                             (codec.createOutputStream(fileOut)),
                                             keyValueSeparator);
         }
       }
}

在RDDMultipleTextOutputFormat类中,重写getBaseRecordWriter方法
/** Replaces the default writer with MyTextOutputFormat's value-only writer. */
override def getBaseRecordWriter(fs: FileSystem, job: JobConf,
     name: String, arg3: Progressable): RecordWriter[K, V] = {
  val valueOnlyFormat = new MyTextOutputFormat[K, V]()
  valueOnlyFormat.getRecordWriter(fs, job, name, arg3)
}

大功告成。
原创粉丝点击