hadoop 自学指南六之IO /HDFS 操作API

来源：互联网发布：板绘软件编辑：程序博客网时间：2024/05/18 00:03

一、前言

I/O 相关的类位于 org.apache.hadoop.io.* 包中，以下介绍一些常用的 HDFS API 操作。

二、HDFS API

package hadoop.utils;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

/**
 * Static helpers demonstrating common HDFS operations through the Hadoop
 * {@link FileSystem} API: write, read, delete, upload and directory listing.
 *
 * @author : chenhaipeng
 * @date : 2015-08-21 01:02:26
 */
public class HDFSUtils {

    /**
     * Creates the given file and writes {@code words} into it, encoded as UTF-8.
     *
     * @param file  URI of the file to create, e.g. {@code hdfs://host:port/path}
     * @param words text to write
     * @throws IOException        if the file cannot be created, written or closed
     * @throws URISyntaxException kept in the signature for backward compatibility
     */
    public static void WriteToHDFS(String file, String words) throws IOException, URISyntaxException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(file), conf);
        Path path = new Path(file);

        FSDataOutputStream out = fs.create(path); // create the file
        try {
            // writeBytes(String) would also work, but it keeps only the low byte of
            // every char; writing explicitly encoded bytes preserves non-ASCII text.
            out.write(words.getBytes("UTF-8"));
            // Close on the success path so a failure while closing is reported.
            out.close();
            out = null;
        } finally {
            // Only does work if write/close failed above; never masks that exception.
            IOUtils.closeStream(out);
        }
        // To write from an input stream, or to copy one file into another (opening
        // the existing file as an input stream), IOUtils.copyBytes can be used:
        //   FSDataInputStream in = fs.open(new Path(args[0]));
        //   IOUtils.copyBytes(in, out, 4096, true); // 4096 = buffer size, true = close streams when done
    }

    /**
     * Copies the content of the given file to standard output.
     *
     * @param file URI of the file to read
     * @throws IOException if the file cannot be opened or read
     */
    public static void ReadFromHDFS(String file) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(file), conf);
        Path path = new Path(file);

        FSDataInputStream in = null;
        try {
            in = fs.open(path);
            // close=false: with close=true copyBytes would also close System.out,
            // silencing every later print of the JVM. The input stream is closed below.
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
        // Alternative: read the whole file into a byte array and return it, e.g.
        //   FileStatus stat = fs.getFileStatus(path);
        //   byte[] buffer = new byte[Integer.parseInt(String.valueOf(stat.getLen()))];
        //   in.readFully(0, buffer);
        //   in.close();
        //   return buffer;
    }

    /**
     * Deletes the given path; a directory is deleted recursively.
     *
     * @param file URI of the file or directory to delete
     * @throws IOException if the delete or the file-system close fails
     */
    public static void DeleteHDFSFile(String file) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(file), conf);
        try {
            Path path = new Path(file);
            // FileSystem offers several delete variants: deleteOnExit removes the path
            // when the JVM exits; delete(path, true) recurses when path is a directory.
            fs.delete(path, true);
        } finally {
            // Close the file system even when the delete fails.
            fs.close();
        }
    }

    /**
     * Uploads a local file to the destination file system.
     *
     * @param src local source path
     * @param dst destination URI
     * @throws IOException if the copy or the file-system close fails
     */
    public static void UploadLocalFileHDFS(String src, String dst) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(dst), conf);
        try {
            Path pathDst = new Path(dst);
            Path pathSrc = new Path(src);
            fs.copyFromLocalFile(pathSrc, pathDst);
        } finally {
            // Close the file system even when the copy fails.
            fs.close();
        }
    }

    /**
     * Prints every entry of the given directory to standard output. The listing is
     * printed twice on purpose, to demonstrate two equivalent ways of obtaining paths.
     *
     * @param DirFile URI of the directory to list
     * @throws IOException if the directory cannot be listed
     */
    public static void ListDirAll(String DirFile) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(DirFile), conf);
        Path path = new Path(DirFile);
        FileStatus[] status = fs.listStatus(path);

        // Approach 1: take the path from each FileStatus.
        for (FileStatus f : status) {
            System.out.println(f.getPath().toString());
        }

        // Approach 2: convert the whole status array to paths first.
        Path[] listedPaths = FileUtil.stat2Paths(status);
        for (Path p : listedPaths) {
            System.out.println(p.toString());
        }
    }

    /**
     * Demo driver: lists a directory, writes a file, prints it, deletes it and
     * finally uploads a local file to the same location.
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        // List every entry under the directory.
        ListDirAll("hdfs://192.168.100.150:9000/user");

        String fileWrite = "hdfs://192.168.100.150:9000/user/readme2.txt";
        String words = "This words is to write into file!\n";
        WriteToHDFS(fileWrite, words);

        // Read the file just written and show it on the terminal.
        ReadFromHDFS(fileWrite);

        // Delete the file written above.
        DeleteHDFSFile(fileWrite);

        // Assuming the local file exists, upload it to HDFS.
        String localFile = "c:/2015-04-10.txt";
        UploadLocalFileHDFS(localFile, fileWrite);
    }
}

三、Hadoop 压缩

数据压缩能带来相当大的好处。Hadoop 支持的压缩 codec 如下（常见的有 DEFLATE、gzip、bzip2、LZO、LZ4、Snappy）：

其中：bzip2 支持切分（splittable），其他格式不支持

/**
 * Decompresses a file using the codec inferred from its file extension. The
 * output is written next to the input, with the codec's default extension
 * removed from the name.
 *
 * @author : chenhaipeng
 * @date : 2015-09-20 19:48:15
 */
public class FileDecompressor {

    /**
     * @param args {@code args[0]} is the URI of the compressed file
     * @throws Exception if the file cannot be opened, decoded or written
     */
    public static void main(String[] args) throws Exception {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 1) {
            System.err.println("Usage: FileDecompressor <uri>");
            System.exit(1);
            return;
        }

        String uri = args[0];
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);

        Path inputPath = new Path(uri);
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        // The factory picks the codec from the path's file extension.
        CompressionCodec codec = factory.getCodec(inputPath);
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            System.exit(1);
        }

        // Output name = input name without the codec's extension.
        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());

        InputStream in = null;
        OutputStream out = null;
        try {
            in = codec.createInputStream(fs.open(inputPath));
            out = fs.create(new Path(outputUri));
            IOUtils.copyBytes(in, out, conf);
        } finally {
            IOUtils.closeStream(in);
            IOUtils.closeStream(out);
        }
    }
}

tip：例如

在 WordCount 作业中对输出启用压缩时，可以这样设置：

// Selects GzipCodec as the compressor class for the job's output files.
// NOTE(review): `job` is presumably the Job built in the WordCount driver, which is not shown here — confirm.
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

四、MapReduce 的格式与类型

自定义数据类型

import java.io.*;
import org.apache.hadoop.io.*;

/**
 * A pair of {@code int} values implementing {@link WritableComparable}.
 * Pairs are ordered by the first value, then by the second.
 */
public class IntPair implements WritableComparable<IntPair> {

  private int first;
  private int second;

  /** Creates a pair with both fields left at their default value. */
  public IntPair() {
  }

  /** Creates a pair holding the two given values. */
  public IntPair(int first, int second) {
    set(first, second);
  }

  /** Replaces both values of this pair. */
  public void set(int first, int second) {
    this.first = first;
    this.second = second;
  }

  /** @return the first value */
  public int getFirst() {
    return first;
  }

  /** @return the second value */
  public int getSecond() {
    return second;
  }

  /** Writes the first value and then the second value as ints. */
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeInt(first);
    out.writeInt(second);
  }

  /** Reads the two ints back in the order {@link #write} emitted them. */
  @Override
  public void readFields(DataInput in) throws IOException {
    first = in.readInt();
    second = in.readInt();
  }

  @Override
  public int hashCode() {
    return 163 * first + second;
  }

  /** Two pairs are equal when both of their values match. */
  @Override
  public boolean equals(Object o) {
    if (!(o instanceof IntPair)) {
      return false;
    }
    IntPair other = (IntPair) o;
    return other.first == first && other.second == second;
  }

  /** @return the two values separated by a tab */
  @Override
  public String toString() {
    return first + "\t" + second;
  }

  /** Compares by the first value; ties are broken by the second value. */
  @Override
  public int compareTo(IntPair ip) {
    int byFirst = compare(first, ip.first);
    return byFirst != 0 ? byFirst : compare(second, ip.second);
  }

  /**
   * Convenience method for comparing two ints.
   *
   * @return -1 if {@code a < b}, 1 if {@code a > b}, otherwise 0
   */
  public static int compare(int a, int b) {
    if (a < b) {
      return -1;
    }
    if (a > b) {
      return 1;
    }
    return 0;
  }
}

0 0