Hadoop读取sequencefile和textfile文件内容

来源：互联网　发布：杨德昌一一知乎　编辑：程序博客网　时间：2024/05/16 13:47

读取 SequenceFile。其中 MockReporter 是实现了 Reporter 接口的一个空的（mock）reporter：它只提供接口要求的那些方法，不做任何实际工作：

    /**
     * Reads every record of a SequenceFile and returns the values as strings.
     *
     * <p>Keys are read and discarded. Each value is read into a {@code Text} and decoded as UTF-8.
     * The record reader is always closed before this method returns, including when reading fails.
     *
     * @param path path of the SequenceFile, looked up on the FileSystem obtained from a default
     *             {@code Configuration}
     * @return the decoded values, in the order the reader produced them
     * @throws IOException if the file status cannot be fetched or the file cannot be read
     */
    public static List<String> parseSequenceFile(String path) throws IOException {
        List<String> result = new ArrayList<String>();
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FileStatus status = fs.getFileStatus(new Path(path));
        // NOTE(review): getFullSplit is defined elsewhere; presumably it returns a split
        // covering the whole file -- confirm against its definition.
        InputSplit split = getFullSplit(status);
        SequenceFileInputFormat<Writable, Text> inputFormat =
                new SequenceFileInputFormat<Writable, Text>();
        // Build the JobConf from the same Configuration that was used to locate the file.
        RecordReader<Writable, Text> reader =
                inputFormat.getRecordReader(split, new JobConf(conf), new MockReporter());
        try {
            // The key is not used, but let the reader allocate a correctly typed instance
            // rather than handing it null.
            Writable key = reader.createKey();
            Text textValue = new Text();
            while (reader.next(key, textValue)) {
                // Only the first getLength() bytes of the Text buffer belong to the current value.
                String line = new String(textValue.getBytes(), 0, textValue.getLength(), "UTF-8");
                result.add(line);
            }
        } finally {
            // Release the underlying file stream; the original never closed the reader.
            reader.close();
        }
        return result;
    }

在不知道 key 和 value 类型的情况下读取 SequenceFile（这段代码摘自《Hadoop 权威指南》）：

public class SequenceFileReadDemo {    public static void main(String[] args) throws IOException {        String uri = args[0];        Configuration conf = new Configuration();        FileSystem fs = FileSystem.get(URI.create(uri), conf);        Path path = new Path(uri);        SequenceFile.Reader reader = null;        try {            reader = new SequenceFile.Reader(fs, path, conf);            Writable key = (Writable)                ReflectionUtils.newInstance(reader.getKeyClass(), conf);            Writable value = (Writable)                ReflectionUtils.newInstance(reader.getValueClass(), conf);            long position = reader.getPosition();            while (reader.next(key, value)) {                String syncSeen = reader.syncSeen() ? "*" : "";                System.out.printf("[%s%s]\t%s\t%s\n", position, syncSeen, key, value);                position = reader.getPosition(); // beginning of next record            }        } finally {            IOUtils.closeStream(reader);        }    }}

写 SequenceFile（这段代码摘自《Hadoop 权威指南》）：

public class SequenceFileWriteDemo {    private static final String[] DATA = {        "One, two, buckle my shoe",        "Three, four, shut the door",        "Five, six, pick up sticks",        "Seven, eight, lay them straight",        "Nine, ten, a big fat hen"    };    public static void main(String[] args) throws IOException {        String uri = args[0];        Configuration conf = new Configuration();        FileSystem fs = FileSystem.get(URI.create(uri), conf);        Path path = new Path(uri);        IntWritable key = new IntWritable();        Text value = new Text();        SequenceFile.Writer writer = null;        try {            writer = SequenceFile.createWriter(fs, conf, path,                    key.getClass(), value.getClass());            for (int i = 0; i < 100; i++) {                key.set(100 - i);                value.set(DATA[i % DATA.length]);                System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value);                writer.append(key, value);            }        } finally {            IOUtils.closeStream(writer);        }    }}

读取textfile：

    /**
     * Reads all lines from every entry that {@code listStatus} returns for the given path.
     *
     * <p>Each entry is read with its own {@code LineRecordReader}; every line is decoded as UTF-8
     * and appended to the result. Each reader is closed before the next entry is opened,
     * including when reading fails.
     *
     * @param path path whose entries are listed on the FileSystem obtained from a default
     *             {@code Configuration}
     * @return all lines, in listing order and then in the order read within each entry
     * @throws IOException if the listing fails or an entry cannot be read
     */
    public static List<String> parseTextFiles(String path) throws IOException {
        List<String> result = new ArrayList<String>();
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] statusList = fs.listStatus(new Path(path));
        for (FileStatus status : statusList) {
            // NOTE(review): getFullSplit is defined elsewhere; presumably it returns a FileSplit
            // covering the whole file -- confirm against its definition.
            FileSplit split = (FileSplit) getFullSplit(status);
            org.apache.hadoop.mapred.LineRecordReader reader =
                    new org.apache.hadoop.mapred.LineRecordReader(conf, split);
            try {
                Text textValue = new Text();
                LongWritable pos = new LongWritable();
                while (reader.next(pos, textValue)) {
                    // Only the first getLength() bytes of the Text buffer belong to the current line.
                    String line = new String(textValue.getBytes(), 0, textValue.getLength(), "UTF-8");
                    result.add(line);
                }
            } finally {
                // Release this file's stream; the original left one open reader per file.
                reader.close();
            }
        }
        return result;
    }