hadoop深入研究:(十六)——Avro序列化与反序列化

来源:互联网 发布:foobar2000 linux 编辑:程序博客网 时间:2024/03/29 14:36

转载请写明来源地址:http://blog.csdn.net/lastsweetop/article/details/9773233

所有源码在github上,https://github.com/lastsweetop/styhadoop


使用avro在很多情况下是对原有系统的改造,框架格式都已经定义好了,我们只能直接用avro对原有数据进行整合。(如果是新建系统,最好还是用avro的datafile,下一章讲datafile)

准备工作

将以下schema保存成文件StringPair.avsc,放在src/test/resources目录下
{
    "type": "record",
    "name": "StringPair",
    "doc": "A pair of strings",
    "fields": [
        {"name": "left", "type": "string"},
        {"name": "right", "type": "string"}
    ]
}
引入最新版本的avro时要注意,最新的avro包为1.7.4,依赖org.codehaus.jackson:jackson-core-asl:1.8.8包,但是maven库中已经没有该版本
所以要换成其他版本
<dependency>
    <groupId>org.codehaus.jackson</groupId>
    <artifactId>jackson-core-asl</artifactId>
    <version>1.9.9</version>
</dependency>
如果你用的是1.0.4版本的hadoop(或者其他版本),它依赖于jackson-mapper-asl,如果与jackson-core-asl版本不一致就会产生找不到方法等异常
你需要引入相同版本
<dependency>
    <groupId>org.codehaus.jackson</groupId>
    <artifactId>jackson-mapper-asl</artifactId>
    <version>1.9.9</version>
</dependency>

generic方式

这一节我们用代码讲解
package com.sweetop.styhadoop;import junit.framework.Assert;import org.apache.avro.Schema;import org.apache.avro.generic.GenericData;import org.apache.avro.generic.GenericDatumReader;import org.apache.avro.generic.GenericDatumWriter;import org.apache.avro.generic.GenericRecord;import org.apache.avro.io.*;import org.junit.Test;import java.io.ByteArrayOutputStream;import java.io.File;import java.io.IOException;/** * Created with IntelliJ IDEA. * User: lastsweetop * Date: 13-8-5 * Time: 下午7:59 * To change this template use File | Settings | File Templates. */public class TestGenericMapping {    @Test    public void test() throws IOException {        //将schema从StringPair.avsc文件中加载        Schema.Parser parser = new Schema.Parser();        Schema schema = parser.parse(getClass().getResourceAsStream("/StringPair.avsc"));        //根据schema创建一个record示例        GenericRecord datum = new GenericData.Record(schema);        datum.put("left", "L");        datum.put("right", "R");        ByteArrayOutputStream out = new ByteArrayOutputStream();        //DatumWriter可以将GenericRecord变成edncoder可以理解的类型        DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);        //encoder可以将数据写入流中,binaryEncoder第二个参数是重用的encoder,这里不重用,所用传空        Encoder encoder = EncoderFactory.get().binaryEncoder(out, null);        writer.write(datum,encoder);        encoder.flush();        out.close();        DatumReader<GenericRecord> reader=new GenericDatumReader<GenericRecord>(schema);        Decoder decoder=DecoderFactory.get().binaryDecoder(out.toByteArray(),null);        GenericRecord result=reader.read(null,decoder);        Assert.assertEquals("L",result.get("left").toString());        Assert.assertEquals("R",result.get("right").toString());    }}

result.get返回的是Avro内部的Utf8类型而不是String,需要调用toString方法,才能和字符串一致。

specific方式

首先使用avro-maven-plugin生成代码,pom的配置
<plugin>
    <groupId>org.apache.avro</groupId>
    <artifactId>avro-maven-plugin</artifactId>
    <version>1.7.0</version>
    <executions>
        <execution>
            <id>schemas</id>
            <phase>generate-sources</phase>
            <goals>
                <goal>schema</goal>
            </goals>
            <configuration>
                <includes>
                    <include>StringPair.avsc</include>
                </includes>
                <sourceDirectory>src/test/resources</sourceDirectory>
                <outputDirectory>${project.build.directory}/generated-sources/java</outputDirectory>
            </configuration>
        </execution>
    </executions>
</plugin>

avro-maven-plugin插件绑定在generate-sources阶段,调用mvn generate-sources即可生成源代码,我们来看下生成的源代码
package com.sweetop.styhadoop;/** * Autogenerated by Avro * <p/> * DO NOT EDIT DIRECTLY */@SuppressWarnings("all")/** A pair of strings */public class StringPair extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord {    public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"StringPair\",\"doc\":\"A pair of strings\",\"fields\":[{\"name\":\"left\",\"type\":\"string\",\"avro.java.string\":\"String\"},{\"name\":\"right\",\"type\":\"string\"}]}");    @Deprecated    public java.lang.CharSequence left;    @Deprecated    public java.lang.CharSequence right;    public org.apache.avro.Schema getSchema() {        return SCHEMA$;    }    // Used by DatumWriter.  Applications should not call.    public java.lang.Object get(int field$) {        switch (field$) {            case 0:                return left;            case 1:                return right;            default:                throw new org.apache.avro.AvroRuntimeException("Bad index");        }    }    // Used by DatumReader.  Applications should not call.    @SuppressWarnings(value = "unchecked")    public void put(int field$, java.lang.Object value$) {        switch (field$) {            case 0:                left = (java.lang.CharSequence) value$;                break;            case 1:                right = (java.lang.CharSequence) value$;                break;            default:                throw new org.apache.avro.AvroRuntimeException("Bad index");        }    }    /**     * Gets the value of the 'left' field.     */    public java.lang.CharSequence getLeft() {        return left;    }    /**     * Sets the value of the 'left' field.     *     * @param value the value to set.     */    public void setLeft(java.lang.CharSequence value) {        this.left = value;    }    /**     * Gets the value of the 'right' field.     
*/    public java.lang.CharSequence getRight() {        return right;    }    /**     * Sets the value of the 'right' field.     *     * @param value the value to set.     */    public void setRight(java.lang.CharSequence value) {        this.right = value;    }}

为了兼容之前的版本生成了一组get,put方法,1.6.0后生成的代码还添加了getter/setter方法,另有一个Builder类,没什么用已经被我删掉

另外上一篇文章有一点没讲到,就是schema里的name可以使用命名空间,如com.sweetop.styhadoop.StringPair,这样生成的源代码才会是带package的

那我们来看如果使用这个生成的类,和generic方式有什么不同:

package com.sweetop.styhadoop;import junit.framework.Assert;import org.apache.avro.Schema;import org.apache.avro.io.*;import org.apache.avro.specific.SpecificDatumReader;import org.apache.avro.specific.SpecificDatumWriter;import org.junit.Test;import java.io.ByteArrayOutputStream;import java.io.IOException;/** * Created with IntelliJ IDEA. * User: lastsweetop * Date: 13-8-6 * Time: 下午2:19 * To change this template use File | Settings | File Templates. */public class TestSprecificMapping {    @Test    public void test() throws IOException {        //因为已经生成StringPair的源代码,所以不再使用schema了,直接调用setter和getter即可        StringPair datum=new StringPair();        datum.setLeft("L");        datum.setRight("R");        ByteArrayOutputStream out=new ByteArrayOutputStream();        //不再需要传schema了,直接用StringPair作为范型和参数,        DatumWriter<StringPair> writer=new SpecificDatumWriter<StringPair>(StringPair.class);        Encoder encoder= EncoderFactory.get().binaryEncoder(out,null);        writer.write(datum, encoder);        encoder.flush();        out.close();        DatumReader<StringPair> reader=new SpecificDatumReader<StringPair>(StringPair.class);        Decoder decoder= DecoderFactory.get().binaryDecoder(out.toByteArray(),null);        StringPair result=reader.read(null,decoder);        Assert.assertEquals("L",result.getLeft().toString());        Assert.assertEquals("R",result.getRight().toString());    }}
不同点总结一下,schema->StringPair.class,      GenericRecord->StringPair


原创粉丝点击