Hive UDF for batch writes to Redis

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.IntWritable;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.Pipeline;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

@Description(name = "redis_batch_hset",
        value = "_FUNC_(host_and_port, keyField, array<map>) - Return ret")
public class RedisBatchHSetUDF extends GenericUDF {

    private HostAndPort hostAndPort;
    private String keyField;
    private Object writableKeyField; // actually an org.apache.hadoop.io.Text instance
    private StandardListObjectInspector paramsListInspector;
    private StandardMapObjectInspector paramsElementInspector;

    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {
        try (Jedis jedis = new Jedis(hostAndPort.getHost(), hostAndPort.getPort(), 10000, 60000);
             Pipeline pipeline = jedis.pipelined()) {
            for (int i = 0; i < paramsListInspector.getListLength(arg0[2].get()); i++) {
                Object row = paramsListInspector.getListElement(arg0[2].get(), i);
                Map<?, ?> map = paramsElementInspector.getMap(row);
                // Object obj = ObjectInspectorUtils.copyToStandardJavaObject(row, paramsElementInspector);
                // would convert to a plain java Map; otherwise the key/value fields are Hadoop Writable objects
                if (map.containsKey(writableKeyField)) {
                    String did = map.get(writableKeyField).toString();
                    Map<String, String> data = new HashMap<>();
                    for (Map.Entry<?, ?> entry : map.entrySet()) {
                        if (!writableKeyField.equals(entry.getKey()) && entry.getValue() != null
                                && !"".equals(entry.getValue().toString())) {
                            data.put(entry.getKey().toString(), entry.getValue().toString());
                        }
                    }
                    pipeline.hmset(did, data);
                }
            }
            pipeline.sync();
            return new IntWritable(1);
        } catch (IOException e) {
            e.printStackTrace();
            throw new HiveException(e);
        }
    }

    @Override
    public String getDisplayString(String[] arg0) {
        return "redis_batch_hset(redishost_and_port,keyField, array<map<string,string>>)";
    }

    @Override
    public ObjectInspector initialize(ObjectInspector[] arg0) throws UDFArgumentException {
        if (arg0.length != 3) {
            throw new UDFArgumentException("Expecting three arguments: <redishost:port> <keyField> array<map<string,string>>");
        }
        // validate the first argument
        if (arg0[0].getCategory() == Category.PRIMITIVE
                && ((PrimitiveObjectInspector) arg0[0]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            if (!(arg0[0] instanceof ConstantObjectInspector)) {
                throw new UDFArgumentException("redis host:port must be constant");
            }
            ConstantObjectInspector redishost_and_port = (ConstantObjectInspector) arg0[0];
            String[] host_and_port = redishost_and_port.getWritableConstantValue().toString().split(":");
            hostAndPort = new HostAndPort(host_and_port[0], Integer.parseInt(host_and_port[1]));
        }
        // validate the second argument
        if (arg0[1].getCategory() == Category.PRIMITIVE
                && ((PrimitiveObjectInspector) arg0[1]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            if (!(arg0[1] instanceof ConstantObjectInspector)) {
                throw new UDFArgumentException("redis hset key must be constant");
            }
            ConstantObjectInspector keyFieldOI = (ConstantObjectInspector) arg0[1];
            keyField = keyFieldOI.getWritableConstantValue().toString();
            writableKeyField = keyFieldOI.getWritableConstantValue();
        }
        // validate the third argument
        if (arg0[2].getCategory() != Category.LIST) {
            throw new UDFArgumentException("Expecting an array<map<string,string>> field as third argument");
        }
        ListObjectInspector third = (ListObjectInspector) arg0[2];
        if (third.getListElementObjectInspector().getCategory() != Category.MAP) {
            throw new UDFArgumentException("Expecting an array<map<string,string>> field as third argument");
        }
        paramsListInspector = ObjectInspectorFactory.getStandardListObjectInspector(third.getListElementObjectInspector());
        paramsElementInspector = (StandardMapObjectInspector) third.getListElementObjectInspector();
        System.out.println(paramsElementInspector.getMapKeyObjectInspector().getCategory());
        System.out.println(paramsElementInspector.getMapValueObjectInspector().getCategory());
        return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
    }
}


I won't repeat how to register the UDF here; see my earlier article on the MySQL UDF.
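For completeness, registration typically looks like the following. This is a minimal sketch: the jar path is hypothetical, and the function name xydb.redis_batch_hset is assumed from the query further below (the class is referenced without a package, matching the source above).

ADD JAR /path/to/redis-batch-hset-udf.jar;   -- hypothetical local jar path
CREATE TEMPORARY FUNCTION redis_batch_hset AS 'RedisBatchHSetUDF';

-- or, as a permanent function in the xydb database:
CREATE FUNCTION xydb.redis_batch_hset AS 'RedisBatchHSetUDF'
USING JAR 'hdfs:///path/to/redis-batch-hset-udf.jar';   -- hypothetical HDFS jar path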

The usage is fairly strict:

SELECT g,
       xydb.redis_batch_hset('192.168.78.87:6379', 'did',
           collect_list(map('did', concat('xyzs.profile.', did), 'sex', sex, 'age', age))) AS result
FROM (
    SELECT did,
           pmod(abs(hash(did)), 1000) AS g,
           sex,
           age
    FROM data_mining.adl_xyzs_user_profile_day_new
    WHERE ds = '2017-08-02' AND isnotnull(sex) AND isnotnull(age)
) a
GROUP BY g;


redis_batch_hset(redishost_and_port,keyField, array<map<string,string>>)

The first argument is the host:port of the Redis instance to connect to. Our Redis has no password and always uses the default db0, so a single host:port is enough; if your setup differs, adjust the code yourself (see the sketch below).
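If your Redis does require a password or a non-default db, a minimal sketch of that change inside evaluate() might look like this. AUTH_PASSWORD and DB_INDEX are hypothetical values you would have to supply (e.g. as extra constant arguments); Jedis provides auth() and select() for exactly this.

try (Jedis jedis = new Jedis(hostAndPort.getHost(), hostAndPort.getPort(), 10000, 60000)) {
    jedis.auth(AUTH_PASSWORD); // hypothetical: the Redis password
    jedis.select(DB_INDEX);    // hypothetical: the target db index
    try (Pipeline pipeline = jedis.pipelined()) {
        // ... same pipelined hmset loop as in the UDF above ...
        pipeline.sync();
    }
}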

The third argument is the data to be written: an array whose elements are map<string,string>.

The second argument names the field inside each map of the third argument's array that is used as the Redis hset key; in the example it is did. That field must exist in the map, otherwise the row is skipped.


did is the user's unique identifier. Because the write volume is fairly large (over 300,000 dids per day), you can bucket rows by a modulo of did: here I split them into 1,000 groups and write each group to Redis through a pipeline, so each array only holds a few hundred rows, and the pipelined batch submission improves Redis write throughput.
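If you want to sanity-check the bucket sizes before running the actual write, something like this should work (the same table and ds partition as the query above are assumed):

SELECT pmod(abs(hash(did)), 1000) AS g, count(*) AS cnt
FROM data_mining.adl_xyzs_user_profile_day_new
WHERE ds = '2017-08-02' AND isnotnull(sex) AND isnotnull(age)
GROUP BY pmod(abs(hash(did)), 1000)
ORDER BY cnt DESC
LIMIT 10;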


Finally, here is a simple test case:


import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.junit.Before;
import org.junit.Test;

public class RedisBatchHsetUDFTest {

    private RedisBatchHSetUDF udf;

    @Before
    public void before() {
        udf = new RedisBatchHSetUDF();
    }

    @Test
    public void test() throws HiveException {
        Text host = new Text("name87:6379");
        Text keyField = new Text("did");
        // constant object inspectors for the host:port and key-field arguments
        ObjectInspector hostAndPort = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, host);
        ObjectInspector key = PrimitiveObjectInspectorFactory.getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, keyField);
        // object inspectors for the array<map<string,string>> argument
        TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString("map<string,string>");
        StandardMapObjectInspector elementOI = (StandardMapObjectInspector) TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
        StandardListObjectInspector listIO = ObjectInspectorFactory.getStandardListObjectInspector(elementOI);
        ObjectInspector resultoi = udf.initialize(new ObjectInspector[]{hostAndPort, key, listIO});

        // build one row: a map with did/sex/age fields
        Object list = listIO.create(1);
        Object row = elementOI.create();
        elementOI.put(row, new Text("did"), new Text("xiaojuntest"));
        elementOI.put(row, new Text("sex"), new IntWritable(1));
        elementOI.put(row, new Text("age"), new Text("85"));
        listIO.set(list, 0, row);

        Object result = udf.evaluate(new GenericUDF.DeferredObject[]{
                new GenericUDF.DeferredJavaObject(host),
                new GenericUDF.DeferredJavaObject(keyField),
                new GenericUDF.DeferredJavaObject(list)});
        System.out.println(result);
    }

//    @Test
//    public void test2() {
//        TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString("array<struct<a:int,b:string>>");
//        StandardListObjectInspector objInsp = (StandardListObjectInspector) TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
//        System.out.println(objInsp);
//    }
}


