Hive UDF: batch writing to Redis
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.IntWritable;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.Pipeline;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

@Description(name = "redis_batch_hset", value = "_FUNC_(host_and_port, keyField, array<map>) - Return ret")
public class RedisBatchHSetUDF extends GenericUDF {

    private HostAndPort hostAndPort;
    private String keyField;
    private Object writableKeyField; // actually an org.apache.hadoop.io.Text instance
    private StandardListObjectInspector paramsListInspector;
    private StandardMapObjectInspector paramsElementInspector;

    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {
        try (Jedis jedis = new Jedis(hostAndPort.getHost(), hostAndPort.getPort(), 10000, 60000);
             Pipeline pipeline = jedis.pipelined()) {
            for (int i = 0; i < paramsListInspector.getListLength(arg0[2].get()); i++) {
                Object row = paramsListInspector.getListElement(arg0[2].get(), i);
                Map<?, ?> map = paramsElementInspector.getMap(row);
                // Object obj = ObjectInspectorUtils.copyToStandardJavaObject(row, paramsElementInspector);
                // would convert to a plain java map; without it the keys and values are Hadoop Writable objects
                if (map.containsKey(writableKeyField)) {
                    String did = map.get(writableKeyField).toString();
                    Map<String, String> data = new HashMap<>();
                    // copy every non-key, non-empty field into the hash payload
                    for (Map.Entry<?, ?> entry : map.entrySet()) {
                        if (!writableKeyField.equals(entry.getKey())
                                && entry.getValue() != null
                                && !"".equals(entry.getValue().toString())) {
                            data.put(entry.getKey().toString(), entry.getValue().toString());
                        }
                    }
                    pipeline.hmset(did, data);
                }
            }
            pipeline.sync(); // flush the whole batch in one round trip
            return new IntWritable(1);
        } catch (IOException e) {
            e.printStackTrace();
            throw new HiveException(e);
        }
    }

    @Override
    public String getDisplayString(String[] arg0) {
        return "redis_batch_hset(redishost_and_port, keyField, array<map<string,string>>)";
    }

    @Override
    public ObjectInspector initialize(ObjectInspector[] arg0) throws UDFArgumentException {
        if (arg0.length != 3) {
            throw new UDFArgumentException("Expecting three arguments: <redishost:port> <keyField> array<map<string,string>>");
        }
        // validate the first argument (fail fast instead of silently leaving hostAndPort null)
        if (arg0[0].getCategory() != Category.PRIMITIVE
                || ((PrimitiveObjectInspector) arg0[0]).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            throw new UDFArgumentException("Expecting a string constant <redishost:port> as first argument");
        }
        if (!(arg0[0] instanceof ConstantObjectInspector)) {
            throw new UDFArgumentException("redis host:port must be constant");
        }
        ConstantObjectInspector redishost_and_port = (ConstantObjectInspector) arg0[0];
        String[] host_and_port = redishost_and_port.getWritableConstantValue().toString().split(":");
        hostAndPort = new HostAndPort(host_and_port[0], Integer.parseInt(host_and_port[1]));

        // validate the second argument: a constant string naming the key field
        if (arg0[1].getCategory() != Category.PRIMITIVE
                || ((PrimitiveObjectInspector) arg0[1]).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            throw new UDFArgumentException("Expecting a string constant <keyField> as second argument");
        }
        if (!(arg0[1] instanceof ConstantObjectInspector)) {
            throw new UDFArgumentException("redis hset key must be constant");
        }
        ConstantObjectInspector keyFieldOI = (ConstantObjectInspector) arg0[1];
        keyField = keyFieldOI.getWritableConstantValue().toString();
        writableKeyField = keyFieldOI.getWritableConstantValue();

        // validate the third argument: array<map<string,string>>
        if (arg0[2].getCategory() != Category.LIST) {
            throw new UDFArgumentException("Expecting an array<map<string,string>> field as third argument");
        }
        ListObjectInspector third = (ListObjectInspector) arg0[2];
        if (third.getListElementObjectInspector().getCategory() != Category.MAP) {
            throw new UDFArgumentException("Expecting an array<map<string,string>> field as third argument");
        }
        paramsListInspector = ObjectInspectorFactory.getStandardListObjectInspector(third.getListElementObjectInspector());
        paramsElementInspector = (StandardMapObjectInspector) third.getListElementObjectInspector();
        // debug output of the map key/value categories
        System.out.println(paramsElementInspector.getMapKeyObjectInspector().getCategory());
        System.out.println(paramsElementInspector.getMapValueObjectInspector().getCategory());
        return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
    }
}
I won't go over registration again; see the earlier MySQL UDF article.
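For reference, a minimal registration sketch. The jar path below is a placeholder, not from the original article; the class in the listing above has no package, and the function name matches the usage query further down:

    -- hypothetical jar location; adjust to wherever you deploy the jar
    CREATE FUNCTION xydb.redis_batch_hset AS 'RedisBatchHSetUDF'
    USING JAR 'hdfs:///path/to/redis-batch-hset-udf.jar';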
The calling convention is fairly strict:
SELECT g,
       xydb.redis_batch_hset('192.168.78.87:6379', 'did',
           collect_list(map('did', concat('xyzs.profile.', did), 'sex', sex, 'age', age))) AS result
FROM (
    SELECT did, pmod(abs(hash(did)), 1000) AS g, sex, age
    FROM data_mining.adl_xyzs_user_profile_day_new
    WHERE ds = '2017-08-02' AND isnotnull(sex) AND isnotnull(age)
) a
GROUP BY g;
The first argument is the host:port of the Redis instance to connect to. Our Redis has no password and uses the default db0, so a single host:port is enough; otherwise adjust the code yourself.
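In case your Redis does require a password or a non-default db, here is a minimal standalone sketch of what the connection setup could look like; the host, password, db index, and sample key are all placeholders, not part of the original setup:

    import redis.clients.jedis.Jedis;
    import redis.clients.jedis.Pipeline;

    import java.util.HashMap;
    import java.util.Map;

    public class RedisAuthSketch {
        public static void main(String[] args) {
            // hypothetical host; timeouts mirror the UDF above
            try (Jedis jedis = new Jedis("192.168.78.87", 6379, 10000, 60000)) {
                jedis.auth("secret"); // only needed when requirepass is set; placeholder password
                jedis.select(1);      // pick a db other than the default db0; placeholder index
                Pipeline pipeline = jedis.pipelined();
                Map<String, String> data = new HashMap<>();
                data.put("sex", "1");
                data.put("age", "85");
                pipeline.hmset("xyzs.profile.sometestdid", data); // placeholder key
                pipeline.sync();
            }
        }
    }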
The third argument is the data to write: an array whose elements are map<string,string>.
The second argument names the field inside each map of that array whose value serves as the Redis hash key; in the example it is did. That field must exist in the map, otherwise the row is skipped.
did is the unique user identifier. Since the daily volume is fairly large (300,000+ dids to load per day), the rows can be grouped by did mod N; here I split them into 1,000 groups and write each group to Redis through a pipeline. With 300,000+ rows spread over 1,000 groups, each array holds only a few hundred entries, and the pipelined batch submit improves Redis write throughput.
Here is a simple test case as well:
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.junit.Before;
import org.junit.Test;

public class RedisBatchHsetUDFTest {

    private RedisBatchHSetUDF udf;

    @Before
    public void before() {
        udf = new RedisBatchHSetUDF();
    }

    @Test
    public void test() throws HiveException {
        Text host = new Text("name87:6379");
        Text keyField = new Text("did");
        ObjectInspector hostAndPort = PrimitiveObjectInspectorFactory
                .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, host);
        ObjectInspector key = PrimitiveObjectInspectorFactory
                .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, keyField);
        TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString("map<string,string>");
        StandardMapObjectInspector elementOI =
                (StandardMapObjectInspector) TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
        StandardListObjectInspector listIO = ObjectInspectorFactory.getStandardListObjectInspector(elementOI);
        ObjectInspector resultoi = udf.initialize(new ObjectInspector[]{hostAndPort, key, listIO});

        // build a one-element array<map<string,string>> by hand
        Object list = listIO.create(1);
        Object row = elementOI.create();
        elementOI.put(row, new Text("did"), new Text("xiaojuntest"));
        elementOI.put(row, new Text("sex"), new IntWritable(1));
        elementOI.put(row, new Text("age"), new Text("85"));
        listIO.set(list, 0, row);

        Object result = udf.evaluate(new GenericUDF.DeferredObject[]{
                new GenericUDF.DeferredJavaObject(host),
                new GenericUDF.DeferredJavaObject(keyField),
                new GenericUDF.DeferredJavaObject(list)});
        System.out.println(result);
    }

//    @Test
//    public void test2() {
//        TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString("array<struct<a:int,b:string>>");
//        StandardListObjectInspector objInsp = (StandardListObjectInspector) TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
//        System.out.println(objInsp);
//    }
}
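Note that this is really an integration test rather than a unit test: evaluate() opens a live connection, so it only passes when a Redis instance is reachable at name87:6379. Also, sex is inserted as an IntWritable even though the declared value type is string; this works because evaluate() only ever calls toString() on the map values.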