Hive UDF for writing to a Redis cluster
I previously wrote a single-instance Redis hset UDF that used a pipeline to speed up writes. It used no connection pool, because it came with a usage constraint: rows had to be grouped by mod of a unique key and passed in as a collect_list list, so only a handful of connections were ever created. As long as each group's list stayed reasonably small this worked fine. Strictly, the pipeline should also have counted commands and called sync() every N of them rather than only once at the end, but I was lazy about that. A minimal sketch of the idea follows.
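For reference, here is a minimal sketch of that single-instance pipeline approach, including the periodic sync() the original skipped. This is illustrative, not the original code: the helper name writeGroup and the flushEvery threshold are made up, and it assumes a standalone Redis reachable at host:port.

import java.util.List;
import java.util.Map;

import redis.clients.jedis.Jedis;
import redis.clients.jedis.Pipeline;

public class PipelineHSetSketch {

    // Write one collect_list group of rows through a single pipeline,
    // calling sync() every flushEvery commands instead of only once at the end.
    public static void writeGroup(String host, int port, String keyField,
                                  List<Map<String, String>> rows, int flushEvery) {
        try (Jedis jedis = new Jedis(host, port)) {
            Pipeline pipeline = jedis.pipelined();
            int pending = 0;
            for (Map<String, String> row : rows) {
                String key = row.get(keyField);
                if (key == null) {
                    continue;
                }
                for (Map.Entry<String, String> entry : row.entrySet()) {
                    if (!keyField.equals(entry.getKey()) && entry.getValue() != null
                            && !entry.getValue().isEmpty()) {
                        pipeline.hset(key, entry.getKey(), entry.getValue());
                        if (++pending >= flushEvery) {
                            pipeline.sync(); // flush buffered commands to bound memory use
                            pending = 0;
                        }
                    }
                }
            }
            pipeline.sync(); // flush whatever is left at the tail
        }
    }
}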
When this went into production, the user ids had to be written into a Redis cluster, and that UDF no longer worked; I had to write a new one, because JedisCluster in cluster mode offers no pipeline support. Ugh.
At first I didn't think it through and created a new JedisCluster instance on every evaluate() call, something like this:
@Override
public Object evaluate(DeferredObject[] arg0) throws HiveException {
    // JedisCluster can only be created as a local variable inside the method: it uses a
    // connection pool internally, and the pool is stateful and cannot be serialized.
    try (JedisCluster jedisCluster = new JedisCluster(hostAndPort)) {
        for (int i = 0; i < paramsListInspector.getListLength(arg0[2].get()); i++) {
            Object row = paramsListInspector.getListElement(arg0[2].get(), i);
            Map<?, ?> map = paramsElementInspector.getMap(row);
            if (map.containsKey(writableKeyField)) {
                String did = map.get(writableKeyField).toString();
                for (Map.Entry<?, ?> entry : map.entrySet()) {
                    if (!writableKeyField.equals(entry.getKey()) && entry.getValue() != null
                            && !"".equals(entry.getValue().toString())) {
                        jedisCluster.hset(did, entry.getKey().toString(), entry.getValue().toString());
                    }
                }
            }
        }
        return new IntWritable(1);
    } catch (Exception e) {
        e.printStackTrace();
        throw new HiveException(e);
    }
}

The performance was abysmal. There were over 3 million rows, and every single one created its own JedisCluster; worse, JedisCluster is not like Jedis, since it internally maintains a connection pool of its own. Scary to think about. If it had run fast I probably wouldn't have bothered, but it was painfully slow, so it had to be optimized.
The first optimization was to have all rows in the same map container share a single JedisCluster instance. That meant extracting it into a separate class with lazy initialization; it cannot be a field of the UDF class set up in initialize(), because the stateful connection pool inside cannot be serialized:
import org.apache.commons.pool2.impl.GenericObjectPoolConfig;
import org.joda.time.DateTime;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.JedisCluster;

public class JedisClusterUtil {

    private static GenericObjectPoolConfig initPoolConfiguration() {
        System.out.println(Thread.currentThread().getName() + "====== initializing Redis connection pool ============"
                + DateTime.now().toString("yyyy-MM-dd HH:mm:ss"));
        GenericObjectPoolConfig config = new GenericObjectPoolConfig();
        config.setLifo(true);
        config.setTestOnBorrow(true);
        config.setTestOnReturn(false);
        config.setBlockWhenExhausted(true);
        config.setMinIdle(5);
        config.setMaxTotal(500); // the number of Jedis connections to create
        config.setTestWhileIdle(false);
        config.setSoftMinEvictableIdleTimeMillis(3000L);
        config.setNumTestsPerEvictionRun(5);
        config.setTimeBetweenEvictionRunsMillis(5000L);
        config.setJmxEnabled(false);
        config.setMaxWaitMillis(5000);
        return config;
    }

    // Singleton: guarantees a single JedisCluster per map-task JVM, so all rows routed to
    // this map container share one instance -- much better than creating a JedisCluster
    // on every evaluate() call.
    private static volatile JedisCluster jedisCluster = null;

    public static synchronized JedisCluster getJedisCluster(HostAndPort hostAndPort) {
        if (jedisCluster == null) {
            jedisCluster = new JedisCluster(hostAndPort, 10000, 10000, 3, initPoolConfiguration());
            Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
                // Clean up at exit
                @Override
                public void run() {
                    System.out.println(JedisClusterUtil.class.getSimpleName() + " shutdown");
                    try {
                        if (jedisCluster != null) {
                            jedisCluster.close();
                        }
                    } catch (Exception e) {
                        e.printStackTrace();
                    } finally {
                        jedisCluster = null;
                    }
                }
            }));
        }
        return jedisCluster;
    }
}
Finally, the UDF class:
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.IntWritable;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.exceptions.JedisException;

import java.util.HashMap;
import java.util.Map;

/*
-- Increase the number of maps to raise write concurrency:
set mapred.max.split.size=4194304;
set mapred.min.split.size.per.node=4194304;
set mapred.min.split.size.per.rack=4194304;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
set mapreduce.job.queuename=root.etl;
select xydb.redis_cluster_thread_hset('192.168.110.87:6379', concat('xydbuid.', uid), dim_map) as result
FROM xydb.yydb_user_act_xydb WHERE ds='2017-08-10' AND type='buy_lottery'
*/
@Description(name = "redis_cluster_thread_hset", value = "_FUNC_(host_and_port, redis_key, map<string,string>) - Return ret")
public class RedisClusterThreadHSetUDF extends GenericUDF {

    private HostAndPort hostAndPort;
    private MapObjectInspector paramsElementInspector;
    private StringObjectInspector keyFieldOI;

    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {
        // JedisCluster wraps a stateful, non-serializable connection pool, so it cannot be
        // created in initialize(); fetch the lazily-created per-JVM singleton instead.
        try {
            long start = System.currentTimeMillis();
            Map<?, ?> map = paramsElementInspector.getMap(arg0[2].get());
            String did = keyFieldOI.getPrimitiveJavaObject(arg0[1].get());
            Map<String, String> data = new HashMap<>();
            for (Map.Entry<?, ?> entry : map.entrySet()) {
                if (entry.getValue() != null && !"".equals(entry.getValue().toString())) {
                    data.put(entry.getKey().toString(), entry.getValue().toString());
                }
            }
            try {
                System.out.println("submit:" + did + ",data:" + data);
                // switched to hmset so all fields of one key go over in a single command
                JedisClusterUtil.getJedisCluster(hostAndPort).hmset(did, data);
            } catch (JedisException e) {
                e.printStackTrace();
                System.out.println("retry submit:" + did + ",data:" + data);
                try {
                    JedisClusterUtil.getJedisCluster(hostAndPort).hmset(did, data);
                } catch (JedisException ex) {
                    ex.printStackTrace();
                    throw new HiveException(ex);
                }
            }
            System.out.println("cost :" + (System.currentTimeMillis() - start));
            return new IntWritable(1);
        } catch (Exception e) {
            e.printStackTrace();
            throw new HiveException(e);
        }
    }

    @Override
    public String getDisplayString(String[] arg0) {
        return "redis_cluster_thread_hset(host_and_port, redis_key, map<string,string>)";
    }

    @Override
    public ObjectInspector initialize(ObjectInspector[] arg0) throws UDFArgumentException {
        if (arg0.length != 3) {
            throw new UDFArgumentException("Expecting three arguments: <redishost:port> <redis_key> map<string,string>");
        }
        // validate the first argument
        if (arg0[0].getCategory() == Category.PRIMITIVE
                && ((PrimitiveObjectInspector) arg0[0]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            if (!(arg0[0] instanceof ConstantObjectInspector)) {
                throw new UDFArgumentException("redis host:port must be constant");
            }
            ConstantObjectInspector redishost_and_port = (ConstantObjectInspector) arg0[0];
            String[] host_and_port = redishost_and_port.getWritableConstantValue().toString().split(":");
            hostAndPort = new HostAndPort(host_and_port[0], Integer.parseInt(host_and_port[1]));
        }
        // validate the second argument
        if (arg0[1].getCategory() == Category.PRIMITIVE
                && ((PrimitiveObjectInspector) arg0[1]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            if (!(arg0[1] instanceof StringObjectInspector)) {
                throw new UDFArgumentException("redis hset key must be string");
            }
            keyFieldOI = (StringObjectInspector) arg0[1];
        }
        // validate the third argument
        if (arg0[2].getCategory() != Category.MAP) {
            throw new UDFArgumentException("Expecting a map<string,string> field as the third argument");
        }
        paramsElementInspector = (MapObjectInspector) arg0[2];
        return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
    }
}
Then we make the job spawn as many map tasks as possible by setting these properties:
set mapred.max.split.size=4194304;
set mapred.min.split.size.per.node=4194304;
set mapred.min.split.size.per.rack=4194304;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
Shrinking each input split makes Hive generate more map tasks and raises write concurrency: with the 4194304-byte (4 MB) cap above, you get roughly one map task per 4 MB of input, so about 1 GB of input yields on the order of 250 mappers.
-- Increase the number of maps to raise write concurrency:
set mapred.max.split.size=4194304;
set mapred.min.split.size.per.node=4194304;
set mapred.min.split.size.per.rack=4194304;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
set mapreduce.job.queuename=root.etl;
select xydb.redis_cluster_thread_hset('192.168.110.87:6379', concat('xydbuid.', uid), dim_map) as result
FROM xydb.yydb_user_act_xydb
WHERE ds='2017-08-10' AND type='buy_lottery'
redis_cluster_thread_hset takes three arguments. The first is a string with the host:port of any one node in the cluster; a single node is enough, since JedisCluster auto-discovers the rest. The second is a string used as the hset key. The third is a map whose keys become the hset fields and whose values become the hset values.
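If the function still needs to be registered before running the query above, something like the following standard Hive registration works; the jar path and the class name binding here are assumptions (the post does not show them), so substitute your own build artifacts:

-- assumed jar location and fully-qualified class name; adjust to your build
ADD JAR hdfs:///user/hive/udf/redis-cluster-udf.jar;
CREATE TEMPORARY FUNCTION redis_cluster_thread_hset AS 'RedisClusterThreadHSetUDF';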