Hive UDF for writing to a Redis cluster


Earlier I wrote a UDF that does Redis hset writes against a single Redis instance, using a pipeline to improve write throughput. It didn't use a connection pool, because the UDF came with a usage restriction: rows had to be split into many groups by taking the unique key mod some number, and each group was passed in as one list via collect_list. That way very few connections were created, and as long as each group's list stayed reasonably small, everything was fine. Strictly speaking, when submitting commands on the pipeline you should also keep a counter and call sync() every N records, but I was lazy and skipped that.
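That earlier single-instance UDF isn't reproduced here, so the following is only a minimal sketch of the counting-and-sync idea it skipped, assuming a plain single-node Jedis connection; the class name, the batch size of 1000, and the row structure are illustrative assumptions rather than the original code.

import redis.clients.jedis.Jedis;
import redis.clients.jedis.Pipeline;

import java.util.List;
import java.util.Map;

public class SingleInstancePipelineSketch {

    // Writes each row as a Redis hash, flushing the pipeline every 1000 commands
    // instead of buffering the whole group's responses on the client side.
    public static void hsetWithPeriodicSync(String host, int port, String keyField,
                                            List<Map<String, String>> rows) {
        try (Jedis jedis = new Jedis(host, port)) {
            Pipeline pipeline = jedis.pipelined();
            int pending = 0;
            for (Map<String, String> row : rows) {
                String key = row.get(keyField);
                if (key == null) {
                    continue;
                }
                for (Map.Entry<String, String> e : row.entrySet()) {
                    if (!keyField.equals(e.getKey()) && e.getValue() != null && !e.getValue().isEmpty()) {
                        pipeline.hset(key, e.getKey(), e.getValue());
                        if (++pending % 1000 == 0) {
                            pipeline.sync(); // flush a batch so the response buffer stays bounded
                        }
                    }
                }
            }
            pipeline.sync(); // flush whatever is left over
        }
    }
}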

Later, when this went into production, the user IDs had to be written into a Redis cluster, so that UDF no longer worked and I had to write a new one, because cluster mode has no pipeline support.

At first I didn't think it through and simply created a new JedisCluster instance on every call to evaluate, something like this:

    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {
        // JedisCluster can only be created inside the method as a local variable: it uses a connection
        // pool internally, and the pool is stateful and cannot be serialized.
        try (JedisCluster jedisCluster = new JedisCluster(hostAndPort)) {
            for (int i = 0; i < paramsListInspector.getListLength(arg0[2].get()); i++) {
                Object row = paramsListInspector.getListElement(arg0[2].get(), i);
                Map<?, ?> map = paramsElementInspector.getMap(row);
                if (map.containsKey(writableKeyField)) {
                    String did = map.get(writableKeyField).toString();
                    for (Map.Entry<?, ?> entry : map.entrySet()) {
                        if (!writableKeyField.equals(entry.getKey()) && entry.getValue() != null && !"".equals(entry.getValue().toString())) {
                            jedisCluster.hset(did, entry.getKey().toString(), entry.getValue().toString());
                        }
                    }
                }
            }
            return new IntWritable(1);
        } catch (Exception e) {
            e.printStackTrace();
            throw new HiveException(e);
        }
    }
Performance was abysmal: there were more than 3 million rows, and a JedisCluster was created for every single one. Worse, JedisCluster is not like Jedis; it internally maintains its own connection pool, so the per-row cost is frightening. If the job had still run fast I might not have cared, but it was unbearably slow, so it had to be optimized.

The first optimization was to have all rows processed in the same map container share a single JedisCluster instance. The instance is pulled out into a separate class and initialized lazily there; it cannot be a field of the UDF class initialized in initialize, because the UDF object would then fail to serialize:

import org.apache.commons.pool2.impl.GenericObjectPoolConfig;
import org.joda.time.DateTime;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.JedisCluster;

public class JedisClusterUtil {

    private static GenericObjectPoolConfig initPoolConfiguration() {
        System.out.println(Thread.currentThread().getName() + " ====== initializing Redis connection pool ====== " + DateTime.now().toString("yyyy-MM-dd HH:mm:ss"));
        GenericObjectPoolConfig config = new GenericObjectPoolConfig();
        config.setLifo(true);
        config.setTestOnBorrow(true);
        config.setTestOnReturn(false);
        config.setBlockWhenExhausted(true);
        // config.setMinIdle(Math.max(1, (int) (poolSize / 10))); // keep 10% hot always
        config.setMinIdle(5);
        config.setMaxTotal(500); // the number of Jedis connections to create
        config.setTestWhileIdle(false);
        config.setSoftMinEvictableIdleTimeMillis(3000L);
        config.setNumTestsPerEvictionRun(5);
        config.setTimeBetweenEvictionRunsMillis(5000L);
        config.setJmxEnabled(false);
        config.setMaxWaitMillis(5000);
//        config.setJmxEnabled(true);
        return config;
    }

    // Singleton: guarantees there is only one JedisCluster instance per map JVM, so all rows assigned
    // to this map container share it, which performs better than creating a JedisCluster in every
    // evaluate call.
    private static volatile JedisCluster jedisCluster = null;

    public static synchronized JedisCluster getJedisCluster(HostAndPort hostAndPort) {
        if (jedisCluster == null) {
            jedisCluster = new JedisCluster(hostAndPort, 10000, 10000, 3, initPoolConfiguration());
            Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
                // Clean up at exit
                @Override
                public void run() {
                    System.out.println(JedisClusterUtil.class.getSimpleName() + " shutdown");
                    try {
                        if (jedisCluster != null) {
                            jedisCluster.close();
                        }
                    } catch (Exception e) {
                        e.printStackTrace();
                    } finally {
                        jedisCluster = null;
                    }
                }
            }));
        }
        return jedisCluster;
    }
}


Finally, the UDF class itself:

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.IntWritable;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.exceptions.JedisException;

import java.util.HashMap;
import java.util.Map;

/*
-- increase the number of maps to improve concurrency:
set mapred.max.split.size=4194304;
set mapred.min.split.size.per.node=4194304;
set mapred.min.split.size.per.rack=4194304;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
set mapreduce.job.queuename=root.etl;
select xydb.redis_cluster_thread_hset('192.168.110.87:6379', concat('xydbuid.', uid), dim_map) as result
FROM xydb.yydb_user_act_xydb WHERE ds='2017-08-10' AND type='buy_lottery'
*/
@Description(name = "redis_cluster_thread_hset",
        value = "_FUNC_(host_and_port, redis_key, map<String,String>) - Return ret ")
public class RedisClusterThreadHSetUDF extends GenericUDF {

    private HostAndPort hostAndPort;
    private MapObjectInspector paramsElementInspector;
    private StringObjectInspector keyFieldOI;

    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {
        // JedisCluster cannot be a serializable field of the UDF (it holds a stateful connection pool),
        // so the shared instance is obtained lazily from JedisClusterUtil.
        try {
            long start = System.currentTimeMillis();
            Map<?, ?> map = paramsElementInspector.getMap(arg0[2].get());
            String did = keyFieldOI.getPrimitiveJavaObject(arg0[1].get());
            Map<String, String> data = new HashMap<>();
            for (Map.Entry<?, ?> entry : map.entrySet()) {
                if (entry.getValue() != null && !"".equals(entry.getValue().toString())) {
                    data.put(entry.getKey().toString(), entry.getValue().toString());
                }
            }
            try {
                System.out.println("submit:" + did + ",data:" + data);
                JedisClusterUtil.getJedisCluster(hostAndPort).hmset(did, data); // switched to hmset so all fields are submitted in one command
            } catch (JedisException e) {
                e.printStackTrace();
                System.out.println("retry submit:" + did + ",data:" + data);
                try {
                    JedisClusterUtil.getJedisCluster(hostAndPort).hmset(did, data);
                } catch (JedisException ex) {
                    ex.printStackTrace();
                    throw new HiveException(ex);
                }
            }
            System.out.println("cost :" + (System.currentTimeMillis() - start));
            return new IntWritable(1);
        } catch (Exception e) {
            e.printStackTrace();
            throw new HiveException(e);
        }
    }

    @Override
    public String getDisplayString(String[] arg0) {
        return "redis_cluster_thread_hset(host_and_port, redis_key, map<string,string>)";
    }

    @Override
    public ObjectInspector initialize(ObjectInspector[] arg0)
            throws UDFArgumentException {
        if (arg0.length != 3) {
            throw new UDFArgumentException(" Expecting three arguments: <redishost:port> <redis_key> map<string,string> ");
        }
        // validate the first argument
        if (arg0[0].getCategory() == Category.PRIMITIVE
                && ((PrimitiveObjectInspector) arg0[0]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            if (!(arg0[0] instanceof ConstantObjectInspector)) {
                throw new UDFArgumentException("redis host:port must be constant");
            }
            ConstantObjectInspector redishost_and_port = (ConstantObjectInspector) arg0[0];
            String[] host_and_port = redishost_and_port.getWritableConstantValue().toString().split(":");
            hostAndPort = new HostAndPort(host_and_port[0], Integer.parseInt(host_and_port[1]));
        }
        // validate the second argument
        if (arg0[1].getCategory() == Category.PRIMITIVE
                && ((PrimitiveObjectInspector) arg0[1]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            if (!(arg0[1] instanceof StringObjectInspector)) {
                throw new UDFArgumentException("redis hset key must be string");
            }
            keyFieldOI = (StringObjectInspector) arg0[1];
        }
        // validate the third argument
        if (arg0[2].getCategory() != Category.MAP) {
            throw new UDFArgumentException(" Expecting a map<string,string> field as third argument ");
        }
        paramsElementInspector = (MapObjectInspector) arg0[2];
        return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
    }
}


Then we make the job spawn as many map tasks as possible by setting these properties:

set mapred.max.split.size=4194304;
set mapred.min.split.size.per.node=4194304;
set mapred.min.split.size.per.rack=4194304;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;

Shrinking the split size makes Hive generate more map tasks, which raises the write concurrency. The full statement looks like this:

-- increase the number of maps to improve concurrency:
set mapred.max.split.size=4194304;
set mapred.min.split.size.per.node=4194304;
set mapred.min.split.size.per.rack=4194304;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
set mapreduce.job.queuename=root.etl;

select xydb.redis_cluster_thread_hset('192.168.110.87:6379', concat('xydbuid.', uid), dim_map) as result
FROM xydb.yydb_user_act_xydb
WHERE ds='2017-08-10' AND type='buy_lottery';
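For a rough, purely illustrative sense of scale (the post does not state the actual input size): with mapred.max.split.size=4194304, i.e. 4 MB, about 400 MB of input would be carved into roughly 100 splits and therefore roughly 100 map tasks, each map JVM holding one shared JedisCluster; with a typical default split size of 128 MB or more, the same input would get only a handful of mappers and far less write parallelism.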

redis_cluster_thread_hset takes three arguments. The first is a string with the host:port of one node in the cluster; a single node is enough, because JedisCluster discovers the rest automatically. The second string is the hset key.
The third argument is a map: its keys become the hset fields and its values become the hset values.
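The post doesn't show how the function is registered, so the statement below is only an assumed sketch: the jar path is a placeholder and the UDF class is referenced without a package (none is shown in the post); adjust both to your own build.

-- hypothetical jar location; the class name comes from the post but is shown there without a package
CREATE FUNCTION xydb.redis_cluster_thread_hset AS 'RedisClusterThreadHSetUDF'
USING JAR 'hdfs:///user/hive/udf/redis-cluster-hset-udf.jar';

Once the job has run, a key can be spot-checked from any node with redis-cli -c -h 192.168.110.87 -p 6379 hgetall xydbuid.<uid>, where <uid> is whichever user id you want to inspect; the -c flag makes redis-cli follow cluster redirects.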