Spout 并行重复读取问题
来源:互联网 发布:java将int转换成string 编辑:程序博客网 时间:2024/05/22 18:57
Spout如果单单设置executor的并行个数,那么其输出可能是有重复的,这样的并行策略是有问题的。
比如下面的Spout:
- package wc.redis.spout;
-
- import java.util.Map;
-
- import redis.clients.jedis.Jedis;
- import wc.redis.util.RedisUtils;
- import backtype.storm.spout.SpoutOutputCollector;
- import backtype.storm.task.TopologyContext;
- import backtype.storm.topology.OutputFieldsDeclarer;
- import backtype.storm.topology.base.BaseRichSpout;
- import backtype.storm.tuple.Fields;
- import backtype.storm.tuple.Values;
-
- public class WCSpout extends BaseRichSpout {
-
-
-
-
- private static final long serialVersionUID = 1L;
- private SpoutOutputCollector collector;
- private Jedis jedis;
- Integer taskId;
- String conponentId;
- String slow_fast;
- @Override
- public void open(@SuppressWarnings("rawtypes") Map conf, TopologyContext context,
- SpoutOutputCollector collector) {
- this.collector = collector;
- slow_fast = (String)conf.get("slow_fast");
- jedis = RedisUtils.connect(RedisUtils.HOSTNAME, RedisUtils.PORT, RedisUtils.INSERT_DB);
- taskId = context.getThisTaskId();
- conponentId = context.getThisComponentId();
- context.getThisTaskIndex();
- System.out.println(RedisUtils.getCurrDateWithInfo(conponentId, taskId, " WCSpout初始化完成!"));
- }
-
- @Override
- public void nextTuple() {
- long interval =0;
- while(true){
- interval++;
- String zero = getItem("0");
- String one = getItem("1");
- String two = getItem("2");
-
- try {
- Thread.sleep(200);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- if(zero==null||one==null||two==null){
-
-
- if(interval%15==0){
-
-
-
- }
- }else{
- this.collector.emit(new Values(zero+","+one+","+two));
- if(interval%15==0&&"fast".equals(slow_fast)){
-
-
- System.out.println(RedisUtils.getCurrDateWithInfo(conponentId, taskId, "Spout:["+zero+","+one+","+two+"]"));
- }else if("slow".equals(slow_fast)){
- System.out.println(RedisUtils.getCurrDateWithInfo(conponentId, taskId, "Spout:["+zero+","+one+","+two+"]"));
- }else{
- new RuntimeException("Wrong argument!");
- }
- }
-
- }
- }
-
- @Override
- public void declareOutputFields(OutputFieldsDeclarer declarer) {
- declarer.declare(new Fields("line"));
- }
-
-
-
-
-
- private String getItem(String index){
- if(!jedis.exists(index)){
- return null;
- }
- String val = jedis.get(index);
-
-
-
-
- jedis.del(index);
- return val;
- }
-
- }
这个Spout从Redis服务器中读取数据,读取之后再把对应的数据删除。由于“读取”和“删除”是分开的两步、并不是原子操作,当两个Spout task并行运行时,它们可能在任何一方删除之前都读到了同一份数据并各自输出,而Redis中的数据实际上只会被其中一个task删除,这样就会有重复数据输出,如图1所示:
图1
从图1的红色区域可以看到Spout的输出,从时间上可以看出两次输出只相差了1毫秒;从蓝色的框也可以看出,Spout的下一个Bolt收到了两条相同的数据,这就说明Spout输出了重复的数据。
所以Spout的并行策略应该是:获取当前task在组件内的序号(task index)以及该组件的task总数,再根据数据的特征(例如对数据内容取哈希值后对task总数取模)来选择由哪一个task负责处理这条数据(选择规则可以任意,甚至可以是伪随机的,只要所有task对同一条数据得出相同的结果即可),代码如下:
- package wc.redis.spout;
-
- import java.util.Map;
-
- import redis.clients.jedis.Jedis;
- import wc.redis.util.RedisUtils;
- import backtype.storm.spout.SpoutOutputCollector;
- import backtype.storm.task.TopologyContext;
- import backtype.storm.topology.OutputFieldsDeclarer;
- import backtype.storm.topology.base.BaseRichSpout;
- import backtype.storm.tuple.Fields;
- import backtype.storm.tuple.Values;
-
- public class WCSpout extends BaseRichSpout {
-
-
-
-
- private static final long serialVersionUID = 1L;
- private SpoutOutputCollector collector;
- private Jedis jedis;
- Integer taskId;
- String componentId;
- String slow_fast;
- int numTasks ;
- int thisTaskId;
- @Override
- public void open(@SuppressWarnings("rawtypes") Map conf, TopologyContext context,
- SpoutOutputCollector collector) {
- this.collector = collector;
- slow_fast = (String)conf.get("slow_fast");
- jedis = RedisUtils.connect(RedisUtils.HOSTNAME, RedisUtils.PORT, RedisUtils.INSERT_DB);
- taskId = context.getThisTaskId();
- componentId = context.getThisComponentId();
- numTasks = context.getComponentTasks(componentId).size();
- thisTaskId = context.getThisTaskIndex();
- System.out.println(RedisUtils.getCurrDateWithInfo(componentId, taskId, " WCSpout初始化完成!"));
- }
-
- @Override
- public void nextTuple() {
- long interval =0;
- while(true){
- interval++;
- String zero = getItem("0");
- String one = getItem("1");
- String two = getItem("2");
-
- try {
- Thread.sleep(200);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- if(zero==null||one==null||two==null){
-
-
-
-
- }else{
- String tmpStr =zero+","+one+","+two;
- if(thisTaskId==tmpStr.hashCode()%numTasks){
- this.collector.emit(new Values(tmpStr));
-
- if(interval%15==0&&"fast".equals(slow_fast)){
- System.out.println(RedisUtils.getCurrDateWithInfo(String.valueOf(thisTaskId),
- taskId, "Spout:["+zero+","+one+","+two+"]"));
- }else if("slow".equals(slow_fast)){
- System.out.println(RedisUtils.getCurrDateWithInfo(String.valueOf(thisTaskId),
- taskId, "Spout:["+zero+","+one+","+two+"]"));
- }else{
- new RuntimeException("Wrong argument!");
- }
- }
- }
-
- }
- }
-
- @Override
- public void declareOutputFields(OutputFieldsDeclarer declarer) {
- declarer.declare(new Fields("line"));
- }
-
-
-
-
-
- private String getItem(String index){
- if(!jedis.exists(index)){
- return null;
- }
- String val = jedis.get(index);
-
-
-
-
- jedis.del(index);
- return val;
- }
-
- }
使用上面的代码后,Spout的输出就不会重复了,同时也达到了把数据分发(distribution)到不同task的目的,如图2所示:
图2
从图2的红色框中可以看到:taskId 5的Spout在21:58 883输出之后,接着是taskId 6的Spout在21:59 494输出,然后又轮到taskId 5的Spout在21:59 905输出,其中并没有重复记录。
0 0