02-Storm之Hello World:单词统计

来源:互联网 发布:大疆通信算法工程师 编辑:程序博客网 时间:2024/06/15 22:41

1、说明

设计一个topology,统计句子中每个单词出现的频率。

整个topology分为三个部分:
RandomSentenceSpout:数据源,在已知的英文句子中,随机发送一条句子出去。
SplitSentenceBolt:负责将单行文本记录(句子)切分成单词
WordCountBolt:负责对单词的频率进行累加

2、TopologyMain 驱动类

package wordcount2;import backtype.storm.Config;import backtype.storm.LocalCluster;import backtype.storm.StormSubmitter;import backtype.storm.generated.AlreadyAliveException;import backtype.storm.generated.InvalidTopologyException;import backtype.storm.topology.TopologyBuilder;import backtype.storm.tuple.Fields;import wordcount.SplitBolt;/** * Created by Cage on 2016/10/14. */public class TopologyMain {    public static void main(String[] args) throws AlreadyAliveException, InvalidTopologyException {        //Storm框架支持多语言,在Java环境下创建一个拓扑,需要使用TopologyBuilder进行构建        TopologyBuilder builder = new TopologyBuilder();        //RandomSentenceSpout类,在已知的英文句子中,随机发送一条句子出去        builder.setSpout("spout1",new RandomSentenceSpout(),3);        //SplitSentenceBolt类,主要是将一行一行的文本内容切割成单词        builder.setBolt("split1",new SplitSentenceBolt(),9).shuffleGrouping("spout1");        //WordCountBolt类,对单词出现的次数进行统计        builder.setBolt("count2",new WordCountBolt(),3).fieldsGrouping("split1",new Fields("word"));        //启动topology的配置信息        Config conf = new Config();        //TOPOLOGY_DEBUG(setDebug),当他被设置成true的话,storm会记录下每个组件所发射的每条消息        //这在本地环境调试topology很有用。但是在线上这么做的话,会影响性能        conf.setDebug(false);        //storm的运行模式有两种:本地模式和分布式模式//        if(args != null || args.length>0){//            conf.setNumWorkers(3);//            //向集群提交topology//            StormSubmitter.submitTopologyWithProgressBar(args[0],conf,builder.createTopology());//        }else{            conf.setMaxTaskParallelism(3);            LocalCluster cluster = new LocalCluster();            cluster.submitTopology("word-count",conf,builder.createTopology());//        }    }}

3、RandomSentenceSpout

package wordcount2;

import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseBasicBolt;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

import java.util.Map;
import java.util.Random;

/**
 * Spout that emits a randomly chosen sentence from a fixed list on every
 * {@link #nextTuple()} call.
 *
 * <p>Created by Cage on 2016/10/14.
 */
public class RandomSentenceSpout extends BaseRichSpout {

    /**
     * The fixed pool of sentences to pick from. Kept as a constant so the array
     * is not re-allocated on every {@link #nextTuple()} invocation.
     */
    private static final String[] SENTENCES = {
            "the cow jumped over the moon", "the dog jumped over the moon",
            "the pig jumped over the gun", "the fish jumped over the moon", "the duck jumped over the moon",
            "the man jumped over the sun", "the girl jumped over the sun", "the boy jumped over the sun"
    };

    /** Collects the tuples this spout emits. */
    private SpoutOutputCollector collector;

    /** Source of randomness used to pick a sentence. */
    private Random random;

    /**
     * Stores the collector handed in by the framework and creates the random
     * generator. Per the original author's note this is called once, and is
     * the place to open external resources (Kafka, MySQL, local files).
     *
     * @param conf      topology configuration (unused)
     * @param context   topology context (unused)
     * @param collector collector used to emit tuples
     */
    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        this.collector = collector;
        random = new Random();
    }

    /**
     * Emits one randomly selected sentence as a single-field tuple and logs it
     * to standard output. Per the original author's note, the framework calls
     * this method repeatedly in a loop.
     */
    @Override
    public void nextTuple() {
        String sentence = SENTENCES[random.nextInt(SENTENCES.length)];
        collector.emit(new Values(sentence));
        System.out.println("RandomSentenceSpout 发送数据:"+sentence);
    }

    /**
     * Declares the single output field, {@code "sentence"}.
     *
     * @param declarer receives the output field declaration
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("sentence"));
    }
}

4、SplitSentenceBolt

package wordcount2;

import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseBasicBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

import java.util.Map;

/**
 * Bolt that splits each incoming sentence into lower-cased words and emits one
 * {@code (word, 1)} tuple per word.
 *
 * <p>Created by Cage on 2016/10/14.
 */
public class SplitSentenceBolt extends BaseBasicBolt {

    /**
     * Initialization hook; delegates to the superclass. Per the original
     * author's note it is invoked once.
     *
     * @param stormConf topology configuration
     * @param context   topology context
     */
    @Override
    public void prepare(Map stormConf, TopologyContext context) {
        super.prepare(stormConf, context);
    }

    /**
     * Reads the {@code "sentence"} field of the input tuple, splits it on
     * spaces and emits each non-empty word (trimmed, lower-cased) with a
     * count of 1.
     *
     * <p>Empty tokens, which {@code split(" ")} produces for consecutive
     * spaces, are skipped so that the empty string is never counted as a word.
     * A tuple whose sentence is {@code null} is ignored.
     *
     * @param input     tuple carrying the sentence
     * @param collector collector used to emit the word tuples
     */
    @Override
    public void execute(Tuple input, BasicOutputCollector collector) {
        String sentence = (String) input.getValueByField("sentence");
        if (sentence == null) {
            // Nothing to split; avoid a NullPointerException on split().
            return;
        }
        for (String token : sentence.split(" ")) {
            String word = token.trim();
            // The original guard (!word.equals("") || word != null) was always
            // true, so blank tokens slipped through. Skip them explicitly.
            if (word.isEmpty()) {
                continue;
            }
            word = word.toLowerCase();
            System.out.println("SplitSentenceBolt 切割单词:"+ word);
            collector.emit(new Values(word,1));
        }
    }

    /**
     * Declares the two output fields, {@code "word"} and {@code "num"}.
     *
     * @param declarer receives the output field declaration
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word","num"));
    }
}

5、WordCountBolt

package wordcount2;

import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseBasicBolt;
import backtype.storm.tuple.Tuple;

import java.util.HashMap;
import java.util.Map;

/**
 * Bolt that keeps a running total per word in an in-memory map and prints the
 * map after every tuple.
 *
 * <p>Created by Cage on 2016/10/14.
 */
public class WordCountBolt extends BaseBasicBolt {

    /** Running totals, keyed by word. */
    private Map<String,Integer> counters = new HashMap<String, Integer>();

    /**
     * Initialization hook; delegates to the superclass. Per the original
     * author's note it is invoked once.
     *
     * @param stormConf topology configuration
     * @param context   topology context
     */
    @Override
    public void prepare(Map stormConf, TopologyContext context) {
        super.prepare(stormConf, context);
    }

    /**
     * Adds the tuple's {@code "num"} value to the total stored for its
     * {@code "word"}; a word seen for the first time starts at {@code "num"}.
     * Logs the current thread id with the word, then the whole counter map.
     *
     * @param input     tuple carrying the {@code "word"} and {@code "num"} fields
     * @param collector unused; this bolt emits nothing
     */
    @Override
    public void execute(Tuple input, BasicOutputCollector collector) {
        String word = (String)input.getValueByField("word");
        Integer increment = input.getIntegerByField("num");
        System.out.println("----------------------"+Thread.currentThread().getId() + "    "+ word);

        if (counters.containsKey(word)) {
            // Known word: accumulate onto the existing total.
            Integer updated = counters.get(word) + increment;
            counters.put(word, updated);
        } else {
            // First occurrence: seed the total with the incoming value.
            counters.put(word, increment);
        }

        System.out.println("WordCountBolt 统计单词:"+counters);
    }

    /**
     * Declares no output fields.
     *
     * @param declarer unused
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
    }
}

6、运行结果

SplitSentenceBolt 切割单词:cow14:32:24.292 [split1:7-BoltExecutors] INFO  com.alibaba.jstorm.utils.RotatingMap - key:TupleImplExt[targetTaskId=7,creationTimeStamp=1476426729517,values=[the fish jumped over the moon],taskId=8,streamId=default,context=backtype.storm.task.TopologyContext@772e95e8,id={},_meta=<null>,_processSampleStartTime=<null>,_executeSampleStartTime=<null>,_outAckVal=0,_map=<null>]   value:2SplitSentenceBolt 切割单词:jumped14:32:24.292 [count2:2-BoltExecutors] INFO  com.alibaba.jstorm.utils.RotatingMap - key:TupleImplExt[targetTaskId=2,creationTimeStamp=1476426743859,values=[over, 1],taskId=6,streamId=default,context=backtype.storm.task.TopologyContext@4f72bcb4,id={},_meta=<null>,_processSampleStartTime=<null>,_executeSampleStartTime=<null>,_outAckVal=0,_map=<null>]   value:1476426744292----------------------106    over14:32:24.292 [count2:1-BoltExecutors] INFO  com.alibaba.jstorm.utils.RotatingMap - key:TupleImplExt[targetTaskId=1,creationTimeStamp=1476426744291,values=[dog, 1],taskId=7,streamId=default,context=backtype.storm.task.TopologyContext@3967ba17,id={},_meta=<null>,_processSampleStartTime=<null>,_executeSampleStartTime=<null>,_outAckVal=0,_map=<null>]   value:1476426744292WordCountBolt 统计单词:{over=41261, moon=20716, fish=5129, duck=5158, the=82520, boy=5113, gun=5269, sun=15274}14:32:24.292 [pool-15-thread-4] INFO  com.alibaba.jstorm.utils.RotatingMap - key:TupleImplExt[targetTaskId=7,creationTimeStamp=1476426729517,values=[the dog jumped over the moon],taskId=8,streamId=default,context=backtype.storm.task.TopologyContext@772e95e8,id={},_meta=<null>,_processSampleStartTime=<null>,_executeSampleStartTime=<null>,_outAckVal=0,_map=<null>]   value:641555521649770099----------------------112    dog14:32:24.292 [pool-15-thread-1] INFO  com.alibaba.jstorm.utils.RotatingMap - key:TupleImplExt[targetTaskId=6,creationTimeStamp=1476426729515,values=[the dog jumped over the 
moon],taskId=10,streamId=default,context=backtype.storm.task.TopologyContext@5782b51a,id={},_meta=<null>,_processSampleStartTime=<null>,_executeSampleStartTime=<null>,_outAckVal=0,_map=<null>]   value:2WordCountBolt 统计单词:{jumped=42463, cow=5394, dog=5340, pig=5434}
0 0