02-Storm之Hello World:单词统计
来源:互联网 发布:大疆通信算法工程师 编辑:程序博客网 时间:2024/06/15 22:41
1、说明
设计一个topology,统计句子中每个单词出现的频率。
整个topology分为三个部分:
RandomSentenceSpout:数据源,在已知的英文句子中,随机发送一条句子出去。
SplitSentenceBolt:负责将单行文本记录(句子)切分成单词
WordCountBolt:负责对单词的频率进行累加
2、TopologyMain 驱动类
package wordcount2;import backtype.storm.Config;import backtype.storm.LocalCluster;import backtype.storm.StormSubmitter;import backtype.storm.generated.AlreadyAliveException;import backtype.storm.generated.InvalidTopologyException;import backtype.storm.topology.TopologyBuilder;import backtype.storm.tuple.Fields;import wordcount.SplitBolt;/** * Created by Cage on 2016/10/14. */public class TopologyMain { public static void main(String[] args) throws AlreadyAliveException, InvalidTopologyException { //Storm框架支持多语言,在Java环境下创建一个拓扑,需要使用TopologyBuilder进行构建 TopologyBuilder builder = new TopologyBuilder(); //RandomSentenceSpout类,在已知的英文句子中,随机发送一条句子出去 builder.setSpout("spout1",new RandomSentenceSpout(),3); //SplitSentenceBolt类,主要是将一行一行的文本内容切割成单词 builder.setBolt("split1",new SplitSentenceBolt(),9).shuffleGrouping("spout1"); //WordCountBolt类,对单词出现的次数进行统计 builder.setBolt("count2",new WordCountBolt(),3).fieldsGrouping("split1",new Fields("word")); //启动topology的配置信息 Config conf = new Config(); //TOPOLOGY_DEBUG(setDebug),当他被设置成true的话,storm会记录下每个组件所发射的每条消息 //这在本地环境调试topology很有用。但是在线上这么做的话,会影响性能 conf.setDebug(false); //storm的运行模式有两种:本地模式和分布式模式// if(args != null || args.length>0){// conf.setNumWorkers(3);// //向集群提交topology// StormSubmitter.submitTopologyWithProgressBar(args[0],conf,builder.createTopology());// }else{ conf.setMaxTaskParallelism(3); LocalCluster cluster = new LocalCluster(); cluster.submitTopology("word-count",conf,builder.createTopology());// } }}
3、RandomSentenceSpout
package wordcount2;import backtype.storm.spout.SpoutOutputCollector;import backtype.storm.task.TopologyContext;import backtype.storm.topology.OutputFieldsDeclarer;import backtype.storm.topology.base.BaseBasicBolt;import backtype.storm.topology.base.BaseRichSpout;import backtype.storm.tuple.Fields;import backtype.storm.tuple.Values;import java.util.Map;import java.util.Random;/** * Created by Cage on 2016/10/14. */public class RandomSentenceSpout extends BaseRichSpout{ //用来收集Spout输出的tuple private SpoutOutputCollector collector; private Random random; //该方法调用一次,主要由storm框架传入SpoutOutputCollector public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) { this.collector = collector; random = new Random(); //连接kafka mysql ,打开本地文件 } /** * 上帝之手 * while(true) * spout.nextTuple() */ public void nextTuple() { String[] sentences = new String[]{"the cow jumped over the moon","the dog jumped over the moon", "the pig jumped over the gun","the fish jumped over the moon","the duck jumped over the moon", "the man jumped over the sun","the girl jumped over the sun","the boy jumped over the sun"}; String sentence = sentences[random.nextInt(sentences.length)]; collector.emit(new Values(sentence)); System.out.println("RandomSentenceSpout 发送数据:"+sentence); } //消息源可以发射多条消息流stream public void declareOutputFields(OutputFieldsDeclarer declarer) { declarer.declare(new Fields("sentence")); }}
4、SplitSentenceBolt
package wordcount2;import backtype.storm.task.TopologyContext;import backtype.storm.topology.BasicOutputCollector;import backtype.storm.topology.OutputFieldsDeclarer;import backtype.storm.topology.base.BaseBasicBolt;import backtype.storm.tuple.Fields;import backtype.storm.tuple.Tuple;import backtype.storm.tuple.Values;import java.util.Map;/** * Created by Cage on 2016/10/14. */public class SplitSentenceBolt extends BaseBasicBolt { /** * 该方法只会被调用一次,用来初始化 * @param stormConf * @param context */ @Override public void prepare(Map stormConf, TopologyContext context) { super.prepare(stormConf, context); } /** * 接收的参数是RandomSentenceSpout发出的句子,即input的内容是句子 * execute 方法将句子切割形成的单词发出 * @param input * @param collector */ public void execute(Tuple input, BasicOutputCollector collector) { String sentence = (String) input.getValueByField("sentence"); String[] words = sentence.split(" "); for(String word:words){ word = word.trim(); if(!word.equals("") || word!=null){ word = word.toLowerCase(); System.out.println("SplitSentenceBolt 切割单词:"+ word); collector.emit(new Values(word,1)); } } } /** * 消息源可以发射多条消息流stream,多条消息可以理解为多种类型的数据 * @param declarer */ public void declareOutputFields(OutputFieldsDeclarer declarer) { declarer.declare(new Fields("word","num")); }}
5、WordCountBolt
package wordcount2;import backtype.storm.task.TopologyContext;import backtype.storm.topology.BasicOutputCollector;import backtype.storm.topology.OutputFieldsDeclarer;import backtype.storm.topology.base.BaseBasicBolt;import backtype.storm.tuple.Tuple;import java.util.HashMap;import java.util.Map;/** * Created by Cage on 2016/10/14. */public class WordCountBolt extends BaseBasicBolt { private Map<String,Integer> counters = new HashMap<String, Integer>(); //该方法只会被调用一次,用来初始化 @Override public void prepare(Map stormConf, TopologyContext context) { super.prepare(stormConf, context); } /** * 将collector中的元素存放在成员变量counters(Map)中 * 如果counters中已经存在钙元素,getValue并对value进行累加操作 * @param input * @param collector */ public void execute(Tuple input, BasicOutputCollector collector) { String str = (String)input.getValueByField("word"); Integer num = input.getIntegerByField("num"); System.out.println("----------------------"+Thread.currentThread().getId() + " "+ str); if(!counters.containsKey(str)){ counters.put(str,num); }else{ Integer c = counters.get(str) + num; counters.put(str,c); } System.out.println("WordCountBolt 统计单词:"+counters); } public void declareOutputFields(OutputFieldsDeclarer declarer) { }}
6、运行结果
SplitSentenceBolt 切割单词:cow14:32:24.292 [split1:7-BoltExecutors] INFO com.alibaba.jstorm.utils.RotatingMap - key:TupleImplExt[targetTaskId=7,creationTimeStamp=1476426729517,values=[the fish jumped over the moon],taskId=8,streamId=default,context=backtype.storm.task.TopologyContext@772e95e8,id={},_meta=<null>,_processSampleStartTime=<null>,_executeSampleStartTime=<null>,_outAckVal=0,_map=<null>] value:2SplitSentenceBolt 切割单词:jumped14:32:24.292 [count2:2-BoltExecutors] INFO com.alibaba.jstorm.utils.RotatingMap - key:TupleImplExt[targetTaskId=2,creationTimeStamp=1476426743859,values=[over, 1],taskId=6,streamId=default,context=backtype.storm.task.TopologyContext@4f72bcb4,id={},_meta=<null>,_processSampleStartTime=<null>,_executeSampleStartTime=<null>,_outAckVal=0,_map=<null>] value:1476426744292----------------------106 over14:32:24.292 [count2:1-BoltExecutors] INFO com.alibaba.jstorm.utils.RotatingMap - key:TupleImplExt[targetTaskId=1,creationTimeStamp=1476426744291,values=[dog, 1],taskId=7,streamId=default,context=backtype.storm.task.TopologyContext@3967ba17,id={},_meta=<null>,_processSampleStartTime=<null>,_executeSampleStartTime=<null>,_outAckVal=0,_map=<null>] value:1476426744292WordCountBolt 统计单词:{over=41261, moon=20716, fish=5129, duck=5158, the=82520, boy=5113, gun=5269, sun=15274}14:32:24.292 [pool-15-thread-4] INFO com.alibaba.jstorm.utils.RotatingMap - key:TupleImplExt[targetTaskId=7,creationTimeStamp=1476426729517,values=[the dog jumped over the moon],taskId=8,streamId=default,context=backtype.storm.task.TopologyContext@772e95e8,id={},_meta=<null>,_processSampleStartTime=<null>,_executeSampleStartTime=<null>,_outAckVal=0,_map=<null>] value:641555521649770099----------------------112 dog14:32:24.292 [pool-15-thread-1] INFO com.alibaba.jstorm.utils.RotatingMap - key:TupleImplExt[targetTaskId=6,creationTimeStamp=1476426729515,values=[the dog jumped over the 
moon],taskId=10,streamId=default,context=backtype.storm.task.TopologyContext@5782b51a,id={},_meta=<null>,_processSampleStartTime=<null>,_executeSampleStartTime=<null>,_outAckVal=0,_map=<null>] value:2WordCountBolt 统计单词:{jumped=42463, cow=5394, dog=5340, pig=5434}
0 0
- 02-Storm之Hello World:单词统计
- storm之hello world
- storm程序-单词统计wordcount
- Storm实时流处理Hello World
- Storm(0.6.1)的Hello World
- hibernate之Hello world
- Spring之Hello World
- Lucene之Hello world
- DWR之HELLO world
- JNI 之 hello world
- ISAPI之Hello world
- Spring之Hello World
- ACE之hello world
- Hello,world之Erlang
- DWR之 Hello world
- android之Hello World!
- Android之Hello World
- lemon之Hello World
- JAVA-IO流中FileReader和FileWriter方法的使用
- SequoiaDB巨杉数据库的分区类型和分区方式
- VS2010编译OpenSSL
- html5移动端知识点总结
- TypeScript数据类型-TS学习笔记(1)
- 02-Storm之Hello World:单词统计
- java 基础测试
- mysql只explain 是一种美德(sql性能优化)
- HTML 根据元素属性的选择性显示与隐藏
- 2016年10月11号 完善NVR端ONVIF异步事件
- Android : Thread, handler, messagequeue, message, Looper 的交互。
- 小轮软件的用户体验提升
- android edittext进行字符限制
- Jenkins安装及使用