Storm 1.1.0 <Trident + Window + HBase Integration for Word-Frequency TopN>
1. Reviewing the old to learn the new: this example implements a TopN ranking on top of a word-frequency count. The Maven dependencies used are:
<dependencies>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-core</artifactId>
        <version>1.1.0</version>
        <!--<scope>provided</scope>-->
    </dependency>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-hbase</artifactId>
        <version>1.1.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.3</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-api</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.zookeeper</groupId>
        <artifactId>zookeeper</artifactId>
        <version>3.4.6</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-api</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
</dependencies>
2. Code implementation
import org.apache.hadoop.hbase.client.Durability;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.hbase.trident.mapper.SimpleTridentHBaseMapper;
import org.apache.storm.hbase.trident.mapper.TridentHBaseMapper;
import org.apache.storm.hbase.trident.state.HBaseState;
import org.apache.storm.hbase.trident.state.HBaseStateFactory;
import org.apache.storm.hbase.trident.state.HBaseUpdater;
import org.apache.storm.topology.base.BaseWindowedBolt;
import org.apache.storm.trident.TridentTopology;
import org.apache.storm.trident.operation.BaseAggregator;
import org.apache.storm.trident.operation.BaseFunction;
import org.apache.storm.trident.operation.FlatMapFunction;
import org.apache.storm.trident.operation.TridentCollector;
import org.apache.storm.trident.state.StateFactory;
import org.apache.storm.trident.testing.FixedBatchSpout;
import org.apache.storm.trident.tuple.TridentTuple;
import org.apache.storm.trident.windowing.config.SlidingDurationWindow;
import org.apache.storm.trident.windowing.config.WindowConfig;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;

import java.util.*;

public class TopNTopology {

    // Receives the per-window word-count map and emits the top N entries
    // as ("rank", "word", "count") tuples.
    private static class TopNFunction extends BaseFunction {
        private final int topN;

        public TopNFunction(int n) {
            this.topN = n;
        }

        @Override
        @SuppressWarnings("unchecked")
        public void execute(TridentTuple tuple, TridentCollector collector) {
            HashMap<String, Long> counts = (HashMap<String, Long>) tuple.get(0);
            List<Map.Entry<String, Long>> list = new ArrayList<>(counts.entrySet());
            // Sort ascending by count, then walk the list from the tail for the top N.
            Collections.sort(list, new Comparator<Map.Entry<String, Long>>() {
                @Override
                public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) {
                    return o1.getValue().compareTo(o2.getValue());
                }
            });
            int rank = 1;
            for (int j = list.size() - 1; j >= 0; j--) {
                if (rank > topN) {
                    break;
                }
                collector.emit(new Values(String.valueOf(rank), list.get(j).getKey(),
                        String.valueOf(list.get(j).getValue())));
                System.out.println("Sending: " + rank + " " + list.get(j).getKey()
                        + ": " + list.get(j).getValue());
                rank++;
            }
            System.out.println("----------------done----------------------");
        }
    }

    // Splits each sentence into individual words.
    private static class SplitFunction implements FlatMapFunction {
        @Override
        public Iterable<Values> execute(TridentTuple input) {
            ArrayList<Values> values = new ArrayList<>();
            String sentence = input.getStringByField("sentence");
            for (String word : sentence.split(" ")) {
                values.add(new Values(word));
            }
            return values;
        }
    }

    // Accumulates word counts for one window into a single HashMap,
    // which is emitted as one tuple when the window completes.
    private static class WordAggregator extends BaseAggregator<HashMap<String, Long>> {
        @Override
        public HashMap<String, Long> init(Object batchId, TridentCollector collector) {
            return new HashMap<>();
        }

        @Override
        public void aggregate(HashMap<String, Long> val, TridentTuple tuple, TridentCollector collector) {
            String word = tuple.getStringByField("word");
            long count = 1;
            if (val.containsKey(word)) {
                count += val.get(word);
            }
            val.put(word, count);
        }

        @Override
        public void complete(HashMap<String, Long> val, TridentCollector collector) {
            collector.emit(new Values(val));
        }
    }

    public static void main(String[] args)
            throws InvalidTopologyException, AuthorizationException, AlreadyAliveException {
        // A test spout that cycles through a fixed set of sentences in batches of 3.
        FixedBatchSpout spout = new FixedBatchSpout(new Fields("sentence"), 3,
                new Values("the cow jumped over the moon"),
                new Values("the man went to the store and bought some candy"),
                new Values("four score and seven years ago"),
                new Values("how many apples can you eat"),
                new Values("to be or not to be the person"));
        spout.setCycle(true);

        // Map tuples to HBase: rowkey = rank, column family "result",
        // columns "word" and "count". Each window's TopN overwrites rows 1..5.
        TridentHBaseMapper tridentHBaseMapper = new SimpleTridentHBaseMapper()
                .withColumnFamily("result")
                .withColumnFields(new Fields("word", "count"))
                .withRowKeyField("rank");

        HBaseState.Options options = new HBaseState.Options()
                .withConfigKey("hbase")
                .withDurability(Durability.SYNC_WAL)
                .withMapper(tridentHBaseMapper)
                .withTableName("Top5Count");
        StateFactory hBaseStateFactory = new HBaseStateFactory(options);

        // A 10-second window that slides forward every 5 seconds.
        WindowConfig durationWindow = SlidingDurationWindow.of(
                BaseWindowedBolt.Duration.seconds(10),
                BaseWindowedBolt.Duration.seconds(5));

        TridentTopology topology = new TridentTopology();
        topology.newStream("fixedSpout", spout)
                .flatMap(new SplitFunction(), new Fields("word"))
                .window(durationWindow, new Fields("word"), new WordAggregator(), new Fields("wordcount"))
                .each(new Fields("wordcount"), new TopNFunction(5), new Fields("rank", "word", "count"))
                .partitionPersist(hBaseStateFactory, new Fields("rank", "word", "count"),
                        new HBaseUpdater(), new Fields());

        Config conf = new Config();
        // HBase client settings are looked up under this key; an empty map means
        // the defaults from hbase-site.xml on the classpath are used.
        conf.put("hbase", new HashMap<String, Object>());
        if (args.length == 0) {
            // No arguments: run in a local in-process cluster for testing.
            LocalCluster cluster = new LocalCluster();
            cluster.submitTopology("Top5Topology", conf, topology.build());
        } else {
            // With a topology name argument: submit to a real cluster.
            conf.setNumWorkers(3);
            StormSubmitter.submitTopologyWithProgressBar(args[0], conf, topology.build());
        }
    }
}
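The topology above registers an empty map under the "hbase" config key, which works when an hbase-site.xml carrying the cluster settings is already on the classpath. When targeting a remote cluster, the client settings can instead be passed through that map, since storm-hbase copies its entries into the HBaseConfiguration it builds. A minimal sketch; the HDFS and ZooKeeper hostnames below are placeholders, not values from this article:

// Sketch only: replace the placeholder hostnames with your own cluster's.
Map<String, Object> hbConf = new HashMap<>();
hbConf.put("hbase.rootdir", "hdfs://namenode:8020/hbase");   // placeholder HDFS URI
hbConf.put("hbase.zookeeper.quorum", "zk1,zk2,zk3");         // placeholder ZK quorum
hbConf.put("hbase.zookeeper.property.clientPort", "2181");
conf.put("hbase", hbConf);  // same key as withConfigKey("hbase") above

Note also that storm-hbase writes to an existing table and does not create it, so the table should be created before the topology starts, e.g. in the HBase shell: create 'Top5Count', 'result'.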