013-通过trident实现单词计数功能

来源:互联网 发布:淘宝卖的印度威格拉 编辑:程序博客网 时间:2024/05/22 04:10
使用trident框架完成wordcount单词计数,使用到大量的trident函数,比如分组,合并等

package storm.trident;import java.io.File;import java.io.IOException;import java.util.ArrayList;import java.util.Collection;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.Map.Entry;import kafka.cluster.Cluster;import org.apache.commons.collections.MapUtils;import org.apache.commons.io.FileUtils;import storm.trident.fluent.GroupedStream;import storm.trident.operation.BaseAggregator;import storm.trident.operation.BaseFunction;import storm.trident.operation.TridentCollector;import storm.trident.spout.IBatchSpout;import storm.trident.tuple.TridentTuple;import backtype.storm.Config;import backtype.storm.LocalCluster;import backtype.storm.task.TopologyContext;import backtype.storm.tuple.Fields;import backtype.storm.tuple.Values;import backtype.storm.utils.Utils;/*** Trident 实现单词计数** @author shenfl**/public class TridentWordCount {     public static void main(String[] args) {          TridentTopology tridentTopology = new TridentTopology();          Stream stream = tridentTopology.newStream("spout_id", new DataSourceSpout());          GroupedStream groupedStream = stream.each(new Fields("line"), new SplitBolt(), new Fields("word"))// 单词切分                    .groupBy(new Fields("word"));// word字段分组          // 对每批tuple进行聚合          Stream batchMap = groupedStream.aggregate(new Fields("word"), new WordBatchAggregate(), new Fields("batchMap"));          // 对所有batch进行汇总输出          batchMap.each(new Fields("batchMap"), new TotalAggregator(), new Fields(""));          LocalCluster localCluster = new LocalCluster();          localCluster.submitTopology(TridentTopology.class.getSimpleName(), new Config(), tridentTopology.build());     }     /**     *     * 这里简单打印输出     *     * @author shenfl     *     */     public static class TotalAggregator extends BaseFunction {          // 汇总hashMap          Map<String, Integer> hashMap = new HashMap<String, Integer>();          public void execute(TridentTuple tuple, TridentCollector collector) {               Map<String, Integer> batchMap = (Map<String, Integer>) tuple.getValueByField("batchMap");               for (Map.Entry<String, Integer> entry : batchMap.entrySet()) {                    // 单词                    String word = entry.getKey();                    // 对应单词在batch中的数量                    Integer value = entry.getValue();                    Integer count = hashMap.get(word);                    if (count == null) {                         count = 0;                    }                    hashMap.put(word, value + count);               }               Utils.sleep(1000);               System.out.println("===============");               for (Entry<String, Integer> entry : hashMap.entrySet()) {                    System.out.println(entry);               }          }     }     /**     * 每批tuple进行聚合,聚合结果位<k,v> ,k表示单词,v表示单词出现的次数     *     * @author shenfl     *     */     public static class WordBatchAggregate extends BaseAggregator<Map<String, Integer>> {          public Map<String, Integer> init(Object batchId, TridentCollector collector) {               return new HashMap<String, Integer>();          }          /**          * @param tuple          *            一个单词          * @val tuple 单词出现的次数          */          public void aggregate(Map<String, Integer> val, TridentTuple tuple, TridentCollector collector) {               String word = tuple.getStringByField("word");               Integer v = val.get(word);               if (v == null) {                    v = 0;               }               v++;               val.put(word, v);          }          public void complete(Map<String, Integer> val, TridentCollector collector) {               // 聚合完成后,完成batchId这一批单词的统计               collector.emit(new Values(val));          }     }     /**     * 对每行文本进行切割 ,然后word发射,每个tuple是一个单词     *     * @author shenfl     *     */     public static class SplitBolt extends BaseFunction {          public void execute(TridentTuple tuple, TridentCollector collector) {               String line = tuple.getString(0);               String[] words = line.split("\t");               for (String word : words) {                    collector.emit(new Values(word));               }          }     }     /**     * 数据源     *     * @author shenfl     *     */     public static class DataSourceSpout implements IBatchSpout {          private Map conf;          private TopologyContext context;          /**          *          */          private static final long serialVersionUID = 1L;          HashMap<Long, List<List<Object>>> batches = new HashMap<Long, List<List<Object>>>();          public void open(Map conf, TopologyContext context) {               this.conf = conf;               this.context = context;          }          int i = 0;          /**          * 死循环,,emitBatch实现每次发送多个tuple,每次都一个batchId          */          public void emitBatch(long batchId, TridentCollector collector) {               List<List<Object>> batch = this.batches.get(batchId);               if (batch == null) {                    batch = new ArrayList<List<Object>>();                    // 读取文件列表                    Collection<File> listFiles = FileUtils.listFiles(new File("d:/test"), new String[] { "txt" }, true);                    try {                         for (File file : listFiles) {                              // 当前文件所有行                              List<String> Lines = FileUtils.readLines(file);                              for (String line : Lines) {                                   // 每个参数代表一个tuple                                   batch.add(new Values(line));                              }                              // 文件移动别的目录                              FileUtils.moveFile(file, new File(file + ".done" + System.currentTimeMillis()));                         }                    } catch (IOException e) {                         e.printStackTrace();                    }                    this.batches.put(batchId, batch);               }               // 以批为单位进行发射               for (List<Object> list : batch) {                    collector.emit(list);               }          }          public void ack(long batchId) {               this.batches.remove(batchId);          }          public void close() {          }          public Map getComponentConfiguration() {               Config conf = new Config();               conf.setMaxTaskParallelism(1);               return conf;          }          /**          * 每个tuple以行为为单位进行发射,bolt每次接收一行          */          public Fields getOutputFields() {               return new Fields("line");          }     }}


0 0
原创粉丝点击