Storm之WordCount

来源:互联网 发布:vscode go环境 编辑:程序博客网 时间:2024/06/06 10:00
package com.uplooking.bigdata.storm.local;

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.shade.org.apache.commons.io.FileUtils;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;

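/**
 * Word count: monitors a directory for files; when a new file is found, the file is
 * read, its content is parsed and the total number of word occurrences is counted.
 *
 * <p>Example: E:/test/storm/a.txt ----> the spout has to watch this directory for new files
 * <pre>
 *     hello you
 *     hello me
 *     hello he
 * </pre>
 *
 * <p>The new file is read into memory and its data is sent to the downstream bolts for processing:
 * <ol>
 *   <li>Step one: split the data that was read into words, e.g. "hello you" ---> the two words hello, you</li>
 *   <li>Step two: count the number of words that occurred</li>
 * </ol>
 * Usually one bolt does one job, so there are two bolts here: the first one splits the
 * lines into words, the second one does the word counting.
 */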
public class LocalWordCountTopology {
    //做数据源,监听目录,当有新文件产生,读取其中的内容,发送到下游bolt
    static class WCSpout extends BaseRichSpout {
        private Map conf;
        private TopologyContext context;
        private SpoutOutputCollector collector;

        /**
         * 这是一个生命周期方法,一个SumNumSpout实例只运行一次,主要完成初始化的参数设置
         * @param conf      ---->storm程序以及storm集群相关的配置信息
         * @param context   ---->整个Topology上下文对象,可以通过该context获得相关topology应用属性
         * @param collector ---->主要用于收集数据,并将数据发射到下一个阶段
         */
        public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
            this.conf = conf;
            this.context = context;
            this.collector = collector;
        }

        //监听一个目录新文件的产生
        public void nextTuple() {
            /**
             * File directory ----> 要要监控的目录对象
             * String[] extensions  ---->要监控的目录下面以什么结尾(说白了就是扩展名)的文件
             *          注意,写文件扩展名的时候不能写"."
             * boolean recursive    ---->是否递归遍历
             */
            Collection<File> files = FileUtils.listFiles(new File("E:/test/storm"),
                    new String[]{"txt", "log", "csv"}, true);
            List<String> lines = null;
            try {
                for (File file : files) {
//                    BufferedReader br = new BufferedReader(new FileReader(file));
//                    String line = null;
//                    while((line = br.readLine()) != null) {
//                        collector.emit(new Values(line));
//                    }
                    lines = FileUtils.readLines(file, "UTF-8");
                    for (String line : lines) {
                        System.out.println("spout读取到的内容:" + line);
                        collector.emit(new Values(line));
                    }
                    //读取完成一个文件之后,将其重命名,避免下次再读
                    FileUtils.moveFile(file, new File(file.getAbsolutePath() + "." + System.currentTimeMillis()));
                }
            }catch (Exception e) {
//                e.printStackTrace();//这里就不用输出异常信息了
            }
        }

        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("line"));
        }
    }

    //读取上述spout发送过来的tuple,对tuple中的数据进行单词拆分,将拆分之后的单词发送给下游bolt
    static class SplitBolt extends BaseRichBolt {
        private Map conf;
        private TopologyContext context;
        private OutputCollector collector;

        public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
            this.conf = conf;
            this.context = context;
            this.collector = collector;
        }

        public void execute(Tuple tuple) {
            String line = tuple.getStringByField("line");
            String[] splits = line.split(" ");
            for (String word : splits) {
                collector.emit(new Values(word, 1));
            }
        }

        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("word", "times"));
        }
    }

    //接收上游bolt发送过来的单词,对单词进行统计
    static class WordCountBolt extends BaseRichBolt {
        private Map conf;
        private TopologyContext context;
        private OutputCollector collector;

        public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
            this.conf = conf;
            this.context = context;
            this.collector = collector;
        }

        int sum = 0;
        public void execute(Tuple tuple) {
            String word = tuple.getStringByField("word");
            int times = tuple.getIntegerByField("times");
            sum += times;

            System.out.println("截止到目前为止出现的单词个数:" + sum);
        }

        public void declareOutputFields(OutputFieldsDeclarer declarer) {

        }
    }

    public static void main(String[] args) {

        TopologyBuilder builder = new TopologyBuilder();
        //设置spout和bolt
        builder.setSpout("wcSpout_id", new WCSpout());
        builder.setBolt("splitBolt_id", new SplitBolt()).shuffleGrouping("wcSpout_id");
        builder.setBolt("wcBolt_id", new WordCountBolt()).shuffleGrouping("splitBolt_id");

        StormTopology stormTopology = builder.createTopology();
        LocalCluster lCluster = new LocalCluster();
        String topologyName = LocalWordCountTopology.class.getSimpleName();
        Config config = new Config();

        lCluster.submitTopology(topologyName, config, stormTopology);
    }
}

0 0
原创粉丝点击