Storm1.1.0<温故而知新--hdfs和storm的集成>
来源:互联网 发布:python 运行环境 编辑:程序博客网 时间:2024/05/19 19:42
1.环境
apache-storm-1.1.0, Hadoop 2.8.0
使用到的依赖:
<dependencies>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-core</artifactId>
        <version>1.1.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-hdfs</artifactId>
        <version>1.1.0</version>
    </dependency>
</dependencies>
需求:读取hdfs上的mapreduce日志文件,统计其中的INFO、WARN、DEBUG、ERROR日志级别的条数
思路很简单,Spout端读入文件,然后Bolt端做Wordcount(先用正则表达式匹配日志级别)
2.实现
LogLevelCountTopology
用到的参数:
import neu.bolt.CountBolt;import neu.bolt.ExtractBolt;import org.apache.storm.Config;import org.apache.storm.LocalCluster;import org.apache.storm.generated.AlreadyAliveException;import org.apache.storm.generated.AuthorizationException;import org.apache.storm.generated.InvalidTopologyException;import org.apache.storm.hdfs.spout.HdfsSpout;import org.apache.storm.hdfs.spout.TextFileReader;import org.apache.storm.topology.TopologyBuilder;import org.apache.storm.tuple.Fields;import java.util.HashMap;public class LogLevelCountTopology { public static void main(String[] args) throws InvalidTopologyException, AuthorizationException, AlreadyAliveException, InterruptedException { System.setProperty("HADOOP_USER_NAME", "root"); if (args.length != 4) { System.out.println("Usage <HdfsUri SourceDir ArchiveDir BadFilesDir>"); System.exit(1); } TopologyBuilder builder = new TopologyBuilder(); HdfsSpout hdfsSpout = new HdfsSpout() .setReaderType("text") .withOutputFields(TextFileReader.defaultFields) .setHdfsUri(args[0]) .setSourceDir(args[1]) .setArchiveDir(args[2]) .setBadFilesDir(args[3]); HashMap<String, Object> hashMap = new HashMap<>(); //正则表达式 hashMap.put(ExtractBolt.REGEX, ".{23}(INFO|DEBUG|WARN|ERROR)"); hashMap.put(ExtractBolt.FIELD, "line"); builder.setSpout("hdfsSpout", hdfsSpout, 1); builder.setBolt("extractbolt", new ExtractBolt(), 1) .addConfigurations(hashMap).shuffleGrouping("hdfsSpout"); builder.setBolt("countBolt", new CountBolt(), 1) .fieldsGrouping("extractbolt", new Fields("level")); Config conf = new Config(); conf.setDebug(true); conf.setMaxTaskParallelism(1); LocalCluster cluster = new LocalCluster(); cluster.submitTopology("hdfsLogLevelCountTopology", conf, builder.createTopology()); Thread.sleep(90000); cluster.shutdown(); }}
ExtractBolt
import org.apache.storm.task.OutputCollector;import org.apache.storm.task.TopologyContext;import org.apache.storm.topology.IRichBolt;import org.apache.storm.topology.OutputFieldsDeclarer;import org.apache.storm.tuple.Fields;import org.apache.storm.tuple.Tuple;import org.apache.storm.tuple.Values;import java.util.Map;import java.util.regex.Matcher;import java.util.regex.Pattern;public class ExtractBolt implements IRichBolt { public static final String REGEX = "regex"; public static final String FIELD = "field"; String field; Pattern regex; OutputCollector collector; public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { String regexString = (String) stormConf.get(REGEX); this.collector = collector; this.field = (String) stormConf.get(FIELD); this.regex = Pattern.compile(regexString); } public void execute(Tuple input) { String log = input.getStringByField(field); if (log != null) { Matcher matcher = regex.matcher(log); if (matcher.find()) { String level = matcher.group(1); collector.emit(new Values(level)); } else { System.err.println("不包含INFO|DEBUG|ERROR|WARN 日志:" + log); } } collector.ack(input); } public void cleanup() { } public void declareOutputFields(OutputFieldsDeclarer declarer) { declarer.declare(new Fields("level")); } public Map<String, Object> getComponentConfiguration() { return null; }}
CountBolt
import org.apache.storm.topology.BasicOutputCollector;import org.apache.storm.topology.OutputFieldsDeclarer;import org.apache.storm.topology.base.BaseBasicBolt;import org.apache.storm.tuple.Fields;import org.apache.storm.tuple.Tuple;import org.apache.storm.tuple.Values;import java.util.HashMap;import java.util.Map;public class CountBolt extends BaseBasicBolt { private Map<String, Integer> counts = new HashMap<>(); public void execute(Tuple input, BasicOutputCollector collector) { String level = input.getStringByField("level"); Integer count = counts.get(level); if (count == null) count = 0; count++; counts.put(level, count); System.out.println(level + " : " + count); collector.emit(new Values(level, count)); } public void declareOutputFields(OutputFieldsDeclarer declarer) { declarer.declare(new Fields("level", "count")); }}
主函数传参示例hdfs://172.17.11.85:9000 /log /ArchiveDir /BadFilesDir
给一张IDE控制台的输出日志:
阅读全文
0 0
- Storm1.1.0<温故而知新--hdfs和storm的集成>
- Storm1.1.0<温故而知新-网站用户使用的浏览器统计>
- Storm1.1.0<Error preparing HdfsBolt: No FileSystem for scheme: hdfs>
- storm教程(六):Storm 和kafka的集成
- Storm1.1.0<trident+window+Hbase集成实现词频统计TopN>
- STORM入门之(集成HDFS)
- Storm和Redis native的集成
- Storm1.1.0<消息的可靠性机制>
- Storm1.1.0<组件的并行度和组件的实例个数的关系>
- storm-kafka 的集成
- Storm和Kafka的集成安装和测试
- Storm和JDBC native集成
- 大数据 (五)Hadoop-HDFS zookeeper和Hadoop-HDFS的集成
- Storm-HBase集成--配置和开发
- Storm-HBase集成--配置和开发
- Storm-HBase集成--配置和开发
- 堆和栈的区别 (温故而知新)
- storm笔记 -- 与kafka的集成
- display:inline-block/text-align:justify下列表的两端对齐布局
- mysql体系结构分析
- android中自定义画布Canvas的实现
- 巨杉数据库SequoiaDB分区介绍
- CSS朝花夕拾之块级格式上下文BFC
- Storm1.1.0<温故而知新--hdfs和storm的集成>
- Git学习笔记(一)
- ajax技术的运用
- Mybatis学习笔记之一:Mybatis介绍
- JAVA设计模式之工厂模式(简单工厂模式+工厂方法模式)
- Leetcode628. Maximum Product of Three Numbers不要太简单
- SpringMVC知识点汇总
- git基本操作
- String的compareTo()方法返回值