Storm1.1.0<温故而知新-网站用户使用的浏览器统计>

来源:互联网 发布:窗户打开方式 知乎 编辑:程序博客网 时间:2024/05/28 06:07

软件环境:

storm1.1.0

使用一个600多兆的网站日志来模拟网站每天产生的日志信息

120.197.87.216 - - [04/Jan/2012:00:00:02 +0800] "GET /home.php?mod=space&uid=563413&mobile=yes HTTP/1.1" 200 3388 "-" "-"
123.126.50.73 - - [04/Jan/2012:00:00:02 +0800] "GET /thread-679411-1-1.html HTTP/1.1" 200 5251 "-" "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)"
203.208.60.187 - - [04/Jan/2012:00:00:02 +0800] "GET /archiver/tid-3003.html HTTP/1.1" 200 2056 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /ctp080113.php?action=getgold HTTP/1.1" 200 13886 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /ctp080113.php?action=getmedal HTTP/1.1" 200 13882 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
110.6.179.88 - - [04/Jan/2012:00:00:02 +0800] "GET /forum.php?mod=attachment&aid=NTczNzU3fDFjNDdjZTgzfDEzMjI4NzgwMDV8MTMzOTc4MDB8MTEwMTcxMA%3D%3D&mobile=no HTTP/1.1" 200 172 "http://www.itpub.net/forum.php?mod=attachment&aid=NTczNzU3fDFjNDdjZTgzfDEzMjI4NzgwMDV8MTMzOTc4MDB8MTEwMTcxMA%3D%3D&mobile=yes" "Mozilla/5.0 (Linux; U; Android 2.2; zh-cn; ZTE-U V880 Build/FRF91) UC AppleWebKit/530+ (KHTML, like Gecko) Mobile Safari/530"
116.205.130.2 - - [04/Jan/2012:00:00:02 +0800] "GET /popwin_js.php?fid=6 HTTP/1.1" 200 32 "http://www.itpub.net/forum-6-1.html?ts=28" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; QQDownload 702; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; AskTbPTV/5.11.3.15590; .NET4.0E)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /popwin_js.php?fid=133 HTTP/1.1" 200 11 "http://www.itpub.net/thread-1558574-3-9.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /ctp080113.php?tid=1558574 HTTP/1.1" 200 5 "http://www.itpub.net/thread-1558574-3-9.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
110.75.173.35 - - [04/Jan/2012:00:00:02 +0800] "GET /forum.php?goto=lastpost&mod=redirect&tid=1380214 HTTP/1.1" 302 5 "-" "Yahoo! Slurp China"

以上是日志文件的样本,将该文件上传到HDFS上的log目录下

使用开源库 https://github.com/Gpwner/user-agent-utils 解析出浏览器的名称
这里写图片描述

使用的时候,简单地调用一下API就能解析出浏览器的名称了

这里写图片描述

1.思路

首先通过HdfsSpout从hdfs上读取日志文件

 // Build the spout that reads the log files from HDFS; every location comes
 // from the command line.
 HdfsSpout hdfsSpout = new HdfsSpout()
         // Read the files with the reader registered under the name "text".
         .setReaderType("text")
         // Emit tuples using the text reader's default output field names.
         .withOutputFields(TextFileReader.defaultFields)
         // args[0]: HDFS URI.
         .setHdfsUri(args[0])
         // args[1]: directory the spout picks input files up from.
         .setSourceDir(args[1])
         // args[2]: archive directory.
         .setArchiveDir(args[2])
         // args[3]: bad-files directory.
         .setBadFilesDir(args[3]);

然后数据流入BrowserExtractBolt

    /**
     * Parses one input tuple and emits the name of the browser found in it.
     *
     * <p>The text is read from the tuple field named by {@code field}. A tuple
     * whose text is {@code null} is reported on stderr and skipped. Every input
     * tuple is acked, whether or not something was emitted.
     *
     * @param input the tuple carrying the raw text to parse
     */
    public void execute(Tuple input) {
        String log = input.getStringByField(field);
        if (log == null) {
            // Check BEFORE parsing: the original tested for null only after
            // handing the value to the parser, so the guard could never help.
            System.err.println("解析出现异常:" + log);
        } else {
            UserAgent agent = UserAgent.parseUserAgentString(log);
            // Anchor the emitted tuple to its input so a failure downstream
            // is reported back to the spout instead of being lost silently.
            collector.emit(input, new Values(agent.getBrowser().getName()));
        }
        collector.ack(input);
    }

接着与做词频统计类似:

  /**
   * Increments the running total for the browser named in the tuple, prints
   * the new total and emits it as a (browser, count) pair.
   *
   * @param input     tuple carrying the browser name in its "browser" field
   * @param collector collector the updated pair is emitted through
   */
  public void execute(Tuple input, BasicOutputCollector collector) {
      String browser = input.getStringByField("browser");

      // A browser seen for the first time starts at 1.
      Integer previous = counts.get(browser);
      Integer count = (previous == null) ? 1 : previous + 1;
      counts.put(browser, count);

      System.out.println(browser + " :  " + count);
      collector.emit(new Values(browser, count));
  }

不清楚如何将HDFS与Storm整合的读者,可以参考我之前的博客:
http://blog.csdn.net/gpwner/article/details/74157575

整个拓扑的构建:

这里写图片描述

2.实现

使用到的依赖

  <dependencies>
      <!-- Storm runtime. -->
      <dependency>
          <groupId>org.apache.storm</groupId>
          <artifactId>storm-core</artifactId>
          <!-- NOTE(review): the provided scope is left disabled, presumably so that
               storm-core is on the classpath for local runs; confirm before
               deploying to a real cluster. -->
          <!--<scope>provided</scope>-->
          <version>1.1.0</version>
      </dependency>
      <!-- Storm/HDFS integration. -->
      <dependency>
          <groupId>org.apache.storm</groupId>
          <artifactId>storm-hdfs</artifactId>
          <version>1.1.0</version>
      </dependency>
      <!-- User-agent parsing library. -->
      <dependency>
          <groupId>eu.bitwalker</groupId>
          <artifactId>UserAgentUtils</artifactId>
          <version>1.20</version>
      </dependency>
  </dependencies>

BrowserCountTopology

import neu.bolt.BrowserExtractBolt;import neu.bolt.CountBolt;import neu.bolt.ExtractBolt;import org.apache.storm.Config;import org.apache.storm.LocalCluster;import org.apache.storm.generated.AlreadyAliveException;import org.apache.storm.generated.AuthorizationException;import org.apache.storm.generated.InvalidTopologyException;import org.apache.storm.hdfs.spout.HdfsSpout;import org.apache.storm.hdfs.spout.TextFileReader;import org.apache.storm.topology.TopologyBuilder;import org.apache.storm.tuple.Fields;import java.util.HashMap;public class BrowserCountTopology {    public static void main(String[] args) throws InvalidTopologyException, AuthorizationException, AlreadyAliveException, InterruptedException {        System.setProperty("HADOOP_USER_NAME", "root");        if (args.length != 4) {            System.out.println("Usage <HdfsUri SourceDir ArchiveDir BadFilesDir>");            System.exit(1);        }        TopologyBuilder builder = new TopologyBuilder();        HdfsSpout hdfsSpout = new HdfsSpout()                .setReaderType("text")                .withOutputFields(TextFileReader.defaultFields)                .setHdfsUri(args[0])                .setSourceDir(args[1])                .setArchiveDir(args[2])                .setBadFilesDir(args[3]);        HashMap<String, Object> hashMap = new HashMap<>();        hashMap.put(ExtractBolt.FIELD, "line");        builder.setSpout("hdfsSpout", hdfsSpout, 1);        builder.setBolt("browserextractbolt", new BrowserExtractBolt(), 8)                .addConfigurations(hashMap).shuffleGrouping("hdfsSpout");        builder.setBolt("countBolt", new CountBolt(), 1)                .fieldsGrouping("browserextractbolt", new Fields("browser"));        Config conf = new Config();        conf.setDebug(true);        conf.setMaxTaskParallelism(1);        LocalCluster cluster = new LocalCluster();        cluster.submitTopology("CountTopology", conf, builder.createTopology());        Thread.sleep(90000);        cluster.shutdown(); 
   }}

BrowserExtractBolt

import eu.bitwalker.useragentutils.UserAgent;import org.apache.storm.task.OutputCollector;import org.apache.storm.task.TopologyContext;import org.apache.storm.topology.IRichBolt;import org.apache.storm.topology.OutputFieldsDeclarer;import org.apache.storm.tuple.Fields;import org.apache.storm.tuple.Tuple;import org.apache.storm.tuple.Values;import java.util.Map;public class BrowserExtractBolt implements IRichBolt {    public static final String FIELD = "field";    String field;    OutputCollector collector;    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {        this.collector = collector;        this.field = (String) stormConf.get(FIELD);    }    public void execute(Tuple input) {        String log = input.getStringByField(field);        UserAgent agent = UserAgent.parseUserAgentString(log);        if (log != null) {            collector.emit(new Values(agent.getBrowser().getName()));        } else {            System.err.println("解析出现异常:" + log);        }        collector.ack(input);    }    public void cleanup() {    }    public void declareOutputFields(OutputFieldsDeclarer declarer) {        declarer.declare(new Fields("browser"));    }    public Map<String, Object> getComponentConfiguration() {        return null;    }}

CountBolt

/**
 * Bolt that keeps a running total per browser name and, for every incoming
 * tuple, emits the browser together with its updated total.
 */
public class CountBolt extends BaseBasicBolt {
    /** Running totals, keyed by browser name. */
    private Map<String, Integer> counts = new HashMap<>();

    /**
     * Increments the running total for the browser named in the tuple, prints
     * the new total and emits it as a (browser, count) pair.
     *
     * @param input     tuple carrying the browser name in its "browser" field
     * @param collector collector the updated pair is emitted through
     */
    public void execute(Tuple input, BasicOutputCollector collector) {
        String browser = input.getStringByField("browser");

        // A browser seen for the first time starts at 1.
        Integer previous = counts.get(browser);
        Integer count = (previous == null) ? 1 : previous + 1;
        counts.put(browser, count);

        System.out.println(browser + " :  " + count);
        collector.emit(new Values(browser, count));
    }

    /** Declares the two output fields, "browser" and "count". */
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("browser", "count"));
    }
}

参数配置:
这里写图片描述
运行结果:
这里写图片描述

阅读全文
0 0
原创粉丝点击