pig 自定义加载函数加载apache 的access.log中的数据

来源:互联网 发布:php怎么写后台 编辑:程序博客网 时间:2024/04/30 08:24

access.log数据:

127.0.0.1 - - [08/Jan/2012:21:46:31 +0800] "GET / HTTP/1.1" 200 44
127.0.0.1 - - [08/Jan/2012:21:46:31 +0800] "GET /favicon.ico HTTP/1.1" 404 209
127.0.0.1 - - [08/Jan/2012:22:47:15 +0800] "GET /aa.php HTTP/1.1" 200 61261
127.0.0.1 - - [08/Jan/2012:22:47:15 +0800] "GET /aa.php?=PHPE9568F34-D428-11d2-A769-00AA001ACF42 HTTP/1.1" 200 2524
127.0.0.1 - - [08/Jan/2012:22:47:15 +0800] "GET /aa.php?=PHPE9568F35-D428-11d2-A769-00AA001ACF42 HTTP/1.1" 200 2146
127.0.0.1 - - [08/Jan/2012:22:47:15 +0800] "GET /favicon.ico HTTP/1.1" 404 209
127.0.0.1 - - [08/Jan/2012:22:49:39 +0800] "GET /aa.php HTTP/1.1" 200 61496
127.0.0.1 - - [08/Jan/2012:22:49:39 +0800] "GET /aa.php?=PHPE9568F34-D428-11d2-A769-00AA001ACF42 HTTP/1.1" 200 2524
127.0.0.1 - - [08/Jan/2012:22:49:39 +0800] "GET /aa.php?=PHPE9568F35-D428-11d2-A769-00AA001ACF42 HTTP/1.1" 200 2146
127.0.0.1 - - [08/Jan/2012:22:49:39 +0800] "GET /favicon.ico HTTP/1.1" 404 209
127.0.0.1 - - [08/Jan/2012:23:05:28 +0800] "GET /tiki HTTP/1.1" 301 230
127.0.0.1 - - [08/Jan/2012:23:05:28 +0800] "GET /tiki/ HTTP/1.1" 200 30566
127.0.0.1 - - [08/Jan/2012:23:05:28 +0800] "GET /favicon.ico HTTP/1.1" 404 209
127.0.0.1 - - [08/Jan/2012:23:06:23 +0800] "GET /tiki/index.php HTTP/1.1" 302 -
127.0.0.1 - - [08/Jan/2012:23:06:24 +0800] "GET /tiki/tiki-install.php HTTP/1.1" 200 10974
127.0.0.1 - - [08/Jan/2012:23:06:25 +0800] "GET /tiki/lib/tiki-js.js HTTP/1.1" 200 54004
127.0.0.1 - - [08/Jan/2012:23:06:25 +0800] "GET /tiki/styles/fivealive.css HTTP/1.1" 200 21404
127.0.0.1 - - [08/Jan/2012:23:06:26 +0800] "GET /tiki/lib/jquery_tiki/tiki-jquery.js HTTP/1.1" 200 94701
127.0.0.1 - - [08/Jan/2012:23:06:26 +0800] "GET /tiki/img/tiki/Tiki_WCG.png HTTP/1.1" 200 9362
127.0.0.1 - - [08/Jan/2012:23:06:26 +0800] "GET /tiki/pics/icons/help.png HTTP/1.1" 200 740

pig ApacheLoader:

package pig;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

/**
 * Pig load function that parses 7-column Apache {@code access.log} lines.
 *
 * <p>Each matching line becomes a tuple of five strings, in this order:
 * client address, timestamp (including the zone offset), request line,
 * HTTP status code and response size. The two identity columns of the log
 * line (regex groups 2 and 3) are matched but not emitted.
 */
public class ApacheLoader extends LoadFunc {

    /**
     * Regex for one log line, compiled once instead of once per record.
     * The last group also accepts "-", which Apache writes when no response
     * body was sent (for example on a 302 redirect).
     */
    private static final Pattern LOG_ENTRY = Pattern.compile(
            "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+|-)");

    /** Regex groups copied into the output tuple, in output order. */
    private static final int[] EMITTED_GROUPS = {1, 4, 5, 6, 7};

    /** Reader handed over by Pig in {@link #prepareToRead}; null until then. */
    @SuppressWarnings("rawtypes")
    protected RecordReader recordReader = null;

    /**
     * Reads the input as plain text, one record per line.
     *
     * @return a new {@link TextInputFormat}
     */
    @SuppressWarnings("rawtypes")
    @Override
    public InputFormat getInputFormat() throws IOException {
        return new TextInputFormat();
    }

    /**
     * Returns the next parsed log line as a tuple.
     *
     * <p>Lines that do not match the expected format are skipped. Returning
     * {@code null} for them would tell Pig that the input is exhausted and
     * silently drop every remaining line of the split.
     *
     * @return a 5-field tuple of strings, or {@code null} once the reader
     *         has no more records
     * @throws IOException (as {@link ExecException}) if reading or parsing fails
     */
    @Override
    public Tuple getNext() throws IOException {
        try {
            while (recordReader.nextKeyValue()) {
                Text value = (Text) recordReader.getCurrentValue();
                Matcher matcher = LOG_ENTRY.matcher(value.toString());
                if (!matcher.matches()) {
                    // Malformed line: skip it and keep reading.
                    continue;
                }
                List<String> fields = new ArrayList<String>(EMITTED_GROUPS.length);
                for (int group : EMITTED_GROUPS) {
                    fields.add(matcher.group(group));
                }
                return TupleFactory.getInstance().newTuple(fields);
            }
            // Reader exhausted: null is the end-of-input signal for Pig.
            return null;
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the task can still observe it.
            Thread.currentThread().interrupt();
            throw new ExecException(e);
        } catch (Exception e) {
            throw new ExecException(e);
        }
    }

    /**
     * Stores the record reader that {@link #getNext()} pulls lines from.
     *
     * @param reader reader supplied by Pig for the current split
     * @param split  the split being read (unused)
     */
    @SuppressWarnings("rawtypes")
    @Override
    public void prepareToRead(RecordReader reader, PigSplit split)
            throws IOException {
        this.recordReader = reader;
    }

    /**
     * Registers the load location as the job's input path(s).
     *
     * @param location input location string passed through from the Pig script
     * @param job      job whose input paths are set
     */
    @Override
    public void setLocation(String location, Job job) throws IOException {
        FileInputFormat.setInputPaths(job, location);
    }
}

到pig grunt运行:

 register apacheLoader.jar 

A =load 'access.log' using pig.ApacheLoader();

dump A;


说明:如果access.log有9个列则正则表达式为

 // 9-column variant: same as the 7-column pattern plus two trailing double-quoted fields
 // (presumably referer and user agent of the "combined" log format; verify against your LogFormat).
 String logEntryPattern = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+) \"([^\"]+)\" \"([^\"]+)\"";

上面我的程序是7个列的。

正则参考:

http://nc100.blog.sohu.com/148887042.html

http://www.cnblogs.com/csurn/archive/2010/06/22/1762791.html


改进:

如果最后一列(即响应大小)为“-”,则正则表达式改为:

// 7-column pattern whose last group accepts either digits or a literal "-" as the size field.
String pattern = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+|-)";

0 0
原创粉丝点击