flume文件名interceptor

来源:互联网 发布:水泥胶砂强度试验数据 编辑:程序博客网 时间:2024/06/05 04:39

从文件名提取日期、小时信息,决定数据发送到hdfs哪天哪小时的分区目录。

需要自定义一个拦截器

package interceptor;import java.util.List;  import java.util.Map;  import java.util.regex.Matcher;  import java.util.regex.Pattern;    import org.apache.commons.lang.StringUtils;  import org.apache.flume.Context;  import org.apache.flume.Event;import org.apache.flume.interceptor.Interceptor;import org.apache.flume.interceptor.RegexExtractorInterceptorPassThroughSerializer;  import org.apache.flume.interceptor.RegexExtractorInterceptorSerializer;  import org.slf4j.Logger;  import org.slf4j.LoggerFactory;    import com.google.common.base.Charsets;  import com.google.common.base.Preconditions;  import com.google.common.base.Throwables;  import com.google.common.collect.Lists; /** * Interceptor that extracts matches using a specified regular expression and * appends the matches to the event headers using the specified serializers</p> * Note that all regular expression matching occurs through Java's built in * java.util.regex package</p>. Properties: * <p> * regex: The regex to use * <p> * serializers: Specifies the group the serializer will be applied to, and the * name of the header that will be added. 
If no serializer is specified for a * group the default {@link RegexExtractorInterceptorPassThroughSerializer} will * be used * <p> * Sample config: * <p> * agent.sources.r1.channels = c1 * <p> * agent.sources.r1.type = SEQ * <p> * agent.sources.r1.interceptors = i1 * <p> * agent.sources.r1.interceptors.i1.type = REGEX_EXTRACTOR * <p> * agent.sources.r1.interceptors.i1.regex = (WARNING)|(ERROR)|(FATAL) * <p> * agent.sources.r1.interceptors.i1.serializers = s1 s2 * agent.sources.r1.interceptors.i1.serializers.s1.type = com.blah.SomeSerializer * agent.sources.r1.interceptors.i1.serializers.s1.name = warning * agent.sources.r1.interceptors.i1.serializers.s2.type = org.apache.flume.interceptor.RegexExtractorInterceptorTimestampSerializer * agent.sources.r1.interceptors.i1.serializers.s2.name = error * agent.sources.r1.interceptors.i1.serializers.s2.dateFormat = yyyy-MM-dd * </code> * </p> * <pre> * Example 1: * </p> * EventBody: 1:2:3.4foobar5</p> Configuration: * agent.sources.r1.interceptors.i1.regex = (\\d):(\\d):(\\d) * </p> * agent.sources.r1.interceptors.i1.serializers = s1 s2 s3 * agent.sources.r1.interceptors.i1.serializers.s1.name = one * agent.sources.r1.interceptors.i1.serializers.s2.name = two * agent.sources.r1.interceptors.i1.serializers.s3.name = three * </p> * results in an event with the the following * * body: 1:2:3.4foobar5 headers: one=>1, two=>2, three=3 * * Example 2: * * EventBody: 1:2:3.4foobar5 * * Configuration: agent.sources.r1.interceptors.i1.regex = (\\d):(\\d):(\\d) * <p> * agent.sources.r1.interceptors.i1.serializers = s1 s2 * agent.sources.r1.interceptors.i1.serializers.s1.name = one * agent.sources.r1.interceptors.i1.serializers.s2.name = two * <p> * * results in an event with the the following * * body: 1:2:3.4foobar5 headers: one=>1, two=>2 * </pre> */public class RegexExtractorExtInterceptor implements Interceptor {        static final String REGEX = "regex";      static final String SERIALIZERS = "serializers";        // 增加代码开始      
  static final String EXTRACTOR_HEADER = "extractorHeader";      static final boolean DEFAULT_EXTRACTOR_HEADER = false;      static final String EXTRACTOR_HEADER_KEY = "extractorHeaderKey";        // 增加代码结束        private static final Logger logger = LoggerFactory              .getLogger(RegexExtractorExtInterceptor.class);        private final Pattern regex;      private final List<NameAndSerializer> serializers;        // 增加代码开始        private final boolean extractorHeader;      private final String extractorHeaderKey;        // 增加代码结束        private RegexExtractorExtInterceptor(Pattern regex,              List<NameAndSerializer> serializers, boolean extractorHeader,              String extractorHeaderKey) {          this.regex = regex;          this.serializers = serializers;          this.extractorHeader = extractorHeader;          this.extractorHeaderKey = extractorHeaderKey;      }        @Override      public void initialize() {          // NO-OP...      }        @Override      public void close() {          // NO-OP...      
}        @Override      public Event intercept(Event event) {          String tmpStr;          if(extractorHeader)          {              tmpStr = event.getHeaders().get(extractorHeaderKey);          }          else          {              tmpStr=new String(event.getBody(),                      Charsets.UTF_8);          }                    Matcher matcher = regex.matcher(tmpStr);          Map<String, String> headers = event.getHeaders();          if (matcher.find()) {              for (int group = 0, count = matcher.groupCount(); group < count; group++) {                  int groupIndex = group + 1;                  if (groupIndex > serializers.size()) {                      if (logger.isDebugEnabled()) {                          logger.debug(                                  "Skipping group {} to {} due to missing serializer",                                  group, count);                      }                      break;                  }                  NameAndSerializer serializer = serializers.get(group);                  if (logger.isDebugEnabled()) {                      logger.debug("Serializing {} using {}",                              serializer.headerName, serializer.serializer);                  }                  headers.put(serializer.headerName, serializer.serializer                          .serialize(matcher.group(groupIndex)));              }          }          return event;      }        @Override      public List<Event> intercept(List<Event> events) {          List<Event> intercepted = Lists.newArrayListWithCapacity(events.size());          for (Event event : events) {              Event interceptedEvent = intercept(event);              if (interceptedEvent != null) {                  intercepted.add(interceptedEvent);              }          }          return intercepted;      }        public static class Builder implements Interceptor.Builder {            private Pattern regex;          private List<NameAndSerializer> serializerList;   
         // 增加代码开始            private boolean extractorHeader;          private String extractorHeaderKey;            // 增加代码结束            private final RegexExtractorInterceptorSerializer defaultSerializer = new RegexExtractorInterceptorPassThroughSerializer();            @Override          public void configure(Context context) {              String regexString = context.getString(REGEX);              Preconditions.checkArgument(!StringUtils.isEmpty(regexString),                      "Must supply a valid regex string");                regex = Pattern.compile(regexString);              regex.pattern();              regex.matcher("").groupCount();              configureSerializers(context);                // 增加代码开始              extractorHeader = context.getBoolean(EXTRACTOR_HEADER,                      DEFAULT_EXTRACTOR_HEADER);                if (extractorHeader) {                  extractorHeaderKey = context.getString(EXTRACTOR_HEADER_KEY);                  Preconditions.checkArgument(                          !StringUtils.isEmpty(extractorHeaderKey),                          "必须指定要抽取内容的header key");              }              // 增加代码结束          }            private void configureSerializers(Context context) {              String serializerListStr = context.getString(SERIALIZERS);              Preconditions.checkArgument(                      !StringUtils.isEmpty(serializerListStr),                      "Must supply at least one name and serializer");                String[] serializerNames = serializerListStr.split("\\s+");                Context serializerContexts = new Context(                      context.getSubProperties(SERIALIZERS + "."));                serializerList = Lists                      .newArrayListWithCapacity(serializerNames.length);              for (String serializerName : serializerNames) {                  Context serializerContext = new Context(                          serializerContexts.getSubProperties(serializerName                 
                 + "."));                  String type = serializerContext.getString("type", "DEFAULT");                  String name = serializerContext.getString("name");                  Preconditions.checkArgument(!StringUtils.isEmpty(name),                          "Supplied name cannot be empty.");                    if ("DEFAULT".equals(type)) {                      serializerList.add(new NameAndSerializer(name,                              defaultSerializer));                  } else {                      serializerList.add(new NameAndSerializer(name,                              getCustomSerializer(type, serializerContext)));                  }              }          }            private RegexExtractorInterceptorSerializer getCustomSerializer(                  String clazzName, Context context) {              try {                  RegexExtractorInterceptorSerializer serializer = (RegexExtractorInterceptorSerializer) Class                          .forName(clazzName).newInstance();                  serializer.configure(context);                  return serializer;              } catch (Exception e) {                  logger.error("Could not instantiate event serializer.", e);                  Throwables.propagate(e);              }              return defaultSerializer;          }            @Override          public Interceptor build() {              Preconditions.checkArgument(regex != null,                      "Regex pattern was misconfigured");              Preconditions.checkArgument(serializerList.size() > 0,                      "Must supply a valid group match id list");              return new RegexExtractorExtInterceptor(regex, serializerList,                      extractorHeader, extractorHeaderKey);          }      }        static class NameAndSerializer {          private final String headerName;          private final RegexExtractorInterceptorSerializer serializer;            public NameAndSerializer(String headerName,                  
RegexExtractorInterceptorSerializer serializer) {              this.headerName = headerName;              this.serializer = serializer;          }      }  }  
项目的pom.xml文件

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.flume-dev</groupId>
  <artifactId>com.flume-dev</artifactId>
  <name>com.flume-dev</name>
  <version>1.0.0</version>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <dependency>
      <groupId>org.apache.flume</groupId>
      <artifactId>flume-ng-sdk</artifactId>
      <version>1.5.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flume</groupId>
      <artifactId>flume-ng-core</artifactId>
      <version>1.5.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flume</groupId>
      <artifactId>flume-ng-configuration</artifactId>
      <version>1.5.0</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.6.1</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.10</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>

创建flume的插件存放目录

cd $FLUME_HOME
mkdir plugins.d
mkdir plugins.d/RegexExtractorExtInterceptor
cd plugins.d/RegexExtractorExtInterceptor
mkdir lib libext native

然后将自己的jar包扔进lib目录,写flume数据流配置文件

fileheader.properties

agent-1.channels.ch-1.type = file
agent-1.channels.ch-1.checkpointDir= /root/temp/fileheader/checkpoint
agent-1.channels.ch-1.dataDirs= /root/temp/fileheader/data
agent-1.sources.src-1.type = spooldir
agent-1.sources.src-1.channels = ch-1
agent-1.sources.src-1.spoolDir = /root/test
agent-1.sources.src-1.deletePolicy= never
agent-1.sources.src-1.fileHeader = true
agent-1.sources.src-1.basenameHeader = true
agent-1.sources.src-1.interceptors =i1 hosti
agent-1.sources.src-1.interceptors.i1.type = interceptor.RegexExtractorExtInterceptor$Builder
agent-1.sources.src-1.interceptors.i1.regex=(.*)\\.(.*)\\.(.*)
agent-1.sources.src-1.interceptors.i1.extractorHeader=true
agent-1.sources.src-1.interceptors.i1.extractorHeaderKey=basename
agent-1.sources.src-1.interceptors.i1.serializers=s1 s2 s3
agent-1.sources.src-1.interceptors.i1.serializers.s1.name=one
agent-1.sources.src-1.interceptors.i1.serializers.s2.name=two
agent-1.sources.src-1.interceptors.i1.serializers.s3.name=three
agent-1.sources.src-1.interceptors.hosti.type = host
agent-1.sources.src-1.interceptors.hosti.useIP=false
agent-1.sinks.sink_hdfs.channel = ch-1
agent-1.sinks.sink_hdfs.type = hdfs
agent-1.sinks.sink_hdfs.hdfs.path = hdfs://xxx:port/tmp/events110/fileheader/%{three}
agent-1.sinks.sink_hdfs.hdfs.filePrefix = logs.%{host}
agent-1.sinks.sink_hdfs.hdfs.inUsePrefix = .
agent-1.sinks.sink_hdfs.hdfs.rollInterval = 30
agent-1.sinks.sink_hdfs.hdfs.rollSize = 0
agent-1.sinks.sink_hdfs.hdfs.rollCount = 0
agent-1.sinks.sink_hdfs.hdfs.batchSize = 100
agent-1.sinks.sink_hdfs.hdfs.writeFormat = text
agent-1.sinks.sink_hdfs.hdfs.fileType = DataStream
#agent-1.sinks.sink_hdfs.hdfs.fileType = CompressedStream
#agent-1.sinks.sink_hdfs.hdfs.codeC = lzop
agent-1.channels = ch-1
agent-1.sources = src-1
agent-1.sinks = sink_hdfs
 如文件名为data.log.20151111 ,则写入分区20151111

最后执行bin/flume-ng agent -c /usr/local/flume/conf -f /usr/local/flume/conf/fileheader.properties -n agent-1 -Dflume.root.logger=INFO,console


参考 http://blog.csdn.net/xiao_jun_0820/article/details/38333171

 

0 0