# spooldir source header interceptor
#
# Background (from the blog post below): the spooldir source can write the
# file name into the event header under a configurable key. A custom
# interceptor can then read that header value, split it into parts, and put
# each part back into the header for routing. Stock Flume ships no
# interceptor that extracts values from headers, hence the custom class
# configured below.
#
# http://blog.csdn.net/xiao_jun_0820/article/details/38333171

# Example command lines for starting this agent:
# bin/flume-ng agent -n agent1 -c conf -f conf/files2hdfs.conf
# bin/flume-ng agent -n $agent_name -c conf -f conf/flume-conf.properties.template
# bin/flume-ng agent --conf conf --conf-file conf/example.conf --name a1 -Dflume.root.logger=INFO,console

# Name the components on this agent (agent1):
# one spooling-directory source fanned out to two memory channels,
# each drained by its own HDFS sink.
agent1.sources = files_source
agent1.sinks = hdfs_sink1  hdfs_sink2
agent1.channels = memory_channel1  memory_channel2

# Describe/configure the source (files_source).
# Spooling-directory source: files dropped into spoolDir are turned into
# events, and the file path is written into each event header under the
# key "fileName" (fileHeader / fileHeaderKey below).
agent1.sources.files_source.type = spooldir
agent1.sources.files_source.spoolDir = /tmp/flumetest/
agent1.sources.files_source.deletePolicy = never
agent1.sources.files_source.fileHeader = true
agent1.sources.files_source.fileHeaderKey = fileName
# agent1.sources.files_source.basenameHeader = true
# agent1.sources.files_source.basenameHeaderKey = wlzfilehead
# agent1.sources.files_source.fileSuffix = .COMPLETED
# agent1.sources.files_source.ignorePattern = ^$
# (^string$ matches a line that is exactly "string"; a lone $ matches an
#  empty line; a lone ^ matches any line.)
agent1.sources.files_source.consumeOrder = oldest
# Deliver each whole file as a single blob event instead of line-by-line.
agent1.sources.files_source.deserializer = org.apache.flume.sink.solr.morphline.BlobDeserializer$Builder
# Fix: Flume property names are case-sensitive — the original "batchsize"
# was silently ignored and the default applied; the correct key is "batchSize".
agent1.sources.files_source.batchSize = 2

# NOTE(review): stock Flume interceptors only inspect the event body; to
# route by file name we use a custom interceptor (from the blog linked in
# the header) that applies the regex to a header value instead.
# set interceptors
  agent1.sources.files_source.interceptors = i1
  agent1.sources.files_source.interceptors.i1.type = com.wlzflumeheaderintercepters.RegexExtractorExtInterceptor$Builder
# agent1.sources.files_source.interceptors.i1.regex = ^(\\d)
# ((pattern) matches pattern and captures the match.)
# NOTE(review): after properties-file backslash unescaping the value below
# presumably becomes the regex (\d\d\d\d\d\d) — one capturing group of six
# digits. Verify against the interceptor's logging on startup.
  agent1.sources.files_source.interceptors.i1.regex = \(\\d\\d\\d\\d\\d\\d\)
# extractorHeader=true tells the interceptor to match against the header
# named by extractorHeaderKey ("fileName") rather than the event body.
  agent1.sources.files_source.interceptors.i1.extractorHeader = true
  agent1.sources.files_source.interceptors.i1.extractorHeaderKey = fileName
# The captured group is stored back into the header under the name "code".
  agent1.sources.files_source.interceptors.i1.serializers = t1
  agent1.sources.files_source.interceptors.i1.serializers.t1.name = code


# Mapping for multiplexing selector: route each event to a channel based
# on the "code" header that interceptor i1 extracts from the file name.
 agent1.sources.files_source.selector.type = multiplexing
 agent1.sources.files_source.selector.header = code
 agent1.sources.files_source.selector.mapping.020304 = memory_channel1
 agent1.sources.files_source.selector.mapping.020306 = memory_channel1
 agent1.sources.files_source.selector.mapping.020307 = memory_channel2
 agent1.sources.files_source.selector.mapping.020305 = memory_channel2
# Events whose "code" matches none of the mappings fall through here.
 agent1.sources.files_source.selector.default = memory_channel2
# agent1.sources.files_source.selector.mapping.0 = memory_channel1
# agent1.sources.files_source.selector.mapping.1 = memory_channel1
# agent1.sources.files_source.selector.mapping.2 = memory_channel2
# agent1.sources.files_source.selector.mapping.3 = memory_channel2
# agent1.sources.files_source.selector.default = memory_channel2
# ....


# The source must be bound to every channel the selector can route to.
 agent1.sources.files_source.channels = memory_channel1  memory_channel2



# Describe the sink (hdfs_sink1): writes events routed to memory_channel1
# into HDFS under .../flume1, one file per event.
agent1.sinks.hdfs_sink1.type = hdfs
agent1.sinks.hdfs_sink1.hdfs.path = hdfs://localhost:9000/myflumetestcluster/flume1
# Fix: HDFS sink options live under the "hdfs." prefix — the bare
# "fileType" key was ignored and the sink fell back to its default.
agent1.sinks.hdfs_sink1.hdfs.fileType = SequenceFile
# agent1.sinks.hdfs_sink1.hdfs.fileType = DataStream
# agent1.sinks.hdfs_sink1.hdfs.writeFormat = Text
# (DataStream = plain uncompressed files; writeFormat=Text applies to it.)
# Commit every event to HDFS individually.
agent1.sinks.hdfs_sink1.hdfs.batchSize = 1
agent1.sinks.hdfs_sink1.hdfs.rollInterval = 0
# Fix: property names are case-sensitive — "rollcount"/"rollsize" were
# silently ignored; the correct keys are rollCount/rollSize.
agent1.sinks.hdfs_sink1.hdfs.rollCount = 1
agent1.sinks.hdfs_sink1.hdfs.rollSize = 0
# syslog-agent.sinks.HDFS-LAB.hdfs.filePrefix = %{wlzfilehead}.%{host}.%Y-%m-%d
# Name output files after the "fileName" event header set by the source.
 agent1.sinks.hdfs_sink1.hdfs.filePrefix = %{fileName}
# agent1.sinks.hdfs_sink1.hdfs.fileSuffix = .log
# agent1.sinks.hdfs_sink1.hdfs.useLocalTimeStamp = true
# Fix: typo "idleTimeour" -> "idleTimeout".
agent1.sinks.hdfs_sink1.hdfs.idleTimeout = 60000

agent1.sinks.hdfs_sink1.channel = memory_channel1

# Describe the second sink (hdfs_sink2): same configuration as hdfs_sink1
# but writing to .../flume2 and draining memory_channel2.
 agent1.sinks.hdfs_sink2.type = hdfs
 agent1.sinks.hdfs_sink2.hdfs.path = hdfs://localhost:9000/myflumetestcluster/flume2
# Fix: HDFS sink options need the "hdfs." prefix, and the duplicate
# "fileType = DataStream" line carried a trailing "#..." note that a
# properties file would have swallowed into the value (properties files
# have no inline comments). Keep SequenceFile, matching hdfs_sink1, and
# leave the DataStream variant commented out.
 agent1.sinks.hdfs_sink2.hdfs.fileType = SequenceFile
# agent1.sinks.hdfs_sink2.hdfs.fileType = DataStream
# agent1.sinks.hdfs_sink2.hdfs.writeFormat = Text
# (DataStream = plain uncompressed files; writeFormat=Text applies to it.)
# Commit every event to HDFS individually.
 agent1.sinks.hdfs_sink2.hdfs.batchSize = 1
 agent1.sinks.hdfs_sink2.hdfs.rollInterval = 0
# Fix: "rollcount"/"rollsize" -> rollCount/rollSize (case-sensitive keys).
 agent1.sinks.hdfs_sink2.hdfs.rollCount = 1
 agent1.sinks.hdfs_sink2.hdfs.rollSize = 0
 agent1.sinks.hdfs_sink2.hdfs.filePrefix = %{fileName}
# Fix: typo "idleTimeour" -> "idleTimeout".
 agent1.sinks.hdfs_sink2.hdfs.idleTimeout = 60000
 agent1.sinks.hdfs_sink2.channel = memory_channel2


# Use a channel which buffers events in memory.
# capacity / transactionCapacity are left at their defaults (the tuned
# values below are kept commented out for reference).
agent1.channels.memory_channel1.type = memory
# agent1.channels.memory_channel1.capacity = 1000
# agent1.channels.memory_channel1.transactionCapacity = 100

 agent1.channels.memory_channel2.type = memory
# agent1.channels.memory_channel2.capacity = 1000
# agent1.channels.memory_channel2.transactionCapacity = 100

# (stray "0 0" scrape artifact commented out — it is not valid Flume configuration)