flume高并发优化——(10)消灭elasticsearch sink多次插入
来源:互联网 发布:xp网络共享设置 编辑:程序博客网 时间:2024/05/16 17:46
在flume作为通道接收json数据时,最近遇到一个问题,当flume-es-sink遭遇一个错误的时候,会不断尝试插入数据,而以前的数据又没有进行回滚,导致数据重复插入,脏数据累积,为了解决这个问题,现解决如下:
原因如下:
1,事务控制在channel端
2,事务回滚,未处理已插入es中数据
解决方案:
1,es批量操作不做回滚
2,es插入出错,只做报警(日志业务,不要求强事务)
总结:
在解决问题的过程中,关键在于两点:
1,分析错误数据
2,关键点日志分析
找到这两个点,问题基本就可以定位。在以后的工作生涯中,我们要沉住气,不要怕;为了以后不犯同样的错,要保留犯错现场。
修改flume源码如下
关键代码:
} catch (Throwable ex) { logger.error("=ElasticSearchSink=>has error",ex); try { txn.commit(); sinkCounter.addToEventDrainSuccessCount(0); counterGroup.incrementAndGet("transaction.success"); } catch (Exception ex2) { logger.error( "=ElasticSearchSink.counterGroup.incrementAndGet=> has exception.", ex2); } }
elasticsearch-sink:
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.flume.sink.elasticsearch;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import org.apache.commons.lang.StringUtils;
import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.formatter.output.BucketPath;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.sink.AbstractSink;
import org.apache.flume.sink.elasticsearch.client.ElasticSearchClient;
import org.apache.flume.sink.elasticsearch.client.ElasticSearchClientFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.apache.flume.sink.elasticsearch.ElasticSearchSinkConstants.*;

/**
 * A sink which reads events from a channel and writes them to ElasticSearch,
 * based on the work done by https://github.com/Aconex/elasticflume.git.
 *
 * <p>This sink supports batch reading of events from the channel and writing
 * them to ElasticSearch.
 *
 * <p>Indexes will be rolled daily using the format 'indexname-YYYY-MM-dd' to
 * allow easier management of the index.
 *
 * <p>This sink must be configured with the mandatory parameters detailed in
 * {@link ElasticSearchSinkConstants}. It is recommended as a secondary step
 * that the ElasticSearch indexes are optimized for the specified serializer.
 * This is not handled by the sink but is typically done by deploying a config
 * template alongside the ElasticSearch deploy.
 *
 * <p>http://www.elasticsearch.org/guide/reference/api/admin-indices-templates.html
 *
 * <p>NOTE(modified build): on a failed batch this sink COMMITS the channel
 * transaction instead of rolling it back, so that partially-inserted events
 * are never re-delivered and re-inserted into ElasticSearch. This trades
 * at-least-once delivery for no-duplicate inserts, which is acceptable for
 * log-style data that does not require strong transactional guarantees.
 */
public class ElasticSearchSink extends AbstractSink implements Configurable {

  private static final Logger logger =
      LoggerFactory.getLogger(ElasticSearchSink.class);

  // Used for testing: when true the client only talks to an in-JVM instance.
  private boolean isLocal = false;
  private final CounterGroup counterGroup = new CounterGroup();

  private static final int defaultBatchSize = 100;

  private int batchSize = defaultBatchSize;
  private long ttlMs = DEFAULT_TTL;
  private String clusterName = DEFAULT_CLUSTER_NAME;
  private String indexName = DEFAULT_INDEX_NAME;
  private String indexType = DEFAULT_INDEX_TYPE;
  private String clientType = DEFAULT_CLIENT_TYPE;

  // Compiled once; the single Matcher is reused via reset() in parseTTL().
  private final Pattern pattern =
      Pattern.compile(TTL_REGEX, Pattern.CASE_INSENSITIVE);
  private Matcher matcher = pattern.matcher("");

  private String[] serverAddresses = null;

  private ElasticSearchClient client = null;
  private Context elasticSearchClientContext = null;

  private ElasticSearchIndexRequestBuilderFactory indexRequestFactory;
  private ElasticSearchEventSerializer eventSerializer;
  private IndexNameBuilder indexNameBuilder;
  private SinkCounter sinkCounter;

  /**
   * Create an {@link ElasticSearchSink} configured using the supplied
   * configuration
   */
  public ElasticSearchSink() {
    this(false);
  }

  /**
   * Create an {@link ElasticSearchSink}
   *
   * @param isLocal
   *          If <tt>true</tt> sink will be configured to only talk to an
   *          ElasticSearch instance hosted in the same JVM, should always be
   *          false in production
   */
  @VisibleForTesting
  ElasticSearchSink(boolean isLocal) {
    this.isLocal = isLocal;
  }

  @VisibleForTesting
  String[] getServerAddresses() {
    return serverAddresses;
  }

  @VisibleForTesting
  String getClusterName() {
    return clusterName;
  }

  @VisibleForTesting
  String getIndexName() {
    return indexName;
  }

  @VisibleForTesting
  String getIndexType() {
    return indexType;
  }

  @VisibleForTesting
  long getTTLMs() {
    return ttlMs;
  }

  @VisibleForTesting
  ElasticSearchEventSerializer getEventSerializer() {
    return eventSerializer;
  }

  @VisibleForTesting
  IndexNameBuilder getIndexNameBuilder() {
    return indexNameBuilder;
  }

  /**
   * Drains up to {@code batchSize} events from the channel, hands each to the
   * ElasticSearch client, and executes the bulk request.
   *
   * <p>On failure the transaction is deliberately COMMITTED (not rolled back)
   * so the failed batch is not re-taken from the channel: the ES bulk insert
   * is not transactional, and a rollback-and-retry would re-insert whatever
   * part of the batch already made it into the index. Failures are logged
   * only.
   *
   * @return {@link Status#READY} when a full batch was drained,
   *         {@link Status#BACKOFF} when the channel was empty or underflowed
   * @throws EventDeliveryException never thrown by this implementation; kept
   *         for interface compatibility
   */
  @Override
  public Status process() throws EventDeliveryException {
    logger.debug("processing...");
    Status status = Status.READY;
    Channel channel = getChannel();
    Transaction txn = channel.getTransaction();
    try {
      txn.begin();
      int count;
      for (count = 0; count < batchSize; ++count) {
        Event event = channel.take();
        if (event == null) {
          break;
        }
        // indexType may contain header-based escape sequences, e.g. %{header}.
        String realIndexType =
            BucketPath.escapeString(indexType, event.getHeaders());
        client.addEvent(event, indexNameBuilder, realIndexType, ttlMs);
      }

      if (count <= 0) {
        sinkCounter.incrementBatchEmptyCount();
        counterGroup.incrementAndGet("channel.underflow");
        status = Status.BACKOFF;
      } else {
        if (count < batchSize) {
          sinkCounter.incrementBatchUnderflowCount();
          status = Status.BACKOFF;
        } else {
          sinkCounter.incrementBatchCompleteCount();
        }
        sinkCounter.addToEventDrainAttemptCount(count);
        client.execute();
      }
      txn.commit();
      sinkCounter.addToEventDrainSuccessCount(count);
      counterGroup.incrementAndGet("transaction.success");
    } catch (Throwable ex) {
      // Intentional: commit instead of rollback so the batch is never
      // re-delivered and re-inserted into ES (log data, no strong-transaction
      // requirement). The error is surfaced via logging/alerting only.
      logger.error("=ElasticSearchSink=>has error", ex);
      try {
        txn.commit();
        sinkCounter.addToEventDrainSuccessCount(0);
        counterGroup.incrementAndGet("transaction.success");
      } catch (Exception ex2) {
        logger.error(
            "=ElasticSearchSink.counterGroup.incrementAndGet=> has exception.",
            ex2);
      }
    } finally {
      txn.close();
    }
    return status;
  }

  /**
   * Reads sink settings from the Flume context: server addresses, index
   * name/type, cluster name, batch size, TTL, client type, and the serializer
   * and index-name-builder classes (instantiated reflectively).
   *
   * @param context Flume configuration for this sink
   * @throws IllegalStateException when a mandatory parameter is missing
   */
  @Override
  public void configure(Context context) {
    if (!isLocal) {
      if (StringUtils.isNotBlank(context.getString(HOSTNAMES))) {
        serverAddresses = StringUtils.deleteWhitespace(
            context.getString(HOSTNAMES)).split(",");
      }
      Preconditions.checkState(serverAddresses != null
          && serverAddresses.length > 0, "Missing Param:" + HOSTNAMES);
    }

    if (StringUtils.isNotBlank(context.getString(INDEX_NAME))) {
      this.indexName = context.getString(INDEX_NAME);
    }

    if (StringUtils.isNotBlank(context.getString(INDEX_TYPE))) {
      this.indexType = context.getString(INDEX_TYPE);
    }

    if (StringUtils.isNotBlank(context.getString(CLUSTER_NAME))) {
      this.clusterName = context.getString(CLUSTER_NAME);
    }

    if (StringUtils.isNotBlank(context.getString(BATCH_SIZE))) {
      this.batchSize = Integer.parseInt(context.getString(BATCH_SIZE));
    }

    if (StringUtils.isNotBlank(context.getString(TTL))) {
      this.ttlMs = parseTTL(context.getString(TTL));
      Preconditions.checkState(ttlMs > 0, TTL
          + " must be greater than 0 or not set.");
    }

    if (StringUtils.isNotBlank(context.getString(CLIENT_TYPE))) {
      clientType = context.getString(CLIENT_TYPE);
    }

    elasticSearchClientContext = new Context();
    elasticSearchClientContext.putAll(context.getSubProperties(CLIENT_PREFIX));

    String serializerClazz = DEFAULT_SERIALIZER_CLASS;
    if (StringUtils.isNotBlank(context.getString(SERIALIZER))) {
      serializerClazz = context.getString(SERIALIZER);
    }

    Context serializerContext = new Context();
    serializerContext.putAll(context.getSubProperties(SERIALIZER_PREFIX));

    try {
      @SuppressWarnings("unchecked")
      Class<? extends Configurable> clazz =
          (Class<? extends Configurable>) Class.forName(serializerClazz);
      Configurable serializer = clazz.newInstance();

      if (serializer instanceof ElasticSearchIndexRequestBuilderFactory) {
        indexRequestFactory =
            (ElasticSearchIndexRequestBuilderFactory) serializer;
        indexRequestFactory.configure(serializerContext);
      } else if (serializer instanceof ElasticSearchEventSerializer) {
        eventSerializer = (ElasticSearchEventSerializer) serializer;
        eventSerializer.configure(serializerContext);
      } else {
        throw new IllegalArgumentException(serializerClazz
            + " is not an ElasticSearchEventSerializer");
      }
    } catch (Exception e) {
      logger.error("Could not instantiate event serializer.", e);
      Throwables.propagate(e);
    }

    if (sinkCounter == null) {
      sinkCounter = new SinkCounter(getName());
    }

    String indexNameBuilderClass = DEFAULT_INDEX_NAME_BUILDER_CLASS;
    if (StringUtils.isNotBlank(context.getString(INDEX_NAME_BUILDER))) {
      indexNameBuilderClass = context.getString(INDEX_NAME_BUILDER);
    }

    Context indexnameBuilderContext = new Context();
    // Fixed: the sub-properties belong in indexnameBuilderContext, not in
    // serializerContext (the serializer has already been configured above).
    indexnameBuilderContext.putAll(
        context.getSubProperties(INDEX_NAME_BUILDER_PREFIX));

    try {
      @SuppressWarnings("unchecked")
      Class<? extends IndexNameBuilder> clazz =
          (Class<? extends IndexNameBuilder>) Class
              .forName(indexNameBuilderClass);
      indexNameBuilder = clazz.newInstance();
      // The builder is configured with the full sink context (not just the
      // sub-properties) so it can read INDEX_NAME directly.
      indexNameBuilder.configure(context);
    } catch (Exception e) {
      logger.error("Could not instantiate index name builder.", e);
      Throwables.propagate(e);
    }

    Preconditions.checkState(StringUtils.isNotBlank(indexName),
        "Missing Param:" + INDEX_NAME);
    Preconditions.checkState(StringUtils.isNotBlank(indexType),
        "Missing Param:" + INDEX_TYPE);
    Preconditions.checkState(StringUtils.isNotBlank(clusterName),
        "Missing Param:" + CLUSTER_NAME);
    Preconditions.checkState(batchSize >= 1, BATCH_SIZE
        + " must be greater than 0");
  }

  /**
   * Creates the ElasticSearch client (local or remote, depending on
   * {@code isLocal}) and starts the sink counters. A failed client setup is
   * logged and counted; the sink still transitions to started.
   */
  @Override
  public void start() {
    ElasticSearchClientFactory clientFactory =
        new ElasticSearchClientFactory();

    // Fixed: the {} placeholder previously had no argument.
    logger.info("ElasticSearch sink {} started", getName());
    sinkCounter.start();
    try {
      if (isLocal) {
        client = clientFactory.getLocalClient(
            clientType, eventSerializer, indexRequestFactory);
      } else {
        client = clientFactory.getClient(clientType, serverAddresses,
            clusterName, eventSerializer, indexRequestFactory);
        client.configure(elasticSearchClientContext);
      }
      sinkCounter.incrementConnectionCreatedCount();
    } catch (Exception ex) {
      // Fixed: use the logger instead of ex.printStackTrace().
      logger.error("Failed to create ElasticSearch client", ex);
      sinkCounter.incrementConnectionFailedCount();
      if (client != null) {
        client.close();
        sinkCounter.incrementConnectionClosedCount();
      }
    }

    super.start();
  }

  /**
   * Closes the ElasticSearch client and stops the sink counters.
   */
  @Override
  public void stop() {
    // Fixed: the {} placeholder previously had no argument.
    logger.info("ElasticSearch sink {} stopping", getName());
    if (client != null) {
      client.close();
    }
    sinkCounter.incrementConnectionClosedCount();
    sinkCounter.stop();
    super.stop();
  }

  /*
   * Returns TTL value of ElasticSearch index in milliseconds when TTL
   * specifier is "ms" / "s" / "m" / "h" / "d" / "w". In case of unknown
   * specifier TTL is not set. When specifier is not provided it defaults to
   * days in milliseconds where the number of days is parsed integer from TTL
   * string provided by user.
   *
   * <p>Elasticsearch supports ttl values being provided in the format:
   * 1d / 1w / 1ms / 1s / 1h / 1m specify a time unit like d (days), m
   * (minutes), h (hours), ms (milliseconds) or w (weeks), milliseconds is
   * used as default unit.
   * http://www.elasticsearch.org/guide/reference/mapping/ttl-field/.
   *
   * @param ttl TTL value provided by user in flume configuration file for the
   *        sink
   *
   * @return the ttl value in milliseconds
   */
  private long parseTTL(String ttl) {
    matcher = matcher.reset(ttl);
    while (matcher.find()) {
      if (matcher.group(2).equals("ms")) {
        return Long.parseLong(matcher.group(1));
      } else if (matcher.group(2).equals("s")) {
        return TimeUnit.SECONDS.toMillis(Integer.parseInt(matcher.group(1)));
      } else if (matcher.group(2).equals("m")) {
        return TimeUnit.MINUTES.toMillis(Integer.parseInt(matcher.group(1)));
      } else if (matcher.group(2).equals("h")) {
        return TimeUnit.HOURS.toMillis(Integer.parseInt(matcher.group(1)));
      } else if (matcher.group(2).equals("d")) {
        return TimeUnit.DAYS.toMillis(Integer.parseInt(matcher.group(1)));
      } else if (matcher.group(2).equals("w")) {
        return TimeUnit.DAYS.toMillis(7 * Integer.parseInt(matcher.group(1)));
      } else if (matcher.group(2).equals("")) {
        logger.info("TTL qualifier is empty. Defaulting to day qualifier.");
        return TimeUnit.DAYS.toMillis(Integer.parseInt(matcher.group(1)));
      } else {
        logger.debug("Unknown TTL qualifier provided. Setting TTL to 0.");
        return 0;
      }
    }
    logger.info("TTL not provided. Skipping the TTL config by returning 0.");
    return 0;
  }
}
0 0
- flume高并发优化——(10)消灭elasticsearch sink多次插入
- flume高并发优化——(1)load_balance
- flume高并发优化——(2)精简结构
- flume高并发优化——(3)haproxy
- flume高并发优化——(4)kafka channel
- flume高并发优化——(5)KafkaOffsetMonitor
- flume高并发优化——(1)load_balance
- Flume学习04 — Sink
- flume高并发优化——(6)开发多文件检索source插件
- flume高并发优化——(7)RandomAccessFile升级多文件source
- flume高并发优化——(8)多文件source扩展断点续传
- flume高并发优化——(9)配置文件交由zookeeper管理
- flume高并发优化——(11)排除json转换及中文乱码
- flume高并发优化——(12)filesource 支撑文件组&兼容cat监控
- flume高并发优化——(13)扩展三级文件配置&利用Headers扩展属性
- flume高并发优化——(15)中间件版本升级
- flume高并发优化——(16)解决offsets变小问题
- flume高并发优化——(14)解决空行停止收集数据问题,及offsets变小问题
- Linux面试题汇总(含答案)
- 安卓状态栏优化,实现透明状态栏,沉浸式状态栏,改变状态栏颜色
- new/delete和malloc/free的区别一般汇总
- (一)80c52学习之旅-起始篇
- POJ 2352 Stars
- flume高并发优化——(10)消灭elasticsearch sink多次插入
- vs工程文件解决方法以及项目之间的关系
- win7 xml 图标恢复
- uboot 分析之 启动流程
- c++笔试面试3
- javascript上机题
- android中的反射应用场景分析(map转化为bean工具)
- 仿饿了么等APP底部菜单(Fragment实现)
- ica 独立成分分析