Elasticsearch bulk request source code analysis 1.0

Receive the bulk request -> decide whether indices need to be auto-created -> process the bulk request (parse the request -> build the shard map -> resolve the shardId for each item -> build the index operations) -> write to the primary -> write to the replicas

A bulk request bundles many individual requests into one BulkRequest. The entry point is org.elasticsearch.rest.action.bulk.RestBulkAction: each HTTP request builds one BulkRequest object, and BulkRequest.add parses the submitted body into the individual items.
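
For context, a minimal client-side sketch of what eventually arrives at the server as a single BulkRequest (ES 2.x-era transport-client API assumed; the index, type and documents are made up for illustration):

import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.client.Client;

public class BulkSketch {
    // `client` is assumed to be an already-connected Client instance
    static void indexSome(Client client) {
        BulkRequestBuilder bulk = client.prepareBulk();
        // each add() becomes one item in the single BulkRequest handled by RestBulkAction/TransportBulkAction
        bulk.add(client.prepareIndex("logs", "event", "1").setSource("{\"msg\":\"hello\"}"));
        bulk.add(client.prepareDelete("logs", "event", "2"));
        BulkResponse response = bulk.get();   // one round trip, many operations
        if (response.hasFailures()) {
            System.out.println(response.buildFailureMessage());
        }
    }
}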


Processing path:
RestBulkAction ->TransportBulkAction ->TransportShardBulkAction


TransportShardBulkAction sits in an inheritance chain: the main entry point is TransportAction, while the concrete business logic lives in the subclass (TransportReplicationAction) and the grand-child class (TransportShardBulkAction).
   TransportShardBulkAction < TransportReplicationAction < TransportAction
RestBulkAction:
bulkRequest.add(request.content(), defaultIndex, defaultType, defaultRouting, defaultFields, null, allowExplicitIndex);
//the client here is actually a NodeClient; NodeClient forwards the request to the TransportBulkAction class
client.bulk(bulkRequest, new RestBuilderListener<BulkResponse>(channel){...})



What the server receives is a BulkRequest instance. Given that instance, the setting action.auto_create_index decides whether missing indices may be created automatically (the default allows it; for example, action.auto_create_index: false disables it). If auto-creation is allowed, the code walks all sub-requests to collect the indices and types they reference, creates every index that does not yet exist in the cluster (the creation process is not covered here), and only then starts the actual bulk execution.
TransportBulkAction extends HandledTransportAction, so the class is also the transport-level handler for the action.

HandledTransportAction:
@Override
public final void messageReceived(final Request request, final TransportChannel channel, Task task) throws Exception {
    // We already got the task created on the netty layer - no need to create it again on the transport layer
    execute(task, request, new ActionListener<Response>() { ... });


public class TransportBulkAction extends HandledTransportAction<BulkRequest, BulkResponse> {
     ·························
protected void doExecute(final BulkRequest bulkRequest, final ActionListener<BulkResponse> listener) {
     ·······
if (autoCreateIndex.needToCheck()) {
        // Keep track of all unique indices and all unique types per index for the create index requests:
        final Map<String, Set<String>> indicesAndTypes = new HashMap<>();
        for (ActionRequest request : bulkRequest.requests) {
            if (request instanceof DocumentRequest) {
                DocumentRequest req = (DocumentRequest) request;
                Set<String> types = indicesAndTypes.get(req.index());
                if (types == null) {
                    indicesAndTypes.put(req.index(), types = new HashSet<>());
                }
                types.add(req.type());
            } else {
                throw new ElasticsearchException("Parsed unknown request in bulk actions: " + request.getClass().getSimpleName());
            }
        }
        final AtomicInteger counter = new AtomicInteger(indicesAndTypes.size());
        ClusterState state = clusterService.state();
        for (Map.Entry<String, Set<String>> entry : indicesAndTypes.entrySet()) {
            final String index = entry.getKey();
            if (autoCreateIndex.shouldAutoCreate(index, state)) {
                //the index does not exist yet: create it first and continue into executeBulk from the
                //create-index listener; otherwise fall through to executeBulk directly below
                CreateIndexRequest createIndexRequest = new CreateIndexRequest(bulkRequest);
                createIndexRequest.index(index);
                for (String type : entry.getValue()) {
                    createIndexRequest.mapping(type);
                }
                createIndexRequest.cause("auto(bulk api)");
                createIndexRequest.masterNodeTimeout(bulkRequest.timeout());
                createIndexAction.execute(createIndexRequest, new ActionListener<CreateIndexResponse>() {
                    @Override
                    public void onResponse(CreateIndexResponse result) {
                        if (counter.decrementAndGet() == 0) {
                            try {
                                executeBulk(bulkRequest, startTime, listener, responses);
                            } catch (Throwable t) {
                                listener.onFailure(t);
                            }
                        }
                    }

                    @Override
                    public void onFailure(Throwable e) {
                        if (!(ExceptionsHelper.unwrapCause(e) instanceof IndexAlreadyExistsException)) {
                            // fail all requests involving this index, if create didnt work
                            for (int i = 0; i < bulkRequest.requests.size(); i++) {
                                ActionRequest request = bulkRequest.requests.get(i);
                                if (request != null && setResponseFailureIfIndexMatches(responses, i, request, index, e)) {
                                    bulkRequest.requests.set(i, null);
                                }
                            }
                        }
                        if (counter.decrementAndGet() == 0) {
                            try {
                                executeBulk(bulkRequest, startTime, listener, responses);
                            } catch (Throwable t) {
                                listener.onFailure(t);
                            }
                        }
                    }
                });
            } else {
                if (counter.decrementAndGet() == 0) {
                    executeBulk(bulkRequest, startTime, listener, responses);
                }
            }
        }
    } else {
        executeBulk(bulkRequest, startTime, listener, responses);
    }
}



The bulk request flow:

The executeBulk method then drives the actual bulk execution. Inside it, bulkRequest.requests is iterated twice.

Before the loops, the cluster state is checked for a global write block; if writes are blocked the request fails with a block exception (the TODO in the source suggests waiting up to the timeout instead):
// TODO use timeout to wait here if its blocked...
clusterState.blocks().globalBlockedRaiseException(ClusterBlockLevel.WRITE);
In the first pass, every IndexRequest has IndexRequest.process called on it, mainly to resolve the timestamp, routing, id and parent fields (delete and update requests only get their routing resolved and validated):
for (int i = 0; i < bulkRequest.requests.size(); i++) {
    ActionRequest request = bulkRequest.requests.get(i);
    //the request can only be null because we set it to null in the previous step, so it gets ignored
    if (request == null) {
        continue;
    }
    DocumentRequest documentRequest = (DocumentRequest) request;
    if (addFailureIfIndexIsUnavailable(documentRequest, bulkRequest, responses, i, concreteIndices, metaData)) {
        continue;
    }
    String concreteIndex = concreteIndices.resolveIfAbsent(documentRequest);
    if (request instanceof IndexRequest) {
        IndexRequest indexRequest = (IndexRequest) request;
        MappingMetaData mappingMd = null;
        if (metaData.hasIndex(concreteIndex)) {
            mappingMd = metaData.index(concreteIndex).mappingOrDefault(indexRequest.type());
        }
        try {
            indexRequest.process(metaData, mappingMd, allowIdGeneration, concreteIndex);
        } catch (ElasticsearchParseException | RoutingMissingException e) {
            BulkItemResponse.Failure failure = new BulkItemResponse.Failure(concreteIndex, indexRequest.type(), indexRequest.id(), e);
            BulkItemResponse bulkItemResponse = new BulkItemResponse(i, "index", failure);
            responses.set(i, bulkItemResponse);
            // make sure the request gets never processed again
            bulkRequest.requests.set(i, null);
        }
    } else if (request instanceof DeleteRequest) {
        try {
            TransportDeleteAction.resolveAndValidateRouting(metaData, concreteIndex, (DeleteRequest)request);
        } catch(RoutingMissingException e) {
            BulkItemResponse.Failure failure = new BulkItemResponse.Failure(concreteIndex, documentRequest.type(), documentRequest.id(), e);
            BulkItemResponse bulkItemResponse = new BulkItemResponse(i, "delete", failure);
            responses.set(i, bulkItemResponse);
            // make sure the request gets never processed again
            bulkRequest.requests.set(i, null);
        }
    } else if (request instanceof UpdateRequest) {
        try {
            TransportUpdateAction.resolveAndValidateRouting(metaData, concreteIndex, (UpdateRequest)request);
        } catch(RoutingMissingException e) {
            BulkItemResponse.Failure failure = new BulkItemResponse.Failure(concreteIndex, documentRequest.type(), documentRequest.id(), e);
            BulkItemResponse bulkItemResponse = new BulkItemResponse(i, "update", failure);
            responses.set(i, bulkItemResponse);
            // make sure the request gets never processed again
            bulkRequest.requests.set(i, null);
        }
    } else {
        throw new AssertionError("request type not supported: [" + request.getClass().getName() + "]");
    }
}

process performs a series of normalization steps: it resolves the routing, fills in the timestamp (falling back to the current time), and optionally generates an id. Id generation is controlled by
this.allowIdGeneration = this.settings.getAsBoolean("action.bulk.action.allow_id_generation", true);
When this setting is true and no id was supplied, a base64 UUID is generated as the id and the request's opType is switched to CREATE: a document with an ES-generated id is by definition a create rather than an update.
public void process(MetaData metaData, @Nullable MappingMetaData mappingMd, boolean allowIdGeneration, String concreteIndex) {
    // resolve the routing if needed
    routing(metaData.resolveIndexRouting(routing, index));

    // resolve timestamp if provided externally
    if (timestamp != null) {
        timestamp = MappingMetaData.Timestamp.parseStringTimestamp(timestamp,
                mappingMd != null ? mappingMd.timestamp().dateTimeFormatter() : TimestampFieldMapper.Defaults.DATE_TIME_FORMATTER,
                getVersion(metaData, concreteIndex));
    }
    // extract values if needed
    if (mappingMd != null) {
        MappingMetaData.ParseContext parseContext = mappingMd.createParseContext(id, routing, timestamp);

        if (parseContext.shouldParse()) {
            XContentParser parser = null;
            try {
                parser = XContentHelper.createParser(source);
                mappingMd.parse(parser, parseContext);
                if (parseContext.shouldParseId()) {
                    id = parseContext.id();
                }
                if (parseContext.shouldParseRouting()) {
                    if (routing != null && !routing.equals(parseContext.routing())) {
                        throw new MapperParsingException("The provided routing value [" + routing + "] doesn't match the routing key stored in the document: [" + parseContext.routing() + "]");
                    }
                    routing = parseContext.routing();
                }
                if (parseContext.shouldParseTimestamp()) {
                    timestamp = parseContext.timestamp();
                    if (timestamp != null) {
                        timestamp = MappingMetaData.Timestamp.parseStringTimestamp(timestamp, mappingMd.timestamp().dateTimeFormatter(), getVersion(metaData, concreteIndex));
                    }
                }
            } catch (MapperParsingException e) {
                throw e;
            } catch (Exception e) {
                throw new ElasticsearchParseException("failed to parse doc to extract routing/timestamp/id", e);
            } finally {
                if (parser != null) {
                    parser.close();
                }
            }
        }

        // might as well check for routing here
        if (mappingMd.routing().required() && routing == null) {
            throw new RoutingMissingException(concreteIndex, type, id);
        }

        if (parent != null && !mappingMd.hasParentField()) {
            throw new IllegalArgumentException("Can't specify parent if no parent field has been configured");
        }
    } else {
        if (parent != null) {
            throw new IllegalArgumentException("Can't specify parent if no parent field has been configured");
        }
    }

    // generate id if not already provided and id generation is allowed
    if (allowIdGeneration) {
        if (id == null) {
            id(Strings.base64UUID());
            // since we generate the id, change it to CREATE
            opType(IndexRequest.OpType.CREATE);
            autoGeneratedId = true;
        }
    }

    // generate timestamp if not provided, we always have one post this stage...
    if (timestamp == null) {
        String defaultTimestamp = TimestampFieldMapper.Defaults.DEFAULT_TIMESTAMP;
        if (mappingMd != null && mappingMd.timestamp() != null) {
            // If we explicitly ask to reject null timestamp
            if (mappingMd.timestamp().ignoreMissing() != null && mappingMd.timestamp().ignoreMissing() == false) {
                throw new TimestampParsingException("timestamp is required by mapping");
            }
            defaultTimestamp = mappingMd.timestamp().defaultTimestamp();
        }

        if (defaultTimestamp.equals(TimestampFieldMapper.Defaults.DEFAULT_TIMESTAMP)) {
            timestamp = Long.toString(System.currentTimeMillis());
        } else {
            timestamp = MappingMetaData.Timestamp.parseStringTimestamp(defaultTimestamp, mappingMd.timestamp().dateTimeFormatter(), getVersion(metaData, concreteIndex));
        }
    }
}


The second pass partitions the items by shard, building the following structure:
// first, go over all the requests and create a ShardId -> Operations mapping
Map<ShardId, List<BulkItemRequest>> requestsByShard = Maps.newHashMap();
The resulting map (ShardId -> List<BulkItemRequest>) is then iterated so that all items belonging to the same shard can be handled together.

for (int i = 0; i < bulkRequest.requests.size(); i++) {
    ActionRequest request = bulkRequest.requests.get(i);
    if (request instanceof IndexRequest) {
        IndexRequest indexRequest = (IndexRequest) request;
        String concreteIndex = concreteIndices.getConcreteIndex(indexRequest.index());

        // resolve the shardId this request should go to: the routing value is hashed if present,
        // otherwise the id is hashed (see generateShardId below)

        ShardId shardId = clusterService.operationRouting().indexShards(clusterState, concreteIndex, indexRequest.type(), indexRequest.id(), indexRequest.routing()).shardId();
        List<BulkItemRequest> list = requestsByShard.get(shardId);
        if (list == null) {
            list = new ArrayList<>();
            requestsByShard.put(shardId, list);
        }
        list.add(new BulkItemRequest(i, request));
    } else if (request instanceof DeleteRequest) {
        DeleteRequest deleteRequest = (DeleteRequest) request;
        String concreteIndex = concreteIndices.getConcreteIndex(deleteRequest.index());
        ShardId shardId = clusterService.operationRouting().indexShards(clusterState, concreteIndex, deleteRequest.type(), deleteRequest.id(), deleteRequest.routing()).shardId();
        List<BulkItemRequest> list = requestsByShard.get(shardId);
        if (list == null) {
            list = new ArrayList<>();
            requestsByShard.put(shardId, list);
        }
        list.add(new BulkItemRequest(i, request));
    } else if (request instanceof UpdateRequest) {
        UpdateRequest updateRequest = (UpdateRequest) request;
        String concreteIndex = concreteIndices.getConcreteIndex(updateRequest.index());
        ShardId shardId = clusterService.operationRouting().indexShards(clusterState, concreteIndex, updateRequest.type(), updateRequest.id(), updateRequest.routing()).shardId();
        List<BulkItemRequest> list = requestsByShard.get(shardId);
        if (list == null) {
            list = new ArrayList<>();
            requestsByShard.put(shardId, list);
        }
        list.add(new BulkItemRequest(i, request));
    }
}
generateShardId computes the shard id: the routing (or the id when no routing is given) is hashed and taken modulo the number of shards, which is what groups the requests by shard:
private int generateShardId(ClusterState clusterState, String index, String type, String id, @Nullable String routing) {
    IndexMetaData indexMetaData = clusterState.metaData().index(index);
    if (indexMetaData == null) {
        throw new IndexNotFoundException(index);
    }
    final Version createdVersion = indexMetaData.getCreationVersion();
    final HashFunction hashFunction = indexMetaData.getRoutingHashFunction();
    final boolean useType = indexMetaData.getRoutingUseType();

    final int hash;
    if (routing == null) {
        if (!useType) {
            hash = hash(hashFunction, id);
        } else {
            hash = hash(hashFunction, type, id);
        }
    } else {
        hash = hash(hashFunction, routing);
    }
    if (createdVersion.onOrAfter(Version.V_2_0_0_beta1)) {
        return MathUtils.mod(hash, indexMetaData.getNumberOfShards());
    } else {
        return Math.abs(hash % indexMetaData.getNumberOfShards());
    }
}
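
As a toy illustration of the rule above (not the real implementation; String.hashCode() only stands in for the Murmur3 hash ES actually uses, so the shard numbers will not match a real cluster):

final class ToyOperationRouting {
    static int toyShardId(String id, String routing, int numberOfShards) {
        String effectiveRouting = routing != null ? routing : id;   // explicit routing wins over the id
        int hash = effectiveRouting.hashCode();
        return Math.floorMod(hash, numberOfShards);                 // same role as MathUtils.mod(hash, n)
    }
}

// ToyOperationRouting.toyShardId("doc-42", null, 5)      -> the document id decides the shard
// ToyOperationRouting.toyShardId("doc-42", "user-7", 5)  -> everything routed by "user-7" lands on one shard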


With the ShardId, the bulkRequest and the List<BulkItemRequest> available, each map entry is wrapped into a BulkShardRequest, essentially a BulkRequest-like object holding only the items that belong to one shard, plus the consistencyLevel and timeout copied from the original request.

for (Map.Entry<ShardId, List<BulkItemRequest>> entry : requestsByShard.entrySet()) {
    final ShardId shardId = entry.getKey();
    final List<BulkItemRequest> requests = entry.getValue();
    BulkShardRequest bulkShardRequest = new BulkShardRequest(bulkRequest, shardId, bulkRequest.refresh(), requests.toArray(new BulkItemRequest[requests.size()]));

    bulkShardRequest.consistencyLevel(bulkRequest.consistencyLevel());
    bulkShardRequest.timeout(bulkRequest.timeout());

    //shardBulkAction here is the TransportShardBulkAction instance
    shardBulkAction.execute(bulkShardRequest, new ActionListener<BulkShardResponse>() {
        @Override
        public void onResponse(BulkShardResponse bulkShardResponse) {
            for (BulkItemResponse bulkItemResponse : bulkShardResponse.getResponses()) {
                // we may have no response if item failed
                if (bulkItemResponse.getResponse() != null) {
                    bulkItemResponse.getResponse().setShardInfo(bulkShardResponse.getShardInfo());
                }
                responses.set(bulkItemResponse.getItemId(), bulkItemResponse);
            }
            if (counter.decrementAndGet() == 0) {
                finishHim();
            }
        }

       ·······························
}

Given the inheritance chain TransportShardBulkAction < TransportReplicationAction < TransportAction, the flow triggered by shardBulkAction.execute is actually driven by TransportReplicationAction. The entry point is its doExecute method:

/**
* Responsible for routing and retrying failed operations on the primary.
* The actual primary operation is done in {@link PrimaryPhase} on the
* node with primary copy.
*
* Resolves index and shard id for the request before routing it to target node
*/
@Override
protected void doExecute(Task task, Request request, ActionListener<Response> listener) {
    new ReroutePhase((ReplicationTask) task, request, listener).run();
}


Data is not copied from the primary shard's files to the replicas; the same operation is written to the primary and to each replica independently, so a corrupted primary cannot propagate corrupted data to the replicas.
Indexing a document therefore goes through two phases:
  1. write the data to the primary shard
  2. write the data to the replica shards
The primary shard is looked up in the cluster state; if it is not active yet there is a retry mechanism. If the primary lives on the local node the operation is executed directly, otherwise the request is forwarded to the node that holds the shard.
/**
* Responsible for routing and retrying failed operations on the primary.
* The actual primary operation is done in {@link PrimaryPhase} on the
* node with primary copy.
*
* Resolves index and shard id for the request before routing it to target node
*/
final class ReroutePhase extends AbstractRunnable {

@Override
protected void doRun() {
    setPhase(task, "routing");
    final ClusterState state = observer.observedState();
    ClusterBlockException blockException = state.blocks().globalBlockedException(globalBlockLevel());
    if (blockException != null) {
        handleBlockException(blockException);
        return;
    }
    final String concreteIndex = resolveIndex() ? indexNameExpressionResolver.concreteSingleIndex(state, request) : request.index();
    blockException = state.blocks().indexBlockedException(indexBlockLevel(), concreteIndex);
    if (blockException != null) {
        handleBlockException(blockException);
        return;
    }
     //the request has no shardId yet; resolveRequest derives it from the concrete index
    resolveRequest(state.metaData(), concreteIndex, request);
    assert request.shardId() != null : "request shardId must be set in resolveRequest";

    IndexShardRoutingTable indexShard = state.getRoutingTable().shardRoutingTable(request.shardId().getIndex(), request.shardId().id());
    final ShardRouting primary = indexShard.primaryShard();
     //if the primary shard is not active, schedule a retry
    if (primary == null || primary.active() == false) {
        logger.trace("primary shard [{}] is not yet active, scheduling a retry: action [{}], request [{}], cluster state version [{}]", request.shardId(), actionName, request, state.version());
        retryBecauseUnavailable(request.shardId(), "primary shard is not active");
        return;
    }
    if (state.nodes().nodeExists(primary.currentNodeId()) == false) {
        logger.trace("primary shard [{}] is assigned to an unknown node [{}], scheduling a retry: action [{}], request [{}], cluster state version [{}]", request.shardId(), primary.currentNodeId(), actionName, request, state.version());
        retryBecauseUnavailable(request.shardId(), "primary shard isn't assigned to a known node.");
        return;
    }
    final DiscoveryNode node = state.nodes().get(primary.currentNodeId());
    taskManager.registerChildTask(task, node.getId());
     
     //if the primary is on the local node, execute directly; otherwise forward to the node that holds the shard
    if (primary.currentNodeId().equals(state.nodes().localNodeId())) {
        setPhase(task, "waiting_on_primary");
        if (logger.isTraceEnabled()) {
            logger.trace("send action [{}] on primary [{}] for request [{}] with cluster state version [{}] to [{}] ", transportPrimaryAction, request.shardId(), request, state.version(), primary.currentNodeId());
        }
        performAction(node, transportPrimaryAction, true);
    } else {
        if (logger.isTraceEnabled()) {
            logger.trace("send action [{}] on primary [{}] for request [{}] with cluster state version [{}] to [{}]", actionName, request.shardId(), request, state.version(), primary.currentNodeId());
        }
        setPhase(task, "rerouted");   
        performAction(node, actionName, false);
    }
}


private void performAction(final DiscoveryNode node, final String action, final boolean isPrimaryAction) {

    transportService.sendRequest(node, action, request, transportOptions, new BaseTransportResponseHandler<Response>() {

......
}}

TransportService.java :
private void sendLocalRequest(long requestId, final String action, final TransportRequest request) {
    final DirectResponseChannel channel = new DirectResponseChannel(logger, localNode, action, requestId, adapter, threadPool);
    try {
        final RequestHandlerRegistry reg = adapter.getRequestHandler(action);
        if (reg == null) {
            throw new ActionNotFoundTransportException("Action [" + action + "] not found");
        }
        final String executor = reg.getExecutor();
        if (ThreadPool.Names.SAME.equals(executor)) {
            //noinspection unchecked
            reg.processMessageReceived(request, channel);
        } else {
            threadPool.executor(executor).execute(new AbstractRunnable() {
                @Override
                protected void doRun() throws Exception {
                    //noinspection unchecked
                    reg.processMessageReceived(request, channel);
                }

                @Override
                public boolean isForceExecution() {
                    return reg.isForceExecution();
                }

                @Override
                public void onFailure(Throwable e) {
                    try {
                        channel.sendResponse(e);
                    } catch (Throwable e1) {
                        logger.warn("failed to notify channel of error message for action [" + action + "]", e1);
                        logger.warn("actual exception", e);
                    }
                }
            });
        }

    } catch (Throwable e) {
        try {
            channel.sendResponse(e);
        } catch (Throwable e1) {
            logger.warn("failed to notify channel of error message for action [" + action + "]", e1);
            logger.warn("actual exception", e1);
        }
    }

}


The request is ultimately executed on the node that holds the primary copy; PrimaryOperationTransportHandler receives it and runs PrimaryPhase:

class PrimaryOperationTransportHandler extends TransportRequestHandler<Request> {
    @Override
    public void messageReceived(final Request request, final TransportChannel channel) throws Exception {
        throw new UnsupportedOperationException("the task parameter is required for this operation");
    }

    @Override
    public void messageReceived(Request request, TransportChannel channel, Task task) throws Exception {
        new PrimaryPhase((ReplicationTask) task, request, channel).run();
    }
}


/**
* Responsible for performing primary operation locally and delegating to replication action once successful
* <p>
* Note that as soon as we move to replication action, state responsibility is transferred to {@link ReplicationPhase}.
*/
final class PrimaryPhase extends AbstractRunnable {

     ................

    @Override
    public void onFailure(Throwable e) {
        finishAsFailed(e);
    }

    @Override
    protected void doRun() throws Exception {
        setPhase(task, "primary");
        // request shardID was set in ReroutePhase
        assert request.shardId() != null : "request shardID must be set prior to primary phase";
        final ShardId shardId = request.shardId();
        final String writeConsistencyFailure = checkWriteConsistency(shardId);
        if (writeConsistencyFailure != null) {
            finishBecauseUnavailable(shardId, writeConsistencyFailure);
            return;
        }
        final ReplicationPhase replicationPhase;
        try {
            indexShardReference = getIndexShardOperationsCounter(shardId);
            //write the document into the primary shard's index
            Tuple<Response, ReplicaRequest> primaryResponse =shardOperationOnPrimary(state.metaData(), request);
            if (logger.isTraceEnabled()) {
                logger.trace("action [{}] completed on shard [{}] for request [{}] with cluster state version [{}]", transportPrimaryAction, shardId, request, state.version());
            }
           replicationPhase = new ReplicationPhase(task, primaryResponse.v2(), primaryResponse.v1(), shardId, channel,
                indexShardReference);
        } catch (Throwable e) {
            request.setCanHaveDuplicates();
            if (ExceptionsHelper.status(e) == RestStatus.CONFLICT) {
                if (logger.isTraceEnabled()) {
                    logger.trace("failed to execute [{}] on [{}]", e, request, shardId);
                }
            } else {
                if (logger.isDebugEnabled()) {
                    logger.debug("failed to execute [{}] on [{}]", e, request, shardId);
                }
            }
            finishAsFailed(e);
            return;
        }
        //primary phase done; hand the request over to the replication phase
        finishAndMoveToReplication(replicationPhase);
    }

    /**
    * checks whether we can perform a write based on the write consistency setting
    * returns *null* if OK to proceed, or a string describing the reason to stop
    */
    String checkWriteConsistency(ShardId shardId) {
        ......
    }

    /**
    * upon success, finish the first phase and transfer responsibility to the {@link ReplicationPhase}
    */
    void finishAndMoveToReplication(ReplicationPhase replicationPhase) {
        if (finished.compareAndSet(false, true)) {
           replicationPhase.run();
        } else {
            assert false : "finishAndMoveToReplication called but operation is already finished";
        }
    }

        ......
The primary write path: execution then enters shardOperationOnPrimary, which is implemented in the grand-child class TransportShardBulkAction.

protected Tuple<BulkShardResponse, BulkShardRequest> shardOperationOnPrimary(MetaData metaData, BulkShardRequest request) {
           ......
    //version numbers are what supports concurrent modification (optimistic concurrency control)
    long[] preVersions = new long[request.items().length];
    VersionType[] preVersionTypes = new VersionType[request.items().length];
    Translog.Location location = null;

//the request here is a BulkShardRequest and its items are the BulkItemRequest array; the loop branches on the
//item type: IndexRequest, DeleteRequest or UpdateRequest

    for (int requestIndex = 0; requestIndex < request.items().length; requestIndex++) {
        BulkItemRequest item = request.items()[requestIndex];
        if (item.request() instanceof IndexRequest) {
            IndexRequest indexRequest = (IndexRequest) item.request();
            preVersions[requestIndex] = indexRequest.version();
            preVersionTypes[requestIndex] = indexRequest.versionType();
            try {
                WriteResult<IndexResponse> result = shardIndexOperation(request, indexRequest, metaData, indexShard, true);
                location = locationToSync(location, result.location);
                // add the response
                IndexResponse indexResponse = result.response();
                setResponse(item, new BulkItemResponse(item.id(), indexRequest.opType().lowercase(), indexResponse));
            } catch (Throwable e) {
                // rethrow the failure if we are going to retry on primary and let parent failure to handle it
                if (retryPrimaryException(e)) {
                    // restore updated versions...
                    for (int j = 0; j < requestIndex; j++) {
                        applyVersion(request.items()[j], preVersions[j], preVersionTypes[j]);
                    }
                    throw (ElasticsearchException) e;
                }
                if (ExceptionsHelper.status(e) == RestStatus.CONFLICT) {
                    logger.trace("{} failed to execute bulk item (index) {}", e, request.shardId(), indexRequest);
                } else {
                    logger.debug("{} failed to execute bulk item (index) {}", e, request.shardId(), indexRequest);
                }
                // if its a conflict failure, and we already executed the request on a primary (and we execute it
                // again, due to primary relocation and only processing up to N bulk items when the shard gets closed)
                // then just use the response we got from the successful execution
                if (item.getPrimaryResponse() != null && isConflictException(e)) {
                    setResponse(item, item.getPrimaryResponse());
                } else {
                    setResponse(item, new BulkItemResponse(item.id(), indexRequest.opType().lowercase(),
                            new BulkItemResponse.Failure(request.index(), indexRequest.type(), indexRequest.id(), e)));
                }
            }
        } else if (item.request() instanceof DeleteRequest) {
          ······
        } else if (item.request() instanceof UpdateRequest) {
          ······
        }
    }

    processAfterWrite(request.refresh(), indexShard, location);
    BulkItemResponse[] responses = new BulkItemResponse[request.items().length];
    BulkItemRequest[] items = request.items();
    for (int i = 0; i < items.length; i++) {
        responses[i] = items[i].getPrimaryResponse();
    }
    return new Tuple<>(new BulkShardResponse(request.shardId(), responses), request);
}


The core method nested inside shardIndexOperation is executeIndexRequestOnPrimary. Its first step is to obtain an operation object: the Engine class wraps Lucene's IndexWriter and IndexSearcher, and Engine.IndexingOperation here is either a Create or an Index instance, i.e. the operation for the document that is about to be indexed.
/**
* Execute the given {@link IndexRequest} on a primary shard, throwing a
* {@link RetryOnPrimaryException} if the operation needs to be re-tried.
*/
public static WriteResult<IndexResponse> executeIndexRequestOnPrimary(BulkShardRequest shardRequest, IndexRequest request, IndexShard indexShard, MappingUpdatedAction mappingUpdatedAction) throws Throwable {
    //build the indexing operation that execute will later apply
    Engine.IndexingOperation operation = prepareIndexOperationOnPrimary(shardRequest, request, indexShard);
    //second step: if parsing produced a dynamic mapping update, push it to the master before indexing
    Mapping update = operation.parsedDoc().dynamicMappingsUpdate();
    final ShardId shardId = indexShard.shardId();
    if (update != null) {
        final String indexName = shardId.getIndex();
        mappingUpdatedAction.updateMappingOnMasterSynchronously(indexName, request.type(), update);
        operation = prepareIndexOperationOnPrimary(shardRequest, request, indexShard);
        update = operation.parsedDoc().dynamicMappingsUpdate();
        if (update != null) {
            throw new RetryOnPrimaryException(shardId,
                "Dynamic mappings are not available on the node that holds the primary yet");
        }
    }
     //finally, execute performs the actual indexing work
    final boolean created = operation.execute(indexShard);

     //after execution the document version etc. are known; they are written back onto the IndexRequest so the replicas reuse them

    // update the version on request so it will happen on the replicas
    final long version = operation.version();
    request.version(version);
    request.versionType(request.versionType().versionTypeForReplicationAndRecovery());

    assert request.versionType().validateVersionForWrites(request.version());

    return new WriteResult<>(new IndexResponse(shardId.getIndex(), request.type(), request.id(), request.version(), created), operation.getTranslogLocation());
}

On the primary node, every request inside the BulkItemRequest array built earlier is processed in turn (this includes generating the uid, handling nested and parent-child types, and so on); based on the opType field discussed above it becomes either an INDEX or a CREATE operation.

/**
* Utility method to create either an index or a create operation depending
* on the {@link IndexRequest.OpType} of the request.
*/
public static Engine.IndexingOperation prepareIndexOperationOnPrimary(BulkShardRequest shardRequest, IndexRequest request, IndexShard indexShard) {
    SourceToParse sourceToParse = SourceToParse.source(SourceToParse.Origin.PRIMARY, request.source()).index(request.index()).type(request.type()).id(request.id())
        .routing(request.routing()).parent(request.parent()).timestamp(request.timestamp()).ttl(request.ttl());
    boolean canHaveDuplicates = request.canHaveDuplicates();
    if (shardRequest != null) {
        canHaveDuplicates |= shardRequest.canHaveDuplicates();
    }
    if (request.opType() == IndexRequest.OpType.INDEX) {
        return indexShard.prepareIndexOnPrimary(sourceToParse, request.version(), request.versionType(), canHaveDuplicates);
    } else {
        assert request.opType() == IndexRequest.OpType.CREATE : request.opType();
        return indexShard.prepareCreateOnPrimary(sourceToParse, request.version(), request.versionType(), canHaveDuplicates, request.autoGeneratedId());
    }
}



static Engine.Index prepareIndex(DocumentMapperForType docMapper, SourceToParse source, long version, VersionType versionType, Engine
        .Operation.Origin origin, boolean canHaveDuplicates) {
    long startTime = System.nanoTime();
     // parse the JSON source into a ParsedDocument
    ParsedDocument doc = docMapper.getDocumentMapper().parse(source);
    if (docMapper.getMapping() != null) {
        doc.addDynamicMappingsUpdate(docMapper.getMapping());
    }
     
    return new Engine.Index(docMapper.getDocumentMapper().uidMapper().term(doc.uid().stringValue()), doc, version, versionType,
            origin, startTime, canHaveDuplicates);
}


First, the CREATE path. In the end the create method of the IndexShard object performs the indexing:
IndexShard.java
public void create(Engine.Create create) {
    ensureWriteAllowed(create);
    markLastWrite();
    create = indexingService.preCreate(create);
    try {
        if (logger.isTraceEnabled()) {
            logger.trace("index [{}][{}]{}", create.type(), create.id(), create.docs());
        }
        engine().create(create);
        create.endTime(System.nanoTime());
    } catch (Throwable ex) {
        indexingService.postCreate(create, ex);
        throw ex;
    }
    indexingService.postCreate(create);
}

engine() returns the Engine instance; create is ultimately implemented by InternalEngine.innerCreate, which performs the actual index-building work.
Because writes are concurrent, every write takes a lock, synchronized (dirtyLock(create.uid())), keyed on the uid, so that concurrent writes to the same document cannot interleave and produce stale data.
private void innerCreate(Create create) throws IOException {

     /*
      * The version check can be skipped entirely when three conditions hold:
      *  - index.optimize_auto_generated_id is set to true
      *  - the id was auto-generated
      *  - create.canHaveDuplicates == false
      * Using an auto-generated id therefore lets the engine skip the version lookup and speeds up ingestion.
      */

    if (engineConfig.isOptimizeAutoGenerateId() && create.autoGeneratedId() && !create.canHaveDuplicates()) {
        // We don't need to lock because this ID cannot be concurrently updated:
        innerCreateNoLock(create, Versions.NOT_FOUND, null);
    } else {
        synchronized (dirtyLock(create.uid())) {
            final long currentVersion;
            final VersionValue versionValue;
            //if the document's version is not found in the in-memory versionMap, load it from the index on disk
            versionValue = versionMap.getUnderLock(create.uid().bytes());
            if (versionValue == null) {
                currentVersion = loadCurrentVersionFromIndex(create.uid());
            } else {
                if (engineConfig.isEnableGcDeletes() && versionValue.delete() && (engineConfig.getThreadPool().estimatedTimeInMillis
                        () - versionValue.time()) > engineConfig.getGcDeletesInMillis()) {
                    currentVersion = Versions.NOT_FOUND; // deleted, and GC
                } else {
                    currentVersion = versionValue.version();
                }
            }
            innerCreateNoLock(create, currentVersion, versionValue);
        }
    }
}
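
The synchronized (dirtyLock(create.uid())) above serializes concurrent writes to the same uid. A minimal sketch of the striped-lock idea behind it (a hypothetical helper for illustration, not the actual InternalEngine code):

final class UidLockStriping {
    private final Object[] locks;

    UidLockStriping(int stripes) {
        locks = new Object[stripes];
        for (int i = 0; i < stripes; i++) {
            locks[i] = new Object();
        }
    }

    // different uids may share a stripe, but the same uid always maps to the same lock object
    Object lockFor(String uid) {
        return locks[Math.floorMod(uid.hashCode(), locks.length)];
    }
}

// usage: synchronized (lockStriping.lockFor(uid)) { ...version check + write... }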

By comparing the version carried by the Create operation with the version loaded from the index, innerCreateNoLock decides whether to perform an update or an add.
The actual write then goes through the IndexWriter, e.g. indexWriter.updateDocuments(create.uid(), create.docs()); the low-level details are handled by Lucene.

private void innerCreateNoLock(Create create, long currentVersion, VersionValue versionValue) throws IOException {

    // same logic as index
    long updatedVersion;
    long expectedVersion = create.version();
    if (create.versionType().isVersionConflictForWrites(currentVersion, expectedVersion)) {
        if (create.origin() == Operation.Origin.RECOVERY) {
            return;
        } else {
            throw new VersionConflictEngineException(shardId, create.type(), create.id(), currentVersion, expectedVersion);
        }
    }
    updatedVersion = create.versionType().updateVersion(currentVersion, expectedVersion);

    // if the doc exists
    boolean doUpdate = false;
    if ((versionValue != null && versionValue.delete() == false) || (versionValue == null && currentVersion != Versions.NOT_FOUND)) {
        if (create.origin() == Operation.Origin.RECOVERY) {
            return;
        } else if (create.origin() == Operation.Origin.REPLICA) {
            // #7142: the primary already determined it's OK to index this document, and we confirmed above that the version doesn't
            // conflict, so we must also update here on the replica to remain consistent:
            doUpdate = true;
        } else if (create.origin() == Operation.Origin.PRIMARY && create.autoGeneratedId() && create.canHaveDuplicates() &&
                currentVersion == 1 && create.version() == Versions.MATCH_ANY) {
            /**
            * If bulk index request fails due to a disconnect, unavailable shard etc. then the request is
            * retried before it actually fails. However, the documents might already be indexed.
            * For autogenerated ids this means that a version conflict will be reported in the bulk request
            * although the document was indexed properly.
            * To avoid this we have to make sure that the index request is treated as an update and set updatedVersion to 1.
            * See also discussion on https://github.com/elasticsearch/elasticsearch/pull/9125
            */
            doUpdate = true;
            updatedVersion = 1;
        } else {
            // On primary, we throw DAEE if the _uid is already in the index with an older version:
            assert create.origin() == Operation.Origin.PRIMARY;
            throw new DocumentAlreadyExistsException(shardId, create.type(), create.id());
        }
    }

    create.updateVersion(updatedVersion);

    if (doUpdate) {
        if (create.docs().size() > 1) {
            indexWriter.updateDocuments(create.uid(), create.docs());
        } else {
            indexWriter.updateDocument(create.uid(), create.docs().get(0));
        }
    } else {
        if (create.docs().size() > 1) {
            indexWriter.addDocuments(create.docs());
        } else {
            indexWriter.addDocument(create.docs().get(0));
        }
    }
     //record the operation in the translog
    Translog.Location translogLocation = translog.add(new Translog.Create(create));

    versionMap.putUnderLock(create.uid().bytes(), new VersionValue(updatedVersion, translogLocation));
    create.setTranslogLocation(translogLocation);
    indexingService.postCreateUnderLock(create);
}
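
The version comparison at the top of innerCreateNoLock is the optimistic-concurrency check. A simplified, purely illustrative approximation of the internal versioning rule (the constants and exact branches of the real VersionType.INTERNAL may differ):

final class SimpleInternalVersioning {
    static final long MATCH_ANY = -3L;  // "no version specified" -- constant value assumed for illustration
    static final long NOT_FOUND = -1L;  // "uid not present in the index" -- value assumed for illustration

    static boolean isVersionConflictForWrites(long currentVersion, long expectedVersion) {
        if (expectedVersion == MATCH_ANY) {
            return false;                         // caller did not pin a version: always OK
        }
        if (currentVersion == NOT_FOUND) {
            return true;                          // caller expected a version but the doc does not exist
        }
        return currentVersion != expectedVersion; // classic optimistic concurrency check
    }

    static long updateVersion(long currentVersion, long expectedVersion) {
        return currentVersion == NOT_FOUND ? 1 : currentVersion + 1;  // new documents start at version 1
    }
}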

Now the INDEX path:

The uid is looked up as a term in the index to find the current version, and whether a version is found decides between an add and an update. The "update" here actually marks the existing document as deleted and inserts the new one; documents flagged as deleted are physically removed when segments are merged.

private boolean innerIndex(Index index) throws IOException {
    synchronized (dirtyLock(index.uid())) {
        final long currentVersion;
        VersionValue versionValue = versionMap.getUnderLock(index.uid().bytes());
        if (versionValue == null) {
            currentVersion = loadCurrentVersionFromIndex(index.uid());
        } else {
            if (engineConfig.isEnableGcDeletes() && versionValue.delete() && (engineConfig.getThreadPool().estimatedTimeInMillis() -
                    versionValue.time()) > engineConfig.getGcDeletesInMillis()) {
                currentVersion = Versions.NOT_FOUND; // deleted, and GC
            } else {
                currentVersion = versionValue.version();
            }
        }

        long updatedVersion;
        long expectedVersion = index.version();
        if (index.versionType().isVersionConflictForWrites(currentVersion, expectedVersion)) {
            if (index.origin() == Operation.Origin.RECOVERY) {
                return false;
            } else {
                throw new VersionConflictEngineException(shardId, index.type(), index.id(), currentVersion, expectedVersion);
            }
        }
        updatedVersion = index.versionType().updateVersion(currentVersion, expectedVersion);

        final boolean created;
        index.updateVersion(updatedVersion);
        if (currentVersion == Versions.NOT_FOUND) {
            // document does not exists, we can optimize for create
            created = true;
            if (index.docs().size() > 1) {
                indexWriter.addDocuments(index.docs());
            } else {
                indexWriter.addDocument(index.docs().get(0));
            }
        } else {
            if (versionValue != null) {
                created = versionValue.delete(); // we have a delete which is not GC'ed...
            } else {
                created = false;
            }
            if (index.docs().size() > 1) {
                indexWriter.updateDocuments(index.uid(), index.docs());
            } else {
                indexWriter.updateDocument(index.uid(), index.docs().get(0));
            }
        }
        Translog.Location translogLocation = translog.add(new Translog.Index(index));

        versionMap.putUnderLock(index.uid().bytes(), new VersionValue(updatedVersion, translogLocation));
        index.setTranslogLocation(translogLocation);
        indexingService.postIndexUnderLock(index);
        return created;
    }
}
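
To make the "update = mark the old document deleted + insert the new one" behaviour concrete, here is a small standalone Lucene sketch (plain Lucene API, not ES code; the field names are made up):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;

public class UpdateByUidDemo {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

        Document v1 = new Document();
        v1.add(new StringField("_uid", "event#1", Field.Store.YES));
        v1.add(new StringField("msg", "first version", Field.Store.YES));
        writer.addDocument(v1);                       // like the created == true branch above

        Document v2 = new Document();
        v2.add(new StringField("_uid", "event#1", Field.Store.YES));
        v2.add(new StringField("msg", "second version", Field.Store.YES));
        // updateDocument atomically deletes every doc matching the term, then adds the new doc;
        // the old doc is only physically removed when its segment is merged
        writer.updateDocument(new Term("_uid", "event#1"), v2);
        writer.commit();

        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            System.out.println("live docs: " + reader.numDocs());          // 1
            System.out.println("docs incl. deleted: " + reader.maxDoc());  // typically 2 until a merge runs
        }
        writer.close();
    }
}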


Once this completes, the data has only been written to the in-memory indexing buffer and the translog; the later refresh and flush are what make it searchable and durable.
If the request asks for a refresh, the refresh is performed immediately, and if it asks for the translog to be synced to storage, that is done immediately as well.
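
For example, a client can ask for an immediate refresh on the bulk itself (ES 2.x-era builder API assumed; `client` is a connected org.elasticsearch.client.Client):

BulkResponse resp = client.prepareBulk()
        .add(client.prepareIndex("logs", "event").setSource("{\"msg\":\"hi\"}"))
        .setRefresh(true)   // corresponds to the bulkRequest.refresh() flag checked in processAfterWrite
        .get();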

The replica write path:
The replication flow is largely the same as the primary flow. ReplicationPhase.doRun is the entry point, and its core method performOnReplica sends the replica operation to each target node; on the node that holds a replica copy, AsyncReplicaAction then executes shardOperationOnReplica asynchronously. The rough logic is as follows:
/**
* Responsible for sending replica requests (see {@link AsyncReplicaAction}) to nodes with replica copy, including
* relocating copies
*/
final class ReplicationPhase extends AbstractRunnable {

     ......

    /**
    * start sending replica requests to target nodes
    */
    @Override
    protected void doRun() {
        setPhase(task, "replicating");
        if (pending.get() == 0) {
            doFinish();
            return;
        }
        for (ShardRouting shard : shards) {
            if (shard.primary() == false && executeOnReplica == false) {
                // If the replicas use shadow replicas, there is no reason to
                // perform the action on the replica, so skip it and
                // immediately return

                // this delays mapping updates on replicas because they have
                // to wait until they get the new mapping through the cluster
                // state, which is why we recommend pre-defined mappings for
                // indices using shadow replicas
                continue;
            }
            if (shard.unassigned()) {
                continue;
            }
            // we index on a replica that is initializing as well since we might not have got the event
            // yet that it was started. We will get an exception IllegalShardState exception if its not started
            // and that's fine, we will ignore it

            // we never execute replication operation locally as primary operation has already completed locally
            // hence, we ignore any local shard for replication
            if (nodes.localNodeId().equals(shard.currentNodeId()) == false) {
                performOnReplica(shard);
            }
            // send operation to relocating shard
            if (shard.relocating()) {
                performOnReplica(shard.buildTargetRelocatingShard());
            }
        }
    }

    /**
    * send replica operation to target node
    */
    void performOnReplica(final ShardRouting shard) {
        // if we don't have that node, it means that it might have failed and will be created again, in
        // this case, we don't have to do the operation, and just let it failover
        final String nodeId = shard.currentNodeId();
        if (!nodes.nodeExists(nodeId)) {
            logger.trace("failed to send action [{}] on replica [{}] for request [{}] due to unknown node [{}]", transportReplicaAction, shard.shardId(), replicaRequest, nodeId);
            onReplicaFailure(nodeId, null);
            return;
        }
        if (logger.isTraceEnabled()) {
            logger.trace("send action [{}] on replica [{}] for request [{}] to [{}]", transportReplicaAction, shard.shardId(), replicaRequest, nodeId);
        }

        final DiscoveryNode node = nodes.get(nodeId);
        transportService.sendRequest(node, transportReplicaAction, replicaRequest, transportOptions, new EmptyTransportResponseHandler(ThreadPool.Names.SAME) {

    }

    .......
   
}


class ReplicaOperationTransportHandler extends TransportRequestHandler<ReplicaRequest> {
    @Override
    public void messageReceived(final ReplicaRequest request, final TransportChannel channel) throws Exception {
        throw new UnsupportedOperationException("the task parameter is required for this operation");
    }

    @Override
    public void messageReceived(ReplicaRequest request, TransportChannel channel, Task task) throws Exception {
        new AsyncReplicaAction(request, channel, (ReplicationTask) task).run();
    }
}

The final execution is implemented by AsyncReplicaAction.doRun:

protected void doRun() throws Exception {
    setPhase(task, "replica");
    assert request.shardId() != null : "request shardId must be set";
    try (Releasable ignored = getIndexShardOperationsCounter(request.shardId())) {
        shardOperationOnReplica(request);
        if (logger.isTraceEnabled()) {
            logger.trace("action [{}] completed on shard [{}] for request [{}]", transportReplicaAction, request.shardId(), request);
        }
    }
    setPhase(task, "finished");
    channel.sendResponse(TransportResponse.Empty.INSTANCE);
}

In the replica phase, shardOperationOnReplica parses the document source, applies any dynamic mapping additions, and finally performs the indexing (operation.execute). On both the primary and the replicas, once a whole BulkShardRequest (i.e. the set of items that belong to one shard) has been processed, the translog is written out.

To recap: after indexing completes, if the request asked for a refresh it is executed immediately, and an immediate sync of the translog to storage is honored in the same way.

