Zookeeper源码解析——快速选举流程

来源:互联网 发布:巨人网络a股上市 编辑:程序博客网 时间:2024/06/05 20:07

一 解析过程

由于没有在zookeeper官网上看到关于该算法的相关介绍(也可能是没仔细搜,毕竟有点麻烦),于是就参考了网上关于快速选举的介绍,再结合zk 3.5.0的源码进行验证和补充。

二 快速选举

概念

  • 服务器ID: 即配置的myId

    id越大,选举时权重越高

  • 数据id:即zxid, 这里指本地最新snapshot的id

    id越大说明数据越新,选举时权重越高

  • 逻辑时钟:logicalclock

    含义是投票的轮次,同轮投票,时钟相同,依轮次递增,如果收到低于当前轮次的投票结果,该投票无效,需更新到当前轮次和当前的投票结果。

  • 选举状态

    1. LOOKING,竞选状态。
    2. FOLLOWING,随从状态,同步leader状态,参与投票。
    3. OBSERVING,观察状态,同步leader状态,不参与投票。
    4. LEADING,领导者状态。
  • 投票内容

    选举人ID
    选举人数据ID
    选举人选举轮数
    选举人选举状态
    推举人ID
    推举人选举轮数

  • 选举流程图
    根据源码梳理出流程图,请结合下面的源码实现去理解
    fle流程图

三 源码实现

3.1 类图

FastLeaderElection 类图

FastLeaderElection是一个高内聚的Election接口实现。通过start,shutdown启动或停止选举,通过lookForLeader参与选举流程。借助Messenger对投票内容进行处理,最终借由QuorumCnxManager进行数据传输。

3.2 源码说明

Constructor & init

    public FastLeaderElection(QuorumPeer self, QuorumCnxManager manager){        this.stop = false;        this.manager = manager;        starter(self, manager);    }    private void starter(QuorumPeer self, QuorumCnxManager manager) {        this.self = self;        proposedLeader = -1;        proposedZxid = -1;        sendqueue = new LinkedBlockingQueue<ToSend>();        recvqueue = new LinkedBlockingQueue<Notification>();        this.messenger = new Messenger(manager);    }    Messenger(QuorumCnxManager manager) {        this.ws = new WorkerSender(manager);        this.wsThread = new Thread(this.ws,                "WorkerSender[myid=" + self.getId() + "]");        this.wsThread.setDaemon(true);        this.wr = new WorkerReceiver(manager);        this.wrThread = new Thread(this.wr,                "WorkerReceiver[myid=" + self.getId() + "]");        this.wrThread.setDaemon(true);    }

start

    //FastLeaderElection.start    public void start() {        this.messenger.start();    }    //Messenger.start, messager会处理消息的收发,socket通信和收发包则在QuorumCnxManager类里。    void start(){        //WorkerSender        this.wsThread.start();        //WorderReceiver        this.wrThread.start();    }

LookForLeader

lookForLeader的调用是在QuorumPeer.run时,如果服务在LOOKING状态,则不断参与选举,直至选出Leader,切换成其他状态。

/**
 * Runs one leader election (excerpt: the article elided the setup code at
 * the "..." marker). While this peer is LOOKING and has not been stopped,
 * it handles one received notification per loop iteration and returns the
 * final vote as soon as the election is decided.
 *
 * @return the vote the election ended with, or {@code null} when the loop
 *         exits because the peer is no longer LOOKING or {@code stop} is set
 * @throws InterruptedException declared by the method; presumably raised by
 *         the blocking queue operations
 */
public Vote lookForLeader() throws InterruptedException {
    // "..." below is an elision made by the article, not Java syntax.
    ...
    // Votes received during the current election round, keyed by sender id.
    HashMap<Long, Vote> recvset = new HashMap<Long, Vote>();
    // Votes reported by peers that are already FOLLOWING or LEADING.
    HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>();
    int notTimeout = finalizeWait;
    synchronized(this){
        // Enter a new election round: logicalclock + 1.
        logicalclock.incrementAndGet();
        // Initialise our proposal. Per the article, getInitId() nominates
        // this server itself as the initial leader (helper not shown here).
        updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
    }
    // Broadcast the current proposal first.
    sendNotifications();
    // Keep looping while the state is still LOOKING (and we were not stopped).
    while ((self.getPeerState() == ServerState.LOOKING) &&
            (!stop)){
        /*
         * Remove next notification from queue, times out after 2 times
         * the termination time.
         * Each loop iteration handles one received notification.
         */
        Notification n = recvqueue.poll(notTimeout,
                TimeUnit.MILLISECONDS);
        /*
         * Sends more notifications if haven't received enough.
         * Otherwise processes new notification.
         */
        if(n == null){
            if(manager.haveDelivered()){
                // Per the article: every pending outgoing message has been
                // delivered, so broadcast our vote again.
                sendNotifications();
            } else {
                // Otherwise call connectAll(); presumably this (re)connects
                // to the other peers - helper not shown here.
                manager.connectAll();
            }
            /*
             * Exponential backoff: double the poll timeout, capped at
             * maxNotificationInterval.
             */
            int tmpTimeOut = notTimeout*2;
            notTimeout = (tmpTimeOut < maxNotificationInterval?
                    tmpTimeOut : maxNotificationInterval);
            LOG.info("Notification time out: " + notTimeout);
        }
        else if (self.getCurrentAndNextConfigVoters().contains(n.sid)) {
            /*
             * Only proceed if the vote comes from a replica in the current or next
             * voting view, i.e. only votes sent by voters are processed.
             */
            switch (n.state) {
            case LOOKING:
                // If notification > current, replace and send messages out
                if (n.electionEpoch > logicalclock.get()) {
                    logicalclock.set(n.electionEpoch);
                    recvset.clear();
                    // totalOrderPredicate decides whether the received vote
                    // beats the one it is compared against (here: our own
                    // initial proposal); the ordering matches the rules in
                    // section 2 of the article (fast election - concepts).
                    if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                            getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
                        // The received vote wins: adopt it as our proposal.
                        updateProposal(n.leader, n.zxid, n.peerEpoch);
                    } else {
                        // The received vote loses: reset to our own initial proposal.
                        updateProposal(getInitId(),
                                getInitLastLoggedZxid(),
                                getPeerEpoch());
                    }
                    sendNotifications();
                } else if (n.electionEpoch < logicalclock.get()) {
                    // The sender is in an older round: ignore its vote. This
                    // break leaves the switch, so the vote is not recorded.
                    break;
                } else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) {
                    // Same round and the received vote beats our current
                    // proposal: adopt it and broadcast the change.
                    updateProposal(n.leader, n.zxid, n.peerEpoch);
                    sendNotifications();
                }
                recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
                // Check whether the election can end with our current proposal.
                if (termPredicate(recvset,
                        new Vote(proposedLeader, proposedZxid,
                                logicalclock.get(), proposedEpoch))) {
                    // Verify if there is any change in the proposed leader:
                    // if a better vote still arrives, the election is not over.
                    while((n = recvqueue.poll(finalizeWait,
                            TimeUnit.MILLISECONDS)) != null){
                        if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                                proposedLeader, proposedZxid, proposedEpoch)){
                            // Put the better vote back so the outer loop handles it.
                            recvqueue.put(n);
                            break;
                        }
                    }
                    /*
                     * This predicate is true once we don't read any new
                     * relevant message from the reception queue
                     */
                    if (n == null) {
                        // Election finished: update the peer state.
                        self.setPeerState((proposedLeader == self.getId()) ?
                                ServerState.LEADING: learningState());
                        Vote endVote = new Vote(proposedLeader,
                                proposedZxid, proposedEpoch);
                        leaveInstance(endVote);
                        return endVote;
                    }
                }
                break;
            case OBSERVING:
                // Notifications from observers are only logged.
                LOG.debug("Notification from observer: " + n.sid);
                break;
            case FOLLOWING:
            case LEADING:
                /*
                 * Consider all notifications from the same epoch
                 * together.
                 */
                if(n.electionEpoch == logicalclock.get()){
                    recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
                    if(termPredicate(recvset, new Vote(n.leader,
                                    n.zxid, n.electionEpoch, n.peerEpoch, n.state))
                                    && checkLeader(outofelection, n.leader, n.electionEpoch)) {
                        self.setPeerState((n.leader == self.getId()) ?
                                ServerState.LEADING: learningState());
                        Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch);
                        leaveInstance(endVote);
                        return endVote;
                    }
                }
                /*
                 * Before joining an established ensemble, verify that
                 * a majority are following the same leader.
                 * Only peer epoch is used to check that the votes come
                 * from the same ensemble. This is because there is at
                 * least one corner case in which the ensemble can be
                 * created with inconsistent zxid and election epoch
                 * info. However, given that only one ensemble can be
                 * running at a single point in time and that each
                 * epoch is used only once, using only the epoch to
                 * compare the votes is sufficient.
                 *
                 * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
                 * In short (per the article): do the followers all follow
                 * the same leader, and is that leader really online?
                 * (checkLeader is not shown here.)
                 */
                outofelection.put(n.sid, new Vote(n.leader,
                        IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state));
                if (termPredicate(outofelection, new Vote(n.leader,
                        IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state))
                        && checkLeader(outofelection, n.leader, IGNOREVALUE)) {
                    synchronized(this){
                        // Adopt the sender's election epoch before changing state.
                        logicalclock.set(n.electionEpoch);
                        self.setPeerState((n.leader == self.getId()) ?
                                ServerState.LEADING: learningState());
                    }
                    Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch);
                    leaveInstance(endVote);
                    return endVote;
                }
                break;
            default:
                LOG.warn("Notification state unrecoginized: " + n.state
                      + " (n.state), " + n.sid + " (n.sid)");
                break;
            }
        } else {
            LOG.warn("Ignoring notification from non-cluster member " + n.sid);
        }
    }
    return null;
}

/**
 * Check if a vote (server id, zxid, epoch) succeeds our
 * current vote.
 *
 * @param newId    leader id proposed by the received vote
 * @param newZxid  zxid carried by the received vote
 * @param newEpoch epoch carried by the received vote
 * @param curId    leader id of the vote it is compared against
 * @param curZxid  zxid of the vote it is compared against
 * @param curEpoch epoch of the vote it is compared against
 * @return {@code true} if the received vote wins the comparison;
 *         always {@code false} when the quorum verifier reports weight 0
 *         for {@code newId}
 */
protected boolean totalOrderPredicate(long newId, long newZxid, long newEpoch, long curId, long curZxid, long curEpoch) {
    LOG.debug("id: " + newId + ", proposed id: " + curId + ", zxid: 0x" +
            Long.toHexString(newZxid) + ", proposed zxid: 0x" + Long.toHexString(curZxid));
    // A candidate with weight 0 can never win the comparison.
    if(self.getQuorumVerifier().getWeight(newId) == 0){
        return false;
    }
    /*
     * We return true if one of the following three cases hold:
     * 1- New epoch is higher
     * 2- New epoch is the same as current epoch, but new zxid is higher
     * 3- New epoch is the same as current epoch, new zxid is the same
     *  as current zxid, but server id is higher.
     */
    return ((newEpoch > curEpoch) ||
            ((newEpoch == curEpoch) &&
            ((newZxid > curZxid) || ((newZxid == curZxid) && (newId > curId)))));
}

/*
 * Decides whether the election can end: collects the servers whose
 * recorded vote equals the given vote and asks the tracker whether they
 * satisfy every registered quorum verifier (hasAllQuorums), rather than
 * requiring every server to agree.
 */
private boolean termPredicate(HashMap<Long, Vote> votes, Vote vote) {
    SyncedLearnerTracker voteSet = new SyncedLearnerTracker();
    voteSet.addQuorumVerifier(self.getQuorumVerifier());
    // Also register the last-seen verifier when its version is newer than
    // the current one.
    if (self.getLastSeenQuorumVerifier() != null
            && self.getLastSeenQuorumVerifier().getVersion() > self
                    .getQuorumVerifier().getVersion()) {
        voteSet.addQuorumVerifier(self.getLastSeenQuorumVerifier());
    }
    /*
     * First make the views consistent. Sometimes peers will have different
     * zxids for a server depending on timing.
     */
    for (Map.Entry<Long, Vote> entry : votes.entrySet()) {
        // Count an ack from every server whose vote equals the given vote.
        if (vote.equals(entry.getValue())) {
            voteSet.addAck(entry.getKey());
        }
    }
    return voteSet.hasAllQuorums();
}
原创粉丝点击