mongodb源码分析(二十三)mongos chunk的迁移

1. 迁出chunk的A端首先记录该chunk内所有数据的存储位置(DiskLoc).

2. 通知远端B,让其执行_recvChunkStart开始chunk的迁移.

3. B端首先从A端system.indexes读取索引,并将其插入到自身上.

4. B端读取A端数据,并插入到自己的collection.

5. B端重放(replay)克隆期间A端在该chunk上产生的删除、插入和更新操作.

6. 在B端读取数据期间,A端不断通过_recvChunkStatus向B端询问是否已经处理完毕、可以提交.

7. B端进入steady状态,并在A端的状态查询中告知自己已经准备好,等待提交数据.

8. A端通知B端提交数据.

9. B端提交数据.

10. A端更新configserver配置数据,更新自己的chunkmanager.

11. A端清空自己记录的迁移数据位置,清空已经移动到了B端的数据.


bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {    // 1. parse options    // 2. make sure my view is complete and lock    // 3. start migrate    //    in a read lock, get all DiskLoc and sort so we can do as little seeking as possible    //    tell to start transferring    // 4. pause till migrate caught up    // 5. LOCK    //    a) update my config, essentially locking    //    b) finish migrate    //    c) update config server    //    d) logChange to config server    // 6. wait for all current cursors to expire    // 7. remove data locally    // 1.参数检测部分    string ns = cmdObj.firstElement().str();    string to = cmdObj["to"].str();    string from = cmdObj["from"].str(); // my public address, a tad redundant, but safe    // if we do a w=2 after very write    bool secondaryThrottle = cmdObj["secondaryThrottle"].trueValue();    if ( secondaryThrottle && ! anyReplEnabled() )        secondaryThrottle = false;    BSONObj min  = cmdObj["min"].Obj();    BSONObj max  = cmdObj["max"].Obj();    BSONElement shardId = cmdObj["shardId"];    BSONElement maxSizeElem = cmdObj["maxChunkSizeBytes"];    const long long maxChunkSize = maxSizeElem.numberLong(); // in bytes    if ( ! shardingState.enabled() ) {        string configdb = cmdObj["configdb"].String();        shardingState.enable( configdb );        configServer.init( configdb );    }    MoveTimingHelper timing( "from" , ns , min , max , 6 /* steps */ , errmsg );    // Make sure we're as up-to-date as possible with shard information    // This catches the case where we had to previously changed a shard's host by    // removing/adding a shard with the same name    Shard::reloadShardInfo();    // So 2.2 mongod can interact with 2.0 mongos, mongod needs to handle either a conn    // string or a shard in the to/from fields.  The Shard constructor handles this,    // eventually we should break the compatibility.    
Shard fromShard( from );    Shard toShard( to );    timing.done(1);    // 2.    //分布式锁锁ns命令的collection    DistributedLock lockSetup( ConnectionString( shardingState.getConfigServer() , ConnectionString::SYNC ) , ns );    dist_lock_try dlk;    dlk = dist_lock_try( &lockSetup , (string)"migrate-" + min.toString() );;    BSONObj chunkInfo = BSON("min" << min << "max" << max << "from" << fromShard.getName() << "to" << toShard.getName() );    configServer.logChange( "moveChunk.start" , ns , chunkInfo );    ShardChunkVersion maxVersion;    string myOldShard;    {        scoped_ptr<ScopedDbConnection> conn(ScopedDbConnection::getInternalScopedDbConnection(shardingState.getConfigServer()) );        BSONObj x;        BSONObj currChunk;        x = conn->get()->findOne( ShardNS::chunk,Query( BSON( "ns" << ns ) ).sort( BSON( "lastmod" << -1 ) ) );        currChunk = conn->get()->findOne( ShardNS::chunk , shardId.wrap( "_id" ) );//currChunk为当前要move的chunk        maxVersion = ShardChunkVersion::fromBSON( x, "lastmod" );        myOldShard = currChunk["shard"].String();        conn->done();        BSONObj currMin = currChunk["min"].Obj();        BSONObj currMax = currChunk["max"].Obj();        if ( myOldShard != fromShard.getName() )            return false;        if ( maxVersion < shardingState.getVersion( ns ) )            return false;        // since this could be the first call that enable sharding we also make sure to have the chunk manager up to date        shardingState.gotShardName( myOldShard );        // Using the maxVersion we just found will enforce a check - if we use zero version,        // it's possible this shard will be *at* zero version from a previous migrate and        // no refresh will be done        // TODO: Make this less fragile        ShardChunkVersion shardVersion = maxVersion;//更新版本信息,加载chunkmanager        shardingState.trySetVersion( ns , shardVersion /* will return updated */ );    }    timing.done(2);    // 3.    
ShardChunkManagerPtr chunkManager = shardingState.getShardChunkManager( ns );    BSONObj shardKeyPattern = chunkManager->getKey();    MigrateStatusHolder statusHolder( ns , min , max , shardKeyPattern );    {        // this gets a read lock, so we know we have a checkpoint for mods//这里是存储当前这个chunk的数据的地址,后面方便来自B端数据的读取操作,记录地址使用的是一个set,因为需要排序.        if ( ! migrateFromStatus.storeCurrentLocs( maxChunkSize , errmsg , result ) )            return false;        scoped_ptr<ScopedDbConnection> connTo(                ScopedDbConnection::getScopedDbConnection( toShard.getConnString() ) );        BSONObj res;        bool ok;        ok = connTo->get()->runCommand( "admin" ,//这里是通知B端开始chunk的迁移.                                            BSON( "_recvChunkStart" << ns <<                                                  "from" << fromShard.getConnString() <<                                                  "min" << min <<                                                  "max" << max <<                                                  "shardKeyPattern" << shardKeyPattern <<                                                  "configServer" << configServer.modelServer() <<                                                  "secondaryThrottle" << secondaryThrottle                                                  ) ,                                            res );        connTo->done();    }

bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {    if ( migrateStatus.getActive() ) {//一个chunk迁移已经开始了        errmsg = "migrate already in progress";        return false;    }    if ( ! configServer.ok() )        configServer.init( cmdObj["configServer"].String() );    migrateStatus.prepare();    migrateStatus.ns = cmdObj.firstElement().String();    migrateStatus.from = cmdObj["from"].String();    migrateStatus.min = cmdObj["min"].Obj().getOwned();    migrateStatus.max = cmdObj["max"].Obj().getOwned();    migrateStatus.shardKeyPattern = cmdObj["shardKeyPattern"].Obj().getOwned();    migrateStatus.secondaryThrottle = cmdObj["secondaryThrottle"].trueValue();    if ( migrateStatus.secondaryThrottle && ! anyReplEnabled() )        migrateStatus.secondaryThrottle = false;    boost::thread m( migrateThread );//开启一个线程专门负责迁移工作    result.appendBool( "started" , true );    return true;}

void _go() {    slaveCount = ( getSlaveCount() / 2 ) + 1;    scoped_ptr<ScopedDbConnection> connPtr(ScopedDbConnection::getScopedDbConnection( from ) );    ScopedDbConnection& conn = *connPtr;//建立来自from的连接    conn->getLastError(); // just test connection    {        // 0. copy system.namespaces entry if collection doesn't already exist        Client::WriteContext ctx( ns );        // Only copy if ns doesn't already exist        if ( ! nsdetails( ns.c_str() ) ) {//本地collection不存在则建立一个            string system_namespaces = NamespaceString( ns ).db + ".system.namespaces";            BSONObj entry = conn->findOne( system_namespaces, BSON( "name" << ns ) );            if ( entry["options"].isABSONObj() ) {                string errmsg;                if ( ! userCreateNS( ns.c_str(), entry["options"].Obj(), errmsg, true, 0 ) )                    warning() << "failed to create collection with options: " << errmsg                              << endl;            }        }    }    {                        // 1. copy indexes           vector<BSONObj> all;        {  //建立关于这个collection的所有索引            auto_ptr<DBClientCursor> indexes = conn->getIndexes( ns );            while ( indexes->more() ) {                all.push_back( indexes->next().getOwned() );            }        }//向.system.indexes插入一条数据将自动建立相应的索引,这个可详见插入数据部分        for ( unsigned i=0; i<all.size(); i++ ) {            BSONObj idx = all[i];            Client::WriteContext ct( ns );            string system_indexes = cc().database()->name + ".system.indexes";            theDataFileMgr.insertAndLog( system_indexes.c_str() , idx, true /* flag fromMigrate in oplog */ );        }    }    {        // 2. 
delete any data already in range        RemoveSaver rs( "moveChunk" , ns , "preCleanup" );        long long num = Helpers::removeRange( ns ,                                              min ,                                              max ,                                              findShardKeyIndexPattern_unlocked( ns , shardKeyPattern ) ,                                               false , /*maxInclusive*/                                              secondaryThrottle , /* secondaryThrottle */                                              cmdLine.moveParanoia ? &rs : 0 , /*callback*/                                              true ); /* flag fromMigrate in oplog */    }    {        // 3. initial bulk clone        state = CLONE;        while ( true ) {//从from端克隆数据,复制到自己的数据库里            BSONObj res;//这里从A端读取数据            if ( ! conn->runCommand( "admin" , BSON( "_migrateClone" << 1 ) , res ) ) {  // gets array of objects to copy, in disk order                state = FAIL;                conn.done();                return;            }//实际的数据            BSONObj arr = res["objects"].Obj();            int thisTime = 0;            BSONObjIterator i( arr );            while( i.more() ) {                BSONObj o =;                {                    PageFaultRetryableSection pgrs;                    while ( 1 ) {                        try {                            Lock::DBWrite lk( ns );                            Helpers::upsert( ns, o, true );//数据插入本地                            break;                        }                        catch ( PageFaultException& e ) {                            e.touch();                        }                    }                }                thisTime++;                numCloned++;                clonedBytes += o.objsize();                if ( secondaryThrottle ) {//设定了这个参数时需要等待至少两个secondary端插入了数据,才能继续                    if ( ! 
waitForReplication( cc().getLastOp(), 2, 60 /* seconds to wait */ ) ) {                    }                }            }            if ( thisTime == 0 )                break;        }    }    // if running on a replicated system, we'll need to flush the docs we cloned to the secondaries    ReplTime lastOpApplied = cc().getLastOp().asDate();    {        // 4. do bulk of mods        state = CATCHUP;//之前从from复制了数据,但是复制数据期间可能这个chunk的数据        while ( true ) {//被更改了,所以这里从from端传递更改的信息过来,然后应用到本地            BSONObj res;//更改信息中            if ( ! conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ) {                state = FAIL;                conn.done();                return;            }            if ( res["size"].number() == 0 )                break;            apply( res , &lastOpApplied );//应用来自A端的修改操作            const int maxIterations = 3600*50;            int i;            for ( i=0;i<maxIterations; i++) {                if ( state == ABORT ) {                    timing.note( "aborted" );                    return;                }                if ( opReplicatedEnough( lastOpApplied ) )                    break;                                sleepmillis( 20 );            }            if ( i == maxIterations ) {                conn.done();                state = FAIL;                return;            }         }    }    {         // pause to wait for replication        // this will prevent us from going into critical section until we're ready        Timer t;        while ( t.minutes() < 600 ) {            if ( flushPendingWrites( lastOpApplied ) )                break;            sleepsecs(1);        }    }    {        // 5. wait for commit        state = STEADY;//等待来自A端的命令,然后提交数据,等待的同时需要不断的replayA端的修改操作        while ( state == STEADY || state == COMMIT_START ) {            BSONObj res;            if ( ! 
conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ) {                state = FAIL;                conn.done();                return;            }            if ( res["size"].number() > 0 && apply( res , &lastOpApplied ) )                continue;            if ( state == ABORT ) {                timing.note( "aborted" );                return;            }            if ( state == COMMIT_START ) {                if ( flushPendingWrites( lastOpApplied ) )                    break;            }            sleepmillis( 10 );        }        if ( state == FAIL ) {            return;        }        timing.done(5);    }    state = DONE;    conn.done();}

/**
 * Donor-side batch producer: appends the next batch of chunk documents to
 * `result` as the array "objects", consuming entries from `_cloneLocs`.
 *
 * A batch ends when `_cloneLocs` is exhausted or when adding another document
 * would bring the array within 1024 bytes of BSONObjMaxUserSize.
 *
 * @param errmsg  set when no migration is active or the namespace is missing.
 * @param result  receives the "objects" array.
 * @return false if no migration is active or the namespace cannot be found,
 *         true otherwise.
 */
bool clone( string& errmsg , BSONObjBuilder& result ) {
    // A migration must already have been started.
    if ( ! _getActive() ) {
        errmsg = "not active";
        return false;
    }

    ElapsedTracker tracker (128, 10); // same as ClientCursor::_yieldSometimesTracker

    int allocSize;
    {
        Client::ReadContext ctx( _ns );
        NamespaceDetails *d = nsdetails( _ns.c_str() );
        if ( ! d ) {
            // Avoid dereferencing a null NamespaceDetails.
            errmsg = "namespace not found: " + _ns;
            return false;
        }
        scoped_spinlock lk( _trackerLocks );
        // Estimate in 64 bits and clamp before narrowing, so a large chunk
        // cannot wrap the int and yield a bogus (possibly negative) size.
        const long long estimate =
            ( 12LL + d->averageObjectSize() ) * static_cast<long long>( _cloneLocs.size() );
        allocSize = static_cast<int>(
            std::min<long long>( BSONObjMaxUserSize , estimate ) );
    }
    BSONArrayBuilder a (allocSize);

    while ( 1 ) {
        bool filledBuffer = false;

        auto_ptr<LockMongoFilesShared> fileLock;
        Record* recordToTouch = 0;

        {
            Client::ReadContext ctx( _ns );
            scoped_spinlock lk( _trackerLocks );

            // Walk the previously recorded locations of the chunk's documents.
            set<DiskLoc>::iterator i = _cloneLocs.begin();
            for ( ; i!=_cloneLocs.end(); ++i ) {
                if (tracker.intervalHasElapsed()) // should I yield?
                    break;

                DiskLoc dl = *i;
                Record* r = dl.rec();
                if ( ! r->likelyInPhysicalMemory() ) {
                    // Remember the record so it can be touched after the locks
                    // taken in this scope are released.
                    fileLock.reset( new LockMongoFilesShared() );
                    recordToTouch = r;
                    break;
                }

                BSONObj o = dl.obj();

                // use the builder size instead of accumulating 'o's size so that we take into consideration
                // the overhead of BSONArray indices
                if ( a.len() + o.objsize() + 1024 > BSONObjMaxUserSize ) {
                    filledBuffer = true; // break out of outer while loop
                    break;
                }

                a.append( o );
            }

            // Drop the locations already copied; `i` is the first one not copied.
            _cloneLocs.erase( _cloneLocs.begin() , i );

            if ( _cloneLocs.empty() || filledBuffer )
                break;
        }

        if ( recordToTouch ) {
            // it's safe to touch here because we have a LockMongoFilesShared
            // we can't do it where we get the lock because we would have to unlock the main readlock and the _trackerLocks
            // simpler to handle this out here
            recordToTouch->touch();
            recordToTouch = 0;
        }
    }

    // The array built here is what gets sent to the receiving shard.
    result.appendArray( "objects" , a.arr() );
    return true;
}

bool transferMods( string& errmsg , BSONObjBuilder& b ) {    if ( ! _getActive() )        return false;    long long size = 0;    Client::ReadContext cx( _ns );//将deleted和reload的操作日志发送给B端    xfer( &_deleted , b , "deleted" , size , false );    xfer( &_reload , b , "reload" , size , true );    b.append( "size" , size );    return true;}void xfer( list<BSONObj> * l , BSONObjBuilder& b , const char * name , long long& size , bool explode ) {    const long long maxSize = 1024 * 1024;    if ( l->size() == 0 || size > maxSize )        return;    BSONArrayBuilder arr(b.subarrayStart(name));    list<BSONObj>::iterator i = l->begin();    while ( i != l->end() && size < maxSize ) {        BSONObj t = *i;        if ( explode ) {            BSONObj it;//再次查看是否又有修改            if ( Helpers::findById( cc() , _ns.c_str() , t, it ) ) {                arr.append( it );                size += it.objsize();            }        }        else             arr.append( t );        i = l->erase( i );        size += t.objsize();    }    arr.done();}

void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b, bool fromMigrate) {    if ( replSettings.master )         _logOp(opstr, ns, 0, obj, patt, b, fromMigrate);    logOpForSharding( opstr , ns , obj , patt );}void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ) {    migrateFromStatus.logOp( opstr , ns , obj , patt );}void logOp( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ) {    if ( ! _getActive() )//chunk迁移时记录删除以及插入等动作,以达到同步的目的        return;    if ( _ns != ns )        return;    // no need to log if this is not an insertion, an update, or an actual deletion    // note: opstr 'db' isn't a deletion but a mention that a database exists (for replication    // machinery mostly)    char op = opstr[0];    if ( op == 'n' || op =='c' || ( op == 'd' && opstr[1] == 'b' ) )        return;    BSONElement ide;    if ( patt )        ide = patt->getField( "_id" );    else        ide = obj["_id"];    BSONObj it;    switch ( opstr[0] ) {    case 'd': {        if ( getThreadName() == cleanUpThreadName ) {            // we don't want to xfer things we're cleaning            // as then they'll be deleted on TO            // which is bad            return;        }        // can't filter deletes :(        _deleted.push_back( ide.wrap() );        _memoryUsed += ide.size() + 5;        return;    }    case 'i':        it = obj;        break;    case 'u':        if ( ! Helpers::findById( cc() , _ns.c_str() , ide.wrap() , it ) ) {            return;        }        break;    }    if ( ! isInRange( it , _min , _max ) )        return;    _reload.push_back( ide.wrap() );    _memoryUsed += ide.size() + 5;}

    // 4.    for ( int i=0; i<86400; i++ ) { // don't want a single chunk move to take more than a day        sleepsecs( 1 );        scoped_ptr<ScopedDbConnection> conn(                ScopedDbConnection::getScopedDbConnection( toShard.getConnString() ) );        BSONObj res;        bool ok;        //查看接收端数据接收信息,是否已经完成迁移等待提交了        ok = conn->get()->runCommand( "admin" , BSON( "_recvChunkStatus" << 1 ) , res );        res = res.getOwned();        conn->done();        if ( ! ok || res["state"].String() == "fail" ) {            result.append( "cause" , res );            return false;        }        if ( res["state"].String() == "steady" )//等待ready状态            break;//迁移用内存太多,告知B端终止动作        if ( migrateFromStatus.mbUsed() > (500 * 1024 * 1024) ) {            // this is too much memory for us to use for this            // so we're going to abort the migrate            scoped_ptr<ScopedDbConnection> conn(ScopedDbConnection::getScopedDbConnection( toShard.getConnString() ) );            BSONObj res;            conn->get()->runCommand( "admin" , BSON( "_recvChunkAbort" << 1 ) , res );            res = res.getOwned();            conn->done();            result.appendBool( "split" , true );            return false;        }        killCurrentOp.checkForInterrupt();    }    // 5.    
{//真正的提交部分        // 5.a        // we're under the collection lock here, so no other migrate can change maxVersion or ShardChunkManager state        migrateFromStatus.setInCriticalSection( true );        ShardChunkVersion currVersion = maxVersion;        ShardChunkVersion myVersion = currVersion;        myVersion.incMajor();//chunk的迁移造成了主version增加1        {            Lock::DBWrite lk( ns );            verify( myVersion > shardingState.getVersion( ns ) );            // bump the chunks manager's version up and "forget" about the chunk being moved            // this is not the commit point but in practice the state in this shard won't until the commit it done            shardingState.donateChunk( ns , min , max , myVersion );//本地chunkmanager移出这个chunk        }        // 5.b        // we're under the collection lock here, too, so we can undo the chunk donation because no other state change        // could be ongoing        {            BSONObj res;            scoped_ptr<ScopedDbConnection> connTo(ScopedDbConnection::getScopedDbConnection( toShard.getConnString() ) );            bool ok;//告知B端提交数据            ok = connTo->get()->runCommand( "admin" ,BSON( "_recvChunkCommit" << 1 ) ,res );            connTo->done();            if ( ! 
ok ) {//数据提交失败,这里将之前移出的那个chunk添加回去                Lock::DBWrite lk( ns );                // revert the chunk manager back to the state before "forgetting" about the chunk                shardingState.undoDonateChunk( ns , min , max , currVersion );                result.append( "cause" , res );                return false;            }        }        // 5.c        // version at which the next highest lastmod will be set        // if the chunk being moved is the last in the shard, nextVersion is that chunk's lastmod        // otherwise the highest version is from the chunk being bumped on the FROM-shard        ShardChunkVersion nextVersion;        // we want to go only once to the configDB but perhaps change two chunks, the one being migrated and another        // local one (so to bump version for the entire shard)        // we use the 'applyOps' mechanism to group the two updates and make them safer        // TODO pull config update code to a module        BSONObjBuilder cmdBuilder;        //更新configserver chunks信息,新产出了一个chunk当然需要修改chunks这个collection了        BSONArrayBuilder updates( cmdBuilder.subarrayStart( "applyOps" ) );        {            // update for the chunk being moved            BSONObjBuilder op;            op.append( "op" , "u" );            op.appendBool( "b" , false /* no upserting */ );            op.append( "ns" , ShardNS::chunk );            BSONObjBuilder n( op.subobjStart( "o" ) );            n.append( "_id" , Chunk::genID( ns , min ) );            myVersion.addToBSON( n, "lastmod" );            n.append( "ns" , ns );            n.append( "min" , min );            n.append( "max" , max );            n.append( "shard" , toShard.getName() );            n.done();            BSONObjBuilder q( op.subobjStart( "o2" ) );            q.append( "_id" , Chunk::genID( ns , min ) );            q.done();            updates.append( op.obj() );        }        nextVersion = myVersion;        // if we have chunks left on the FROM shard, update the version of 
one of them as well        // we can figure that out by grabbing the chunkManager installed on 5.a        // TODO expose that manager when installing it        ShardChunkManagerPtr chunkManager = shardingState.getShardChunkManager( ns );        if( chunkManager->getNumChunks() > 0 ) {            // get another chunk on that shard            BSONObj lookupKey;            BSONObj bumpMin, bumpMax;            do {                chunkManager->getNextChunk( lookupKey , &bumpMin , &bumpMax );                lookupKey = bumpMin;            }            while( bumpMin == min );            BSONObjBuilder op;            op.append( "op" , "u" );            op.appendBool( "b" , false );            op.append( "ns" , ShardNS::chunk );            nextVersion.incMinor();  // same as used on donateChunk            BSONObjBuilder n( op.subobjStart( "o" ) );            n.append( "_id" , Chunk::genID( ns , bumpMin ) );            nextVersion.addToBSON( n, "lastmod" );            n.append( "ns" , ns );            n.append( "min" , bumpMin );            n.append( "max" , bumpMax );            n.append( "shard" , fromShard.getName() );            n.done();            BSONObjBuilder q( op.subobjStart( "o2" ) );            q.append( "_id" , Chunk::genID( ns , bumpMin  ) );            q.done();            updates.append( op.obj() );        }        updates.done();        BSONArrayBuilder preCond( cmdBuilder.subarrayStart( "preCondition" ) );        {            BSONObjBuilder b;            b.append( "ns" , ShardNS::chunk );            b.append( "q" , BSON( "query" << BSON( "ns" << ns ) << "orderby" << BSON( "lastmod" << -1 ) ) );            {                BSONObjBuilder bb( b.subobjStart( "res" ) );                // TODO: For backwards compatibility, we can't yet require an epoch here                bb.appendTimestamp( "lastmod", maxVersion.toLong() );                bb.done();            }            preCond.append( b.obj() );        }        preCond.done();        BSONObj cmd = 
cmdBuilder.obj();        bool ok = false;        BSONObj cmdResult;//执行更新命令        scoped_ptr<ScopedDbConnection> conn(                    ScopedDbConnection::getInternalScopedDbConnection(                            shardingState.getConfigServer() ) );            ok = conn->get()->runCommand( "config" , cmd , cmdResult );            conn->done();        migrateFromStatus.setInCriticalSection( false );        // 5.d        configServer.logChange( "moveChunk.commit" , ns , chunkInfo );    }    migrateFromStatus.done();    {//最后删除本地的这个chunk的数据        // 6.        OldDataCleanup c;        c.secondaryThrottle = secondaryThrottle;        c.ns = ns;        c.min = min.getOwned();        c.max = max.getOwned();        c.shardKeyPattern = shardKeyPattern.getOwned();        ClientCursor::find( ns , c.initial );        if ( c.initial.size() ) {            boost::thread t( boost::bind( &cleanupOldData , c ) );        }        else {            // 7.            c.doRemove();        }    }    return true;}

原文链接: mongodb源码分析(二十三)mongos chunk的迁移

作者: yhjj0108,杨浩
