leveldb:write(如何处理并发写操作)

来源：互联网发布：淘宝怎样登陆子账号编辑：程序博客网时间：2024/06/08 06:51

Put与Delete操作

// Convenience wrapper: stores the mapping key->value by packaging the single
// operation into a WriteBatch and handing it to Write().
Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
  WriteBatch single_put;
  single_put.Put(key, value);
  return Write(opt, &single_put);
}

// Convenience wrapper: removes `key` by packaging the single deletion into a
// WriteBatch and handing it to Write().
Status DB::Delete(const WriteOptions& opt, const Slice& key) {
  WriteBatch single_delete;
  single_delete.Delete(key);
  return Write(opt, &single_delete);
}

LevelDB对外暴露的写接口包括Put，Delete和Write，其中Write需要WriteBatch作为参数，而Put和Delete首先就是将当前的操作封装到一个WriteBatch对象，并调用Write接口。
opt是写选项，从上面代码并没有看出处理并发的逻辑，其实对于多线程的处理是在DBImpl::Write函数中完成

WriteBatch

WriteBatch可以记录许多个操作，每一个操作代表着要插入或删除相应数据

// A WriteBatch buffers a sequence of Put/Delete updates so that they can be
// handed to the database together through a single Write() call.
class WriteBatch {
 public:
  WriteBatch();
  ~WriteBatch();

  // Store the mapping "key->value" in the database.
  void Put(const Slice& key, const Slice& value);

  // If the database contains a mapping for "key", erase it.  Else do nothing.
  void Delete(const Slice& key);

  // Clear all updates buffered in this batch.
  void Clear();

  // Support for iterating over the contents of a batch.
  // Callers subclass Handler and pass it to Iterate(); presumably Put/Delete
  // are invoked once per buffered record, in insertion order (the
  // implementation of Iterate() is not shown here -- confirm).
  class Handler {
   public:
    virtual ~Handler();
    virtual void Put(const Slice& key, const Slice& value) = 0;
    virtual void Delete(const Slice& key) = 0;
  };
  Status Iterate(Handler* handler) const;

 private:
  // WriteBatchInternal is granted access to rep_ so that internal helpers do
  // not have to be part of the public interface above.
  friend class WriteBatchInternal;

  // The only data member: a single string that holds every buffered
  // operation.
  std::string rep_;
};

每一个WriteBatch都是以一个固定长度的头部开始，然后后面接着许多连续的记录（插入或删除操作）
固定头部格式：
固定头部共12字节：前8字节（rep_[0]到rep_[7]）是该WriteBatch的序列号，即Batch中第一条记录对应的全局序列号，其后的记录依次递增；这个字段只有在Batch真正被写入时（DBImpl::Write中调用WriteBatchInternal::SetSequence）才会被设置。后4字节（rep_[8]到rep_[11]）是当前Batch中的记录数。
后面的记录结构为：
插入数据时：type（kTypeValue），Key_size，Key，Value_size，Value
删除数据时：type（kTypeDeletion），Key_size，Key
这里写图片描述
WriteBatchInternal是WriteBatch的友元类，它提供了一系列静态接口（如SetSequence、Count、Contents、ByteSize、Append、InsertInto）来直接操作WriteBatch内部的rep_。这样，这些只供LevelDB内部使用的操作就不必暴露在WriteBatch的公开接口中。

DBImpl::Write

// Applies `my_batch` to the database, serializing concurrent callers through
// the writers_ queue.
//
// Every caller wraps its batch in a Writer:
//
//   struct DBImpl::Writer {
//     WriteBatch* batch;
//     bool sync;
//     bool done;
//     port::CondVar cv;
//   };
//
// i.e. the batch plus a condition variable used to park the calling thread
// and a `done` flag telling it whether another thread already performed its
// write. Only the writer at the front of the queue does real work; it may
// fold the batches of the writers queued behind it into one group, write the
// group once, and then wake those writers with the shared result.
//
// Params:
//   options  - write options; only options.sync is consulted here.
//   my_batch - updates to apply. If NULL, MakeRoomForWrite() is still called
//              (with its argument set to true) but the log/memtable phase is
//              skipped.
// Returns the status of the write that covered this batch (possibly carried
// out by another thread).
Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
  Writer w(&mutex_);
  w.batch = my_batch;
  w.sync = options.sync;
  w.done = false;

  // Take the lock: w is about to be linked into the shared queue writers_.
  MutexLock l(&mutex_);
  writers_.push_back(&w);
  // Sleep until either another thread completed our batch for us, or we have
  // reached the front of the queue and must do the work ourselves.
  while (!w.done && &w != writers_.front()) {
    w.cv.Wait();
  }
  // Our batch was merged into a group written by another thread.
  if (w.done) {
    return w.status;
  }

  // May temporarily unlock and wait.
  Status status = MakeRoomForWrite(my_batch == NULL);
  uint64_t last_sequence = versions_->LastSequence();
  Writer* last_writer = &w;
  if (status.ok() && my_batch != NULL) {
    // Merge the batches at the head of the queue into one group;
    // last_writer is updated to the last writer that was included.
    WriteBatch* updates = BuildBatchGroup(&last_writer);
    // The group's first operation gets the next global sequence number...
    WriteBatchInternal::SetSequence(updates, last_sequence + 1);
    // ...and the global sequence advances by the number of operations.
    last_sequence += WriteBatchInternal::Count(updates);

    {
      // Appending to the log is expensive, so drop the lock while doing it.
      // Other threads can enqueue new writers meanwhile, but since &w stays
      // at the front of writers_ none of them will get past the wait loop
      // above.
      mutex_.Unlock();
      // Append the whole group to the log file.
      status = log_->AddRecord(WriteBatchInternal::Contents(updates));
      bool sync_error = false;
      if (status.ok() && options.sync) {
        // The caller asked for the record to be forced to disk rather than
        // left in file-system caches.
        status = logfile_->Sync();
        if (!status.ok()) {
          sync_error = true;
        }
      }
      if (status.ok()) {
        // Apply every operation of the group to the memtable.
        status = WriteBatchInternal::InsertInto(updates, mem_);
      }
      mutex_.Lock();
      if (sync_error) {
        // After a failed Sync() the record just appended may or may not be
        // present when the DB is re-opened, so the failure must not be
        // dropped. Upstream LevelDB reports it via RecordBackgroundError()
        // so that subsequent writes fail; previously sync_error was computed
        // here but never acted upon.
        RecordBackgroundError(status);
      }
    }
    // tmp_batch_ is the scratch buffer used for merged groups; its contents
    // have been handed to the log and memtable, so reset it for reuse.
    if (updates == tmp_batch_) tmp_batch_->Clear();

    // Publish the new global sequence number.
    versions_->SetLastSequence(last_sequence);
  }

  // Pop every writer that was covered by this group. Writers other than
  // ourselves belong to parked threads: hand them the result and wake them;
  // they will leave through the `if (w.done) return w.status;` path above.
  while (true) {
    Writer* ready = writers_.front();
    writers_.pop_front();
    if (ready != &w) {
      ready->status = status;
      ready->done = true;
      ready->cv.Signal();
    }
    // last_writer marks the end of the merged group.
    if (ready == last_writer) break;
  }

  // Notify new head of write queue so that it leaves its wait loop and
  // becomes the next thread to perform a write.
  if (!writers_.empty()) {
    writers_.front()->cv.Signal();
  }

  return status;
}

DBImpl::BuildBatchGroup

// REQUIRES: Writer list must be non-empty// REQUIRES: First writer must have a non-NULL batchWriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {  assert(!writers_.empty());  Writer* first = writers_.front();  WriteBatch* result = first->batch;  assert(result != NULL);  size_t size = WriteBatchInternal::ByteSize(first->batch);  // 设置合并后产生的batch的最大容量  size_t max_size = 1 << 20;  if (size <= (128<<10)) {  //如果第一个待合并的batch的size很小，则相应减小合并后batch的最大容量    max_size = size + (128<<10);  }//我们需要记录writers_队列中最后一个被合并的writer，因为write函数中唤醒线程需要用//到，防止小的batch需要等待过久用于合并  *last_writer = first;  std::deque<Writer*>::iterator iter = writers_.begin();  ++iter;  // Advance past "first"  for (; iter != writers_.end(); ++iter) {    Writer* w = *iter;    if (w->sync && !first->sync) {      //能合并到一起的batch大家的sync属性必须相同      break;    }    if (w->batch != NULL) {      size += WriteBatchInternal::ByteSize(w->batch);      if (size > max_size) {        // Do not make batch too big        break;      }      // Append to *result      if (result == first->batch) {        // 用db数据成员tmp_batch_存放合并后的结果，相当于把各个待合并的writer中        //的数据全都拷贝进了tmp_batch_        result = tmp_batch_;        assert(WriteBatchInternal::Count(result) == 0);        WriteBatchInternal::Append(result, first->batch);      }      WriteBatchInternal::Append(result, w->batch);    }    *last_writer = w;//记录最后一个合并的writer  }  return result;}

阅读全文

1 0