leveldb研究7-Version/VersionSet/VersionEdit，内存中的数据结构Memtable/SkipList

来源：互联网发布：淘宝怎么打不开了编辑：程序博客网时间：2024/04/30 19:09

leveldb研究7-Version/VersionSet/VersionEdit

Posted on 2012-03-16 17:10 小明阅读(1152) 评论(0) 编辑收藏所属分类: 分布式计算

leveldb 使用 version 来保存数据库的状态。

先看看一个重要的数据结果，sst file的META info
<db/version_edit.h>

struct FileMetaData {
  int refs; //引用计数
  int allowed_seeks; //允许的seeks次数
  uint64_t number;//文件编号
  uint64_t file_size;  //文件大小
  InternalKey smallest;    //最小的key
  InternalKey largest;      //最大的key

  FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0) { }
};

这里面有一个很有意思的字段: allowed_seeks,代表了可以seek的次数，为0的时候表示这个文件需要被compaction.如何设置seeks次数呢？文件大小除以16k，不到100算100。

f->allowed_seeks = (f->file_size / 16384);
if (f->allowed_seeks < 100) f->allowed_seeks = 100;

原因，请看leveldb的注释：

// We arrange to automatically compact this file after a certain number of seeks. Let's assume:
      //   (1) One seek costs 10ms
      //   (2) Writing or reading 1MB costs 10ms (100MB/s)
      //   (3) A compaction of 1MB does 25MB of IO:
      //         1MB read from this level
      //         10-12MB read from next level (boundaries may be misaligned)
      //         10-12MB written to next level
      // This implies that 25 seeks cost the same as the compaction
      // of 1MB of data. I.e., one seek costs approximately the
      // same as the compaction of 40KB of data. We are a little
      // conservative and allow approximately one seek for every 16KB
      // of data before triggering a compaction.

接下来看Version的定义，version其实就是一系列的SST file的集合。

class Version {
public:
  //生成iterator用于遍历
  void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);

  //根据key来查询，若没有查到，更新GetStats
  struct GetStats {
    FileMetaData* seek_file;
    int seek_file_level;
  };
  Status Get(const ReadOptions&, const LookupKey& key, std::string* val,
             GetStats* stats);

  //是否需要进行compaction
  bool UpdateStats(const GetStats& stats);

  //引用计算，避免在被引用时候删除
  void Ref();
  void Unref();

  //查询和key range有关的files
  void GetOverlappingInputs(
      int level,
      const InternalKey* begin,         // NULL means before all keys
      const InternalKey* end,           // NULL means after all keys
      std::vector<FileMetaData*>* inputs);

  //计算是否level对某个key range是否有overlap
  bool OverlapInLevel(int level,
                      const Slice* smallest_user_key,
                      const Slice* largest_user_key);

  //memtable output应该放到哪个level
  int PickLevelForMemTableOutput(const Slice& smallest_user_key,
                                 const Slice& largest_user_key);

  //某个level的文件个数
   int NumFiles(int level) const { return files_[level].size(); }

  // Return a human readable string that describes this version's contents.
  std::string DebugString() const;

private:
  friend class Compaction;
  friend class VersionSet;

  class LevelFileNumIterator;
  Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;

  VersionSet* vset_;            // VersionSet to which this Version belongs
  Version* next_;               // Next version in linked list
  Version* prev_;               // Previous version in linked list
  int refs_;                    // Number of live refs to this version

  //sst files
  std::vector<FileMetaData*> files_[config::kNumLevels];

  //下一个要被compaction的文件
  FileMetaData* file_to_compact_;
  int file_to_compact_level_;

  //compaction score:>1表示要compaction
  double compaction_score_;
  int compaction_level_;

  explicit Version(VersionSet* vset)
      : vset_(vset), next_(this), prev_(this), refs_(0),
        file_to_compact_(NULL),
        file_to_compact_level_(-1),
        compaction_score_(-1),
        compaction_level_(-1) {
  }

  ~Version();

  // No copying allowed
  Version(const Version&);
  void operator=(const Version&);
};

那VersionSet呢？VersionSet 是version组成一个双向循环链表。

class VersionSet{
//. . .
Env* const env_;
  const std::string dbname_;
  const Options* const options_;
  TableCache* const table_cache_;
  const InternalKeyComparator icmp_;
  uint64_t next_file_number_;
  uint64_t manifest_file_number_;
  uint64_t last_sequence_;
  uint64_t log_number_;

  WritableFile* descriptor_file_;
  log::Writer* descriptor_log_;
  Version dummy_versions_;  // Head of circular doubly-linked list of versions.
  Version* current_;        // == dummy_versions_.prev_

  //每层都有一个compact pointer用于指示下次从哪里开始compact,以用于实现循环compact
  std::string compact_pointer_[config::kNumLevels];
//. . .
}

VersionEdit是version对象的变更记录，用于写入manifest.这样通过原始的version加上一系列的versionedit的记录，就可以恢复到最新状态。

class VersionEdit {
public:
  VersionEdit() { Clear(); }
  ~VersionEdit() { }

  void Clear();

  void SetComparatorName(const Slice& name) {
    has_comparator_ = true;
    comparator_ = name.ToString();
  }
  void SetLogNumber(uint64_t num) {
    has_log_number_ = true;
    log_number_ = num;
  }
  void SetPrevLogNumber(uint64_t num) {
    has_prev_log_number_ = true;
    prev_log_number_ = num;
  }
  void SetNextFile(uint64_t num) {
    has_next_file_number_ = true;
    next_file_number_ = num;
  }
  void SetLastSequence(SequenceNumber seq) {
    has_last_sequence_ = true;
    last_sequence_ = seq;
  }
  void SetCompactPointer(int level, const InternalKey& key) {
    compact_pointers_.push_back(std::make_pair(level, key));
  }

  //添加meta file
  void AddFile(int level, uint64_t file,
               uint64_t file_size,
               const InternalKey& smallest,
               const InternalKey& largest) {
    FileMetaData f;
    f.number = file;
    f.file_size = file_size;
    f.smallest = smallest;
    f.largest = largest;
    new_files_.push_back(std::make_pair(level, f));
  }

  //删除特定的文件
  void DeleteFile(int level, uint64_t file) {
    deleted_files_.insert(std::make_pair(level, file));
  }

  //编码，解码：用于写入manifest
  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(const Slice& src);

  std::string DebugString() const;

private:
  friend class VersionSet;

  typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;

  std::string comparator_;
  uint64_t log_number_;
  uint64_t prev_log_number_;
  uint64_t next_file_number_;
  SequenceNumber last_sequence_;
  bool has_comparator_;
  bool has_log_number_;
  bool has_prev_log_number_;
  bool has_next_file_number_;
  bool has_last_sequence_;

  std::vector< std::pair<int, InternalKey> > compact_pointers_;
  DeletedFileSet deleted_files_;
  std::vector< std::pair<int, FileMetaData> > new_files_;
};

leveldb研究8- 内存中的数据结构Memtable/SkipList

Posted on 2012-03-19 16:31 小明阅读(926) 评论(0) 编辑收藏所属分类: 分布式计算

我们知道,leveldb在写数据的时候，除了log文件，都要在内存中写一份。

先看看SkipList【跳表】这个数据结构：

SkipList有如下特点：
1. 本质上一个排序好的链表
2. 分层，上层节点比下层的少，更具有跳跃性
3. 查询的复杂度是O(logn)

SkipList跟红黑树等还是比较容易实现和理解的，主要长处是比较容易实现Lock free和遍历。
我们来看看leveldb的实现
插入：

//插入一个新的key
template<typename Key, class Comparator>
void SkipList<Key,Comparator>::Insert(const Key& key) {
  //查找插入节点,prev为各层的前置节点
  Node* prev[kMaxHeight];
  Node* x = FindGreaterOrEqual(key, prev);

  // Our data structure does not allow duplicate insertion
  assert(x == NULL || !Equal(key, x->key));

  //生成随机高度
  int height = RandomHeight();
  if (height > GetMaxHeight()) {
    for (int i = GetMaxHeight(); i < height; i++) {
      prev[i] = head_;
    }
    //设置当前最大高度
    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
  }

  //生成新节点
  x = NewNode(key, height);
  for (int i = 0; i < height; i++) {
    //设置新节点的各层的下一跳
    x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
    //设置前节点的next为当前节点，完成插入
    prev[i]->SetNext(i, x);
  }
}

查询：

template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
    const {
  Node* x = head_;
  int level = GetMaxHeight() - 1; //从高层开始查找，依次到0 level
  while (true) {
    Node* next = x->Next(level);
    if (KeyIsAfterNode(key, next)) { //比next key 要大
      // Keep searching in this list
      x = next;
    } else { //比next key小，查找下一层
      //标记当前level的前置节点
      if (prev != NULL) prev[level] = x;
      if (level == 0) {
        return next;
      } else {
        level--;
      }
    }
  }
}

template<typename Key, class Comparator>
bool SkipList<Key,Comparator>::Contains(const Key& key) const {
  Node* x = FindGreaterOrEqual(key, NULL);
  if (x != NULL && Equal(key, x->key)) {
    return true;
  } else {
    return false;
  }
}

接着我们看看leveldb MemTable的实现，很简单了，基本是对SkipList访问一个封装
<db/memtable.h>

class MemTable {
public:
  explicit MemTable(const InternalKeyComparator& comparator);

  //增加引用计数
  void Ref() { ++refs_; }

  //减少引用计数
  void Unref() {
    --refs_;
    assert(refs_ >= 0);
    if (refs_ <= 0) {
      delete this;
    }
  }

  //内存使用量
  size_t ApproximateMemoryUsage();

  //遍历操作
  Iterator* NewIterator();

  //插入
  void Add(SequenceNumber seq, ValueType type,
           const Slice& key,
           const Slice& value);

  //查询
  bool Get(const LookupKey& key, std::string* value, Status* s);

private:
  ~MemTable();  // Private since only Unref() should be used to delete it

  //key compartor，用于排序
  struct KeyComparator {
    const InternalKeyComparator comparator;
    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
    int operator()(const char* a, const char* b) const;
  };
  friend class MemTableIterator;
  friend class MemTableBackwardIterator;

  typedef SkipList<const char*, KeyComparator> Table;

  KeyComparator comparator_;
  int refs_; //引用计数
  Arena arena_; //内存分配器
  Table table_; //数据存放SkipList

  // No copying allowed
  MemTable(const MemTable&);
  void operator=(const MemTable&);
};

先看看插入
<db/memtable.cc>

void MemTable::Add(SequenceNumber s, ValueType type,
                   const Slice& key,
                   const Slice& value) {
  //数据结构：
  //1.internal key size : Varint32 (length of 2+3)
  //2.key data
  //3.SequenceNumber+Key type: 8 bytes
  //4 value size: Varint32
  //5 value data
  size_t key_size = key.size();
  size_t val_size = value.size();
  size_t internal_key_size = key_size + 8;
  const size_t encoded_len =
      VarintLength(internal_key_size) + internal_key_size +
      VarintLength(val_size) + val_size;
  char* buf = arena_.Allocate(encoded_len);
  char* p = EncodeVarint32(buf, internal_key_size);
  memcpy(p, key.data(), key_size);
  p += key_size;
  EncodeFixed64(p, (s << 8) | type);
  p += 8;
  p = EncodeVarint32(p, val_size);
  memcpy(p, value.data(), val_size);
  assert((p + val_size) - buf == encoded_len);
  table_.Insert(buf);
}

查询
<db/memtable.cc>

bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
  Slice memkey = key.memtable_key();
  Table::Iterator iter(&table_);
  iter.Seek(memkey.data());
  if (iter.Valid()) {
    // entry format is:
    //    klength  varint32
    //    userkey  char[klength]
    //    tag      uint64
    //    vlength  varint32
    //    value    char[vlength]
    // Check that it belongs to same user key.  We do not check the
    // sequence number since the Seek() call above should have skipped
    // all entries with overly large sequence numbers.
    const char* entry = iter.key();
    uint32_t key_length;
    const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length);
    if (comparator_.comparator.user_comparator()->Compare(
            Slice(key_ptr, key_length - 8),
            key.user_key()) == 0) {
      // Correct user key
      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
      switch (static_cast<ValueType>(tag & 0xff)) {
        case kTypeValue: {
          Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
          value->assign(v.data(), v.size());
          return true;
        }
        case kTypeDeletion:
          *s = Status::NotFound(Slice());
          return true;
      }
    }
  }
  return false;
}