HBase MemstoreLAB

来源:互联网 发布:周末是否算法定节假日 编辑:程序博客网 时间:2024/06/02 02:49

MemstoreLAB其实很简单,就是为了避免每次向系统要小片内存,进而产生内存碎片问题,所以每次都申请一大片内存chunk,后面的分配会首先从chunk要,如果不能得到满足就分配新的chunk。

A memstore-local allocation buffer.

The MemStoreLAB is basically a bump-the-pointer allocator that allocates big (2MB) byte[] chunks from the heap and then doles them out to threads that request slices into the array.

The purpose of this class is to combat heap fragmentation in the regionserver. By ensuring that all KeyValues in a given memstore refer only to large chunks of contiguous memory, we ensure that large blocks get freed up when the memstore is flushed.

Without the MSLAB, the byte arrays allocated during insertion end up interleaved throughout the heap, and the old generation gets progressively more fragmented until a stop-the-world compacting collection occurs.

TODO: we should probably benchmark whether word-aligning the allocations would provide a performance improvement - probably would speed up the Bytes.toLong/Bytes.toInt calls in KeyValue, but some of those are cached anyway


源代码主要包含三部分:

  • Allocation

  /**
   * The result of a single allocation: a reference to the chunk's backing
   * byte array, plus the offset within that array at which the slice begins.
   */
  public static class Allocation {
    // Backing chunk array this allocation points into.
    private final byte[] buf;
    // Index within {@code buf} where the allocated slice starts.
    private final int start;

    private Allocation(byte[] data, int off) {
      this.buf = data;
      this.start = off;
    }

    /** @return the backing chunk array */
    byte[] getData() {
      return buf;
    }

    /** @return the starting offset of this slice within the backing array */
    int getOffset() {
      return start;
    }

    @Override
    public String toString() {
      return "Allocation(data=" + buf +
        " with capacity=" + buf.length +
        ", off=" + start + ")";
    }
  }
  • Chunk
  /**
   * A chunk of memory out of which allocations are sliced.
   */
  private static class Chunk {
    /** Actual underlying data */
    private byte[] data;

    private static final int UNINITIALIZED = -1;
    private static final int OOM = -2;
    /**
     * Offset for the next allocation, or the sentinel value -1
     * which implies that the chunk is still uninitialized.
     * */
    private AtomicInteger nextFreeOffset = new AtomicInteger(UNINITIALIZED);

    /** Total number of allocations satisfied from this buffer */
    private AtomicInteger allocCount = new AtomicInteger();

    /** Size of chunk in bytes */
    private final int size;

    /**
     * Create an uninitialized chunk. Note that memory is not allocated yet, so
     * this is cheap.
     * @param size in bytes
     */
    private Chunk(int size) {
      this.size = size;
    }

    /**
     * Actually claim the memory for this chunk. This should only be called from
     * the thread that constructed the chunk. It is thread-safe against other
     * threads calling alloc(), who will block until the allocation is complete.
     */
    public void init() {
      assert nextFreeOffset.get() == UNINITIALIZED;
      try {
        // The real payload lives here; the Chunk object merely represents it.
        data = new byte[size];
      } catch (OutOfMemoryError e) {
        // Record OOM so that spinning alloc() callers bail out with -1
        // instead of spinning forever on an uninitialized chunk.
        boolean failInit = nextFreeOffset.compareAndSet(UNINITIALIZED, OOM);
        assert failInit; // should be true.
        throw e;
      }
      // Mark that it's ready for use. The CAS on the AtomicInteger also
      // publishes the write to 'data' to threads that subsequently read
      // nextFreeOffset (java.util.concurrent.atomic memory effects).
      boolean initted = nextFreeOffset.compareAndSet(
          UNINITIALIZED, 0);
      // We should always succeed the above CAS since only one thread
      // calls init()!
      Preconditions.checkState(initted,
          "Multiple threads tried to init same chunk");
    }

    /**
     * Try to allocate <code>size</code> bytes from the chunk.
     * @return the offset of the successful allocation, or -1 to indicate not-enough-space
     */
    public int alloc(int size) {
      while (true) {
        int oldOffset = nextFreeOffset.get(); // starting offset of this allocation attempt
        if (oldOffset == UNINITIALIZED) {
          // The chunk doesn't have its data allocated yet.
          // Since we found this in curChunk, we know that whoever
          // CAS-ed it there is allocating it right now. So spin-loop
          // shouldn't spin long!
          Thread.yield();
          continue;
        }
        if (oldOffset == OOM) {
          // doh we ran out of ram. return -1 to chuck this away.
          return -1;
        }
        if (oldOffset + size > data.length) {
          return -1; // alloc doesn't fit
        }
        // Try to atomically claim this chunk
        if (nextFreeOffset.compareAndSet(oldOffset, oldOffset + size)) {
          // we got the alloc
          allocCount.incrementAndGet();
          return oldOffset;
        }
        // we raced and lost alloc, try again
      }
    }

    @Override
    public String toString() {
      // NOTE(review): "waste" here is really the unallocated tail of the
      // chunk at the time of the call, not permanently wasted space.
      return "Chunk@" + System.identityHashCode(this) +
        " allocs=" + allocCount.get() + "waste=" +
        (data.length - nextFreeOffset.get());
    }
  }

  • MemStoreLAB
public class MemStoreLAB {  private AtomicReference<Chunk> curChunk = new AtomicReference<Chunk>();  final static String CHUNK_SIZE_KEY = "hbase.hregion.memstore.mslab.chunksize";  final static int CHUNK_SIZE_DEFAULT = 2048 * 1024; // 默认chunk size是2M  final int chunkSize;  final static String MAX_ALLOC_KEY = "hbase.hregion.memstore.mslab.max.allocation";  final static int MAX_ALLOC_DEFAULT = 256  * 1024; // allocs bigger than this don't go through allocator  final int maxAlloc;  public MemStoreLAB() {    this(new Configuration());  }  public MemStoreLAB(Configuration conf) {    chunkSize = conf.getInt(CHUNK_SIZE_KEY, CHUNK_SIZE_DEFAULT);    maxAlloc = conf.getInt(MAX_ALLOC_KEY, MAX_ALLOC_DEFAULT);    // if we don't exclude allocations >CHUNK_SIZE, we'd infiniteloop on one!    Preconditions.checkArgument(      maxAlloc <= chunkSize,      MAX_ALLOC_KEY + " must be less than " + CHUNK_SIZE_KEY);  }  // MemStore向mslab请求内存的入口函数  /**   * Allocate a slice of the given length.   *   * If the size is larger than the maximum size specified for this   * allocator, returns null.   */  public Allocation allocateBytes(int size) {    Preconditions.checkArgument(size >= 0, "negative size");    // Callers should satisfy large allocations directly from JVM since they    // don't cause fragmentation as badly.    if (size > maxAlloc) {      return null;    }    while (true) {      Chunk c = getOrMakeChunk();  // 获取chunk,可能已经存在了,或者需要重新向系统索要      // Try to allocate from this chunk      int allocOffset = c.alloc(size);      if (allocOffset != -1) {        // We succeeded - this is the common case - small alloc        // from a big buffer        return new Allocation(c.data, allocOffset);      }      // 这个chunk剩余的空间是不是就被浪费了啊?      // not enough space!      // try to retire this chunk      tryRetireChunk(c); // current chunk已经没有足够的空间了,赶紧将current chunk置空,这样就会获取新的chunk    }  }  /**   * Try to retire the current chunk if it is still   * <code>c</code>. 
Postcondition is that curChunk.get()   * != c   */  private void tryRetireChunk(Chunk c) {    @SuppressWarnings("unused")    boolean weRetiredIt = curChunk.compareAndSet(c, null);    // If the CAS succeeds, that means that we won the race    // to retire the chunk. We could use this opportunity to    // update metrics on external fragmentation.    //    // If the CAS fails, that means that someone else already    // retired the chunk for us.  }  /**   * Get the current chunk, or, if there is no current chunk,   * allocate a new one from the JVM.   */  private Chunk getOrMakeChunk() {    while (true) {      // Try to get the chunk      Chunk c = curChunk.get();      if (c != null) {        return c;      }      // No current chunk, so we want to allocate one. We race      // against other allocators to CAS in an uninitialized chunk      // (which is cheap to allocate)      c = new Chunk(chunkSize);      if (curChunk.compareAndSet(null, c)) {        // we won race - now we need to actually do the expensive        // allocation step        c.init();        return c;      }      // someone else won race - that's fine, we'll try to grab theirs      // in the next iteration of the loop.    }  }

两个问题需要确认:

  1. 线程安全控制
  2. retire chunk造成空间浪费

原创粉丝点击