7 buffer pool的组织结构

来源:互联网 发布:现在淘宝做什么产品好 编辑:程序博客网 时间:2024/05/16 09:56

一、buf_page_get_gen

它的作用是将文件数据读入缓存。

前面在“6 InnoDB相关的数据结构”一节中,已经提到了buf_pool_t的初始化;并在事务初始化时,宏buf_page_get会调用buf_page_get_gen。我们知道buf_pool中会缓存多种页:索引页、数据页、undo页等,buf_page_get_gen是为了访问database page。现把它的主要代码展示如下:

本次执行以buf_page_get_gen (space=0, zip_size=0, offset=5, rw_latch=2, guess=0x0, mode=10,  file=..., line=120, mtr=0xbfffdfd8)为例。(参考上一篇)

buf_block_t*buf_page_get_gen(/*=============*/ulintspace,/*!< in: space id */ulintzip_size,/*!< in: compressed page size in bytes or 0 for uncompressed pages */ulintoffset,/*!< in: page number */ulintrw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */buf_block_t*guess,/*!< in: guessed block or NULL */ulintmode,/*!< in: BUF_GET, BUF_GET_IF_IN_POOL,BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, orBUF_GET_IF_IN_POOL_OR_WATCH */const char*file,/*!< in: file name */ulintline,/*!< in: line where called */mtr_t*mtr)/*!< in: mini-transaction */ //不可分割的事务,原子性{buf_block_t*block;ulintfold;unsignedaccess_time;ulintfix_type;iboolmust_read;ulintretries = 0;buf_pool_t*buf_pool = buf_pool_get(space, offset);buf_pool->stat.n_page_gets++;fold = buf_page_address_fold(space, offset); //Calculates a folded value of a file page address to use in the page hash table.loop:block = guess;buf_pool_mutex_enter(buf_pool);if (block) {/* If the guess is a compressed page descriptor thathas been allocated by buf_page_alloc_descriptor(),it may have been freed by buf_relocate(). 
*/if (!buf_block_is_uncompressed(buf_pool, block)    || offset != block->page.offset    || space != block->page.space    || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {block = guess = NULL;} else {ut_ad(!block->page.in_zip_hash);ut_ad(block->page.in_page_hash);}}if (block == NULL) {block = (buf_block_t*) buf_page_hash_get_low(buf_pool, space, offset, fold); //Returns the control block of a file page}if (block && buf_pool_watch_is_sentinel(buf_pool, &block->page)) {block = NULL;}if (block == NULL) {/* Page not in buf_pool: needs to be read from file */if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {block = (buf_block_t*) buf_pool_watch_set(space, offset, fold);if (UNIV_LIKELY_NULL(block)) {goto got_block;}}buf_pool_mutex_exit(buf_pool);if (mode == BUF_GET_IF_IN_POOL    || mode == BUF_PEEK_IF_IN_POOL    || mode == BUF_GET_IF_IN_POOL_OR_WATCH) {return(NULL);}if (buf_read_page(space, zip_size, offset)) {  //异步从文件读入数据到buf_poolbuf_read_ahead_random(space, zip_size, offset, ibuf_inside(mtr));//随机预读相关retries = 0;} else if (retries < BUF_PAGE_READ_MAX_RETRIES) {++retries;} else {//打印错误}goto loop; //回到loop,这次异步线程已经将数据取来,所以所要找的block就不为空了。}got_block:ut_ad(page_zip_get_size(&block->page.zip) == zip_size);must_read = buf_block_get_io_fix(block) == BUF_IO_READ; //本例执行中must_read=0if (must_read && (mode == BUF_GET_IF_IN_POOL  || mode == BUF_PEEK_IF_IN_POOL)) {/* The page is being read to buffer pool,but we cannot wait around for the read tocomplete. */null_exit:buf_pool_mutex_exit(buf_pool);return(NULL);}switch (buf_block_get_state(block)) {buf_page_t*bpage;iboolsuccess;case BUF_BLOCK_FILE_PAGE: //本例是这种情况,表示该block存的是一个文件页break;.... 
//其他case情况}ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);mutex_enter(&block->mutex);buf_block_buf_fix_inc(block, file, line); //block->page.buf_fix_count++;buf_pool_mutex_exit(buf_pool);/* Check if this is the first access to the page */access_time = buf_page_is_accessed(&block->page);buf_page_set_accessed(&block->page);  //设置page->accsess_timemutex_exit(&block->mutex);if (mode != BUF_PEEK_IF_IN_POOL) {buf_page_make_young_if_needed(&block->page);}switch (rw_latch) {case RW_NO_LATCH:if (must_read) {/* Let us wait until the read operationcompletes */for (;;) {enum buf_io_fixio_fix;mutex_enter(&block->mutex);io_fix = buf_block_get_io_fix(block);mutex_exit(&block->mutex);if (io_fix == BUF_IO_READ) {/* wait by temporaly s-latch */rw_lock_s_lock(&(block->lock));rw_lock_s_unlock(&(block->lock));} else {break;}}}fix_type = MTR_MEMO_BUF_FIX;break;case RW_S_LATCH:rw_lock_s_lock_inline(&(block->lock), 0, file, line);fix_type = MTR_MEMO_PAGE_S_FIX;break;default:ut_ad(rw_latch == RW_X_LATCH);rw_lock_x_lock_inline(&(block->lock), 0, file, line); //Lock an rw-lock in exclusive mode for the current thread.fix_type = MTR_MEMO_PAGE_X_FIX;break;}mtr_memo_push(mtr, block, fix_type);//Pushes an object to an mtr memo stack.将这个block的指针和锁类型放入一个slot,并添加到mtr->memo这个数组中。if (mode != BUF_PEEK_IF_IN_POOL && !access_time) {/* In the case of a first access, try to apply linearread-ahead */buf_read_ahead_linear(space, zip_size, offset, ibuf_inside(mtr));//可能会去线性预读}return(block);}


二、表空间文件中的页

如图所示,第0页存放extent描述符(之前我们就知道共享区空间先是段---extent---页的结构),每个描述符占40字节,描述了64页的使用情况(使用bitmap)。每页有16k,有256个描述符,即256*40=10k<16k,所以尚有结余。256个描述符,每个描述了64页,这就是16384页(256M)。因此,每16384个页,就有一个页用于存储extent描述符了。

对于我们的执行情况,我们的buf_pool为64M,尚不足256M,所以一个页用于存储extent描述符就足够了。可以看到,页号为5的页(页号从0开始,即FSP_TRX_SYS_PAGE_NO)用于存放事务系统头,这也正是前面buf_page_get_gen()以offset=5调用时所读取的页。

extent描述符结构:(XDES_BITMAP部分为16字节,每两位描述一个页,正好可以表示16*8/2=64页的使用情况)

File extent descriptor data structure: contains bits to tell which pages inthe extent are free and which contain old tuple version to clean. *//*-------------------------------------*/#defineXDES_ID0/* The identifier of the segmentto which this extent belongs */#define XDES_FLST_NODE8/* The list node data structurefor the descriptors */#defineXDES_STATE(FLST_NODE_SIZE + 8)/* contains state informationof the extent */#defineXDES_BITMAP(FLST_NODE_SIZE + 12)/* Descriptor bitmap of the pagesin the extent */

三、buf_page_t

typedef struct buf_page_struct buf_page_t;

struct buf_page_struct{/** @name General fieldsNone of these bit-fields must be modified without holdingbuf_page_get_mutex() [buf_block_struct::mutex orbuf_pool->zip_mutex], since they can be stored in the samemachine word.  Some of these fields are additionally protectedby buf_pool->mutex. *//* @{ */unsignedspace:32;/*!< tablespace id; also protectedby buf_pool->mutex. */unsignedoffset:32;/*!< page number; also protectedby buf_pool->mutex. */unsignedstate:BUF_PAGE_STATE_BITS;/*!< state of the control block; alsoprotected by buf_pool->mutex.State transitions fromBUF_BLOCK_READY_FOR_USE toBUF_BLOCK_MEMORY need not beprotected by buf_page_get_mutex().@see enum buf_page_state */#ifndef UNIV_HOTBACKUPunsignedflush_type:2;/*!< if this block is currently beingflushed to disk, this tells theflush_type.@see enum buf_flush */unsignedio_fix:2;/*!< type of pending I/O operation;also protected by buf_pool->mutex@see enum buf_io_fix */unsignedbuf_fix_count:19;/*!< count of how manyfold this blockis currently bufferfixed */unsignedbuf_pool_index:6;/*!< index number of the buffer poolthat this block belongs to */#endif /* !UNIV_HOTBACKUP */page_zip_des_tzip;/*!< compressed page; zip.data(but not the data it points to) isalso protected by buf_pool->mutex;state == BUF_BLOCK_ZIP_PAGE andzip.data == NULL means an activebuf_pool->watch */#ifndef UNIV_HOTBACKUPbuf_page_t*hash;/*!< node used in chaining tobuf_pool->page_hash orbuf_pool->zip_hash *//** @name Page flushing fieldsAll these are protected by buf_pool->mutex. 
*//* @{ */UT_LIST_NODE_T(buf_page_t) list;/*!< based on state, this is alist node, protected either bybuf_pool->mutex or bybuf_pool->flush_list_mutex,in one of the following lists inbuf_pool:- BUF_BLOCK_NOT_USED:free- BUF_BLOCK_FILE_PAGE:flush_list- BUF_BLOCK_ZIP_DIRTY:flush_list- BUF_BLOCK_ZIP_PAGE:zip_clean- BUF_BLOCK_ZIP_FREE:zip_free[]If bpage is part of flush_listthen the node pointers arecovered by buf_pool->flush_list_mutex.Otherwise these pointers areprotected by buf_pool->mutex.The contents of the list nodeis undefined if !in_flush_list&& state == BUF_BLOCK_FILE_PAGE,or if state is one ofBUF_BLOCK_MEMORY,BUF_BLOCK_REMOVE_HASH orBUF_BLOCK_READY_IN_USE. */ib_uint64_tnewest_modification;/*!< log sequence number ofthe youngest modification tothis block, zero if notmodified. Protected by blockmutex */ib_uint64_toldest_modification;/*!< log sequence number ofthe START of the log entrywritten of the oldestmodification to this blockwhich has not yet been flushedon disk; zero if allmodifications are on disk.Writes to this field must becovered by both block->mutexand buf_pool->flush_list_mutex. Hencereads can happen while holdingany one of the two mutexes *//* @} *//** @name LRU replacement algorithm fieldsThese fields are protected by buf_pool->mutex only (notbuf_pool->zip_mutex or buf_block_struct::mutex). *//* @{ */UT_LIST_NODE_T(buf_page_t) LRU;/*!< node of the LRU list */unsignedold:1;/*!< TRUE if the block is in the oldblocks in buf_pool->LRU_old */unsignedfreed_page_clock:31;/*!< the value ofbuf_pool->freed_page_clockwhen this block was the lasttime put to the head of theLRU list; a thread is allowedto read this for heuristicpurposes without holding anymutex or latch *//* @} */unsignedaccess_time;/*!< time of first access, or0 if the block was never accessedin the buffer pool. Protected byblock mutex */#endif /* !UNIV_HOTBACKUP */};

以下为page type:

/** File page types (values of FIL_PAGE_TYPE) @{ */#define FIL_PAGE_INDEX17855/*!< B-tree node */#define FIL_PAGE_UNDO_LOG2/*!< Undo log page */#define FIL_PAGE_INODE3/*!< Index node */#define FIL_PAGE_IBUF_FREE_LIST4/*!< Insert buffer free list *//* File page types introduced in MySQL/InnoDB 5.1.7 */#define FIL_PAGE_TYPE_ALLOCATED0/*!< Freshly allocated page */#define FIL_PAGE_IBUF_BITMAP5/*!< Insert buffer bitmap */#define FIL_PAGE_TYPE_SYS6/*!< System page */#define FIL_PAGE_TYPE_TRX_SYS7/*!< Transaction system data */#define FIL_PAGE_TYPE_FSP_HDR8/*!< File space header */#define FIL_PAGE_TYPE_XDES9/*!< Extent descriptor page */#define FIL_PAGE_TYPE_BLOB10/*!< Uncompressed BLOB page */#define FIL_PAGE_TYPE_ZBLOB11/*!< First compressed BLOB page */#define FIL_PAGE_TYPE_ZBLOB212/*!< Subsequent compressed BLOB page */#define FIL_PAGE_TYPE_LASTFIL_PAGE_TYPE_ZBLOB2/*!< Last page type */

1、Tablespace header——位于第一页(第0号页)中,紧跟在38字节的file header之后
Each tablespace will have a header of type fsp_header_t. This data structure is stored in the first page of a tablespace. extent描述符就是位于此结构之后

  • The table space identifier (space_id)
  • Current size of the table space in pages.
  • List of free extents(FSP_FREE)
  • List of full extents not belonging to any segment.(FSP_FULL_FRAG)
  • List of partially full/free extents not belonging to any segment.(FSP_FREE_FRAG)
  • List of pages containing segment headers, where all the segment inode slots are reserved. (pages of type FIL_PAGE_INODE)
  • List of pages containing segment headers, where not all the segment inode slots are reserved. (pages of type FIL_PAGE_INODE).

FREE_FRAG: Extents with free pages remaining that are allocated to be used in “fragments”, having individual pages allocated to different purposes rather than allocating the entire extent. For example, every extent with an FSP_HDR or XDES page will be placed on the FREE_FRAG list so that the remaining free pages in the extent can be allocated for other uses.

FULL_FRAG: Exactly like FREE_FRAG but for extents with no free pages remaining. Extents are moved from FREE_FRAG to FULL_FRAG when they become full, and moved back to FREE_FRAG if a page is released so that they are no longer full.

FREE: Extents that are completely unused and available to be allocated in whole to some purpose. A FREE extent could be allocated to a file segment (and placed on the appropriate INODE list), or moved to the FREE_FRAG list for individual page use.


2、保留页

#define FSP_XDES_OFFSET0/* !< extent descriptor */#define FSP_IBUF_BITMAP_OFFSET1/* !< insert buffer bitmap *//* The ibuf bitmap pages are the ones whosepage number is the number above plus amultiple of XDES_DESCRIBED_PER_PAGE */#define FSP_FIRST_INODE_PAGE_NO2/*!< in every tablespace *//* The following pages exist in the system tablespace (space 0). */#define FSP_IBUF_HEADER_PAGE_NO3/*!< insert buffer header page, in tablespace 0 */#define FSP_IBUF_TREE_ROOT_PAGE_NO4/*!< insert buffer B-tree root page in tablespace 0 *//* The ibuf tree root page number intablespace 0; its fseg inode is on the pagenumber FSP_FIRST_INODE_PAGE_NO */#define FSP_TRX_SYS_PAGE_NO5/*!< transaction system header, in tablespace 0 */#defineFSP_FIRST_RSEG_PAGE_NO6/*!< first rollback segment page, in tablespace 0 */#define FSP_DICT_HDR_PAGE_NO7/*!< data dictionary header page, in tablespace 0 */

3、file segment

Each segment has a segment header (fseg_header_t), which points to the inode (fseg_inode_t) describing the file segment. The file segment header contains the following information:
The space to which the inode belongs;
The page_no of the inode;
The byte offset of the inode;
The length of the file segment header (in bytes).

fseg_inode_t:
The segment id to which it belongs.
List of full extents.
List of free extents of this segment.
List of partially full/free extents
Array of individual pages belonging to this segment. The size of this array is half an extent(32).

typedefbytefseg_header_t;#define FSEG_HDR_SPACE0/*!< space id of the inode */#define FSEG_HDR_PAGE_NO4/*!< page number of the inode */#define FSEG_HDR_OFFSET8/*!< byte offset of the inode */#define FSEG_HEADER_SIZE10/*!< Length of the file systemheader, in bytes */

当一个表建立时,会产生两个file segments,一个是非叶子节点,一个是叶子节点。在B树的根节点中,有两个file segment headers.对于一个给定的表,B树的根页将会从数据字典获得(聚集索引)。这样,可建立关系:table——B-tree——2 file segment——fseg_inode_t——extent——pages.

注意:space header(fsp_header_t)中的FSP_SEG_INODES_FREE字段为 list of pages containing segment headers(不仅仅包括保留的inode页——第2页)。同时它的FSP_SEG_ID表示现在尚未使用的segment id,每次申请一个segment时,从此处获得id,并将它加1. 这样通过space header——pages(包含多个inode的页)——inode指向segment(inode中保存着segment id)。之后,再分配一个页作为B树的根节点(并由inode指向该page),设置该页的类型为system page,记下对应inode的页号和偏移(实际上,这就是在file segment header中记录,它位于page header中)。

在B树的这个根节点页(Index Page)中,又会有一个位置PAGE_BTR_SEG_LEAF和PAGE_BTR_SEG_TOP,分别存有一个segment header,分别指向叶子的segment和非叶子的segment对应的inode。

备注:Index Page的结构:38字节的file header,(36+2*10)的page header,其中含有两个segment header,每个10字节长,它们只在B树中有意义。Each index uses one file segment for leaf pages and one for non-leaf (internal) pages。


Each INODE page contains 85 file segment INODE entries。


File Segment ID: The ID of the file segment (FSEG) described by this file segment INODE entry. If the ID is 0, the entry is unused.

Fragment Array: An array of 32 page numbers of pages allocated individually from extents in the space’s FREE_FRAG or FULL_FRAG list of “fragment” extents. Once this array becomes full, only full extents can be allocated to the file segment.

As a table grows it will allocate individual pages in each file segment until the fragment array becomes full, and then switch to allocating 1 extent at a time, and eventually to allocating 4 extents at a time.

4、关于LRU,以及数据字典、insert buffer、rollback等与buffer pool密切相关的议题,将在之后再看。

0 0
原创粉丝点击