6 InnoDB相关的数据结构

来源:互联网 发布:sql注入过安全狗 编辑:程序博客网 时间:2024/05/21 10:54

以下讨论的结构体或函数都是出现在innobase_start_or_create_for_mysql()中。

一、srv_sys_t*  srv_sys——server system

typedef struct srv_sys_struct	srv_sys_t;

/** The server system struct */
struct srv_sys_struct{
	srv_table_t*	threads;	/*!< server thread table */
	UT_LIST_BASE_NODE_T(que_thr_t)
			tasks;		/*!< task queue; this is a linked
					list whose nodes (que_thr_t) are
					"query graph query thread nodes" */
};
在srv_init()中,srv_sys->threads = mem_zalloc(OS_THREAD_MAX_N * sizeof(srv_slot_t))。

typedef struct srv_slot_struct srv_slot_t;

/* Thread slot in the thread table */struct srv_slot_struct{unsignedtype:1;/*!< thread type: user, utility etc. */unsignedin_use:1;/*!< TRUE if this slot is in use */unsignedsuspended:1;/*!< TRUE if the thread is waitingfor the event of this slot */ib_time_tsuspend_time;/*!< time when the thread wassuspended */os_event_tevent;/*!< event used in suspending thethread when it has nothing to do */que_thr_t*thr;/*!< suspended query thread (onlyused for MySQL threads) */};


备注:该宏定义经常被用到:用于表示一个链表

/** Expands to an anonymous struct type used as the base (head) node of a
linked list: it stores the node count and pointers to the first and last
element. The expansion refers to the type name ulint, which must be
visible wherever the macro is used.
@param TYPE	the type of the list nodes */
#define UT_LIST_BASE_NODE_T(TYPE)\
struct {\
	ulint	count;	/*!< count of nodes in list */\
	TYPE *	start;	/*!< pointer to list start, NULL if empty */\
	TYPE *	end;	/*!< pointer to list end, NULL if empty */\
}

二、fil_system_t* fil_system

该结构体的初始化为fil_init();fil_open_log_and_system_tablespace_files()中打开共享表文件和redo logs。

typedefstruct fil_system_structfil_system_t;/** The tablespace memory cache; also the totality of logs (the logdata space) is stored here; below we talk about tablespaces, but alsothe ib_logfiles form a 'space' and it is handled here */struct fil_system_struct {#ifndef UNIV_HOTBACKUPmutex_tmutex;/*!< The mutex protecting the cache */#endif /* !UNIV_HOTBACKUP */hash_table_t*spaces;/*!< The hash table of spaces in thesystem; they are hashed on the spaceid */hash_table_t*name_hash;/*!< hash table based on the spacename */UT_LIST_BASE_NODE_T(fil_node_t) LRU;/*!< base node for the LRU list of themost recently used open files with nopending i/o's; if we start an i/o onthe file, we first remove it from thislist, and return it to the start ofthe list when the i/o ends;log files and the system tablespace arenot put to this list: they are openedafter the startup, and kept open untilshutdown */UT_LIST_BASE_NODE_T(fil_space_t) unflushed_spaces;/*!< base node for the list of thosetablespaces whose files containunflushed writes; those spaces haveat least one file node wheremodification_counter > flush_counter */ulintn_open;/*!< number of files currently open */ulintmax_n_open;/*!< n_open is not allowed to exceedthis */ib_int64_tmodification_counter;/*!< when we write to a file weincrement this by one */ulintmax_assigned_id;/*!< maximum space id in the existingtables, or assigned during the timemysqld has been up; at an InnoDBstartup we scan the data dictionaryand set here the maximum of thespace id's of the tables there */ib_int64_ttablespace_version;/*!< a counter which is incremented forevery space object memory creation;every space mem object gets a'timestamp' from this; in DISCARD/IMPORT this is used to check if weshould ignore an insert buffer mergerequest */UT_LIST_BASE_NODE_T(fil_space_t) space_list;/*!< list of all file spaces */iboolspace_id_reuse_warned;/* !< TRUE if fil_space_create()has issued a warning aboutpotential space_id reuse */};

注意:其中对LRU域的描述是:redo log和共享表空间始终被打开,所以并不放入LRU中;因此LRU是针对独立表空间的,不过本例未使用独立表空间,所以此处的LRU实际上并没有使用。

/** File node of a tablespace or the log data space */typedefstruct fil_node_structfil_node_t; //这实际上就是代表一个文件,表空间文件或redo log文件。struct fil_node_struct {fil_space_t*space;/*!< backpointer to the space where this nodebelongs */char*name;/*!< path to the file */iboolopen;/*!< TRUE if file open */os_file_thandle;/*!< OS handle to the file, if file open */ //这就是文件描述符。iboolis_raw_disk;/*!< TRUE if the 'file' is actually a rawdevice or a raw disk partition */ulintsize;/*!< size of the file in database pages, 0 ifnot known yet; the possible last incompletemegabyte may be ignored if space == 0 */ulintn_pending;/*!< count of pending i/o's on this file;closing of the file is not allowed ifthis is > 0 */ulintn_pending_flushes;/*!< count of pending flushes on this file;closing of the file is not allowed ifthis is > 0 */ib_int64_tmodification_counter;/*!< when we write to the file weincrement this by one */ib_int64_tflush_counter;/*!< up to whatmodification_counter value we haveflushed the modifications to disk */UT_LIST_NODE_T(fil_node_t) chain;/*!< link field for the file chain */UT_LIST_NODE_T(fil_node_t) LRU;/*!< link field for the LRU list */ulintmagic_n;/*!< FIL_NODE_MAGIC_N */};

三、buf_pool_t* buf_pool_ptr

typedef struct buf_pool_struct buf_pool_t;
struct buf_pool_struct{/** @name General fields *//* @{ */mutex_tmutex;/*!< Buffer pool mutex of this instance */mutex_tzip_mutex;/*!< Zip mutex of this bufferpool instance, protects compressedonly pages (of type buf_page_t, notbuf_block_t */ulintinstance_no;/*!< Array index of this bufferpool instance */ulintold_pool_size;  /*!< Old pool size in bytes */ulintcurr_pool_size;/*!< Current pool size in bytes */ulintLRU_old_ratio;  /*!< Reserve this much of the bufferpool for "old" blocks */#ifdef UNIV_DEBUGulintbuddy_n_frames; /*!< Number of frames allocated fromthe buffer pool to the buddy system */#endif#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUGulintmutex_exit_forbidden; /*!< Forbid release mutex */#endifulintn_chunks;/*!< number of buffer pool chunks */ //本次执行设为1buf_chunk_t*chunks;/*!< buffer pool chunks */ulintcurr_size;/*!< current pool size in pages */hash_table_t*page_hash;/*!< hash table of buf_page_t orbuf_block_t file pages,buf_page_in_file() == TRUE,indexed by (space_id, offset) */hash_table_t*zip_hash;/*!< hash table of buf_block_t blockswhose frames are allocated to thezip buddy system,indexed by block->frame */ulintn_pend_reads;/*!< number of pending readoperations */ulintn_pend_unzip;/*!< number of pending decompressions */time_tlast_printout_time;/*!< when buf_print_io was last timecalled */buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES + 1];/*!< Statistics of buddy system,indexed by block size */buf_pool_stat_tstat;/*!< current statistics */buf_pool_stat_told_stat;/*!< old statistics *//* @} *//** @name Page flushing algorithm fields *//* @{ */mutex_tflush_list_mutex;/*!< mutex protecting theflush list access. This mutexprotects flush_list, flush_rbtand bpage::list pointers whenthe bpage is on flush_list. 
Italso protects writes tobpage::oldest_modification */UT_LIST_BASE_NODE_T(buf_page_t) flush_list;  //按脏页修改先后顺序排列的链表,使得当需要同步checkpoint时,可以根据页修改的先后顺序来将脏页写入持久存储。/*!< base node of the modified block list */iboolinit_flush[BUF_FLUSH_N_TYPES];/*!< this is TRUE when a flush of thegiven type is being initialized */ulintn_flush[BUF_FLUSH_N_TYPES];/*!< this is the number of pendingwrites in the given flush type */os_event_tno_flush[BUF_FLUSH_N_TYPES];/*!< this is in the set statewhen there is no flush batchof the given type running */ib_rbt_t*flush_rbt;/*!< a red-black tree is usedexclusively during recovery tospeed up insertions in theflush_list. This tree containsblocks in order ofoldest_modification LSN and iskept in sync with theflush_list.Each member of the tree MUSTalso be on the flush_list.This tree is relevant only inrecovery and is set to NULLonce the recovery is over.Protected by flush_list_mutex */ulintfreed_page_clock;/*!< a sequence number usedto count the number of bufferblocks removed from the end ofthe LRU list; NOTE that thiscounter may wrap around at 4billion! 
A thread is allowedto read this for heuristicpurposes without holding anymutex or latch */ulintLRU_flush_ended;/*!< when an LRU flush ends for a page,this is incremented by one; this isset to zero when a buffer block isallocated *//* @} *//** @name LRU replacement algorithm fields *//* @{ */UT_LIST_BASE_NODE_T(buf_page_t) free;/*!< base node of the free block list */UT_LIST_BASE_NODE_T(buf_page_t) LRU;                                  /*!< base node of the LRU list */buf_page_t*LRU_old;/*!< pointer to the aboutLRU_old_ratio/BUF_LRU_OLD_RATIO_DIVoldest blocks in the LRU list;NULL if LRU length less thanBUF_LRU_OLD_MIN_LEN;NOTE: when LRU_old != NULL, its lengthshould always equal LRU_old_len */ulintLRU_old_len;/*!< length of the LRU list fromthe block to which LRU_old pointsonward, including that block;see buf0lru.c for the restrictionson this value; 0 if LRU_old == NULL;NOTE: LRU_old_len must be adjustedwhenever LRU_old shrinks or grows! */UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;/*!< base node of the unzip_LRU list */   /* @} *//** @name Buddy allocator fieldsThe buddy allocator is used for allocating compressed pageframes and buf_page_t descriptors of blocks that existin the buffer pool only in compressed form. *//* @{ */#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUGUT_LIST_BASE_NODE_T(buf_page_t)zip_clean;/*!< unmodified compressed pages */#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES];/*!< buddy free lists */buf_page_twatch[BUF_POOL_WATCH_SIZE];/*!< Sentinel records for bufferpool watches. Protected by       buf_pool->mutex. */#if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE# error "BUF_BUDDY_HIGH != UNIV_PAGE_SIZE"#endif#if BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE# error "BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE"#endif/* @} */};

在buf_pool_init()中初始化,本次执行只创建一个实例。buf_pool_init()----->buf_chunk_init(),注意需要额外的空间为每个缓存页保留一个buf_block_t的空间。我们应该注意到,实际的获得buf_pool的页数与申请时的大小往往是不一致的。

buf_chunk_t*buf_chunk_init(/*===========*/buf_pool_t*buf_pool,/*!< in: buffer pool instance */buf_chunk_t*chunk,/*!< out: chunk of buffers */ulintmem_size)/*!< in: requested size in bytes */{buf_block_t*block;byte*frame;ulinti;/* Round down to a multiple of page size,although it already should be. */mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);/* Reserve space for the block descriptors. */mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block)  + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);chunk->mem_size = mem_size;  //这个buf_chunk_t保存着这一大段内存。chunk->mem = os_mem_alloc_large(&chunk->mem_size); //内部调用mmap来获取一大段内存,MAP_PRIVATE | OS_MAP_ANON(此处后者指匿名内存映射:同时调用mmap时fd=-1);mmap调用时就觉得映射大小,不能再增加。注意mmap与malloc的区别。if (UNIV_UNLIKELY(chunk->mem == NULL)) {return(NULL);}/* Allocate the block descriptors fromthe start of the memory block. */chunk->blocks = chunk->mem;  //开始处是block描述符。/* Align a pointer to the first frame.  Note that whenos_large_page_size is smaller than UNIV_PAGE_SIZE,we may allocate one fewer block than requested.  Whenit is bigger, we may allocate more blocks than requested. */frame = ut_align(chunk->mem, UNIV_PAGE_SIZE);chunk->size = chunk->mem_size / UNIV_PAGE_SIZE- (frame != chunk->mem);/* Subtract the space needed for block descriptors. */{   //以下用于计算出buf pool从哪里开始(开始处是多个buf_block_t),注意开始位置必须与16K对齐。ulintsize = chunk->size;while (frame < (byte*) (chunk->blocks + size)) {frame += UNIV_PAGE_SIZE;size--;}chunk->size = size;}/* Init block structs and assign frames for them. Then weassign the frames to the first blocks (we already mapped thememory above). 
*/block = chunk->blocks;for (i = chunk->size; i--; ) {buf_block_init(buf_pool, block, frame);UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);/* Add the block to the free list */UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page));  //buf_pool的buf_chunk_t* chunks->mem指向大缓存空间,开始处为多个buf_block_t,后面是多个页(每页16k);buf_block_t中有buf_page_t page域,它包含页的信息,并最终连接到buf_pool->free中。(可以看到,free是buf_page_t的链表。)ut_d(block->page.in_free_list = TRUE);ut_ad(buf_pool_from_block(block) == buf_pool);block++;frame += UNIV_PAGE_SIZE;}#ifdef PFS_GROUP_BUFFER_SYNCpfs_register_buffer_block(chunk);#endifreturn(chunk);}

四、trx_sys_t*  trx_sys——transaction system

typedef struct trx_sys_struct trx_sys_t;

struct trx_sys_struct{trx_id_tmax_trx_id;/*!< The smallest number not yetassigned as a transaction id ortransaction number */UT_LIST_BASE_NODE_T(trx_t) trx_list;/*!< List of active and committed inmemory transactions, sorted on trx id,biggest first */UT_LIST_BASE_NODE_T(trx_t) mysql_trx_list;/*!< List of transactions created for MySQL */UT_LIST_BASE_NODE_T(trx_rseg_t) rseg_list;/*!< List of rollback segment objects */trx_rseg_t*latest_rseg;/*!< Latest rollback segment in theround-robin assignment of rollbacksegments to transactions */trx_rseg_t*rseg_array[TRX_SYS_N_RSEGS];/*!< Pointer array to rollbacksegments; NULL if slot not in use */ulintrseg_history_len;/*!< Length of the TRX_RSEG_HISTORYlist (update undo logs for committedtransactions), protected by rseg->mutex */UT_LIST_BASE_NODE_T(read_view_t) view_list;/*!< List of read views sortedon trx no, biggest first */};
初始化见trx_sys_init_at_db_start(),会首先获得kernel_mutex(mutex protecting the server, trx structs, query threads, and lock table)。其中trx_lists_init_at_db_start()是回滚事务(dummy)的建立。
voidtrx_sys_init_at_db_start(void)/*==========================*/{trx_sysf_t*sys_header;  //Transaction system headerib_uint64_trows_to_undo= 0;const char*unit= "";trx_t*trx;  //Transactionmtr_tmtr;  //Mini-transaction handle and bufferib_bh_t*ib_bh;// Binary heapmtr_start(&mtr);  //初始化mtrut_ad(trx_sys == NULL);mutex_enter(&kernel_mutex);/* We create the min binary heap here and pass ownership topurge when we init the purge sub-system. Purge is responsiblefor freeing the binary heap. */ib_bh = ib_bh_create(trx_rseg_compare_last_trx_no,sizeof(rseg_queue_t), TRX_SYS_N_RSEGS);trx_sys = mem_zalloc(sizeof(*trx_sys));sys_header = trx_sysf_get(&mtr); //其中会调用bug_page_get(实际调用buf_page_get_gen)去进入database page获得一个header。trx_rseg_list_and_array_init(sys_header, ib_bh, &mtr); //Creates the memory copies for rollback segments and initializes the rseg list and array in trx_sys.其中会初始化128个rollback segment object(trx_rseg_t),并链接到trx_sys->rseg_listtrx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);/* VERY important: after the database is started, max_trx_id value isdivisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' intrx_sys_get_new_trx_id will evaluate to TRUE when the functionis first time called, and the value for trx id will be writtento the disk-based header! Thus trx id values will not overlap whenthe database is repeatedly started! 
*/trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN+ ut_uint64_align_up(mach_read_from_8(sys_header   + TRX_SYS_TRX_ID_STORE),     TRX_SYS_TRX_ID_WRITE_MARGIN);UT_LIST_INIT(trx_sys->mysql_trx_list);trx_dummy_sess = sess_open();  //类型为sess_t,the session handletrx_lists_init_at_db_start();  //创建trx事务结构体,并初始化trx_sys->trx_list;它是根据回滚段和undo log list来创建的,这些事务将被回滚或清除。if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {trx = UT_LIST_GET_FIRST(trx_sys->trx_list);for (;;) {if (trx->conc_state != TRX_PREPARED) {rows_to_undo += trx->undo_no;}trx = UT_LIST_GET_NEXT(trx_list, trx);if (!trx) {break;}}if (rows_to_undo > 1000000000) {unit = "M";rows_to_undo = rows_to_undo / 1000000;}fprintf(stderr,"InnoDB: %lu transaction(s) which must be"" rolled back or cleaned up\n""InnoDB: in total %lu%s row operations to undo\n",(ulong) UT_LIST_GET_LEN(trx_sys->trx_list),(ulong) rows_to_undo, unit);fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n",(ullint) trx_sys->max_trx_id);}UT_LIST_INIT(trx_sys->view_list);/* Transfer ownership to purge. */trx_purge_sys_create(ib_bh); //创建trx_pruge_t* purge_sys,purge_sys->ib_bh = ib_bh,purge_sys->state = TRX_STOP_PURGE,...,mutex_exit(&kernel_mutex);mtr_commit(&mtr);}

关于buf_page_get_gen(),这是一个关键函数,This is the general function used to get access to a database page,返回值为buf_block_t的指针。


本节涉及的结构体又引出了很多其他结构体,主要分为线程系统、文件系统、缓存系统、事务系统等(还有一个重要的锁),属于比较关键的数据结构,尚有很多问题没有搞清楚。下一篇从buf_page_get_gen()入手,来弄清楚数据文件与缓存的关系。


0 0