linux内核源码阅读之facebook硬盘加速利器flashcache之一

来源:互联网 发布:mac上玩qq游戏 编辑:程序博客网 时间:2024/06/05 04:45
从来没有写过源码阅读笔记,想动笔的念头却越来越强烈;虽然拙于文笔,但还是下定决心认真写一回。
源代码下载请参见上一篇flashcache之我见 http://blog.csdn.net/liumangxiong/article/details/11643473
下面代码对应的是tag下面的1.0版本的。

看内核模块源码,闭着眼睛打开flashcache_init函数,区区百来行代码何足惧也。
1963int __init 1964flashcache_init(void)1965{1966int r;19671968r = flashcache_jobs_init();1969if (r)1970return r;1971atomic_set(&nr_cache_jobs, 0);1972atomic_set(&nr_pending_jobs, 0);1973#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)1974INIT_WORK(&_kcached_wq, do_work, NULL);1975#else1976INIT_WORK(&_kcached_wq, do_work);1977#endif1978for (r = 0 ; r < 33 ; r++)1979size_hist[r] = 0;1980r = dm_register_target(&flashcache_target);1981if (r < 0) {1982DMERR("cache: register failed %d", r);1983}1984#ifdef CONFIG_PROC_FS1985#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)1986flashcache_table_header = 1987register_sysctl_table(flashcache_root_table, 1);1988#else1989flashcache_table_header = 1990register_sysctl_table(flashcache_root_table);1991#endif1992{1993struct proc_dir_entry *entry;19941995entry = create_proc_entry("flashcache_stats", 0, NULL);1996if (entry)1997entry->proc_fops =  &flashcache_stats_operations;1998entry = create_proc_entry("flashcache_errors", 0, NULL);1999if (entry)2000entry->proc_fops =  &flashcache_errors_operations;2001entry = create_proc_entry("flashcache_iosize_hist", 0, NULL);2002if (entry)2003entry->proc_fops =  &flashcache_iosize_hist_operations;2004entry = create_proc_entry("flashcache_pidlists", 0, NULL);2005if (entry)2006entry->proc_fops =  &flashcache_pidlists_operations;2007entry = create_proc_entry("flashcache_version", 0, NULL);2008if (entry)2009entry->proc_fops =  &flashcache_version_operations;2010}2011#endif2012flashcache_control = (struct flashcache_control_s *)2013kmalloc(sizeof(struct flashcache_control_s *), GFP_KERNEL);2014flashcache_control->synch_flags = 0;2015register_reboot_notifier(&flashcache_notifier);2016return r;2017}

先大致看一眼,flashcache_jobs_init()分配job内存结构的,INIT_WORK初始化WORK的,接下来一看proc字眼就知道是/proc下目录的文件,再后来创建一个flashcache_control_s管理结构,再注册一个关机回调函数。
这样就走马观花地把这个函数看完了,那让写代码的人情何以堪?
再问一下自己,flashcache究竟做了什么?脑子里还是一片空白。那接下来就到每个函数内探个究竟。
441static int 442flashcache_jobs_init(void)443{444#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)445_job_cache = kmem_cache_create("kcached-jobs",446                               sizeof(struct kcached_job),447                               __alignof__(struct kcached_job),448                               0, NULL, NULL);449#else450_job_cache = kmem_cache_create("kcached-jobs",451                               sizeof(struct kcached_job),452                               __alignof__(struct kcached_job),453                               0, NULL);454#endif455if (!_job_cache)456return -ENOMEM;457458_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,459                           mempool_free_slab, _job_cache);460if (!_job_pool) {461kmem_cache_destroy(_job_cache);462return -ENOMEM;463}464#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)465_pending_job_cache = kmem_cache_create("pending-jobs",466       sizeof(struct pending_job),467       __alignof__(struct pending_job),468       0, NULL, NULL);469#else470_pending_job_cache = kmem_cache_create("pending-jobs",471       sizeof(struct pending_job),472       __alignof__(struct pending_job),473       0, NULL);474#endif475if (!_pending_job_cache) {476mempool_destroy(_job_pool);477kmem_cache_destroy(_job_cache);478return -ENOMEM;479}480481_pending_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,482   mempool_free_slab, _pending_job_cache);483if (!_pending_job_pool) {484kmem_cache_destroy(_pending_job_cache);485mempool_destroy(_job_pool);486kmem_cache_destroy(_job_cache);487return -ENOMEM;488}489490return 0;491}



首先是flashcache_jobs_init()函数,该函数为两类job(kcached_job和pending_job)各创建了一个slab缓存和一个mempool,两组代码像双胞胎一样看起来相同,实际上用途并不一样。
_job_pool => flashcache_alloc_cache_job => new_kcached_job 调用new_kcached_job 有好多个,有flashcache_dirty_writeback、flashcache_read_hit、flashcache_read_miss、flashcache_write_miss、flashcache_write_hit、flashcache_dirty_writeback_sync、flashcache_start_uncached_io。如果仔细地看一下这些函数的名称,发现这些函数所做的事情正是一个写缓存的基本操作和动作,即writeback, writethrough, hit, miss。
现在就以flashcache_dirty_writeback为例,看看kcached_job到底起了什么作用。
944static void945flashcache_dirty_writeback(struct cache_c *dmc, int index)946{947struct kcached_job *job;948unsigned long flags;949struct cacheblock *cacheblk = &dmc->cache[index];950int device_removal = 0;951952DPRINTK("flashcache_dirty_writeback: Index %d", index);953spin_lock_irqsave(&dmc->cache_spin_lock, flags);954VERIFY((cacheblk->cache_state & BLOCK_IO_INPROG) == DISKWRITEINPROG);955VERIFY(cacheblk->cache_state & DIRTY);956dmc->cache_sets[index / dmc->assoc].clean_inprog++;957dmc->clean_inprog++;958spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);959job = new_kcached_job(dmc, NULL, index);960if (unlikely(sysctl_flashcache_error_inject & DIRTY_WRITEBACK_JOB_ALLOC_FAIL)) {961if (job)962flashcache_free_cache_job(job);963job = NULL;964sysctl_flashcache_error_inject &= ~DIRTY_WRITEBACK_JOB_ALLOC_FAIL;965}966/*967 * If the device is being (fast) removed, do not kick off any more cleanings.968 */969if (unlikely(atomic_read(&dmc->fast_remove_in_prog))) {970DMERR("flashcache: Dirty Writeback (for set cleaning) aborted for device removal, block %lu", 971      cacheblk->dbn);972if (job)973flashcache_free_cache_job(job);974job = NULL;975device_removal = 1;976}977if (unlikely(job == NULL)) {978spin_lock_irqsave(&dmc->cache_spin_lock, flags);979dmc->cache_sets[index / dmc->assoc].clean_inprog--;980dmc->clean_inprog--;981flashcache_free_pending_jobs(dmc, cacheblk, -EIO);982cacheblk->cache_state &= ~(BLOCK_IO_INPROG);983spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);984if (device_removal == 0)985DMERR("flashcache: Dirty Writeback (for set cleaning) failed ! 
Can't allocate memory, block %lu", 986      cacheblk->dbn);987} else {988job->bio = NULL;989job->action = WRITEDISK;990atomic_inc(&dmc->nr_jobs);991dmc->ssd_reads++;992dmc->disk_writes++;993#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)994kcopyd_copy(dmc->kcp_client, &job->cache, 1, &job->disk, 0, 995    flashcache_kcopyd_callback, job);996#else997dm_kcopyd_copy(dmc->kcp_client, &job->cache, 1, &job->disk, 0, 998       (dm_kcopyd_notify_fn) flashcache_kcopyd_callback, 999       (void *)job);1000#endif1001}1002}

首先是用new_kcached_job申请一个kcached_job结构体,接下来判断dmc->fast_remove_in_prog,这个是移除flashcache标志,设备都要删除掉了,显然就没必要再下发命令了。再判断job是否为空,else这里才是干的正事。这里job->action = WRITEDISK;是最重要的一句话,就是前面讲的写缓存基本操作,而这个action就可以看作是一个状态机,对应的状态如下:
/*
 * kcached/pending job states.
 *
 * Stored in kcached_job->action; each value names the operation the job
 * is carrying out.
 */
#define READCACHE	1
#define WRITECACHE	2
#define READDISK	3
#define WRITEDISK	4
#define READFILL	5	/* Read Cache Miss Fill */
#define INVALIDATE	6
#define WRITEDISK_SYNC	7

这里设置的是WRITEDISK,就是写磁盘,那是从哪里写呢?是从写缓存写的,写缓存的数据又是在哪里呢?我们把SSD盘当作写缓存,所以是从SSD盘写到磁盘。那我们是不是要做很多事情,先从SSD读数据然后再往磁盘写呢?是的,但是我们不用做太多的事情,因为linux内核有大名鼎鼎的kcopyd线程,我们只需要把这些繁琐的工作交给kcopyd完成就可以了,调用的接口是
int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
             unsigned int num_dests, struct dm_io_region *dests,
             unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
第一个参数是kcopyd_client,这是在flashcache_ctr(即flashcache设备的构造函数)中创建的,即每一个flashcache设备都对应一个kcopyd_client,那么为什么要创建这个结构体呢?可以简单地理解为使用kcopyd服务的一个句柄。第二个参数是数据源,第三个参数为目标的数量,第四个参数为要写的目标,第五个参数为额外标识,这里都设置为0,第六个参数fn是回调函数,拷贝完成后由kcopyd调用,dm_kcopyd_copy本身不阻塞(注意:fn为NULL时同步等待是dm_io接口的行为;dm_kcopyd_copy总是异步的,必须提供回调函数)。最后一个参数context是用于回调函数使用的参数,这里传入的正是我们现在最关心的job。
我们已经把kcached_job派发出去了,接着来看kcached_job是什么时候回来的,回来又做了什么事情,最后是怎么销毁的?
在dm_kcopyd_copy中设置的回调函数是flashcache_kcopyd_callback。
901static void 902flashcache_kcopyd_callback(int read_err, unsigned int write_err, void *context)903{904struct kcached_job *job = (struct kcached_job *)context;905struct cache_c *dmc = job->dmc;906int index = job->index;907unsigned long flags;908909VERIFY(!in_interrupt());910DPRINTK("kcopyd_callback: Index %d", index);911VERIFY(job->bio == NULL);912spin_lock_irqsave(&dmc->cache_spin_lock, flags);913VERIFY(dmc->cache[index].cache_state & (DISKWRITEINPROG | VALID | DIRTY));914if (unlikely(sysctl_flashcache_error_inject & KCOPYD_CALLBACK_ERROR)) {915read_err = -EIO;916sysctl_flashcache_error_inject &= ~KCOPYD_CALLBACK_ERROR;917}918if (likely(read_err == 0 && write_err == 0)) {919spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);920flashcache_md_write(job);921} else {922/* Disk write failed. We can not purge this block from flash */923DMERR("flashcache: Disk writeback failed ! read error %d write error %d block %lu", 924      -read_err, -write_err, job->disk.sector);925VERIFY(dmc->cache_sets[index / dmc->assoc].clean_inprog > 0);926VERIFY(dmc->clean_inprog > 0);927dmc->cache_sets[index / dmc->assoc].clean_inprog--;928dmc->clean_inprog--;929spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);930/* Set the error in the job and let do_pending() handle the error */931if (read_err) {932dmc->ssd_read_errors++;933job->error = read_err;934} else {935dmc->disk_write_errors++;936job->error = write_err;937}938flashcache_do_pending(job);939flashcache_clean_set(dmc, index / dmc->assoc); /* Kick off more cleanings */940dmc->cleanings++;941}942}

到这里就表明写缓存的数据写到磁盘的过程已经完成了。首先检查结果是否成功了,如果都成功的话就调用flashcache_md_write。
860861/* 862 * Kick off a cache metadata update (called from workqueue).863 * Cache metadata update IOs to a given metadata sector are serialized using the 864 * nr_in_prog bit in the md sector bufhead.865 * If a metadata IO is already in progress, we queue up incoming metadata updates866 * on the pending_jobs list of the md sector bufhead. When kicking off an IO, we867 * cluster all these pending updates and do all of them as 1 flash write (that 868 * logic is in md_write_kickoff), where it switches out the entire pending_jobs869 * list and does all of those updates.870 */871void872flashcache_md_write(struct kcached_job *job)873{874struct cache_c *dmc = job->dmc;875struct cache_md_sector_head *md_sector_head;876unsigned long flags;877878VERIFY(!in_interrupt());879VERIFY(job->action == WRITEDISK || job->action == WRITECACHE || 880       job->action == WRITEDISK_SYNC);881md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];882spin_lock_irqsave(&dmc->cache_spin_lock, flags);883/* If a write is in progress for this metadata sector, queue this update up */884if (md_sector_head->nr_in_prog != 0) {885struct kcached_job **nodepp;886887/* A MD update is already in progress, queue this one up for later */888nodepp = &md_sector_head->pending_jobs;889while (*nodepp != NULL)890nodepp = &((*nodepp)->next);891job->next = NULL;892*nodepp = job;893spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);894} else {895md_sector_head->nr_in_prog = 1;896spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);897flashcache_md_write_kickoff(job);898}899}

如果函数有注释还是仔细看一下吧,据个人观察,写linux内核的哥们都是惜字如金,如果他愿意写注释,那看注释绝对比看代码更重要,更有意义,如果有文档的话,那文档就是重中之重。看到这里有注释,真是欣喜万分,基本上看了注释不用看代码都行,但对于我这样的小菜鸟来说,有时还不能完全领会大侠的神意,就会继续读一下代码。
/*
 * Kick off a cache metadata update (called from workqueue).
 * Cache metadata update IOs to a given metadata sector are serialized using the
 * nr_in_prog bit in the md sector bufhead.
 * If a metadata IO is already in progress, we queue up incoming metadata updates
 * on the pending_jobs list of the md sector bufhead. When kicking off an IO, we
 * cluster all these pending updates and do all of them as 1 flash write (that
 * logic is in md_write_kickoff), where it switches out the entire pending_jobs
 * list and does all of those updates.
 */

派发cache metadata更新(从workqueue调用=》因为这里是从kcopyd回调回来的,所以这里友情提示一下,在内核要十分关心调用的上下文,是看内核代码的必修课,有时也是解决疑难问题的基础)。cache metadata的更新是由结构cache_md_sector_head中nr_in_prog字段来控制更新次序的(就是说更新cache metadata是按次序的,如果前面的更新未完成,后面的更新就排队等候)。排队等候的kcached_job就挂在cache_md_sector_head的pending_jobs上。在派发一次更新IO时,会把pending_jobs上的所有更新操作合并成一次写一起派发(因为这些更新对应的都是同一个sector中的flashcache管理结构)。
这一段看不明白也没关系,因为这里还没有讲到flashcache的数据组织。但必须明白,我们在flashcache_dirty_writeback中把脏数据从写缓存SSD刷到磁盘,这里要做的事情就是把这个脏数据的metadata从内存刷到SSD,这样就保证了在异常掉电的情况下元数据可以从SSD中找回。
到这里kcached_job还没有销毁,我们继续跟踪下去 flashcache_md_write=>flashcache_md_write_kickoff。
660static void661flashcache_md_write_kickoff(struct kcached_job *job)662{663struct cache_c *dmc = job->dmc;664struct flash_cacheblock *md_sector;665int md_sector_ix;666#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)667struct io_region where;668#else669struct dm_io_region where;670#endif671int i;672struct cache_md_sector_head *md_sector_head;673struct kcached_job *orig_job = job;674unsigned long flags;675676if (flashcache_alloc_md_sector(job)) {677DMERR("flashcache: %d: Cache metadata write failed, cannot alloc page ! block %lu", 678      job->action, job->disk.sector);679flashcache_md_write_callback(-EIO, job);680return;681}682spin_lock_irqsave(&dmc->cache_spin_lock, flags);683/*684 * Transfer whatever is on the pending queue to the md_io_inprog queue.685 */686md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];687md_sector_head->md_io_inprog = md_sector_head->pending_jobs;688md_sector_head->pending_jobs = NULL;689md_sector = job->md_sector;690md_sector_ix = INDEX_TO_MD_SECTOR(job->index) * MD_BLOCKS_PER_SECTOR;691/* First copy out the entire sector */692for (i = 0 ; 693     i < MD_BLOCKS_PER_SECTOR && md_sector_ix < dmc->size ; 694     i++, md_sector_ix++) {695md_sector[i].dbn = dmc->cache[md_sector_ix].dbn;696#ifdef FLASHCACHE_DO_CHECKSUMS697md_sector[i].checksum = dmc->cache[md_sector_ix].checksum;698#endif699md_sector[i].cache_state = 700dmc->cache[md_sector_ix].cache_state & (VALID | INVALID | DIRTY);701}702/* Then set/clear the DIRTY bit for the "current" index */703if (job->action == WRITECACHE) {704/* DIRTY the cache block */705md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = 706(VALID | DIRTY);707} else { /* job->action == WRITEDISK* */708/* un-DIRTY the cache block */709md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = VALID;710}711712for (job = md_sector_head->md_io_inprog ; 713     job != NULL ;714     job = job->next) {715if (job->action == WRITECACHE) {716/* DIRTY the cache block 
*/717md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = 718(VALID | DIRTY);719} else { /* job->action == WRITEDISK* */720/* un-DIRTY the cache block */721md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = VALID;722}723}724spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);725where.bdev = dmc->cache_dev->bdev;726where.count = 1;727where.sector = 1 + INDEX_TO_MD_SECTOR(orig_job->index);728dmc->ssd_writes++;729dm_io_async_bvec(1, &where, WRITE,730 &orig_job->md_io_bvec,731 flashcache_md_write_callback, orig_job);732flashcache_unplug_device(dmc->cache_dev->bdev);733}

这里把cacheblock信息保存到job->md_io_bvec的page页中,再调用dm_io_async_bvec将数据写到SSD盘中。我们来看一下该函数原型:
static int dm_io_async_bvec(unsigned int num_regions,     struct dm_io_region *where, int rw,     struct bio_vec *bvec, io_notify_fn fn,     void *context)

该函数与之前的dm_kcopyd_copy类似,我们最关心的是参数where,因为这是人生最重要的一课,你是谁?你要到哪里去?
where的bdev域就是目标设备,而sector域就是起始地址,count表示要写的扇区数。这个函数就是把dmc->cache的管理结构打包到job->md_io_bvec中,然后写到SSD对应位置上。
再接下来看写SSD完成调用flashcache_md_write_callback:
621void 622flashcache_md_write_callback(unsigned long error, void *context)623{624struct kcached_job *job = (struct kcached_job *)context;625626job->error = error;627push_md_complete(job);628schedule_work(&_kcached_wq);629}

该函数只是简单地设置job的返回值,然后放到_md_complete_jobs这个链表里,然后通知workqueue处理。为什么不直接在这个函数里处理,而要放到后面处理呢?这就像每个公司都有个漂亮的前台秘书,这个物流公司送来了大箱的物料,美女秘书当然不会自己搬,随便撒个娇一大群工科男都抢着干活。这里函数是写完成的回调函数,是在软中断中调用的,软中断跟美女秘书一样,干不了重活,只能简单地签收一下,剩下的活就由workqueue来完成了。
要继续我们的跟踪,那就得问workqueue是从哪里来的,workqueue做了什么,或者说对job做了什么?
flashcache_init=>INIT_WORK(&_kcached_wq, do_work);=>process_jobs(&_md_complete_jobs, flashcache_md_write_done);
先看process_jobs
/*
 * process_jobs - drain a job list.
 *
 * @jobs: list to take jobs from, one at a time, via pop().
 * @fn:   handler called once for every job taken off the list.
 *
 * Returns when pop() yields no further job.
 */
static void
process_jobs(struct list_head *jobs,
	     void (*fn) (struct kcached_job *))
{
	struct kcached_job *job;

	for (job = pop(jobs) ; job ; job = pop(jobs))
		(void)fn(job);
}

就是从队列中把刚才美女秘书签收的job取出来,然后调用fn,fn就是这里注册的flashcache_md_write_done。
从函数名有个蛋(done),就好像每天下午的5点半,一天的忙碌立马可以收工了,但是悲剧的LZ现在每个月都要加班72个小时,这样想想大家有没有从LZ的不幸中找到自己的幸福?
735void736flashcache_md_write_done(struct kcached_job *job)737{738struct cache_c *dmc = job->dmc;739struct cache_md_sector_head *md_sector_head;740int index;741unsigned long flags;742struct kcached_job *job_list;743int error = job->error;744struct kcached_job *next;745struct cacheblock *cacheblk;746747VERIFY(!in_interrupt());748VERIFY(job->action == WRITEDISK || job->action == WRITECACHE || 749       job->action == WRITEDISK_SYNC);750flashcache_free_md_sector(job);751job->md_sector = NULL;752md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];753job_list = job;754job->next = md_sector_head->md_io_inprog;755md_sector_head->md_io_inprog = NULL;756for (job = job_list ; job != NULL ; job = next) {757next = job->next;758job->error = error;759index = job->index;760cacheblk = &dmc->cache[index];761spin_lock_irqsave(&dmc->cache_spin_lock, flags);762if (job->action == WRITECACHE) {763if (unlikely(sysctl_flashcache_error_inject & WRITECACHE_MD_ERROR)) {764job->error = -EIO;765sysctl_flashcache_error_inject &= ~WRITECACHE_MD_ERROR;766}767if (likely(job->error == 0)) {768if ((cacheblk->cache_state & DIRTY) == 0) {769dmc->cache_sets[index / dmc->assoc].nr_dirty++;770dmc->nr_dirty++;771}772dmc->md_write_dirty++;773cacheblk->cache_state |= DIRTY;774} else775dmc->ssd_write_errors++;776flashcache_bio_endio(job->bio, job->error);777if (job->error || cacheblk->head) {778if (job->error) {779DMERR("flashcache: WRITE: Cache metadata write failed ! 
error %d block %lu", 780      -job->error, cacheblk->dbn);781}782spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);783flashcache_do_pending(job);784} else {785cacheblk->cache_state &= ~BLOCK_IO_INPROG;786spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);787flashcache_free_cache_job(job);788if (atomic_dec_and_test(&dmc->nr_jobs))789wake_up(&dmc->destroyq);790}791} else {792int action = job->action;793794if (unlikely(sysctl_flashcache_error_inject & WRITEDISK_MD_ERROR)) {795job->error = -EIO;796sysctl_flashcache_error_inject &= ~WRITEDISK_MD_ERROR;797}798/*799 * If we have an error on a WRITEDISK*, no choice but to preserve the 800 * dirty block in cache. Fail any IOs for this block that occurred while801 * the block was being cleaned.802 */803if (likely(job->error == 0)) {804dmc->md_write_clean++;805cacheblk->cache_state &= ~DIRTY;806VERIFY(dmc->cache_sets[index / dmc->assoc].nr_dirty > 0);807VERIFY(dmc->nr_dirty > 0);808dmc->cache_sets[index / dmc->assoc].nr_dirty--;809dmc->nr_dirty--;810} else 811dmc->ssd_write_errors++;812VERIFY(dmc->cache_sets[index / dmc->assoc].clean_inprog > 0);813VERIFY(dmc->clean_inprog > 0);814dmc->cache_sets[index / dmc->assoc].clean_inprog--;815dmc->clean_inprog--;816if (job->error || cacheblk->head) {817if (job->error) {818DMERR("flashcache: CLEAN: Cache metadata write failed ! 
error %d block %lu", 819      -job->error, cacheblk->dbn);820}821spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);822flashcache_do_pending(job);823/* Kick off more cleanings */824if (action == WRITEDISK)825flashcache_clean_set(dmc, index / dmc->assoc);826else827flashcache_sync_blocks(dmc);828} else {829cacheblk->cache_state &= ~BLOCK_IO_INPROG;830spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);831flashcache_free_cache_job(job);832if (atomic_dec_and_test(&dmc->nr_jobs))833wake_up(&dmc->destroyq);834/* Kick off more cleanings */835if (action == WRITEDISK)836flashcache_clean_set(dmc, index / dmc->assoc);837else838flashcache_sync_blocks(dmc);839}840dmc->cleanings++;841if (action == WRITEDISK_SYNC)842flashcache_update_sync_progress(dmc);843}844}845spin_lock_irqsave(&dmc->cache_spin_lock, flags);846if (md_sector_head->pending_jobs != NULL) {847/* peel off the first job from the pending queue and kick that off */848job = md_sector_head->pending_jobs;849md_sector_head->pending_jobs = job->next;850job->next = NULL;851spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);852VERIFY(job->action == WRITEDISK || job->action == WRITECACHE ||853       job->action == WRITEDISK_SYNC);854flashcache_md_write_kickoff(job);855} else {856md_sector_head->nr_in_prog = 0;857spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);858}859}860

首先是flashcache_free_md_sector,这个函数只是简单地把刚才分配的记录cacheblock 的page页释放。哪个刚才啊?就是flashcache_md_write_kickoff中flashcache_alloc_md_sector申请的page页。所以看这个函数时要回头再去看看flashcache_md_write_kickoff,所以前面提到了上下文,那么在这里kickoff是上文,done就是下文,上文种什么因,下文就得到什么果。上文申请了page页,下文就要释放page页;上文把dmc->md_sectors_buf[]中struct kcached_job  *md_io_inprog对应的kcached_job都已经下发了,下文这里才有一个for循环。细心的你可能会问,为什么这里的kcached_job可以一起下发?那首先要来了解一下这里的kcached_job是干什么的。先看结构体定义及其注释:
/*
 * We have one of these for *every* cache metadata sector, to keep track
 * of metadata ios in progress for blocks covered in this sector. Only
 * one metadata IO per sector can be in progress at any given point in
 * time
 */
struct cache_md_sector_head {
	u_int32_t		nr_in_prog;	/* non-zero while a metadata IO for this sector is in flight */
	struct kcached_job	*pending_jobs;	/* updates queued behind the IO in flight */
	struct kcached_job	*md_io_inprog;	/* updates folded into the IO in flight */
};

按规矩先看注释,每一个cache metadata扇区都对应一个cache_md_sector_head结构,用于跟踪该扇区所覆盖的cache块上正在进行的metadata IO(即把内存中的cacheblock metadata同步到SSD上的cache metadata扇区)。同一时刻每个扇区只能有一个metadata IO在进行,对应的是cache_md_sector_head->nr_in_prog。回答上面的问题,就是这些kcached_job是对应同一个扇区内的不同metadata的写,所以可以合并。这个扇区指的是SSD盘上存放flash_cacheblock结构的扇区。
再回到flashcache_md_write_done函数中,在for循环中job->action为WRITEDISK,所以直接来到for循环中else,迎面而来的又是一行注释,在WRITEDISK*发生错误时,只能保留cacheblock的DIRTY标志。接下来判断有错误或者cacheblock上还有pending_job,那么继续下发IO,否则的话清除cacheblock的处理标志,这里我们终于见到了kcached_job完成了他的使命,调用flashcache_free_cache_job将该结构返回给内存池。
似乎到这里我们就可以像童话里讲的“从此他们过上了幸福的生活”来结束kcached_job的介绍。然而回归资源池也意味着kcached_job的再生,接着判断action==WRITEDISK,调用flashcache_clean_set,将超过脏水平线的cache块刷回到磁盘。就是说在每次写磁盘返回的时候这个workqueue都会检查一下脏水平线,如果超过就继续往下刷,这就又回到了本文最开始的flashcache_dirty_writeback函数,真是因果联系,环环相扣,kcached_job的再生不是为了自己,而是为cacheblock的再生,所以说人不能只为自己活着,每个人只是万千轮回里的一个元素,都是为了成全其他元素而进入六道轮回。
下面一篇会从flashcache的数据结构和存储设计来分析。


原创粉丝点击