A Detailed Walkthrough of the Data Read Path in the Filesystem


I had long been puzzled about how sys_read, the bio layer, I/O scheduling, the hardware interrupt, the softirq, and the I/O completion notification fit together in the filesystem. By tracing the code I now have a reasonably clear picture of the whole process.

The function at the boundary between the kernel-side system call path and the bio layer is mpage_bio_submit (fs/mpage.c).

The function at the boundary between the bio layer and the I/O scheduler is __make_request (block/blk-core.c).

The function at the boundary between the I/O scheduler and the driver layer is __generic_unplug_device (block/blk-core.c).

The I/O return path:

Once the driver layer has finished the I/O, the completion method registered for the device (for example scsi_done or __scsi_done) calls blk_complete_request.

blk_complete_request is the last function that runs in the device driver's hardware interrupt context.

blk_complete_request raises the BLOCK_SOFTIRQ softirq.

As the completion notification propagates back up and reaches the bio layer, the function that gets called is the bio's end_io callback registered in mpage_bio_submit (mpage_end_io_read or mpage_end_io_write).
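
Putting these pieces together, the overall call chain for a buffered read looks roughly as follows. This is only a summary sketch: the upper VFS layers are not traced in this article and vary slightly by filesystem and kernel version.

    sys_read
      -> vfs_read -> generic_file_aio_read              (page cache miss triggers readahead)
      -> a_ops->readpages, e.g. mpage_readpages -> do_mpage_readpage
      -> mpage_bio_submit -> submit_bio -> generic_make_request
      -> q->make_request_fn == __make_request           (merge into an existing request or allocate one, insert into the elevator)
      -> __generic_unplug_device -> q->request_fn       (e.g. scsi_request_fn -> scsi_dispatch_cmd)
      -> hardware completes, driver interrupt handler -> scsi_done -> blk_complete_request
      -> BLOCK_SOFTIRQ -> q->softirq_done_fn -> ... -> bio_endio
      -> bio->bi_end_io == mpage_end_io_read            (SetPageUptodate + unlock_page wakes the reader)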


Code flow

fs/mpage.c:

static struct bio *do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
        sector_t *last_block_in_bio, struct buffer_head *map_bh,
        unsigned long *first_logical_block, get_block_t get_block)
{
    struct inode *inode = page->mapping->host;
    const unsigned blkbits = inode->i_blkbits;
    const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
    const unsigned blocksize = 1 << blkbits;
    sector_t block_in_file;
    sector_t last_block;
    sector_t last_block_in_file;
    sector_t blocks[MAX_BUF_PER_PAGE];
    unsigned page_block;
    unsigned first_hole = blocks_per_page;
    struct block_device *bdev = NULL;
    int length;
    int fully_mapped = 1;
    unsigned nblocks;
    unsigned relative_block;

    if (page_has_buffers(page))
        goto confused;

    block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
    last_block = block_in_file + nr_pages * blocks_per_page;
    last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
    if (last_block > last_block_in_file)
        last_block = last_block_in_file;
    page_block = 0;

    nblocks = map_bh->b_size >> blkbits;
    if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
            block_in_file < (*first_logical_block + nblocks)) {
        unsigned map_offset = block_in_file - *first_logical_block;
        unsigned last = nblocks - map_offset;

        for (relative_block = 0; ; relative_block++) {
            if (relative_block == last) {
                clear_buffer_mapped(map_bh);
                break;
            }
            if (page_block == blocks_per_page)
                break;
            blocks[page_block] = map_bh->b_blocknr + map_offset +
                        relative_block;
            page_block++;
            block_in_file++;
        }
        bdev = map_bh->b_bdev;
    }

    map_bh->b_page = page;
    while (page_block < blocks_per_page) {
        map_bh->b_state = 0;
        map_bh->b_size = 0;

        if (block_in_file < last_block) {
            map_bh->b_size = (last_block-block_in_file) << blkbits;
            if (get_block(inode, block_in_file, map_bh, 0))
                goto confused;
            *first_logical_block = block_in_file;
        }

        if (!buffer_mapped(map_bh)) {
            fully_mapped = 0;
            if (first_hole == blocks_per_page)
                first_hole = page_block;
            page_block++;
            block_in_file++;
            continue;
        }

        if (buffer_uptodate(map_bh)) {
            map_buffer_to_page(page, map_bh, page_block);
            goto confused;
        }

        if (first_hole != blocks_per_page)
            goto confused;    /* hole -> non-hole */

        if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
            goto confused;
        nblocks = map_bh->b_size >> blkbits;
        for (relative_block = 0; ; relative_block++) {
            if (relative_block == nblocks) {
                clear_buffer_mapped(map_bh);
                break;
            } else if (page_block == blocks_per_page)
                break;
            blocks[page_block] = map_bh->b_blocknr+relative_block;
            page_block++;
            block_in_file++;
        }
        bdev = map_bh->b_bdev;
    }

    if (first_hole != blocks_per_page) {
        zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
        if (first_hole == 0) {
            SetPageUptodate(page);
            unlock_page(page);
            goto out;
        }
    } else if (fully_mapped) {
        SetPageMappedToDisk(page);
    }

    /*
     * This page will go to BIO.  Do we need to send this BIO off first?
     */
    if (bio && (*last_block_in_bio != blocks[0] - 1))
        bio = mpage_bio_submit(READ, bio);

alloc_new:
    if (bio == NULL) {
        bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
                min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
                GFP_KERNEL);
        if (bio == NULL)
            goto confused;
    }

    length = first_hole << blkbits;
    if (bio_add_page(bio, page, length, 0) < length) {
        bio = mpage_bio_submit(READ, bio);
        goto alloc_new;
    }

    relative_block = block_in_file - *first_logical_block;
    nblocks = map_bh->b_size >> blkbits;
    if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
        (first_hole != blocks_per_page))
        bio = mpage_bio_submit(READ, bio);
    else
        *last_block_in_bio = blocks[blocks_per_page - 1];
out:
    return bio;

confused:
    if (bio)
        bio = mpage_bio_submit(READ, bio);
    if (!PageUptodate(page))
        block_read_full_page(page, get_block);
    else
        unlock_page(page);
    goto out;
}

do_mpage_readpage mainly converts the page into a bio (or adds it to the bio being accumulated).
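
For context, do_mpage_readpage is driven by the filesystem's ->readpages address_space operation during readahead. Below is a simplified sketch of its caller mpage_readpages, close to but not literally the fs/mpage.c source of this kernel generation (error paths and a few details omitted); it shows how one bio is accumulated across pages and finally flushed with mpage_bio_submit.

    int mpage_readpages(struct address_space *mapping, struct list_head *pages,
                unsigned nr_pages, get_block_t get_block)
    {
        struct bio *bio = NULL;
        sector_t last_block_in_bio = 0;
        struct buffer_head map_bh;
        unsigned long first_logical_block = 0;
        unsigned page_idx;

        map_bh.b_state = 0;
        map_bh.b_size = 0;
        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
            struct page *page = list_entry(pages->prev, struct page, lru);

            list_del(&page->lru);
            if (!add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL))
                /* accumulate this page into the current bio, or submit it
                 * and start a new one if the blocks are not contiguous */
                bio = do_mpage_readpage(bio, page, nr_pages - page_idx,
                        &last_block_in_bio, &map_bh,
                        &first_logical_block, get_block);
            page_cache_release(page);
        }
        if (bio)
            /* flush whatever is still pending */
            bio = mpage_bio_submit(READ, bio);
        return 0;
    }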

Within do_mpage_readpage the functions to focus on are mpage_bio_submit and block_read_full_page.
fs/buffer.c:

int block_read_full_page(struct page *page, get_block_t *get_block)
{
    struct inode *inode = page->mapping->host;
    sector_t iblock, lblock;
    struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
    unsigned int blocksize;
    int nr, i;
    int fully_mapped = 1;

    BUG_ON(!PageLocked(page));
    blocksize = 1 << inode->i_blkbits;
    if (!page_has_buffers(page))
        create_empty_buffers(page, blocksize, 0);
    head = page_buffers(page);

    iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
    lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
    bh = head;
    nr = 0;
    i = 0;

    do {
        if (buffer_uptodate(bh))
            continue;

        if (!buffer_mapped(bh)) {
            int err = 0;

            fully_mapped = 0;
            if (iblock < lblock) {
                WARN_ON(bh->b_size != blocksize);
                err = get_block(inode, iblock, bh, 0);
                if (err)
                    SetPageError(page);
            }
            if (!buffer_mapped(bh)) {
                zero_user(page, i * blocksize, blocksize);
                if (!err)
                    set_buffer_uptodate(bh);
                continue;
            }
            if (buffer_uptodate(bh))
                continue;
        }
        arr[nr++] = bh;
    } while (i++, iblock++, (bh = bh->b_this_page) != head);

    if (fully_mapped)
        SetPageMappedToDisk(page);

    if (!nr) {
        if (!PageError(page))
            SetPageUptodate(page);
        unlock_page(page);
        return 0;
    }

    /* Stage two: lock the buffers */
    for (i = 0; i < nr; i++) {
        bh = arr[i];
        lock_buffer(bh);
        mark_buffer_async_read(bh);
    }

    for (i = 0; i < nr; i++) {
        bh = arr[i];
        if (buffer_uptodate(bh))
            end_buffer_async_read(bh, 1);
        else
            submit_bh(READ, bh);
    }
    return 0;
}
Within block_read_full_page, focus on submit_bh.

fs/buffer.c:

int submit_bh(int rw, struct buffer_head *bh)
{
    struct bio *bio;
    int ret = 0;

    if (buffer_ordered(bh) && (rw & WRITE))
        rw |= WRITE_BARRIER;

    if (test_set_buffer_req(bh) && (rw & WRITE))
        clear_buffer_write_io_error(bh);

    bio = bio_alloc(GFP_NOIO, 1);

    bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
    bio->bi_bdev = bh->b_bdev;
    bio->bi_io_vec[0].bv_page = bh->b_page;
    bio->bi_io_vec[0].bv_len = bh->b_size;
    bio->bi_io_vec[0].bv_offset = bh_offset(bh);

    bio->bi_vcnt = 1;
    bio->bi_idx = 0;
    bio->bi_size = bh->b_size;

    bio->bi_end_io = end_bio_bh_io_sync;
    bio->bi_private = bh;

    bio_get(bio);
    submit_bio(rw, bio);

    if (bio_flagged(bio, BIO_EOPNOTSUPP))
        ret = -EOPNOTSUPP;

    bio_put(bio);
    return ret;
}
do_mpage_readpage then calls mpage_bio_submit:

static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
    bio->bi_end_io = mpage_end_io_read;
    if (rw == WRITE)
        bio->bi_end_io = mpage_end_io_write;
    submit_bio(rw, bio);
    return NULL;
}
block/blk-core.c:

void submit_bio(int rw, struct bio *bio)
{
    int count = bio_sectors(bio);

    bio->bi_rw |= rw;

    if (bio_has_data(bio)) {
        if (rw & WRITE) {
            count_vm_events(PGPGOUT, count);
        } else {
            task_io_account_read(bio->bi_size);
            count_vm_events(PGPGIN, count);
        }

        if (unlikely(block_dump)) {
            char b[BDEVNAME_SIZE];
            printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
                current->comm, task_pid_nr(current),
                (rw & WRITE) ? "WRITE" : "READ",
                (unsigned long long)bio->bi_sector,
                bdevname(bio->bi_bdev, b));
        }
    }

    generic_make_request(bio);
}
generic_make_request hands the request down to the I/O scheduling layer.

void generic_make_request(struct bio *bio)
{
    if (current->bio_tail) {
        /* make_request is active */
        *(current->bio_tail) = bio;
        bio->bi_next = NULL;
        current->bio_tail = &bio->bi_next;
        return;
    }
    BUG_ON(bio->bi_next);
    do {
        current->bio_list = bio->bi_next;
        if (bio->bi_next == NULL)
            current->bio_tail = &current->bio_list;
        else
            bio->bi_next = NULL;
        __generic_make_request(bio);
        bio = current->bio_list;
    } while (bio);
    current->bio_tail = NULL; /* deactivate */
}

static inline void __generic_make_request(struct bio *bio)
{
    struct request_queue *q;
    sector_t old_sector;
    int ret, nr_sectors = bio_sectors(bio);
    dev_t old_dev;
    int err = -EIO;

    might_sleep();

    if (bio_check_eod(bio, nr_sectors))
        goto end_io;

    old_sector = -1;
    old_dev = 0;
    do {
        char b[BDEVNAME_SIZE];

        q = bdev_get_queue(bio->bi_bdev);
        if (unlikely(!q)) {
            goto end_io;
        }

        if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) &&
                 nr_sectors > queue_max_hw_sectors(q))) {
            goto end_io;
        }

        if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
            goto end_io;

        if (should_fail_request(bio))
            goto end_io;

        blk_partition_remap(bio);

        if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
            goto end_io;

        if (old_sector != -1)
            trace_block_remap(q, bio, old_dev, old_sector);

        old_sector = bio->bi_sector;
        old_dev = bio->bi_bdev->bd_dev;

        if (bio_check_eod(bio, nr_sectors))
            goto end_io;

        if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
            !blk_queue_discard(q)) {
            err = -EOPNOTSUPP;
            goto end_io;
        }

        trace_block_bio_queue(q, bio);

        ret = q->make_request_fn(q, bio);
    } while (ret);

    return;

end_io:
    bio_endio(bio, err);
}

So when does make_request_fn get set?

The thing to track is how a request travels from page => bh => bio => request => elevator; a minimal driver-side sketch of where request_fn comes from follows the kernel code below.

void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
{
    q->nr_requests = BLKDEV_MAX_RQ;

    q->make_request_fn = mfn;
    blk_queue_dma_alignment(q, 511);
    blk_queue_congestion_threshold(q);
    q->nr_batching = BLK_BATCH_REQ;

    q->unplug_thresh = 4;    /* hmm */
    q->unplug_delay = (3 * HZ) / 1000;    /* 3 milliseconds */
    if (q->unplug_delay == 0)
        q->unplug_delay = 1;

    q->unplug_timer.function = blk_unplug_timeout;
    q->unplug_timer.data = (unsigned long)q;

    blk_set_default_limits(&q->limits);
    blk_queue_max_sectors(q, SAFE_MAX_SECTORS);

    if (!q->queue_lock)
        q->queue_lock = &q->__queue_lock;

    blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}

block/blk-core.c:

struct request_queue *blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
    struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);

    if (!q)
        return NULL;

    q->node = node_id;
    if (blk_init_free_list(q)) {
        kmem_cache_free(blk_requestq_cachep, q);
        return NULL;
    }

    q->request_fn   = rfn;
    q->prep_rq_fn   = NULL;
    q->unplug_fn    = generic_unplug_device;
    q->queue_flags  = QUEUE_FLAG_DEFAULT;
    q->queue_lock   = lock;

    blk_queue_make_request(q, __make_request);

    q->sg_reserved_size = INT_MAX;

    if (!elevator_init(q, NULL)) {
        blk_queue_congestion_threshold(q);
        return q;
    }

    blk_put_queue(q);
    return NULL;
}

block/blk-core.c:

static int __make_request(struct request_queue *q, struct bio *bio)
{
    struct request *req;
    int el_ret;
    unsigned int bytes = bio->bi_size;
    const unsigned short prio = bio_prio(bio);
    const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
    const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
    const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
    int rw_flags;

    if (bio_rw_flagged(bio, BIO_RW_BARRIER) &&
        (q->next_ordered == QUEUE_ORDERED_NONE)) {
        bio_endio(bio, -EOPNOTSUPP);
        return 0;
    }

    blk_queue_bounce(q, &bio);

    spin_lock_irq(q->queue_lock);

    if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
        goto get_rq;

    el_ret = elv_merge(q, &req, bio);
    switch (el_ret) {
    case ELEVATOR_BACK_MERGE:
        BUG_ON(!rq_mergeable(req));

        if (!ll_back_merge_fn(q, req, bio))
            break;

        trace_block_bio_backmerge(q, bio);

        if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
            blk_rq_set_mixed_merge(req);

        req->biotail->bi_next = bio;
        req->biotail = bio;
        req->__data_len += bytes;
        req->ioprio = ioprio_best(req->ioprio, prio);
        if (!blk_rq_cpu_valid(req))
            req->cpu = bio->bi_comp_cpu;
        drive_stat_acct(req, 0);
        if (!attempt_back_merge(q, req))
            elv_merged_request(q, req, el_ret);
        goto out;

    case ELEVATOR_FRONT_MERGE:
        BUG_ON(!rq_mergeable(req));

        if (!ll_front_merge_fn(q, req, bio))
            break;

        trace_block_bio_frontmerge(q, bio);

        if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
            blk_rq_set_mixed_merge(req);
            req->cmd_flags &= ~REQ_FAILFAST_MASK;
            req->cmd_flags |= ff;
        }

        bio->bi_next = req->bio;
        req->bio = bio;

        req->buffer = bio_data(bio);
        req->__sector = bio->bi_sector;
        req->__data_len += bytes;
        req->ioprio = ioprio_best(req->ioprio, prio);
        if (!blk_rq_cpu_valid(req))
            req->cpu = bio->bi_comp_cpu;
        drive_stat_acct(req, 0);
        if (!attempt_front_merge(q, req))
            elv_merged_request(q, req, el_ret);
        goto out;

    default:
        ;
    }

get_rq:
    rw_flags = bio_data_dir(bio);
    if (sync)
        rw_flags |= REQ_RW_SYNC;

    req = get_request_wait(q, rw_flags, bio);

    init_request_from_bio(req, bio);

    spin_lock_irq(q->queue_lock);
    if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
        bio_flagged(bio, BIO_CPU_AFFINE))
        req->cpu = blk_cpu_to_group(smp_processor_id());
    if (queue_should_plug(q) && elv_queue_empty(q))
        blk_plug_device(q);
    add_request(q, req);
out:
    if (unplug || !queue_should_plug(q))
        __generic_unplug_device(q);
    spin_unlock_irq(q->queue_lock);
    return 0;
}
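
From the driver's side, a conventional request_fn-based block driver gets __make_request installed for it when it calls blk_init_queue (which ends up in blk_init_queue_node above) and only has to supply its own request_fn. The fragment below is a hypothetical, minimal sketch for illustration: my_request_fn, my_lock and my_driver_init are made-up names, and a real driver would hand the request to the hardware and complete it later from its interrupt handler instead of completing it inline.

    #include <linux/module.h>
    #include <linux/blkdev.h>
    #include <linux/spinlock.h>

    static struct request_queue *my_queue;
    static DEFINE_SPINLOCK(my_lock);

    /* Called by the block layer with q->queue_lock held when the queue is unplugged. */
    static void my_request_fn(struct request_queue *q)
    {
        struct request *rq;

        while ((rq = blk_fetch_request(q)) != NULL) {
            /* A real driver would program the hardware here and complete the
             * request from its IRQ handler via blk_complete_request(). */
            __blk_end_request_all(rq, 0);   /* complete immediately, for the sketch only */
        }
    }

    static int __init my_driver_init(void)
    {
        /* blk_init_queue -> blk_init_queue_node: sets q->request_fn = my_request_fn,
         * q->make_request_fn = __make_request, and runs elevator_init(). */
        my_queue = blk_init_queue(my_request_fn, &my_lock);
        return my_queue ? 0 : -ENOMEM;
    }
    module_init(my_driver_init);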
Pay particular attention to add_request and __generic_unplug_device.

add_request runs the insertion path of the concrete elevator (I/O scheduling) algorithm; a minimal elevator_add_req_fn example follows the code below.

block/blk-core.c:

static inline void add_request(struct request_queue *q, struct request *req)
{
    drive_stat_acct(req, 1);

    __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
}

block/elevator.c:

void __elv_add_request(struct request_queue *q, struct request *rq, int where,
               int plug)
{
    if (q->ordcolor)
        rq->cmd_flags |= REQ_ORDERED_COLOR;

    if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
        if (blk_barrier_rq(rq))
            q->ordcolor ^= 1;

        if (where == ELEVATOR_INSERT_SORT)
            where = ELEVATOR_INSERT_BACK;

        if (blk_fs_request(rq) || blk_discard_rq(rq)) {
            q->end_sector = rq_end_sector(rq);
            q->boundary_rq = rq;
        }
    } else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
            where == ELEVATOR_INSERT_SORT)
        where = ELEVATOR_INSERT_BACK;

    if (plug)
        blk_plug_device(q);

    elv_insert(q, rq, where);
}

block/elevator.c:

void elv_insert(struct request_queue *q, struct request *rq, int where)
{
    struct list_head *pos;
    unsigned ordseq;
    int unplug_it = 1;

    trace_block_rq_insert(q, rq);

    rq->q = q;

    switch (where) {
    case ELEVATOR_INSERT_FRONT:
        rq->cmd_flags |= REQ_SOFTBARRIER;

        list_add(&rq->queuelist, &q->queue_head);
        break;

    case ELEVATOR_INSERT_BACK:
        rq->cmd_flags |= REQ_SOFTBARRIER;
        elv_drain_elevator(q);
        list_add_tail(&rq->queuelist, &q->queue_head);
        __blk_run_queue(q);
        break;

    case ELEVATOR_INSERT_SORT:
        BUG_ON(!blk_fs_request(rq) && !blk_discard_rq(rq));
        rq->cmd_flags |= REQ_SORTED;
        q->nr_sorted++;
        if (rq_mergeable(rq)) {
            elv_rqhash_add(q, rq);
            if (!q->last_merge)
                q->last_merge = rq;
        }
        q->elevator->ops->elevator_add_req_fn(q, rq);
        break;

    case ELEVATOR_INSERT_REQUEUE:
        rq->cmd_flags |= REQ_SOFTBARRIER;
        unplug_it = 0;

        if (q->ordseq == 0) {
            list_add(&rq->queuelist, &q->queue_head);
            break;
        }

        ordseq = blk_ordered_req_seq(rq);

        list_for_each(pos, &q->queue_head) {
            struct request *pos_rq = list_entry_rq(pos);
            if (ordseq <= blk_ordered_req_seq(pos_rq))
                break;
        }

        list_add_tail(&rq->queuelist, pos);
        break;

    default:
        printk(KERN_ERR "%s: bad insertion point %d\n",
               __func__, where);
        BUG();
    }

    if (unplug_it && blk_queue_plugged(q)) {
        int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
            - queue_in_flight(q);

        if (nrq >= q->unplug_thresh)
            __generic_unplug_device(q);
    }
}
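
The ELEVATOR_INSERT_SORT case ends in q->elevator->ops->elevator_add_req_fn(q, rq), the scheduler-specific insertion hook. The simplest concrete example is the noop scheduler; roughly as in block/noop-iosched.c of this kernel generation, its add_req_fn just appends the request to a private FIFO list:

    struct noop_data {
        struct list_head queue;
    };

    static void noop_add_request(struct request_queue *q, struct request *rq)
    {
        struct noop_data *nd = q->elevator->elevator_data;

        /* No sorting and no extra merging: queue the request in FIFO order.
         * The dispatch hook later moves it onto q->queue_head for request_fn. */
        list_add_tail(&rq->queuelist, &nd->queue);
    }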

Taking requests back out of the I/O scheduling layer is the job of __generic_unplug_device.

block/blk-core.c:

void __generic_unplug_device(struct request_queue *q)
{
    if (unlikely(blk_queue_stopped(q)))
        return;
    if (!blk_remove_plug(q) && !blk_queue_nonrot(q))
        return;

    q->request_fn(q);    /* the device's request function, e.g. for a SCSI device */
}
request_fn is a device-specific function; in the SCSI case, for example, it ultimately sends the SCSI command to the device via scsi_dispatch_cmd.
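
Sticking with SCSI as the example: scsi_request_fn pulls requests off the queue, prepares them as scsi_cmnd structures and issues them through scsi_dispatch_cmd to the low-level driver. When the completion interrupt arrives, the low-level driver invokes the command's done callback, which in this kernel generation boils down to something like the following (a simplified sketch of drivers/scsi/scsi.c, not the literal source):

    /*
     * The low-level driver's interrupt handler calls cmd->scsi_done(cmd),
     * which the SCSI midlayer pointed at scsi_done() when dispatching the command.
     */
    static void scsi_done(struct scsi_cmnd *cmd)
    {
        /* Still in hardware interrupt context: defer the remaining completion
         * work for this request to BLOCK_SOFTIRQ. */
        blk_complete_request(cmd->request);
    }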

So how do we find out that the I/O request has completed?

The hardware drivers provide their own I/O completion functions, and these ultimately all end up calling blk_complete_request.

block/blk-softirq.c:

void blk_complete_request(struct request *req)
{
    if (unlikely(blk_should_fake_timeout(req->q)))
        return;
    if (!blk_mark_rq_complete(req))
        __blk_complete_request(req);
}

block/blk-softirq.c:

void __blk_complete_request(struct request *req)
{
    struct request_queue *q = req->q;
    unsigned long flags;
    int ccpu, cpu, group_cpu;

    BUG_ON(!q->softirq_done_fn);

    local_irq_save(flags);
    cpu = smp_processor_id();
    group_cpu = blk_cpu_to_group(cpu);

    /*
     * Select completion CPU
     */
    if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
        ccpu = req->cpu;
    else
        ccpu = cpu;

    if (ccpu == cpu || ccpu == group_cpu) {
        struct list_head *list;
do_local:
        list = &__get_cpu_var(blk_cpu_done);
        list_add_tail(&req->csd.list, list);

        if (list->next == &req->csd.list)
            raise_softirq_irqoff(BLOCK_SOFTIRQ);
    } else if (raise_blk_irq(ccpu, req))
        goto do_local;

    local_irq_restore(flags);
}
blk_complete_request is the last function to run in hardware interrupt context; it hands the rest of the completion work over to the BLOCK_SOFTIRQ softirq.
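
The softirq side of this hand-off is blk_done_softirq (block/blk-softirq.c). Roughly (a sketch close to, but not guaranteed identical to, the source of this kernel generation), it drains the per-CPU blk_cpu_done list that __blk_complete_request appended to and calls the queue's softirq_done_fn for each request; for SCSI that is scsi_softirq_done, which eventually completes the request and its bios:

    static void blk_done_softirq(struct softirq_action *h)
    {
        struct list_head *cpu_list, local_list;

        /* Atomically take over this CPU's completion list. */
        local_irq_disable();
        cpu_list = &__get_cpu_var(blk_cpu_done);
        list_replace_init(cpu_list, &local_list);
        local_irq_enable();

        while (!list_empty(&local_list)) {
            struct request *rq;

            rq = list_entry(local_list.next, struct request, csd.list);
            list_del_init(&rq->csd.list);
            /* Per-queue completion handler, e.g. scsi_softirq_done for SCSI;
             * this path ends in the request/bio completion code -> bio_endio(). */
            rq->q->softirq_done_fn(rq);
        }
    }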

After the request has been handed to the softirq and the completion path has run (softirq_done_fn -> request/bio completion -> bio_endio), the bio's end_io callback that was registered in mpage_bio_submit is executed: mpage_end_io_read for reads or mpage_end_io_write for writes.
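
For completeness, mpage_end_io_read itself is short: for every page carried by the bio it marks the page up to date (or records an error) and unlocks it, which is what finally wakes up the reader waiting on the page in the page cache. A sketch close to the fs/mpage.c version of this era (check your tree for the exact code):

    static void mpage_end_io_read(struct bio *bio, int err)
    {
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

        /* Walk the bio's pages from the last segment backwards. */
        do {
            struct page *page = bvec->bv_page;

            if (--bvec >= bio->bi_io_vec)
                prefetchw(&bvec->bv_page->flags);

            if (uptodate) {
                SetPageUptodate(page);
            } else {
                ClearPageUptodate(page);
                SetPageError(page);
            }
            unlock_page(page);    /* readers blocked in lock_page()/wait_on_page_locked() resume */
        } while (bvec >= bio->bi_io_vec);
        bio_put(bio);
    }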













   

