Block IO Layer Framework Analysis, Part 2


1. The function call chain for file reads in the kernel

sys_read
 -> vfs_read
 -> do_sync_read
 -> f_op->aio_read
 -> generic_file_aio_read
 -> do_generic_file_read
 -> mpage_readpage
 -> do_mpage_readpage
 -> mpage_bio_submit
 -> submit_bio
 -> generic_make_request
 -> __generic_make_request
 -> q->make_request_fn
 -> __make_request
 -> q->request_fn
 -> end_request (completion path)
 -> mpage_end_io_read (completion path)

Just reading one file involves a call chain this long, and the device driver still sits below it; if the device is a USB flash drive, the SCSI and USB frameworks come into play as well, so the complexity is considerable. It is also a good illustration of the kernel's layered design. Today we start from submit_bio.

void submit_bio(int rw, struct bio *bio)
{
        int count = bio_sectors(bio);

        bio->bi_rw |= rw;

        /*
         * If it's a regular read/write or a barrier with data attached,
         * go through the normal accounting stuff before submission.
         */
        if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {
                if (rw & WRITE) {
                        count_vm_events(PGPGOUT, count);
                } else {
                        task_io_account_read(bio->bi_size);
                        count_vm_events(PGPGIN, count);
                }

                if (unlikely(block_dump)) {
                        char b[BDEVNAME_SIZE];
                        printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
                               current->comm, task_pid_nr(current),
                               (rw & WRITE) ? "WRITE" : "READ",
                               (unsigned long long)bio->bi_sector,
                               bdevname(bio->bi_bdev, b),
                               count);
                }
        }

        generic_make_request(bio);
}
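
For context on the caller's side, here is a minimal sketch (not from the original article) of how a filesystem-style caller might build a one-page read bio and hand it to submit_bio, assuming the same kernel generation as the code above (bio->bi_sector, two-argument submit_bio); my_read_page and my_end_io are hypothetical names:

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/pagemap.h>

/* Completion callback: runs once the device finishes the IO.
 * Error handling is elided in this sketch. */
static void my_end_io(struct bio *bio, int err)
{
        unlock_page(bio->bi_io_vec[0].bv_page);
        bio_put(bio);
}

static void my_read_page(struct block_device *bdev, sector_t sector,
                         struct page *page)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);   /* room for one bio_vec */

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;        /* start sector on the device */
        bio->bi_end_io = my_end_io;     /* called from bio_endio() */

        if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
                bio_put(bio);           /* queue refused the page */
                return;
        }

        submit_bio(READ, bio);          /* enters generic_make_request() */
}
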
void generic_make_request(struct bio *bio)
{
        struct bio_list bio_list_on_stack;

        if (current->bio_list) {
                /* make_request is active: the current task is already
                 * submitting bios, so just queue this one on its
                 * pending list */
                bio_list_add(current->bio_list, bio);
                return;
        }

        /* following loop may be a bit non-obvious, and so deserves some
         * explanation.
         * Before entering the loop, bio->bi_next is NULL (as all callers
         * ensure that) so we have a list with a single bio.
         * We pretend that we have just taken it off a longer list, so
         * we assign bio_list to a pointer to the bio_list_on_stack,
         * thus initialising the bio_list of new bios to be
         * added.  __generic_make_request may indeed add some more bios
         * through a recursive call to generic_make_request.  If it
         * did, we find a non-NULL value in bio_list and re-enter the loop
         * from the top.  In this case we really did just take the bio
         * of the top of the list (no pretending) and so remove it from
         * bio_list, and call into __generic_make_request again.
         *
         * The loop was structured like this to make only one call to
         * __generic_make_request (which is important as it is large and
         * inlined) and to keep the structure simple.
         */
        BUG_ON(bio->bi_next);   /* callers must guarantee bio->bi_next is NULL */
        bio_list_init(&bio_list_on_stack);
        current->bio_list = &bio_list_on_stack;
        do {
                __generic_make_request(bio);
                bio = bio_list_pop(current->bio_list);
        } while (bio);
        current->bio_list = NULL; /* deactivate */
}
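
The bio_list_on_stack machinery exists because a stacking driver (dm, md) may call generic_make_request recursively from inside its own make_request_fn. A hedged sketch of such a pass-through make_request_fn for this kernel generation, registered with blk_queue_make_request(); my_remap_make_request, lower_bdev and lower_offset are hypothetical:

#include <linux/bio.h>
#include <linux/blkdev.h>

static struct block_device *lower_bdev;   /* the device below us (hypothetical) */
static sector_t lower_offset;             /* hypothetical sector offset */

/* Registered via blk_queue_make_request(q, my_remap_make_request). */
static int my_remap_make_request(struct request_queue *q, struct bio *bio)
{
        /* redirect the bio to the underlying device ... */
        bio->bi_bdev = lower_bdev;
        bio->bi_sector += lower_offset;

        /* ... and resubmit it. Because we are already inside
         * generic_make_request(), this call only appends the bio to
         * current->bio_list; the outer do/while loop picks it up, so
         * the stack depth stays bounded however deep the device stack is. */
        generic_make_request(bio);

        return 0;   /* 0 = mapping resolved; __generic_make_request stops looping */
}
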
static inline void __generic_make_request(struct bio *bio)
{
        struct request_queue *q;
        sector_t old_sector;
        int ret, nr_sectors = bio_sectors(bio);
        dev_t old_dev;
        int err = -EIO;

        might_sleep();

        if (bio_check_eod(bio, nr_sectors))
                goto end_io;

        /*
         * Resolve the mapping until finished. (drivers are
         * still free to implement/resolve their own stacking
         * by explicitly returning 0)
         *
         * NOTE: we don't repeat the blk_size check for each new device.
         * Stacking drivers are expected to know what they are doing.
         */
        old_sector = -1;
        old_dev = 0;
        do {
                char b[BDEVNAME_SIZE];
                struct hd_struct *part;

                q = bdev_get_queue(bio->bi_bdev);
                if (unlikely(!q)) {
                        printk(KERN_ERR
                               "generic_make_request: Trying to access "
                               "nonexistent block-device %s (%Lu)\n",
                               bdevname(bio->bi_bdev, b),
                               (long long) bio->bi_sector);
                        goto end_io;
                }

                if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
                             nr_sectors > queue_max_hw_sectors(q))) {
                        printk(KERN_ERR "bio too big device %s (%u > %u)\n",
                               bdevname(bio->bi_bdev, b),
                               bio_sectors(bio),
                               queue_max_hw_sectors(q));
                        goto end_io;
                }

                if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
                        goto end_io;

                part = bio->bi_bdev->bd_part;
                if (should_fail_request(part, bio->bi_size) ||
                    should_fail_request(&part_to_disk(part)->part0,
                                        bio->bi_size))
                        goto end_io;

                /*
                 * If this device has partitions, remap block n
                 * of partition p to block n+start(p) of the disk.
                 */
                blk_partition_remap(bio);       /* remap a partition-relative bio onto the whole disk */

                if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
                        goto end_io;

                if (old_sector != -1)
                        trace_block_bio_remap(q, bio, old_dev, old_sector);

                old_sector = bio->bi_sector;
                old_dev = bio->bi_bdev->bd_dev;

                if (bio_check_eod(bio, nr_sectors))
                        goto end_io;

                /*
                 * Filter flush bio's early so that make_request based
                 * drivers without flush support don't have to worry
                 * about them.
                 */
                if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
                        bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
                        if (!nr_sectors) {
                                err = 0;
                                goto end_io;
                        }
                }

                if ((bio->bi_rw & REQ_DISCARD) &&
                    (!blk_queue_discard(q) ||
                     ((bio->bi_rw & REQ_SECURE) &&
                      !blk_queue_secdiscard(q)))) {
                        err = -EOPNOTSUPP;
                        goto end_io;
                }

                if (blk_throtl_bio(q, &bio))
                        goto end_io;

                /*
                 * If bio = NULL, bio has been throttled and will be submitted
                 * later.
                 */
                if (!bio)
                        break;

                trace_block_bio_queue(q, bio);

                /* invoke the request queue's make_request_fn callback */
                ret = q->make_request_fn(q, bio);
        } while (ret);

        return;

end_io:
        bio_endio(bio, err);
}
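
The partition remap mentioned in the comment is plain arithmetic. A simplified sketch of what blk_partition_remap effectively does in this kernel generation (accounting and tracing omitted):

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>

/* Simplified sketch: shift a partition-relative bio onto the whole disk. */
static void partition_remap_sketch(struct bio *bio)
{
        struct block_device *bdev = bio->bi_bdev;

        if (bio_sectors(bio) && bdev != bdev->bd_contains) {
                struct hd_struct *p = bdev->bd_part;

                bio->bi_sector += p->start_sect;   /* block n -> n + start(p) */
                bio->bi_bdev = bdev->bd_contains;  /* now points at the whole disk */
        }
}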

Where are the hook functions of a SCSI device's request_queue initialized?

scsi_alloc_sdev->scsi_alloc_queue->blk_init_queue

blk_init_queue initializes the make_request_fn callback to __make_request and sets request_fn to scsi_request_fn.


Where are the hook functions of an mtdblock device's request_queue initialized?

add_mtd_blktrans_dev->blk_init_queue

Again make_request_fn is initialized to __make_request, and request_fn to mtd_blktrans_request.


And where are the hook functions of an mmcblock device's request_queue initialized?

mmc_blk_alloc_req->mmc_init_queue->blk_init_queue

Once more make_request_fn is initialized to __make_request, and request_fn to mmc_request.
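
All three paths converge on blk_init_queue, which is how any request-based driver of this era wires in its strategy routine. A minimal sketch of such a registration (my_request_fn, my_lock and the hardware steps are hypothetical):

#include <linux/blkdev.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);

/* Called with q->queue_lock held; drains the dispatch queue, much like
 * scsi_request_fn below. */
static void my_request_fn(struct request_queue *q)
{
        struct request *req;

        while ((req = blk_fetch_request(q)) != NULL) {
                /* ... program the hardware for req here ... */
                __blk_end_request_all(req, 0);   /* complete with success */
        }
}

static struct request_queue *my_init_queue(void)
{
        /* installs __make_request as make_request_fn and records
         * my_request_fn as q->request_fn */
        return blk_init_queue(my_request_fn, &my_lock);
}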


Next, let's analyze how __make_request sorts and merges bios and slots them into request structures.

static int __make_request(struct request_queue *q, struct bio *bio)
{
        const bool sync = !!(bio->bi_rw & REQ_SYNC);
        struct blk_plug *plug;
        int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
        struct request *req;
        unsigned int request_count = 0;

        /*
         * low level driver can indicate that it wants pages above a
         * certain limit bounced to low memory (ie for highmem, or even
         * ISA dma in theory)
         */
        blk_queue_bounce(q, &bio);      /* set up a bounce buffer if needed */

        if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
                spin_lock_irq(q->queue_lock);
                where = ELEVATOR_INSERT_FLUSH;
                goto get_rq;
        }

        /*
         * Check if we can merge with the plugged list before grabbing
         * any locks.
         */
        if (attempt_plug_merge(current, q, bio, &request_count))
                goto out;

        spin_lock_irq(q->queue_lock);

        el_ret = elv_merge(q, &req, bio);       /* let the IO scheduler look for a merge */
        if (el_ret == ELEVATOR_BACK_MERGE) {    /* bio can be merged at the tail of some request */
                if (bio_attempt_back_merge(q, req, bio)) {
                        if (!attempt_back_merge(q, req))
                                elv_merged_request(q, req, el_ret);
                        goto out_unlock;
                }
        } else if (el_ret == ELEVATOR_FRONT_MERGE) {    /* bio can be merged at the head of some request */
                if (bio_attempt_front_merge(q, req, bio)) {
                        if (!attempt_front_merge(q, req))
                                elv_merged_request(q, req, el_ret);
                        goto out_unlock;
                }
        }

        /* no mergeable request was found in the queue */
get_rq:
        /*
         * This sync check and mask will be re-done in init_request_from_bio(),
         * but we need to set it earlier to expose the sync flag to the
         * rq allocator and io schedulers.
         */
        rw_flags = bio_data_dir(bio);
        if (sync)
                rw_flags |= REQ_SYNC;

        /*
         * Grab a free request. This might sleep but cannot fail.
         * Returns with the queue unlocked.
         */
        req = get_request_wait(q, rw_flags, bio);       /* allocate a request */

        /*
         * After dropping the lock and possibly sleeping here, our request
         * may now be mergeable after it had proven unmergeable (above).
         * We don't worry about that case for efficiency. It won't happen
         * often, and the elevators are able to handle it.
         */
        init_request_from_bio(req, bio);        /* initialize the request from the bio */

        if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
            bio_flagged(bio, BIO_CPU_AFFINE))
                req->cpu = raw_smp_processor_id();

        plug = current->plug;
        if (plug) {
                /*
                 * If this is the first request added after a plug, fire
                 * of a plug trace. If others have been added before, check
                 * if we have multiple devices in this plug. If so, make a
                 * note to sort the list before dispatch.
                 */
                if (list_empty(&plug->list))
                        trace_block_plug(q);
                else if (!plug->should_sort) {
                        struct request *__rq;

                        __rq = list_entry_rq(plug->list.prev);
                        if (__rq->q != q)
                                plug->should_sort = 1;
                }
                if (request_count >= BLK_MAX_REQUEST_COUNT)
                        blk_flush_plug_list(plug, false);       /* request count hit the limit: unplug now */
                list_add_tail(&req->queuelist, &plug->list);
                drive_stat_acct(req, 1);
        } else {
                spin_lock_irq(q->queue_lock);
                add_acct_request(q, req, where);        /* hand the request to the IO scheduler */
                __blk_run_queue(q);                     /* unplug: run the queue */
out_unlock:
                spin_unlock_irq(q->queue_lock);
        }
out:
        return 0;
}
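
The plug branch above queues the request on a per-task list rather than on the device queue. The submitter opens and closes that window with blk_start_plug/blk_finish_plug, which this kernel generation provides; a minimal usage sketch (submit_batch is a hypothetical helper):

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/fs.h>

static void submit_batch(struct bio **bios, int nr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);                  /* sets current->plug */
        for (i = 0; i < nr; i++)
                submit_bio(READ, bios[i]);      /* requests gather on plug->list */
        blk_finish_plug(&plug);                 /* flushes the list to the queues */
}
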
void __blk_run_queue(struct request_queue *q)
{
        if (unlikely(blk_queue_stopped(q)))
                return;

        q->request_fn(q);
}

Next, taking a SCSI device as the example, let's analyze the q->request_fn function.

static void scsi_request_fn(struct request_queue *q)
{
        struct scsi_device *sdev = q->queuedata;
        struct Scsi_Host *shost;
        struct scsi_cmnd *cmd;
        struct request *req;

        if (!sdev) {
                printk("scsi: killing requests for dead queue\n");
                while ((req = blk_peek_request(q)) != NULL)
                        scsi_kill_request(req, q);
                return;
        }

        if (!get_device(&sdev->sdev_gendev))
                /* We must be tearing the block queue down already */
                return;

        /*
         * To start with, we keep looping until the queue is empty, or until
         * the host is no longer able to accept any more requests.
         */
        shost = sdev->host;
        for (;;) {
                int rtn;
                /*
                 * get next queueable request.  We do this early to make sure
                 * that the request is fully prepared even if we cannot
                 * accept it.
                 */
                req = blk_peek_request(q);      /* fetch the next request */
                if (!req || !scsi_dev_queue_ready(q, sdev))
                        break;

                if (unlikely(!scsi_device_online(sdev))) {      /* device is offline */
                        sdev_printk(KERN_ERR, sdev,
                                    "rejecting I/O to offline device\n");
                        scsi_kill_request(req, q);
                        continue;
                }

                /*
                 * Remove the request from the request list.
                 */
                if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
                        blk_start_request(req);
                sdev->device_busy++;

                spin_unlock(q->queue_lock);
                cmd = req->special;
                if (unlikely(cmd == NULL)) {
                        printk(KERN_CRIT "impossible request in %s.\n"
                                         "please mail a stack trace to "
                                         "linux-scsi@vger.kernel.org\n",
                                         __func__);
                        blk_dump_rq_flags(req, "foo");
                        BUG();
                }
                spin_lock(shost->host_lock);

                /*
                 * We hit this when the driver is using a host wide
                 * tag map. For device level tag maps the queue_depth check
                 * in the device ready fn would prevent us from trying
                 * to allocate a tag. Since the map is a shared host resource
                 * we add the dev to the starved list so it eventually gets
                 * a run when a tag is freed.
                 */
                if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {
                        if (list_empty(&sdev->starved_entry))
                                list_add_tail(&sdev->starved_entry,
                                              &shost->starved_list);
                        goto not_ready;
                }

                /* may we send a command to this target? */
                if (!scsi_target_queue_ready(shost, sdev))
                        goto not_ready;

                /* may we send a command to the host adapter? */
                if (!scsi_host_queue_ready(q, shost, sdev))
                        goto not_ready;

                scsi_target(sdev)->target_busy++;
                shost->host_busy++;

                /*
                 * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
                 *              take the lock again.
                 */
                spin_unlock_irq(shost->host_lock);

                /*
                 * Finally, initialize any error handling parameters, and set up
                 * the timers for timeouts.
                 */
                scsi_init_cmd_errh(cmd);

                /*
                 * Dispatch the command to the low-level driver.
                 */
                rtn = scsi_dispatch_cmd(cmd);
                spin_lock_irq(q->queue_lock);
                if (rtn)
                        goto out_delay;
        }

        goto out;

 not_ready:
        spin_unlock_irq(shost->host_lock);

        /*
         * lock q, handle tag, requeue req, and decrement device_busy. We
         * must return with queue_lock held.
         *
         * Decrementing device_busy without checking it is OK, as all such
         * cases (host limits or settings) should run the queue at some
         * later time.
         */
        spin_lock_irq(q->queue_lock);
        blk_requeue_request(q, req);
        sdev->device_busy--;
out_delay:
        if (sdev->device_busy == 0)
                blk_delay_queue(q, SCSI_QUEUE_DELAY);
out:
        /* must be careful here...if we trigger the ->remove() function
         * we cannot be holding the q lock */
        spin_unlock_irq(q->queue_lock);
        put_device(&sdev->sdev_gendev);
        spin_lock_irq(q->queue_lock);
}

blk_peek_request mainly calls __elv_next_request and q->prep_rq_fn(q, rq). For SCSI disks, prep_rq_fn is assigned sd_prep_fn, whose main job is to construct a SCSI command for the request.
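
As a rough illustration of the prep_rq_fn contract (registered with blk_queue_prep_rq), here is a hedged skeleton in the style of sd_prep_fn; the real function also allocates the command and handles many more cases, and my_prep_fn plus the CDB setup below are simplifications:

#include <linux/blkdev.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <asm/unaligned.h>

static int my_prep_fn(struct request_queue *q, struct request *rq)
{
        struct scsi_cmnd *cmd = rq->special;   /* assumed already allocated */

        if (!cmd)
                return BLKPREP_DEFER;          /* retry the request later */

        /* translate the block request into a 10-byte READ/WRITE CDB */
        cmd->cmnd[0] = rq_data_dir(rq) == WRITE ? WRITE_10 : READ_10;
        put_unaligned_be32((u32)blk_rq_pos(rq), &cmd->cmnd[2]);      /* start LBA */
        put_unaligned_be16((u16)blk_rq_sectors(rq), &cmd->cmnd[7]);  /* length */
        cmd->cmd_len = 10;

        return BLKPREP_OK;                     /* ready for dispatch */
}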

int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
{
        struct Scsi_Host *host = cmd->device->host;
        unsigned long timeout;
        int rtn = 0;

        atomic_inc(&cmd->device->iorequest_cnt);

        /* check if the device is still usable */
        if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
                /* in SDEV_DEL we error all commands. DID_NO_CONNECT
                 * returns an immediate error upwards, and signals
                 * that the device is no longer present */
                cmd->result = DID_NO_CONNECT << 16;
                scsi_done(cmd);
                /* return 0 (because the command has been processed) */
                goto out;
        }

        /* Check to see if the scsi lld made this device blocked. */
        if (unlikely(scsi_device_blocked(cmd->device))) {
                /*
                 * in blocked state, the command is just put back on
                 * the device queue.  The suspend state has already
                 * blocked the queue so future requests should not
                 * occur until the device transitions out of the
                 * suspend state.
                 */
                scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);

                SCSI_LOG_MLQUEUE(3, printk("queuecommand : device blocked \n"));

                /*
                 * NOTE: rtn is still zero here because we don't need the
                 * queue to be plugged on return (it's already stopped)
                 */
                goto out;
        }

        /*
         * If SCSI-2 or lower, store the LUN value in cmnd.
         */
        if (cmd->device->scsi_level <= SCSI_2 &&
            cmd->device->scsi_level != SCSI_UNKNOWN) {
                cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) |
                               (cmd->device->lun << 5 & 0xe0);
        }

        /*
         * We will wait MIN_RESET_DELAY clock ticks after the last reset so
         * we can avoid the drive not being ready.
         */
        timeout = host->last_reset + MIN_RESET_DELAY;

        if (host->resetting && time_before(jiffies, timeout)) {
                int ticks_remaining = timeout - jiffies;
                /*
                 * NOTE: This may be executed from within an interrupt
                 * handler!  This is bad, but for now, it'll do.  The irq
                 * level of the interrupt handler has been masked out by the
                 * platform dependent interrupt handling code already, so the
                 * sti() here will not cause another call to the SCSI host's
                 * interrupt handler (assuming there is one irq-level per
                 * host).
                 */
                while (--ticks_remaining >= 0)
                        mdelay(1 + 999 / HZ);
                host->resetting = 0;
        }

        scsi_log_send(cmd);

        /*
         * Before we queue this command, check if the command
         * length exceeds what the host adapter can handle.
         */
        if (cmd->cmd_len > cmd->device->host->max_cmd_len) {
                SCSI_LOG_MLQUEUE(3,
                        printk("queuecommand : command too long. "
                               "cdb_size=%d host->max_cmd_len=%d\n",
                               cmd->cmd_len, cmd->device->host->max_cmd_len));
                cmd->result = (DID_ABORT << 16);

                scsi_done(cmd);
                goto out;
        }

        if (unlikely(host->shost_state == SHOST_DEL)) {
                cmd->result = (DID_NO_CONNECT << 16);
                scsi_done(cmd);
        } else {
                trace_scsi_dispatch_cmd_start(cmd);
                cmd->scsi_done = scsi_done;
                /* invoke the host adapter's queuecommand callback */
                rtn = host->hostt->queuecommand(host, cmd);
        }

        if (rtn) {
                trace_scsi_dispatch_cmd_error(cmd, rtn);
                if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
                    rtn != SCSI_MLQUEUE_TARGET_BUSY)
                        rtn = SCSI_MLQUEUE_HOST_BUSY;

                scsi_queue_insert(cmd, rtn);

                SCSI_LOG_MLQUEUE(3,
                                 printk("queuecommand : request rejected\n"));
        }

 out:
        SCSI_LOG_MLQUEUE(3, printk("leaving scsi_dispatch_cmnd()\n"));
        return rtn;
}
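
On the other side of that call sits the low-level driver's queuecommand, whose lock-free two-argument signature this kernel generation uses (as the host->hostt->queuecommand(host, cmd) call above shows). A hedged skeleton of a host template around it; all my_* names and the hardware helpers are hypothetical:

#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_host.h>

static bool my_hw_busy(struct Scsi_Host *host) { return false; }         /* stub */
static void my_hw_send(struct Scsi_Host *host, struct scsi_cmnd *cmd) {} /* stub */

static int my_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
{
        if (my_hw_busy(host))
                return SCSI_MLQUEUE_HOST_BUSY;   /* midlayer requeues the command */

        my_hw_send(host, cmd);   /* start the command on the adapter */
        return 0;                /* queued; completion arrives via interrupt */
}

/* Later, from the adapter's interrupt handler: */
static void my_irq_complete(struct scsi_cmnd *cmd)
{
        cmd->result = DID_OK << 16;
        cmd->scsi_done(cmd);     /* the scsi_done installed by scsi_dispatch_cmd */
}

static struct scsi_host_template my_template = {
        .name         = "my_hba",
        .queuecommand = my_queuecommand,
        .this_id      = -1,
};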
