真实的I/O调度层处理

来源:互联网 发布:java qq停止服务 编辑:程序博客网 时间:2024/05/21 17:34

1.5.6 真实的I/O调度层处理

现在我们块设备也有了,队列也有了,要提交请求也就可以开始提交了。那就让我们回到generic_make_request来研究一下如何提交请求如何处理请求吧。我们看到,函数最后调用q->make_request_fn(q, bio)。对 make_request_fn 函数的调用可以认为是 IO调度层的入口,该函数用于向请求队列中添加请求。该函数是在创建请求队列时指定的,代码如下(blk_init_queue 函数中):

q->request_fn = rfn;

blk_queue_make_request(q, __make_request);

 

前面看到函数 blk_queue_make_request 将函数 __make_request 的地址赋予了请求队列 q 的 make_request_fn 成员:

 

void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)

{

       q->nr_requests = BLKDEV_MAX_RQ;

       q->make_request_fn = mfn;

……

 

那么,__make_request函数才是IO调度层的真实入口,来自block/ll_rw_blk.c

 

2846static int __make_request(request_queue_t *q, struct bio *bio)

2847{

2848        struct request *req;

2849        int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync;

2850        unsigned short prio;

2851        sector_t sector;

2852

2853        sector = bio->bi_sector;

2854        nr_sectors = bio_sectors(bio);

2855        cur_nr_sectors = bio_cur_sectors(bio);

2856        prio = bio_prio(bio);

2857

2858        rw = bio_data_dir(bio);

2859        sync = bio_sync(bio);

2860

2861        /*

2862         * low level driver can indicate that it wants pages above a

2863         * certain limit bounced to low memory (ie for highmem, or even

2864         * ISA dma in theory)

2865         */

2866        blk_queue_bounce(q, &bio);

2867

2868        spin_lock_prefetch(q->queue_lock);

2869

2870        barrier = bio_barrier(bio);

2871        if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {

2872                err = -EOPNOTSUPP;

2873                goto end_io;

2874        }

2875

2876        spin_lock_irq(q->queue_lock);

2877

2878        if (unlikely(barrier) || elv_queue_empty(q))

2879                goto get_rq;

2880

2881        el_ret = elv_merge(q, &req, bio);

2882        switch (el_ret) {

2883                case ELEVATOR_BACK_MERGE:

2884                        BUG_ON(!rq_mergeable(req));

2885

2886                        if (!q->back_merge_fn(q, req, bio))

2887                                break;

2888

2889                        blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);

2890

2891                        req->biotail->bi_next = bio;

2892                        req->biotail = bio;

2893                        req->nr_sectors = req->hard_nr_sectors += nr_sectors;

2894                        req->ioprio = ioprio_best(req->ioprio, prio);

2895                        drive_stat_acct(req, nr_sectors, 0);

2896                        if (!attempt_back_merge(q, req))

2897                                elv_merged_request(q, req);

2898                        goto out;

2899

2900                case ELEVATOR_FRONT_MERGE:

2901                        BUG_ON(!rq_mergeable(req));

2902

2903                        if (!q->front_merge_fn(q, req, bio))

2904                                break;

2905

2906                        blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);

2907

2908                        bio->bi_next = req->bio;

2909                        req->bio = bio;

2910

2911                        /*

2912                         * may not be valid. if the low level driver said

2913                         * it didn't need a bounce buffer then it better

2914                         * not touch req->buffer either...

2915                         */

2916                        req->buffer = bio_data(bio);

2917                        req->current_nr_sectors = cur_nr_sectors;

2918                        req->hard_cur_sectors = cur_nr_sectors;

2919                        req->sector = req->hard_sector = sector;

2920                        req->nr_sectors = req->hard_nr_sectors += nr_sectors;

2921                        req->ioprio = ioprio_best(req->ioprio, prio);

2922                        drive_stat_acct(req, nr_sectors, 0);

2923                        if (!attempt_front_merge(q, req))

2924                                elv_merged_request(q, req);

2925                        goto out;

2926

2927                /* ELV_NO_MERGE: elevator says don't/can't merge. */

2928                default:

2929                        ;

2930        }

2931

2932get_rq:

2933        /*

2934         * Grab a free request. This is might sleep but can not fail.

2935         * Returns with the queue unlocked.

2936         */

2937        req = get_request_wait(q, rw, bio);

2938

2939        /*

2940         * After dropping the lock and possibly sleeping here, our request

2941         * may now be mergeable after it had proven unmergeable (above).

2942         * We don't worry about that case for efficiency. It won't happen

2943         * often, and the elevators are able to handle it.

2944         */

2945        init_request_from_bio(req, bio);

2946

2947        spin_lock_irq(q->queue_lock);

2948        if (elv_queue_empty(q))

2949                blk_plug_device(q);

2950        add_request(q, req);

2951out:

2952        if (sync)

2953                __generic_unplug_device(q);

2954

2955        spin_unlock_irq(q->queue_lock);

2956        return 0;

2957

2958end_io:

2959        bio_endio(bio, nr_sectors << 9, err);

2960        return 0;

2961}

 

__make_request 函数比较复杂,它接收request_queue类型的描述符q和一个bio结构的描述符bio作为其参数,然后执行如下操作:

 

2853行,内部变量sector被赋值为bio的bi_sector字段,即将传送的bio的第一个扇区。2854行通过bio_sectors(bio)宏得到需要传送多少个连续的扇区,并赋值给内部变量nr_sectors。bio_sectors宏来自include/linux/bio.h:

#define bio_sectors(bio) ((bio)->bi_size >> 9)

 

bio->bi_size我们知道,在前面如果一个页中的4个块内容连续,则do_mpage_readpage通过调用bio_add_page把4个块的大小4096赋值给bio的bi_size字段,那么nr_sectors就是4096>>9,等于8,表示这个bio一共有8个扇区待传输。

 

2866行,如果需要,调用blk_queue_bounce()函数建立一个回弹缓冲区。如果回弹缓冲区被建立,__make_request()函数将对该缓冲区而不是原先的bio结构进行操作。关于回弹缓冲区的相关知识,请查阅相关资料,这里我们不做过多的介绍。

 

2878行,调用I/O调度程序的elv_queue_empty()函数检查请求队列中是否存在待处理请求——注意,调度队列可能是空的,但是I/O调度程序的其他队列可能包含待处理请求。如果没有待处理请求(或者这个bio是一个屏障请求),就直接跳转到2932行的get_rq标号处;此时并不在这里调用blk_plug_device(),对请求队列的“插入”(plug)要等到后面的2948-2949行才进行。

 

如果插入的请求队列包含待处理请求,则走到2881行调用I/O调度程序的elv_merge()函数检查新的bio结构是否可以并入已存在的请求中。

 

该函数将返回三个可能值:

1. ELEVATOR_NO_MERGE:已经存在的请求中不能包含bio结构;这种情况下,跳转到2932行的get_rq标号处。

2. ELEVATOR_BACK_MERGE:bio结构可作为末尾的bio而插入到某个请求req中;这种情形下,函数调用q->back_merge_fn方法检查是否可以扩展该请求。如果不行,则跳转到2932行的get_rq标号处。否则,将bio描述符插入req链表的末尾并更新req的相应字段值。然后,函数试图将该请求与其后面的请求合并(新的bio可能填充在两个请求之间)。

3. ELEVATOR_FRONT_MERGE:bio结构可作为某个请求req的第一个bio被插入;这种情形下,函数调用q->front_merge_fn方法检查是否可以扩展该请求。如果不行,则跳转到2932行的get_rq标号处。否则,将bio描述符插入req链表的首部并更新req的相应字段值。然后,试图将该请求与其前面的请求合并。

 

不管是ELEVATOR_BACK_MERGE还是ELEVATOR_FRONT_MERGE,说明bio已经被并入存在的请求中,跳转到2951行的out标号处终止函数。

 

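下面来看2932行的get_rq标号处,bio必须被插入到一个新的请求中。那么通过get_request_wait给我们这个分区的请求队列q分配一个新的请求描述符request。如果没有空闲的内存,get_request_wait函数将调用io_schedule挂起当前进程,除非设置了bio->bi_rw中的BIO_RW_AHEAD标志——该标志表明这个I/O操作是一次预读;在这种情形下,调用bio_endio()并终止:此时将不会执行数据传送。(需要注意的是,从上面列出的这一版代码来看,2934行的注释说明get_request_wait“可能睡眠但不会失败”,__make_request中也看不到针对预读的这一分支,所以这段关于预读的描述似乎对应的是其他版本的实现。)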

 

然后调用2945行的init_request_from_bio(req, bio)初始化请求描述符中的字段。主要有:

a) 根据bio描述符的内容初始化各个字段,包括扇区数、当前bio以及当前段。

b) 设置flags字段中的REQ_CMD标志(说明这次request是一个标准的读或写操作)。

c) 如果第一个bio段的页框存放在低端内存,则将buffer字段设置为缓冲区的线性地址。

d) 将rq_disk字段设置为bio->bi_bdev->bd_disk的地址。

e) 将bio插入请求链表。

f) 将start_time字段设置为jiffies的值。

 

回到__make_request的2948-2949行,再次调用elv_queue_empty检查一下请求队列中是否存在待处理请求。如果没有待处理请求,那么调用blk_plug_device()函数插入请求队列。不管怎样,都会执行2950行的add_request函数:

 

static inline void add_request(request_queue_t * q, struct request * req)

{

       drive_stat_acct(req, req->nr_sectors, 1);

 

       if (q->activity_fn)

              q->activity_fn(q->activity_data, rq_data_dir(req));

 

       /*

        * elevator indicated where it wants this request to be

        * inserted at elevator_merge time

        */

       __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);

}

 

add_request函数本质上调用__elv_add_request函数通过电梯算法把这个新的request插入到对应request_queue合适的位置。在介绍__elv_add_request函数之前,我们先介绍几个宏,来自include/linux/elevator.h:

 

    155 /*

    156  * Insertion selection

    157  */

    158 #define ELEVATOR_INSERT_FRONT   1

    159 #define ELEVATOR_INSERT_BACK    2

    160 #define ELEVATOR_INSERT_SORT    3

    161 #define ELEVATOR_INSERT_REQUEUE 4

 

很明显,在add_request函数中传递进来的是ELEVATOR_INSERT_SORT,表示按排序方式插入(即交给电梯算法决定插入位置),而不是从前面插入(从前面插入对应的是ELEVATOR_INSERT_FRONT)。那么带着这个where我们进入下一个函数,即__elv_add_request。来自block/elevator.c:

 

    646 void __elv_add_request(request_queue_t *q, struct request *rq, int where,

    647                        int plug)

    648 {

    649         if (q->ordcolor)

    650                 rq->cmd_flags |= REQ_ORDERED_COLOR;

    651

    652         if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {

    653                 /*

    654                  * toggle ordered color

    655                  */

    656                 if (blk_barrier_rq(rq))

    657                         q->ordcolor ^= 1;

    658

    659                 /*

    660                  * barriers implicitly indicate back insertion

    661                  */

    662                 if (where == ELEVATOR_INSERT_SORT)

    663                         where = ELEVATOR_INSERT_BACK;

    664

    665                 /*

    666                  * this request is scheduling boundary, update

    667                  * end_sector

    668                  */

    669                 if (blk_fs_request(rq)) {

    670                         q->end_sector = rq_end_sector(rq);

    671                         q->boundary_rq = rq;

    672                 }

    673   } else if (!(rq->cmd_flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)

    674                 where = ELEVATOR_INSERT_BACK;

    675

    676         if (plug)

    677                 blk_plug_device(q);

    678

    679         elv_insert(q, rq, where);

    680 }

 

传入的参数plug等于0,所以blk_plug_device()不会被执行。很明显,一路走来我们根本没有设置什么REQ_SOFTBARRIER、REQ_HARDBARRIER、REQ_ELVPRIV标识,所以前面都和我们无关,直接跳到最后一行这个elv_insert():

 

    548 void elv_insert(request_queue_t *q, struct request *rq, int where)

    549 {

    550         struct list_head *pos;

    551         unsigned ordseq;

    552         int unplug_it = 1;

    553

    554         blk_add_trace_rq(q, rq, BLK_TA_INSERT);

    555

    556         rq->q = q;

    557

    558         switch (where) {

    559         case ELEVATOR_INSERT_FRONT:

    560                 rq->cmd_flags |= REQ_SOFTBARRIER;

    561

    562                 list_add(&rq->queuelist, &q->queue_head);

    563                 break;

    564

    565         case ELEVATOR_INSERT_BACK:

    566                 rq->cmd_flags |= REQ_SOFTBARRIER;

    567                 elv_drain_elevator(q);

    568                 list_add_tail(&rq->queuelist, &q->queue_head);

    569                 /*

    570                  * We kick the queue here for the following reasons.

    571                  * - The elevator might have returned NULL previously

    572                  *   to delay requests and returned them now.  As the

    573                  *   queue wasn't empty before this request, ll_rw_blk

    574                  *   won't run the queue on return, resulting in hang.

    575                  * - Usually, back inserted requests won't be merged

    576                  *   with anything.  There's no point in delaying queue

    577                  *   processing.

    578                  */

    579                 blk_remove_plug(q);

    580                 q->request_fn(q);

    581                 break;

    582

    583         case ELEVATOR_INSERT_SORT:

    584                 BUG_ON(!blk_fs_request(rq));

    585                 rq->cmd_flags |= REQ_SORTED;

    586                 q->nr_sorted++;

    587                 if (rq_mergeable(rq)) {

    588                         elv_rqhash_add(q, rq);

    589                         if (!q->last_merge)

    590                                 q->last_merge = rq;

    591                 }

    592

    593                 /*

    594                  * Some ioscheds (cfq) run q->request_fn directly, so

    595                  * rq cannot be accessed after calling

    596                  * elevator_add_req_fn.

    597                  */

    598                 q->elevator->ops->elevator_add_req_fn(q, rq);

    599                 break;

    600

    601         case ELEVATOR_INSERT_REQUEUE:

    602                 /*

    603                  * If ordered flush isn't in progress, we do front

    604                  * insertion; otherwise, requests should be requeued

    605                  * in ordseq order.

    606                  */

    607                 rq->cmd_flags |= REQ_SOFTBARRIER;

    608

    609                 /*

    610                  * Most requeues happen because of a busy condition,

    611                  * don't force unplug of the queue for that case.

    612                  */

    613                 unplug_it = 0;

    614

    615                 if (q->ordseq == 0) {

    616                         list_add(&rq->queuelist, &q->queue_head);

    617                         break;

    618                 }

    619

    620                 ordseq = blk_ordered_req_seq(rq);

    621

    622                 list_for_each(pos, &q->queue_head) {

    623                         struct request *pos_rq = list_entry_rq(pos);

    624                         if (ordseq <= blk_ordered_req_seq(pos_rq))

    625                                 break;

    626                 }

    627

    628                 list_add_tail(&rq->queuelist, pos);

    629                 break;

    630

    631         default:

    632                 printk(KERN_ERR "%s: bad insertion point %d/n",

    633                        __FUNCTION__, where);

    634                 BUG();

    635         }

    636

    637         if (unplug_it && blk_queue_plugged(q)) {

    638                 int nrq = q->rq.count[READ] + q->rq.count[WRITE]

    639                         - q->in_flight;

    640

    641                 if (nrq >= q->unplug_thresh)

    642                         __generic_unplug_device(q);

    643         }

    644 }

 

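如果where是ELEVATOR_INSERT_FRONT,也就是从前面插,执行的是562行这个list_add:struct request有一个成员struct list_head queuelist,而struct request_queue有一个成员struct list_head queue_head,所以就是把前者插入到后者所代表的这个队伍中来。不过我们这里传进来的where是ELEVATOR_INSERT_SORT,如果它在__elv_add_request中没有被改成ELEVATOR_INSERT_BACK,那么实际走的是583行开始的分支:给请求打上REQ_SORTED标志,q->nr_sorted加1,可合并的请求还要通过elv_rqhash_add()加入哈希表,最后在598行调用q->elevator->ops->elevator_add_req_fn把请求交给具体的I/O调度算法。然后咱们就返回了。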

 

以上所有操作全部完成后,在终止之前,2952行检查是否设置了bio->bi_rw中的BIO_RW_SYNC标志。如果是,则在2953行对“请求队列”调用__generic_unplug_device()函数拔掉(unplug)该请求队列,并直接调用q->request_fn(q),这个函数是什么,马上会看到。

 

如果在调用__make_request()函数之前请求队列不是空的,那么说明该请求队列要么已经被拔掉过,要么很快将被拔掉——因为每个拥有待处理请求的插入请求队列q都有一个正在运行的动态定时器q->unplug_timer。另一方面,如果请求队列是空的,则__make_request()函数插入请求队列。或迟(最坏的情况是当拔出的定时器到期了)或早(从__make_request()中退出时,如果设置了bio的BIO_RW_SYNC标志),该请求队列都会被拔掉。任何情形下,块设备驱动程序的策略例程最后都将处理调度队列中的请求。

 

generic_make_request执行完scsi磁盘设备对应请求队列的q->make_request_fn方法,也就是刚才分析的__make_request以后,块设备的调度层就结束了。至于包含该bio的request放入到请求队列中后,何时被处理就由 IO 调度器的调度算法决定了。一旦该请求能够被处理,便调用请求队列中request_fn 字段所指向的函数处理。这个成员的初始化也是在创建请求队列时设置的:

 

   1590 struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)

   1591 {

   1592         struct request_queue *q;

   1593

   1594         q = __scsi_alloc_queue(sdev->host, scsi_request_fn);

   1595         if (!q)

   1596                 return NULL;

   1597

   1598         blk_queue_prep_rq(q, scsi_prep_fn);

   1599         blk_queue_issue_flush_fn(q, scsi_issue_flush_fn);

   1600         blk_queue_softirq_done(q, scsi_softirq_done);

   1601         return q;

   1602 }

 

我们看到,给scsi设备创建request_queue的时候,是把scsi_request_fn作为它的request_fn 字段所指向的函数地址,所以这个scsi_request_fn就是scsi底层驱动的入口。

原创粉丝点击