Linux native AIO与eventfd、epoll的结合使用

来源:互联网 发布:gbdt python实现 编辑:程序博客网 时间:2024/05/29 06:34

 

在前面的示例libaio_test.c和native_aio_test.c中,可以看到对磁盘aio请求(本文的aio都指此类)的使用有阻塞等待,这明显之处为对io_getevents()函数(当然,其它函数,比如io_submit()也有一定程度的阻塞)的调用,它会等待并获取已完成的io请求,如果当前没有或少于指定数目的io请求完成,那么就会等待直到timeout。

io_getevents()函数的等待会导致整个进程的阻塞使得程序无法继续向下执行,如果程序还有其它阻塞点,那么有必要想办法把这多处等待合而为一同时进行,从而提高并行性,也就是通常所说的select/epoll等这类多路复用技术。

本文就以epoll为例,介绍一下在linux下,如何把aio结合并应用到epoll机制里。我们知道,epoll机制的最大好处就是它能够在同一时刻对多个文件描述符(通常是由众多套接字形成的描述符集合)进行监听,并将其上发生的读/写(或错误等)事件通知给应用程序,也就是做到时间上的复用。如果能够把aio也放到epoll机制里,即把aio当作epoll机制里的“一路io”,那么就能使得aio与其它可能的等待操作(比如:读/写套接字)共同工作,从而达到时间复用的目的。

作为epoll机制里的“一路io”,需要一个文件描述符来反馈对应的发生事件,而对于纯aio而言,是没有文件描述符作为代表的,因此linux系统上多出了一个eventfd()的系统调用:

1
2
3
#include <sys/eventfd.h>
  
int eventfd(unsigned int initval, int flags);

当然,这个系统调用是否就是因此原因才出现,我不得而知(也没去细查),但要把aio应用到epoll机制里,的确少不了它。从man手册http://man7.org/linux/man-pages/man2/eventfd.2.html可以看到,eventfd()函数的作用是提供一种让内核通知应用程序有事件发生的机制。根据给定参数的不同,对eventfd进行read()的语义也有所不同,看本文aio应用的场景情况:

1
int efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);

对该描述符efd进行read(),如果读取成功,那么将返回8-byte的整型数据,而该数据也就是表示已经完成的aio请求个数。

充当中间桥梁的eventfd有了,并且eventfd()函数返回的描述符可以添加到epoll机制内,因此剩下需要做的就是把eventfd与aio联系起来,而目前aio当然已经有了这个支持,不过,由于native aio的相关结构体有两套封装,即一种是libaio的封装,一种是内核的直接封装(便于直接使用aio),比如iocb:
libaio的封装(来自:/usr/include/libaio.h):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
structio_iocb_common {
    PADDEDptr(void *buf, __pad1);
    PADDEDul(nbytes, __pad2);
    longlong   offset;
    longlong   __pad3;
    unsigned    flags;
    unsigned    resfd;
};  /* result code is the amount read or -'ve errno */
  
/* Request described by an iovec pointer (vec), a count (nr) and an offset. */
struct io_iocb_vector {
    const struct iovec  *vec;
    int                 nr;
    long long           offset;
};  /* result code is the amount read or -'ve errno */
  
structiocb {
    PADDEDptr(void*data, __pad1);  /* Return in the io completion event */
    PADDED(unsigned key, __pad2);  /* For use in identifying io requests */
  
    short      aio_lio_opcode;
    short      aio_reqprio;
    int    aio_fildes;
  
    union{
        structio_iocb_common       c;
        structio_iocb_vector       v;
        structio_iocb_poll     poll;
        structio_iocb_sockaddr saddr;
    } u;
};

内核的封装(来自:/usr/include/linux/aio_abi.h或/usr/src/linux-2.6.38.8/include/linux/aio_abi.h):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/*
 * we always use a 64bit off_t when communicating
 * with userland.  its up to libraries to do the
 * proper padding and aio_error abstraction
 */
  
structiocb {
    /* these are internal to the kernel/libc. */
    __u64   aio_data;  /* data to be returned in event's data */
    __u32   PADDED(aio_key, aio_reserved1);
                /* the kernel sets aio_key to the req # */
  
    /* common fields */
    __u16   aio_lio_opcode;/* see IOCB_CMD_ above */
    __s16   aio_reqprio;
    __u32   aio_fildes;
  
    __u64   aio_buf;
    __u64   aio_nbytes;
    __s64   aio_offset;
  
    /* extra parameters */
    __u64   aio_reserved2; /* TODO: use this for a (struct sigevent *) */
  
    /* flags for the "struct iocb" */
    __u32   aio_flags;
  
    /*
     * if the IOCB_FLAG_RESFD flag of "aio_flags" is set, this is an
     * eventfd to signal <SPAN class=wp_keywordlink_affiliate><A title="查看 AIO 中的全部文章" href="http://lenky.info/tag/aio/">AIO</A></SPAN> readiness to
     */
    __u32   aio_resfd;
}; /* 64 bytes */

两个结构体是等价的,只是字段名称有所不同而已,此处仅看内核封装的情况(后续将提到nginx对aio的使用实现,而nginx采用的就是syscall手动封装),有一段很明显的英文注释出卖了aio对eventfd的使用支持,即两个字段:aio_flags与aio_resfd,详细来说就是将aio_flags打上IOCB_FLAG_RESFD标记并且将eventfd()函数返回的描述符设置到aio_resfd即可。

废话少说,看两个示例,第一个来自:http://blog.sina.com.cn/s/blog_6b19f21d0100znza.html

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#define _GNU_SOURCE
#define __STDC_FORMAT_MACROS
  
#include <stdio.h>
#include <errno.h>
#include <libaio.h>
#include <sys/eventfd.h>
#include <sys/epoll.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdint.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <inttypes.h>
  
/* Scratch file created by the demo and removed again when it finishes. */
#define TEST_FILE   "aio_test_file"
/*
 * NUM_EVENTS requests of RD_WR_SIZE bytes each are issued, so the file has
 * to be 128 KiB; with 127 KiB the last request would start at end-of-file.
 */
#define TEST_FILE_SIZE  (128 * 1024)
/* Number of asynchronous requests submitted in one batch. */
#define NUM_EVENTS  128
/* Alignment of the I/O buffer, as needed for O_DIRECT access. */
#define ALIGN_SIZE  512
/* Bytes transferred by each request. */
#define RD_WR_SIZE  1024
  
structcustom_iocb
{
    structiocb iocb;
    intnth_request;
};
  
/*
 * Completion callback: prints one line describing the finished request.
 *
 * ctx   AIO context the request belonged to (unused here).
 * iocb  the completed control block; it is the first member of a
 *       struct custom_iocb, which is how nth_request is recovered.
 * res   first result value from the completion event.
 * res2  second result value from the completion event.
 */
void aio_callback(io_context_t ctx, struct iocb *iocb, long res, long res2)
{
    struct custom_iocb *iocbp = (struct custom_iocb *)iocb;

    (void)ctx;
    printf("nth_request: %d, request_type: %s, offset: %lld, length: %lu, res: %ld, res2: %ld\n",
            iocbp->nth_request, (iocb->aio_lio_opcode == IO_CMD_PREAD) ? "READ" : "WRITE",
            iocb->u.c.offset, iocb->u.c.nbytes, res, res2);
}
  
int main(int argc,char *argv[])
{
    intefd, fd, epfd;
    io_context_t ctx;
    structtimespec tms;
    structio_event events[NUM_EVENTS];
    structcustom_iocb iocbs[NUM_EVENTS];
    structiocb *iocbps[NUM_EVENTS];
    structcustom_iocb *iocbp;
    inti, j, r;
    void*buf;
    structepoll_event epevent;
  
    efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
    if(efd == -1) {
        perror("eventfd");
        return2;
    }
  
    fd = open(TEST_FILE, O_RDWR | O_CREAT | O_DIRECT, 0644);
    if(fd == -1) {
        perror("open");
        return3;
    }
    ftruncate(fd, TEST_FILE_SIZE);
  
    ctx = 0;
    if(io_setup(8192, &ctx)) {
        perror("io_setup");
        return4;
    }
  
    if(posix_memalign(&buf, ALIGN_SIZE, RD_WR_SIZE)) {
        perror("posix_memalign");
        return5;
    }
    printf("buf: %p\n", buf);
  
    for(i = 0, iocbp = iocbs; i < NUM_EVENTS; ++i, ++iocbp) {
        iocbps[i] = &iocbp->iocb;
        io_prep_pread(&iocbp->iocb, fd, buf, RD_WR_SIZE, i * RD_WR_SIZE);
        io_set_eventfd(&iocbp->iocb, efd);
        io_set_callback(&iocbp->iocb, aio_callback);
        iocbp->nth_request = i + 1;
    }
  
    if(io_submit(ctx, NUM_EVENTS, iocbps) != NUM_EVENTS) {
        perror("io_submit");
        return6;
    }
  
    epfd = epoll_create(1);
    if(epfd == -1) {
        perror("epoll_create");
        return7;
    }
  
    epevent.events = EPOLLIN | EPOLLET;
    epevent.data.ptr = NULL;
    if(epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &epevent)) {
        perror("epoll_ctl");
        return8;
    }
  
    i = 0;
    while(i < NUM_EVENTS) {
        uint64_t finished_aio;
  
        if(epoll_wait(epfd, &epevent, 1, -1) != 1) {
            perror("epoll_wait");
            return9;
        }
  
        if(read(efd, &finished_aio, sizeof(finished_aio)) !=sizeof(finished_aio)) {
            perror("read");
            return10;
        }
  
        printf("finished io number: %"PRIu64"\n", finished_aio);
  
        while(finished_aio > 0) {
            tms.tv_sec = 0;
            tms.tv_nsec = 0;
            r = io_getevents(ctx, 1, NUM_EVENTS, events, &tms);
            if(r > 0) {
                for(j = 0; j < r; ++j) {
                    ((io_callback_t)(events[j].data))(ctx, events[j].obj, events[j].res, events[j].res2);
                }
                i += r;
                finished_aio -= r;
            }
        }
    }
  
    close(epfd);
    free(buf);
    io_destroy(ctx);
    close(fd);
    close(efd);
    remove(TEST_FILE);
  
    return0;
}

编译执行,OK无误(特别注意:上面示例代码仅只是演示aio+eventfd+epoll的使用,而细节部分是有严重bug的,比如所有请求共用一个缓存区buf):

1
2
[root@www 1]# gcc t.c -laio
[root@www 1]# ./a.out

上面示例采用了libaio库,试试syscall简单封装(由上面示例修改而来):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
/**
 * gcc aio_eventfd_epoll.c -o aio_eventfd_epoll
 * ref:http://blog.sina.com.cn/s/blog_6b19f21d0100znza.html
 * modified by:http://lenky.info/
 */
#define _GNU_SOURCE
#define __STDC_FORMAT_MACROS
  
#include <sys/epoll.h>
#include <errno.h>            /* errno, EINTR */
#include <stdio.h>            /* for perror() */
#include <unistd.h>           /* for syscall() */
#include <sys/syscall.h>      /* for __NR_* definitions */
#include <linux/aio_abi.h>    /* for AIO types and constants */
#include <fcntl.h>            /* O_RDWR */
#include <string.h>           /* memset() */
#include <inttypes.h>         /* uint64_t */
#include <stdlib.h>
  
/* Scratch file opened by main() and removed again before it returns. */
#define TEST_FILE   "aio_test_file"
/* Length the scratch file is truncated to: NUM_EVENTS * RD_WR_SIZE bytes. */
#define TEST_FILE_SIZE  (128 * 1024)
/* Number of iocbs submitted, and the capacity of the completion event array. */
#define NUM_EVENTS  128
/* Alignment passed to posix_memalign() for the I/O buffer. */
#define ALIGN_SIZE  512
/* Bytes read by each request; also the stride between request offsets. */
#define RD_WR_SIZE  1024
  
/*
 * Direct wrapper for the io_setup system call.
 * Returns what syscall() returns, narrowed to int.
 * "static" is required: a plain C99 "inline" function emits no external
 * definition, so a non-inlined call would fail to link.
 */
static inline int io_setup(unsigned nr, aio_context_t *ctxp)
{
    return syscall(__NR_io_setup, nr, ctxp);
}
  
/*
 * Direct wrapper for the io_submit system call: hands nr iocb pointers from
 * iocbpp to the context ctx. Returns what syscall() returns, narrowed to int.
 * "static" keeps the definition usable when the compiler does not inline it.
 */
static inline int io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{
    return syscall(__NR_io_submit, ctx, nr, iocbpp);
}
  
/*
 * Direct wrapper for the io_getevents system call: passes the min_nr/max_nr
 * bounds, the events array and the timeout straight through.
 * Returns what syscall() returns, narrowed to int.
 * "static" keeps the definition usable when the compiler does not inline it.
 */
static inline int io_getevents(aio_context_t ctx, long min_nr, long max_nr,
        struct io_event *events, struct timespec *timeout)
{
    return syscall(__NR_io_getevents, ctx, min_nr, max_nr, events, timeout);
}
  
/*
 * Direct wrapper for the io_destroy system call on context ctx.
 * Returns what syscall() returns, narrowed to int.
 * "static" keeps the definition usable when the compiler does not inline it.
 */
static inline int io_destroy(aio_context_t ctx)
{
    return syscall(__NR_io_destroy, ctx);
}
  
/*
 * Direct wrapper for the eventfd2 system call.
 * Returns what syscall() returns, narrowed to int.
 * "static" keeps the definition usable when the compiler does not inline it.
 */
static inline int eventfd2(unsigned int initval, int flags)
{
    return syscall(__NR_eventfd2, initval, flags);
}
  
/*
 * An iocb extended with a sequence number. The iocb must stay the first
 * member: aio_callback() casts the struct iocb pointer it receives back
 * to a struct custom_iocb pointer.
 */
struct custom_iocb
{
    struct iocb iocb;
    int nth_request;
};
  
/* Function type of a completion callback; its address is carried in iocb.aio_data. */
typedef void io_callback_t(aio_context_t ctx, struct iocb *iocb, long res, long res2);
  
voidaio_callback(aio_context_t ctx, structiocb *iocb, longres, long res2)
{
    structcustom_iocb *iocbp = (structcustom_iocb *)iocb;
    printf("nth_request: %d, request_type: %s, offset: %lld, length: %lu, res: %ld, res2: %ld\n",
            iocbp->nth_request, (iocb->aio_lio_opcode == IOCB_CMD_PREAD) ?"READ" : "WRITE",
            iocb->aio_offset, iocb->aio_nbytes, res, res2);
}
  
int main(int argc,char *argv[])
{
    intefd, fd, epfd;
    aio_context_t ctx;
    structtimespec tms;
    structio_event events[NUM_EVENTS];
    structcustom_iocb iocbs[NUM_EVENTS];
    structiocb *iocbps[NUM_EVENTS];
    structcustom_iocb *iocbp;
    inti, j, r;
    void*buf;
    void*aio_buf;
    structepoll_event epevent;
  
    efd = eventfd2(0, O_NONBLOCK | O_CLOEXEC);
    if(efd == -1) {
        perror("eventfd2");
        return2;
    }
  
    fd = open(TEST_FILE, O_RDWR | O_CREAT | O_DIRECT, 0644);
    if(fd == -1) {
        perror("open");
        return3;
    }
    ftruncate(fd, TEST_FILE_SIZE);
  
    ctx = 0;
    if(io_setup(NUM_EVENTS, &ctx)) {
        perror("io_setup");
        return4;
    }
  
    if(posix_memalign(&buf, ALIGN_SIZE, RD_WR_SIZE * NUM_EVENTS)) {
        perror("posix_memalign");
        return5;
    }
    printf("buf: %p\n", buf);
  
    for(i = 0, iocbp = iocbs; i < NUM_EVENTS; ++i, ++iocbp) {
        aio_buf = (void*)((char *)buf + (i*RD_WR_SIZE));
        memset(aio_buf, 0, RD_WR_SIZE);
  
        //io_prep_pread(&iocbp->iocb, fd, buf, RD_WR_SIZE, i * RD_WR_SIZE);
        iocbp->iocb.aio_fildes = fd;
        iocbp->iocb.aio_lio_opcode = IOCB_CMD_PREAD;
        iocbp->iocb.aio_buf = (uint64_t)aio_buf;
        iocbp->iocb.aio_offset = i * RD_WR_SIZE;
        iocbp->iocb.aio_nbytes = RD_WR_SIZE;
  
        //io_set_eventfd(&iocbp->iocb, efd);
        iocbp->iocb.aio_flags = IOCB_FLAG_RESFD;
        iocbp->iocb.aio_resfd = efd;
  
        //io_set_callback(&iocbp->iocb, aio_callback);
        iocbp->iocb.aio_data = (__u64)aio_callback;
  
        iocbp->nth_request = i + 1;
        iocbps[i] = &iocbp->iocb;
    }
  
    if(io_submit(ctx, NUM_EVENTS, iocbps) != NUM_EVENTS) {
        perror("io_submit");
        return6;
    }
  
    epfd = epoll_create(1);
    if(epfd == -1) {
        perror("epoll_create");
        return7;
    }
  
    epevent.events = EPOLLIN | EPOLLET;
    epevent.data.ptr = NULL;
    if(epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &epevent)) {
        perror("epoll_ctl");
        return8;
    }
  
    i = 0;
    while(i < NUM_EVENTS) {
        uint64_t finished_aio;
  
        if(epoll_wait(epfd, &epevent, 1, -1) != 1) {
            perror("epoll_wait");
            return9;
        }
  
        if(read(efd, &finished_aio, sizeof(finished_aio)) !=sizeof(finished_aio)) {
            perror("read");
            return10;
        }
  
        printf("finished io number: %"PRIu64"\n", finished_aio);
  
        while(finished_aio > 0) {
            tms.tv_sec = 0;
            tms.tv_nsec = 0;
            r = io_getevents(ctx, 1, NUM_EVENTS, events, &tms);
            if(r > 0) {
                for(j = 0; j < r; ++j) {
                    ((io_callback_t *)(events[j].data))(ctx, (structiocb *)events[j].obj, events[j].res, events[j].res2);
                }
                i += r;
                finished_aio -= r;
            }
        }
    }
  
    close(epfd);
    free(buf);
    io_destroy(ctx);
    close(fd);
    close(efd);
    remove(TEST_FILE);
  
    return0;
}

转载请保留地址:http://lenky.info/2013/01/09/linux-native-aio%e4%b8%8eeventfd%e3%80%81epoll%e7%9a%84%e7%bb%93%e5%90%88%e4%bd%bf%e7%94%a8/ 或 http://lenky.info/?p=2183