qemu-kvm 线程事件模型【QEMU-KVM代码分析之三】

来源：互联网发布：南京java开发培训机构编辑：程序博客网时间：2024/05/21 06:33

qemu-kvm 线程事件模型
1.主（父）线程。
主线程执行循环，主要做三件事情
1）.执行select操作，查询文件描述符有无读写操作
2）.执行定时器回调函数
3）.执行下半部（BH）回调函数。为什么要采用BH，资料说主要避免可重入性和调用栈溢出。

2.执行客户机代码的线程
只讨论kvm执行客户机代码情况（不考虑TCG，TCG采用动态翻译技术），如果有多个vcpu，就意味着存在多个线程。

3.异步io文件操作线程
提交i/o操作请求到队列中，该线程从队列取请求，并进行处理。

4.主线程与执行客户机代码线程同步
主线程与执行客户机代码线程不能同时运行，主要通过一个全局互斥锁实现。

代码分析

1.主（父）线程。
下面的函数是主线程的主要执行函数：当文件描述符、定时器或下半部触发相应事件后，将执行相应的回调函数。
/*
 * Main-thread loop body (abridged excerpt): wait on the registered file
 * descriptors with select(), call the read/write handlers of the
 * descriptors flagged ready, run the timers kept in
 * active_timers[QEMU_CLOCK_HOST], and finally poll the bottom halves.
 *
 * NOTE(review): the declarations and setup of ret, nfds, rfds, wfds, xfds,
 * tv and ioh are not part of this excerpt, and the timeout parameter is
 * not referenced in the lines shown.
 */
void main_loop_wait(int timeout){
ret = select(nfds + 1, &rfds, &wfds, &xfds, &tv);
if (ret > 0) {
        /* pioh is declared but not used in the lines shown. */
        IOHandlerRecord *pioh;
        QLIST_FOREACH(ioh, &io_handlers, next) {
            /* Read side: record not deleted, handler set, fd flagged in rfds. */
            if (!ioh->deleted && ioh->fd_read && FD_ISSET(ioh->fd, &rfds)) {
                ioh->fd_read(ioh->opaque);
                /* Clear the fd from rfds unless fd_read_poll is set and
                 * returns non-zero. */
                if (!(ioh->fd_read_poll && ioh->fd_read_poll(ioh->opaque)))
                    FD_CLR(ioh->fd, &rfds);
            }
            /* Write side: record not deleted, handler set, fd flagged in wfds. */
            if (!ioh->deleted && ioh->fd_write && FD_ISSET(ioh->fd, &wfds)) {
                ioh->fd_write(ioh->opaque);
            }
        }
    }

qemu_run_timers(&active_timers[QEMU_CLOCK_HOST],
                    qemu_get_clock(host_clock));

   /* Check bottom-halves last in case any of the earlier events triggered
       them. */
    qemu_bh_poll();
}

对于select函数轮询的文件描述符，以及针对该描述符执行的操作函数，主要是通过qemu_set_fd_handler()和qemu_set_fd_handler2()函数添加的。
int qemu_set_fd_handler(int fd,
                        IOHandler *fd_read,
                        IOHandler *fd_write,
                        void *opaque);
int qemu_set_fd_handler2(int fd,
                         IOCanRWHandler *fd_read_poll,
                         IOHandler *fd_read,
                         IOHandler *fd_write,
                         void *opaque)

对于到期执行的定时器函数，回调函数是由qemu_new_timer函数添加的，触发时间是由qemu_mod_timer函数修改的。
QEMUTimer *qemu_new_timer(QEMUClock *clock, QEMUTimerCB *cb, void *opaque)
void qemu_mod_timer(QEMUTimer *ts, int64_t expire_time)

下半部回调函数的添加与调度分别由qemu_bh_new和qemu_bh_schedule完成。
QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque)
void qemu_bh_schedule(QEMUBH *bh)

2.执行客户机代码的线程
当初始化客户机硬件时，对于每个cpu创建一个线程，每个线程执行ap_main_loop函数，该函数运行kvm_run函数，运行客户机代码。
/* PC hardware initialisation */
static void pc_init1(ram_addr_t ram_size,
                     const char *boot_device,
                     const char *kernel_filename,
                     const char *kernel_cmdline,
                     const char *initrd_filename,
                     const char *cpu_model,
                     int pci_enabled)
{
/* Abridged excerpt: call pc_new_cpu(cpu_model) smp_cpus times. The
 * declarations of i and env and the rest of the function body are not
 * shown here. */
for (i = 0; i < smp_cpus; i++) {
        env = pc_new_cpu(cpu_model);
    }

}

/*
 * Start a thread running ap_main_loop() with env as its argument, then
 * wait on qemu_vcpu_cond until env->created becomes non-zero.
 *
 * NOTE(review): presumably the new thread sets env->created once it is up;
 * that code is not part of this excerpt.
 */
void kvm_init_vcpu(CPUState *env)
{
    pthread_create(&env->kvm_cpu_state.thread, NULL, ap_main_loop, env);

    while (env->created == 0)
        qemu_cond_wait(&qemu_vcpu_cond);
}
执行客户机线程调用函数ap_main_loop，该函数最终调用函数kvm_main_loop_cpu，该函数工作过程如下：
1.注入中断，执行客户机代码，处理客户机退出的原因，例如 KVM_EXIT_MMIO、KVM_EXIT_IO。如果处理成功，继续运行；如果失败，进入步骤2。
2.如果该vcpu存在已传递但尚未处理的信号SIG_IPI或SIGBUS，该线程阻塞，也就意味着暂停执行客户机代码，直到相应信号处理完毕。
3.上述过程完成后，继续执行客户机代码。
/*
 * Per-vcpu loop. On every iteration it decides whether the vcpu should
 * run: if so it calls kvm_cpu_exec() followed by kvm_main_loop_wait() with
 * a timeout of 0; otherwise it calls kvm_main_loop_wait() with a timeout
 * of 1000.
 */
static int kvm_main_loop_cpu(CPUState *env)
{
    while (1) {
        int run_cpu = !is_cpu_stopped(env);
        /* Without an in-kernel irqchip, process irqchip events here and
         * do not run the vcpu while env->halted is set. */
        if (run_cpu && !kvm_irqchip_in_kernel()) {
            process_irqchip_events(env);
            run_cpu = !env->halted;
        }
        if (run_cpu) {
            kvm_cpu_exec(env);
            kvm_main_loop_wait(env, 0);
        } else {
            kvm_main_loop_wait(env, 1000);
        }
    }
    /* NOTE(review): the loop above contains no break, so the two
     * statements below are unreachable as written. */
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}
/*
 * Wait for SIG_IPI / SIGBUS on the calling vcpu thread and keep looping
 * while either of them is still pending, then flush the queued work for
 * this vcpu and acknowledge a pending stop request.
 *
 * env:     vcpu state of the calling thread.
 * timeout: upper bound for each sigtimedwait() call, in milliseconds
 *          (split into seconds and nanoseconds below).
 *
 * qemu_mutex is released only around sigtimedwait() and is re-acquired
 * before any other work is done, so it is held again on return.
 * Any sigtimedwait() error other than EAGAIN/EINTR, and any sigpending()
 * error, prints a message and terminates the process with exit(1).
 */
static void kvm_main_loop_wait(CPUState *env, int timeout)
{
    struct timespec ts;
    int r, e;
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;

    ts.tv_sec = timeout / 1000;
    ts.tv_nsec = (timeout % 1000) * 1000000;
    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        pthread_mutex_unlock(&qemu_mutex);

        r = sigtimedwait(&waitset, &siginfo, &ts);
        /* Save errno right away: taking the mutex below may overwrite it. */
        e = errno;

        pthread_mutex_lock(&qemu_mutex);

        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
            printf("sigtimedwait: %s\n", strerror(e));
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            kvm_on_sigbus(env, &siginfo);
            break;
        default:
            break;
        }
        r = sigpending(&chkset);
        if (r == -1) {
            /* Report sigpending()'s own error rather than the stale value
             * saved after sigtimedwait() above. */
            e = errno;
            printf("sigpending: %s\n", strerror(e));
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));

    cpu_single_env = env;
    flush_queued_work(env);

    /* Turn a stop request into the stopped state and signal
     * qemu_pause_cond. */
    if (env->stop) {
        env->stop = 0;
        env->stopped = 1;
        pthread_cond_signal(&qemu_pause_cond);
    }

    env->kvm_cpu_state.signalled = 0;
}
3.异步io文件操作线程
创建io操作线程，进行读写操作。可以通过gdb跟踪，验证查看io线程
/*
 * Start one more aio_thread worker and count it as both a current and an
 * idle thread. All signals are blocked around thread_create() and the
 * previous signal mask is restored afterwards; any failure of
 * sigfillset()/sigprocmask() goes through die().
 *
 * NOTE(review): presumably the mask is filled so that the worker starts
 * with every signal blocked; this depends on thread_create(), which is not
 * shown here. thread_id and attr are also defined outside this excerpt.
 */
static void spawn_thread(void)
{
    sigset_t set, oldset;

    cur_threads++;
    idle_threads++;

    /* block all signals */
    if (sigfillset(&set)) die("sigfillset");
    if (sigprocmask(SIG_SETMASK, &set, &oldset)) die("sigprocmask");

    thread_create(&thread_id, &attr, aio_thread, NULL);

    /* Restore the mask that was in effect before the worker was created. */
    if (sigprocmask(SIG_SETMASK, &oldset, NULL)) die("sigprocmask restore");
}

/*
 * Queue one AIO request. The request is marked -EINPROGRESS and inactive;
 * then, with lock held, a new worker is spawned only if no thread is idle
 * and cur_threads is still below max_threads, and the request is appended
 * to request_list. cond is signalled after the lock has been released.
 */
static void qemu_paio_submit(struct qemu_paiocb *aiocb)
{
    aiocb->ret = -EINPROGRESS;
    aiocb->active = 0;
    mutex_lock(&lock);
    /* A new thread is created only when nobody is idle and the limit
     * allows it; otherwise the request just joins the queue. */
    if (idle_threads == 0 && cur_threads < max_threads)
        spawn_thread();
    QTAILQ_INSERT_TAIL(&request_list, aiocb, node);
    mutex_unlock(&lock);
    cond_signal(&cond);
}
可见每启动一次bdrv_aio_readv或者raw_aio_writev操作，都会向请求队列提交一个请求；但只有在没有空闲线程（idle_threads == 0）且当前线程数小于max_threads时，才会新创建一个aio_thread线程，否则由已有的线程处理该请求。
/*
 * Asynchronous vectored read entry point: forwards every argument
 * unchanged to raw_aio_submit() with the request type QEMU_AIO_READ and
 * returns whatever raw_aio_submit() returns.
 */
static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
                          cb, opaque, QEMU_AIO_READ);
}

/*
 * Asynchronous vectored write entry point: forwards every argument
 * unchanged to raw_aio_submit() with the request type QEMU_AIO_WRITE and
 * returns whatever raw_aio_submit() returns.
 */
static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return raw_aio_submit(bs, sector_num, qiov, nb_sectors,
                          cb, opaque, QEMU_AIO_WRITE);
}

4.主线程与执行客户机代码线程，主线程与异步io文件操作线程同步----qemu_global_mutex
select阻塞（主线程）和执行客户机代码（客户机线程）不需要同步锁，这在qemu运行过程占的时间比例较大。
但是执行异步io文件操作时，占用qemu_global_mutex锁的。
select阻塞期间，实际上不需要持有该锁。
void main_loop_wait(int timeout)
{
   /* Abridged excerpt: the iothread lock is released only for the duration
    * of the select() call and taken again right after it returns. The
    * declarations of ret, nfds, rfds, wfds, xfds and tv are not shown. */
   qemu_mutex_unlock_iothread();// unlock
    ret = select(nfds + 1, &rfds, &wfds, &xfds, &tv);
    qemu_mutex_lock_iothread(); // lock again
}
执行客户机代码时不需要锁定
int kvm_cpu_exec(CPUState *env)
{
       /* Abridged excerpt: the iothread lock is released only around the
        * KVM_RUN ioctl and taken again once it returns. The declaration of
        * ret and the rest of the function, including its return statement,
        * are not shown. */
       qemu_mutex_unlock_iothread();// unlock
        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
        qemu_mutex_lock_iothread(); // lock again

}

Qemu IO thread初始化函数位于main-loop.c:

[cpp] view plaincopy
int qemu_init_main_loop(void)  
{  
    int ret;  
    GSource *src;  
  
    init_clocks();  
    if (init_timer_alarm() < 0) {  
        fprintf(stderr, "could not initialize alarm timer\n");  
        exit(1);  
    }  
  
    ret = qemu_signal_init();  
    if (ret) {  
        return ret;  
    }  
  
    qemu_aio_context = aio_context_new();  
    src = aio_get_g_source(qemu_aio_context);  
    g_source_attach(src, NULL);  
    g_source_unref(src);  
    return 0;  

init_clocks(): 依次创建rt_clock, vm_clock, host_clock三个时钟，这3个时钟区别，在qemu-timer.h中有简要的说明，后面讨论qemu时钟和定时器的时候，再详细的分析他们之间的区别。
init_timer_alarm(): QEMU-KVM默认支持两种timer, 分别是dynticks timer和unix timer; dynticks timer使用timer_create(CLOCK_REALTIME)创建时钟，然后通过timer_settime()触发SIGALRM信号, 而unix timer使用setitimer来触发SIGALRM信号; 前者的实现会使用rdtsc, 所以精度更高些。QEMU-KVM默认使用dynticks timer.
qemu_signal_init(): 该函数首先创建一个signalfd, 然后调用qemu_set_fd_handler2()函数将signalfd加入main loop的select文件句柄集合中；IO thread关注SIGIO, SIGALRM, SIGBUS这3个信号，当IO thread捕获其中一个信号，signalfd因此会变成可读状态，select()函数返回并调用该事件的回调函数sigfd_handler(), sigfd_handler()会调用为该信号注册的回调函数sa_handler。这3个信号中比较特别的是SIGALRM, 因为qemu-kvm的定时器，就是定期触发该信号, 它的回调函数是qemu-timer.c中的host_alarm_handler()。
qemu_aio_context: 用于处理aio handler调度的上下文，该调度系统利用了glib的main loop机制，相关背景知识可以参考http://blog.csdn.net/luo_brian/article/details/8540296

IO thread主函数

Qemu IO thread的主函数也位于main-loop.c, 主要考虑到跨平台性，使用select和g_poll轮询系统的IO描述符，并根据测试结果调用相应的回调函数。

Qemu中IO的有些回调函数是分上半部和下半部的，该机制在Linux内核中被广泛使用，主要是为了提高事件响应的及时性。

Qemu中常用的IO描述符有下面几类：

block io: 虚拟磁盘相关的io, 为了保证高性能，主要使用aio；
signalfd: 前面介绍过，qemu的时钟模拟利用了linux kernel的signalfd, 定期产生SIGALRM信号；
eventfd: 主要用于qemu和kvm之间的notifier, 比如qemu的模拟设备可以通过notifier向kvm发送一个模拟中断，kvm也可以通过notifier向qemu报告guest的各种状态；
socket: 用于虚拟机迁移，qmp管理等

该函数同时还负责轮询系统中所有的定时器，并调用定时器的回调函数；

[cpp] view plaincopy
/*
 * One iteration of the main loop: reset the fd sets, let slirp (when
 * CONFIG_SLIRP is defined) and qemu_iohandler_fill() populate them, wait in
 * os_host_main_loop_wait(), hand the result to qemu_iohandler_poll() (and
 * slirp_select_poll()), then run all timers.
 *
 * nonblocking: when non-zero the timeout starts at 0, otherwise at
 *              UINT32_MAX; with CONFIG_SLIRP, slirp_update_timeout() may
 *              adjust it.
 * Returns the value returned by os_host_main_loop_wait().
 *
 * NOTE(review): nfds, rfds, wfds and xfds are not declared in this
 * function; they come from a scope that is not part of this excerpt.
 */
int main_loop_wait(int nonblocking)
{
    int ret;
    uint32_t timeout = UINT32_MAX;

    if (nonblocking) {
        timeout = 0;
    }

    /* poll any events */
    /* XXX: separate device handlers from system ones */
    nfds = -1;
    FD_ZERO(&rfds);
    FD_ZERO(&wfds);
    FD_ZERO(&xfds);

#ifdef CONFIG_SLIRP
    slirp_update_timeout(&timeout);
    slirp_select_fill(&nfds, &rfds, &wfds, &xfds);
#endif
    qemu_iohandler_fill(&nfds, &rfds, &wfds, &xfds);
    ret = os_host_main_loop_wait(timeout);
    qemu_iohandler_poll(&rfds, &wfds, &xfds, ret);
#ifdef CONFIG_SLIRP
    /* The last argument is non-zero when the wait reported an error. */
    slirp_select_poll(&rfds, &wfds, &xfds, (ret < 0));
#endif

    qemu_run_all_timers();

    return ret;
}

slirp（user mode networking）

在KVM环境中，为了提高网络的性能一般虚拟机使用vhost-net, 所以slirp基本没有太多实际用途，也建议在编译配置qemu-kvm的时候，使用--disable-slirp将其禁止掉。关于slirp的介绍，可以看一下http://wiki.qemu.org/Documentation/Networking。

IO Handler

用来表示一个IO描述符，其结构定义如下；iohandler.c中定义了一个全局的链表io_handlers，并提供qemu_set_fd_handler()和qemu_set_fd_handler2()函数将一个fd加入到这个链表；在IO thread主循环中qemu_iohandler_fill()函数负责将io_handlers链表中的所有描述符，加入select测试集合。

[cpp] view plaincopy
/*
 * One registered file descriptor together with its callbacks; records are
 * chained through the `next` list entry.
 */
typedef struct IOHandlerRecord {
    IOCanReadHandler *fd_read_poll;   /* optional poll callback for the read side */
    IOHandler *fd_read;               /* read callback */
    IOHandler *fd_write;              /* write callback */
    void *opaque;                     /* caller-supplied pointer; presumably passed to the callbacks */
    QLIST_ENTRY(IOHandlerRecord) next;
    int fd;                           /* the file descriptor this record describes */
    bool deleted;                     /* deletion flag; NOTE(review): exact removal protocol not shown here */
} IOHandlerRecord;
  
/* File-scope head of the IOHandlerRecord list, statically initialised. */
static QLIST_HEAD(, IOHandlerRecord) io_handlers =
    QLIST_HEAD_INITIALIZER(io_handlers);

AIO Handler

Qemu中的aio主要用于block io, 从而可以提高虚拟磁盘读写的性能。Qemu使用g_poll轮询测试AIO描述符；g_poll是glib的函数，使用方法可以参考GMainLoop的实现原理和代码模型以及glib GMainLoop的手册；

aio_set_fd_handler()用于将一个AioHandler的描述符加入g_poll集合。

[cpp] view plaincopy
/*
 * AIO context: a GSource (first member) followed by the list of registered
 * AIO handlers, the list of bottom halves owned by the context, and the
 * event notifier used for aio_notify.
 */
typedef struct AioContext {
    /* Embedded GLib source; it is the first member of the struct. */
    GSource source;

    /* The list of registered AIO handlers */
    QLIST_HEAD(, AioHandler) aio_handlers;

    /* This is a simple lock used to protect the aio_handlers list.
     * Specifically, it's used to ensure that no callbacks are removed while
     * we're walking and dispatching callbacks.
     */
    int walking_handlers;

    /* Anchor of the list of Bottom Halves belonging to the context */
    struct QEMUBH *first_bh;

    /* A simple lock used to protect the first_bh list, and ensure that
     * no callbacks are removed while we're walking and dispatching callbacks.
     */
    int walking_bh;

    /* Used for aio_notify.  */
    EventNotifier notifier;
} AioContext;
  
/*
 * One entry of an AioContext's aio_handlers list, chained through `node`.
 * It holds an EventNotifier pointer, two callbacks (io_notify, io_flush),
 * a GPollFD and a deletion flag.
 * NOTE(review): when each callback is invoked is not visible in this
 * excerpt.
 */
struct AioHandler {
    EventNotifier *e;
    EventNotifierHandler *io_notify;
    AioFlushEventNotifierHandler *io_flush;
    GPollFD pfd; int deleted;
    QLIST_ENTRY(AioHandler) node;
};

IO thread同步

Qemu IO thread和vcpu thread使用一个全局共享线程锁来保证同步，函数qemu_mutex_lock_iothread()和qemu_mutex_unlock_iothread()分别用来获取和释放该锁；

当vcpu thread从guest模式退出到host模式的时候，vcpu thread会尝试取得该锁；而IO thread在主循环中，也会不断尝试取得该锁；

思考

使用epoll替换select和g_poll：qemu是一个通用模拟器，在xen和kvm中，用于为hypervisor提供设备模拟服务；同时它也能仿真arm、mips等系统，还需要考虑跨平台(windows+linux)，所以不少Linux系统特有的高级函数，qemu的主干都没有使用，epoll就是其中之一；select和g_poll的移植性好，但是其O(n)的复杂度以及频繁的描述符拷贝会造成不小的开销，在KVM上下文中，使用epoll应该是比较明智的选择。
同步问题：IO thread和vcpu threads在qemu中共享上下文，所以block io的性能与bare metal相比，即使是使用aio，还是会有比较大的损耗。Qemu 1.3开始引入dataplane，其核心思想是把block io从qemu上下文中剥离出去，作为一个单独的线程，这样，block io和vcpu之间不会再有竞争关系，从而可以大幅提高io性能；但是因为dataplane没有qemu上下文，所以也没法很好地支持qcow2等虚拟磁盘格式。

0 0