qemu-kvm 设备虚拟化----I/O 端口和 I/O 内存

来源:互联网 发布:mac word转pdf 白边 编辑:程序博客网 时间:2024/04/30 08:10
操作设备存在两种接口:I/O 端口和 I/O 内存,下面分析虚拟机如何截获和模拟这两种情况的。

1.用户空间访问内核数据结构信息
内存映射可被实现来提供用户程序对设备内存的直接存取,KVM 内核代表每个 VCPU 的 struct kvm_run 数据结构被 mmap用户空间,从而用户空间可以读取 struct kvm_run 中的信息,对于mmio读写操作来说,可以知道
其地址和大小。当然可以读取其他的信息例如struct kvm_coalesced_mmio_ring等。
mmap_size = kvm_ioctl(kvm_state, KVM_GET_VCPU_MMAP_SIZE, 0);
env->kvm_run =mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, env->kvm_fd,0);

static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct kvm_vcpu *vcpu = vma->vm_file->private_data;
        struct page *page;

        if (vmf->pgoff == 0)
                page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
        else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->arch.pio_data);
#endif  
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
        else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
        else
                return VM_FAULT_SIGBUS;
        get_page(page);
        vmf->page = page;
        return 0;
}
        
static const struct vm_operations_struct kvm_vcpu_vm_ops = {
        .fault = kvm_vcpu_fault,
};

static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_ops = &kvm_vcpu_vm_ops;
        return 0;
}

2.I/O 内存-mmio的截获和模拟流程

首先用户态对mmio地址的读写函数注册
然后MMIO 会被 KVM 内核截获
最后从KVM 内核返回到用户空间,由 qemu-kvm 来完成 MMIO 读写的模拟。 

qemu-kvm 用户态
=================
设备都注册自己特定mmio地址的读写函数(read/write) 函数
iomemtype = cpu_register_io_memory(hpet_ram_read,hpet_ram_write, s);
cpu_register_physical_memory(HPET_BASE, 0x400, iomemtype);

int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
                           CPUWriteMemoryFunc * const *mem_write,
                           void *opaque)
{
    return cpu_register_io_memory_fixed(0, mem_read, mem_write, opaque);
}

static int cpu_register_io_memory_fixed(int io_index,
                                        CPUReadMemoryFunc * const *mem_read,
                                        CPUWriteMemoryFunc * const *mem_write,
                                        void *opaque)
{
    int i, subwidth = 0;

    if (io_index <= 0) {
        io_index = get_free_io_mem_idx();
        if (io_index == -1)
            return io_index;
    } else {
        io_index >>= IO_MEM_SHIFT;
        if (io_index >= IO_MEM_NB_ENTRIES)
            return -1;
    }

    for(i = 0;i < 3; i++) {
        if (!mem_read[i] || !mem_write[i])
            subwidth = IO_MEM_SUBWIDTH;
        io_mem_read[io_index][i] = mem_read[i];
        io_mem_write[io_index][i] = mem_write[i];
    }
    io_mem_opaque[io_index] = opaque;
    return (io_index << IO_MEM_SHIFT) | subwidth;
}



int kvm_run(CPUState *env)
{


    case KVM_EXIT_MMIO:
            r = handle_mmio(env);
}

static int handle_mmio(CPUState *env)
{           
    unsigned long addr = env->kvm_run->mmio.phys_addr;
    struct kvm_run *kvm_run = env->kvm_run;
    void *data = kvm_run->mmio.data;
            
    /* hack: Red Hat 7.1 generates these weird accesses. */
    if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3)
        return 0;
            
    cpu_physical_memory_rw(addr, data, kvm_run->mmio.len, kvm_run->mmio.is_write);
    return 0;


void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
                            int len, int is_write)
{
    int l, io_index;
    uint8_t *ptr;
    uint32_t val;
    target_phys_addr_t page;
    unsigned long pd;
    PhysPageDesc *p;

    while (len > 0) {
        page = addr & TARGET_PAGE_MASK;
        l = (page + TARGET_PAGE_SIZE) - addr;
        if (l > len)
            l = len;
        p = phys_page_find(page >> TARGET_PAGE_BITS);
        if (!p) {
            pd = IO_MEM_UNASSIGNED;
        } else {
            pd = p->phys_offset;
        }
   if (is_write) {
            if ((pd & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
                target_phys_addr_t addr1 = addr;
                io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
                if (p)
                    addr1 = (addr & ~TARGET_PAGE_MASK) + p->region_offset;
                /* XXX: could force cpu_single_env to NULL to avoid
                   potential bugs */
                if (l >= 4 && ((addr1 & 3) == 0)) {
                    /* 32 bit write access */
                    val = ldl_p(buf);
                    io_mem_write[io_index][2](io_mem_opaque[io_index], addr1, val);
                    l = 4;
                } else if (l >= 2 && ((addr1 & 1) == 0)) {
                    /* 16 bit write access */
                    val = lduw_p(buf);
                    io_mem_write[io_index][1](io_mem_opaque[io_index], addr1, val);
                    l = 2;
                } else {
                    /* 8 bit write access */
                    val = ldub_p(buf);
                    io_mem_write[io_index][0](io_mem_opaque[io_index], addr1, val);
                    l = 1;
                }
            } else {
                unsigned long addr1;
                addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
                /* RAM case */
                ptr = qemu_get_ram_ptr(addr1);
                memcpy(ptr, buf, l);
                if (!cpu_physical_memory_is_dirty(addr1)) {
                    /* invalidate code */
                    tb_invalidate_phys_page_range(addr1, addr1 + l, 0);
                    /* set dirty bit */
                if (!cpu_physical_memory_get_dirty(addr1, MIGRATION_DIRTY_FLAG))
                        ram_list.dirty_pages++;
                    ram_list.phys_dirty[addr1 >> TARGET_PAGE_BITS] |=
                        (0xff & ~CODE_DIRTY_FLAG);
                }
                /* qemu doesn't execute guest code directly, but kvm does
                   therefore flush instruction caches */
                if (kvm_enabled())
                    flush_icache_range((unsigned long)ptr,
                                       ((unsigned long)ptr)+l);
            }
        } else {
            if ((pd & ~TARGET_PAGE_MASK) > IO_MEM_ROM &&
                !(pd & IO_MEM_ROMD)) {
                target_phys_addr_t addr1 = addr;
                /* I/O case */
                io_index = (pd >> IO_MEM_SHIFT) & (IO_MEM_NB_ENTRIES - 1);
                if (p)
                    addr1 = (addr & ~TARGET_PAGE_MASK) + p->region_offset;
                if (l >= 4 && ((addr1 & 3) == 0)) {
                    /* 32 bit read access */
                    val = io_mem_read[io_index][2](io_mem_opaque[io_index], addr1);
                    stl_p(buf, val);
                    l = 4;
                } else if (l >= 2 && ((addr1 & 1) == 0)) {
                    /* 16 bit read access */
                    val = io_mem_read[io_index][1](io_mem_opaque[io_index], addr1);
                    stw_p(buf, val);
                   l = 2;
                } else {
                    /* 8 bit read access */
                    val = io_mem_read[io_index][0](io_mem_opaque[io_index], addr1);
                    stb_p(buf, val);
                    l = 1;
                }
            } else {
                /* RAM case */
                ptr = qemu_get_ram_ptr(pd & TARGET_PAGE_MASK) +
                    (addr & ~TARGET_PAGE_MASK);
                memcpy(buf, ptr, l);
            }
        }
        len -= l;
        buf += l;
        addr += l;
    }
}

qemu-kvm 内核态
==============
static int handle_exception(struct kvm_vcpu *vcpu)
{
       if (is_page_fault(intr_info)) {
                /* EPT won't cause page fault directly */
                if (enable_ept)
                        BUG();
                cr2 = vmcs_readl(EXIT_QUALIFICATION);
                trace_kvm_page_fault(cr2, error_code);

                if (kvm_event_needs_reinjection(vcpu))
                        kvm_mmu_unprotect_page_virt(vcpu, cr2);
                return kvm_mmu_page_fault(vcpu, cr2, error_code);
        }
}


int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
        int r;
        enum emulation_result er;

        r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
        if (r < 0)
                goto out;
        
        if (!r) {
                r = 1;       
                goto out;
        }        

        r = mmu_topup_memory_caches(vcpu);
        if (r)
                goto out;

        er = emulate_instruction(vcpu, cr2, error_code, 0);

        switch (er) {
        case EMULATE_DONE:
                return 1;
        case EMULATE_DO_MMIO:
                ++vcpu->stat.mmio_exits;
                return 0; 
        case EMULATE_FAIL:
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
                vcpu->run->internal.ndata = 0;
                return 0;
        default:
                BUG();
        }
out:
        return r;
}
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
                               u32 error_code)
{
        /* mmio */
        if (is_error_pfn(pfn)) {
                pgprintk("gfn %lx is mmio\n", walker.gfn);
                kvm_release_pfn_clean(pfn);
                return is_fault_pfn(pfn) ? -EFAULT : 1;
        }
}
int emulate_instruction(struct kvm_vcpu *vcpu,
                        unsigned long cr2,
                        u16 error_code,
                        int emulation_type)
{
     if ((r || vcpu->mmio_is_write) && run) {
                run->exit_reason = KVM_EXIT_MMIO;
                run->mmio.phys_addr = vcpu->mmio_phys_addr;
                memcpy(run->mmio.data, vcpu->mmio_data, 8);
                run->mmio.len = vcpu->mmio_size;
                run->mmio.is_write = vcpu->mmio_is_write;
        }

      if (r) {
                if (reexecute_instruction(vcpu, cr2))
                        return EMULATE_DONE;
                if (!vcpu->mmio_needed) {
                        kvm_report_emulation_failure(vcpu, "mmio");
                        return EMULATE_FAIL;
                }
                return EMULATE_DO_MMIO;
        }

        kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);

        if (vcpu->mmio_is_write) {
                vcpu->mmio_needed = 0;
                return EMULATE_DO_MMIO;
        }

        return EMULATE_DONE;
}

内核态返回到用户态
======================
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
  r = __vcpu_run(vcpu);

out:
        post_kvm_run_save(vcpu);
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);

        vcpu_put(vcpu);
        return r;
}

static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{       
        struct kvm_run *kvm_run = vcpu->run;
               kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
        kvm_run->cr8 = kvm_get_cr8(vcpu);
        kvm_run->apic_base = kvm_get_apic_base(vcpu);
        if (irqchip_in_kernel(vcpu->kvm))
                kvm_run->ready_for_interrupt_injection = 1;
        else
                kvm_run->ready_for_interrupt_injection =
                        kvm_arch_interrupt_allowed(vcpu) &&
                        !kvm_cpu_has_interrupt(vcpu) &&
                        !kvm_event_needs_reinjection(vcpu);
}
2  coalesced mmio截获和模拟流程。 
这种方式模拟 MMIO 会被 KVM 内核截取,但 KVM 并不会立即跳出到 qemu-kvm 用户空间,KVM 将需要模拟的读写操作形成一个记录 (struct kvm_coalesced_mmio), 放在在代表整个 VM 的 struct kvm 所指向的一个环形缓冲区中 (struct kvm_coalesced_mmio_ring), 这个环形缓冲区被 mmap 到了用户空间。 当下一次代表某个 VCPU 的 qemu-kvm 线程返回到用户空间后,就会对环形缓冲区中的记录进行处理,执行 MMIO 读写模拟。 对于这种方式, qemu-kvm 一次模拟的可能是已经被积累起来的多个 MMIO 读写操作, 显然这种方式是一种性能优化,它适合于对响应时间要求不是很严格的 MMIO 写操作。
1.用户态对mmio地址的读写函数注册
2.然后MMIO 会被 KVM 内核截获
3.不返回到用户空间,积累起来的多个 MMIO 读写操作。 
qemu-kvm 用户态
=================
设备都注册自己特定coalesced mmio地址的读写函数(read/write) 函数
 d->mmio_index = cpu_register_io_memory(e1000_mmio_read, e1000_mmio_write, d);
 cpu_register_physical_memory(addr, PNPMMIO_SIZE, d->mmio_index);
 qemu_register_coalesced_mmio(addr, excluded_regs[0]);

qemu-kvm 内核态
==============
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
        int r;
        enum emulation_result er;

        r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
        if (r < 0)
                goto out;
        
        if (!r) {
                r = 1;       
                goto out;
        }        

        r = mmu_topup_memory_caches(vcpu);
        if (r)
                goto out;

        er = emulate_instruction(vcpu, cr2, error_code, 0);

        switch (er) {
        case EMULATE_DONE:
                return 1;
        case EMULATE_DO_MMIO:
                ++vcpu->stat.mmio_exits;
                return 0; 
        case EMULATE_FAIL:
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
                vcpu->run->internal.ndata = 0;
                return 0;
        default:
                BUG();
        }
out:
        return r;
}

static int emulator_write_emulated_onepage(unsigned long addr,
                                           const void *val,
                                           unsigned int bytes,
                                           struct kvm_vcpu *vcpu)
{
      /* For APIC access vmexit */
        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
                goto mmio;

        if (emulator_write_phys(vcpu, gpa, val, bytes))
                return X86EMUL_CONTINUE;

mmio:
        trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
        /*
         * Is this MMIO handled locally?
         */
        if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
                return X86EMUL_CONTINUE;

        vcpu->mmio_needed = 1;
        vcpu->mmio_phys_addr = gpa;
        vcpu->mmio_size = bytes;
        vcpu->mmio_is_write = 1;
        memcpy(vcpu->mmio_data, val, bytes);

        return X86EMUL_CONTINUE;
}

static int coalesced_mmio_write(struct kvm_io_device *this,
                                gpa_t addr, int len, const void *val)
{
        struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
        struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
        if (!coalesced_mmio_in_range(dev, addr, len))
                return -EOPNOTSUPP;

        spin_lock(&dev->lock);
    
        /* copy data in first free entry of the ring */
    
        ring->coalesced_mmio[ring->last].phys_addr = addr;
        ring->coalesced_mmio[ring->last].len = len;
        memcpy(ring->coalesced_mmio[ring->last].data, val, len);
        smp_wmb();
        ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
        spin_unlock(&dev->lock);
        return 0;
}
内核态返回到用户态
==================
不返回用户态,积累起来的多个 MMIO 读写操作。当其他vm exit退出时,返回用户态,再进行coalesced_mmio处理。

////////////////////////////
3.I/O 端口模拟和截获---pio
io指令大概分为两种单字和字串指令。下面只分析单字io端口。
例如 
Intel语法的in、out指令格式为:
    IN 累加器, {端口号│DX}
    OUT {端口号│DX},累加器

1.设备端口的注册
以内核模拟i8259设备为例说明,如果采用用户空间模拟设备,需要返回到用户空间处理handle_io,过程稍复杂。

        kvm_iodevice_init(&s->dev, &picdev_ops);
        mutex_lock(&kvm->slots_lock);
        ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
        mutex_unlock(&kvm->slots_lock);
        if (ret < 0) {
                kfree(s);
                return NULL;
        }
2.截获io端口操作 
static int handle_io(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification;
        int size, in, string;
        unsigned port;

        ++vcpu->stat.io_exits;
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        string = (exit_qualification & 16) != 0;

        if (string) {
                if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
                        return 0;
                return 1;
        }

        size = (exit_qualification & 7) + 1;
        in = (exit_qualification & 8) != 0;
        port = exit_qualification >> 16;

        skip_emulated_instruction(vcpu);
        return kvm_emulate_pio(vcpu, in, size, port);
}

 int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
{
        unsigned long val;

        trace_kvm_pio(!in, port, size, 1);

        vcpu->run->exit_reason = KVM_EXIT_IO;
        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
        vcpu->run->io.size = vcpu->arch.pio.size = size;
        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
        vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
        vcpu->run->io.port = vcpu->arch.pio.port = port;
        vcpu->arch.pio.in = in;
        vcpu->arch.pio.string = 0;
        vcpu->arch.pio.down = 0;
        vcpu->arch.pio.rep = 0;

        val = kvm_register_read(vcpu, VCPU_REGS_RAX);
        memcpy(vcpu->arch.pio_data, &val, 4);

        if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
                complete_pio(vcpu);
                return 1;
        }
        return 0;
}
3.调用i8259设备注册读写函数,进行模拟 
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
{       
        /* TODO: String I/O for in kernel device */
        int r;
        
        if (vcpu->arch.pio.in)
                r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
                                    vcpu->arch.pio.size, pd);
        else
                r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
                                     vcpu->arch.pio.port, vcpu->arch.pio.size,
                                     pd);
        return r;
}   
int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val)
{
        int i;
        struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);

        for (i = 0; i < bus->dev_count; i++)
                if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
                        return 0;
        return -EOPNOTSUPP;
}

static inline int kvm_iodevice_read(struct kvm_io_device *dev,
                                    gpa_t addr, int l, void *v)
{
        return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP;
}
//8259读操作函数
static int picdev_read(struct kvm_io_device *this,
                       gpa_t addr, int len, void *val)
{
        struct kvm_pic *s = to_pic(this);
        unsigned char data = 0;
        if (!picdev_in_range(addr))
                return -EOPNOTSUPP;

        if (len != 1) {
                if (printk_ratelimit())
                        printk(KERN_ERR "PIC: non byte read\n");
                return 0;
        }
        pic_lock(s);
        switch (addr) {
        case 0x20:
        case 0x21:
        case 0xa0:
        case 0xa1:
                data = pic_ioport_read(&s->pics[addr >> 7], addr);
                break;
        case 0x4d0:
        case 0x4d1:
                data = elcr_ioport_read(&s->pics[addr & 1], addr);
                break;
        }
        *(unsigned char *)val = data;
        pic_unlock(s);
        return 0;
}  
4.pio完成后,保存相应寄存器(rax)到vmcs中。          
int complete_pio(struct kvm_vcpu *vcpu)
{
        struct kvm_pio_request *io = &vcpu->arch.pio;
        long delta;
        int r;
        unsigned long val;

        if (!io->string) {
                if (io->in) {
                        val = kvm_register_read(vcpu, VCPU_REGS_RAX);
                        memcpy(&val, vcpu->arch.pio_data, io->size);
                        kvm_register_write(vcpu, VCPU_REGS_RAX, val);
                }
        }
}
                                              
0 0