[知其然不知其所以然-35] nvdimm[0] using systemtap

来源:互联网 发布:淘宝买家素质差 编辑:程序博客网 时间:2024/05/18 20:12

nvdimm is a storage device whose data is not lost after power-off, and whose access speed

is close to that of DRAM. This article is a walkthrough of how to study pmem on Linux.


Basically, you can use memmap=size!base to tell the kernel that you want to emulate a pmem device

based on the e820 table. How to choose size/base is a topic of its own: you should make sure the region does not

exceed the maximum e820 memory, is aligned to 128M, and does not overlap with

other e820 regions. So memmap=256M!4G would be a proper choice (note the '!': with '@' the region is treated as ordinary RAM, not as persistent memory).


After boot, you should find /dev/pmem0 created for you. You can treat it as a generic block device, or

you can play with another interesting feature: DAX. DAX stands for direct access, which means that

the page cache is not involved; all you need to do is mmap a file from user space

and access the data directly. Let's take a look at how to play with it.


First, you need to mount a filesystem in DAX mode; currently only ext2/ext4/xfs support DAX mode,

for example:

# Format the pmem partition as ext4 with 4096-byte blocks.
mkfs.ext4 -b 4096 /dev/pmem0p1

Note that, for DAX, the filesystem block size needs to be the same as the page size.

then

# Mount the filesystem on /mnt/pmem with the dax option enabled.
mount -o dax /dev/pmem0p1 /mnt/pmem


OK, after the preparation, we can use systemtap to track what's going on when we play with DAX. For example, here is a stap script used to check

whether the filesystem is mounted with DAX at the moment an inode's flags are set:


#! /usr/bin/env stap

# Trace ext4_set_inode_flags() calls on ext4 filesystems whose mount options
# have bit 0x200 set, printing the calling process, the CPU, the function
# parameters and a kernel backtrace for each hit.
#
# Alternative probe points that were tried (these take $mapping, not $inode):
#probe kernel.function("ext4_writepages"){
#probe kernel.function("dax_writeback_mapping_range"){
probe kernel.function("ext4_set_inode_flags")
{
        # s_fs_info is an opaque pointer in struct super_block, so it is cast
        # to ext4_sb_info to reach s_mount_opt.
        # NOTE(review): 0x200 is presumably EXT4_MOUNT_DAX -- confirm against
        # fs/ext4/ext4.h of the kernel being traced.
        if (@cast($inode->i_sb->s_fs_info, "ext4_sb_info")->s_mount_opt & 0x200) {
        # Variant of the test above that names the header for @cast explicitly:
        #if (@cast($inode->i_sb->s_fs_info, "ext4_sb_info", "kernel<fs/ext4/ext4.h>")->s_mount_opt & 0x200) {
        # Variant for the $mapping-based probe points
        # (NOTE(review): 0x2000 is presumably S_DAX -- confirm):
        #if ($mapping->host->i_flags == 0x2000) {
                print("----------------START-------------------------\n")
                printf("In process [%s]\n", execname())
                printf("on CPU [%d]\n", cpu())
                # $$parms$ pretty-prints every parameter of the probed function.
                printf("%s\n", $$parms$);
                #printf("inode flag: %x\n", $mapping->host->i_flags);
                print_backtrace()
                print("----------------END-------------------------\n")
                //exit()
        }
}


../install/bin/stap dax_ext4.stp
----------------START-------------------------
In process [touch]
on CPU [2]
inode={.i_mode=33206, .i_opflags=0, .i_uid={...}, .i_gid={...}, .i_flags=0, .i_acl=0xffffffffffffffff, .i_default_acl=0xffffffffffffffff, .i_op=0xffffffff8200d840, .i_sb=0xffff8800a208b000, .i_mapping=0xffff88005c4123e0, .i_security=0x0, .i_ino=12, <union>={...}, .i_rdev=0, .i_size=0, .i_atime={...}, .i_mtime={...}, .i_ctime={...}, .i_lock={...}, .i_bytes=0, .i_blkbits=12, .i_blocks=0, .i_state=0, .i_rwsem={...}, .dirtied_when=0, .dirtied_time_when=0, .i_hash={...}, .i_io_list={...}, .i_wb=0x0, .i_wb_frn_w
 0xffffffff81294f70 : ext4_set_inode_flags+0x0/0x80 [kernel]
 0xffffffff81290a20 : __ext4_new_inode+0xe00/0x1360 [kernel]
 0xffffffff812a27c3 : ext4_create+0xc3/0x170 [kernel]
 0xffffffff812188a6 : path_openat+0xe06/0x1360 [kernel]
 0xffffffff8121a34e : do_filp_open+0x7e/0xd0 [kernel]
 0xffffffff812086f5 : do_sys_open+0x115/0x1f0 [kernel]
 0xffffffff812087ee : sys_open+0x1e/0x20 [kernel]
 0xffffffff8100391e : do_syscall_64+0x6e/0x170 [kernel]
 0xffffffff817f6ae5 : return_from_SYSCALL_64+0x0/0x6a [kernel]
----------------END-------------------------

Or, a more complex script that also dumps the backtrace of the block-layer request function (nd_blk_make_request) is:

#! /usr/bin/env stap

# Follow one inode from flag setup to writeback:
#   1. remember the inode seen by ext4_set_inode_flags() on a filesystem whose
#      mount options have bit 0x200 set,
#   2. report ext4_writepages() calls that target that inode,
#   3. report every call to nd_blk_make_request().

# Inode pointer recorded by the first probe and matched by the second one.
global new_inode

# Alternative probe points that were tried (these take $mapping, not $inode):
#probe kernel.function("ext4_writepages"){
#probe kernel.function("dax_writeback_mapping_range"){
probe kernel.function("ext4_set_inode_flags").return
{
        # NOTE(review): 0x200 is presumably EXT4_MOUNT_DAX -- confirm against
        # fs/ext4/ext4.h of the kernel being traced.
        if (@cast($inode->i_sb->s_fs_info, "ext4_sb_info")->s_mount_opt & 0x200) {
        # Variant of the test above that names the header for @cast explicitly:
        #if (@cast($inode->i_sb->s_fs_info, "ext4_sb_info", "kernel<fs/ext4/ext4.h>")->s_mount_opt & 0x200) {
                print("The return of ext4_set_inode_flags\n")
                printf("In process [%s]\n", execname())
                printf("on CPU [%d]\n", cpu())
                #printf("%s\n", $$parms$);
                #print_backtrace()
                # NOTE(review): in a .return probe, $inode->i_flags is presumably
                # the value captured at function entry, not the value after the
                # function ran -- confirm against stapprobes(3stap) if the
                # post-call flags are what is wanted here.
                printf("ext4_set_inode_flags, new inode:%p, flag:%x\n", $inode, $inode->i_flags);
                new_inode = $inode;
                //exit()
        }
}

probe kernel.function("ext4_writepages")
{
        # Only report writeback of the inode recorded above.
        if (new_inode == $mapping->host) {
                printf("ext4_writepages, inode:%p, flag:%x\n", $mapping->host, $mapping->host->i_flags);
                print_backtrace()
                # NOTE(review): 0x2000 is presumably S_DAX -- confirm against
                # include/linux/fs.h of the kernel being traced.
                if ($mapping->host->i_flags & 0x2000) {
                        printf("We are writing DAX inode, in process [%s]\n", execname())
                        printf("on CPU [%d]\n", cpu())
                        print_backtrace()
                }
        }
}

# Alternative probe points that were tried:
#probe kernel.function("dax_fault"){
#probe kernel.function("dax_dev_fault"){
probe kernel.function("nd_blk_make_request")
{
        printf("We are in nd_blk_make_request, in process [%s]\n", execname())
        printf("on CPU [%d]\n", cpu())
        print_backtrace()
}
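It is exercised with the following test program, which mmaps the file given on the command line and writes to the mapping: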

#include <stdio.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/* Length of the mapping in bytes. */
enum { MAP_LEN = 4096 };

/*
 * Map the first MAP_LEN bytes of the file named by argv[1] as a shared,
 * read/write mapping, store a byte pattern through the mapping, and then
 * sleep forever so that the process (and its mapping) stays alive while it
 * is being traced.
 *
 * Returns -1 if no file is given or if open()/mmap() fails; otherwise the
 * program never returns and has to be terminated by a signal.
 */
int main(int argc, char *argv[])
{
        unsigned char *addr;
        void *map;
        int fd, i;

        if (argc != 2) {
                fprintf(stderr, "Please provide the file you want to mmap\n");
                return -1;
        }

        fd = open(argv[1], O_RDWR);
        if (fd == -1) {
                perror("Can not open the file");
                return -1;
        }

        /*
         * PROT_EXEC is deliberately not requested: nothing is executed from
         * the mapping, and asking for it makes mmap() fail on file systems
         * mounted noexec.
         */
        map = mmap(NULL, MAP_LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED) {
                perror("Can not mmap file");
                close(fd);
                return -1;
        }

        /* The mapping stays valid after the descriptor is closed. */
        close(fd);

        /* Touch every byte so the whole page is faulted in and written. */
        addr = map;
        for (i = 0; i < MAP_LEN; i++)
                addr[i] = (unsigned char)i;

        /* Keep the mapping alive until the process is killed. */
        for (;;)
                sleep(1);
}

However, no nd_blk_make_request backtrace is printed, so it looks like mmap on a

DAX file does not go through this code path. Instead, we check the page-fault code path to find out how

DAX is executed. So let's switch to fs/dax.c and look at dax_insert_mapping_entry:

We are in dax_insert_mapping_entry, in process [mmap_pmem]
on CPU [2]
 0xffffffff8125933c : dax_fault+0x62c/0xad0 [kernel]
 0xffffffff8128d98f : ext4_dax_fault+0xbf/0x150 [kernel]
 0xffffffff811b26b0 : __do_fault+0x70/0xf0 [kernel]
 0xffffffff811b7cc0 : handle_mm_fault+0x620/0x12f0 [kernel]
 0xffffffff8106600d : __do_page_fault+0x1dd/0x4d0 [kernel]
 0xffffffff81066330 : do_page_fault+0x30/0x80 [kernel]
 0xffffffff817f8be8 : page_fault+0x28/0x30 [kernel]


0 0
原创粉丝点击