文件系统挂载源码跟踪

来源:互联网 发布:淘宝魅力惠 编辑:程序博客网 时间:2024/05/19 01:58

一、前言

       很早很早就有写技术博客、写技术日志的打算,可是由于种种原因一直没有动笔,归根结底还是懒惰造成的。现在终于痛下决心,计划每周抽点时间去阅读linux源码,然后记录一下自己的理解、体会。今天就从文件系统开始,阅读文件系统挂载相关的源代码,阅读的源代码基于linux-3.0.57,以下是个人对于源代码的理解。


二、文件系统mount命令调用源代码跟踪


         linux支持各种文件系统,各种文件系统都是通过VFS(虚拟文件系统)提供统一的接口来进行管理的。要使用某个文件系统,首先要挂载它,可以通过mount命令挂载,也可以在代码中调用mount函数挂载。那么,mount命令或者mount函数最终是调用什么函数来挂载文件系统的呢?两者最终都会通过系统调用进入内核。至于mount命令及mount函数在用户空间调用了什么函数、怎样发起系统调用,这边就不追究了,这边主要是阅读系统调用进入内核后是怎么实现文件系统挂载的。系统调用进入内核后会调用SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data)函数,下面就来研究一下该函数:
/*
 * mount(2) system call entry point.
 *
 * Brings the caller's arguments (filesystem type, mount point path, device
 * name and option data) into the kernel via copy_mount_string(), getname()
 * and copy_mount_options(), hands them to do_mount(), and then releases
 * every kernel-side copy in reverse order of acquisition.
 *
 * Returns the do_mount() result, or the error of the first copy step that
 * failed.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	unsigned long options;
	char *fstype;
	char *mountpoint;
	char *device;
	int err;

	err = copy_mount_string(type, &fstype);
	if (err < 0)
		goto done;

	mountpoint = getname(dir_name);
	if (IS_ERR(mountpoint)) {
		err = PTR_ERR(mountpoint);
		goto free_type;
	}

	err = copy_mount_string(dev_name, &device);
	if (err < 0)
		goto put_dir;

	err = copy_mount_options(data, &options);
	if (err < 0)
		goto free_dev;

	err = do_mount(device, mountpoint, fstype, flags, (void *) options);
	free_page(options);

	/* Unwind: each label releases what was acquired just before it. */
free_dev:
	kfree(device);
put_dir:
	putname(mountpoint);
free_type:
	kfree(fstype);
done:
	return err;
}
       上面函数中,SYSCALL_DEFINE5的定义在include\linux\syscalls.h中,这边就不具体展开了,这个函数各个参数的具体含义如下所示:
dev_name:将要挂上的文件系统,通常是一个设备名
dir_name:文件系统所要挂载到的目标目录
type:文件系统的类型,是一个字符串,如"ext3","msdos","proc","ntfs","iso9660"。。。
flags:挂载标志,可以是如下值,该值的具体定义在include\linux\fs.h中,没注释的不太懂其作用
/*
 * These are the fs-independent mount-flags: up to 32 flags are supported
 */
#define MS_RDONLY  1 /* Mount read-only,只读挂载  */
#define MS_NOSUID  2 /* Ignore suid and sgid bits,执行程序时,不遵照set-user-ID和set-group-ID位 */
#define MS_NODEV  4 /* Disallow access to device special files,不允许访问设备文件 */
#define MS_NOEXEC  8 /* Disallow program execution,不允许在挂上的文件系统上执行程序 */
#define MS_SYNCHRONOUS 16 /* Writes are synced at once,写数据时马上进行同步 */
#define MS_REMOUNT 32 /* Alter flags of a mounted FS,重新挂载文件系统,这允许你改变现存文件系统的mountflag和数据,而无需使用先卸载,再挂上文件系统的方式 */
#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS,允许在文件上执行强制锁 */
#define MS_DIRSYNC 128 /* Directory modifications are synchronous,同步目录的更新 */
#define MS_NOATIME 1024 /* Do not update access times.不要更新文件上的访问时间 */
#define MS_NODIRATIME 2048 /* Do not update directory access times,不要更新目录的访问时间 */
#define MS_BIND  4096  /* 执行bind挂载,使文件或者子目录树在文件系统内的另一个点上可视 */
#define MS_MOVE  8192 /* 移动子目录树 */
#define MS_REC  16384  /*  */
#define MS_VERBOSE 32768 /* War is peace. Verbosity is silence. MS_VERBOSE is deprecated. */
#define MS_SILENT 32768     /* 支持内核打印(printk)警告提示在kernel log,跟MS_VERBOSE有同样的作用 */
#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */
#define MS_UNBINDABLE (1<<17) /* change to unbindable,不可绑定 */
#define MS_PRIVATE (1<<18) /* change to private */
#define MS_SLAVE (1<<19) /* change to slave */
#define MS_SHARED (1<<20) /* change to shared */
#define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */
#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
#define MS_I_VERSION (1<<23) /* Update inode I_version field */
#define MS_STRICTATIME (1<<24) /* Always perform atime updates */
#define MS_NOSEC (1<<28)
#define MS_BORN  (1<<29)
#define MS_ACTIVE (1<<30)
#define MS_NOUSER (1<<31)
data:这个参数根据各个文件系统的不同而不同,一般是各文件系统自己能识别的字符串选项,大小不超过一个页框的大小,可以通过man 8 mount查看各个文件系统选项的详述
       该函数比较简单,主要是把这些参数拷贝到内核空间,然后调用do_mount函数,调用完毕后释放相应申请的空间。另外,提一下系统调用相关的内容:系统调用是通过中断实现的,不过不同于真正的硬件中断,而是由软件指令触发的中断,如x86系统传统上是通过int 0x80指令实现;还有就是系统调用的参数都是存储在寄存器中,由于寄存器个数有限,所以linux系统调用的参数最多不超过六个。下面我们分析一下do_mount函数。
/* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). * * data is a (void *) that can point to any structure up to * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent * information (or be NULL). * * Pre-0.97 versions of mount() didn't have a flags word. * When the flags word was introduced its top half was required * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. * Therefore, if this magic number is present, it carries no information * and must be discarded. */long do_mount(char *dev_name, char *dir_name, char *type_page,  unsigned long flags, void *data_page){struct path path;int retval = 0;int mnt_flags = 0;/* Discard magic,清除旧的挂载标志及掩码 */if ((flags & MS_MGC_MSK) == MS_MGC_VAL)flags &= ~MS_MGC_MSK;/* Basic sanity checks */if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))return -EINVAL;    /* 如果有字符串数据,则把页框末尾加上字符串结束标志 */if (data_page)((char *)data_page)[PAGE_SIZE - 1] = 0;/* ... 
and get the mountpoint,获取挂载点,主要是获取denty及mnt */retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);if (retval)return retval;    /* 安全检查 */retval = security_sb_mount(dev_name, &path,   type_page, flags, data_page);if (retval)goto dput_out;    /* 检查各种挂载标志 *//* Default to relatime unless overriden */if (!(flags & MS_NOATIME))mnt_flags |= MNT_RELATIME;/* Separate the per-mountpoint flags */if (flags & MS_NOSUID)mnt_flags |= MNT_NOSUID;if (flags & MS_NODEV)mnt_flags |= MNT_NODEV;if (flags & MS_NOEXEC)mnt_flags |= MNT_NOEXEC;if (flags & MS_NOATIME)mnt_flags |= MNT_NOATIME;if (flags & MS_NODIRATIME)mnt_flags |= MNT_NODIRATIME;if (flags & MS_STRICTATIME)mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);if (flags & MS_RDONLY)mnt_flags |= MNT_READONLY;flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |   MS_STRICTATIME);    /* 根据挂载标志,调用不同的函数 */if (flags & MS_REMOUNT)retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,    data_page); /* 重新挂载 */else if (flags & MS_BIND)retval = do_loopback(&path, dev_name, flags & MS_REC);else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))retval = do_change_type(&path, flags); /* 改变挂载点类型 */else if (flags & MS_MOVE)retval = do_move_mount(&path, dev_name); /* 改变挂载点子目录 */elseretval = do_new_mount(&path, type_page, flags, mnt_flags,      dev_name, data_page); dput_out:path_put(&path);return retval;}
        首先进行参数及标志的检查,获取挂载点的mnt及dentry,然后根据标志调用对应的函数来实现相关功能:如果带有MS_REMOUNT标志,则调用do_remount重新挂载文件系统;如果带有MS_BIND标志,则调用do_loopback函数执行bind挂载;如果带有MS_SHARED、MS_PRIVATE、MS_SLAVE或MS_UNBINDABLE标志,则调用do_change_type函数改变挂载点的类型;如果带有MS_MOVE标志,则调用do_move_mount函数,把已挂载的子树移动到新的挂载点;否则调用do_new_mount函数挂载文件系统。那么是怎么样获取挂载点的mnt及dentry的呢?主要是调用kern_path函数,kern_path函数的源代码如下所示:
int kern_path(const char *name, unsigned int flags, struct path *path){struct nameidata nd;int res = do_path_lookup(AT_FDCWD, name, flags, &nd);if (!res)*path = nd.path;return res;}
该函数主要是调用do_path_lookup(AT_FDCWD, name, flags, &nd);其中的AT_FDCWD宏表示相对路径以当前工作目录作为查找起点(以'/'开头的绝对路径仍然从根目录开始查找),这边name的值就是挂载点路径名dir_name,查找结果存储在struct nameidata结构体中,do_path_lookup具体的代码:
/*
 * Run path_lookupat() for @name with up to three attempts:
 *   1. with LOOKUP_RCU added to @flags;
 *   2. if that returned -ECHILD, again with @flags as given;
 *   3. if the latest attempt returned -ESTALE, again with LOOKUP_REVAL added.
 *
 * On success, and only when an audit context is active, the resolved
 * dentry is reported through audit_inode().
 *
 * Returns 0 on success or the error from the last path_lookupat() call.
 */
static int do_path_lookup(int dfd, const char *name,
				unsigned int flags, struct nameidata *nd)
{
	int err;

	err = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);

	if (unlikely(err == -ECHILD))
		err = path_lookupat(dfd, name, flags, nd);

	if (unlikely(err == -ESTALE))
		err = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);

	if (unlikely(err))
		return err;

	/* Successful lookup: record it for audit if auditing is in effect. */
	if (unlikely(!audit_dummy_context()) && nd->path.dentry && nd->inode)
		audit_inode(name, nd->path.dentry);

	return 0;
}

      内核文档path-lookup.txt中有这样的解释:路径遍历有两种方式,ref-walk及rcu-walk。ref-walk是传统的方式,使用d_lock及引用计数来保护dentry,该方式可以睡眠、获取锁等,其所有操作是基于目录项dentry;rcu-walk使用基于dentry的seqcount,rcu-walk在有些情况下是不能使用的,例如,文件系统必须睡眠或者不支持原子操作(这边是自己的理解)时,必须切换到ref-walk模式。首先,通过rcu-walk方式进行查找,如果rcu-walk无法继续查找(返回-ECHILD),则切换到ref-walk重新查找;如果查找返回-ESTALE,则带上LOOKUP_REVAL标志再查找一次(个人理解是强制对缓存的dentry重新进行验证)。查找通过path_lookupat函数实现。
/*
 * Resolve the path string name into nd, starting from the point selected
 * by path_init() for dfd.
 *
 * Returns 0 and nd will be valid on success; Returns error, otherwise.
 */
static int path_lookupat(int dfd, const char *name,
				unsigned int flags, struct nameidata *nd)
{
	struct file *base = NULL;
	struct path path;
	int err;

	/*
	 * Path walking is largely split up into 2 different synchronisation
	 * schemes, rcu-walk and ref-walk (explained in
	 * Documentation/filesystems/path-lookup.txt). These share much of the
	 * path walk code, but some things particularly setup, cleanup, and
	 * following mounts are sufficiently divergent that functions are
	 * duplicated. Typically there is a function foo(), and its RCU
	 * analogue, foo_rcu().
	 *
	 * -ECHILD is the error number of choice (just to avoid clashes) that
	 * is returned if some aspect of an rcu-walk fails. Such an error must
	 * be handled by restarting a traditional ref-walk (which will always
	 * be able to complete).
	 */

	/* Set up the starting point of the walk */
	err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);

	if (unlikely(err))
		return err;

	current->total_link_count = 0;

	/* Walk the path name to locate the dentry and mnt */
	err = link_path_walk(name, nd);

	/*
	 * If the walk succeeded and the caller did not ask for the parent
	 * only, the last component still has to be looked up.
	 */
	if (!err && !(flags & LOOKUP_PARENT)) {
		err = lookup_last(nd, &path);

		/*
		 * A positive result from lookup_last() enters this loop:
		 * follow the link and look up the last component again,
		 * repeating until the result is no longer positive.
		 * NOTE(review): presumably a positive value means the last
		 * component is a symlink -- lookup_last() is not shown here.
		 */
		while (err > 0) {
			void *cookie;
			struct path link = path;
			nd->flags |= LOOKUP_PARENT;
			err = follow_link(&link, nd, &cookie);
			if (!err)
				err = lookup_last(nd, &path);
			put_link(nd, &link, cookie);
		}
	}

	if (!err)
		err = complete_walk(nd);

	/* LOOKUP_DIRECTORY: the result must have a ->lookup inode operation */
	if (!err && nd->flags & LOOKUP_DIRECTORY) {
		if (!nd->inode->i_op->lookup) {
			path_put(&nd->path);
			err = -ENOTDIR;
		}
	}

	/* Release the file that path_init() handed back through base, if any */
	if (base)
		fput(base);

	/* Drop the root reference unless the caller supplied it (LOOKUP_ROOT) */
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		path_put(&nd->root);
		nd->root.mnt = NULL;
	}
	return err;
}
         首先要确定查找起点(根目录、当前工作目录或者dfd对应的目录)的dentry和vfsmount,这是通过调用path_init函数完成的;确定起点之后就可以继续查找了,最终是要找到挂载点的dentry和vfsmount,这一步查找通过调用link_path_walk函数完成:
/*
 * Initialise nd for a path walk of name: choose the starting path
 * (nd->path) and starting inode (nd->inode) according to flags, the first
 * character of name, and dfd.
 *
 * When LOOKUP_RCU is set the function takes br_read_lock(vfsmount_lock) and
 * rcu_read_lock() and records a d_seq sequence number in nd->seq instead of
 * taking references; otherwise it takes references on the starting path.
 *
 * In the dfd branch under LOOKUP_RCU, *fp receives the file when
 * fput_needed is set.
 *
 * Returns 0 on success, or -ENOTDIR / -EBADF / a permission-check error.
 */
static int path_init(int dfd, const char *name, unsigned int flags,
		     struct nameidata *nd, struct file **fp)
{
	int retval = 0;
	int fput_needed;
	struct file *file;

	nd->last_type = LAST_ROOT; /* if there are only slashes...; link_path_walk() overwrites last_type once it reaches a real last component */
	nd->flags = flags | LOOKUP_JUMPED;
	nd->depth = 0;

	/*
	 * LOOKUP_ROOT: nd->root is read here without being set, so it must
	 * have been filled in by the caller.  Validate it and start from it.
	 */
	if (flags & LOOKUP_ROOT) {
		struct inode *inode = nd->root.dentry->d_inode;
		if (*name) {
			if (!inode->i_op->lookup)
				return -ENOTDIR;
			retval = inode_permission(inode, MAY_EXEC); /* MAY_EXEC permission check on the starting inode */
			if (retval)
				return retval;
		}
		nd->path = nd->root;
		nd->inode = inode;

		/* rcu-walk: record the dentry seq; otherwise take a reference on the path */
		if (flags & LOOKUP_RCU) {
			br_read_lock(vfsmount_lock);
			rcu_read_lock();
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
		} else {
			path_get(&nd->path);
		}
		return 0;
	}

	nd->root.mnt = NULL;

	/* Name begins with '/': start from the root set by set_root_rcu()/set_root() */
	if (*name=='/') {
		if (flags & LOOKUP_RCU) {
			br_read_lock(vfsmount_lock);
			rcu_read_lock();
			set_root_rcu(nd);
		} else {
			set_root(nd);
			path_get(&nd->root);
		}
		nd->path = nd->root;
	} else if (dfd == AT_FDCWD) { /* Relative name with AT_FDCWD: start from current->fs->pwd */
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;

			br_read_lock(vfsmount_lock);
			rcu_read_lock();

			/* Retry until pwd and its d_seq are read under a stable fs->seq */
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
		}
	} else {
		/* Otherwise start from the open file that dfd refers to */
		struct dentry *dentry;

		file = fget_raw_light(dfd, &fput_needed);
		retval = -EBADF;
		if (!file)
			goto out_fail;

		dentry = file->f_path.dentry;

		/* A non-empty name requires dfd to be a directory we may search */
		if (*name) {
			retval = -ENOTDIR;
			if (!S_ISDIR(dentry->d_inode->i_mode))
				goto fput_fail;

			retval = file_permission(file, MAY_EXEC);
			if (retval)
				goto fput_fail;
		}

		nd->path = file->f_path;
		if (flags & LOOKUP_RCU) {
			if (fput_needed)
				*fp = file;
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			br_read_lock(vfsmount_lock);
			rcu_read_lock();
		} else {
			path_get(&file->f_path);
			fput_light(file, fput_needed);
		}
	}

	nd->inode = nd->path.dentry->d_inode;
	return 0;

fput_fail:
	fput_light(file, fput_needed);
out_fail:
	return retval;
}
       
/*
 * Walk the path string name one component at a time, advancing nd through
 * every component except the last.  On success (return 0) the final
 * component is left un-walked and described by nd->last / nd->last_type;
 * if name consists only of slashes the function returns 0 immediately.
 *
 * Returns 0 on success or a negative error.  Exits taken via "break" call
 * terminate_walk(nd) before returning; direct "return err" exits do not.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
	struct path next;
	int err;
	unsigned int lookup_flags = nd->flags;

	/* Skip leading '/' characters */
	while (*name=='/')
		name++;
	if (!*name)
		return 0;

	/* At this point we know we have a real path component. */

	/*
	 * Resolve the path one directory level at a time; each iteration
	 * hashes one component and, unless it is the last one, steps nd
	 * into it.
	 */
	for(;;) {
		unsigned long hash;
		struct qstr this;
		unsigned int c;
		int type;

		nd->flags |= LOOKUP_CONTINUE;

		/*
		 * Check that the walk may continue from the current position.
		 * NOTE(review): the original author notes that in rcu-walk mode
		 * this check can fail and trigger a switch to ref-walk --
		 * may_lookup() is not shown here, confirm.
		 */
		err = may_lookup(nd);
		if (err)
			break;

		this.name = name;
		c = *(const unsigned char *)name;

		/* Hash the component up to the next '/' or end of string */
		hash = init_name_hash();
		do {
			name++;
			hash = partial_name_hash(c, hash);
			c = *(const unsigned char *)name;
		} while (c && (c != '/'));
		this.len = name - (const char *) this.name;
		this.hash = end_name_hash(hash);

		/* Classify the component: "." , ".." or a normal name */
		type = LAST_NORM;
		if (this.name[0] == '.') switch (this.len) {
			case 2:
				if (this.name[1] == '.') {
					type = LAST_DOTDOT; /* parent directory */
					nd->flags |= LOOKUP_JUMPED;
				}
				break;
			case 1:
				type = LAST_DOT;   /* current directory */
		}
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
			nd->flags &= ~LOOKUP_JUMPED;
			/* Call the parent's own d_hash operation when DCACHE_OP_HASH is set */
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
				err = parent->d_op->d_hash(parent, nd->inode,
							   &this);
				if (err < 0)
					break;
			}
		}

		/* remove trailing slashes? */
		if (!c)
			goto last_component;
		while (*++name == '/');
		if (!*name)
			goto last_component;

		/*
		 * Step into this component: walk_component() handles "." and
		 * ".." via handle_dots() and looks up normal names via
		 * do_lookup().
		 */
		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
		if (err < 0)
			return err;

		/* Positive result: walk_component() found a link to follow */
		if (err) {
			err = nested_symlink(&next, nd);
			if (err)
				return err;
		}

		/* An intermediate component must have a ->lookup inode operation */
		err = -ENOTDIR;
		if (!nd->inode->i_op->lookup)
			break;
		continue;
		/* here ends the main loop */

last_component:
		/* Clear LOOKUP_CONTINUE iff it was previously unset */
		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
		nd->last = this;
		nd->last_type = type;
		return 0;
	}
	terminate_walk(nd); /* end the walk on error */
	return err;
}

/*
 * Advance nd by one path component.
 *
 * type is the LAST_* classification of the component; anything other than
 * LAST_NORM is handed to handle_dots().  follow is tested for non-zero and
 * controls whether an inode with a ->follow_link operation is reported to
 * the caller instead of being stepped into.
 *
 * Returns 0 when nd has moved into the component, 1 when the component has
 * ->follow_link and follow is set (path then describes the link and nd is
 * not advanced), or a negative error (-ENOENT when there is no inode,
 * -ECHILD when unlazy_walk() fails, or the do_lookup() error).
 */
static inline int walk_component(struct nameidata *nd, struct path *path,
		struct qstr *name, int type, int follow)
{
	struct inode *inode;
	int err;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
	if (unlikely(type != LAST_NORM))
		return handle_dots(nd, type);

	/*
	 * Look the name up.
	 * NOTE(review): the original author describes do_lookup() as first
	 * searching the dcache hash table (dentry_hashtable) and allocating a
	 * new dentry on a miss -- do_lookup() is not shown here, confirm.
	 */
	err = do_lookup(nd, name, path, &inode);
	if (unlikely(err)) {
		terminate_walk(nd);
		return err;
	}

	/* No inode for this name: clean up and report -ENOENT */
	if (!inode) {
		path_to_nameidata(path, nd);
		terminate_walk(nd);
		return -ENOENT;
	}

	if (unlikely(inode->i_op->follow_link) && follow) {
		/* Leave rcu-walk mode before handing the link back to the caller */
		if (nd->flags & LOOKUP_RCU) {
			if (unlikely(unlazy_walk(nd, path->dentry))) {
				terminate_walk(nd);
				return -ECHILD;
			}
		}
		BUG_ON(inode != path->dentry->d_inode);
		return 1;
	}

	/* Ordinary component: make it the current position of the walk */
	path_to_nameidata(path, nd);
	nd->inode = inode;
	return 0;
}

        至此,最终挂载点的vfsmount、dentry已经找到了,其指针存在path变量中,下面就要根据不同的挂载标志调用不同的挂载函数了。对于标志MS_REMOUNT,调用do_remount函数,只是改变一个已经挂载的文件系统的挂载选项而已;对于标志MS_BIND,调用do_loopback函数,执行bind挂载,即把一个已有的文件或目录树绑定到另一个挂载点上使其可见;对于标志MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE,调用do_change_type函数,只是改变挂载点的传播类型而已;对于标志MS_MOVE,调用do_move_mount函数,把已挂载的子树移动到新的挂载点;剩下的标志或者没有传入挂载标志则调用do_new_mount函数,真正挂载文件系统。本博客只是分析文件系统的挂载,所以只分析do_new_mount函数了。
/* * create a new mount for userspace and request it to be added into the * namespace's tree */static int do_new_mount(struct path *path, char *type, int flags,int mnt_flags, char *name, void *data){struct vfsmount *mnt;int err;if (!type)return -EINVAL;/* we need capabilities...,检查是是否有权限,一般只有拥有特权用户权限的程序才能挂载 */if (!capable(CAP_SYS_ADMIN))return -EPERM;    /* 先查找对应的file_system_type结构,然后申请初始化一个vfsmount及vfsmount id,然后调用对应的     * mount函数,如ext4调用邋ext4_mount函数     */mnt = do_kern_mount(type, flags, name, data);if (IS_ERR(mnt))return PTR_ERR(mnt);    /* 把vfsmount添加到namespace的vfsmount树 */err = do_add_mount(mnt, path, mnt_flags);if (err)mntput(mnt);return err;}struct vfsmount *do_kern_mount(const char *fstype, int flags, const char *name, void *data){    /* 通过文件系统类型名,如"ext4","vfat"等找对应file_system_type结构 */struct file_system_type *type = get_fs_type(fstype);struct vfsmount *mnt;if (!type)return ERR_PTR(-ENODEV);mnt = vfs_kern_mount(type, flags, name, data);if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&    !mnt->mnt_sb->s_subtype)mnt = fs_set_subtype(mnt, fstype);put_filesystem(type);return mnt;}struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data){struct vfsmount *mnt;struct dentry *root;if (!type)return ERR_PTR(-ENODEV);    /* 从mnt_cache专业缓冲池中申请一个vfsmount,并申请一个vfsmount id,初始化 */mnt = alloc_vfsmnt(name);if (!mnt)return ERR_PTR(-ENOMEM);    /* 如果kern mount调用的,如一些虚拟的文件系统,如pipe,共享内存,把mnt的flags置为初始化标志 */if (flags & MS_KERNMOUNT)mnt->mnt_flags = MNT_INTERNAL;    /* 根据不同file_system_type调用对应的mount函数,如ext4文件系统调用的是ext4_mount函数 */root = mount_fs(type, flags, name, data);if (IS_ERR(root)) {free_vfsmnt(mnt);return ERR_CAST(root);}mnt->mnt_root = root;mnt->mnt_sb = root->d_sb;mnt->mnt_mountpoint = mnt->mnt_root;mnt->mnt_parent = mnt;return mnt;}
 
          首先要检查权限,一般只有拥有特权用户权限的程序才能挂载,有几种方式可以使程序拥有特权用户权限:在root用户下运行程序,或者使用su、sudo暂时获取特权用户权限。对于没有CAP_SYS_ADMIN权限的情况,直接返回不允许操作的错误(-EPERM)。有权限了就可以进行挂载了,先要查找对应文件系统的file_system_type结构,我们传进去的是一个字符串如“ext4”,找到了就会申请和初始化一个vfsmount及vfsmount id,然后调用具体文件系统的mount函数,如ext4就调用ext4_mount函数。最后,还要把vfsmount添加到namespace的vfsmount树上,这样文件系统挂载就大功告成了。对于具体文件系统的mount函数各有不同,这边就不具体分析了。

三、总结

         以上分析是基于自己的理解,可能有很多分析错误的地方,对于这文件系统也刚接触,还需要花更多时间去分析。通过这次分析,我觉得我们在编程中可以借鉴使用的是字符串哈希,seq锁等。

原创粉丝点击