设备节点创建过程源代码分析

来源:互联网 发布:seo自学论坛 编辑:程序博客网 时间:2024/06/06 18:27

阅读本文需要linux文件系统基础知识。以下引用的kernel源代码,都是基于linux kernel源代码版本:3.4。


以下分析基于下面的假设:

根设备使用SD卡设备(块设备),根文件系统使用ext4文件系统。


linux kernel在初始化的最后阶段,会加载“根文件系统”,按照前面的假设,也就是加载一个ext4文件系统作为根文件系统,这个文件系统位于SD卡上。

在加载这个根文件系统前,kernel会先加载一个虚拟的根文件系统,名叫rootfs文件系统,这个rootfs文件系统才是kernel真正意义上mount的第一个文件系统。它是基于内存的文件系统(并没有对应一个非易失性存储设备,而像ext, fat这些文件系统都会对应一个外部存储设备)。

这个rootfs 文件系统会mount到自己的根目录 “/” 上,mount完毕后会将自己的根目录设为进程的根目录。现在假设这个rootfs文件系统已经mount好了(暂不分析rootfs文件系统的mount过程)。

kernel在安装根文件系统前会先安装根设备节点:/dev/root。ext4根文件系统必须存在于一个根设备上,比如硬盘,SD/MMC卡。根设备节点用来描述指定的根设备。


kernel在kernel_init函数中会调用prepare_namespace函数(init/do_mounts.c),prepare_namespace函数先从启动参数中获得根设备名称,保存在root_device_name中,这里名称就是/dev/mmcblk0p1。 

调用mount_root挂载根文件系统。在mount_root函数中就会创建设备节点了。

/*
 * Decide what the root device is and mount the root filesystem.
 *
 * Takes the root device name from saved_root_name, converts it to a device
 * number (ROOT_DEV), mounts the root via mount_root(), and finally moves the
 * mounted tree onto "/" and chroots into it.
 */
void __init prepare_namespace(void)
{
	int is_floppy;

	if (root_delay) {
		printk(KERN_INFO "Waiting %dsec before mounting root device...\n",
		       root_delay);
		ssleep(root_delay);
	}

	/*
	 * wait for the known devices to complete their probing
	 *
	 * Note: this is a potential source of long boot delays.
	 * For example, it is not atypical to wait 5 seconds here
	 * for the touchpad of a laptop to initialize.
	 */
	wait_for_device_probe();

	md_run_setup();

	if (saved_root_name[0]) {
		root_device_name = saved_root_name;
		if (!strncmp(root_device_name, "mtd", 3) ||
		    !strncmp(root_device_name, "ubi", 3)) {
			mount_block_root(root_device_name, root_mountflags);
			goto out;
		}
		printk(KERN_INFO "-----ROOT_DEV_NAME----: %s\n", root_device_name);
		/* Convert the device name into a device number. */
		ROOT_DEV = name_to_dev_t(root_device_name);
		/* Skip the leading "/dev/" prefix. */
		if (strncmp(root_device_name, "/dev/", 5) == 0)
			root_device_name += 5;
	}

	if (initrd_load())
		goto out;

	/* wait for any asynchronous scanning to complete */
	if ((ROOT_DEV == 0) && root_wait) {
		printk(KERN_INFO "Waiting for root device %s...\n",
			saved_root_name);
		while (driver_probe_done() != 0 ||
			(ROOT_DEV = name_to_dev_t(saved_root_name)) == 0)
			msleep(100);
		async_synchronize_full();
	}

	is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR;

	if (is_floppy && rd_doload && rd_load_disk(0))
		ROOT_DEV = Root_RAM0;

	/* Mount the root filesystem. */
	mount_root();
out:
	devtmpfs_mount("dev");
	sys_mount(".", "/", NULL, MS_MOVE, NULL);
	sys_chroot((const char __user __force *)".");
}

mount_root():    (init/do_mounts.c)

/*
 * Mount the root filesystem.
 *
 * Depending on configuration, tries an NFS root first and handles floppy
 * roots; for block devices it creates the "/dev/root" node for ROOT_DEV and
 * mounts it with mount_block_root().
 */
void __init mount_root(void)
{
#ifdef CONFIG_ROOT_NFS
	if (ROOT_DEV == Root_NFS) {
		if (mount_nfs_root())
			return;

		printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n");
		ROOT_DEV = Root_FD0;
	}
#endif
#ifdef CONFIG_BLK_DEV_FD
	if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) {
		/* rd_doload is 2 for a dual initrd/ramload setup */
		if (rd_doload==2) {
			if (rd_load_disk(1)) {
				ROOT_DEV = Root_RAM1;
				root_device_name = NULL;
			}
		} else
			change_floppy("root floppy");
	}
#endif
#ifdef CONFIG_BLOCK
	/* Create the root device node, then mount the filesystem found on it. */
	create_dev("/dev/root", ROOT_DEV);
	mount_block_root("/dev/root", root_mountflags);
#endif
}

这个函数通过调用create_dev函数创建根设备节点。create_dev函数实际调用了mknod系统调用对应的内核函数sys_mknod。

参数为: name = "/dev/root", mode = S_IFBLK|0600,dev = xxx;

/*
 * Create a block-device node called @name for device number @dev.
 *
 * Any existing entry with that name is unlinked first; the node is then
 * created with mode S_IFBLK|0600. Returns the result of sys_mknod().
 */
static inline int create_dev(char *name, dev_t dev)
{
	int ret;

	sys_unlink(name);
	ret = sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev));

	return ret;
}

/*
 * mknod system call: forwards to sys_mknodat() using AT_FDCWD as the
 * directory file descriptor.
 */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
	return sys_mknodat(AT_FDCWD, filename, mode, dev);
}

最后实际是调用sys_mknodat函数(fs/namei.c),SYSCALL_DEFINE4是函数定义的宏,展开后就变成sys_mknodat函数。

sys_mknodat函数调用user_path_create得到要创建节点的path,节点父目录的信息保存在path中,然后调用vfs_mknod创建节点。

/*
 * mknodat system call: create a filesystem node named @filename, resolved
 * relative to @dfd.
 *
 * Directories are rejected with -EPERM. user_path_create() yields the
 * (negative) dentry for the new node and fills @path with its parent;
 * the actual creation is dispatched on the file type bits of @mode.
 */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned, dev)
{
	struct dentry *dentry;
	struct path path;
	int error;

	if (S_ISDIR(mode))
		return -EPERM;

	dentry = user_path_create(dfd, filename, &path, 0);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	/* Apply the umask unless the parent inode uses POSIX ACLs. */
	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = may_mknod(mode);
	if (error)
		goto out_dput;
	error = mnt_want_write(path.mnt);
	if (error)
		goto out_dput;
	error = security_path_mknod(&path, dentry, mode, dev);
	if (error)
		goto out_drop_write;
	switch (mode & S_IFMT) {
		case 0: case S_IFREG:
			error = vfs_create(path.dentry->d_inode,dentry,mode,NULL);
			break;
		case S_IFCHR: case S_IFBLK:
			error = vfs_mknod(path.dentry->d_inode,dentry,mode,
					new_decode_dev(dev));
			break;
		case S_IFIFO: case S_IFSOCK:
			error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
			break;
	}
out_drop_write:
	mnt_drop_write(path.mnt);
out_dput:
	dput(dentry);
	/* Releases the parent i_mutex that kern_path_create() took. */
	mutex_unlock(&path.dentry->d_inode->i_mutex);
	path_put(&path);

	return error;
}

user_path_create:   (fs/namei.c)


简单的调用了kern_path_create函数后返回。此时参数分别为dfd = AT_FDCWD,tmp = "/dev/root",path为输出参数,is_dir = 0;

/*
 * User-space flavour of kern_path_create(): copies @pathname in with
 * getname(), delegates to kern_path_create(), and releases the copy.
 *
 * Returns the dentry from kern_path_create(), or the getname() error
 * re-cast as a dentry pointer.
 */
struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
{
	struct dentry *res;
	char *tmp = getname(pathname);

	if (!IS_ERR(tmp)) {
		res = kern_path_create(dfd, tmp, path, is_dir);
		putname(tmp);
	} else {
		res = ERR_CAST(tmp);
	}

	return res;
}


kern_path_create函数先调用do_path_lookup搜索路径,搜索方式是搜索最后一个目录项的父目录。然后再调用lookup_hash搜索目录最后一项,也就是root项。

/*
 * Look up the parent directory of @pathname and return a dentry for its
 * last component, which must not exist yet.
 *
 * On success *@path is set to the parent path and the parent's i_mutex is
 * still held (it is only unlocked on the failure paths here). On failure an
 * ERR_PTR is returned: -EEXIST if the last component already has an inode or
 * is not a normal name, -ENOENT for a trailing slash when @is_dir is zero.
 */
struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir)
{
	struct dentry *dentry = ERR_PTR(-EEXIST);
	struct nameidata nd;
	int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
	if (error)
		return ERR_PTR(error);

	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
	if (nd.last_type != LAST_NORM)
		goto out;
	nd.flags &= ~LOOKUP_PARENT;
	nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
	nd.intent.open.flags = O_EXCL;

	/*
	 * Do the final lookup.
	 */
	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = lookup_hash(&nd);
	if (IS_ERR(dentry))
		goto fail;

	if (dentry->d_inode)
		goto eexist;
	/*
	 * Special case - lookup gave negative, but... we had foo/bar/
	 * From the vfs_mknod() POV we just have a negative dentry -
	 * all is fine. Let's be bastards - you had / on the end, you've
	 * been asking for (non-existent) directory. -ENOENT for you.
	 */
	if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
		dput(dentry);
		dentry = ERR_PTR(-ENOENT);
		goto fail;
	}
	*path = nd.path;
	return dentry;
eexist:
	dput(dentry);
	dentry = ERR_PTR(-EEXIST);
fail:
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
out:
	path_put(&nd.path);
	return dentry;
}

do_path_lookup函数如下(fs/namei.c)。

调用path_lookupat进行搜索。加上LOOKUP_RCU参数。vfs文件系统路径搜索主要有两种mode,rcu-walk和 ref-walk。 这两种方式在源代码目录 Documentation/filesystems/path-lookup.txt 的文档中有详细说明。

/*
 * Resolve @name into @nd, trying path_lookupat() up to three times:
 * first with LOOKUP_RCU, again without it if that returns -ECHILD, and
 * once more with LOOKUP_REVAL if the result is -ESTALE.
 *
 * On success the result is reported to audit unless the audit context is
 * a dummy. Returns 0 or a negative errno from path_lookupat().
 */
static int do_path_lookup(int dfd, const char *name,
				unsigned int flags, struct nameidata *nd)
{
	int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
	if (unlikely(retval == -ECHILD))
		retval = path_lookupat(dfd, name, flags, nd);
	if (unlikely(retval == -ESTALE))
		retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);

	if (likely(!retval)) {
		if (unlikely(!audit_dummy_context())) {
			if (nd->path.dentry && nd->inode)
				audit_inode(name, nd->path.dentry);
		}
	}
	return retval;
}

path_lookupat函数先调用path_init初始化路径,可以看作设置路径的根(搜索的起点)。VFS中目录项是用dentry和inode描述的,这里可以看作设置根路径对应的dentry。


static int path_lookupat(int dfd, const char *name,unsigned int flags, struct nameidata *nd){struct file *base = NULL;struct path path;int err;/* * Path walking is largely split up into 2 different synchronisation * schemes, rcu-walk and ref-walk (explained in * Documentation/filesystems/path-lookup.txt). These share much of the * path walk code, but some things particularly setup, cleanup, and * following mounts are sufficiently divergent that functions are * duplicated. Typically there is a function foo(), and its RCU * analogue, foo_rcu(). * * -ECHILD is the error number of choice (just to avoid clashes) that * is returned if some aspect of an rcu-walk fails. Such an error must * be handled by restarting a traditional ref-walk (which will always * be able to complete). */err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);      //找到搜索的起点,保存在nd中;if (unlikely(err))return err;current->total_link_count = 0;err = link_path_walk(name, nd);                                   //从起点开始路径的搜索,其中nd用来返回搜索结果;if (!err && !(flags & LOOKUP_PARENT)) {err = lookup_last(nd, &path);while (err > 0) {void *cookie;struct path link = path;nd->flags |= LOOKUP_PARENT;err = follow_link(&link, nd, &cookie);if (!err)err = lookup_last(nd, &path);put_link(nd, &link, cookie);}}if (!err)err = complete_walk(nd);if (!err && nd->flags & LOOKUP_DIRECTORY) {if (!nd->inode->i_op->lookup) {path_put(&nd->path);err = -ENOTDIR;}}if (base)fput(base);if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {path_put(&nd->root);nd->root.mnt = NULL;}return err;}


主要调用set_root_rcu设置根路径,将根路径保存在nd->root。

static int path_init(int dfd, const char *name, unsigned int flags,     struct nameidata *nd, struct file **fp){int retval = 0;int fput_needed;struct file *file;nd->last_type = LAST_ROOT; /* if there are only slashes... */nd->flags = flags | LOOKUP_JUMPED;nd->depth = 0;if (flags & LOOKUP_ROOT) {struct inode *inode = nd->root.dentry->d_inode;if (*name) {if (!inode->i_op->lookup)return -ENOTDIR;retval = inode_permission(inode, MAY_EXEC);if (retval)return retval;}nd->path = nd->root;nd->inode = inode;if (flags & LOOKUP_RCU) {br_read_lock(vfsmount_lock);rcu_read_lock();nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);} else {path_get(&nd->path);}return 0;}nd->root.mnt = NULL;if (*name=='/') {if (flags & LOOKUP_RCU) {br_read_lock(vfsmount_lock);rcu_read_lock();                     set_root_rcu(nd);                       //设置根路径} else {set_root(nd);path_get(&nd->root);}nd->path = nd->root;                            //搜索的初始路径设为根路径} else if (dfd == AT_FDCWD) {if (flags & LOOKUP_RCU) {struct fs_struct *fs = current->fs;unsigned seq;br_read_lock(vfsmount_lock);rcu_read_lock();do {seq = read_seqcount_begin(&fs->seq);nd->path = fs->pwd;nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);} while (read_seqcount_retry(&fs->seq, seq));} else {get_fs_pwd(current->fs, &nd->path);}} else {struct dentry *dentry;file = fget_raw_light(dfd, &fput_needed);retval = -EBADF;if (!file)goto out_fail;dentry = file->f_path.dentry;if (*name) {retval = -ENOTDIR;if (!S_ISDIR(dentry->d_inode->i_mode))goto fput_fail;retval = inode_permission(dentry->d_inode, MAY_EXEC);if (retval)goto fput_fail;}nd->path = file->f_path;if (flags & LOOKUP_RCU) {if (fput_needed)*fp = file;nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);br_read_lock(vfsmount_lock);rcu_read_lock();} else {path_get(&file->f_path);fput_light(file, fput_needed);}}nd->inode = nd->path.dentry->d_inode;             //设置根目录的inodereturn 0;fput_fail:fput_light(file, fput_needed);out_fail:return retval;}


用当前进程的根路径设置nd->root。

/*
 * If nd->root is not set yet, copy the current process's root path into it
 * and sample the root dentry's sequence count into nd->seq. The copy is
 * retried until fs->seq shows no concurrent change.
 */
static __always_inline void set_root_rcu(struct nameidata *nd)
{
	if (!nd->root.mnt) {
		struct fs_struct *fs = current->fs;
		unsigned seq;

		do {
			seq = read_seqcount_begin(&fs->seq);
			/* nd's root becomes the current process's fs root. */
			nd->root = fs->root;
			nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	}
}


path_init返回后,调用link_path_walk正式开始搜索,搜索的结果会保存在struct nameidata结构体nd中。


struct nameidata {struct pathpath;struct qstrlast;                                         //保存最后一个目录项struct pathroot;struct inode*inode; /* path.dentry.d_inode */unsigned intflags;unsignedseq;intlast_type;unsigneddepth;char *saved_names[MAX_NESTED_LINKS + 1];/* Intent data */union {struct open_intent open;} intent;};

如果路径名以‘/’开头,就把它跳过去,因为在这种情况下nd->path已经指向本进程的根目录了。如果路径名中仅仅包含‘/’字符的话,那么其搜索目标就是根目录,所以任务完成。

调用hash_name,hash_name函数计算第一个目录项的hash值并返回目录项长度,对于本例中的/dev/root,第一个目录项是dev,长度为3。用name和len构造struct qstr结构体。若发现是最后一个目录项,就将struct qstr赋给nd->last, 然后直接返回,若不是最后一项,就调用walk_component搜索这个目录项。

static int link_path_walk(const char *name, struct nameidata *nd){struct path next;int err;while (*name=='/')name++;if (!*name)return 0;/* At this point we know we have a real path component. */for(;;) {struct qstr this;long len;int type;err = may_lookup(nd); if (err)break;len = hash_name(name, &this.hash);this.name = name;this.len = len;type = LAST_NORM;if (name[0] == '.') switch (len) {case 2:if (name[1] == '.') {type = LAST_DOTDOT;nd->flags |= LOOKUP_JUMPED;}break;case 1:type = LAST_DOT;}if (likely(type == LAST_NORM)) {struct dentry *parent = nd->path.dentry;nd->flags &= ~LOOKUP_JUMPED;if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {err = parent->d_op->d_hash(parent, nd->inode,   &this);if (err < 0)break;}}if (!name[len])                                                   //若是最后一项,就直接返回;goto last_component;/* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */do {len++;} while (unlikely(name[len] == '/'));if (!name[len])goto last_component;                name += len;                        //运行到这里,表示当前节点为中间节点;                        err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);      //搜索目录项;if (err < 0)return err;if (err) {err = nested_symlink(&next, nd);if (err)return err;}if (can_lookup(nd->inode))continue;err = -ENOTDIR; break;/* here ends the main loop */last_component:nd->last = this;nd->last_type = type;return 0;}terminate_walk(nd);return err;}

权限检查。

/*
 * Check MAY_EXEC permission on nd->inode before descending into it.
 *
 * With LOOKUP_RCU the check is first made with MAY_NOT_BLOCK; only if that
 * returns -ECHILD does the walk leave RCU mode (unlazy_walk) and repeat the
 * check without MAY_NOT_BLOCK.
 */
static inline int may_lookup(struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU) {
		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
		if (err != -ECHILD)
			return err;
		if (unlazy_walk(nd, NULL))
			return -ECHILD;
	}
	return inode_permission(nd->inode, MAY_EXEC);
}

hash_name返回第一个目录项的长度。

/*
 * Hash one path component: consume characters of @name up to (not
 * including) the next '/' or NUL, store the finished hash in *@hashp and
 * return the component length. At least one character is always consumed.
 */
static inline unsigned long hash_name(const char *name, unsigned int *hashp)
{
	/* NOTE(review): the article states init_name_hash() is defined as 0. */
	unsigned long hash = init_name_hash();
	unsigned long len = 0, c;

	c = (unsigned char)*name;
	do {
		len++;
		/* Fold this character into the running hash. */
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
	} while (c && c != '/');
	/* NOTE(review): the article states end_name_hash() simply returns hash. */
	*hashp = end_name_hash(hash);
	return len;
}


计算hash值;

/*
 * Fold one character @c into the running name hash @prevhash and return
 * the new running value.
 */
static inline unsigned long
partial_name_hash(unsigned long c, unsigned long prevhash)
{
	return (prevhash + (c << 4) + (c >> 4)) * 11;
}


walk_component:  (fs/namei.c)

调用do_lookup函数搜索目录项的inode。如果找不到inode节点,则终止搜索。如果这个inode节点被mount上另一个文件系统,更新搜索路径。

static inline int walk_component(struct nameidata *nd, struct path *path,struct qstr *name, int type, int follow){struct inode *inode;int err;/* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */if (unlikely(type != LAST_NORM))return handle_dots(nd, type);err = do_lookup(nd, name, path, &inode);if (unlikely(err)) {terminate_walk(nd);return err;}if (!inode) {path_to_nameidata(path, nd);terminate_walk(nd);return -ENOENT;}if (should_follow_link(inode, follow)) {if (nd->flags & LOOKUP_RCU) {if (unlikely(unlazy_walk(nd, path->dentry))) {terminate_walk(nd);return -ECHILD;}}BUG_ON(inode != path->dentry->d_inode);return 1;}path_to_nameidata(path, nd);                                              //将path的内容转化到nd中;nd->inode = inode;return 0;}


do_lookup: (fs/namei.c)  做实际的路径搜索工作。

do_lookup先获取要搜索目录项的父目录parent,根据LOOKUP_RCU标志,调用__d_lookup_rcu或者__d_lookup。假设没有LOOKUP_RCU标志,则进入__d_lookup函数,若找到则__d_lookup返回找到的dentry,用找到的dentry给path->dentry赋值,将dentry对应的dentry->d_inode赋给输出参数inode,inode有可能为NULL。

/*
 * Look up @name in the directory nd->path.dentry.
 *
 * The dcache is consulted first (__d_lookup_rcu() under LOOKUP_RCU,
 * otherwise __d_lookup()); on a miss, or when the cached dentry needs a
 * real lookup or fails revalidation, __lookup_hash() is called under the
 * parent's i_mutex. On success @path holds the resulting mount/dentry and
 * *@inode its inode (which may be NULL). Returns 0 or a negative errno.
 */
static int do_lookup(struct nameidata *nd, struct qstr *name,
			struct path *path, struct inode **inode)
{
	struct vfsmount *mnt = nd->path.mnt;
	struct dentry *dentry, *parent = nd->path.dentry;
	int need_reval = 1;
	int status = 1;
	int err;

	/*
	 * Rename seqlock is not required here because in the off chance
	 * of a false negative due to a concurrent rename, we're going to
	 * do the non-racy lookup, below.
	 */
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
		*inode = nd->inode;
		dentry = __d_lookup_rcu(parent, name, &seq, inode);
		if (!dentry)
			goto unlazy;

		/* Memory barrier in read_seqcount_begin of child is enough */
		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
			return -ECHILD;
		nd->seq = seq;

		if (unlikely(d_need_lookup(dentry)))
			goto unlazy;
		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
			status = d_revalidate(dentry, nd);
			if (unlikely(status <= 0)) {
				if (status != -ECHILD)
					need_reval = 0;
				goto unlazy;
			}
		}
		path->mnt = mnt;
		path->dentry = dentry;
		if (unlikely(!__follow_mount_rcu(nd, path, inode)))
			goto unlazy;
		if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
			goto unlazy;
		return 0;
unlazy:
		/* Leave RCU mode and continue on the common path below. */
		if (unlazy_walk(nd, dentry))
			return -ECHILD;
	} else {
		dentry = __d_lookup(parent, name);
	}

	if (unlikely(!dentry))
		goto need_lookup;

	if (unlikely(d_need_lookup(dentry))) {
		dput(dentry);
		goto need_lookup;
	}

	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
		status = d_revalidate(dentry, nd);
	if (unlikely(status <= 0)) {
		if (status < 0) {
			dput(dentry);
			return status;
		}
		if (!d_invalidate(dentry)) {
			dput(dentry);
			goto need_lookup;
		}
	}
done:
	path->mnt = mnt;
	path->dentry = dentry;
	err = follow_managed(path, nd->flags);
	if (unlikely(err < 0)) {
		path_put_conditional(path, nd);
		return err;
	}
	if (err)
		nd->flags |= LOOKUP_JUMPED;
	*inode = path->dentry->d_inode;
	return 0;

need_lookup:
	BUG_ON(nd->inode != parent->d_inode);

	mutex_lock(&parent->d_inode->i_mutex);
	dentry = __lookup_hash(name, parent, nd);
	mutex_unlock(&parent->d_inode->i_mutex);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);
	goto done;
}

__d_lookup: (fs/dcache.c)

在内存中寻找该节点已经建立的dentry结构,内核中有个hash表dentry_hashtable,是一个struct hlist_bl_head结构的指针数组,每一个struct hlist_bl_head结构包含有一个队列头指针struct hlist_bl_node*。

一旦在内存中建立起一个目录节点的dentry结构,就根据其节点名的hash值和父目录的dentry值,调用d_hash获得dentry_hashtable中的一个队列头指针,然后将dentry成员变量dentry->d_hash挂入dentry_hashtable表中的这个队列,需要寻找时会根据hash值查找dentry_hashtable表。

将dentry->d_hash挂入dentry_hashtable的过程在d_rehash函数中完成,后面分析。

这里__d_lookup先调用d_hash从dentry_hashtable表中获得这个队列。遍历这个队列,看看队列中有没有目录项的hash值和要搜索的目录项的hash值相同的,如果有,再比较parent是否是同一个parent,最后调用dentry_cmp进一步比较是否是同一个目录项,如果是,表示找到了,就增加引用计数并将其返回。如果没有找到,返回NULL。

struct dentry *__d_lookup(struct dentry *parent, struct qstr *name){unsigned int len = name->len;unsigned int hash = name->hash;const unsigned char *str = name->name;struct hlist_bl_head *b = d_hash(parent, hash);struct hlist_bl_node *node;struct dentry *found = NULL;struct dentry *dentry;/* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. *//* * The hash list is protected using RCU. * * Take d_lock when comparing a candidate dentry, to avoid races * with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */rcu_read_lock();hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {                           //遍历散列const char *tname;int tlen;if (dentry->d_name.hash != hash)continue;spin_lock(&dentry->d_lock);if (dentry->d_parent != parent)goto next;if (d_unhashed(dentry))goto next;/* * It is safe to compare names since d_move() cannot * change the qstr (protected by d_lock). */tlen = dentry->d_name.len;tname = dentry->d_name.name;if (parent->d_flags & DCACHE_OP_COMPARE) {if (parent->d_op->d_compare(parent, parent->d_inode,dentry, dentry->d_inode,tlen, tname, name))goto next;} else {if (dentry_cmp(tname, tlen, str, len))goto next;}dentry->d_count++;found = dentry;spin_unlock(&dentry->d_lock);break;next:spin_unlock(&dentry->d_lock); } rcu_read_unlock(); return found;}

d_hash为散列函数,它将parent与hash值相结合,映射到dentry_hashtable表中,返回相应的散列链。

/*
 * Map a (parent dentry, name hash) pair to its bucket in dentry_hashtable.
 * The parent's address is mixed into @hash before the bucket index is taken.
 */
static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
					unsigned int hash)
{
	hash += (unsigned long) parent / L1_CACHE_BYTES;
	hash += hash >> D_HASHBITS;

	return &dentry_hashtable[hash & D_HASHMASK];
}


__d_lookup然后遍历刚刚得到的队列上的每个元素。hlist_bl_for_each_entry_rcu实现遍历。

/*
 * Iterate over the entries of a hlist_bl chain.
 * @tpos:   entry pointer used as the loop cursor
 * @pos:    struct hlist_bl_node pointer used as the loop cursor
 * @head:   the chain head
 * @member: name of the hlist_bl_node member inside the entry type
 */
#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member)		\
	for (pos = hlist_bl_first_rcu(head);				\
		pos &&							\
		({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \
		pos = rcu_dereference_raw(pos->next))

/*
 * Return the first node of chain @h: h->first with the LIST_BL_LOCKMASK
 * bits masked off.
 */
static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
{
	return (struct hlist_bl_node *)
		((unsigned long)rcu_dereference(h->first) & ~LIST_BL_LOCKMASK);
}


struct hlist_bl_head {struct hlist_bl_node *first;};struct hlist_bl_node {struct hlist_bl_node *next, **pprev;};


container_of返回包含d_hash的struct dentry结构体;container_of是linux内核常用的宏,能根据结构体内部成员的指针返回包含这个成员的结构体的指针。

/* Get the enclosing structure of @type from a pointer @ptr to its @member. */
#define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member)


执行到这里,就层层返回到do_lookup,用找到的dentry给path赋值,接着调用follow_managed函数 (fs/namei.c)。

follow_managed处理一些有特殊用途的目录项。比如标记为mount点的目录项,如果目录项是一个mount点,那就切换到另一个文件系统,将path->dentry重新设为新文件系统的根。

最后,do_lookup函数将找到的dentry->d_inode赋值给输出参数inode,返回。

/*
 * Handle "managed" dentries at @path: d_manage transit, mount points and
 * automount points. When something is mounted on the dentry, @path is moved
 * to the root of the mounted filesystem and the loop repeats.
 *
 * Returns a negative errno on failure (-EISDIR is converted to 0);
 * otherwise need_mntput, i.e. non-zero once @path has been switched to a
 * mount found by lookup_mnt() (follow_automount() may also set it).
 */
static int follow_managed(struct path *path, unsigned flags)
{
	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
	unsigned managed;
	bool need_mntput = false;
	int ret = 0;

	/* Given that we're not holding a lock here, we retain the value in a
	 * local variable for each dentry as we look at it so that we don't see
	 * the components of that value change under us */
	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       managed &= DCACHE_MANAGED_DENTRY,
	       unlikely(managed != 0)) {
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
			ret = path->dentry->d_op->d_manage(path->dentry, false);
			if (ret < 0)
				break;
		}

		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				need_mntput = true;
				continue;
			}

			/* Something is mounted on this dentry in another
			 * namespace and/or whatever was mounted there in this
			 * namespace got unmounted before we managed to get the
			 * vfsmount_lock */
		}

		/* Handle an automount point */
		if (managed & DCACHE_NEED_AUTOMOUNT) {
			ret = follow_automount(path, flags, &need_mntput);
			if (ret < 0)
				break;
			continue;
		}

		/* We didn't change the current path point */
		break;
	}

	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	if (ret == -EISDIR)
		ret = 0;
	return ret < 0 ? ret : need_mntput;
}
返回到walk_component函数,检查inode是不是NULL,如果是则终止搜索。

调用path_to_nameidata函数,path_to_nameidata函数将path的内容转化到nd里面去。将找到的inode节点赋给nd->inode。

/*
 * Make nd->path refer to @path. Outside RCU mode the old dentry reference
 * is dropped, and the old mount reference too when the mount changes.
 */
static inline void path_to_nameidata(const struct path *path,
					struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU)) {
		dput(nd->path.dentry);
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);
	}
	nd->path.mnt = path->mnt;
	nd->path.dentry = path->dentry;
}


返回到link_path_walk函数,继续循环搜索下一个目录项。若发现是最后一个目录项,就将struct qstr this赋给nd->last, 然后直接返回到path_lookupat函数。

如果link_path_walk返回0,调用complete_walk函数完成搜索 (fs/namei.c)。


/*
 * Finish a path walk.
 *
 * If the walk is still in RCU mode, leave it: clear LOOKUP_RCU, convert the
 * final dentry to a counted reference (returning -ECHILD if that fails),
 * take a mount reference and drop the RCU/vfsmount locks. Then, only when
 * LOOKUP_JUMPED is set and the filesystem asks for it (DCACHE_OP_REVALIDATE
 * and FS_REVAL_DOT), revalidate the final dentry.
 * Returns 0 on success or a negative errno.
 */
static int complete_walk(struct nameidata *nd)
{
	struct dentry *dentry = nd->path.dentry;
	int status;

	if (nd->flags & LOOKUP_RCU) {
		nd->flags &= ~LOOKUP_RCU;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		spin_lock(&dentry->d_lock);
		if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
			spin_unlock(&dentry->d_lock);
			rcu_read_unlock();
			br_read_unlock(vfsmount_lock);
			return -ECHILD;
		}
		BUG_ON(nd->inode != dentry->d_inode);
		spin_unlock(&dentry->d_lock);
		mntget(nd->path.mnt);
		rcu_read_unlock();
		br_read_unlock(vfsmount_lock);
	}

	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
		return 0;

	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
		return 0;

	/* Note: we do not d_invalidate() */
	status = d_revalidate(dentry, nd);
	if (status > 0)
		return 0;

	/* A zero result from d_revalidate() is reported as -ESTALE. */
	if (!status)
		status = -ESTALE;

	path_put(&nd->path);
	return status;
}

层层返回到kern_path_create,调用lookup_hash函数 (fs/namei.c)进行最后一个目录项的搜索。


/*
 * Look up the last component recorded in @nd (nd->last) inside the
 * directory nd->path.dentry.
 */
static struct dentry *lookup_hash(struct nameidata *nd)
{
	return __lookup_hash(&nd->last, nd->path.dentry, nd);
}

调用__lookup_hash函数进行搜索,__lookup_hash调用lookup_dcache搜索dcache,若找不到则lookup_real调用对应文件系统的lookup函数去查找。

/*
 * Look up @name under @base: try the dcache first and fall back to the
 * filesystem's own lookup (lookup_real) only when lookup_dcache() reports
 * that a real lookup is still needed.
 */
static struct dentry *__lookup_hash(struct qstr *name,
		struct dentry *base, struct nameidata *nd)
{
	bool need_lookup;
	struct dentry *found = lookup_dcache(name, base, nd, &need_lookup);

	if (need_lookup)
		return lookup_real(base->d_inode, found, nd);

	return found;
}

调用d_lookup搜索。

/*
 * Look for @name under @dir in the dcache.
 *
 * *@need_lookup is set to true when the caller must still ask the
 * filesystem: either the cached dentry is flagged as needing a lookup, or
 * nothing usable was cached and a fresh dentry was allocated with d_alloc().
 * Returns the dentry, or an ERR_PTR (revalidation error or -ENOMEM).
 */
static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
				    struct nameidata *nd, bool *need_lookup)
{
	struct dentry *dentry;
	int error;

	*need_lookup = false;
	dentry = d_lookup(dir, name);
	if (dentry) {
		if (d_need_lookup(dentry)) {
			*need_lookup = true;
		} else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
			error = d_revalidate(dentry, nd);
			if (unlikely(error <= 0)) {
				if (error < 0) {
					dput(dentry);
					return ERR_PTR(error);
				} else if (!d_invalidate(dentry)) {
					/* Stale entry invalidated: treat as a miss. */
					dput(dentry);
					dentry = NULL;
				}
			}
		}
	}

	if (!dentry) {
		dentry = d_alloc(dir, name);
		if (unlikely(!dentry))
			return ERR_PTR(-ENOMEM);

		*need_lookup = true;
	}
	return dentry;
}

d_lookup进一步调用__d_lookup。__d_lookup上面已经分析过了,就是在内存的dentry_hashtable表中查找dentry结构。这里/dev下的root还没有创建,所以肯定找不到,返回NULL。层层返回到lookup_dcache,如果d_lookup返回的dentry为空,则调用d_alloc分配一个dentry对象。

/*
 * Search the dcache for @name under @parent using __d_lookup(), repeating
 * a failed search if rename_lock changed in the meantime.
 * Returns the dentry found or NULL.
 */
struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
{
	struct dentry *dentry;
	unsigned seq;

	do {
		seq = read_seqbegin(&rename_lock);
		dentry = __d_lookup(parent, name);
		if (dentry)
			break;
	} while (read_seqretry(&rename_lock, seq));
	return dentry;
}

d_alloc: (fs/dcache.c)

调用__d_alloc分配一个dcache对象并初始化。设置dentry的父目录,并将新的dentry链入其父目录的d_subdirs链表中。

/*
 * Allocate a dentry named @name as a child of @parent.
 *
 * The new dentry comes from __d_alloc(); a reference is taken on @parent,
 * d_parent is set, and the dentry is linked into parent->d_subdirs.
 * Returns NULL if the allocation fails.
 */
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
	struct dentry *dentry = __d_alloc(parent->d_sb, name);
	if (!dentry)
		return NULL;

	spin_lock(&parent->d_lock);
	/*
	 * don't need child lock because it is not subject
	 * to concurrency here
	 */
	__dget_dlock(parent);
	dentry->d_parent = parent;
	list_add(&dentry->d_u.d_child, &parent->d_subdirs);
	spin_unlock(&parent->d_lock);

	return dentry;
}

__d_alloc: (fs/dcache.c)

分配并初始化一个dcache对象。

/*
 * Allocate and initialise a dentry for superblock @sb with name @name.
 *
 * Names longer than DNAME_INLINE_LEN-1 get a separately kmalloc'ed buffer;
 * shorter ones use the inline d_iname array. The new dentry has d_count 1,
 * no inode, and is its own parent. Returns NULL on allocation failure.
 */
struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
	struct dentry *dentry;
	char *dname;

	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
	if (!dentry)
		return NULL;

	if (name->len > DNAME_INLINE_LEN-1) {
		dname = kmalloc(name->len + 1, GFP_KERNEL);
		if (!dname) {
			kmem_cache_free(dentry_cache, dentry); 
			return NULL;
		}
	} else  {
		dname = dentry->d_iname;
	}	
	dentry->d_name.name = dname;

	dentry->d_name.len = name->len;
	dentry->d_name.hash = name->hash;
	memcpy(dname, name->name, name->len);
	dname[name->len] = 0;

	dentry->d_count = 1;
	dentry->d_flags = 0;
	spin_lock_init(&dentry->d_lock);
	seqcount_init(&dentry->d_seq);
	dentry->d_inode = NULL;
	dentry->d_parent = dentry;
	dentry->d_sb = sb;
	dentry->d_op = NULL;
	dentry->d_fsdata = NULL;
	INIT_HLIST_BL_NODE(&dentry->d_hash);
	INIT_LIST_HEAD(&dentry->d_lru);
	INIT_LIST_HEAD(&dentry->d_subdirs);
	INIT_LIST_HEAD(&dentry->d_alias);
	INIT_LIST_HEAD(&dentry->d_u.d_child);
	d_set_d_op(dentry, dentry->d_sb->s_d_op);

	this_cpu_inc(nr_dentry);

	/* NOTE(review): this trace printk looks like a debugging aid added for the article — confirm before reuse. */
	printk("__d_alloc: dentry name: %s\n", dentry->d_name.name);
	return dentry;
}

返回到lookup_dcache中,将need_lookup置为真,向上返回到__lookup_hash,__lookup_hash最后调用lookup_real通过文件系统自己的lookup函数从设备读目录信息。


lookup_real: (fs/namei.c)

在缓存中无法找到指定的目录项,那就创建该目录项。通过底层文件系统的lookup函数从设备读,lookup在读目录项的过程中,也把该目录项对应的inode结构从磁盘读出来,并赋值给dentry的d_inode字段。

/*
 * Ask the filesystem to look up @dentry in directory @dir via
 * dir->i_op->lookup.
 *
 * If the filesystem returns a non-NULL value, the caller's @dentry is
 * dropped and that value is returned instead. A dead directory yields
 * ERR_PTR(-ENOENT).
 */
static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
				  struct nameidata *nd)
{
	struct dentry *old;

	/* Don't create child dentry for a dead directory. */
	if (unlikely(IS_DEADDIR(dir))) {
		dput(dentry);
		return ERR_PTR(-ENOENT);
	}

	old = dir->i_op->lookup(dir, dentry, nd);
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
	return dentry;
}


simple_lookup: (fs/libfs.c)

/*
 * Lookup implementation that never finds an inode: it installs
 * simple_dentry_operations on @dentry and adds it to the dcache with a
 * NULL inode. Returns NULL, or ERR_PTR(-ENAMETOOLONG) if the name exceeds
 * NAME_MAX.
 */
struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
	static const struct dentry_operations simple_dentry_operations = {
		.d_delete = simple_delete_dentry,
	};

	if (dentry->d_name.len > NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);
	d_set_d_op(dentry, &simple_dentry_operations);
	d_add(dentry, NULL);
	return NULL;
}


d_set_d_op: (fs/dcache.c)

用simple_dentry_operations初始化dentry->d_op。

/*
 * Install the operations table @op on @dentry and set one DCACHE_OP_* flag
 * in d_flags for each of the d_hash, d_compare, d_revalidate, d_delete and
 * d_prune callbacks that @op provides. Warns once if the dentry already has
 * operations or any of the HASH/COMPARE/REVALIDATE/DELETE flags set.
 */
void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
{
	WARN_ON_ONCE(dentry->d_op);
	WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH	|
				DCACHE_OP_COMPARE	|
				DCACHE_OP_REVALIDATE	|
				DCACHE_OP_DELETE ));
	dentry->d_op = op;
	if (!op)
		return;
	if (op->d_hash)
		dentry->d_flags |= DCACHE_OP_HASH;
	if (op->d_compare)
		dentry->d_flags |= DCACHE_OP_COMPARE;
	if (op->d_revalidate)
		dentry->d_flags |= DCACHE_OP_REVALIDATE;
	if (op->d_delete)
		dentry->d_flags |= DCACHE_OP_DELETE;
	if (op->d_prune)
		dentry->d_flags |= DCACHE_OP_PRUNE;
}



d_add: (include/linux/dcache.h)

传入的inode参数为NULL。d_instantiate函数将dentry->d_inode置为NULL。

d_rehash将dentry加入hash表。

static inline void d_add(struct dentry *entry, struct inode *inode){d_instantiate(entry, inode);d_rehash(entry);}

d_rehash: (fs/dcache.c)

/* Add @entry to the dentry hash, holding entry->d_lock around _d_rehash(). */
void d_rehash(struct dentry * entry)
{
	spin_lock(&entry->d_lock);
	_d_rehash(entry);
	spin_unlock(&entry->d_lock);
}


_d_rehash: (fs/dcache.c)

/* Pick the bucket for @entry from its parent and name hash, then insert it there. */
static void _d_rehash(struct dentry * entry)
{
	__d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash));
}


__d_rehash: (fs/dcache.c)

通过hlist_bl_add_head_rcu将dentry链入dentry_hashtable表。

/*
 * Insert @entry at the head of hash chain @b under the chain's lock and
 * mark it DCACHE_RCUACCESS. The entry must currently be unhashed.
 */
static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b)
{
	BUG_ON(!d_unhashed(entry));
	hlist_bl_lock(b);
	entry->d_flags |= DCACHE_RCUACCESS;
	hlist_bl_add_head_rcu(&entry->d_hash, b);
	hlist_bl_unlock(b);
}

hlist_bl_add_head_rcu: (include/linux/rculist_bl.h)

/*
 * Link node @n in as the new first element of chain @h. The node's own
 * pointers are set up before it is published as the chain head.
 */
static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n,
					struct hlist_bl_head *h)
{
	struct hlist_bl_node *first;

	/* don't need hlist_bl_first_rcu because we're under lock */
	first = hlist_bl_first(h);

	n->next = first;
	if (first)
		first->pprev = &n->next;
	n->pprev = &h->first;

	/* need _rcu because we can have concurrent lock free readers */
	hlist_bl_set_first_rcu(h, n);
}



lookup_real返回后,向上一直返回到kern_path_create函数,接着kern_path_create函数检查返回的dentry->d_inode是否不空,如果不空表示inode节点已经存在了,不需要创建,本例中/dev下的root节点是不存在的(需要创建),因此将nd.path赋值给path后,直接返回dentry。

返回到sys_mknodat函数,执行mnt_want_write,这个函数告诉文件系统将要执行写操作了,如果不可写,将会返回错误。

如果可写,调用vfs_mknod函数创建inode节点。


vfs_mknod : (fs/namei.c)

这个函数其实就是调用了要创建节点的父目录的inode节点的方法i_op->mknod。

/*
 * Create a special node @dentry in directory @dir by calling the
 * directory's i_op->mknod, after permission, capability (CAP_MKNOD for
 * character/block devices), device-cgroup and security checks.
 * Returns 0 on success or a negative errno; -EPERM if the directory has no
 * mknod operation.
 */
int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
	int error = may_create(dir, dentry);

	if (error)
		return error;

	if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
	    !ns_capable(inode_userns(dir), CAP_MKNOD))
		return -EPERM;

	if (!dir->i_op->mknod)
		return -EPERM;

	error = devcgroup_inode_mknod(mode, dev);
	if (error)
		return error;

	error = security_inode_mknod(dir, dentry, mode, dev);
	if (error)
		return error;

	error = dir->i_op->mknod(dir, dentry, mode, dev);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}

inode节点的方法是创建inode节点时初始化的,这里也就是/dev节点创建时初始化的,dev节点是在rootfs虚拟文件系统下创建的(前面说过rootfs虚拟文件系统是linux kernel真正意义上挂载的第一个根文件系统),它的i_op被赋值为ramfs_dir_inode_operations。

(fs/ramfs/inode.c)

/* Inode operations installed on ramfs directory inodes (see ramfs_get_inode). */
static const struct inode_operations ramfs_dir_inode_operations = {
	.create		= ramfs_create,
	.lookup		= simple_lookup,
	.link		= simple_link,
	.unlink		= simple_unlink,
	.symlink	= ramfs_symlink,
	.mkdir		= ramfs_mkdir,
	.rmdir		= simple_rmdir,
	.mknod		= ramfs_mknod,
	.rename		= simple_rename,
};


所以i_op->mknod方法就是调用的ramfs_mknod函数。

ramfs_mknod: (fs/ramfs/inode.c)

/*
 * Create a node in a ramfs directory: allocate an inode with
 * ramfs_get_inode(), bind it to @dentry and update the directory's
 * mtime/ctime.
 *
 * Returns 0 on success, or -ENOSPC when no inode could be allocated.
 */
static int
ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
	struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
	int error = -ENOSPC;

	if (inode) {
		d_instantiate(dentry, inode);
		dget(dentry);	/* Extra count - pin the dentry in core */
		error = 0;
		dir->i_mtime = dir->i_ctime = CURRENT_TIME;
	}
	return error;
}

ramfs_mknod函数调用了ramfs_get_inode函数创建一个新的inode。

ramfs_get_inode: (fs/ramfs/inode.c)

/*
 * Allocate and initialise a ramfs inode on superblock @sb.
 *
 * Regular files, directories and symlinks get their ramfs/simple operation
 * tables; every other file type is passed to init_special_inode() together
 * with @dev. Returns NULL if new_inode() fails.
 */
struct inode *ramfs_get_inode(struct super_block *sb,
				const struct inode *dir, umode_t mode, dev_t dev)
{
	struct inode * inode = new_inode(sb);

	if (inode) {
		inode->i_ino = get_next_ino();
		inode_init_owner(inode, dir, mode);
		inode->i_mapping->a_ops = &ramfs_aops;
		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
		mapping_set_unevictable(inode->i_mapping);
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		switch (mode & S_IFMT) {
		default:
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_op = &ramfs_file_inode_operations;
			inode->i_fop = &ramfs_file_operations;
			break;
		case S_IFDIR:
			inode->i_op = &ramfs_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;

			/* directory inodes start off with i_nlink == 2 (for "." entry) */
			inc_nlink(inode);
			break;
		case S_IFLNK:
			inode->i_op = &page_symlink_inode_operations;
			break;
		}
	}
	return inode;
}


调用new_inode分配一个新inode。

new_inode: (fs/inode.c)

new_inode进一步调用new_inode_pseudo函数新建inode。如果创建成功,则将其加入到该文件系统超级块的inode链表。

struct inode *new_inode(struct super_block *sb){struct inode *inode;spin_lock_prefetch(&inode_sb_list_lock);inode = new_inode_pseudo(sb);if (inode)inode_sb_list_add(inode);return inode;}

new_inode_pseudo: (fs/inode.c)

pseudo是“虚拟”的意思,因为当前是在虚拟根文件系统————rootfs文件系统下创建节点,前面提过rootfs文件系统是基于内存的特殊文件系统,它的inode是只存在于内存中的。而其它像ext, fat等文件系统的inode节点都存储在硬盘等外部设备中。

new_inode_pseudo调用alloc_inode创建inode。

/*
 * Allocate an inode for @sb with alloc_inode(), reset its i_state to 0
 * under i_lock and initialise its i_sb_list head. Unlike new_inode(), this
 * function does not add the inode to the superblock's inode list.
 * Returns NULL if allocation fails.
 */
struct inode *new_inode_pseudo(struct super_block *sb)
{
	struct inode *inode = alloc_inode(sb);

	if (inode) {
		spin_lock(&inode->i_lock);
		inode->i_state = 0;
		spin_unlock(&inode->i_lock);
		INIT_LIST_HEAD(&inode->i_sb_list);
	}
	return inode;
}

alloc_inode: (fs/inode.c)

如果对应文件系统超级块的s_op->alloc_inode存在,则调用它,否则调用kmem_cache_alloc函数分配inode。

/*
 * Allocate an inode for superblock @sb.
 *
 * Uses the filesystem's s_op->alloc_inode when it is provided, otherwise
 * the generic inode_cachep slab. If inode_init_always() fails, the inode
 * is released through the matching destroy path and NULL is returned.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
	struct inode *inode;

	if (sb->s_op->alloc_inode)
		inode = sb->s_op->alloc_inode(sb);
	else
		inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);

	if (!inode)
		return NULL;

	if (unlikely(inode_init_always(sb, inode))) {
		if (inode->i_sb->s_op->destroy_inode)
			inode->i_sb->s_op->destroy_inode(inode);
		else
			kmem_cache_free(inode_cachep, inode);
		return NULL;
	}

	return inode;
}

在挂载rootfs虚拟根文件系统的时候,对应超级块的s_op已经被设置为ramfs_ops了。可以看到,alloc_inode没有设置,所以调用kmem_cache_alloc函数来分配inode。

/*
 * Superblock operations for ramfs. No alloc_inode hook is set here, so
 * alloc_inode() takes its kmem_cache_alloc() branch for this filesystem.
 */
static const struct super_operations ramfs_ops = {
	.statfs		= simple_statfs,
	.drop_inode	= generic_delete_inode,
	.show_options	= generic_show_options,
};

inode分配好后就将它返回new_inode_pseudo,再返回到new_inode,接着new_inode函数调用inode_sb_list_add将新的inode加入到文件系统超级块的inode链表中。

返回到ramfs_get_inode。
从new_inode返回后,进行一些初始化工作(包括inode的节点号),然后进入init_special_inode函数。


init_special_inode: (fs/inode.c)

前面create_dev的时候传进来的参数mode是块设备,所以这里将inode节点操作函数设为def_blk_fops,设备号设为rdev。

/*
 * Initialise @inode as a special file of the type encoded in @mode.
 *
 * Character and block devices get the default chr/blk file operations and
 * have @rdev stored in i_rdev; FIFOs and sockets get their respective
 * default operations. Any other mode is only logged as bogus.
 */
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
	inode->i_mode = mode;
	if (S_ISCHR(mode)) {
		inode->i_fop = &def_chr_fops;
		inode->i_rdev = rdev;
	} else if (S_ISBLK(mode)) {
		inode->i_fop = &def_blk_fops;
		inode->i_rdev = rdev;
	} else if (S_ISFIFO(mode))
		inode->i_fop = &def_fifo_fops;
	else if (S_ISSOCK(mode))
		inode->i_fop = &bad_sock_fops;
	else
		printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
				  " inode %s:%lu\n", mode, inode->i_sb->s_id,
				  inode->i_ino);
}

至此,inode节点就创建好了,向上层层返回到ramfs_get_inode,ramfs_get_inode就将创建好的inode节点直接返回到ramfs_mknod。

ramfs_mknod进一步调用d_instantiate。


d_instantiate: (fs/dcache.c)

d_instantiate函数封装了__d_instantiate。

/*
 * Attach @inode to @entry. @inode may be NULL; when it is not, its i_lock
 * is held around __d_instantiate(). The dentry must not already be on an
 * alias list.
 */
void d_instantiate(struct dentry *entry, struct inode * inode)
{
	BUG_ON(!list_empty(&entry->d_alias));
	if (inode)
		spin_lock(&inode->i_lock);
	__d_instantiate(entry, inode);
	if (inode)
		spin_unlock(&inode->i_lock);
	security_d_instantiate(entry, inode);
}

__d_instantiate: (fs/dcache.c)

__d_instantiate将inode与dentry关联起来。

/*
 * Bind @inode to @dentry under dentry->d_lock: for a non-NULL inode the
 * dentry is added to the inode's i_dentry alias list (and flagged
 * DCACHE_NEED_AUTOMOUNT for automount inodes); d_inode is then set.
 */
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
	spin_lock(&dentry->d_lock);
	if (inode) {
		if (unlikely(IS_AUTOMOUNT(inode)))
			dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
		list_add(&dentry->d_alias, &inode->i_dentry);
	}
	dentry->d_inode = inode;
	dentry_rcuwalk_barrier(dentry);
	spin_unlock(&dentry->d_lock);
	fsnotify_d_instantiate(dentry, inode);
}

层层返回,至此,/dev/root设备就创建完毕了。

0 0
原创粉丝点击