文件系统(四)--pipe.c read_write.c stat.c fcntl.c ioctl.c源码分析

来源：互联网发布：易售乐服装进销存软件编辑：程序博客网时间：2024/05/17 22:50

1.pipe.c

1 /*
2 * linux/fs/pipe.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 #include <signal.h>
9 #include <linux/sched.h>
10 #include <linux/mm.h>   /* for get_free_page */
11 #include <asm/segment.h>

include/linux/fs.h:58:#define PIPE_HEAD(inode) ((inode).i_zone[0])
include/linux/fs.h:59:#define PIPE_TAIL(inode) ((inode).i_zone[1])
include/linux/fs.h:60:#define PIPE_SIZE(inode) ((PIPE_HEAD(inode)-PIPE_TAIL(inode))&(PAGE_SIZE-1))
可见i_zone[0]、i_zone[1]分别存放的是循环队列的头指针（写入位置，见write_pipe）和尾指针（读取位置，见read_pipe）
include/linux/fs.h:61:#define PIPE_EMPTY(inode) (PIPE_HEAD(inode)==PIPE_TAIL(inode))
关于上面的宏定义，还有一点需要说明一下的，我们这里给pipe分配的内存大小为一个内存页，它是2的n次方。对于这样的长度的“数组”，可以优化边界条件的处理。比如我们假设LEN = 4；对应数组元素为a[0]-a[3]，假设现在指针pos=LEN-1=3，那么(pos+1)&(LEN-1)=0，也就是自动折回，免去了判断的过程。

13 int read_pipe(struct m_inode * inode, char * buf, int count)
14 {
15     int chars, size, read = 0;
16
17     while (count>0) {
18         while (!(size=PIPE_SIZE(*inode))) {//如果pipe长度为0
19             wake_up(&inode->i_wait);         //唤醒写等待的任务
20             if (inode->i_count != 2) /* are there any writers? */
21                 return read;
22             sleep_on(&inode->i_wait);         //读进程睡眠
23         }

24         chars = PAGE_SIZE-PIPE_TAIL(*inode);
这里先算出从tail到缓冲页末尾的字节数，也就是不发生回绕时本次最多能连续读取的字节数；它还不是最终值，下面会根据count和size对它进行调整
25         if (chars > count)
26             chars = count;
chars最大为count

27 if (chars > size)
28 chars = size;
chars最大为size

29 count -= chars;
30 read += chars;

从29和30行，可以推测这里chars是此次可以读取的字节数。count是剩余待读取的字节数，read是已经读取的字节数。

31 size = PIPE_TAIL(*inode); //size作为指针，指向我们读取数据的偏移
32 PIPE_TAIL(*inode) += chars;

从32行可以知道读取数据是从PIPE_TAIL(*inode)开始读取的，这里就是更新tail位置，因为我们可以读取chars字节

33 PIPE_TAIL(*inode) &= (PAGE_SIZE-1); //调整tail

34 while (chars-->0)
35 put_fs_byte(((char *)inode->i_size)[size++],buf++);

从35行可以知道，对于pipe类型的inode，它的i_size保存的是内存页的地址。这里就是从pipe读取chars字节到用户缓冲区
36 }

37 wake_up(&inode->i_wait);
38 return read;
39 }

41 int write_pipe(struct m_inode * inode, char * buf, int count)
42 {
43     int chars, size, written = 0;
44
45     while (count>0) {
46         while (!(size=(PAGE_SIZE-1)-PIPE_SIZE(*inode))) {
47             wake_up(&inode->i_wait);
48             if (inode->i_count != 2) { /* no readers */
49                 current->signal |= (1<<(SIGPIPE-1));
50                 return written?written:-1;
51             }
52             sleep_on(&inode->i_wait);
53         }
54         chars = PAGE_SIZE-PIPE_HEAD(*inode);

55         if (chars > count)
56             chars = count;
57         if (chars > size)
58             chars = size;
59         count -= chars;
60         written += chars;
61         size = PIPE_HEAD(*inode);
62         PIPE_HEAD(*inode) += chars;
63         PIPE_HEAD(*inode) &= (PAGE_SIZE-1);
64         while (chars-->0)
65             ((char *)inode->i_size)[size++]=get_fs_byte(buf++);
66     }
67     wake_up(&inode->i_wait);
68     return written;
69 }
写与读基本一致，这里不再分析

71 int sys_pipe(unsigned long * fildes)
72 {
73     struct m_inode * inode;
74     struct file * f[2];
75     int fd[2];
76     int i,j;

78     j=0;
79     for(i=0;j<2 && i<NR_FILE;i++)
80         if (!file_table[i].f_count)
81             (f[j++]=i+file_table)->f_count++;

上面是找到两个引用计数为0的file结构体保存到f[]，并增加它的引用计数。这里file_table是系统文件表

82     if (j==1)
83         f[0]->f_count=0;
84     if (j<2)
85         return -1;

运行到此处，说明找到两个引用计数为0的file结构

86     j=0;
87     for(i=0;j<2 && i<NR_OPEN;i++)
88         if (!current->filp[i]) {
89             current->filp[ fd[j]=i ] = f[j];
90             j++;
91         }

遍历当前进程filp，找到还未使用的项，这时的索引i就作为文件描述符，用它来索引我们上面的f[]中的file结构。现在fd[]也已经被赋值，它保存的是f[]中file在filp中的索引

92     if (j==1)
93         current->filp[fd[0]]=NULL;
94     if (j<2) {
95         f[0]->f_count=f[1]->f_count=0;
96         return -1;
97     }

运行到这里说明一切正常

98     if (!(inode=get_pipe_inode())) {
99         current->filp[fd[0]] =
100             current->filp[fd[1]] = NULL;
101         f[0]->f_count = f[1]->f_count = 0;
102         return -1;
103     }

98行为pipe分配inode节点。

104     f[0]->f_inode = f[1]->f_inode = inode;
105     f[0]->f_pos = f[1]->f_pos = 0;
106     f[0]->f_mode = 1;       /* read */
107     f[1]->f_mode = 2;       /* write */

设置f[]中file结构的f_inode指向前面分配的inode，这样file结构就与inode关联起来

108 put_fs_long(fd[0],0+fildes);
109 put_fs_long(fd[1],1+fildes);

把文件描述符写入到用户空间

110 return 0;
111 }

总结

pipe与inode关系密切，它申请一页内存用作循环队列。其中内存地址保存在inode->i_size中，头尾指针则分别保存在i_zone[0]和i_zone[1]中。循环数组的读写操作大家并不陌生。重点看一下pipe的创建过程，由于pipe的实现是与文件系统密切相关的，需要有两个file（读端写端）关联到pipe。之前已经提到过系统中所有的file都保存在file_table中，因此我们需要首先从中找出两个空闲的file结构。然后还需要为pipe的读端与写端返回文件描述符，遍历current->filp，找到两个未使用的索引作为相应的文件描述符保存到数组中，并让二者作为索引的current->filp分别指向上面的空闲file结构体，接下来就可以为pipe创建inode，这个过程上面已经总结了，创建完成该inode之后将其保存到file的f_inode中，然后还需要设置读端与写端对应的file的权限分别为只读、只写，最后把文件描述符写入用户空间。

2.read_write.c

1 /*
2 * linux/fs/read_write.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 #include <sys/stat.h>
8 #include <errno.h>
9 #include <sys/types.h>
10
11 #include <linux/kernel.h>
12 #include <linux/sched.h>
13 #include <asm/segment.h>
14
15 extern int rw_char(int rw,int dev, char * buf, int count, off_t * pos);
16 extern int read_pipe(struct m_inode * inode, char * buf, int count);
17 extern int write_pipe(struct m_inode * inode, char * buf, int count);
18 extern int block_read(int dev, off_t * pos, char * buf, int count);
19 extern int block_write(int dev, off_t * pos, char * buf, int count);
20 extern int file_read(struct m_inode * inode, struct file * filp,
21 char * buf, int count);
22 extern int file_write(struct m_inode * inode, struct file * filp,
23 char * buf, int count);

25 int sys_lseek(unsigned int fd,off_t offset, int origin)
26 {
27     struct file * file;
28     int tmp;
30     if (fd >= NR_OPEN || !(file=current->filp[fd]) || !(file->f_inode)
31        || !IS_SEEKABLE(MAJOR(file->f_inode->i_dev)))
32         return -EBADF;

33 if (file->f_inode->i_pipe)
34 return -ESPIPE;

35     switch (origin) {
36         case 0:
37             if (offset<0) return -EINVAL;
38             file->f_pos=offset;
39             break;
40         case 1:
41             if (file->f_pos+offset<0) return -EINVAL;
42             file->f_pos += offset;
43             break;
44         case 2:
45             if ((tmp=file->f_inode->i_size+offset) < 0)
46                 return -EINVAL;
47             file->f_pos = tmp;
48             break;
49         default:
50             return -EINVAL;
51     }

52 return file->f_pos;
53 }
这个函数也比较简单，我们看一下它的使用方法就可以了

定义函数：off_t lseek(int fildes, off_t offset, int whence);

函数说明：
每一个已打开的文件都有一个读写位置, 当打开文件时通常其读写位置是指向文件开头, 若是以附加的方式打开文件(如O_APPEND), 则读写位置会指向文件尾. 当read()或write()时, 读写位置会随之增加,lseek()便是用来控制该文件的读写位置. 参数fildes 为已打开的文件描述词, 参数offset 为根据参数whence来移动读写位置的位移数.

参数 whence 为下列其中一种:
SEEK_SET 参数offset 即为新的读写位置.
SEEK_CUR 以目前的读写位置往后增加offset 个位移量.
SEEK_END 将读写位置指向文件尾后再增加offset 个位移量. 当whence 值为SEEK_CUR 或
SEEK_END 时, 参数offset 允许负值的出现.

下列是特别的使用方式:
1) 欲将读写位置移到文件开头时:lseek(int fildes, 0, SEEK_SET);
2) 欲将读写位置移到文件尾时:lseek(int fildes, 0, SEEK_END);
3) 想要取得目前文件位置时:lseek(int fildes, 0, SEEK_CUR);

返回值：当调用成功时则返回目前的读写位置, 也就是距离文件开头多少个字节. 若有错误则返回-1, errno 会存放错误代码.

55 int sys_read(unsigned int fd,char * buf,int count)
56 {
57     struct file * file;
58     struct m_inode * inode;

60     if (fd>=NR_OPEN || count<0 || !(file=current->filp[fd]))
61         return -EINVAL;
62     if (!count)
63         return 0;

64 verify_area(buf,count);

要保证buf开始的count字节的空间在内存中

65 inode = file->f_inode;

取得inode

66 if (inode->i_pipe)
67 return (file->f_mode&1)?read_pipe(inode,buf,count):-EIO;

68 if (S_ISCHR(inode->i_mode))
69 return rw_char(READ,inode->i_zone[0],buf,count,&file->f_pos);

70 if (S_ISBLK(inode->i_mode))
71 return block_read(inode->i_zone[0],&file->f_pos,buf,count);

72     if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode)) {
73         if (count+file->f_pos > inode->i_size)
74             count = inode->i_size - file->f_pos;
75         if (count<=0)
76             return 0;
77         return file_read(inode,file,buf,count);

78 }

根据不同的类型调用不同的函数

79 printk("(Read)inode->i_mode=%06o\n\r",inode->i_mode);
80 return -EINVAL;
81 }

83 int sys_write(unsigned int fd,char * buf,int count)
84 {
85     struct file * file;
86     struct m_inode * inode;

88     if (fd>=NR_OPEN || count <0 || !(file=current->filp[fd]))
89         return -EINVAL;
90     if (!count)
91         return 0;
92     inode=file->f_inode;
93     if (inode->i_pipe)
94         return (file->f_mode&2)?write_pipe(inode,buf,count):-EIO;
95     if (S_ISCHR(inode->i_mode))
96         return rw_char(WRITE,inode->i_zone[0],buf,count,&file->f_pos);
97     if (S_ISBLK(inode->i_mode))
98         return block_write(inode->i_zone[0],&file->f_pos,buf,count);
99     if (S_ISREG(inode->i_mode))
100         return file_write(inode,file,buf,count);
101     printk("(Write)inode->i_mode=%06o\n\r",inode->i_mode);
102     return -EINVAL;
103 }

总结

这里会根据inode的不同类型调用不同的函数进行具体的读写操作。至于lseek上面已经解释的很具体了。前面的文章中我们分析过文件写操作时会根据指定的模式（比如O_APPEND）和f_pos计算一个位置，根据这个位置进行实际的磁盘块写入操作(读操作也是类似)，lseek正好给了我们修改f_pos的机会。

3.stat.c

1 /*
2 * linux/fs/stat.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 #include <errno.h>
8 #include <sys/stat.h>
9
10 #include <linux/fs.h>
11 #include <linux/sched.h>
12 #include <linux/kernel.h>
13 #include <asm/segment.h>
14
15 static void cp_stat(struct m_inode * inode, struct stat * statbuf)
16 {
17     struct stat tmp;
18     int i;
19
20    verify_area(statbuf,sizeof (* statbuf));

还是要确定用户空间中statbuf开始的sizeof (* statbuf)的空间都在内存中

21     tmp.st_dev = inode->i_dev;
22     tmp.st_ino = inode->i_num;
23     tmp.st_mode = inode->i_mode;
24     tmp.st_nlink = inode->i_nlinks;
25     tmp.st_uid = inode->i_uid;
26     tmp.st_gid = inode->i_gid;
27     tmp.st_rdev = inode->i_zone[0];
28     tmp.st_size = inode->i_size;
29     tmp.st_atime = inode->i_atime;
30     tmp.st_mtime = inode->i_mtime;
31     tmp.st_ctime = inode->i_ctime;

32     for (i=0 ; i<sizeof (tmp) ; i++)
33         put_fs_byte(((char *) &tmp)[i],&((char *) statbuf)[i]);

因为这个函数是在内核空间执行的，所以需要通过33行把它拷贝到用户空间指定地址处

34 }

36 int sys_stat(char * filename, struct stat * statbuf)
37 {
38     struct m_inode * inode;
39
40     if (!(inode=namei(filename)))
41         return -ENOENT;
42     cp_stat(inode,statbuf);
43     iput(inode);
44     return 0;
45 }

47 int sys_fstat(unsigned int fd, struct stat * statbuf)
48 {
49     struct file * f;
50     struct m_inode * inode;
51
52     if (fd >= NR_OPEN || !(f=current->filp[fd]) || !(inode=f->f_inode))
53         return -EBADF;
54     cp_stat(inode,statbuf);
55     return 0;
56 }

总结

很简单，没什么好说的。基本上就是对inode属性的拷贝，关于put_fs_byte也已经说过多次了，实现从内核空间到用户空间的拷贝，还有一个namei函数，用来根据文件名寻找其inode节点，这个会在相应的文章中进行分析，它的效率还是比较低的。

4.fcntl.c

1 /*
2 * linux/fs/fcntl.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 #include <string.h>
8 #include <errno.h>
9 #include <linux/sched.h>
10 #include <linux/kernel.h>
11 #include <asm/segment.h>
12
13 #include <fcntl.h>
14 #include <sys/stat.h>
15
16 extern int sys_close(int fd);

18 static int dupfd(unsigned int fd, unsigned int arg)
19 {
20     if (fd >= NR_OPEN || !current->filp[fd])
21         return -EBADF;
22     if (arg >= NR_OPEN)
23         return -EINVAL;
24     while (arg < NR_OPEN)
25         if (current->filp[arg])
26             arg++;
27         else
28             break;

从arg开始遍历，找到一个filp中还未使用的项的索引

29     if (arg >= NR_OPEN)
30         return -EMFILE;
31     current->close_on_exec &= ~(1<<arg);
32     (current->filp[arg] = current->filp[fd])->f_count++;

让filp[arg]指向被复制的fd对应的file，增加引用计数
33 return arg; //返回找到的文件描述符
34 }

36 int sys_dup2(unsigned int oldfd, unsigned int newfd)
37 {
38 sys_close(newfd);
39 return dupfd(oldfd,newfd);
40 }
先关闭newfd对应的文件，根据我们之前对close的分析，关闭一个文件会置filp[newfd]=NULL，因此这里newfd就是我们要找的fd。

42 int sys_dup(unsigned int fildes)
43 {
44 return dupfd(fildes,0);
45 }
从0开始查找

47 int sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg)
48 {
49     struct file * filp;
50
51     if (fd >= NR_OPEN || !(filp = current->filp[fd]))
52         return -EBADF;
53     switch (cmd) {
54         case F_DUPFD:
55             return dupfd(fd,arg);
56         case F_GETFD:
57             return (current->close_on_exec>>fd)&1;

             返回current->close_on_exec中fd处对应的值
58         case F_SETFD:
59             if (arg&1)     //arg==1，设置
60                 current->close_on_exec |= (1<<fd);

              设置current->close_on_exec中与fd对应的位
61             else              //arg==0，清空
62                 current->close_on_exec &= ~(1<<fd);
63             return 0;
64         case F_GETFL:
65             return filp->f_flags;     //返回flags
66         case F_SETFL:     //设置flags
67             filp->f_flags &= ~(O_APPEND | O_NONBLOCK);
68             filp->f_flags |= arg & (O_APPEND | O_NONBLOCK);
69             return 0;
70         case F_GETLK:   case F_SETLK:   case F_SETLKW:
71             return -1;
  72         default:
73             return -1;
74     }
75 }

总结

注意一下两个复制文件句柄操作的异同。dupfd从参数arg指定的位置开始寻找一个空闲描述符（并在close_on_exec中清除对应的位），并让它与参数fd指向相同的文件；sys_dup以0为起点调用dupfd，而sys_dup2会先关闭newfd再从newfd开始查找，这样参数中指定的newfd就是最终返回的fd。sys_fcntl根据不同的命令执行不同的指令或调用不同的函数。

5.ioctl.c

1 /*
2 * linux/fs/ioctl.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 #include <string.h>
8 #include <errno.h>
9 #include <sys/stat.h>

11 #include <linux/sched.h>

13 extern int tty_ioctl(int dev, int cmd, int arg);

15 typedef int (*ioctl_ptr)(int dev,int cmd,int arg);

17 #define NRDEVS ((sizeof (ioctl_table))/(sizeof (ioctl_ptr)))

19 static ioctl_ptr ioctl_table[]={
20     NULL,       /* nodev */
21     NULL,       /* /dev/mem */
22     NULL,       /* /dev/fd */
23     NULL,       /* /dev/hd */
24     tty_ioctl, /* /dev/ttyx */
25     tty_ioctl, /* /dev/tty */
26     NULL,       /* /dev/lp */
27     NULL};      /* named pipes */

30 int sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
31 {
32     struct file * filp;
33     int dev,mode;
34
35     if (fd >= NR_OPEN || !(filp = current->filp[fd]))
36         return -EBADF;

37 mode=filp->f_inode->i_mode;

38 if (!S_ISCHR(mode) && !S_ISBLK(mode))
39 return -EINVAL;

如果既不是字符设备也不是块设备，返回-EINVAL

40 dev = filp->f_inode->i_zone[0];

对设备来说其inode->i_zone[0]中存的是设备号

41     if (MAJOR(dev) >= NRDEVS)
42         return -ENODEV;
43     if (!ioctl_table[MAJOR(dev)])
44         return -ENOTTY;
45     return ioctl_table[MAJOR(dev)](dev,cmd,arg);
46 }

总结

ioctl仅仅是根据操作对象类型的不同进行转发而已。

1 0