文件系统(二)--buffer.c namei.c truncate.c open.c源码分析

来源：互联网发布：java collection接口编辑：程序博客网时间：2024/06/07 20:47

1.buffer.c

1 /*
2 * linux/fs/buffer.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting a interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it. NOTE! As interrupts
11 * can wake up a caller, some cli-sti sequences are needed to check for
12 * sleep-on-calls. These should be extremely quick, though (I hope).
13 */

15 /*
16 * NOTE! There is one discordant note here: checking floppies for
17 * disk change. This is where it fits best, I think, as it should
18 * invalidate changed floppy-disk-caches.
19 */
20
21 #include <stdarg.h>
22
23 #include <linux/config.h>
24 #include <linux/sched.h>
25 #include <linux/kernel.h>
26 #include <asm/system.h>
27 #include <asm/io.h>

29 extern int end;     //由链接器生成的，指向内核空间末端后一个字节
30 struct buffer_head * start_buffer = (struct buffer_head *) &end;
31 struct buffer_head * hash_table[NR_HASH];
32 static struct buffer_head * free_list;     //空闲链表头
33 static struct task_struct * buffer_wait = NULL; //等待空闲缓冲块而睡眠的任务队列
34 int NR_BUFFERS = 0;     //缓冲块个数

36 static inline void wait_on_buffer(struct buffer_head * bh)
37 {
38     cli();
39     while (bh->b_lock)
40         sleep_on(&bh->b_wait);
41     sti();
42 }
等待指定缓冲块解锁

44 int sys_sync(void)
45 {
46     int i;
47     struct buffer_head *bh;
49     sync_inodes();      /* write out inodes into buffers */
50     bh = start_buffer;
51      for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
52         wait_on_buffer(bh);
53         if (bh->b_dirt)
54             ll_rw_block(WRITE,bh);
55     }
56     return 0;
57 }

49行先进行inode的同步（具体过程下面分析），随后遍历所有的buffer_head，如果标记为脏，就进行写操作将其写入到磁盘中。

现在看一下sync_inodes：

fs/inode.c

59 void sync_inodes(void)
60 {
61     int i;
62     struct m_inode * inode;
63
64     inode = 0+inode_table;
65     for(i=0 ; i<NR_INODE ; i++,inode++) {
66         wait_on_inode(inode);
67         if (inode->i_dirt && !inode->i_pipe)
68             write_inode(inode);
69     }
70 }

之前的文章中提到过，内核把所有存在于内核中的inode保存在一个数组inode_table中，现在就遍历这个数组：先等待该inode解锁（wait_on_inode），然后如果它是脏的，同时不是pipe，就执行write_inode。

write_inode(inode)：

314 static void write_inode(struct m_inode * inode)
315 {
316     struct super_block * sb;
317     struct buffer_head * bh;
318     int block;
319
320     lock_inode(inode);
321     if (!inode->i_dirt || !inode->i_dev) {
322         unlock_inode(inode);
323         return;
324     }

如果inode是干净的，或者它的设备号为0，解锁后直接返回

325 if (!(sb=get_super(inode->i_dev)))
326 panic("trying to write inode without device");

获取分区超级块

327 block = 2 + sb->s_imap_blocks + sb->s_zmap_blocks +
328 (inode->i_num-1)/INODES_PER_BLOCK;

这里是计算该inode节点所在的磁盘块号，目的是先把这个块读入缓冲区，再用内存中的inode覆盖其中对应的那一项。

我们再来看一下为什么这么计算，2在这里分别代表了引导块与超级块，然后是imap所占的块数，然后是zmap（逻辑块map）所占的块数。inode->i_num是inode的编号，(inode->i_num-1)除以INODES_PER_BLOCK表示它在inode区中对应的块号偏移。

329 if (!(bh=bread(inode->i_dev,block)))
330 panic("unable to read i-node block");

读取参数inode所在块的内容。

331     ((struct d_inode *)bh->b_data)
332         [(inode->i_num-1)%INODES_PER_BLOCK] =
333             *(struct d_inode *)inode;
这里的目的是把参数中的inode写入磁盘中。但是这里还是会经过缓冲区。
334     bh->b_dirt=1;
335     inode->i_dirt=0;
这时把bh设为脏，inode设为干净就可以了。
336     brelse(bh);

释放bh：递减它的引用计数（b_count），并唤醒等待空闲缓冲块（buffer_wait）的任务
337 unlock_inode(inode);
解锁inode
338 }
可以看到这里只是把inode放回到了缓冲区，等待写入磁盘。

我们继续回到前面，sync_inodes就是遍历inode table，把所有标记为脏并且不是pipe类型的inode写回磁盘。当然它必须先写入高速缓冲区，注意它是怎么计算对应磁盘位置的。

继续回到sys_sync:

44 int sys_sync(void)
45 {
46     int i;
47     struct buffer_head * bh;
48
49     sync_inodes();      /* write out inodes into buffers */
50     bh = start_buffer;
51     for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
52         wait_on_buffer(bh);
53         if (bh->b_dirt)
54             ll_rw_block(WRITE,bh);
55     }
56     return 0;
57 }
这里50行开始会遍历所有的buffer_head，如果没有其他任务锁定这个buffer_head，并且buffer是脏的，就调用54行（驱动程序）把缓冲区内容写入到硬盘中。

59 int sync_dev(int dev)
60 {
61     int i;
62     struct buffer_head * bh;
63
64     bh = start_buffer;
65     for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
66         if (bh->b_dev != dev)
67             continue;
68         wait_on_buffer(bh);
69         if (bh->b_dev == dev && bh->b_dirt)
70             ll_rw_block(WRITE,bh);
71     }

72     sync_inodes();
73     bh = start_buffer;
74     for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
75         if (bh->b_dev != dev)
76             continue;
77         wait_on_buffer(bh);
78         if (bh->b_dev == dev && bh->b_dirt)
79             ll_rw_block(WRITE,bh);
80     }
81     return 0;
82 }

这里把写盘分成两遍执行是从效率的角度上考虑的：第一步先把脏的buffer写入到磁盘，第二步同步inode节点，第三步把因为同步inode节点而变脏的buffer再写入磁盘。

===========================================

我们平时编程时涉及到文件写操作的，如果仅仅是把数据写入到应用程序缓冲区中，这时高速缓冲区中并没有它的记录，这时如果应用程序退出，就会造成数据丢失。如果执行了flush就会把数据刷到高速缓冲区中。

===========================================

84 void inline invalidate_buffers(int dev)
85 {
86     int i;
87     struct buffer_head * bh;
88
89     bh = start_buffer;
90     for (i=0 ; i<NR_BUFFERS ; i++,bh++) {
91         if (bh->b_dev != dev)
92             continue;
93         wait_on_buffer(bh);
94         if (bh->b_dev == dev)
95             bh->b_uptodate = bh->b_dirt = 0;
96     }
97 }
很简单，93行首先等待对bh的执行权，然后只需要设置标记b_uptodate，b_dirt即可。

99 /*
100 * This routine checks whether a floppy has been changed, and
101 * invalidates all buffer-cache-entries in that case. This
102 * is a relatively slow routine, so we have to try to minimize using
103 * it. Thus it is called only upon a 'mount' or 'open'. This
104 * is the best way of combining speed and utility, I think.
105 * People changing diskettes in the middle of an operation deserve
106 * to loose :-)
107 *
108 * NOTE! Although currently this is only for floppies, the idea is
109 * that any additional removable block-device will use this routine,
110 * and that mount/open needn't know that floppies/whatever are
111 * special.
112 */
113 void check_disk_change(int dev)
114 {
115     int i;
116
117     if (MAJOR(dev) != 2)
118         return;
119     if (!floppy_change(dev & 0x03))
120         return;
121     for (i=0 ; i<NR_SUPER ; i++)
122         if (super_block[i].s_dev == dev)
123             put_super(super_block[i].s_dev);
124     invalidate_inodes(dev);
125     invalidate_buffers(dev);
126 }

128 #define _hashfn(dev,block) (((unsigned)(dev^block))%NR_HASH)
129 #define hash(dev,block) hash_table[_hashfn(dev,block)]

131 static inline void remove_from_queues(struct buffer_head * bh)
132 {
133 /* remove from hash-queue */
134     if (bh->b_next)
135         bh->b_next->b_prev = bh->b_prev;
136     if (bh->b_prev)
137         bh->b_prev->b_next = bh->b_next;

buffer_head通过b_prev和b_next来链接成双向链表

138 if (hash(bh->b_dev,bh->b_blocknr) == bh)
139 hash(bh->b_dev,bh->b_blocknr) = bh->b_next;

hash table中相应的slot指向hash值相同的链表

140 /* remove from free list */
141     if (!(bh->b_prev_free) || !(bh->b_next_free))
142         panic("Free block list corrupted");
143     bh->b_prev_free->b_next_free = bh->b_next_free;
144     bh->b_next_free->b_prev_free = bh->b_prev_free;
通过b_next_free和b_prev_free连接成空闲双向链表

145 if (free_list == bh)
146 free_list = bh->b_next_free;

free_list作为空闲链表表头
147 }

149 static inline void insert_into_queues(struct buffer_head * bh)
150 {
151 /* put at end of free list */
152     bh->b_next_free = free_list;
153     bh->b_prev_free = free_list->b_prev_free;
154     free_list->b_prev_free->b_next_free = bh;
155     free_list->b_prev_free = bh;

可见free_list链表尾部是最近使用的，首部则是最不常使用的

156 /* put the buffer in new hash-queue if it has a device */
157     bh->b_prev = NULL;
158     bh->b_next = NULL;
159     if (!bh->b_dev)
160         return;
161     bh->b_next = hash(bh->b_dev,bh->b_blocknr);
162    hash(bh->b_dev,bh->b_blocknr) = bh;
163    bh->b_next->b_prev = bh;
添加到hash表中
164 }
这个过程的示意图如下：

166 static struct buffer_head * find_buffer(int dev, int block)
167 {
168     struct buffer_head * tmp;
169
170     for (tmp = hash(dev,block) ; tmp != NULL ; tmp = tmp->b_next)
171         if (tmp->b_dev==dev && tmp->b_blocknr==block)
172             return tmp;
173     return NULL;
174 }
非常简单。

176 /*
177 * Why like this, I hear you say... The reason is race-conditions.
178 * As we don't lock buffers (unless we are readint them, that is),
179 * something might happen to it while we sleep (ie a read-error
180 * will force it bad). This shouldn't really happen currently, but
181 * the code is ready.
182 */
183 struct buffer_head * get_hash_table(int dev, int block)
184 {
185     struct buffer_head * bh;
186
187     for (;;) {
188         if (!(bh=find_buffer(dev,block)))
189             return NULL;
190         bh->b_count++;
191         wait_on_buffer(bh);
192         if (bh->b_dev == dev && bh->b_blocknr == block)
193             return bh;
194         bh->b_count--;
195     }
196 }
返回对应设备和块号的buffer_head.192行重新判断是因为在睡眠过程中，可能整个世界都变了

205 #define BADNESS(bh) (((bh)->b_dirt<<1)+(bh)->b_lock)
206 struct buffer_head * getblk(int dev,int block)
207 {
208     struct buffer_head * tmp, * bh;
210 repeat:
211     if (bh = get_hash_table(dev,block))
212         return bh;

如果没有找到
213     tmp = free_list;
214     do {
215         if (tmp->b_count)
216             continue;
217         if (!bh || BADNESS(tmp)<BADNESS(bh)) {
218             bh = tmp;
219             if (!BADNESS(tmp))
220                 break;
221         }
222 /* and repeat until we find something good */
223     } while ((tmp = tmp->b_next_free) != free_list);

224     if (!bh) {
225         sleep_on(&buffer_wait);
226         goto repeat;
227     }

228     wait_on_buffer(bh);
229     if (bh->b_count)
230         goto repeat;
231     while (bh->b_dirt) {
232         sync_dev(bh->b_dev);
233         wait_on_buffer(bh);
234         if (bh->b_count)
235             goto repeat;
236     }
237 /* NOTE!! While we slept waiting for this block, somebody else might */
238 /* already have added "this" block to the cache. check it */
239     if (find_buffer(dev,block))
240         goto repeat;
241 /* OK, FINALLY we know that this buffer is the only one of it's kind, */
242 /* and that it's unused (b_count=0), unlocked (b_lock=0), and clean */
243     bh->b_count=1;
244     bh->b_dirt=0;
245     bh->b_uptodate=0;
246     remove_from_queues(bh);
247     bh->b_dev=dev;
248     bh->b_blocknr=block;
249     insert_into_queues(bh);
250     return bh;
251 }

253 void brelse(struct buffer_head * buf)
254 {
255     if (!buf)
256         return;
257     wait_on_buffer(buf);
258     if (!(buf->b_count--))
259         panic("Trying to free free buffer");
260     wake_up(&buffer_wait);
261 }
关于这个函数只解释一下buffer_wait，我们知道buffer_head是有限的，如果一个任务请求磁盘操作，但此时所有的buffer_head都不是空闲的，那么该任务只有等待在buffer_wait上面。

263 /*
264 * bread() reads a specified block and returns the buffer that contains
265 * it. It returns NULL if the block was unreadable.
266 */
267 struct buffer_head * bread(int dev,int block)
268 {
269     struct buffer_head * bh;

271     if (!(bh=getblk(dev,block)))
272         panic("bread: getblk returned NULL\n");
273     if (bh->b_uptodate)
274         return bh;
275     ll_rw_block(READ,bh);
276     wait_on_buffer(bh);
277     if (bh->b_uptodate)
278         return bh;
279     brelse(bh);
280     return NULL;
281 }
这个我们在分析其他源码的过程中已经详细分析过了。

先从缓冲区中读，如果读不到就向设备驱动程序发起请求。

283 #define COPYBLK(from,to) \
284 __asm__("cld\n\t" \
285     "rep\n\t" \
286     "movsl\n\t" \
287     ::"c" (BLOCK_SIZE/4),"S" (from),"D" (to) \
288     :"cx","di","si")
COPYBLK用rep movsl进行块拷贝：源地址由DS:SI指出，目标地址由ES:DI指出，一共拷贝BLOCK_SIZE/4个长字。

290 /*
291 * bread_page reads four buffers into memory at the desired address. It's
292 * a function of its own, as there is some speed to be got by reading them
293 * all at the same time, not waiting for one to be read, and then another
294 * etc.
295 */
296 void bread_page(unsigned long address,int dev,int b[4])
297 {
298     struct buffer_head * bh[4];
299     int i;
300
301     for (i=0 ; i<4 ; i++)
302         if (b[i]) {
303             if (bh[i] = getblk(dev,b[i]))
304                 if (!bh[i]->b_uptodate)
305                     ll_rw_block(READ,bh[i]);
306         } else
307             bh[i] = NULL;
308     for (i=0 ; i<4 ; i++,address += BLOCK_SIZE)
309         if (bh[i]) {
310             wait_on_buffer(bh[i]);
311             if (bh[i]->b_uptodate)
312                 COPYBLK((unsigned long) bh[i]->b_data,address);
313             brelse(bh[i]);
314         }
315 }
这里也比较简单，读取4个块，并拷贝到指定内存地址后释放buffer_head

317 /*
318 * Ok, breada can be used as bread, but additionally to mark other
319 * blocks for reading as well. End the argument list with a negative
320 * number.
321 */
322 struct buffer_head * breada(int dev,int first, ...)
323 {
324     va_list args;
325     struct buffer_head * bh, *tmp;
326
327     va_start(args,first);
328     if (!(bh=getblk(dev,first)))
329         panic("bread: getblk returned NULL\n");
330     if (!bh->b_uptodate)
331         ll_rw_block(READ,bh);
332     while ((first=va_arg(args,int))>=0) {
333         tmp=getblk(dev,first);
334         if (tmp) {
335             if (!tmp->b_uptodate)
336                 ll_rw_block(READA,bh);
337             tmp->b_count--;
338         }
339     }
340     va_end(args);
341     wait_on_buffer(bh);
342     if (bh->b_uptodate)
343         return bh;
344     brelse(bh);
345     return (NULL);
346 }

这个函数可以接收可变参数，但是原理上与前面的bread一致。注意336行预读时传给ll_rw_block的是bh而不是tmp，按预读的意图这里应当是tmp，这是该版本源码中的一处笔误。

348 void buffer_init(long buffer_end)
349 {
350     struct buffer_head * h = start_buffer;     //前面已经初始化了
351     void * b;
352     int i;
353
354     if (buffer_end == 1<<20)
355         b = (void *) (640*1024);
356     else
357         b = (void *) buffer_end;

358     while ( (b -= BLOCK_SIZE) >= ((void *) (h+1)) ) {
359         h->b_dev = 0;
360         h->b_dirt = 0;
361         h->b_count = 0;
362         h->b_lock = 0;
363         h->b_uptodate = 0;
364         h->b_wait = NULL;
365         h->b_next = NULL;
366         h->b_prev = NULL;
367         h->b_data = (char *) b;
368         h->b_prev_free = h-1;
369         h->b_next_free = h+1;
pre指向内存低地址，next指向内存高地址。
370         h++;
每个BLOCK_SIZE为1K，所以从尾部end开始为每一个buff设置buffer_head与其对应
371         NR_BUFFERS++;
372         if (b == (void *) 0x100000)
373             b = (void *) 0xA0000;
374     }
375     h--;
376     free_list = start_buffer;
377     free_list->b_prev_free = h; //双向链表
378     h->b_next_free = free_list; //处理链表最后一个和第一个
379     for (i=0;i<NR_HASH;i++)
380         hash_table[i]=NULL;     //hash_table初始为空
381 }

到这里我们就介绍完了buffer.c整个源文件

总结

我们就从初始化函数开始总结，在buffer_init中，从缓冲区内存的尾部开始向前遍历每个数据块，同时从缓冲区首部开始依次为其设置一个buffer_head来描述它。缓冲区的地址就保存在buffer_head的b_data域中。初始时，hash_table内容均为空。空闲链表从尾部一直连接到首部（双向的）。后续在读取磁盘内容时，对应的内容会被读到buffer_head中，并且添加到hash_table和free_list中。hash_table中的冲突域采用b_next和b_prev属性链接成链。free_list指向的空闲链表则是通过buffer_head的b_prev_free和b_next_free链接成双向链表的。对于sync同步操作，依次遍历buffer_head，对标记为脏的buffer_head进行写操作，这里的写操作是实际调用磁盘驱动程序实现的。对于getblk操作，首先会通过hash方法寻找，如果发现为空，说明这时此磁盘内容还未被读入。由于每一个读入的磁盘内容都需要在缓冲区中使用一个buffer_head对其进行描述，因此现在就需要找到这样一个buffer_head，这是从空闲链表中进行寻找的。找到之后对该buffer_head的属性进行设置，注意此时我们并没有真正调用磁盘驱动程序来实际读取内容，只是把这个buffer_head加入相应的空闲链表和hash_table之中。真正的读取操作是在bread中进行的。在bread中首先进行getblk操作得到buffer_head后根据它的b_uptodate（内容是否是新的）标记来决定是否执行实际的读盘操作。至于释放操作brelse比较简单，首先等待buffer_head解锁，然后递减引用计数（b_count），最后唤醒等待在buffer_wait上的任务。

2.namei.c

1 /*
2 * linux/fs/namei.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 /*
8 * Some corrections by tytso.
9 */
10
11 #include <linux/sched.h>
12 #include <linux/kernel.h>
13 #include <asm/segment.h>
14
15 #include <string.h>
16 #include <fcntl.h>
17 #include <errno.h>
18 #include <const.h>
19 #include <sys/stat.h>
20
21 #define ACC_MODE(x) ("\004\002\006\377"[(x)&O_ACCMODE])

"\004\002\006\377"看成字符数组，[(x)&O_ACCMODE]就是数组索引

23 /*
24 * comment out this line if you want names > NAME_LEN chars to be
25 * truncated. Else they will be disallowed.
26 */
27 /* #define NO_TRUNCATE */
28
29 #define MAY_EXEC 1
30 #define MAY_WRITE 2
31 #define MAY_READ 4

33 /*
34 * permission()
35 *
36 * is used to check for read/write/execute permissions on a file.
37 * I don't know if we should look at just the euid or both euid and
38 * uid, but that should be easily changed.
39 */
40 static int permission(struct m_inode * inode,int mask)
41 {
42     int mode = inode->i_mode;
43
44 /* special case: not even root can read/write a deleted file */
45     if (inode->i_dev && !inode->i_nlinks)     //设备不为0，链接数为0，说明已经被删除
46         return 0;                                                 //返回
47     else if (current->euid==inode->i_uid)     //当前进程的euid == inode的uid
48         mode >>= 6;
49     else if (current->egid==inode->i_gid)     //当前进程的egid == inode的gid
50         mode >>= 3;
51     if (((mode & mask & 0007) == mask) || suser())//如果mode与mask的判断通过或者是超级用户，返回1
52         return 1;
53     return 0;
54 }

下面介绍的函数涉及到struct dir_entry，所以我们先来看一下：

157 struct dir_entry {
158 unsigned short inode; //目录对应的inode
159 char name[NAME_LEN]; //目录名
160 };

56 /*
57 * ok, we cannot use strncmp, as the name is not in our data space.
58 * Thus we'll have to use match. No big problem. Match also makes
59 * some sanity tests.
60 *
61 * NOTE! unlike strncmp, match returns 1 for success, 0 for failure.
62 */
63 static int match(int len,const char * name,struct dir_entry * de)
64 {
65     register int same __asm__("ax");
66
67     if (!de || !de->inode || len > NAME_LEN)
68         return 0;

69 if (len < NAME_LEN && de->name[len])
70 return 0;

这个情况下，参数中指定的name长度小于目录名实际长度，直接返回不匹配0.
71     __asm__("cld\n\t"
72         "fs ; repe ; cmpsb\n\t"
73         "setz %%al"
74         :"=a" (same)
75         :"0" (0),"S" ((long) name),"D" ((long) de->name),"c" (len)
76         :"cx","di","si");
77     return same;
78 }

80 /*
81 * find_entry()
82 *
83 * finds an entry in the specified directory with the wanted name. It
84 * returns the cache buffer in which the entry was found, and the entry
85 * itself (as a parameter - res_dir). It does NOT read the inode of the
86 * entry - you'll have to do that yourself if you want to.
87 *
88 * This also takes care of the few special cases due to '..'-traversal
89 * over a pseudo-root and a mount point.
90 */
91 static struct buffer_head * find_entry(struct m_inode ** dir,
92     const char * name, int namelen, struct dir_entry ** res_dir)
93 {
94     int entries;
95     int block,i;
96     struct buffer_head * bh;
97     struct dir_entry * de;
98     struct super_block * sb;
99
100 #ifdef NO_TRUNCATE
101     if (namelen > NAME_LEN)
102         return NULL;
103 #else         //这时会截取字符串
104     if (namelen > NAME_LEN)
105         namelen = NAME_LEN;
106 #endif

107 entries = (*dir)->i_size / (sizeof (struct dir_entry));

如果inode是目录的话，那么它的内容将以目录项dir_entry的形式存放。这里是用来计算目录项个数的

108     *res_dir = NULL;
109     if (!namelen)
110         return NULL;

111 /* check for '..', as we might have to do some "magic" for it */
112     if (namelen==2 && get_fs_byte(name)=='.' && get_fs_byte(name+1)=='.') { // ".." 的情况
113 /* '..' in a pseudo-root results in a faked '.' (just change namelen) */
114         if ((*dir) == current->root)     //如果指定目录是当前进程的伪根目录
115             namelen=1;                            //这时".."应变为"."
116         else if ((*dir)->i_num == ROOT_INO) {     //如果指定目录是安装点
117 /* '..' over a mount-point results in 'dir' being exchanged forthe mounted
118    directory-inode. NOTE! We set mounted, so that we can iput the new dir */
在安装点上,".."会导致目录变为安装目录
119             sb=get_super((*dir)->i_dev);
120             if (sb->s_imount) {
121                 iput(*dir);
122                 (*dir)=sb->s_imount;
123                 (*dir)->i_count++;
124             }
125         }
126     }

127     if (!(block = (*dir)->i_zone[0]))
128         return NULL;
先取出第一个块号
129     if (!(bh = bread((*dir)->i_dev,block)))
130         return NULL;
读取该块
131     i = 0;
132     de = (struct dir_entry *) bh->b_data;
转为dir_entry

133     while (i < entries) {     //遍历每个entry
134         if ((char *)de >= BLOCK_SIZE+bh->b_data) {     //如果已经搜索完了整个块
135             brelse(bh);                             //释放该块
136             bh = NULL;
137             if (!(block = bmap(*dir,i/DIR_ENTRIES_PER_BLOCK)) ||
138                 !(bh = bread((*dir)->i_dev,block))) {

137行，首先根据dir中的块号计算实际块号（在磁盘中的实际块号）

然后读取该块。如果存在块号为0或者bh为NULL，则执行下面139,140行（这一块内没有存放目录或文件）
139                 i += DIR_ENTRIES_PER_BLOCK;
140                 continue;
141             }
142             de = (struct dir_entry *) bh->b_data;
143         }
144         if (match(namelen,name,de)) {
145             *res_dir = de;
146             return bh;
147         }
如果匹配了，就把该dir_entry保存到res_dir中，返回该bh
148         de++;
149         i++;
如果当前目录项de不匹配，继续搜索下一个
150     }//while
151     brelse(bh);
152     return NULL;

遍历完还没找到就放回bh，返回NULL
153 }

155 /*
156 * add_entry()
157 *
158 * adds a file entry to the specified directory, using the same
159 * semantics as find_entry(). It returns NULL if it failed.
160 *
161 * NOTE!! The inode part of 'de' is left at 0 - which means you
162 * may not sleep between calling this and putting something into
163 * the entry, as someone else might have used it while you slept.
164 */
165 static struct buffer_head * add_entry(struct m_inode * dir,
166     const char * name, int namelen, struct dir_entry ** res_dir)
167 {
168     int block,i;
169     struct buffer_head * bh;
170     struct dir_entry * de;
171
172     *res_dir = NULL;
173 #ifdef NO_TRUNCATE
174     if (namelen > NAME_LEN)
175         return NULL;
176 #else
177     if (namelen > NAME_LEN)
178         namelen = NAME_LEN;
179 #endif
180     if (!namelen)
181         return NULL;
182     if (!(block = dir->i_zone[0]))
183         return NULL;
184     if (!(bh = bread(dir->i_dev,block)))
185         return NULL;
186     i = 0;
187     de = (struct dir_entry *) bh->b_data;

上面与 find_entry都是一样的

188     while (1) {
189         if ((char *)de >= BLOCK_SIZE+bh->b_data) { //如果一个逻辑块遍历完成
190             brelse(bh);              //把遍历完的块放回
191             bh = NULL;
192             block = create_block(dir,i/DIR_ENTRIES_PER_BLOCK);     //获取下一个块号
193             if (!block)
194                 return NULL;
195             if (!(bh = bread(dir->i_dev,block))) {     //读入下一个块
196                 i += DIR_ENTRIES_PER_BLOCK;     //如果下一个块不存在，跳过，需要更新i
197                 continue;
198             }
199             de = (struct dir_entry *) bh->b_data;
200         }

201         if (i*sizeof(struct dir_entry) >= dir->i_size) {
202             de->inode=0;
203             dir->i_size = (i+1)*sizeof(struct dir_entry);
204             dir->i_dirt = 1;
205             dir->i_ctime = CURRENT_TIME;
206         }
201行为true说明指定的目录没有删除的空目录项，现在要向它添加一个目录项，因此203行增加它的大小。202行暂时把目录项的inode设为0.置位脏标记，修改i_ctime。
207         if (!de->inode) {
208             dir->i_mtime = CURRENT_TIME;
209             for (i=0; i < NAME_LEN ; i++)
210                 de->name[i]=(i<namelen)?get_fs_byte(name+i):0;
211             bh->b_dirt = 1;
212             *res_dir = de;
213             return bh;
214         }
207行为true，说明找到了满足条件的目录项，它或许是由于之前删除而留下的空项，或者是由于我们在201的if中为该目录新添加的。208行修改它的mtime；209-210行为其name属性赋值，211行标记bh为脏；212把目录项保存到res_dir；213返回该bh
215         de++;
216         i++;
如果当前项不符合，215,216行递增地址和计数，准备遍历下一个
217     }
218     brelse(bh);
219     return NULL;
220 }

222 /*
223 * get_dir()
224 *
225 * Getdir traverses the pathname until it hits the topmost directory.
226 * It returns NULL on failure.
227 */
228 static struct m_inode *get_dir(const char * pathname)
229 {
230     char c;
231     const char * thisname;
232     struct m_inode * inode;
233     struct buffer_head * bh;
234     int namelen,inr,idev;
235     struct dir_entry * de;
236
237     if (!current->root || !current->root->i_count)
238         panic("No root inode");

239 if (!current->pwd || !current->pwd->i_count)
240 panic("No cwd inode");

241     if ((c=get_fs_byte(pathname))=='/') {
242         inode = current->root;
243         pathname++;
244     } else if (c)
245         inode = current->pwd;
246     else
247         return NULL;    /* empty name is bad */
因为是要获取the topmost directory，所以只要pathname中第一个字符为'/'，inode就设为根目录

否则如果不为空，inode就设为当前工作目录；如果为空，则返回NULL

248 inode->i_count++; //递增引用计数

249     while (1) {
250         thisname = pathname;
251         if (!S_ISDIR(inode->i_mode) || !permission(inode,MAY_EXEC)) {     //权限检查
252             iput(inode);
253             return NULL;
254         }
255         for(namelen=0;(c=get_fs_byte(pathname++))&&(c!='/');namelen++)
256             /* nothing */ ;
243行如果pathname以/开头，则已经执行过++操作了。因此这for循环就是找pathname中的各个部分
257         if (!c)
258             return inode;

c为'\0'（字符串结束符），说明路径已经遍历到最后一个分量，这时inode就是最后一级目录的inode，直接返回即可
259         if (!(bh = find_entry(&inode,thisname,namelen,&de))) {
260             iput(inode);
261             return NULL;
262         }

寻找目录项，找不到的话就返回NULL
263         inr = de->inode;         //记录目录项对应的inode，下次循环还会用到
264         idev = inode->i_dev;
265         brelse(bh);
266         iput(inode);          //放回inode（前面已经用过了）
267         if (!(inode = iget(idev,inr)))     //如果找不到对应的inode，直接返回NULL
268             return NULL;
269     }
270 }

272 /*
273 * dir_namei()
274 *
275 * dir_namei() returns the inode of the directory of the
276 * specified name, and the name within that directory.
277 */
278 static struct m_inode * dir_namei(const char * pathname,
279     int * namelen, const char ** name)
280 {
281     char c;
282     const char * basename;
283     struct m_inode * dir;
284
285     if (!(dir = get_dir(pathname)))
286         return NULL;

首先获取目录inode
287     basename = pathname;
288     while (c=get_fs_byte(pathname++))
289         if (c=='/')
290             basename=pathname;
取得最后的文件名（比如/etc/passwd，这里basename就是passwd）
291     *namelen = pathname-basename-1;
保存文件名的长度
292     *name = basename;
保存文件名
293     return dir;
294 }

296 /*
297 * namei()
298 *
299 * is used by most simple commands to get the inode of a specified name.
300 * Open, link etc use their own routines, but this is enough for things
301 * like 'chmod' etc.
302 */
303 struct m_inode * namei(const char * pathname)
304 {
305     const char * basename;
306     int inr,dev,namelen;
307     struct m_inode * dir;
308     struct buffer_head * bh;
309     struct dir_entry * de;
310
311     if (!(dir = dir_namei(pathname,&namelen,&basename)))
312         return NULL;

获取目录inode

313 if (!namelen) /* special case: '/usr/' etc */
314 return dir;

315 bh = find_entry(&dir,basename,namelen,&de);

寻找目录项inode
316     if (!bh) {
317         iput(dir);
318         return NULL;
319     }
如果bh为空，放回dir

320 inr = de->inode;
321 dev = dir->i_dev;

获取目录项的inode和设备号
322 brelse(bh);

现在可以释放bh了，因为我们已经获得了需要的inode以及设备号，bh用不到了
323 iput(dir);
324 dir=iget(dev,inr);

获取目录项对应的inode（当然，目录项可能是目录也可能是文件）
325 if (dir) {
326 dir->i_atime=CURRENT_TIME; //更新访问时间
327 dir->i_dirt=1; //脏标记
328 }
329 return dir;
330 }

332 /*
333 * open_namei()
334 *
335 * namei for open - this is in fact almost the whole open-routine.
336 */
337 int open_namei(const char * pathname, int flag, int mode,
338     struct m_inode ** res_inode)
339 {
340     const char * basename;
341     int inr,dev,namelen;
342     struct m_inode * dir, *inode;
343     struct buffer_head * bh;
344     struct dir_entry * de;
345
346     if ((flag & O_TRUNC) && !(flag & O_ACCMODE))
347         flag |= O_WRONLY;

O_ACCMODE<0003>：读写文件操作时，用于取出flag的低2位

O_RDONLY<00>：只读打开

O_WRONLY<01>：只写打开

O_RDWR<02>：读写打开

348 mode &= 0777 & ~current->umask;
349 mode |= I_REGULAR;

350 if (!(dir = dir_namei(pathname,&namelen,&basename)))
351 return -ENOENT;

获取目录inode
352     if (!namelen) {         /* special case: '/usr/' etc */
353         if (!(flag & (O_ACCMODE|O_CREAT|O_TRUNC))) {
354             *res_inode=dir;
355             return 0;
356         }
357         iput(dir);
358         return -EISDIR;
359     }

360     bh = find_entry(&dir,basename,namelen,&de);

获取目录项

361 if (!bh) {

//下面是没有读到目录项的情况
362         if (!(flag & O_CREAT)) {
363             iput(dir);
364             return -ENOENT;
365         }

如果没有指定不存在时创建文件，那么放回dir，返回。

366         if (!permission(dir,MAY_WRITE)) {
367             iput(dir);
368             return -EACCES;
369         }
如果指定不存在时创建文件，但是没有权限，返回。
370         inode = new_inode(dir->i_dev);
371         if (!inode) {
372             iput(dir);
373             return -ENOSPC;
374         }
否则就创建一个inode，如果创建失败，返回

375         inode->i_uid = current->euid;
376         inode->i_mode = mode;
377         inode->i_dirt = 1;
设置它的属性

378 bh = add_entry(dir,basename,namelen,&de);

添加到目录中

379         if (!bh) {     //添加失败
380             inode->i_nlinks--;
381             iput(inode);
382             iput(dir);
383             return -ENOSPC;
384         }

385         de->inode = inode->i_num;
386         bh->b_dirt = 1;
387         brelse(bh);
388         iput(dir);
389         *res_inode = inode;
390         return 0;
391     }

运行到这里说明读到了目录项
392     inr = de->inode;
393     dev = dir->i_dev;
394     brelse(bh);
395     iput(dir);

396 if (flag & O_EXCL) //独占标记
397 return -EEXIST;

398 if (!(inode=iget(dev,inr)))
399 return -EACCES;

取得inode，取不到的话，返回-EACCES

400     if ((S_ISDIR(inode->i_mode) && (flag & O_ACCMODE)) ||
401         !permission(inode,ACC_MODE(flag))) {
402         iput(inode);
403         return -EPERM;
404     }

如果取得的inode是目录并且打开方式为只写或读写，或者当前进程没有相应的访问权限，则放回inode，返回-EPERM

405 inode->i_atime = CURRENT_TIME;

406     if (flag & O_TRUNC)
407         truncate(inode);
408     *res_inode = inode;
409     return 0;
410 }

总结

目录项用dir_entry描述，包括inode号和名字。

根据路径找到文件的inode节点是一个非常重要的操作（namei），这个过程比较耗时，因为需要一层层地遍历目录。我们知道inode节点的i_zone数组用来保存磁盘块号；对于目录来说，这些磁盘块中保存的就是dir_entry。因此当我们需要寻找一个目录下的文件（或目录）时，需要依次读入这些磁盘块，根据文件（目录）名进行比较（match）。需要注意一些特殊情况，比如路径不是以根目录开头时需要以当前工作目录作为起点，在安装点上".."会导致目录变为安装目录等；另外一点就是权限问题。还有一个比较重要的操作就是向目录中添加项（add_entry），同样需要搜索inode的数据块，找到一个合适的位置插入。

3.truncate.c

1 /*
2 * linux/fs/truncate.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 #include <linux/sched.h>
8
9 #include <sys/stat.h>

11 static void free_ind(int dev,int block)
12 {
13     struct buffer_head * bh;
14     unsigned short * p;
15     int i;
16
17     if (!block)
18         return;
19     if (bh=bread(dev,block)) {
20         p = (unsigned short *) bh->b_data;
21         for (i=0;i<512;i++,p++)
22             if (*p)
23                 free_block(dev,*p);
24         brelse(bh);
25     }
26     free_block(dev,block);
27 }
我们看到p是short类型的指针，说明22行中*p是一个short数据，通过23行可以看出它代表block号。这其实是用来释放文件对应的块的。因为我们知道inode节点中有一个属性如下

100 unsigned short i_zone[9];
它代表文件所占用的块号数组。其中i_zone[0] - i_zone[6]是直接块号，i_zone[7]是一次间接块号，i_zone[8]是二次间接块号。

上面这个函数就是用来释放一次间接块号的。

29 static void free_dind(int dev,int block)
30 {
31     struct buffer_head * bh;
32     unsigned short * p;
33     int i;
34
35     if (!block)
36         return;
37     if (bh=bread(dev,block)) {
38         p = (unsigned short *) bh->b_data;
39         for (i=0;i<512;i++,p++)
40             if (*p)
41                 free_ind(dev,*p);
42         brelse(bh);
43     }
44     free_block(dev,block);
45 }
释放二次间接块号，这里调用了前面的释放一次间接块号的函数。

47 void truncate(struct m_inode * inode)
48 {
49     int i;
50
51     if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
52         return;

常规文件或者目录才可以调用此函数

53     for (i=0;i<7;i++)
54         if (inode->i_zone[i]) {
55             free_block(inode->i_dev,inode->i_zone[i]);
56             inode->i_zone[i]=0;
57         }
释放直接块
58     free_ind(inode->i_dev,inode->i_zone[7]);
59     free_dind(inode->i_dev,inode->i_zone[8]);
释放一次和二次间接块
60     inode->i_zone[7] = inode->i_zone[8] = 0;
61     inode->i_size = 0;
文件大小变为0
62     inode->i_dirt = 1;
标记为脏
63     inode->i_mtime = inode->i_ctime = CURRENT_TIME;
这会改变文件的修改时间i_mtime和改变时间i_ctime。
64 }

总结

这个文件整体上是比较简单的，包括三个函数：释放一级间接块、释放二级间接块、截断inode。对于直接块来说，i_zone中相应保存的就是inode所使用的磁盘块号。一级间接块，顾名思义，i_zone指明的块中存放的不是普通数据，而是块号，因此对一级间接块的释放操作就是读取一级间接块，对其中每一个非零块号调用free_block进行释放，最后再释放间接块本身；对于二级间接块，读取它之后，其中的每一项都是一个一级间接块的块号，于是就可以转换为对一级间接块的释放操作。truncate操作则是对inode的所有块进行释放，最后设置其大小为0。此外留意一下对于inode的操作，其atime、ctime、mtime是如何变化的。

4.open.c

1 /*
2 * linux/fs/open.c
3 *
4 * (C) 1991 Linus Torvalds
5 */

7 #include <string.h>
8 #include <errno.h>
9 #include <fcntl.h>
10 #include <sys/types.h>
11 #include <utime.h>
12 #include <sys/stat.h>
14 #include <linux/sched.h>
15 #include <linux/tty.h>
16 #include <linux/kernel.h>
17 #include <asm/segment.h>

19 int sys_ustat(int dev, struct ustat * ubuf)
20 {
21     return -ENOSYS;
22 }
23
24 int sys_utime(char * filename, struct utimbuf * times)
25 {
26     struct m_inode * inode;
27     long actime,modtime;
28
29     if (!(inode=namei(filename)))
30         return -ENOENT;
31     if (times) {
32         actime = get_fs_long((unsigned long *) &times->actime);
33         modtime = get_fs_long((unsigned long *) &times->modtime);
34     } else
35         actime = modtime = CURRENT_TIME;
36     inode->i_atime = actime;
37     inode->i_mtime = modtime;
38     inode->i_dirt = 1;
39     iput(inode);
40     return 0;
41 }
更新参数中指明的文件的时间（访问时间和修改时间）：如果times不为空，就用times来更新；否则就用当前时间来更新

43 /*
44 * XXX should we use the real or effective uid? BSD uses the real uid,
45 * so as to make this call useful to setuid programs.
46 */
47 int sys_access(const char * filename,int mode)
48 {
49 struct m_inode * inode;
50 int res, i_mode;
52 mode &= 0007;
53 if (!(inode=namei(filename)))
54 return -EACCES;

55 i_mode = res = inode->i_mode & 0777;

取得参数中文件的i_mode属性
56 iput(inode);
57 if (current->uid == inode->i_uid)
58 res >>= 6;
59 else if (current->gid == inode->i_gid)
60 res >>= 6;

根据当前用户与inode的关系取得相应的mode，保存到res中
61 if ((res & 0007 & mode) == mode)
62 return 0;

63 /*
64 * XXX we are doing this test last because we really should be
65 * swapping the effective with the real user id (temporarily),
66 * and then calling suser() routine. If we do call the
67 * suser() routine, it needs to be called last.
68 */
69 if ((!current->uid) &&
70 (!(mode & 1) || (i_mode & 0111)))
71 return 0;
72 return -EACCES;
73 }

75 int sys_chdir(const char * filename)
76 {
77 struct m_inode * inode;
78
79 if (!(inode = namei(filename)))
80 return -ENOENT;
81 if (!S_ISDIR(inode->i_mode)) {
82 iput(inode);
83 return -ENOTDIR;
84 }
85 iput(current->pwd);
86 current->pwd = inode;
87 return (0);
88 }

90 int sys_chroot(const char * filename)
91 {
92 struct m_inode * inode;
93
94 if (!(inode=namei(filename)))
95 return -ENOENT;
96 if (!S_ISDIR(inode->i_mode)) {
97 iput(inode);

98 return -ENOTDIR;
99 }
100 iput(current->root);
101 current->root = inode;
102 return (0);
103 }

105 int sys_chmod(const char * filename,int mode)
106 {
107 struct m_inode * inode;
108
109 if (!(inode=namei(filename)))
110 return -ENOENT;
111 if ((current->euid != inode->i_uid) && !suser()) {
112 iput(inode);
113 return -EACCES;
114 }
115 inode->i_mode = (mode & 07777) | (inode->i_mode & ~07777);
116 inode->i_dirt = 1;
117 iput(inode);
118 return 0;
119 }
120
121 int sys_chown(const char * filename,int uid,int gid)
122 {
123 struct m_inode * inode;
125 if (!(inode=namei(filename)))
126 return -ENOENT;
127 if (!suser()) {
128 iput(inode);
129 return -EACCES;
130 }
131 inode->i_uid=uid;
132 inode->i_gid=gid;
133 inode->i_dirt=1;
134 iput(inode);
135 return 0;
136 }

138 int sys_open(const char * filename,int flag,int mode)
139 {
140    struct m_inode * inode;
141     struct file * f;
142     int i,fd;

144     mode &= 0777 & ~current->umask;
145     for(fd=0 ; fd<NR_OPEN ; fd++)
146         if (!current->filp[fd])
147             break;
148     if (fd>=NR_OPEN)
149         return -EINVAL;
150    current->close_on_exec &= ~(1<<fd);

151     f=0+file_table;
152     for (i=0 ; i<NR_FILE ; i++,f++)
153         if (!f->f_count) break;

154     if (i>=NR_FILE)
155     return -EINVAL;

156     (current->filp[fd]=f)->f_count++;
递增引用计数
157     if ((i=open_namei(filename,flag,mode,&inode))<0) {
158         current->filp[fd]=NULL;
159         f->f_count=0;
160         return i;
161     }

162     /* ttys are somewhat special (ttyxx major==4, tty major==5) */
163     if (S_ISCHR(inode->i_mode))
164         if (MAJOR(inode->i_zone[0])==4) {
165             if (current->leader && current->tty<0) {
166                 current->tty = MINOR(inode->i_zone[0]);
167                 tty_table[current->tty].pgrp = current->pgrp;
                 对于主设备号为4的字符设备（ttyxx），如果当前进程是会话首领（leader）并且还没有控制终端（current->tty<0）：

166行把当前进程的tty设置为该设备的次设备号

               167行把tty_table中对应表项的进程组号（pgrp）设置为当前进程的进程组号
168             }
169         } else if (MAJOR(inode->i_zone[0])==5)
170             if (current->tty<0) {     //没有终端，出错返回
171                     iput(inode);
172                     current->filp[fd]=NULL;
173                     f->f_count=0;
174                     return -EPERM;
175             }

176         /* Likewise with block-devices: check for floppy_change */
177         if (S_ISBLK(inode->i_mode))
178             check_disk_change(inode->i_zone[0]);

179        f->f_mode = inode->i_mode;
180        f->f_flags = flag;
181         f->f_count = 1;
182         f->f_inode = inode;
183         f->f_pos = 0;
184         return (fd);
185 }

187 int sys_creat(const char * pathname, int mode)
188 {
189     return sys_open(pathname, O_CREAT | O_TRUNC, mode);
190 }

192 int sys_close(unsigned int fd)
193 {
194     struct file * filp;
196     if (fd >= NR_OPEN)
197         return -EINVAL;
198     current->close_on_exec &= ~(1<<fd);
199     if (!(filp = current->filp[fd]))
200         return -EINVAL;
201     current->filp[fd] = NULL;
202    if (filp->f_count == 0)
203         panic("Close: file count is 0");
204    if (--filp->f_count)
205         return (0);

上面看到我们把filp[fd]置为了NULL。还记得我们在打开文件时，是从系统file_table中搜索一个引用计数为0的file；现在我们在关闭时相应地递减了它的引用计数，当计数减到0时它就可以重新被使用，此时还要放回它对应的inode（206行）。
206 iput(filp->f_inode);
207 return (0);
208 }

另外还有一点需要说明一下：file_table是一个全局数组，用于存放系统中所有的file结构。按照C标准，全局数组默认会被初始化为0，不过内核并没有依赖这一点，而是在fs/super.c文件的mount_root中显式地对它进行了初始化

250 for(i=0;i<NR_FILE;i++)
251 file_table[i].f_count=0;
可以看到，只初始化f_count为0即可，我们在上面的sys_open中也看到了，它判断一个file空闲的标准就是根据这里的f_count是否为0

总结

大部分函数都是对inode的属性或者根据inode的属性进行一些操作或设置。看一下sys_open，它执行打开操作，操作对象可能是常规文件，也可能是设备等，打开的文件都要对应一个file，因此先从file_table中寻找空闲file。然后根据文件名执行打开操作，这个我们之前已经详细分析过了。操作成功后，如果打开的是tty设备则进行一些设置操作，最后初始化file的其他几个属性。对于sys_close，它根据指定的文件描述符清除close_on_exec中的相应的位，设current->filp[fd]为NULL，并递减相应文件的引用计数。

0 0