proc文件系统_每进程信息形成原理、目录遍历方式、位图查找

来源：互联网发布：淘宝时尚女装店名编辑：程序博客网时间：2024/05/29 08:05

我们知道在linux中，proc系统中对于每个进程都有一个进程相关的目录，里面描述了该进程各个方面详细的信息，本文探讨3个问题：
1： /proc目录下每进程子目录的形成[动态遍历当前进程列表形成]
2：每进程子目录下子目录/子文件的形成[静态数组]
3：/proc/net/子目录下子目录、子文件形成[系统初始化时形成]
内容：点击打开链接
0 首先介绍proc_dir_entry的层次结构
struct proc_dir_entry {
unsignedint low_ino;
unsignedshort namelen;
constchar *name;
mode_tmode;
nlink_tnlink;
uid_tuid;
gid_tgid;
loff_tsize;
const struct inode_operations *proc_iops;
/*
* NULL ->proc_fops means "PDE is goingaway RSN" or
* "PDE is just created". In eithercase, e.g. ->read_proc won't be
* called because it's too late or too early,respectively.
*
* If you're allocating ->proc_fopsdynamically, save a pointer
* somewhere.
*/
const struct file_operations *proc_fops;
structmodule *owner;
struct proc_dir_entry *next, *parent, *subdir;
…
};
所有的proc目录项都是通过next，parent，subdir组成树状连接结构，目录层次组织如下：
Root
|
A –> B –>C.. –>Root
|
A1–>A2…–>A
1 内核中如何遍历当前进程列表
我们知道当前系统中的所有进程在进程终止前都会存在于各种链表中，但是只有当进程处于就绪态时会被调度器调度执行，否则永远无法被调度；当进程终止时从链表中清除，从而结束其生命过程；linux中的进程由两个派系组成：内核线程，其根线程是kthreadd线程，进程号是2；用户进程，其根进程是init进程，进程号是1。1号和2号进程是并列的，在系统初始化后期由内核同时创建；用pstree无法看到kthreadd派系，用ps、top等命令、以及在proc系统下可以看到当前所有的用户进程和内核线程，遍历原理如下：
1.1 proc顶层目录结构的接口
/proc目录的遍历函数由proc目录的file_operations->readdir函数实现
static const struct file_operationsproc_root_operations = {
.read = generic_read_dir,
.readdir =proc_root_readdir,
};
注：proc目录下的内容大致如下：

该目录下的文件形成有以下特点：
1 内容一部分应该是静态形成的，比如fs目录，fb文件等等，这部分子目录、子文件在系统初始化时候，应该挂载在proc目录对应的proc_dir_entry链表下；
2 另外还有.和..子目录，分别是对当前目录和父目录的连接[ls –al可以显示]，内核对于上述猜测的实现为;
3 由数字组成的子目录显然是在每次读取proc内容时动态生成的，其表示当前系统所有进程的一些信息情况；
/*
 * readdir implementation for the /proc root directory.
 *
 * Positions below FIRST_PROCESS_ENTRY are served by proc_readdir()
 * under the big kernel lock; everything from FIRST_PROCESS_ENTRY on is
 * handed to proc_pid_readdir(), which emits the per-process entries.
 *
 * Returns the proc_readdir() result when that is <= 0, otherwise the
 * result of proc_pid_readdir().
 */
static int proc_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	unsigned int nr = filp->f_pos;
	int ret;

	lock_kernel();
	if (nr < FIRST_PROCESS_ENTRY) {
		/*
		 * Per the article: "." / ".." and the entries registered
		 * at initialisation time (parts 1 and 2 of its list).
		 */
		int error = proc_readdir(filp, dirent, filldir);

		if (error <= 0) {
			unlock_kernel();
			return error;
		}
		/* Non-process part finished; continue with the process range. */
		filp->f_pos = FIRST_PROCESS_ENTRY;
	}
	unlock_kernel();

	/* Per the article: the numeric per-process directories (part 3). */
	ret = proc_pid_readdir(filp, dirent, filldir);
	return ret;
}
系统中对于一个目录有多种读取子目录的方式，比如ls和ls –al显示的结果不同，这是由传入过程中对file->f_pos设定不同的偏移决定的，对于proc根目录而言，有以下特点：
f_pos = 0 为.目录链接，链接到自身
f_pos = 1 为..目录链接，链接到父目录
f_pos=[ 2 --- (FIRST_PROCESS_ENTRY-1) ]为proc下的静态目录或者静态文件
f_pos=[ FIRST_PROCESS_ENTRY --- (FIRST_PROCESS_ENTRY+ ARRAY_SIZE(proc_base_stuff)-1) ] 为self子目录内容
f_pos=[ FIRST_PROCESS_ENTRY+ ARRAY_SIZE(proc_base_stuff)] 为init_task即0号初始进程，至少可以这么认为
f_pos = PID_MAX_LIMIT + TGID_OFFSET; 标志着目录遍历结束， FIRST_PROCESS_ENTRY= 256
proc_readdir(filp, dirent, filldir)的实现比较简单，下面分析proc_pid_readdir(filp,dirent, filldir)的实现过程：
/*
 * Emit the process-related entries of the /proc root directory.
 *
 * First the fixed proc_base_stuff[] entries are emitted (the article
 * identifies these as the "self" entry), then one entry per thread
 * group found by walking next_tgid() upwards from the current position.
 *
 * Always returns 0. filp->f_pos records how far the walk got, and is
 * set to PID_MAX_LIMIT + TGID_OFFSET once every task has been emitted.
 */
int proc_pid_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	/* Index into proc_base_stuff[]; 0 when f_pos == FIRST_PROCESS_ENTRY. */
	unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
	/*
	 * Task associated with the directory inode.
	 * NOTE(review): the article annotates this as init_task - confirm.
	 */
	struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode);
	struct tgid_iter iter;
	struct pid_namespace *ns;

	if (!reaper)
		goto out_no_task;

	/* Fixed entries from proc_base_stuff[]. */
	for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
		const struct pid_entry *p = &proc_base_stuff[nr];

		if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
			goto out;
	}

	ns = filp->f_dentry->d_sb->s_fs_info;
	iter.task = NULL;
	iter.tgid = filp->f_pos - TGID_OFFSET;
	/*
	 * Core of the walk: next_tgid() yields the next existing task at or
	 * after iter.tgid, and proc_pid_fill_cache() emits its directory.
	 */
	for (iter = next_tgid(ns, iter);
	     iter.task;
	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
		filp->f_pos = iter.tgid + TGID_OFFSET;
		if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
			/* Drop the reference on the task we stopped at. */
			put_task_struct(iter.task);
			goto out;
		}
	}
	/* Walk completed: park f_pos at the end-of-directory position. */
	filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
out:
	put_task_struct(reaper);
out_no_task:
	return 0;
}
1.2 当前进程的遍历过程
for (iter = next_tgid(ns, iter);iter.task;iter.tgid+= 1, iter = next_tgid(ns, iter))
其中struct pid_namespace * ns = filp->f_dentry->d_sb->s_fs_info为pid结构的命名空间，可认为系统唯一。
遍历过程如下：首先获取进程号对应的struct pid结构，如下所示：
pid = find_ge_pid(iter.tgid, ns);如果pid存在，说明找到了一个进程，填写iter的进程号和进程task_struct结构，并返回；
由于按照进程号从小到大依次遍历的，如果没找到进程，说明进程遍历完成。
原函数实现如下：
/*
 * Used by proc to find the first pid that is greater than or equal to nr.
 *
 * If there is a pid at nr this function is exactly the same as find_pid_ns.
 *
 * Returns the struct pid found, or NULL once next_pidmap() reports no
 * further allocated number (a value <= 0).
 */
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
	struct pid *pid;

	do {
		pid = find_pid_ns(nr, ns);
		if (pid)
			break;
		/* No pid at nr: jump to the next number set in the pid bitmap. */
		nr = next_pidmap(ns, nr);
	} while (nr > 0);

	return pid;
}
1.2.1 进程struct pid的查找，根据pid变量填写task_struct变量和全局进程号信息
struct pid *find_pid_ns(int nr, structpid_namespace *ns)
{
structhlist_node *elem;
structupid *pnr;
hlist_for_each_entry_rcu(pnr, elem,
&pid_hash[pid_hashfn(nr, ns)],pid_chain)
if(pnr->nr == nr && pnr->ns == ns)
returncontainer_of(pnr, struct pid,
numbers[ns->level]);
returnNULL;
}
注:由于内核实现了命名空间，大大增加了内核复杂性，为了查找效率，进程号nr和其struct pid 结构变量的查找采用hash表实现，各个进程号nr对应的struct upid结构变量通过pid_chain链表挂接在链表头&pid_hash[pid_hashfn(nr, ns)]中，struct upid表示在当前pid命名空间中的局部进程号等信息，找到该nr对应的struct upid结构后，由于struct upid嵌入到该进程对应的struct pid中，所以很容易找到进程在全局范围的struct pid结构。
1.2.2 查找下一个存在进程的全局进程号nr
int next_pidmap(structpid_namespace *pid_ns, int last)
{
intoffset;
structpidmap *map, *end;
offset= (last + 1) & BITS_PER_PAGE_MASK;
map= &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];//该offset所在的页面
end= &pid_ns->pidmap[PIDMAP_ENTRIES]; //共8页，所以指向位置7出
for(; map < end; map++, offset = 0) {
if(unlikely(!map->page))
continue;
offset= find_next_bit((map)->page, BITS_PER_PAGE,offset);
if(offset < BITS_PER_PAGE)
return mk_pid(pid_ns, map, offset);
}
return-1;
}
1页大小为4KB，即BITS_PER_PAGE = 4096* 8bit
注: 当前系统中所有进程的全局进程号的分配情况记录在8个缓存页面上，共 8 × 4096 × 8 = 262144 个比特位，即约26万2千个（256K）进程号。
/*
 * Upper limit for pid numbers: PAGE_SIZE * 8 when CONFIG_BASE_SMALL is
 * non-zero; otherwise 4 * 1024 * 1024 when sizeof(long) > 4, else
 * PID_MAX_DEFAULT.
 */
#define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : (sizeof(long)> 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
这些缓存页面存在于pid_namespace->pidmap数组中，每个变量的结构如下：
/*
 * One page of the pid allocation bitmap; pid_namespace->pidmap[] is an
 * array of these.
 */
struct pidmap {
/* presumably the number of unallocated pids left in this page - confirm */
atomic_t nr_free;
/* bitmap page searched by find_next_bit(); NULL until it is allocated */
void *page;
};
1.2.3 在一个页面遍历位图原理(每个当前进程对应位图中该位为1)：
原理：对一个页面的查找，分两步：
1：将页面划分为以unsigned long大小为单元，定位下一个比特位单元
2：在该32位内部，确定下一个bit偏移位置
/*
 * Find the next set bit in a memory region.
 *
 * @addr:   start of the bitmap, as an array of unsigned long words
 * @size:   bitmap size in bits
 * @offset: bit index at which to start searching (inclusive)
 *
 * The search runs in three phases: a possibly partial first word, then
 * whole words, then a possibly partial last word.
 *
 * Returns the index of the first set bit at or after @offset, or @size
 * when there is none.
 */
unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
			    unsigned long offset)
{
	/* Word that contains bit `offset`. */
	const unsigned long *p = addr + BITOP_WORD(offset);
	/* Bit index of the first bit of that word (offset rounded down). */
	unsigned long result = offset & ~(BITS_PER_LONG-1);
	unsigned long tmp;

	if (offset >= size)
		return size;
	/* From here on, size counts the bits left starting at `result`. */
	size -= result;
	/* Bit position inside the first word. */
	offset %= BITS_PER_LONG;
	if (offset) {
		tmp = *(p++);
		/* Clear the low `offset` bits: they lie before the start. */
		tmp &= (~0UL << offset);
		if (size < BITS_PER_LONG)
			goto found_first;
		/* A set bit at or above `offset` in this word: done. */
		if (tmp)
			goto found_middle;
		size -= BITS_PER_LONG;
		result += BITS_PER_LONG;
	}
	/* Scan whole words while at least BITS_PER_LONG bits remain. */
	while (size & ~(BITS_PER_LONG-1)) {
		if ((tmp = *(p++)))
			goto found_middle;
		result += BITS_PER_LONG;
		size -= BITS_PER_LONG;
	}
	/* Nothing left: result now equals the original size. */
	if (!size)
		return result;
	tmp = *p;

found_first:
	/* Partial last word: keep only the `size` valid low bits. */
	tmp &= (~0UL >> (BITS_PER_LONG - size));
	if (tmp == 0UL)		/* Are any bits set? */
		return result + size;	/* Nope. */
found_middle:
	/* tmp holds at least one set bit; add its position to the word base. */
	return result + __ffs(tmp);
}
EXPORT_SYMBOL(find_next_bit);
1.2.4 32位内部确定下一个bit位置的函数
/**
 * __ffs - find first bit in word.
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 *
 * Returns the zero-based index of the least significant set bit. The
 * search halves the candidate range at each step: whenever the low half
 * is all zeroes, its width is added to the result and the word is
 * shifted down by that width.
 */
static inline unsigned long __ffs(unsigned long word)
{
	int num = 0;

#if BITS_PER_LONG == 64
	if ((word & 0xffffffff) == 0) {
		num += 32;
		word >>= 32;
	}
#endif
	if ((word & 0xffff) == 0) {
		num += 16;
		word >>= 16;
	}
	if ((word & 0xff) == 0) {
		num += 8;
		word >>= 8;
	}
	if ((word & 0xf) == 0) {
		num += 4;
		word >>= 4;
	}
	if ((word & 0x3) == 0) {
		num += 2;
		word >>= 2;
	}
	if ((word & 0x1) == 0)
		num += 1;
	return num;
}
1.3 每进程结构的建立
/*
 * Emit the /proc directory entry for one thread group.
 *
 * The entry name is the decimal form of iter.tgid. The entry itself is
 * created through proc_fill_cache(), with proc_pid_instantiate as the
 * instantiate callback and iter.task as the task.
 *
 * Returns whatever proc_fill_cache() returns.
 */
static int proc_pid_fill_cache(struct file *filp, void *dirent,
			       filldir_t filldir, struct tgid_iter iter)
{
	char name[PROC_NUMBUF];
	/* Directory name: the tgid printed in decimal. */
	int len = snprintf(name, sizeof(name), "%d", iter.tgid);

	return proc_fill_cache(filp, dirent, filldir, name, len,
			       proc_pid_instantiate, iter.task, NULL);
}
通过proc_pid_instantiate建立inode结构，并填写inode_operations和file_operations结构，其file_operations结构为proc_tgid_base_operations（其中提供了readdir），为每进程遍历提供了条件。然后在proc_fill_cache中建立dentry结构，并与父目录形成层次结构。
2 每进程信息的实现原理：
2.1 在proc/base.c文件中：
proc/num下文件布局如下：

内核静态定义了每进程的子目录结构数组如下：
/*
* Thread groups
*/
static const struct file_operationsproc_task_operations;
static const struct inode_operationsproc_task_inode_operations;
static const structpid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, task),
DIR("fd", S_IRUSR|S_IXUSR, fd),
DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, net),
#endif
REG("environ", S_IRUSR, environ),
INF("auxv", S_IRUSR, pid_auxv),
ONE("status", S_IRUGO, pid_status),
ONE("personality",S_IRUSR, pid_personality),
INF("limits", S_IRUSR, pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, pid_sched),
#endif
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
INF("syscall", S_IRUSR, pid_syscall),
#endif
INF("cmdline", S_IRUGO, pid_cmdline),
ONE("stat", S_IRUGO, tgid_stat),
ONE("statm", S_IRUGO, pid_statm),
REG("maps", S_IRUGO, maps),
#ifdef CONFIG_NUMA
REG("numa_maps", S_IRUGO, numa_maps),
#endif
REG("mem", S_IRUSR|S_IWUSR, mem),
LNK("cwd", cwd),
LNK("root", root),
LNK("exe", exe),
REG("mounts", S_IRUGO, mounts),
REG("mountinfo", S_IRUGO, mountinfo),
REG("mountstats",S_IRUSR, mountstats),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs",S_IWUSR, clear_refs),
REG("smaps", S_IRUGO, smaps),
REG("pagemap", S_IRUSR, pagemap),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
#endif
#ifdef CONFIG_KALLSYMS
INF("wchan", S_IRUGO, pid_wchan),
#endif
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
REG("latency", S_IRUGO, lstats),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
REG("cpuset", S_IRUGO, cpuset),
#endif
#ifdef CONFIG_CGROUPS
REG("cgroup", S_IRUGO, cgroup),
#endif
INF("oom_score", S_IRUGO, oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, loginuid),
REG("sessionid", S_IRUGO, sessionid),
#endif
#ifdef CONFIG_FAULT_INJECTION
REG("make-it-fail",S_IRUGO|S_IWUSR, fault_inject),
#endif
#if defined(USE_ELF_CORE_DUMP) &&defined(CONFIG_ELF_CORE)
REG("coredump_filter",S_IRUGO|S_IWUSR, coredump_filter),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
INF("io", S_IRUGO, tgid_io_accounting),
#endif
};
2.2 下面以net子目录为例，介绍该proc/num目录下net子目录建立过程：
DIR("net", S_IRUGO|S_IXUGO, net),
前提：
/*
 * Build a pid_entry initializer for a directory: ORs S_IFDIR into MODE
 * and pastes OTYPE into the names proc_<OTYPE>_inode_operations and
 * proc_<OTYPE>_operations, whose addresses become the entry's iop/fop.
 * The op member is passed as an empty initializer.
 */
#define DIR(NAME, MODE, OTYPE) \
NOD(NAME,(S_IFDIR|(MODE)), \
&proc_##OTYPE##_inode_operations,&proc_##OTYPE##_operations, \
{})
struct pid_entry {
char*name;
intlen;
mode_tmode;
conststruct inode_operations *iop;
conststruct file_operations *fop;
unionproc_op op;
};
/*
 * Expand to a designated initializer for one struct pid_entry.
 * .len is sizeof(NAME) - 1, so NAME must be a string literal (or array)
 * for the length to be the string length without the terminating NUL.
 * OP is used as given; DIR() passes a brace initializer for it, so it
 * must not be wrapped in parentheses.
 */
#define NOD(NAME, MODE, IOP, FOP, OP) { \
.name= (NAME), \
.len = sizeof(NAME) - 1, \
.mode= MODE, \
.iop = IOP, \
.fop = FOP, \
.op = OP, \
}
所以该项变为：
static const struct pid_entry tgid_base_stuff[] = {.., {.name = "net", .len = 3,
.mode = S_IFDIR|S_IRUGO|S_IXUGO, .iop = &proc_net_inode_operations,
.fop = &proc_net_operations, .op = {},},..}
2.3 proc/num/子目录下文件的安装过程
特定于pid的目录中读取该目录下的所有子目录或者子文件时，使用该目录的file_operations结构完成的，内核使用的pid目录的file_operations为：
staticconst struct file_operations proc_tid_base_operations = {
.read =generic_read_dir,
.readdir = proc_tid_base_readdir,
}; [注:readdir只对目录适用，用于读取该目录内容，即读取特定于目录的子目录或者子文件]，最终调用为：
proc_pident_readdir(filp,dirent,filldir,tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff))，
->proc_pident_fill_cache(filp,dirent, filldir, task, p)[对上述静态数组中每个目录项执行]
->proc_pident_instantiate为每个子目录建立inode等结构。
proc_pident_fill_cache建立inode后，建立dentry缓存，并将这些信息与父目录建立关系。
2.4下面以dev文件为例，介绍/proc/num/net下，子文件的建立过程
我们知道，net子目录的file_operations为proc_net_operations，其struct inode_operations为proc_net_inode_operations，两者最终调用的是proc_readdir_de(net->proc_net, filp, dirent, filldir)，其中并没有子文件的建立过程，说明文件或者子目录是在系统初始化时建立的，不是动态建立的。经过观察，每进程的dev文件统计的并不是该进程通过每个网络接口发送或接收的数据量，而是该进程所在的net空间发送或接收的数据量，一般情况下，系统只有一个struct net结构。net目录下的所有目录/文件都是以单链表形式串联在net->proc_net目录下面，该变量为struct proc_dir_entry*结构，正好对应proc文件系统中的一个目录或者文件结构。

Net目录下的结构布局如下（一个net命名空间对应一个net）：

那么这些文件是如何形成的？

其实，这些文件都是在内核代码的net顶层目录下的相应文件中实现的，像dev，arp之类的文件是在对应对象初始化时候创建的，比如在net/core/dev.c文件中，在dev_proc_net_init函数中通过proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)实现，该函数在net->proc_net目录下创建一个dev虚拟文件，且其操作函数为dev_seq_fops，其实现了dev文件的输出操作。