Filesystem internals, and designing a kernel buffer from filesystem principles

How does a process's fd get connected to an actual physical file?

On the process side, the relevant objects are the fd and the files_struct.
The latter contains an array of file structures, each representing an open-file instance inside the kernel.
The f_pos field of each file structure belongs to that particular open instance and records the current read/write offset.

Their relationship can be summarized as:

struct files_struct {
	struct file *fd_array[MAX_FD];	/* simplified */
};

An fd is an integer index that selects an element of fd_array.
The file structure relates to the inode many-to-one;
a given file reaches its inode through its address_space, i.e. file->f_mapping->host.
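Putting the chain together, a rough sketch of resolving an fd down to its inode. This follows the simplified files_struct above; real kernels resolve the fd through files_fdtable() under RCU, and fd_to_inode is a made-up helper:

#include <linux/fs.h>
#include <linux/sched.h>

/* Illustrative only: fd -> file -> address_space -> inode. */
static struct inode *fd_to_inode(struct task_struct *tsk, int fd)
{
	struct file *filp = tsk->files->fd_array[fd];	/* fd is just an index */

	if (filp == NULL)
		return NULL;
	return filp->f_mapping->host;	/* the address_space's host inode */
}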

Besides the file structure, there is another important structure that maps to an inode: the dentry.
A dentry's d_inode field points to the concrete inode, d_parent points to the parent directory's dentry,
and d_subdirs heads the list of child entries (each child links in through its d_child field). With a parent
pointer, a child list, and the inode pointer, we can pin down exactly where a directory entry sits.
So is dentry to inode a 1:1 relationship? No: in some cases, hard links being the classic one, multiple
directory entries end up referring to a single inode.
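A minimal sketch of what those pointers buy us, assuming 2.6-era field names: walking d_parent upward reconstructs a dentry's ancestry, which is essentially what the real d_path() helper does (with proper dcache locking that this toy loop omits):

#include <linux/dcache.h>

/* Illustrative: climb d_parent to print a dentry's ancestry,
 * innermost name first.  IS_ROOT(d) means d->d_parent == d. */
static void print_ancestry(struct dentry *d)
{
	while (!IS_ROOT(d)) {
		printk("%s <- ", d->d_name.name);
		d = d->d_parent;
	}
	printk("(root)\n");
}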

The point of mount is to create a vfsmount object and, hanging off it, an in-memory super_block object.
The super_block carries the filesystem's operation tables; each time a file is opened a file structure is created,
and the read/write operations of the file's inode are assigned from the operations the super_block provides.

If the filesystem on this device is already mounted (note: the same device, within the same filesystem type),
no new super_block is created; the existing one is returned. In the common case where the device has not been
mounted before, a new super_block object is allocated, the low-level routines of the opened block device are
invoked to read the on-disk superblock, and its contents fill in the new super_block's fields. Finally the
super_block's s_instances hook is linked onto the fs_supers list of the corresponding file_system_type.
fs_supers is the list head of all superblock objects of one filesystem type: for example, if /dev/sda1 and
/dev/sda2 are both ext3 and are mounted one after another, two super_block objects end up chained on the
ext3 file_system_type's fs_supers.
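The reuse-or-allocate decision described above is essentially what the VFS sget() routine implements. A stripped-down sketch, with all locking omitted and alloc_super_sketch() a hypothetical stand-in for the real allocator:

#include <linux/fs.h>

/* Illustrative sget()-style logic: reuse the super_block if this
 * device is already mounted under this file_system_type, else
 * allocate a new one and chain it onto fs_supers. */
static struct super_block *sget_sketch(struct file_system_type *type,
				       dev_t dev)
{
	struct super_block *s;

	list_for_each_entry(s, &type->fs_supers, s_instances)
		if (s->s_dev == dev)
			return s;		/* already mounted: reuse */

	s = alloc_super_sketch();		/* hypothetical helper */
	s->s_dev = dev;
	/* read the on-disk superblock via the block device and
	 * fill in s's fields here */
	list_add(&s->s_instances, &type->fs_supers);
	return s;
}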


Borrowing the design of the filesystem page cache, we can build a kernel buffer that allocates its page
frames from high memory and accesses them through kmap_atomic: the pages are discontiguous in physical
memory, yet the interface exposed to the caller behaves like one contiguous address range.
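The core of the idea fits in a few lines. A page allocated with GFP_HIGHUSER has no permanent kernel mapping, so each access opens a short-lived window with kmap_atomic and closes it immediately; this sketch uses the old two-argument kmap_atomic form that the full listing below also uses:

#include <linux/highmem.h>

/* Illustrative: copy bytes into one highmem page through a
 * temporary atomic mapping; a buffer chains many such pages. */
static void copy_to_highmem_page(struct page *page, const char *src,
				 unsigned long offset, unsigned long bytes)
{
	char *kaddr = kmap_atomic(page, KM_USER0);	/* per-cpu fixmap slot */

	memcpy(kaddr + offset, src, bytes);
	kunmap_atomic(kaddr, KM_USER0);
}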


A side note on kmap. kmap modifies the kernel master page table, which every process shares
(concretely, fork copies the master table's kernel pgd entries, which you can think of as pointers;
those pgd entries all point at the same shared pmd and pte tables).
Because the kmap area is populated all the way down to the pte level at system initialization,
processes forked afterwards never fault when touching kmap addresses.
vmalloc, by contrast, may install brand-new pgd entries, so a process whose page tables were copied
earlier lacks that range in its kernel address space and takes a fault.
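This is the logic of the x86-32 vmalloc fault handler: on a kernel-address fault it copies the missing pgd entry from the master table (init_mm's pgd, i.e. swapper_pg_dir) into the faulting process's page table. A sketch modeled on that path:

#include <linux/sched.h>
#include <asm/pgtable.h>

/* Illustrative, after the x86-32 vmalloc_fault(): sync one pgd
 * slot from the kernel master page table into current's table.
 * The lower pmd/pte levels are shared, so copying the pgd entry
 * is enough. */
static int sync_pgd_from_master(unsigned long address)
{
	pgd_t *pgd   = pgd_offset(current->active_mm, address);
	pgd_t *pgd_k = pgd_offset_k(address);	/* master table: init_mm */

	if (!pgd_present(*pgd_k))
		return -1;	/* not mapped even in the master table */
	set_pgd(pgd, *pgd_k);
	return 0;
}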

Can several processes executing kmap_atomic at the same time clash with each other? No.
Although every kmap_atomic path modifies the kernel master page table, the virtual address
kmap_atomic hands out is disjoint per cpu; since the virtual addresses never overlap, the pte
slots being written in the master table differ as well, and no synchronization is needed.
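You can read this off the way the 2.6-era x86 kmap_atomic derives its virtual address: the fixmap slot index mixes in smp_processor_id(), so two cpus can never pick the same pte. A sketch of just the address computation:

#include <linux/smp.h>
#include <asm/fixmap.h>
#include <asm/kmap_types.h>

/* Illustrative: the fixmap slot is a function of (type, cpu), so
 * concurrent kmap_atomic users on different cpus touch different
 * ptes in the shared master page table. */
static unsigned long kmap_atomic_vaddr(enum km_type type)
{
	int idx = type + KM_TYPE_NR * smp_processor_id();

	return __fix_to_virt(FIX_KMAP_BEGIN + idx);
}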

The complete code follows:

/*
 *  mm/critical_buf.c
 *
 *  Kernel critical buffer
 *
 *  Simple critical buffer
 *  for non-blockable environment
 *  there should only exist one instance at one time
 *
 *  Copyright (C) 1985-2012  ZTE
 *
 *  2012-08-21  created
 *  chenyu105 at gmail dot com
 */
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/cpu.h>
#include <linux/critical_buf.h>

static critical_cmd global_critical_cmd;
static percpu_pagecache buf_pagecache[NR_CPUS];

void reset_cpu_buf_pos(int cpu)
{
	buf_pagecache[cpu].read_pos = 0;
	buf_pagecache[cpu].write_pos = 0;
	buf_pagecache[cpu].over_flow = 0;
	buf_pagecache[cpu].last_writed_pos = 0;
}
EXPORT_SYMBOL(reset_cpu_buf_pos);

static int alloc_buf_on_cpu(int cpu, unsigned long size)
{
	int i = 0;
	struct page *page = NULL;
	unsigned long max_buf_pfn = size >> PAGE_SHIFT;

	/*
	 * buf in use
	 */
	if (buf_pagecache[cpu].pagecache_array != NULL)
		goto out;
retry:
	buf_pagecache[cpu].pagecache_array = (struct page **)
		kmalloc(max_buf_pfn * sizeof(struct page *), GFP_KERNEL);
	if (buf_pagecache[cpu].pagecache_array == NULL) {
		printk("alloc array failed,retry\n");
		goto retry;
	}
	for (; i < max_buf_pfn; ++i) {
		page = alloc_pages(GFP_HIGHUSER | __GFP_COLD, 0);
		if (page == NULL) {
			printk("alloc page failed,retry\n");
			i--;	/* retry this slot */
			continue;
		}
		buf_pagecache[cpu].pagecache_array[i] = page;
	}
out:
	reset_cpu_buf_pos(cpu);
	return 0;
}

static int free_buf_on_cpu(int cpu, unsigned long size)
{
	int i = 0;
	unsigned long max_buf_pfn = size >> PAGE_SHIFT;
	struct page *page = NULL;

	reset_cpu_buf_pos(cpu);
	for (; i < max_buf_pfn; ++i) {
		page = buf_pagecache[cpu].pagecache_array[i];
		if (page != NULL) {
			__free_pages(page, 0);
			buf_pagecache[cpu].pagecache_array[i] = NULL;
		}
	}
	if (buf_pagecache[cpu].pagecache_array != NULL) {
		kfree(buf_pagecache[cpu].pagecache_array);
		buf_pagecache[cpu].pagecache_array = NULL;
	}
	return 0;
}

static void byte_memcpy(void *to, const void *from, size_t n)
{
	const char *c_from = from;
	char *c_to = to;

	while (n-- > 0)
		*c_to++ = *c_from++;
}

/*
 * steal from generic_perform_write
 */
static int generic_write_buf_cpu(int cpu, char *outbuf,
				 unsigned long len, unsigned long *ppos)
{
	unsigned long pos = 0;		/* last write pos within buf */
	unsigned long index = 0;	/* pfn within buf */
	unsigned long offset = 0;	/* offset within page */
	unsigned long writed = 0;	/* writed within outbuf */

	pos = *ppos;
	while (len) {
		struct page *page;
		unsigned long bytes;
		char *kaddr;

		index = pos >> PAGE_SHIFT;
		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset, len);
		page = buf_pagecache[cpu].pagecache_array[index];
		kaddr = kmap_atomic(page, KM_USER0);
		byte_memcpy(kaddr + offset, outbuf + writed, bytes);
		kunmap_atomic(kaddr, KM_USER0);
		pos += bytes;
		len -= bytes;
		writed += bytes;
	}
	*ppos = pos;
	return writed;
}

/*
 * steal from do_generic_file_read
 */
static int generic_read_buf_cpu(int cpu, char *inbuf,
				unsigned long len, unsigned long *ppos)
{
	unsigned long pos = 0;		/* pos within buf */
	unsigned long index = 0;	/* pfn within buf */
	unsigned long offset = 0;	/* offset within page */
	unsigned long readed = 0;	/* readed within inbuf */

	pos = *ppos;
	while (len) {
		struct page *page;
		unsigned long bytes;
		char *kaddr;

		index = pos >> PAGE_SHIFT;
		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset, len);
		page = buf_pagecache[cpu].pagecache_array[index];
		kaddr = kmap_atomic(page, KM_USER0);
		byte_memcpy(inbuf + readed, kaddr + offset, bytes);
		kunmap_atomic(kaddr, KM_USER0);
		pos += bytes;
		len -= bytes;
		readed += bytes;
	}
	*ppos = pos;
	return readed;
}

/*
 * force = 1, wrapped writing
 * force = 0, write limited to buf size
 */
int critical_write_buf_cpu(int cpu, char *outbuf, unsigned long len, int force)
{
	unsigned long *last_ppos = &buf_pagecache[cpu].write_pos;
	unsigned long last_pos = *last_ppos;
	unsigned long this_writed = 0;

	if (force) {
write_continue:
		if (last_pos + len > global_critical_cmd.buf_size) {
			/*
			 * wrapped writing
			 */
			int left = global_critical_cmd.buf_size - last_pos;

			buf_pagecache[cpu].over_flow = 1;
			this_writed += generic_write_buf_cpu(cpu, outbuf, left, last_ppos);
			*last_ppos = 0;
			last_pos = 0;
			len -= left;
			outbuf += left;
			goto write_continue;
		} else
			this_writed += generic_write_buf_cpu(cpu, outbuf, len, last_ppos);
	} else {
		if (last_pos + len > global_critical_cmd.buf_size) {
			int left = global_critical_cmd.buf_size - last_pos;

			this_writed += generic_write_buf_cpu(cpu, outbuf, left, last_ppos);
		} else {
			this_writed += generic_write_buf_cpu(cpu, outbuf, len, last_ppos);
		}
	}
	return this_writed;
}
EXPORT_SYMBOL(critical_write_buf_cpu);

/*
 * force = 1, wrapped reading
 * force = 0, start from writed if not writed wrapped,
 * otherwise start from readed to writed
 */
int critical_read_buf_cpu(int cpu, char *inbuf, unsigned long len, int force)
{
	unsigned long *writedp = &buf_pagecache[cpu].write_pos;
	unsigned long *readedp = &buf_pagecache[cpu].read_pos;
	unsigned long *last_writedp = &buf_pagecache[cpu].last_writed_pos;
	unsigned long writed = *writedp;
	unsigned long last_writed = *last_writedp;
	unsigned long readed = *readedp;
	unsigned long to_read;
	unsigned long this_readed = 0;

	if (force) {
read_continue:
		if (readed + len > global_critical_cmd.buf_size) {
			/*
			 * wrapped reading
			 */
			to_read = global_critical_cmd.buf_size - readed;
			this_readed += generic_read_buf_cpu(cpu, inbuf, to_read, readedp);
			*readedp = 0;
			readed = 0;
			inbuf += to_read;
			len -= to_read;
			goto read_continue;
		} else
			this_readed += generic_read_buf_cpu(cpu, inbuf, len, readedp);
	} else {
		if (buf_pagecache[cpu].over_flow) {
			/*
			 * If writed pos changed, update readed pos,
			 * we are supposed to read the oldest data.
			 */
			if (writed != last_writed) {
				*last_writedp = writed;
				*readedp = writed;
				readed = writed;
			}
			goto read_continue;
		} else {
			this_readed += generic_read_buf_cpu(cpu, inbuf,
					min_t(unsigned long, len, writed - readed), readedp);
		}
	}
	return this_readed;
}
EXPORT_SYMBOL(critical_read_buf_cpu);

int critical_initial_buffer(critical_cmd *cmd)
{
	int i = 0;
	cpumask_t cpumask = cmd->cpu_critical_map;
	unsigned long size = cmd->buf_size;
	/* cap: the page-pointer array itself must fit in one kmalloc */
	unsigned long max_buf_size =
		(MAX_KMALLOC_SIZE / sizeof(unsigned long)) * PAGE_SIZE;

	if (size > max_buf_size)
		size = max_buf_size;
	global_critical_cmd.cpu_critical_map = cpumask;
	global_critical_cmd.buf_size = size;
	for_each_cpu_mask(i, cpumask)
		alloc_buf_on_cpu(i, size);
	return 0;
}
EXPORT_SYMBOL(critical_initial_buffer);

int critical_free_buffers(void)
{
	int i = 0;

	for_each_cpu_mask(i, global_critical_cmd.cpu_critical_map)
		free_buf_on_cpu(i, global_critical_cmd.buf_size);
	return 0;
}
EXPORT_SYMBOL(critical_free_buffers);

int critical_cpu_has_buf(int cpu)
{
	return cpu_isset(cpu, global_critical_cmd.cpu_critical_map);
}
EXPORT_SYMBOL(critical_cpu_has_buf);

void critical_reset_buf_pos(void)
{
	int i = 0;

	for_each_cpu_mask(i, global_critical_cmd.cpu_critical_map)
		reset_cpu_buf_pos(i);
}
EXPORT_SYMBOL(critical_reset_buf_pos);

/*
 *  kernel/sched_monitor.c
 *
 *  Kernel scheduler switch info
 *
 *  Copyright (C) 1985-2012  ZTE
 *
 *  2012-08-21  created
 *  chenyu105 at gmail dot com
 *
 */
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/cpuset.h>
#include <linux/kallsyms.h>
#include <linux/critical_buf.h>
#include "rtmutex_common.h"

typedef void (*hook_func)(struct task_struct *prev, struct task_struct *next);

static hook_func callback_func = NULL;
static int system_recording = 0;
static DEFINE_RWLOCK(hook_rwlock);
static unsigned long task_switchin_time[NR_CPUS];

static const char *task_state_array[] = {
	"R (running)",		/*  0 */
	"S (sleeping)",		/*  1 */
	"D (disk sleep)",	/*  2 */
	"T (stopped)",		/*  4 */
	"T (tracing stop)",	/*  8 */
	"Z (zombie)",		/* 16 */
	"X (dead)"		/* 32 */
};

static inline const char *get_task_state(struct task_struct *tsk)
{
	unsigned int state = (tsk->state & (TASK_RUNNING |
					    TASK_INTERRUPTIBLE |
					    TASK_UNINTERRUPTIBLE |
					    TASK_STOPPED |
					    TASK_TRACED)) |
			     (tsk->exit_state & (EXIT_ZOMBIE |
						 EXIT_DEAD));
	const char **p = &task_state_array[0];

	while (state) {
		p++;
		state >>= 1;
	}
	return *p;
}

void sched_hook_internal(struct task_struct *prev, struct task_struct *next)
{
	/* flag and rwlock cooperate
	 * to make high prio writer.
	 * be careful.
	 */
	int cpu = task_cpu(prev);

	if (system_recording && critical_cpu_has_buf(cpu)) {
		if (read_trylock(&hook_rwlock)) {
			if (callback_func != NULL)
				callback_func(prev, next);
			read_unlock(&hook_rwlock);
		}
	}
}
EXPORT_SYMBOL(sched_hook_internal);

/*
 *  For external use
 */
int sched_sprint_symbol(char *buffer, unsigned long addr)
{
	return sprint_symbol(buffer, addr);
}
EXPORT_SYMBOL(sched_sprint_symbol);

/*
 *  Weird?
 */
unsigned long sched_get_task_prevtime(struct task_struct *prev)
{
	unsigned long delta = 0;
	int cpu = task_cpu(prev);

	if (task_switchin_time[cpu] == 0) {
		task_switchin_time[cpu] = sched_clock();
		return 0;
	}
	delta = sched_clock() - task_switchin_time[cpu];
	task_switchin_time[cpu] = sched_clock();
	return delta;
}
EXPORT_SYMBOL(sched_get_task_prevtime);

int sched_rec_task_block_info(struct task_struct *task, char *buffer)
{
	struct task_struct *lock_owner = NULL;
	int len = 0;

	if (((task->pi_blocked_on) != NULL) &&
	    ((((struct rt_mutex_waiter *)(task->pi_blocked_on))->lock) != NULL) &&
	    ((((task->pi_blocked_on)->lock)->owner) != NULL)) {
		lock_owner = rt_mutex_owner(task->pi_blocked_on->lock);
		len += sprintf(buffer + len, "blocked by task:\n");
		len += sprintf(buffer + len, "%d (%s)  %c  %d   %lu\n",
			       lock_owner->pid, lock_owner->comm,
			       *get_task_state(lock_owner),
			       task_cpu(lock_owner), lock_owner->rt_priority);
	}
	return len;
}
EXPORT_SYMBOL(sched_rec_task_block_info);

struct pt_regs *sched_task_pt_regs(struct task_struct *prev)
{
#ifndef task_pt_regs
/* Work-around for PPC */
#define task_pt_regs(task) (task->thread.regs)
#endif
	return (struct pt_regs *)task_pt_regs(prev);
}
EXPORT_SYMBOL(sched_task_pt_regs);

int sched_smp_processor_id(void)
{
	return smp_processor_id();
}
EXPORT_SYMBOL(sched_smp_processor_id);

void sched_switchHook_hook_regist(hook_func hook)
{
	system_recording = 0;
	write_lock(&hook_rwlock);
	callback_func = hook;
	write_unlock(&hook_rwlock);
}
EXPORT_SYMBOL(sched_switchHook_hook_regist);

void sched_switchHook_hook_delete(void)
{
	system_recording = 0;
	write_lock(&hook_rwlock);
	callback_func = NULL;
	write_unlock(&hook_rwlock);
	critical_free_buffers();
}
EXPORT_SYMBOL(sched_switchHook_hook_delete);

void sched_switchHook_hook_start(critical_cmd *cmd)
{
	critical_initial_buffer(cmd);
	system_recording = 1;
}
EXPORT_SYMBOL(sched_switchHook_hook_start);

void sched_switchHook_hook_stop(void)
{
	system_recording = 0;
}
EXPORT_SYMBOL(sched_switchHook_hook_stop);
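To show the intended call order of these exports, here is a hypothetical client module: register a hook, then start recording with a cpu mask and a per-cpu buffer size. my_switch_hook, monitor_init and the 1 MiB size are made up for illustration, and the exports are assumed to be declared in linux/critical_buf.h or a companion header:

#include <linux/module.h>
#include <linux/critical_buf.h>

/* Hypothetical hook: log "prev -> next" pids on every context switch. */
static void my_switch_hook(struct task_struct *prev, struct task_struct *next)
{
	char line[64];
	int cpu = sched_smp_processor_id();
	int len = sprintf(line, "%d -> %d\n", prev->pid, next->pid);

	critical_write_buf_cpu(cpu, line, len, 1);	/* 1 = wrap when full */
}

static int __init monitor_init(void)
{
	critical_cmd cmd = {
		.cpu_critical_map = cpu_online_map,	/* buffer every online cpu */
		.buf_size	  = 1 << 20,		/* 1 MiB per cpu, assumed */
	};

	sched_switchHook_hook_regist(my_switch_hook);
	sched_switchHook_hook_start(&cmd);
	return 0;
}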