linux内核代码——页框的回收(2.4.18)

来源:互联网 发布:新时代网络宣传怎么做 编辑:程序博客网 时间:2024/05/01 16:58

写这篇blog参照了O'Reilly 《Understanding the Linux Kernel》,或者说是大部分参照,实际上这本书关于这章写得确实很经典。尽管如此,在看代码的过程中我仍然遇到了很多困惑,以至于看过两遍之后脑子里还是一团浆糊。所以还得感谢村爷帮我理清楚了思路,不敢怠慢,赶紧写下来防止忘记。(以下代码及解释均以2.4.18版本为准)

   管理页框回收的主要函数是try_to_free_pages,函数原型是int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order)。这个函数的参数:classzone--要回收的页框所在的管理区;gfp_mask--内存分配标志,它决定回收过程中允许做哪些操作(例如后面代码里只有设置了__GFP_FS才会等待页面回写或调用writepage),细节建议大家对照代码来看;order--所请求页框块大小的阶(即2的order次方个连续页框),只请求一个页框时为0,不过在下面的函数体中这个参数并没有被用到。看这个函数之前需要了解一下LRU链表,活动链表与非活动链表是页框回收的核心数据结构,请先补全这部分的知识!下面以代码分析:
/*
 * try_to_free_pages - main entry point for page-frame reclaim.
 *
 * @classzone: memory zone the reclaimed page frames should belong to.
 * @gfp_mask:  allocation flags; run through pf_gfp_mask() before use.
 * @order:     not used anywhere in this function body.
 *
 * Calls shrink_caches() in a loop, starting at DEF_PRIORITY and
 * decrementing the priority value after every unsuccessful pass, until
 * either the SWAP_CLUSTER_MAX target is met or priority reaches zero.
 * A numerically smaller priority makes shrink_cache() scan a larger
 * share of the inactive list (it scans nr_inactive_pages / priority).
 *
 * Returns 1 as soon as shrink_caches() reports nothing left to free.
 * Otherwise calls out_of_memory() and returns 0.
 */
int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order)
{
 int priority = DEF_PRIORITY;      /* starting (default) priority */
 int nr_pages = SWAP_CLUSTER_MAX;  /* page frames still to be freed */

 /* Let pf_gfp_mask() (defined elsewhere) adjust the flags first. */
 gfp_mask = pf_gfp_mask(gfp_mask);
 do {
  /* shrink_caches() returns how many pages remain to be freed. */
  nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages);
  if (nr_pages <= 0)
   return 1;
 } while (--priority);

 /*
  * Hmm.. Cache shrink failed - time to kill something?
  * Mhwahahhaha! This is the part I really like. Giggle.
  */
 /*
  * Every priority level failed to free SWAP_CLUSTER_MAX page frames.
  * Per the article's author, out_of_memory() picks a process to kill
  * so that enough frames are released, and choosing the victim is a
  * fairly involved procedure in its own right (not shown here).
  */
 out_of_memory();
 return 0;
}
这段代码的核心就是shrink_caches函数,下面是这个函数的代码
/*
 * shrink_caches - run one reclaim pass at the given priority.
 *
 * @classzone: zone handed on to shrink_cache().
 * @priority:  scan priority handed on to shrink_cache() and to the
 *             dentry/inode cache shrinkers.
 * @gfp_mask:  allocation flags handed on to every shrinker.
 * @nr_pages:  number of page frames the caller wants freed.
 *
 * Order of attempts: slab caches, then the inactive list (after topping
 * it up from the active list), then the dentry, inode and - when
 * CONFIG_QUOTA is set - quota caches.
 *
 * Returns 0 when the target was met by the slab reaper or by
 * shrink_cache(); otherwise the count left over by shrink_cache().
 */
static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
{
 const int target = nr_pages;
 unsigned long refill_count;
 int remaining;

 /* Slab caches first; if that alone covers the request we are done. */
 remaining = target - kmem_cache_reap(gfp_mask);
 if (remaining <= 0)
  return 0;

 /*
  * Not enough: whatever the slab reaper freed is not credited, the
  * following steps work against the full target again.
  *
  * try to keep the active list 2/3 of the size of the cache
  */
 refill_count = (unsigned long) target * nr_active_pages / ((nr_inactive_pages + 1) * 2);
 /* Move some pages from the active list over to the inactive list. */
 refill_inactive(refill_count);

 remaining = shrink_cache(target, classzone, gfp_mask, priority);
 if (remaining > 0) {
  /* Still short: shrink the dentry cache, then the inode cache. */
  shrink_dcache_memory(priority, gfp_mask);
  shrink_icache_memory(priority, gfp_mask);
#ifdef CONFIG_QUOTA
  shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif
  return remaining;
 }
 return 0;
}
   好吧,我们还完全没有讲到重点,shrink_cache函数才是页框回收的主要实现部分。首先,我们来补齐一点东西。可以被回收的页框是哪些呢?1.高速缓存中的页,这里就有page_cache、buffer_cache、目录项高速缓存及索引结点高速缓存;2.进程的匿名页(包括共享内存区)。还有一个需要知道的知识点就是page_cache与buffer_cache的区别。为什么要分这两种cache?page_cache是用于内核缓冲区管理部分的,而buffer_cache是用于设备驱动与设备之间交互的(有点绕口)。这么说吧:buffer_cache被设备驱动程序用来从一个块设备(比如硬盘)中读入一个块的数据(一般是1k)。例如四个buffer_cache分别存有从硬盘中读入的1,2,3,4四个数据块,那么page_cache中同样存有这1,2,3,4四个数据块。
   但是注意,至少在2.4.18版本的内核中已经取消了独立的buffer_cache,取而代之的是一个buffer_head数据结构,这个数据结构中的b_data成员指向其数据在page_cache中的位置。这么表达还是比较难以理解,换个说法:一个4k的page_cache页(以1k的块大小为例)被分成了四个1k的buffer_cache,而buffer_head用于指向这里面的buffer_cache。也就是说实际上已经不存在单独的buffer_cache了,因为一个page_cache页本身就相当于四个buffer_cache。
   那么下面看shrink_cache函数的代码就会收获良多:
/*
 * shrink_cache - scan the tail of the inactive list and free what it can.
 *
 * @nr_pages:  number of page frames still to be freed.
 * @classzone: only pages for which memclass(page->zone, classzone) holds
 *             are considered.
 * @gfp_mask:  allocation flags; __GFP_FS gates both waiting on pages that
 *             are being laundered and calling ->writepage().
 * @priority:  bounds the scan to nr_inactive_pages / priority entries, so
 *             a smaller value scans more of the list and priority == 1
 *             scans all of it.  (The article's author gives 6 as the
 *             lowest priority; DEF_PRIORITY itself is defined elsewhere.)
 *
 * Returns the number of pages that still remain to be freed (0 once the
 * target has been met).
 *
 * Locking: pagemap_lru_lock is taken on entry and is held at the top of
 * every loop iteration.  Every path that drops it re-takes it before
 * continuing, and it is released before either return.
 */
static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)
{
 struct list_head * entry;
 int max_scan = nr_inactive_pages / priority;  /* upper bound on entries examined */
 /* Budget of busy pages tolerated before falling back to swap_out(). */
 int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10);

 spin_lock(&pagemap_lru_lock);
 /* Each iteration looks at whatever currently sits at the list tail. */
 while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) {
  struct page * page;

  if (unlikely(current->need_resched)) {
   /* A reschedule was requested: drop the LRU lock and yield. */
   spin_unlock(&pagemap_lru_lock);
   __set_current_state(TASK_RUNNING);  /* stay runnable so we are scheduled again */
   schedule();
   spin_lock(&pagemap_lru_lock);
   continue;
  }

  page = list_entry(entry, struct page, lru);

  if (unlikely(!PageLRU(page)))
   BUG();
  if (unlikely(PageActive(page)))
   BUG();

  /*
   * Move the page from the tail to the head of the list so that the
   * next iteration sees a different page (round-robin scan).
   */
  list_del(entry);
  list_add(entry, &inactive_list);

  /*
   * Zero page counts can happen because we unlink the pages
   * _after_ decrementing the usage count..
   */
  /*
   * Per the article: a zero count means the frame already belongs to
   * the buddy allocator.  All later freeing decisions are likewise
   * driven by the page's reference count.
   */
  if (unlikely(!page_count(page)))
   continue;

  /* Skip pages whose zone does not match the zone being reclaimed. */
  if (!memclass(page->zone, classzone))
   continue;

  /* Racy check to avoid trylocking when not worthwhile */
  /*
   * No buffers attached and either extra references or no mapping
   * (anonymous page): someone is still using the page, so it cannot
   * be reclaimed here.  Charge it against the max_mapped budget.
   */
  if (!page->buffers && (page_count(page) != 1 || !page->mapping))
   goto page_mapped;

  /*
   * The page is locked. IO in progress?
   * Move it to the back of the list.
   */
  if (unlikely(TryLockPage(page))) {
   /* The page was already locked by someone else. */
   if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
    page_cache_get(page);
    spin_unlock(&pagemap_lru_lock);
    wait_on_page(page);  /* sleep (other tasks run) until the page is unlocked */
    page_cache_release(page);
    spin_lock(&pagemap_lru_lock);
   }
   continue;
  }

  /*
   * Dirty page that belongs to a mapping and has no users besides
   * the page cache (and possibly its buffers): try to write it out.
   * The article quotes the helper used for the "no other users" test:
   *
   *   static inline int is_page_cache_freeable(struct page * page)
   *   {
   *    return page_count(page) - !!page->buffers == 1;
   *   }
   *
   * Attached buffers account for one extra reference, which is
   * discounted before comparing the count with 1.
   */
  if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
   /*
    * It is not critical here to write it only if
    * the page is unmapped beause any direct writer
    * like O_DIRECT would set the PG_dirty bitflag
    * on the phisical page after having successfully
    * pinned it and after the I/O to the page is finished,
    * so the direct writes to the page cannot get lost.
    */
   int (*writepage)(struct page *);

   writepage = page->mapping->a_ops->writepage;
   if ((gfp_mask & __GFP_FS) && writepage) {
    ClearPageDirty(page);
    SetPageLaunder(page);
    page_cache_get(page);
    spin_unlock(&pagemap_lru_lock);

    /* The address_space's writepage method starts the write-back. */
    writepage(page);
    page_cache_release(page);

    spin_lock(&pagemap_lru_lock);
    continue;
   }
  }

  /*
   * If the page has buffers, try to free the buffer mappings
   * associated with this page. If we succeed we try to free
   * the page as well.
   */
  /*
   * Background from the article: 2.4.18 has no separate buffer cache,
   * so what gets released here are the buffer_head structures whose
   * b_data points into the page-cache page.  The article quotes
   * set_bh_page() to illustrate this:
   *
   *   void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
   *   {
   *    bh->b_page = page;
   *    if (offset >= PAGE_SIZE)
   *     BUG();
   *    if (PageHighMem(page))
   *     // This catches illegal uses and preserves the offset:
   *     bh->b_data = (char *)(0 + offset);
   *    else
   *     bh->b_data = page_address(page) + offset;
   *   }
   *
   * According to the article it is called from create_empty_buffers(),
   * which allocates only buffer_head structures (through
   * get_unused_buffer_head) and no separate data pages.
   * NOTE(review): the article describes offset as the device block
   * size (typically 1k or 512 bytes).  In the snippet it is bounded by
   * PAGE_SIZE and added to the page address, which looks like a byte
   * offset within the page - confirm against fs/buffer.c.
   */
  if (page->buffers) {
   spin_unlock(&pagemap_lru_lock);

   /* avoid to free a locked page */
   page_cache_get(page);

   if (try_to_release_page(page, gfp_mask)) {
    /* The buffers are gone; what happens next depends on the mapping. */
    if (!page->mapping) {
     /*
      * We must not allow an anon page
      * with no buffers to be visible on
      * the LRU, so we unlock the page after
      * taking the lru lock
      */
     spin_lock(&pagemap_lru_lock);
     UnlockPage(page);
     __lru_cache_del(page);  /* take the page off the LRU list */

     /* effectively free the page here */
     /*
      * Per the article, page_cache_release() boils down to:
      *
      *   void __free_pages(struct page *page, unsigned int order)
      *   {
      *    if (!PageReserved(page) && put_page_testzero(page))
      *     __free_pages_ok(page, order);
      *   }
      *   #define put_page_testzero(p)  atomic_dec_and_test(&(p)->count)
      *
      * so the frame is only handed back once the reference
      * count drops to zero.
      */
     page_cache_release(page);

     if (--nr_pages)
      continue;
     break;
    } else {
     /*
      * The page is still in pagecache so undo the stuff
      * before the try_to_release_page since we've not
      * finished and we can now try the next step.
      */
     page_cache_release(page);  /* only drops the reference taken above */

     spin_lock(&pagemap_lru_lock);
    }
   } else {
    /* failed to drop the buffers so stop here */
    UnlockPage(page);
    page_cache_release(page);

    spin_lock(&pagemap_lru_lock);
    continue;
   }
  }

  spin_lock(&pagecache_lock);

  /*
   * this is the non-racy check for busy page.
   */
  /* Busy means: anonymous (no mapping) OR still referenced elsewhere. */
  if (!page->mapping || !is_page_cache_freeable(page)) {
   spin_unlock(&pagecache_lock);
   UnlockPage(page);
page_mapped:
   /*
    * max_mapped counts down the pages that were scanned but could
    * not be freed because they are busy.  Keep scanning while the
    * budget lasts.
    */
   if (--max_mapped >= 0)
    continue;

   /*
    * Alert! We've found too many mapped pages on the
    * inactive list, so we start swapping out now!
    */
   spin_unlock(&pagemap_lru_lock);
   swap_out(priority, gfp_mask, classzone);  /* swap-out path; not covered in this article */
   return nr_pages;
  }

  /*
   * It is critical to check PageDirty _after_ we made sure
   * the page is freeable* so not in use by anybody.
   */
  if (PageDirty(page)) {
   spin_unlock(&pagecache_lock);
   UnlockPage(page);
   continue;
  }

  /* point of no return */
  /*
   * The page has a mapping and is clean.  What is left is to detach
   * it, either from an ordinary mapping or from the swap cache.
   */
  if (likely(!PageSwapCache(page))) {
   /*
    * Not in the swap cache, so (per the article) it is a file-mapped
    * page: remove it from its inode's page list.  The article quotes
    * the helper that __remove_inode_page() ends up calling:
    *
    *   static inline void remove_page_from_inode_queue(struct page * page)
    *   {
    *    struct address_space * mapping = page->mapping;
    *    mapping->nrpages--;
    *    list_del(&page->list);
    *    page->mapping = NULL;
    *   }
    *
    * page->list links the page into the address_space it belongs to,
    * e.g. swapper_space for the swap cache or
    * ((struct file *)filp)->f_dentry->d_inode->i_mapping for a file.
    * Per the article, __remove_inode_page() also removes the page
    * from the page-cache hash table.
    */
   __remove_inode_page(page);
   spin_unlock(&pagecache_lock);
  } else {
   swp_entry_t swap;

   swap.val = page->index;           /* swap entry identifier is kept in page->index */
   __delete_from_swap_cache(page);   /* remove the page from the swap cache */
   spin_unlock(&pagecache_lock);

   /*
    * NOTE(review): the article's original comment said this
    * increments the swap slot's usage count.  swap_free() is
    * expected to drop a reference on the slot instead - confirm
    * against mm/swapfile.c.
    */
   swap_free(swap);
  }

  __lru_cache_del(page);  /* take the page off the inactive list */
  UnlockPage(page);

  /* effectively free the page here */
  /*
   * By now the page passed the "not busy" check, is clean, and has
   * been detached from its mapping, so dropping this reference frees it.
   */
  page_cache_release(page);

  if (--nr_pages)
   continue;
  break;
 }
 spin_unlock(&pagemap_lru_lock);

 return nr_pages;
}
好了,最麻烦的shrink_cache函数在经过我左补右补后终于写完了,实际上有很多地方我仍然不敢过于肯定,如果发现错误我会及时来更正。下一篇应该是写一下swap_out了,似乎blog上也提到要单独讲一下的。
转载来自:http://wsqhs.spaces.live.com/Blog/cns!94F639580F58209C!490.entry
原创粉丝点击