linux内核代码——页框的回收(2.4.18)

来源：互联网 | 发布：新时代网络宣传怎么做 | 编辑：程序博客网 | 时间：2024/05/01 16:58

写这篇blog参照了O'Reilly 的《Understanding the Linux Kernel》，或者说是大部分参照，实际上这本书关于这章写得确实很经典。尽管如此，在看代码的过程中我仍然遇到了很多困惑，以至于看过两遍之后脑子里还是一团浆糊。所以还得感谢村爷帮我理清楚了思路，不敢怠慢，赶紧写下来防止忘记。（以下代码及解释均以2.4.18版本为准）

管理页框回收的主要函数是try_to_free_pages，函数原型是int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order)。这个函数的参数：classzone--要回收的页框所在的管理区；gfp_mask--分配标志（例如下面代码中用到的__GFP_FS），它决定回收过程中允许执行哪些操作，具体含义不好一句话解释，建议大家看代码；order--调用者请求的页框块的阶（即2^order个连续页框），只需要一个页框时是0，不过下面的函数体中并没有用到它。看这个函数之前需要了解一下LRU链表：活动链表与非活动链表是页框回收的核心数据结构，请先补全这部分的知识！下面以代码分析：

/*
 * try_to_free_pages - top-level entry point of page-frame reclaim.
 *
 * @classzone: zone handed on to shrink_caches()
 * @gfp_mask:  flags; passed through pf_gfp_mask() and then forwarded
 * @order:     not referenced anywhere in this body
 *
 * Calls shrink_caches() repeatedly, decrementing priority before each retry,
 * until the outstanding count drops to zero or below (returns 1) or
 * priority reaches 0. In the latter case out_of_memory() is called and
 * 0 is returned.
 */
int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order)
{
int priority = DEF_PRIORITY; // start at the default priority value
int nr_pages = SWAP_CLUSTER_MAX; // number of page frames this call tries to free

     gfp_mask = pf_gfp_mask(gfp_mask);                                  // run the flags through pf_gfp_mask() (defined outside this listing)
     do {
       nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages);       // each pass returns how many pages are still outstanding; the priority value is decremented per pass (a smaller value means higher urgency, per the article)
       if (nr_pages <= 0)
       return 1;
} while (--priority);

/*
* Hmm.. Cache shrink failed - time to kill something?
* Mhwahahhaha! This is the part I really like. Giggle.
*/
out_of_memory(); // no pass freed SWAP_CLUSTER_MAX pages; per the article, out_of_memory() picks a process to kill so that enough page frames are released

// NOTE(review): the next line is an aside by the article's author (roughly: "choosing which process to kill is complicated too"); it is not part of the kernel source and is not valid C.
哎~ 选择杀死的进程也足够复杂啊！！
return 0;
}

这段代码的核心就是shrink_caches函数，下面是这个函数的代码

/*
 * shrink_caches - one reclaim pass over the reclaimable caches.
 *
 * @classzone: zone forwarded to shrink_cache()
 * @priority:  forwarded to shrink_cache() and to the dentry/inode shrinkers
 * @gfp_mask:  flags forwarded to every helper called here
 * @nr_pages:  number of page frames the caller still wants freed
 *
 * Returns 0 as soon as either kmem_cache_reap() alone or shrink_cache()
 * covers the request; otherwise returns the count left over after
 * shrink_cache(). Whatever the dentry/inode/quota shrinkers release is
 * not subtracted from the returned value.
 */
static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
{
int chunk_size = nr_pages;
unsigned long ratio;

nr_pages -= kmem_cache_reap(gfp_mask); // first try the slab allocator caches; if that alone covers the request, return
if (nr_pages <= 0)
return 0;

nr_pages = chunk_size;                                                                // not enough: restore the original target and start over
/* try to keep the active list 2/3 of the size of the cache */
ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2);         // see the English comment above; the +1 keeps the divisor non-zero
refill_inactive(ratio);                                                                                           // per the article: moves some pages from the active list to the inactive list

nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority);
if (nr_pages <= 0)
return 0;

shrink_dcache_memory(priority, gfp_mask); // reclaim from the dentry cache
shrink_icache_memory(priority, gfp_mask); // reclaim from the inode cache
#ifdef CONFIG_QUOTA
shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif

return nr_pages;
}
好吧，我们还完全没有讲到重点，shrink_cache函数才是页框回收的主要实现部分。首先，我们来补齐一点东西。可以被回收的页框是哪些呢？1.高速缓存中的页，这里就有page_cache、buffer_cache、目录项高速缓存及索引结点高速缓存；2.进程的匿名页（包括共享内存区）。还有一个需要知道的知识点就是page_cache与buffer_cache的区别。为什么要分这两种cache？page_cache是用于内核缓冲区管理部分的，而buffer_cache是用于设备驱动与设备之间交互的（有点绕口），这么说吧，buffer_cache被设备驱动程序用来从一个块设备（比如硬盘）中读入一个块的数据（一般是1k），例如四个buffer_cache分别存有从硬盘中读入的1,2,3,4四个数据块，那么page_cache同样存有这1,2,3,4四个数据块。

但是注意，至少2.4.18版本的内核取消了buffer_cache，取代的是一个buffer_head的数据结构，这个数据结构中的b_data数据成员指向了其数据在page_cache中的位置。这么表达还是比较难以理解，换个说法：一个4k的page_cache被分成了四个1k的buffer_cache，而buffer_head用于指向这里面的buffer_cache。也就是说实际上已经不存在独立的buffer_cache了，因为page_cache就是四个buffer_cache。

那么下面看shrink_cache函数的代码就会收获良多：

static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)

//priority参数控制此函数扫描非活动链表的范围。例如priority = 6（最低优先级）时，最多扫描链表的1/6。随着priority值减小，优先级升高，扫描的页数递增；priority = 1时扫描整个链表。
{
struct list_head * entry;
int max_scan = nr_inactive_pages / priority; //扫描链表的大小~~ 刚才说到的！
int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10);

spin_lock(&pagemap_lru_lock);
while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) { //从inactive_list链表末尾开始扫描
struct page * page;

  if (unlikely(current->need_resched)) {                                                        //设置了need_resched标志，当前进程重新调度
   spin_unlock(&pagemap_lru_lock);
   __set_current_state(TASK_RUNNING);                                                      //设置当前进程调度状态，让它有机会再次被调度
   schedule();
   spin_lock(&pagemap_lru_lock);
   continue;
  }

page = list_entry(entry, struct page, lru);

  if (unlikely(!PageLRU(page)))
   BUG();
  if (unlikely(PageActive(page)))
   BUG();

list_del(entry);
list_add(entry, &inactive_list); //页从链表当前位置移动至头部，实现轮转扫描

  /*
   * Zero page counts can happen because we unlink the pages
   * _after_ decrementing the usage count..
   */
  if (unlikely(!page_count(page)))                     //页的引用计数为0，继续链表下一页扫描，为0说明页应当属于伙伴系统，以后的释放均是根据页的引用计数值
   continue;

if (!memclass(page->zone, classzone)) //检查页的内存管理区与classzone是否一致
continue;

/* Racy check to avoid trylocking when not worthwhile */
if (!page->buffers && (page_count(page) != 1 || !page->mapping)) //页上没有缓冲区（page->buffers为空），并且页的引用计数不等于1或者页是匿名页：这说明还有进程在占用页，暂时不能回收。

goto page_mapped;

  /*
   * The page is locked. IO in progress?
   * Move it to the back of the list.
   */
  if (unlikely(TryLockPage(page))) {                                               //page_locked标志置位
   if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
    page_cache_get(page);
    spin_unlock(&pagemap_lru_lock);
    wait_on_page(page);                                                           //在页面锁打开之前，调度其它进程
    page_cache_release(page);
    spin_lock(&pagemap_lru_lock);
   }
   continue;
  }

/*下面检查页面是否脏、页是否是匿名页以及当前页是否还有进程占用。查看is_page_cache_freeable代码为：

static inline int is_page_cache_freeable(struct page * page)
{
return page_count(page) - !!page->buffers == 1;
}

缓冲区也会导致page的引用计数加1，所以这里先减去这一份；此函数用来检测是否还有进程在占用此页。
*/

if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) //也就是检查页是否可以回收：如果可以，且是有名页（page->mapping非空）并且是脏页，则调用下面的代码把它写回磁盘。映射文件的页面是不能换出到swap的，所以这里分开来处理。

{
   /*
    * It is not critical here to write it only if
    * the page is unmapped beause any direct writer
    * like O_DIRECT would set the PG_dirty bitflag
    * on the phisical page after having successfully
    * pinned it and after the I/O to the page is finished,
    * so the direct writes to the page cannot get lost.
    */
   int (*writepage)(struct page *);

   writepage = page->mapping->a_ops->writepage;
   if ((gfp_mask & __GFP_FS) && writepage) {
    ClearPageDirty(page);
    SetPageLaunder(page);
    page_cache_get(page);
    spin_unlock(&pagemap_lru_lock);

writepage(page); //调用address_space对象的writepage方法，关于address_space对象请参阅源码！！
page_cache_release(page);

    spin_lock(&pagemap_lru_lock);
    continue;
   }
  }

  /*
   * If the page has buffers, try to free the buffer mappings
   * associated with this page. If we succeed we try to free
   * the page as well.
   */

/*如果page在缓冲区缓存中（page->buffers非空），则释放buffer_cache。由于2.4.18取消了buffer_cache，因此此处释放的是buffer_head占用的内存。关于buffer_cache和page_cache这里再次补充一下，下面这段代码相信可以让大家看得很清楚了：

void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
{
bh->b_page = page;
if (offset >= PAGE_SIZE)
  BUG();
if (PageHighMem(page))
  // This catches illegal uses and preserves the offset:
  bh->b_data = (char *)(0 + offset);
else
  bh->b_data = page_address(page) + offset;
}

这段代码由create_empty_buffers函数（通过create_buffers）调用，用来创建buffer_cache。源代码中并没有分配1k大小的页来作为buffer_cache，而只是分配了存放buffer_head结构的内存（函数为get_unused_buffer_head，可以自己查看代码）。再仔细分析上面的代码可以看到bh->b_data = page_address(page) + offset；其中offset是该缓冲区在页内的字节偏移（必须小于PAGE_SIZE，是块大小的整数倍），而块设备的块大小一般是1k或者512b。

*/
if (page->buffers) {
spin_unlock(&pagemap_lru_lock);

/* avoid to free a locked page */
page_cache_get(page);

   if (try_to_release_page(page, gfp_mask)) {               //释放buffer_cache,参照前面的讲解
    if (!page->mapping) {                                            //如果是匿名页，注意这里分支的原因：匿名页没有buffer是不允许挂在LRU链表中的（介个我也不敢肯定啊，望达人解答~）
     /*
      * We must not allow an anon page
      * with no buffers to be visible on
      * the LRU, so we unlock the page after
      * taking the lru lock
      */
     spin_lock(&pagemap_lru_lock);
     UnlockPage(page);
     __lru_cache_del(page);                                                  //lru链表中删除

/* effectively free the page here */

/*这里的代码也要看一下，其实质上是这个函数：

void __free_pages(struct page *page, unsigned int order)
{
if (!PageReserved(page) && put_page_testzero(page))
__free_pages_ok(page, order);
}

其中的put_page_testzero(page)的定义如下：

#define put_page_testzero(p) atomic_dec_and_test(&(p)->count)

也就是说page的引用计数减一后为0，才会调用回收页框的函数。

*/
page_cache_release(page);

     if (--nr_pages)
      continue;
     break;
    } else {
     /*
      * The page is still in pagecache so undo the stuff
      * before the try_to_release_page since we've not
      * finished and we can now try the next step.
      */
     page_cache_release(page);                                      //页有映像，它在页缓存中，因此此处只是将page的引用计数减一

     spin_lock(&pagemap_lru_lock);
    }
   } else {
    /* failed to drop the buffers so stop here */
    UnlockPage(page);
    page_cache_release(page);

    spin_lock(&pagemap_lru_lock);
    continue;
   }
  }

spin_lock(&pagecache_lock);

  /*
   * this is the non-racy check for busy page.
   */
  if (!page->mapping || !is_page_cache_freeable(page)) {                   //页是匿名页，或者页仍被其他使用者占用时
   spin_unlock(&pagecache_lock);
   UnlockPage(page);
page_mapped:
   if (--max_mapped >= 0)                                                               //检查阈值，为已扫描但未释放的此类页的个数。如果过多则换出后释放。
    continue;

   /*
    * Alert! We've found too many mapped pages on the
    * inactive list, so we start swapping out now!
    */
   spin_unlock(&pagemap_lru_lock);
   swap_out(priority, gfp_mask, classzone);                                        //换出函数，以后会单独再讲！！！！！
   return nr_pages;
  }

  /*
   * It is critical to check PageDirty _after_ we made sure
   * the page is freeable* so not in use by anybody.
   */
  if (PageDirty(page)) {
   spin_unlock(&pagecache_lock);
   UnlockPage(page);
   continue;
  }

  /* point of no return */
  if (likely(!PageSwapCache(page))) {                        //到这里页有磁盘映射，是干净的，检查是否在交换缓存区中
   __remove_inode_page(page);                               //不在，则说明页是与文件映射的，从inode链表中删除

/*这里补充一下__remove_inode_page函数的源码：

实际上是调用了这个函数-------

static inline void remove_page_from_inode_queue(struct page * page)
{
struct address_space * mapping = page->mapping;

mapping->nrpages--;
list_del(&page->list);
page->mapping = NULL;
}

主要是关于page->list链接的地方，实际上是通过这个成员将page链接到它所属的address_space的页链表上，例如swapper_space（交换高速缓存），或者是((struct file *)filp)->f_dentry->d_inode->i_mapping（文件对应的address_space）。当然请不要忘记__remove_inode_page还要从hash表中删除此页！！
*/
   spin_unlock(&pagecache_lock);
  } else {
   swp_entry_t swap;
   swap.val = page->index;                                   //是在交换缓存区，则获取页的交换区标识符
   __delete_from_swap_cache(page);                     //从交换缓存区删除
   spin_unlock(&pagecache_lock);
   swap_free(swap);                                            //交换区页槽引用计数减1
  }

__lru_cache_del(page); //从非活动链表中删除此页
UnlockPage(page);

/* effectively free the page here */
page_cache_release(page); //程序走到这里，基本上可以肯定此页没有进程占用且不在缓冲区中且干净，所以这里就是释放了

  if (--nr_pages)
   continue;
  break;
}
spin_unlock(&pagemap_lru_lock);

return nr_pages;
}

好了，最麻烦的shrink_cache函数在经过我左补右补后终于写完了，实际上有很多地方我仍然不敢过于肯定，如果发现错误我会及时来更正。下一篇应该是写一下swap_out了，似乎blog上也提到要单独讲一下的。

转载来自：http://wsqhs.spaces.live.com/Blog/cns!94F639580F58209C!490.entry