Linux内核源代码情景分析-内存管理之用户页面的换入_html/css_WEB-ITnose

在下面几种情况下会发生页面出错异常（也叫缺页中断）： 1、相应的页面目录项或者页面表项为空，也就是该线性地址与物理地址的映射关系尚未建立，或者已经撤销。 2、相应的物理页面不在内存中。本文讨论的就是这种情况。 3、指令中规定的访问方式与页面的权限不符，例如企图写一个“只读”的页面。假设已经建立好了映射，但是页表项最后一位P为0，表示页面不在内存中；整个页表项如下图，offset表示页面在一个磁盘设备的位置，也就是磁盘设备的逻辑页面号；而type则是指该页面在哪一个磁盘设备中。图 1 页面交换项结构这里假定CPU的运行已经到达了页面异常服务程序的主体do_page_fault()的入口处。代码如下： arch/i386/mm/fault.c asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code){struct task_struct *tsk;struct mm_struct *mm;struct vm_area_struct * vma;unsigned long address;unsigned long page;unsigned long fixup;int write;siginfo_t info;/* get the address */__asm__("movl %%cr2,%0":"=r" (address));//把映射失败的地址保存在address中tsk = current;//task_struct/* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. */if (address >= TASK_SIZE)goto vmalloc_fault;mm = tsk->mm;//mm_structinfo.si_code = SEGV_MAPERR;/* * If we're in an interrupt or have no user * context, we must not take the fault.. */if (in_interrupt() || !mm)goto no_context;down(&mm->mmap_sem);vma = find_vma(mm, address);//找出结束地址大于给定地址的第一个区间。if (!vma)//没有找到，说明没有一个区间的结束地址高于给定的地址，参考上图，说明这个地址是在堆栈之上，也就是3G字节以上了。goto bad_area;if (vma->vm_start <= address)goto good_area;if (!(vma->vm_flags & VM_GROWSDOWN))goto bad_area;..../* * Ok, we have a good vm_area for this memory access, so * we can handle it.. 
*/good_area:info.si_code = SEGV_ACCERR;write = 0;switch (error_code & 3) {// 110 & 011 = 2default:/* 3: write, present */#ifdef TEST_VERIFY_AREAif (regs->cs == KERNEL_CS)printk("WP fault at %08lx\n", regs->eip);#endif/* fall through */case 2:/* write, not present */if (!(vma->vm_flags & VM_WRITE))goto bad_area;write++;//执行到这里break;case 1:/* read, present */goto bad_area;case 0:/* read, not present */if (!(vma->vm_flags & (VM_READ | VM_EXEC)))goto bad_area;}/* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */switch (handle_mm_fault(mm, vma, address, write)) {case 1:tsk->min_flt++;break;case 2:tsk->maj_flt++;break;case 0:goto do_sigbus;default:goto out_of_memory;}/* * Did it hit the DOS screen memory VA from vm86 mode? */if (regs->eflags & VM_MASK) {unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;if (bit < 32)tsk->thread.screen_bitmap |= 1 << bit;}up(&mm->mmap_sem);return; .......}登录后复制内核的中断/异常响应机制还传过来两个参数。一个是pt_regs结构指针regs，它指向例外发生前夕CPU中各寄存器内容的一份副本。而error_code则进一步指明映射失败的具体原因。 error_code: bit 0 == 0 means no page found, 1 means protection fault bit 1 == 0 means read, 1 means write bit 2 == 0 means kernel, 1 means user-mode 此时，error_code为110，用户态，页面不在内存中，写。 handle_mm_fault函数，代码如下： int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,unsigned long address, int write_access){int ret = -1;pgd_t *pgd;pmd_t *pmd;pgd = pgd_offset(mm, address);//返回页目录项指针pmd = pmd_alloc(pgd, address);//中转了一下，还是页目录表项指针if (pmd) {pte_t * pte = pte_alloc(pmd, address);//返回指向页表项的指针if (pte)ret = handle_pte_fault(mm, vma, address, write_access, pte);}return ret;}登录后复制 handle_pte_fault函数，如下： static inline int handle_pte_fault(struct mm_struct *mm,struct vm_area_struct * vma, unsigned long address,int write_access, pte_t * pte){pte_t entry;/* * We need the page table lock to synchronize with kswapd * and the SMP-safe atomic PTE updates. 
*/spin_lock(&mm->page_table_lock);entry = *pte;//页表项中内容if (!pte_present(entry)) {//页面不在内存中/* * If it truly wasn't present, we know that kswapd * and the PTE updates will not touch it later. So * drop the lock. */spin_unlock(&mm->page_table_lock);if (pte_none(entry))//页表项为空，即映射尚未建立return do_no_page(mm, vma, address, write_access, pte);return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);//页表项非空但页面不在内存中，执行到这里}if (write_access) {if (!pte_write(entry))return do_wp_page(mm, vma, address, pte, entry);entry = pte_mkdirty(entry);}entry = pte_mkyoung(entry);establish_pte(vma, address, pte, entry);spin_unlock(&mm->page_table_lock);return 1;}登录后复制 do_swap_page函数，如下： static int do_swap_page(struct mm_struct * mm,struct vm_area_struct * vma, unsigned long address,pte_t * page_table, swp_entry_t entry, int write_access){struct page *page = lookup_swap_cache(entry);//从hash表中寻找pte_t pte;if (!page) {lock_kernel();swapin_readahead(entry);//预读页面page = read_swap_cache(entry);//真正得到一个页面，这个页面可能从hash表中寻找到，因为上面预读了。或者自己申请页面，并且从盘上将其内容读进来。unlock_kernel();if (!page)return -1;flush_page_to_ram(page);flush_icache_page(vma, page);}mm->rss++;pte = mk_pte(page, vma->vm_page_prot);//形成页表项/* * Freeze the "shared"ness of the page, ie page_count + swap_count. * Must lock page before transferring our swap count to already * obtained page count. */lock_page(page);swap_free(entry);if (write_access && !is_page_shared(page))pte = pte_mkwrite(pte_mkdirty(pte));//把页表项设置为可写，并标记为已写过（脏）UnlockPage(page);set_pte(page_table, pte);//页表项(属性刚才已经设置了)指向对应的页面/* No need to invalidate - it was non-present before */update_mmu_cache(vma, address, pte);return 1;/* Minor fault */}登录后复制一、下面分别解释各个函数。首先解释swapin_readahead函数，如下： void swapin_readahead(swp_entry_t entry){int i, num;struct page *new_page;unsigned long offset;/* * Get the number of handles we should do readahead io to. Also, * grab temporary references on them, releasing them as io completes. 
*/num = valid_swaphandles(entry, &offset);for (i = 0; i < num; offset++, i++) {/* Don't block on I/O for read-ahead */if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster * (1 << page_cluster)) {while (i++ < num)swap_free(SWP_ENTRY(SWP_TYPE(entry), offset++));break;}/* Ok, do the async read-ahead now */new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);if (new_page != NULL)page_cache_release(new_page);swap_free(SWP_ENTRY(SWP_TYPE(entry), offset));}return;}登录后复制提前预读相邻的盘上页面，根据下面的描述，__get_free_page之后，page使用计数为1；add_to_swap_cache之后，page使用计数再加1；此时page_cache_release，page使用计数又变成了1。直到有进程认领，才变成2。 read_swap_cache_async函数，如下： struct page * read_swap_cache_async(swp_entry_t entry, int wait){struct page *found_page = 0, *new_page;unsigned long new_page_addr;/* * Make sure the swap entry is still in use. */if (!swap_duplicate(entry))/* Account for the swap cache */goto out;/* * Look for the page in the swap cache. */found_page = lookup_swap_cache(entry);//假设没有找到if (found_page)goto out_free_swap;new_page_addr = __get_free_page(GFP_USER);//刚申请的page结构，使用计数为1if (!new_page_addr)goto out_free_swap;/* Out of memory */new_page = virt_to_page(new_page_addr);//转化成对应的page结构指针/* * Check the swap cache again, in case we stalled above. */found_page = lookup_swap_cache(entry);//假设没有找到if (found_page)goto out_free_page;/* * Add it to the swap cache and read its contents. */lock_page(new_page);add_to_swap_cache(new_page, entry);//加入到对应的链表上rw_swap_page(READ, new_page, wait);//真正的把磁盘上的数据读到新申请的page上，等待块设备驱动一章再来看return new_page;out_free_page:page_cache_release(new_page);out_free_swap:swap_free(entry);out:return found_page;}登录后复制 add_to_swap_cache函数是重点，代码如下： void add_to_swap_cache(struct page *page, swp_entry_t entry){unsigned long flags;#ifdef SWAP_CACHE_INFOswap_cache_add_total++;#endifif (!PageLocked(page))BUG();if (PageTestandSetSwapCache(page))BUG();if (page->mapping)BUG();flags = page->flags & ~((1 << PG_error) | (1 << PG_arch_1));page->flags = flags | (1 << PG_uptodate);add_to_page_cache_locked(page, &swapper_space, entry.val);}登录后复制 add_to_page_cache_locked函数，代码如下： void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index){if (!PageLocked(page))BUG();page_cache_get(page);//增加了使用计数，现在使用计数为2spin_lock(&pagecache_lock);page->index = index;//index存着页面交换项add_page_to_inode_queue(mapping, page);//page->list链入mapping->clean_pagesadd_page_to_hash_queue(page, page_hash(mapping, 
index));//page->next_hash和page->pprev_hash链入全局的Hash表lru_cache_add(page);//page->lru链入了全局的active_listspin_unlock(&pagecache_lock);}登录后复制 add_page_to_inode_queue函数，代码如下： static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page){struct list_head *head = &mapping->clean_pages;mapping->nrpages++;list_add(&page->list, head);//page->list链入mapping->clean_pagespage->mapping = mapping;//mapping指向了swapper_space}登录后复制 struct address_space swapper_space = {LIST_HEAD_INIT(swapper_space.clean_pages),LIST_HEAD_INIT(swapper_space.dirty_pages),LIST_HEAD_INIT(swapper_space.locked_pages),0,/* nrpages*/&swap_aops,};登录后复制 add_page_to_hash_queue函数，如下： static void add_page_to_hash_queue(struct page * page, struct page **p){struct page *next = *p;//page->next_hash和page->pprev_hash链入全局的Hash表*p = page;page->next_hash = next;page->pprev_hash = p;if (next)next->pprev_hash = &page->next_hash;if (page->buffers)PAGE_BUG(page);atomic_inc(&page_cache_size);}登录后复制 lru_cache_add函数，如下： void lru_cache_add(struct page * page){spin_lock(&pagemap_lru_lock);if (!PageLocked(page))BUG();DEBUG_ADD_PAGEadd_page_to_active_list(page);/* This should be relatively rare */if (!page->age)deactivate_page_nolock(page);spin_unlock(&pagemap_lru_lock);}登录后复制 add_page_to_active_list函数，如下： #define add_page_to_active_list(page) { \DEBUG_ADD_PAGE \ZERO_PAGE_BUG \SetPageActive(page); \list_add(&(page)->lru, &active_list); \ //page->lru链入了全局的active_listnr_active_pages++; \ //全局的nr_active_pages加1}登录后复制二、下面解释read_swap_cache函数，如下： #define read_swap_cache(entry) read_swap_cache_async(entry, 1);登录后复制还是调用read_swap_cache_async函数，只是本次执行，很可能从lookup_swap_cache函数，找到了page。 struct page * read_swap_cache_async(swp_entry_t entry, int wait){struct page *found_page = 0, *new_page;unsigned long new_page_addr;/* * Make sure the swap entry is still in use. */if (!swap_duplicate(entry))/* Account for the swap cache */goto out;/* * Look for the page in the swap cache. 
*/found_page = lookup_swap_cache(entry);//假设在hash表中找到对应的page，有进程认领了，使用计数为2if (found_page)goto out_free_swap;new_page_addr = __get_free_page(GFP_USER);if (!new_page_addr)goto out_free_swap;/* Out of memory */new_page = virt_to_page(new_page_addr);/* * Check the swap cache again, in case we stalled above. */found_page = lookup_swap_cache(entry);//有可能__get_free_page，没有足够的可分配的页面，切换到其他进程了，再切回来时，在Hash表中再寻找一遍if (found_page)goto out_free_page;/* * Add it to the swap cache and read its contents. */lock_page(new_page);add_to_swap_cache(new_page, entry);rw_swap_page(READ, new_page, wait);return new_page;out_free_page:page_cache_release(new_page);out_free_swap:swap_free(entry);out:return found_page;}登录后复制三、lookup_swap_cache函数，如下： struct page * lookup_swap_cache(swp_entry_t entry){struct page *found;#ifdef SWAP_CACHE_INFOswap_cache_find_total++;#endifwhile (1) {/* * Right now the pagecache is 32-bit only. But it's a 32 bit index. =) */repeat:found = find_lock_page(&swapper_space, entry.val);//entry.val为页面交换项if (!found)return 0;/* * Though the "found" page was in the swap cache an instant * earlier, it might have been removed by refill_inactive etc. * Re search ... Since find_lock_page grabs a reference on * the page, it can not be reused for anything else, namely * it can not be associated with another swaphandle, so it * is enough to check whether the page is still in the scache. */if (!PageSwapCache(found)) {UnlockPage(found);page_cache_release(found);goto repeat;}if (found->mapping != &swapper_space)goto out_bad;#ifdef SWAP_CACHE_INFOswap_cache_find_success++;#endifUnlockPage(found);return found;}登录后复制 find_lock_page函数，如下： #define find_lock_page(mapping, index) \__find_lock_page(mapping, index, page_hash(mapping, index))登录后复制 __find_lock_page函数，如下： struct page * __find_lock_page (struct address_space *mapping,unsigned long offset, struct page **hash){struct page *page;/* * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. 
*/repeat:spin_lock(&pagecache_lock);page = __find_page_nolock(mapping, offset, *hash);//得到了hash表的其中一个链表的头if (page) {page_cache_get(page);//增加使用计数spin_unlock(&pagecache_lock);lock_page(page);/* Is the page still hashed? Ok, good.. */if (page->mapping)return page;/* Nope: we raced. Release and try again.. */UnlockPage(page);page_cache_release(page);goto repeat;}spin_unlock(&pagecache_lock);return NULL;}登录后复制 __find_page_nolock函数，如下： static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page){goto inside;for (;;) {page = page->next_hash;//从hash表中寻找inside:if (!page)goto not_found;if (page->mapping != mapping)continue;if (page->index == offset)break;}/* * Touching the page may move it to the active list. * If we end up with too few inactive pages, we wake * up kswapd. */age_page_up(page);if (inactive_shortage() > inactive_target / 2 && free_shortage())wakeup_kswapd(0);not_found:return page;}登录后复制根据页面交换项，在hash表中寻找page结构。 swapin_readahead(entry);//预读页面 page = read_swap_cache(entry);//真正得到一个页面，这个页面可能从hash表中寻找到，因为上面预读了。或者自己申请页面，并且从盘上将其内容读进来。登录后复制 read_swap_cache无论从hash表中读取页面，还是自己申请页面，并加入到对应的链表。最后使用计数都是2。 swapin_readahead预读了很多页面，如果没有被进程认领，那么使用计数为1。

nicoollas

Linux内核源代码情景分析-内存管理之用户页面的换入_html/css_WEB-ITnose