From deceb6cd17e6dfafe4c4f81b1b4153bc41b2cb70 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:33 -0700 Subject: [PATCH] [PATCH] mm: follow_page with inner ptlock Final step in pushing down common core's page_table_lock. follow_page no longer wants caller to hold page_table_lock, uses pte_offset_map_lock itself; and so no page_table_lock is taken in get_user_pages itself. But get_user_pages (and get_futex_key) do then need follow_page to pin the page for them: take Daniel's suggestion of bitflags to follow_page. Need one for WRITE, another for TOUCH (it was the accessed flag before: vanished along with check_user_page_readable, but surely get_numa_maps is wrong to mark every page it finds as accessed), another for GET. And another, ANON to dispose of untouched_anonymous_page: it seems silly for that to descend a second time, let follow_page observe if there was no page table and return ZERO_PAGE if so. Fix minor bug in that: check VM_LOCKED - make_pages_present ought to make readonly anonymous present. Give get_numa_maps a cond_resched while we're there. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 +- include/linux/mm.h | 20 +++--- kernel/futex.c | 6 +- mm/memory.c | 152 +++++++++++++++++++++------------------------ mm/nommu.c | 3 +- 5 files changed, 88 insertions(+), 96 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7e5e7ec2e36..d2fa42006d8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -419,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) for_each_node(i) md->node[i] =0; - spin_lock(&mm->page_table_lock); for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { page = follow_page(mm, vaddr, 0); if (page) { @@ -434,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) md->anon++; md->node[page_to_nid(page)]++; } + cond_resched(); } - spin_unlock(&mm->page_table_lock); return md; } diff --git a/include/linux/mm.h b/include/linux/mm.h index aa8de20e2e8..e8d1424153b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -938,14 +938,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } -extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); - -extern struct page * vmalloc_to_page(void *addr); -extern unsigned long vmalloc_to_pfn(void *addr); -extern struct page * follow_page(struct mm_struct *mm, unsigned long address, - int write); -int remap_pfn_range(struct vm_area_struct *, unsigned long, - unsigned long, unsigned long, pgprot_t); +struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); +struct page *vmalloc_to_page(void *addr); +unsigned long vmalloc_to_pfn(void *addr); +int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); + +struct page *follow_page(struct mm_struct *, unsigned long address, + unsigned int foll_flags); +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_TOUCH 0x02 /* mark page accessed */ +#define FOLL_GET 0x04 /* do get_page on page */ +#define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ #ifdef CONFIG_PROC_FS void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); diff --git a/kernel/futex.c b/kernel/futex.c index ca05fe6a70b..3b4d5ad44cc 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) /* * Do a quick atomic lookup first - this is the fastpath. */ - spin_lock(¤t->mm->page_table_lock); - page = follow_page(mm, uaddr, 0); + page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET); if (likely(page != NULL)) { key->shared.pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - spin_unlock(¤t->mm->page_table_lock); + put_page(page); return 0; } - spin_unlock(¤t->mm->page_table_lock); /* * Do it the general way. diff --git a/mm/memory.c b/mm/memory.c index 51f7c0a220d..8461e2dd91d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -807,86 +807,82 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, /* * Do a quick page-table lookup for a single page. - * mm->page_table_lock must be held. */ -struct page *follow_page(struct mm_struct *mm, unsigned long address, int write) +struct page *follow_page(struct mm_struct *mm, unsigned long address, + unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; + spinlock_t *ptl; unsigned long pfn; struct page *page; - page = follow_huge_addr(mm, address, write); - if (! IS_ERR(page)) - return page; + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + goto out; + } + page = NULL; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto out; + goto no_page_table; pud = pud_offset(pgd, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto out; + goto no_page_table; pmd = pmd_offset(pud, address); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto no_page_table; + + if (pmd_huge(*pmd)) { + BUG_ON(flags & FOLL_GET); + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; - if (pmd_huge(*pmd)) - return follow_huge_pmd(mm, address, pmd, write); + } - ptep = pte_offset_map(pmd, address); + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) goto out; pte = *ptep; - pte_unmap(ptep); - if (pte_present(pte)) { - if (write && !pte_write(pte)) - goto out; - pfn = pte_pfn(pte); - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (write && !pte_dirty(pte) &&!PageDirty(page)) - set_page_dirty(page); - mark_page_accessed(page); - return page; - } - } + if (!pte_present(pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + pfn = pte_pfn(pte); + if (!pfn_valid(pfn)) + goto unlock; + page = pfn_to_page(pfn); + if (flags & FOLL_GET) + get_page(page); + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + mark_page_accessed(page); + } +unlock: + pte_unmap_unlock(ptep, ptl); out: - return NULL; -} - -static inline int -untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, - unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - - /* Check if the vma is for an anonymous mapping. */ - if (vma->vm_ops && vma->vm_ops->nopage) - return 0; - - /* Check if page directory entry exists. */ - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return 1; - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) - return 1; - - /* Check if page middle directory entry exists. */ - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - return 1; + return page; - /* There is a pte slot for 'address' in 'mm'. */ - return 0; +no_page_table: + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate page tables. + */ + if (flags & FOLL_ANON) { + page = ZERO_PAGE(address); + if (flags & FOLL_GET) + get_page(page); + BUG_ON(flags & FOLL_WRITE); + } + return page; } int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, @@ -894,18 +890,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int flags; + unsigned int vm_flags; /* * Require read or write permissions. * If 'force' is set, we only require the "MAY" flags. */ - flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); i = 0; do { - struct vm_area_struct * vma; + struct vm_area_struct *vma; + unsigned int foll_flags; vma = find_extend_vma(mm, start); if (!vma && in_gate_area(tsk, start)) { @@ -946,7 +943,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED)) - || !(flags & vma->vm_flags)) + || !(vm_flags & vma->vm_flags)) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { @@ -954,29 +951,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, &start, &len, i); continue; } - spin_lock(&mm->page_table_lock); + + foll_flags = FOLL_TOUCH; + if (pages) + foll_flags |= FOLL_GET; + if (!write && !(vma->vm_flags & VM_LOCKED) && + (!vma->vm_ops || !vma->vm_ops->nopage)) + foll_flags |= FOLL_ANON; + do { - int write_access = write; struct page *page; - cond_resched_lock(&mm->page_table_lock); - while (!(page = follow_page(mm, start, write_access))) { - int ret; - - /* - * Shortcut for anonymous pages. We don't want - * to force the creation of pages tables for - * insanely big anonymously mapped areas that - * nobody touched so far. This is important - * for doing a core dump for these mappings. - */ - if (!write && untouched_anonymous_page(mm,vma,start)) { - page = ZERO_PAGE(start); - break; - } - spin_unlock(&mm->page_table_lock); - ret = __handle_mm_fault(mm, vma, start, write_access); + if (write) + foll_flags |= FOLL_WRITE; + cond_resched(); + while (!(page = follow_page(mm, start, foll_flags))) { + int ret; + ret = __handle_mm_fault(mm, vma, start, + foll_flags & FOLL_WRITE); /* * The VM_FAULT_WRITE bit tells us that do_wp_page has * broken COW when necessary, even if maybe_mkwrite @@ -984,7 +977,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * subsequent page lookups as if they were reads. */ if (ret & VM_FAULT_WRITE) - write_access = 0; + foll_flags &= ~FOLL_WRITE; switch (ret & ~VM_FAULT_WRITE) { case VM_FAULT_MINOR: @@ -1000,12 +993,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, default: BUG(); } - spin_lock(&mm->page_table_lock); } if (pages) { pages[i] = page; flush_dcache_page(page); - page_cache_get(page); } if (vmas) vmas[i] = vma; @@ -1013,7 +1004,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, start += PAGE_SIZE; len--; } while (len && start < vma->vm_end); - spin_unlock(&mm->page_table_lock); } while (len); return i; } diff --git a/mm/nommu.c b/mm/nommu.c index dfb124ffb9b..d1e076a487c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1049,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); -struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write) +struct page *follow_page(struct mm_struct *mm, unsigned long address, + unsigned int foll_flags) { return NULL; } -- 2.41.1