From: Ingo Molnar Date: Fri, 10 Oct 2008 17:30:08 +0000 (+0200) Subject: Merge branch 'linus' into x86/pat2 X-Git-Tag: v2.6.28-rc1~713^2 X-Git-Url: http://www.pilppa.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=3dd392a407d15250a501fa109cc1f93fee95ef85;hp=-c;p=linux-2.6-omap-h63xx.git Merge branch 'linus' into x86/pat2 Conflicts: arch/x86/mm/init_64.c --- 3dd392a407d15250a501fa109cc1f93fee95ef85 diff --combined arch/x86/mm/init_32.c index 74780800e7e,6b9a9358b33..c3789bb1930 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@@ -47,6 -47,7 +47,7 @@@ #include #include #include + #include unsigned int __VMALLOC_RESERVE = 128 << 20; @@@ -194,30 -195,11 +195,30 @@@ static void __init kernel_physical_mapp pgd_t *pgd; pmd_t *pmd; pte_t *pte; - unsigned pages_2m = 0, pages_4k = 0; + unsigned pages_2m, pages_4k; + int mapping_iter; + + /* + * First iteration will setup identity mapping using large/small pages + * based on use_pse, with other attributes same as set by + * the early code in head_32.S + * + * Second iteration will setup the appropriate attributes (NX, GLOBAL..) + * as desired for the kernel identity mapping. + * + * This two pass mechanism conforms to the TLB app note which says: + * + * "Software should not write to a paging-structure entry in a way + * that would change, for any linear address, both the page size + * and either the page frame or attributes." + */ + mapping_iter = 1; if (!cpu_has_pse) use_pse = 0; +repeat: + pages_2m = pages_4k = 0; pfn = start_pfn; pgd_idx = pgd_index((pfn<> PMD_SHIFT; tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); - if (cpu_has_pse) { + if (use_pse) { unsigned long extra; extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); @@@ -827,22 -769,12 +824,22 @@@ unsigned long __init_refok init_memory_ pgd_t *pgd_base = swapper_pg_dir; unsigned long start_pfn, end_pfn; unsigned long big_page_start; +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + int use_pse = 0; +#else + int use_pse = cpu_has_pse; +#endif /* * Find space for the kernel direct mapping tables. */ if (!after_init_bootmem) - find_early_table_space(end); + find_early_table_space(end, use_pse); #ifdef CONFIG_X86_PAE set_nx(); @@@ -888,7 -820,7 +885,7 @@@ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); if (start_pfn < end_pfn) kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, - cpu_has_pse); + use_pse); /* tail is not big page alignment ? */ start_pfn = end_pfn; @@@ -1051,6 -983,7 +1048,6 @@@ void __init mem_init(void if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); - cpa_init(); save_pg_dir(); zap_low_mappings(); } diff --combined arch/x86/mm/init_64.c index 8c7eae490a2,770536ebf7e..fb30486c82f --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@@ -225,7 -225,7 +225,7 @@@ void __init init_extra_mapping_uc(unsig void __init cleanup_highmap(void) { unsigned long vaddr = __START_KERNEL_map; - unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; + unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; pmd_t *pmd = level2_kernel_pgt; pmd_t *last_pmd = pmd + PTRS_PER_PMD; @@@ -271,8 -271,7 +271,8 @@@ static __ref void unmap_low_page(void * } static unsigned long __meminit -phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, + pgprot_t prot) { unsigned pages = 0; unsigned long last_map_addr = end; @@@ -290,43 -289,36 +290,43 @@@ break; } + /* + * We will re-use the existing mapping. + * Xen for example has some special requirements, like mapping + * pagetable pages as RO. So assume someone who pre-setup + * these mappings are more intelligent. + */ if (pte_val(*pte)) continue; if (0) printk(" pte=%p addr=%lx pte=%016lx\n", pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); - set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL)); - last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; pages++; + set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot)); + last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; } + update_page_count(PG_LEVEL_4K, pages); return last_map_addr; } static unsigned long __meminit -phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, + pgprot_t prot) { pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); - return phys_pte_init(pte, address, end); + return phys_pte_init(pte, address, end, prot); } static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, - unsigned long page_size_mask) + unsigned long page_size_mask, pgprot_t prot) { unsigned long pages = 0; unsigned long last_map_addr = end; - unsigned long start = address; int i = pmd_index(address); @@@ -334,7 -326,6 +334,7 @@@ unsigned long pte_phys; pmd_t *pmd = pmd_page + pmd_index(address); pte_t *pte; + pgprot_t new_prot = prot; if (address >= end) { if (!after_bootmem) { @@@ -348,40 -339,27 +348,40 @@@ if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); last_map_addr = phys_pte_update(pmd, address, - end); + end, prot); spin_unlock(&init_mm.page_table_lock); + continue; } - /* Count entries we're using from level2_ident_pgt */ - if (start == 0) - pages++; - continue; + /* + * If we are ok with PG_LEVEL_2M mapping, then we will + * use the existing mapping, + * + * Otherwise, we will split the large page mapping but + * use the same existing protection bits except for + * large page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_2M)) + continue; + new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); } if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pfn_pte(address >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); last_map_addr = (address & PMD_MASK) + PMD_SIZE; continue; } pte = alloc_low_page(&pte_phys); - last_map_addr = phys_pte_init(pte, address, end); + last_map_addr = phys_pte_init(pte, address, end, new_prot); unmap_low_page(pte); spin_lock(&init_mm.page_table_lock); @@@ -394,12 -372,12 +394,12 @@@ static unsigned long __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, - unsigned long page_size_mask) + unsigned long page_size_mask, pgprot_t prot) { pmd_t *pmd = pmd_offset(pud, 0); unsigned long last_map_addr; - last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); __flush_tlb_all(); return last_map_addr; } @@@ -416,7 -394,6 +416,7 @@@ phys_pud_init(pud_t *pud_page, unsigne unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; + pgprot_t prot = PAGE_KERNEL; if (addr >= end) break; @@@ -428,26 -405,10 +428,26 @@@ } if (pud_val(*pud)) { - if (!pud_large(*pud)) + if (!pud_large(*pud)) { last_map_addr = phys_pmd_update(pud, addr, end, - page_size_mask); - continue; + page_size_mask, prot); + continue; + } + /* + * If we are ok with PG_LEVEL_1G mapping, then we will + * use the existing mapping. + * + * Otherwise, we will split the gbpage mapping but use + * the same existing protection bits except for large + * page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_1G)) + continue; + prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); } if (page_size_mask & (1<> PUD_SHIFT; - tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - if (direct_gbpages) { + if (use_gbpages) { unsigned long extra; extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; } else pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - if (cpu_has_pse) { + if (use_pse) { unsigned long extra; extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); /* * RED-PEN putting page tables only on node 0 could @@@ -570,7 -528,6 +570,7 @@@ static unsigned long __init kernel_phys pgd_populate(&init_mm, pgd, __va(pud_phys)); spin_unlock(&init_mm.page_table_lock); } + __flush_tlb_all(); return last_map_addr; } @@@ -614,7 -571,6 +614,7 @@@ unsigned long __init_refok init_memory_ struct map_range mr[NR_RANGE_MR]; int nr_range, i; + int use_pse, use_gbpages; printk(KERN_INFO "init_memory_mapping\n"); @@@ -628,21 -584,9 +628,21 @@@ if (!after_bootmem) init_gbpages(); - if (direct_gbpages) +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + use_pse = use_gbpages = 0; +#else + use_pse = cpu_has_pse; + use_gbpages = direct_gbpages; +#endif + + if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; - if (cpu_has_pse) + if (use_pse) page_size_mask |= 1 << PG_LEVEL_2M; memset(mr, 0, sizeof(mr)); @@@ -703,7 -647,7 +703,7 @@@ (mr[i].page_size_mask & (1<> 10, initsize >> 10); - - cpa_init(); } void free_init_pages(char *what, unsigned long begin, unsigned long end) diff --combined arch/x86/mm/ioremap.c index d03c461e045,cac6da54203..6ab3196d12b --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@@ -83,25 -83,6 +83,25 @@@ int page_is_ram(unsigned long pagenr return 0; } +int pagerange_is_ram(unsigned long start, unsigned long end) +{ + int ram_page = 0, not_rampage = 0; + unsigned long page_nr; + + for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); + ++page_nr) { + if (page_is_ram(page_nr)) + ram_page = 1; + else + not_rampage = 1; + + if (ram_page == not_rampage) + return -1; + } + + return ram_page; +} + /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. @@@ -440,7 -421,7 +440,7 @@@ void unxlate_dev_mem_ptr(unsigned long return; } - int __initdata early_ioremap_debug; + static int __initdata early_ioremap_debug; static int __init early_ioremap_debug_setup(char *str) { @@@ -566,7 -547,7 +566,7 @@@ static inline void __init early_clear_f } - int __initdata early_ioremap_nested; + static int __initdata early_ioremap_nested; static int __init check_early_ioremap_leak(void) { diff --combined arch/x86/mm/pageattr.c index b6374d653d0,898fad617ab..a9ec89c3fbc --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@@ -25,27 -25,15 +25,27 @@@ * The current flushing context - we pass it instead of 5 arguments: */ struct cpa_data { - unsigned long vaddr; + unsigned long *vaddr; pgprot_t mask_set; pgprot_t mask_clr; int numpages; - int flushtlb; + int flags; unsigned long pfn; unsigned force_split : 1; + int curpage; }; +/* + * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) + * using cpa_lock. So that we don't allow any other cpu, with stale large tlb + * entries change the page attribute in parallel to some other cpu + * splitting a large page entry along with changing the attribute. + */ +static DEFINE_SPINLOCK(cpa_lock); + +#define CPA_FLUSHTLB 1 +#define CPA_ARRAY 2 + #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@@ -96,7 -84,7 +96,7 @@@ static inline unsigned long highmap_sta static inline unsigned long highmap_end_pfn(void) { - return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; + return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; } #endif @@@ -202,41 -190,6 +202,41 @@@ static void cpa_flush_range(unsigned lo } } +static void cpa_flush_array(unsigned long *start, int numpages, int cache) +{ + unsigned int i, level; + unsigned long *addr; + + BUG_ON(irqs_disabled()); + + on_each_cpu(__cpa_flush_range, NULL, 1); + + if (!cache) + return; + + /* 4M threshold */ + if (numpages >= 1024) { + if (boot_cpu_data.x86_model >= 4) + wbinvd(); + return; + } + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0, addr = start; i < numpages; i++, addr++) { + pte_t *pte = lookup_address(*addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (pte_val(*pte) & _PAGE_PRESENT)) + clflush_cache_range((void *) *addr, PAGE_SIZE); + } +} + /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this @@@ -445,7 -398,7 +445,7 @@@ try_preserve_large_page(pte_t *kpte, un */ new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); __set_pmd_pte(kpte, address, new_pte); - cpa->flushtlb = 1; + cpa->flags |= CPA_FLUSHTLB; do_split = 0; } @@@ -455,6 -408,84 +455,6 @@@ out_unlock return do_split; } -static LIST_HEAD(page_pool); -static unsigned long pool_size, pool_pages, pool_low; -static unsigned long pool_used, pool_failed; - -static void cpa_fill_pool(struct page **ret) -{ - gfp_t gfp = GFP_KERNEL; - unsigned long flags; - struct page *p; - - /* - * Avoid recursion (on debug-pagealloc) and also signal - * our priority to get to these pagetables: - */ - if (current->flags & PF_MEMALLOC) - return; - current->flags |= PF_MEMALLOC; - - /* - * Allocate atomically from atomic contexts: - */ - if (in_atomic() || irqs_disabled() || debug_pagealloc) - gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; - - while (pool_pages < pool_size || (ret && !*ret)) { - p = alloc_pages(gfp, 0); - if (!p) { - pool_failed++; - break; - } - /* - * If the call site needs a page right now, provide it: - */ - if (ret && !*ret) { - *ret = p; - continue; - } - spin_lock_irqsave(&pgd_lock, flags); - list_add(&p->lru, &page_pool); - pool_pages++; - spin_unlock_irqrestore(&pgd_lock, flags); - } - - current->flags &= ~PF_MEMALLOC; -} - -#define SHIFT_MB (20 - PAGE_SHIFT) -#define ROUND_MB_GB ((1 << 10) - 1) -#define SHIFT_MB_GB 10 -#define POOL_PAGES_PER_GB 16 - -void __init cpa_init(void) -{ - struct sysinfo si; - unsigned long gb; - - si_meminfo(&si); - /* - * Calculate the number of pool pages: - * - * Convert totalram (nr of pages) to MiB and round to the next - * GiB. Shift MiB to Gib and multiply the result by - * POOL_PAGES_PER_GB: - */ - if (debug_pagealloc) { - gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; - pool_size = POOL_PAGES_PER_GB * gb; - } else { - pool_size = 1; - } - pool_low = pool_size; - - cpa_fill_pool(NULL); - printk(KERN_DEBUG - "CPA: page pool initialized %lu of %lu pages preallocated\n", - pool_pages, pool_size); -} - static int split_large_page(pte_t *kpte, unsigned long address) { unsigned long flags, pfn, pfninc = 1; @@@ -463,15 -494,28 +463,15 @@@ pgprot_t ref_prot; struct page *base; - /* - * Get a page from the pool. The pool list is protected by the - * pgd_lock, which we have to take anyway for the split - * operation: - */ - spin_lock_irqsave(&pgd_lock, flags); - if (list_empty(&page_pool)) { - spin_unlock_irqrestore(&pgd_lock, flags); - base = NULL; - cpa_fill_pool(&base); - if (!base) - return -ENOMEM; - spin_lock_irqsave(&pgd_lock, flags); - } else { - base = list_first_entry(&page_pool, struct page, lru); - list_del(&base->lru); - pool_pages--; - - if (pool_pages < pool_low) - pool_low = pool_pages; - } + if (!debug_pagealloc) + spin_unlock(&cpa_lock); + base = alloc_pages(GFP_KERNEL, 0); + if (!debug_pagealloc) + spin_lock(&cpa_lock); + if (!base) + return -ENOMEM; + spin_lock_irqsave(&pgd_lock, flags); /* * Check for races, another CPU might have split this page * up for us already: @@@ -528,8 -572,11 +528,8 @@@ out_unlock * If we dropped out via the lookup_address check under * pgd_lock then stick the page back into the pool: */ - if (base) { - list_add(&base->lru, &page_pool); - pool_pages++; - } else - pool_used++; + if (base) + __free_page(base); spin_unlock_irqrestore(&pgd_lock, flags); return 0; @@@ -537,16 -584,11 +537,16 @@@ static int __change_page_attr(struct cpa_data *cpa, int primary) { - unsigned long address = cpa->vaddr; + unsigned long address; int do_split, err; unsigned int level; pte_t *kpte, old_pte; + if (cpa->flags & CPA_ARRAY) + address = cpa->vaddr[cpa->curpage]; + else + address = *cpa->vaddr; + repeat: kpte = lookup_address(address, &level); if (!kpte) @@@ -558,7 -600,7 +558,7 @@@ return 0; WARN(1, KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", address, - cpa->vaddr); + *cpa->vaddr); return -EINVAL; } @@@ -584,7 -626,7 +584,7 @@@ */ if (pte_val(old_pte) != pte_val(new_pte)) { set_pte_atomic(kpte, new_pte); - cpa->flushtlb = 1; + cpa->flags |= CPA_FLUSHTLB; } cpa->numpages = 1; return 0; @@@ -608,25 -650,7 +608,25 @@@ */ err = split_large_page(kpte, address); if (!err) { - cpa->flushtlb = 1; + /* + * Do a global flush tlb after splitting the large page + * and before we do the actual change page attribute in the PTE. + * + * With out this, we violate the TLB application note, that says + * "The TLBs may contain both ordinary and large-page + * translations for a 4-KByte range of linear addresses. This + * may occur if software modifies the paging structures so that + * the page size used for the address range changes. If the two + * translations differ with respect to page frame or attributes + * (e.g., permissions), processor behavior is undefined and may + * be implementation-specific." + * + * We do this global tlb flush inside the cpa_lock, so that we + * don't allow any other cpu, with stale tlb entries change the + * page attribute in parallel, that also falls into the + * just split large page entry. + */ + flush_tlb_all(); goto repeat; } @@@ -639,7 -663,6 +639,7 @@@ static int cpa_process_alias(struct cpa { struct cpa_data alias_cpa; int ret = 0; + unsigned long temp_cpa_vaddr, vaddr; if (cpa->pfn >= max_pfn_mapped) return 0; @@@ -652,24 -675,16 +652,24 @@@ * No need to redo, when the primary call touched the direct * mapping already: */ - if (!(within(cpa->vaddr, PAGE_OFFSET, + if (cpa->flags & CPA_ARRAY) + vaddr = cpa->vaddr[cpa->curpage]; + else + vaddr = *cpa->vaddr; + + if (!(within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) #ifdef CONFIG_X86_64 - || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32), + || within(vaddr, PAGE_OFFSET + (1UL<<32), PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) #endif )) { alias_cpa = *cpa; - alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); + temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~CPA_ARRAY; + ret = __change_page_attr_set_clr(&alias_cpa, 0); } @@@ -681,7 -696,7 +681,7 @@@ * No need to redo, when the primary call touched the high * mapping already: */ - if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) + if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) return 0; /* @@@ -692,9 -707,8 +692,9 @@@ return 0; alias_cpa = *cpa; - alias_cpa.vaddr = - (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; + temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~CPA_ARRAY; /* * The high mapping range is imprecise, so ignore the return value. @@@ -714,15 -728,8 +714,15 @@@ static int __change_page_attr_set_clr(s * preservation check. */ cpa->numpages = numpages; + /* for array changes, we can't use large page */ + if (cpa->flags & CPA_ARRAY) + cpa->numpages = 1; + if (!debug_pagealloc) + spin_lock(&cpa_lock); ret = __change_page_attr(cpa, checkalias); + if (!debug_pagealloc) + spin_unlock(&cpa_lock); if (ret) return ret; @@@ -739,11 -746,7 +739,11 @@@ */ BUG_ON(cpa->numpages > numpages); numpages -= cpa->numpages; - cpa->vaddr += cpa->numpages * PAGE_SIZE; + if (cpa->flags & CPA_ARRAY) + cpa->curpage++; + else + *cpa->vaddr += cpa->numpages * PAGE_SIZE; + } return 0; } @@@ -754,9 -757,9 +754,9 @@@ static inline int cache_attr(pgprot_t a (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); } -static int change_page_attr_set_clr(unsigned long addr, int numpages, +static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, - int force_split) + int force_split, int array) { struct cpa_data cpa; int ret, cache, checkalias; @@@ -771,38 -774,21 +771,38 @@@ return 0; /* Ensure we are PAGE_SIZE aligned */ - if (addr & ~PAGE_MASK) { - addr &= PAGE_MASK; - /* - * People should not be passing in unaligned addresses: - */ - WARN_ON_ONCE(1); + if (!array) { + if (*addr & ~PAGE_MASK) { + *addr &= PAGE_MASK; + /* + * People should not be passing in unaligned addresses: + */ + WARN_ON_ONCE(1); + } + } else { + int i; + for (i = 0; i < numpages; i++) { + if (addr[i] & ~PAGE_MASK) { + addr[i] &= PAGE_MASK; + WARN_ON_ONCE(1); + } + } } + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; - cpa.flushtlb = 0; + cpa.flags = 0; + cpa.curpage = 0; cpa.force_split = force_split; + if (array) + cpa.flags |= CPA_ARRAY; + /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; @@@ -811,7 -797,7 +811,7 @@@ /* * Check whether we really changed something: */ - if (!cpa.flushtlb) + if (!(cpa.flags & CPA_FLUSHTLB)) goto out; /* @@@ -826,30 -812,27 +826,30 @@@ * error case we fall back to cpa_flush_all (which uses * wbindv): */ - if (!ret && cpu_has_clflush) - cpa_flush_range(addr, numpages, cache); - else + if (!ret && cpu_has_clflush) { + if (cpa.flags & CPA_ARRAY) + cpa_flush_array(addr, numpages, cache); + else + cpa_flush_range(*addr, numpages, cache); + } else cpa_flush_all(cache); out: - cpa_fill_pool(NULL); - return ret; } -static inline int change_page_attr_set(unsigned long addr, int numpages, - pgprot_t mask) +static inline int change_page_attr_set(unsigned long *addr, int numpages, + pgprot_t mask, int array) { - return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, + array); } -static inline int change_page_attr_clear(unsigned long addr, int numpages, - pgprot_t mask) +static inline int change_page_attr_clear(unsigned long *addr, int numpages, + pgprot_t mask, int array) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, + array); } int _set_memory_uc(unsigned long addr, int numpages) @@@ -857,8 -840,8 +857,8 @@@ /* * for now UC MINUS. see comments in ioremap_nocache() */ - return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_CACHE_UC_MINUS)); + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_UC_MINUS), 0); } int set_memory_uc(unsigned long addr, int numpages) @@@ -874,48 -857,10 +874,48 @@@ } EXPORT_SYMBOL(set_memory_uc); +int set_memory_array_uc(unsigned long *addr, int addrinarray) +{ + unsigned long start; + unsigned long end; + int i; + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + for (i = 0; i < addrinarray; i++) { + start = __pa(addr[i]); + for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) + goto out; + } + + return change_page_attr_set(addr, addrinarray, + __pgprot(_PAGE_CACHE_UC_MINUS), 1); +out: + for (i = 0; i < addrinarray; i++) { + unsigned long tmp = __pa(addr[i]); + + if (tmp == start) + break; + for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + free_memtype(tmp, end); + } + return -EINVAL; +} +EXPORT_SYMBOL(set_memory_array_uc); + int _set_memory_wc(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_CACHE_WC)); + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_WC), 0); } int set_memory_wc(unsigned long addr, int numpages) @@@ -933,8 -878,8 +933,8 @@@ EXPORT_SYMBOL(set_memory_wc) int _set_memory_wb(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_CACHE_MASK)); + return change_page_attr_clear(&addr, numpages, + __pgprot(_PAGE_CACHE_MASK), 0); } int set_memory_wb(unsigned long addr, int numpages) @@@ -945,57 -890,39 +945,59 @@@ } EXPORT_SYMBOL(set_memory_wb); +int set_memory_array_wb(unsigned long *addr, int addrinarray) +{ + int i; + + for (i = 0; i < addrinarray; i++) { + unsigned long start = __pa(addr[i]); + unsigned long end; + + for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + free_memtype(start, end); + } + return change_page_attr_clear(addr, addrinarray, + __pgprot(_PAGE_CACHE_MASK), 1); +} +EXPORT_SYMBOL(set_memory_array_wb); + int set_memory_x(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_x); int set_memory_nx(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_nx); int set_memory_ro(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); } + EXPORT_SYMBOL_GPL(set_memory_ro); int set_memory_rw(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); } + EXPORT_SYMBOL_GPL(set_memory_rw); int set_memory_np(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_4k(unsigned long addr, int numpages) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), - __pgprot(0), 1); + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(0), 1, 0); } int set_pages_uc(struct page *page, int numpages) @@@ -1048,38 -975,22 +1050,38 @@@ int set_pages_rw(struct page *page, in static int __set_pages_p(struct page *page, int numpages) { - struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), - .mask_clr = __pgprot(0)}; + .mask_clr = __pgprot(0), + .flags = 0}; - return __change_page_attr_set_clr(&cpa, 1); + /* + * No alias checking needed for setting present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); } static int __set_pages_np(struct page *page, int numpages) { - struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .flags = 0}; - return __change_page_attr_set_clr(&cpa, 1); + /* + * No alias checking needed for setting not present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); } void kernel_map_pages(struct page *page, int numpages, int enable) @@@ -1099,8 -1010,11 +1101,8 @@@ /* * The return value is ignored as the calls cannot fail. - * Large pages are kept enabled at boot time, and are - * split up quickly with DEBUG_PAGEALLOC. If a splitup - * fails here (due to temporary memory shortage) no damage - * is done because we just keep the largepage intact up - * to the next attempt when it will likely be split up: + * Large pages for identity mappings are not used at boot time + * and hence no memory allocations during large page split. */ if (enable) __set_pages_p(page, numpages); @@@ -1112,8 -1026,53 +1114,8 @@@ * but that can deadlock->flush only current cpu: */ __flush_tlb_all(); - - /* - * Try to refill the page pool here. We can do this only after - * the tlb flush. - */ - cpa_fill_pool(NULL); } -#ifdef CONFIG_DEBUG_FS -static int dpa_show(struct seq_file *m, void *v) -{ - seq_puts(m, "DEBUG_PAGEALLOC\n"); - seq_printf(m, "pool_size : %lu\n", pool_size); - seq_printf(m, "pool_pages : %lu\n", pool_pages); - seq_printf(m, "pool_low : %lu\n", pool_low); - seq_printf(m, "pool_used : %lu\n", pool_used); - seq_printf(m, "pool_failed : %lu\n", pool_failed); - - return 0; -} - -static int dpa_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, dpa_show, NULL); -} - -static const struct file_operations dpa_fops = { - .open = dpa_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init debug_pagealloc_proc_init(void) -{ - struct dentry *de; - - de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL, - &dpa_fops); - if (!de) - return -ENOMEM; - - return 0; -} -__initcall(debug_pagealloc_proc_init); -#endif - #ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) diff --combined include/asm-x86/cacheflush.h index 092b9b4eb00,59859cb28a3..68840ef1b35 --- a/include/asm-x86/cacheflush.h +++ b/include/asm-x86/cacheflush.h @@@ -1,5 -1,5 +1,5 @@@ - #ifndef _ASM_X86_CACHEFLUSH_H - #define _ASM_X86_CACHEFLUSH_H + #ifndef ASM_X86__CACHEFLUSH_H + #define ASM_X86__CACHEFLUSH_H /* Keep includes the same across arches. */ #include @@@ -24,8 -24,6 +24,8 @@@ #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ memcpy((dst), (src), (len)) +#define PG_non_WB PG_arch_1 +PAGEFLAG(NonWB, non_WB) /* * The set_memory_* API can be used to change various attributes of a virtual @@@ -68,9 -66,6 +68,9 @@@ int set_memory_rw(unsigned long addr, i int set_memory_np(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); +int set_memory_array_uc(unsigned long *addr, int addrinarray); +int set_memory_array_wb(unsigned long *addr, int addrinarray); + /* * For legacy compatibility with the old APIs, a few functions * are provided that work on a "struct page". @@@ -101,6 -96,8 +101,6 @@@ int set_pages_rw(struct page *page, in void clflush_cache_range(void *addr, unsigned int size); -void cpa_init(void); - #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); extern const int rodata_test_data; @@@ -115,4 -112,4 +115,4 @@@ static inline int rodata_test(void } #endif - #endif + #endif /* ASM_X86__CACHEFLUSH_H */ diff --combined include/asm-x86/page.h index 3407ac12ba3,79544e6ffb8..c9157477675 --- a/include/asm-x86/page.h +++ b/include/asm-x86/page.h @@@ -1,5 -1,5 +1,5 @@@ - #ifndef _ASM_X86_PAGE_H - #define _ASM_X86_PAGE_H + #ifndef ASM_X86__PAGE_H + #define ASM_X86__PAGE_H #include @@@ -57,7 -57,6 +57,7 @@@ typedef struct { pgdval_t pgd; } pgd_t typedef struct { pgprotval_t pgprot; } pgprot_t; extern int page_is_ram(unsigned long pagenr); +extern int pagerange_is_ram(unsigned long start, unsigned long end); extern int devmem_is_allowed(unsigned long pagenr); extern void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot); @@@ -200,4 -199,4 +200,4 @@@ static inline pteval_t native_pte_flags #define __HAVE_ARCH_GATE_AREA 1 #endif /* __KERNEL__ */ - #endif /* _ASM_X86_PAGE_H */ + #endif /* ASM_X86__PAGE_H */ diff --combined include/asm-x86/pgtable.h index bbf0f591d1b,888add7b088..ed932453ef2 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@@ -1,5 -1,5 +1,5 @@@ - #ifndef _ASM_X86_PGTABLE_H - #define _ASM_X86_PGTABLE_H + #ifndef ASM_X86__PGTABLE_H + #define ASM_X86__PGTABLE_H #define FIRST_USER_ADDRESS 0 @@@ -19,7 -19,6 +19,7 @@@ #define _PAGE_BIT_UNUSED3 11 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) @@@ -37,7 -36,6 +37,7 @@@ #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) +#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) #define __HAVE_ARCH_PTE_SPECIAL #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) @@@ -132,17 -130,6 +132,17 @@@ #define __S110 PAGE_SHARED_EXEC #define __S111 PAGE_SHARED_EXEC +/* + * early identity mapping pte attrib macros. + */ +#ifdef CONFIG_X86_64 +#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC +#else +#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ +#define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ +#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ +#endif + #ifndef __ASSEMBLY__ /* @@@ -199,6 -186,13 +199,13 @@@ static inline int pte_special(pte_t pte return pte_val(pte) & _PAGE_SPECIAL; } + static inline unsigned long pte_pfn(pte_t pte) + { + return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; + } + + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) + static inline int pmd_large(pmd_t pte) { return (pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == @@@ -326,6 -320,8 +333,8 @@@ static inline void native_pagetable_set static inline void native_pagetable_setup_done(pgd_t *base) {} #endif + extern int arch_report_meminfo(char *page); + #ifdef CONFIG_PARAVIRT #include #else /* !CONFIG_PARAVIRT */ @@@ -534,4 -530,4 +543,4 @@@ static inline void clone_pgd_range(pgd_ #include #endif /* __ASSEMBLY__ */ - #endif /* _ASM_X86_PGTABLE_H */ + #endif /* ASM_X86__PGTABLE_H */