}
 
 /*
- * Find a free area in a specific range.
+ * Find a free area with specified alignment in a specific range.
  */
 unsigned long __init find_e820_area(unsigned long start, unsigned long end,
-                                   unsigned size)
+                                   unsigned size, unsigned long align)
 {
        int i;
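+       /* align must be a power of two for this mask arithmetic to work */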
+       unsigned long mask = ~(align - 1);
 
        for (i = 0; i < e820.nr_map; i++) {
                struct e820entry *ei = &e820.map[i];
                unsigned long addr = ei->addr, last;

                if (ei->type != E820_RAM)
                        continue;
                if (addr < start)
                        addr = start;
                if (addr > ei->addr + ei->size)
                        continue;
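+               /* bad_addr() advances addr past any early-reserved range */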
                while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
                        ;
-               last = PAGE_ALIGN(addr) + size;
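+               /* round addr up to the requested alignment */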
+               addr = (addr + align - 1) & mask;
+               last = addr + size;
                if (last > ei->addr + ei->size)
                        continue;
                if (last > end)
                        continue;
                return addr;
        }
        return -1UL;
}

        /*
         * RED-PEN putting page tables only on node 0 could
         * cause a hotspot and fill up ZONE_DMA. The page tables
         * need roughly 0.5KB per GB.
         */
        start = 0x8000;
-       table_start = find_e820_area(start, end, tables);
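+       /* ask for page alignment up front instead of rounding up afterwards */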
+       table_start = find_e820_area(start, end, tables, PAGE_SIZE);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");
 
-       /*
-        * When you have a lot of RAM like 256GB, early_table will not fit
-        * into 0x8000 range, find_e820_area() will find area after kernel
-        * bss but the table_start is not page aligned, so need to round it
-        * up to avoid overlap with bss:
-        */
-       table_start = round_up(table_start, PAGE_SIZE);
        table_start >>= PAGE_SHIFT;
        table_end = table_start;
 
        if (!after_bootmem)
                mmu_cr4_features = read_cr4();
        __flush_tlb_all();
 
-       reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE");
+       if (!after_bootmem)
+               reserve_early(table_start << PAGE_SHIFT,
+                                table_end << PAGE_SHIFT, "PGTABLE");
 }
 
 #ifndef CONFIG_NUMA
 
 
 static int __init allocate_cachealigned_memnodemap(void)
 {
-       unsigned long pad, pad_addr;
+       unsigned long addr;
 
        memnodemap = memnode.embedded_map;
        if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
                return 0;
 
-       pad = L1_CACHE_BYTES - 1;
-       pad_addr = 0x8000;
-       nodemap_size = pad + sizeof(s16) * memnodemapsize;
-       nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
-                                     nodemap_size);
+       addr = 0x8000;
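+       /* keep the reserved size a whole number of cache lines */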
+       nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
+       nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT,
+                                     nodemap_size, L1_CACHE_BYTES);
        if (nodemap_addr == -1UL) {
                printk(KERN_ERR
                       "NUMA: Unable to allocate Memory to Node hash map\n");
                nodemap_addr = nodemap_size = 0;
                return -1;
        }
-       pad_addr = (nodemap_addr + pad) & ~pad;
-       memnodemap = phys_to_virt(pad_addr);
+       memnodemap = phys_to_virt(nodemap_addr);
        reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
 
        printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
               nodemap_addr, nodemap_addr + nodemap_size);
        return 0;
}
 
 static void * __init early_node_mem(int nodeid, unsigned long start,
-                                   unsigned long end, unsigned long size)
+                                   unsigned long end, unsigned long size,
+                                   unsigned long align)
 {
-       unsigned long mem = find_e820_area(start, end, size);
+       unsigned long mem = find_e820_area(start, end, size, align);
        void *ptr;
 
-       if (mem != -1L)
+       if (mem != -1L) {
+               mem = round_up(mem, align);
                return __va(mem);
-       ptr = __alloc_bootmem_nopanic(size,
-                               SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
+       }
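+       /* e820 had no room; fall back to bootmem with the same alignment */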
+       ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
        if (ptr == NULL) {
                printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
                       size, nodeid);
                return NULL;
        }
        return ptr;
}

        start_pfn = start >> PAGE_SHIFT;
        end_pfn = end >> PAGE_SHIFT;
 
-       node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
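+       /* cache-line alignment is sufficient for the pgdat itself */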
+       node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
+                                          SMP_CACHE_BYTES);
        if (node_data[nodeid] == NULL)
                return;
        nodedata_phys = __pa(node_data[nodeid]);
        /* Find a place for the bootmem map */
        bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
        bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+       /*
+        * SMP_CACHE_BYTES could be enough, but init_bootmem_node
+        * wants the bootmap aligned to PAGE_SIZE.
+        */
        bootmap = early_node_mem(nodeid, bootmap_start, end,
-                                       bootmap_pages<<PAGE_SHIFT);
+                                bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
        if (bootmap == NULL)  {
                if (nodedata_phys < start || nodedata_phys >= end)
                        free_bootmem((unsigned long)node_data[nodeid],