/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
5 #include <linux/kernel.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/bootmem.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/module.h>
13 #include <linux/nodemask.h>
16 #include <asm/proto.h>
/* Per-node node descriptor pointers and their boot-time allocator state. */
25 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
26 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
/* Physical-address -> node-id hash map (filled by compute_hash_shift()). */
28 struct memnode memnode;
/*
 * CPU -> node map; every entry starts as NUMA_NO_NODE until assigned by
 * numa_set_node().
 * NOTE(review): the closing "};" of this initializer (and the one below)
 * is missing from this excerpt — confirm against the full file.
 */
30 unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
31 [0 ... NR_CPUS-1] = NUMA_NO_NODE
/* Local-APIC-id -> node map, presumably filled from firmware tables — TODO confirm. */
33 unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
34 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
/* Reverse map: which CPUs belong to each node (see numa_add_cpu()). */
36 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
/* Set by the "numa=off" boot option (see numa_setup()). */
38 int numa_off __initdata;
/* Physical address/size of the allocated memnodemap area (see
 * allocate_cachealigned_memnodemap()). */
39 unsigned long __initdata nodemap_addr;
40 unsigned long __initdata nodemap_size;
/*
 * Given a shift value, try to populate memnodemap[] so that every physical
 * address hashes to its owning node id.
 *
 * Returns (per the original header comment):
 *   0  if memnodemap[] is too small (shift too small)
 *  -1  if nodes overlap or RAM is lost (shift too big)
 * (success is signalled by a different value — the caller in
 * compute_hash_shift() tests for != 1.)
 *
 * NOTE(review): this excerpt is missing lines — the return type line,
 * local declarations (e.g. the loop index), the per-node end-address
 * computation, the return statements and closing braces. Do not edit
 * from this view; verify against the full source.
 */
51 populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
55 unsigned long addr, end;
/* 0xff marks "no node" in every slot before the nodes are filled in. */
57 memset(memnodemap, 0xff, memnodemapsize);
58 for (i = 0; i < numnodes; i++) {
59 addr = nodes[i].start;
/* An end address hashing past the map means the map/shift is too small. */
63 if ((end >> shift) >= memnodemapsize)
/* A slot already claimed by another node means the shift is too big. */
66 if (memnodemap[addr >> shift] != 0xff)
68 memnodemap[addr >> shift] = i;
69 addr += (1UL << shift);
/*
 * Allocate the physaddr->node hash map, L1-cache-line aligned.
 * Small maps live in memnode.embedded_map; larger ones are carved out of
 * the e820 memory map below end_pfn.
 *
 * NOTE(review): this excerpt is missing lines — the opening brace, the
 * trailing argument(s) of the find_e820_area() call, the return
 * statements and the closing brace. In particular, pad_addr is consumed
 * by find_e820_area() before any visible assignment; a line that
 * initializes it appears to have been lost. Verify against the full file.
 */
76 static int __init allocate_cachealigned_memnodemap(void)
78 unsigned long pad, pad_addr;
/* Default to the small embedded map inside struct memnode. */
80 memnodemap = memnode.embedded_map;
/* Fits in the embedded map — no separate allocation needed. */
81 if (memnodemapsize <= 48)
/* Over-allocate by one cache line so the map can be aligned below. */
84 pad = L1_CACHE_BYTES - 1;
86 nodemap_size = pad + memnodemapsize;
87 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
/* find_e820_area() signals failure with -1UL. */
89 if (nodemap_addr == -1UL) {
91 "NUMA: Unable to allocate Memory to Node hash map\n");
92 nodemap_addr = nodemap_size = 0;
/* Round up within the padded region to the cache-line boundary. */
95 pad_addr = (nodemap_addr + pad) & ~pad;
96 memnodemap = phys_to_virt(pad_addr);
98 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
99 nodemap_addr, nodemap_addr + nodemap_size);
/*
 * The LSB of all start and end addresses in the node map is the value of
 * the maximum possible shift: OR-ing every boundary into 'bitfield' and
 * taking its first set bit gives the largest granule that still separates
 * all nodes.
 *
 * NOTE(review): this excerpt is missing lines — the return type line,
 * the loop body that accumulates 'end'/'bitfield'/'memtop'/'nodes_used',
 * the return statement and closing braces. Verify against the full file.
 */
108 extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
110 int i, nodes_used = 0;
111 unsigned long start, end;
112 unsigned long bitfield = 0, memtop = 0;
114 for (i = 0; i < numnodes; i++) {
115 start = nodes[i].start;
/* First set bit of the combined boundaries bounds the usable shift. */
127 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
/* Size the hash map so the top of memory still hashes in range. */
128 memnodemapsize = (memtop >> i)+1;
/*
 * Compute the memnode hash shift for the given node layout, allocate the
 * hash map and populate it. Presumably returns the shift on success and
 * a negative value on failure (callers test "< 0") — TODO confirm, as
 * the local declarations and return statements are missing from this
 * excerpt.
 */
132 int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
136 shift = extract_lsb_from_nodes(nodes, numnodes);
137 if (allocate_cachealigned_memnodemap())
139 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
/* populate_memnodemap() returning anything but 1 means the chosen
 * shift cannot represent this memory layout. */
142 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
144 "Your memory is not aligned you need to rebuild your kernel "
145 "with a bigger NODEMAPSIZE shift=%d\n",
152 #ifdef CONFIG_SPARSEMEM
/*
 * Early (boot-time) pfn -> node-id lookup via the phys->nid hash map.
 * NOTE(review): braces and the matching #endif are missing from this
 * excerpt.
 */
153 int early_pfn_to_nid(unsigned long pfn)
155 return phys_to_nid(pfn << PAGE_SHIFT);
/*
 * Boot-time allocation of 'size' bytes for node 'nodeid': first try to
 * carve the memory out of the node's own [start, end) range via the e820
 * map, then fall back to generic bootmem above MAX_DMA_ADDRESS.
 *
 * NOTE(review): this excerpt is missing lines — the return type line,
 * the 'size' parameter line, the e820 success path, the NULL-check
 * around the fallback, and the return statements. Verify against the
 * full file before editing.
 */
160 early_node_mem(int nodeid, unsigned long start, unsigned long end,
163 unsigned long mem = find_e820_area(start, end, size);
/* Fallback: cache-aligned bootmem, anywhere above MAX_DMA_ADDRESS. */
167 ptr = __alloc_bootmem_nopanic(size,
168 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
170 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
/*
 * Initialize the bootmem allocator for one node: allocate and zero the
 * node's pg_data_t, place the bootmem bitmap just after it, register the
 * node's active regions with bootmem, and reserve the structures' own
 * memory so they are not handed out.
 *
 * NOTE(review): this excerpt is missing lines — the 'void *bootmap'
 * declaration, several closing braces/returns, the end_pfn argument of
 * init_bootmem_node(), and the #endif matching CONFIG_ACPI_NUMA.
 */
178 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
180 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
181 unsigned long nodedata_phys;
183 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
/* Node ranges must start on a ZONE_ALIGN boundary. */
185 start = round_up(start, ZONE_ALIGN);
187 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
189 start_pfn = start >> PAGE_SHIFT;
190 end_pfn = end >> PAGE_SHIFT;
/* Allocate the node descriptor itself, preferably inside the node. */
192 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
193 if (node_data[nodeid] == NULL)
195 nodedata_phys = __pa(node_data[nodeid]);
197 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
198 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
199 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
200 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
/* Find a place for the bootmem map, just past the node descriptor. */
203 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
204 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
205 bootmap = early_node_mem(nodeid, bootmap_start, end,
206 bootmap_pages<<PAGE_SHIFT);
/* On failure, undo the pg_data_t allocation and mark the node unusable. */
207 if (bootmap == NULL) {
/* Only free if the descriptor came from the bootmem fallback,
 * i.e. it landed outside this node's own range. */
208 if (nodedata_phys < start || nodedata_phys >= end)
209 free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
210 node_data[nodeid] = NULL;
213 bootmap_start = __pa(bootmap);
214 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
216 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
217 bootmap_start >> PAGE_SHIFT,
/* Hand the node's usable (e820-active) pages to bootmem. */
220 free_bootmem_with_active_regions(nodeid, end);
/* Reserve the descriptor and the bitmap so bootmem never hands them out. */
222 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
223 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
224 #ifdef CONFIG_ACPI_NUMA
225 srat_reserve_add_area(nodeid);
227 node_set_online(nodeid);
/*
 * Initialize the final allocator state for a node's zones: allocate the
 * node's mem_map (struct page array), preferring memory near the end of
 * the node so precious memory below 4GB is left for DMA users.
 *
 * NOTE(review): this excerpt is missing lines — the final argument(s)
 * of the __alloc_bootmem_core() call, the matching #endif for
 * CONFIG_FLAT_NODE_MEM_MAP, and the closing brace.
 */
231 void __init setup_node_zones(int nodeid)
233 unsigned long start_pfn, end_pfn, memmapsize, limit;
235 start_pfn = node_start_pfn(nodeid);
236 end_pfn = node_end_pfn(nodeid);
238 Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
239 nodeid, start_pfn, end_pfn);
/* Try to allocate mem_map at the node's end to not fill up precious
 * <4GB memory. */
243 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
244 limit = end_pfn << PAGE_SHIFT;
245 #ifdef CONFIG_FLAT_NODE_MEM_MAP
246 NODE_DATA(nodeid)->node_mem_map =
247 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
248 memmapsize, SMP_CACHE_BYTES,
249 round_down(limit - memmapsize, PAGE_SIZE),
/*
 * Give every CPU with no node assignment (NUMA_NO_NODE) a node, by
 * round-robining over the online nodes.
 * NOTE(review): the opening brace, local declarations of 'rr'/'i', the
 * 'continue' under the NUMA_NO_NODE check, and the closing braces are
 * missing from this excerpt.
 */
254 void __init numa_init_array(void)
/* There are unfortunately some poorly designed mainboards around
 * that only connect memory to a single CPU. This breaks the 1:1
 * cpu->node mapping. To avoid this fill in the mapping for all
 * possible CPUs, as the number of CPUs is not known yet.
 * We round robin the existing nodes. */
262 rr = first_node(node_online_map);
263 for (i = 0; i < NR_CPUS; i++) {
/* Already assigned (e.g. by firmware tables) — leave it alone. */
264 if (cpu_to_node[i] != NUMA_NO_NODE)
266 numa_set_node(i, rr);
/* Wrap the round-robin cursor back to the first online node. */
267 rr = next_node(rr, node_online_map);
268 if (rr == MAX_NUMNODES)
269 rr = first_node(node_online_map);
274 #ifdef CONFIG_NUMA_EMU
/* The "numa=fake=..." option string, stashed by numa_setup(). */
276 char *cmdline __initdata;
/*
 * Sets up nid to range from addr to addr + size. If the end boundary is
 * greater than max_addr, then max_addr is used instead. The return value
 * is 0 if there is additional memory left for allocation past addr and
 * -1 otherwise. addr is adjusted to be at the end of the node.
 *
 * NOTE(review): the opening brace, the *addr += size / clamping logic,
 * the return statements and closing brace are missing from this excerpt.
 */
284 static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
285 u64 size, u64 max_addr)
288 nodes[nid].start = *addr;
/* Clamped at max_addr: no memory remains past this node. */
290 if (*addr >= max_addr) {
294 nodes[nid].end = *addr;
295 node_set(nid, node_possible_map);
296 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
297 nodes[nid].start, nodes[nid].end,
298 (nodes[nid].end - nodes[nid].start) >> 20);
/*
 * Splits num_nodes nodes up equally starting at node_start. The return
 * value is the number of nodes split up and addr is adjusted to be at
 * the end of the last node allocated.
 *
 * NOTE(review): this excerpt is missing lines — the parameter carrying
 * num_nodes, local declarations ('size', 'big', 'i'), the divisor of
 * the size computation, several error-path returns and closing braces.
 * Verify against the full file before editing.
 */
307 static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
308 u64 max_addr, int node_start,
317 if (num_nodes > MAX_NUMNODES)
318 num_nodes = MAX_NUMNODES;
/* Per-node share of the usable (non-hole) memory in [addr, max_addr). */
319 size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
/*
 * Calculate the number of big nodes that can be allocated as a result
 * of consolidating the leftovers.
 */
325 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
/* Round down to nearest FAKE_NODE_MIN_SIZE. */
329 size &= FAKE_NODE_MIN_HASH_MASK;
331 printk(KERN_ERR "Not enough memory for each node. "
332 "NUMA emulation disabled.\n");
336 for (i = node_start; i < num_nodes + node_start; i++) {
337 u64 end = *addr + size;
/* "Big" nodes get one extra FAKE_NODE_MIN_SIZE granule. */
339 end += FAKE_NODE_MIN_SIZE;
/*
 * The final node can have the remaining system RAM. Other
 * nodes receive roughly the same amount of available pages.
 */
344 if (i == num_nodes + node_start - 1)
/* Grow 'end' until the node holds its share of non-hole memory. */
347 while (end - *addr - e820_hole_size(*addr, end) <
349 end += FAKE_NODE_MIN_SIZE;
350 if (end > max_addr) {
355 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
/* Number of nodes actually created. */
358 return i - node_start + 1;
/*
 * Splits the remaining system RAM into chunks of size. The remaining
 * memory is always assigned to a final node and can be asymmetric.
 * Returns the number of nodes created.
 * NOTE(review): the opening brace, the 'i = node_start' initialization
 * and the loop body are missing from this excerpt.
 */
366 static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
367 u64 max_addr, int node_start, u64 size)
/* Command-line size is in MB; round down to the emulation granule. */
370 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
/* Keep carving nodes until setup_node_range() reports no memory left. */
371 while (!setup_node_range(i++, nodes, addr, size, max_addr))
373 return i - node_start;
/*
 * Sets up the system RAM area from start_pfn to end_pfn according to the
 * numa=fake command-line option: either "fake=N" (N equal nodes) or a
 * list of coefficient/size terms ("A*B,C,...").
 *
 * NOTE(review): this excerpt is heavily truncated — local declarations
 * ('num_nodes', 'coeff_flag', 'coeff', 'num', 'i', 'size'), several
 * branch bodies, 'break'/'goto' statements, returns and closing braces
 * are missing. Treat the control flow below as indicative only and
 * verify against the full file.
 */
380 static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
382 struct bootnode nodes[MAX_NUMNODES];
383 u64 addr = start_pfn << PAGE_SHIFT;
384 u64 max_addr = end_pfn << PAGE_SHIFT;
392 memset(&nodes, 0, sizeof(nodes));
/*
 * If the numa=fake command-line is just a single number N, split the
 * system RAM into N fake nodes.
 */
397 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
398 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
399 simple_strtol(cmdline, NULL, 0));
/* Parse the command line: digits accumulate into 'num'; '*' marks a
 * coefficient; ',' or end-of-string terminates a term. */
406 for (coeff_flag = 0; ; cmdline++) {
407 if (*cmdline && isdigit(*cmdline)) {
408 num = num * 10 + *cmdline - '0';
411 if (*cmdline == '*') {
416 if (!*cmdline || *cmdline == ',') {
/*
 * Round down to the nearest FAKE_NODE_MIN_SIZE.
 * Command-line coefficients are in megabytes.
 */
423 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
/* Create 'coeff' nodes of this size. */
425 for (i = 0; i < coeff; i++, num_nodes++)
426 if (setup_node_range(num_nodes, nodes,
427 &addr, size, max_addr) < 0)
/* Fill remainder of system RAM, if appropriate. */
440 if (addr < max_addr) {
441 if (coeff_flag && coeff < 0) {
/* Split remaining nodes into num-sized chunks */
443 num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
/* Last parsed character decides how the remainder is treated. */
447 switch (*(cmdline - 1)) {
/* Split remaining nodes into coeff chunks */
452 num_nodes += split_nodes_equally(nodes, &addr, max_addr,
/* Do not allocate remaining system RAM */
/* Give one final node */
460 setup_node_range(num_nodes, nodes, &addr,
461 max_addr - addr, max_addr);
466 memnode_shift = compute_hash_shift(nodes, num_nodes);
467 if (memnode_shift < 0) {
469 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
/*
 * We need to vacate all active ranges that may have been registered by
 * earlier firmware-based setup before registering the fake ones.
 */
478 remove_all_active_ranges();
479 for_each_node_mask(i, node_possible_map) {
480 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
481 nodes[i].end >> PAGE_SHIFT);
482 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
/* Let ACPI/SRAT bookkeeping track the fake layout too. */
484 acpi_fake_nodes(nodes, num_nodes);
488 #endif /* CONFIG_NUMA_EMU */
/*
 * Top-level NUMA memory init: try, in order, NUMA emulation, ACPI/SRAT
 * node scan, then K8 northbridge scan; if all fail (or numa=off), fall
 * back to a single dummy node covering all memory.
 *
 * NOTE(review): this excerpt is missing lines — local declarations, the
 * 'return' after each successful probe, the memnode_shift/memnodemap
 * dummy setup, the cpu loop body (numa_set_node(i, 0)), the #endif's,
 * and closing braces. Each nodes_clear() below presumably precedes a
 * retry after a failed probe — TODO confirm against the full file.
 */
490 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
494 nodes_clear(node_possible_map);
496 #ifdef CONFIG_NUMA_EMU
497 if (cmdline && !numa_emulation(start_pfn, end_pfn))
499 nodes_clear(node_possible_map);
502 #ifdef CONFIG_ACPI_NUMA
503 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
504 end_pfn << PAGE_SHIFT))
506 nodes_clear(node_possible_map);
509 #ifdef CONFIG_K8_NUMA
510 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
512 nodes_clear(node_possible_map);
/* All probes failed (or NUMA disabled): fall back to one fake node. */
514 printk(KERN_INFO "%s\n",
515 numa_off ? "NUMA turned off" : "No NUMA configuration found");
517 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
518 start_pfn << PAGE_SHIFT,
519 end_pfn << PAGE_SHIFT);
/* setup dummy node covering all memory */
522 memnodemap = memnode.embedded_map;
524 nodes_clear(node_online_map);
526 node_set(0, node_possible_map);
/* Assign every CPU to node 0 (loop body missing from this excerpt). */
527 for (i = 0; i < NR_CPUS; i++)
529 node_to_cpumask[0] = cpumask_of_cpu(0);
530 e820_register_active_regions(0, start_pfn, end_pfn);
531 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
/* Record 'cpu' in its node's cpumask (node looked up via cpu_to_node()). */
534 __cpuinit void numa_add_cpu(int cpu)
536 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
/* Bind 'cpu' to 'node' in both the per-CPU PDA and the cpu_to_node map. */
539 void __cpuinit numa_set_node(int cpu, int node)
541 cpu_pda(cpu)->nodenumber = node;
542 cpu_to_node[cpu] = node;
/*
 * Release every online node's bootmem to the page allocator; accumulates
 * (and presumably returns — the return line is missing from this
 * excerpt) the total number of pages freed.
 */
545 unsigned long __init numa_free_all_bootmem(void)
548 unsigned long pages = 0;
549 for_each_online_node(i) {
550 pages += free_all_bootmem_node(NODE_DATA(i));
/*
 * Final paging setup: record the per-zone pfn limits, mark sparse
 * memory present, run per-node zone setup (loop body missing from this
 * excerpt — presumably setup_node_zones()), and hand off to the core
 * zone initializer.
 */
555 void __init paging_init(void)
558 unsigned long max_zone_pfns[MAX_NR_ZONES];
559 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
560 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
561 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
562 max_zone_pfns[ZONE_NORMAL] = end_pfn;
564 sparse_memory_present_with_active_regions(MAX_NUMNODES);
567 for_each_online_node(i) {
571 free_area_init_nodes(max_zone_pfns);
/*
 * Parse the "numa=" boot option: "off", "fake=..." (emulation, stashes
 * the string in 'cmdline'), "noacpi", and "hotadd=<percent>".
 * NOTE(review): the branch bodies for "off"/"fake="/"noacpi" and the
 * return statement are missing from this excerpt.
 */
574 static __init int numa_setup(char *opt)
578 if (!strncmp(opt,"off",3))
580 #ifdef CONFIG_NUMA_EMU
581 if (!strncmp(opt, "fake=", 5))
584 #ifdef CONFIG_ACPI_NUMA
585 if (!strncmp(opt,"noacpi",6))
587 if (!strncmp(opt,"hotadd=", 7))
588 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
/* Register as an early_param so it runs before NUMA init. */
593 early_param("numa", numa_setup);
/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA emulation
 * and the faking-node case (when running a kernel compiled for NUMA on
 * a non-NUMA box), which is OK as cpu_to_node[] is already initialized
 * in a round robin manner at numa_init_array, prior to this call, and
 * that initialization is good enough for the fake NUMA cases.
 *
 * NOTE(review): the opening brace, local 'i' declaration, the
 * 'continue' statements under the two checks, and closing braces are
 * missing from this excerpt.
 */
607 void __init init_cpu_to_node(void)
610 for (i = 0; i < NR_CPUS; i++) {
611 u8 apicid = x86_cpu_to_apicid[i];
/* Skip CPUs with no valid APIC id or no firmware node mapping. */
612 if (apicid == BAD_APICID)
614 if (apicid_to_node[apicid] == NUMA_NO_NODE)
616 numa_set_node(i,apicid_to_node[apicid]);
/* Export the NUMA topology tables for use by modules. */
620 EXPORT_SYMBOL(cpu_to_node);
621 EXPORT_SYMBOL(node_to_cpumask);
622 EXPORT_SYMBOL(memnode);
623 EXPORT_SYMBOL(node_data);
625 #ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could be all tuned by pre caching more state.
 *
 * NOTE(review): the opening brace, the returns under each check, the
 * validity test on 'nid', closing brace and matching #endif are missing
 * from this excerpt.
 */
633 int pfn_valid(unsigned long pfn)
/* Beyond physical memory: not a valid pfn. */
636 if (pfn >= num_physpages)
638 nid = pfn_to_nid(pfn);
/* Valid only when the pfn falls inside its node's spanned range. */
641 return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
643 EXPORT_SYMBOL(pfn_valid);