/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

struct memnode memnode;
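
/*
 * Note (editorial): memnode_shift and memnodemap[], used throughout this
 * file, are the hash shift and per-chunk node lookup table behind
 * phys_to_nid(); in this kernel's headers they are assumed to alias the
 * fields of the memnode structure above.
 */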
unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

int numa_off __initdata;
/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost RAM (shift too big)
 */
static int __init
populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
{
	int i;
	int res = -1;
	unsigned long addr, end;

	if (shift >= 64)
		return -1;
	memset(memnodemap, 0xff, sizeof(memnodemap));
	for (i = 0; i < numnodes; i++) {
		addr = nodes[i].start;
		end = nodes[i].end;
		if (addr >= end)
			continue;
		if ((end >> shift) >= NODEMAPSIZE)
			return 0;
		do {
			if (memnodemap[addr >> shift] != 0xff)
				return -1;
			memnodemap[addr >> shift] = i;
			addr += (1UL << shift);
		} while (addr < end);
		res = 1;
	}
	return res;
}
int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
{
	int shift = 20;

	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
		shift++;

	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
		shift);

	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
		printk(KERN_INFO
			"Your memory is not aligned; you need to rebuild your "
			"kernel with a bigger NODEMAPSIZE, shift=%d\n",
			shift);
		return -1;
	}
	return shift;
}
#ifdef CONFIG_SPARSEMEM
int early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif
static void * __init
early_node_mem(int nodeid, unsigned long start, unsigned long end,
	       unsigned long size)
{
	unsigned long mem = find_e820_area(start, end, size);
	void *ptr;
	if (mem != -1L)
		return __va(mem);
	ptr = __alloc_bootmem_nopanic(size,
				SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
	if (ptr == NULL) {
		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
			size, nodeid);
		return NULL;
	}
	return ptr;
}
/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
	unsigned long nodedata_phys;
	void *bootmap;
	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

	start = round_up(start, ZONE_ALIGN);

	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

	start_pfn = start >> PAGE_SHIFT;
	end_pfn = end >> PAGE_SHIFT;

	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
	if (node_data[nodeid] == NULL)
		return;
	nodedata_phys = __pa(node_data[nodeid]);

	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

	/* Find a place for the bootmem map */
	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
	bootmap = early_node_mem(nodeid, bootmap_start, end,
				 bootmap_pages<<PAGE_SHIFT);
	if (bootmap == NULL) {
		if (nodedata_phys < start || nodedata_phys >= end)
			free_bootmem((unsigned long)node_data[nodeid], pgdat_size);
		node_data[nodeid] = NULL;
		return;
	}
	bootmap_start = __pa(bootmap);
	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					 bootmap_start >> PAGE_SHIFT,
					 start_pfn, end_pfn);

	free_bootmem_with_active_regions(nodeid, end);

	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
#ifdef CONFIG_ACPI_NUMA
	srat_reserve_add_area(nodeid);
#endif
	node_set_online(nodeid);
}
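
/*
 * Rough node layout after setup_node_bootmem() (editorial sketch,
 * assuming both early allocations land inside the node's own range):
 *
 *   node start                                                 node end
 *   | pg_data_t | bootmem bitmap (page aligned) | ... node memory ... |
 *
 * Both early allocations are then re-reserved in the node's bootmem
 * allocator, so the bitmap never hands them out again.
 */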
/* Initialize final allocator for a zone */
void __init setup_node_zones(int nodeid)
{
	unsigned long start_pfn, end_pfn, memmapsize, limit;

	start_pfn = node_start_pfn(nodeid);
	end_pfn = node_end_pfn(nodeid);

	Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
		nodeid, start_pfn, end_pfn);

	/* Try to allocate mem_map at the end of the node, so that it
	   does not fill up precious <4GB memory. */
	memmapsize = sizeof(struct page) * (end_pfn - start_pfn);
	limit = end_pfn << PAGE_SHIFT;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
	NODE_DATA(nodeid)->node_mem_map =
		__alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
				memmapsize, SMP_CACHE_BYTES,
				round_down(limit - memmapsize, PAGE_SIZE),
				limit);
#endif
}
void __init numa_init_array(void)
{
	int rr, i;
	/* There are unfortunately some poorly designed mainboards around
	   that only connect memory to a single CPU. This breaks the 1:1
	   cpu->node mapping. To avoid this, fill in the mapping for all
	   possible CPUs, as the number of CPUs is not known yet. We
	   round-robin the existing nodes. */
	rr = first_node(node_online_map);
	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node[i] != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node(rr, node_online_map);
		if (rr == MAX_NUMNODES)
			rr = first_node(node_online_map);
	}
}
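
/*
 * Example (editorial): with nodes 0 and 1 online and four CPUs still
 * unmapped, the loop above assigns CPU0->node0, CPU1->node1, CPU2->node0,
 * CPU3->node1, wrapping back to the first online node as needed.
 */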
#ifdef CONFIG_NUMA_EMU
int numa_fake __initdata = 0;

/* NUMA emulation */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct bootnode nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Round the node size down to a power of two;
	   kludge needed for the hash function. */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk(KERN_ERR "NUMA emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes, 0, sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i) {
		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
					     nodes[i].end >> PAGE_SHIFT);
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	}
	numa_init_array();
	return 0;
}
#endif
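
/*
 * Worked example (editorial): booting with numa=fake=4 on a 6GB machine
 * gives sz = 1.5GB, which the power-of-two kludge above rounds down to
 * 1GB. Nodes 0-2 then cover 1GB each and the last node absorbs the
 * remaining 3GB, so compute_hash_shift() can still find a working shift.
 */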
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

#ifdef CONFIG_NUMA_EMU
	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
		return;
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
					  end_pfn << PAGE_SHIFT))
		return;
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
		return;
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       start_pfn << PAGE_SHIFT,
	       end_pfn << PAGE_SHIFT);
	/* Set up a dummy node covering all memory */
	memnode_shift = 63;
	memnodemap[0] = 0;
	nodes_clear(node_online_map);
	node_set_online(0);
	for (i = 0; i < NR_CPUS; i++)
		numa_set_node(i, 0);
	node_to_cpumask[0] = cpumask_of_cpu(0);
	e820_register_active_regions(0, start_pfn, end_pfn);
	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
__cpuinit void numa_add_cpu(int cpu)
{
	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

void __cpuinit numa_set_node(int cpu, int node)
{
	cpu_pda(cpu)->nodenumber = node;
	cpu_to_node[cpu] = node;
}
unsigned long __init numa_free_all_bootmem(void)
{
	int i;
	unsigned long pages = 0;
	for_each_online_node(i) {
		pages += free_all_bootmem_node(NODE_DATA(i));
	}
	return pages;
}
#ifdef CONFIG_SPARSEMEM
static void __init arch_sparse_init(void)
{
	int i;

	for_each_online_node(i)
		memory_present(i, node_start_pfn(i), node_end_pfn(i));

	sparse_init();
}
#else
#define arch_sparse_init() do {} while (0)
#endif
void __init paging_init(void)
{
	int i;
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	arch_sparse_init();

	for_each_online_node(i) {
		setup_node_zones(i);
	}

	free_area_init_nodes(max_zone_pfns);
}
static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
#ifdef CONFIG_NUMA_EMU
	if (!strncmp(opt, "fake=", 5)) {
		numa_fake = simple_strtoul(opt+5, NULL, 0);
		if (numa_fake >= MAX_NUMNODES)
			numa_fake = MAX_NUMNODES;
	}
#endif
#ifdef CONFIG_ACPI_NUMA
	if (!strncmp(opt, "noacpi", 6))
		acpi_numa = -1;
	if (!strncmp(opt, "hotadd=", 7))
		hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
	return 0;
}

early_param("numa", numa_setup);
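
/*
 * Usage (editorial, illustrative kernel command lines matching the
 * parsing above):
 *   numa=off        - disable NUMA detection entirely
 *   numa=fake=4     - emulate 4 NUMA nodes (CONFIG_NUMA_EMU)
 *   numa=noacpi     - ignore the ACPI SRAT (CONFIG_ACPI_NUMA)
 *   numa=hotadd=10  - percentage limit for SRAT hot-add areas
 *                     (CONFIG_ACPI_NUMA)
 */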
/*
 * Set up early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU. This means we
 * skip cpu_to_node[] initialisation for NUMA emulation and the faked-node
 * case (when running a kernel compiled for NUMA on a non-NUMA box), which
 * is fine: cpu_to_node[] has already been initialized in a round-robin
 * manner by numa_init_array() prior to this call, and that initialization
 * is good enough for the fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
	int i;
	for (i = 0; i < NR_CPUS; i++) {
		u8 apicid = x86_cpu_to_apicid[i];
		if (apicid == BAD_APICID)
			continue;
		if (apicid_to_node[apicid] == NUMA_NO_NODE)
			continue;
		numa_set_node(i, apicid_to_node[apicid]);
	}
}
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode);
EXPORT_SYMBOL(node_data);
#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per-node page addresses.
 * These are out of line because they are quite big.
 * They could all be tuned by caching more state.
 */

int pfn_valid(unsigned long pfn)
{
	unsigned nid;
	if (pfn >= num_physpages)
		return 0;
	nid = pfn_to_nid(pfn);
	if (nid == 0xff)
		return 0;
	return pfn >= node_start_pfn(nid) && pfn < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif