/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct pglist_data *node_data[MAX_NUMNODES];
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

int memnode_shift;
u8 memnodemap[NODEMAPSIZE];

unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
cpumask_t node_to_cpumask[MAX_NUMNODES];

int numa_off __initdata;
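
/*
 * Rough picture of how these tables are consumed elsewhere (see phys_to_nid()
 * and the cpu_to_node()/node_to_cpumask() users):
 *
 *	nid  = memnodemap[paddr >> memnode_shift];
 *	nid  = cpu_to_node[cpu];
 *	mask = node_to_cpumask[nid];
 *
 * compute_hash_shift() below is what fills in memnodemap and picks the shift.
 */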

int __init compute_hash_shift(struct node *nodes, int numnodes)
{
	int i;
	int shift = 20;
	unsigned long addr, maxend = 0;

	for (i = 0; i < numnodes; i++)
		if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
			maxend = nodes[i].end;
	while ((1UL << shift) < (maxend / NODEMAPSIZE))
		shift++;
	printk(KERN_DEBUG "Using %d for the hash shift. Max address is %lx\n",
	       shift, maxend);
	memset(memnodemap, 0xff, sizeof(*memnodemap) * NODEMAPSIZE);
	for (i = 0; i < numnodes; i++) {
		if (nodes[i].start == nodes[i].end)
			continue;
		for (addr = nodes[i].start;
		     addr < nodes[i].end;
		     addr += (1UL << shift)) {
			if (memnodemap[addr >> shift] != 0xff) {
				printk(KERN_INFO
				       "Your memory is not aligned; rebuild your kernel "
				       "with a bigger NODEMAPSIZE. shift=%d address=%lx\n",
				       shift, addr);
				return -1;
			}
			memnodemap[addr >> shift] = i;
		}
	}
	return shift;
}
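
/*
 * Rough sketch of the sizing above: memnodemap[] has NODEMAPSIZE slots, so
 * shift is bumped until a single (1UL << shift) chunk is at least
 * maxend / NODEMAPSIZE bytes and the whole address range fits in the table.
 * A node boundary that is not a multiple of the chunk size can make two nodes
 * claim the same slot, which is the "memory is not aligned" -1 failure above.
 */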

#ifdef CONFIG_SPARSEMEM
int early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif

/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
	unsigned long nodedata_phys;
	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

	start = round_up(start, ZONE_ALIGN);

	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

	start_pfn = start >> PAGE_SHIFT;
	end_pfn = end >> PAGE_SHIFT;

	memory_present(nodeid, start_pfn, end_pfn);
	nodedata_phys = find_e820_area(start, end, pgdat_size);
	if (nodedata_phys == -1L)
		panic("Cannot find memory pgdat in node %d\n", nodeid);

	Dprintk("nodedata_phys %lx\n", nodedata_phys);

	node_data[nodeid] = phys_to_virt(nodedata_phys);
	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

	/* Find a place for the bootmem map */
	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
	bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages << PAGE_SHIFT);
	if (bootmap_start == -1L)
		panic("Not enough contiguous space for bootmap on node %d", nodeid);
	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					 bootmap_start >> PAGE_SHIFT,
					 start_pfn, end_pfn);

	e820_bootmem_free(NODE_DATA(nodeid), start, end);

	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages << PAGE_SHIFT);
	node_set_online(nodeid);
}
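
/*
 * Resulting layout inside each node, roughly (the control structures usually
 * end up at the front of the node):
 *
 *	node start (rounded up to ZONE_ALIGN)
 *	  pg_data_t        (pgdat_size bytes, reserved above)
 *	  bootmem bitmap   (bootmap_pages pages, reserved above)
 *	  remaining node memory, handed to the bootmem allocator
 *
 * Both control structures are carved out of the node's own memory via
 * find_e820_area(), so they stay node local.
 */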

/* Initialize the final (buddy) allocator zones for a node */
void __init setup_node_zones(int nodeid)
{
	unsigned long start_pfn, end_pfn;
	unsigned long zones[MAX_NR_ZONES];
	unsigned long dma_end_pfn;

	memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);

	start_pfn = node_start_pfn(nodeid);
	end_pfn = node_end_pfn(nodeid);

	Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);

	/* Nodes that start above the DMA limit get a zero length ZONE_DMA. */
	dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
	if (start_pfn < dma_end_pfn) {
		zones[ZONE_DMA] = dma_end_pfn - start_pfn;
		zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
	} else {
		zones[ZONE_NORMAL] = end_pfn - start_pfn;
	}

	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
			    start_pfn, NULL);	/* no per-zone hole sizes here */
}
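
/*
 * Example split, assuming the usual 16MB ISA DMA limit and 4KB pages
 * (so dma_end_pfn == 0x1000): a node covering 0-4GB gets
 * zones[ZONE_DMA] = 0x1000 pages and zones[ZONE_NORMAL] = the rest,
 * while a node that starts above 16MB is pure ZONE_NORMAL.
 */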

void __init numa_init_array(void)
{
	int rr, i;

	/* There are unfortunately some poorly designed mainboards around
	   that only connect memory to a single CPU. This breaks the 1:1
	   cpu->node mapping. To avoid this fill in the mapping for all
	   possible CPUs, since the actual number of CPUs is not known yet.
	   Round robin over the existing nodes. */
	rr = first_node(node_online_map);
	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node[i] != NUMA_NO_NODE)
			continue;
		cpu_to_node[i] = rr;
		rr = next_node(rr, node_online_map);
		if (rr == MAX_NUMNODES)
			rr = first_node(node_online_map);
	}

	set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
}
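
/*
 * Example (hypothetical numbers): with nodes 0 and 1 online and four possible
 * CPUs that got no firmware mapping, the loop above leaves
 *	cpu_to_node[] = { 0, 1, 0, 1 }
 * so the CPUs are at least spread across nodes instead of all sitting on
 * NUMA_NO_NODE.
 */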

#ifdef CONFIG_NUMA_EMU
int numa_fake __initdata = 0;

/* NUMA emulation: split memory into numa_fake equally sized pseudo-nodes. */
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn) << PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function: round the node size down
	   to a power of two. */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes, 0, sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn << PAGE_SHIFT) + i*sz;
		if (i == numa_fake-1)
			sz = (end_pfn << PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		if (i != numa_fake-1)
			nodes[i].end--;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
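
/*
 * Usage sketch: booting with "numa=fake=4" (parsed in numa_setup() below)
 * makes numa_initmem_init() call numa_emulation(), which carves the flat
 * memory map into four roughly equal, power-of-two sized pseudo-nodes (the
 * last one absorbs the remainder) and brings them up with
 * setup_node_bootmem() just like real SRAT/K8 discovered nodes.
 */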

void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

#ifdef CONFIG_NUMA_EMU
	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
		return;
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
					  end_pfn << PAGE_SHIFT))
		return;
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT))
		return;
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       start_pfn << PAGE_SHIFT,
	       end_pfn << PAGE_SHIFT);
	/* Set up a dummy node covering all memory. */
	memnode_shift = 63;	/* every physical address hashes to slot 0 */
	memnodemap[0] = 0;
	nodes_clear(node_online_map);
	node_set_online(0);
	for (i = 0; i < NR_CPUS; i++)
		cpu_to_node[i] = 0;
	node_to_cpumask[0] = cpumask_of_cpu(0);
	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
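
/*
 * Discovery order above: NUMA emulation (numa=fake=N) first, then the ACPI
 * SRAT scan, then the K8 northbridge scan.  A scanner that returns 0 is taken
 * to have set up its nodes already, so we return; only if all of them fail,
 * or numa=off was given, is the single all-of-memory node 0 installed.
 */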

__cpuinit void numa_add_cpu(int cpu)
{
	/* The boot processor (BP) is initialized elsewhere. */
	if (cpu)
		set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

unsigned long __init numa_free_all_bootmem(void)
{
	int i;
	unsigned long pages = 0;

	for_each_online_node(i) {
		pages += free_all_bootmem_node(NODE_DATA(i));
	}
	return pages;
}

void __init paging_init(void)
{
	int i;

	for_each_online_node(i) {
		setup_node_zones(i);
	}
}

__init int numa_setup(char *opt)
{
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
#ifdef CONFIG_NUMA_EMU
	if (!strncmp(opt, "fake=", 5)) {
		numa_fake = simple_strtoul(opt+5, NULL, 0);
		if (numa_fake >= MAX_NUMNODES)
			numa_fake = MAX_NUMNODES;
	}
#endif
#ifdef CONFIG_ACPI_NUMA
	if (!strncmp(opt, "noacpi", 6))
		acpi_numa = -1;
#endif
	return 1;
}
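
/*
 * Recognized forms of the early option handled above (the caller hands in
 * the text after the "numa=" prefix): "numa=off", "numa=fake=<N>" when
 * CONFIG_NUMA_EMU is set, and "numa=noacpi" when CONFIG_ACPI_NUMA is set.
 */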

EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);