/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */
12 #include <linux/kernel.h>
13 #include <linux/acpi.h>
14 #include <linux/mmzone.h>
15 #include <linux/bitmap.h>
16 #include <linux/module.h>
17 #include <linux/topology.h>
18 #include <linux/bootmem.h>
20 #include <asm/proto.h>
24 #if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
25 defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
26 && !defined(CONFIG_MEMORY_HOTPLUG)
27 #define RESERVE_HOTADD 1
/* Parsed SLIT table kept for __node_distance(); NULL if absent or invalid. */
static struct acpi_table_slit *acpi_slit;

/* Proximity domains whose memory ranges have been parsed from the SRAT. */
static nodemask_t nodes_parsed __initdata;
/* Node numbers handed out so far by setup_node(). */
static nodemask_t nodes_found __initdata;
/* Memory range owned by each node, as described by the SRAT. */
static struct bootnode nodes[MAX_NUMNODES] __initdata;
/* Hot-add (hotplug) memory range per node, filled in by reserve_hotadd(). */
static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
/* Nonzero once any hot-add area has been accepted. */
static int found_add_area __initdata;
/* Max percentage of RAM the hot-add mem_map pre-allocation may consume. */
int hotadd_percent __initdata = 10;
/* Proximity domain -> node mapping; 0xff means "no node assigned yet". */
static u8 pxm2node[256] = { [0 ... 255] = 0xff };

/* Too small nodes confuse the VM badly. Usually they result
   in nonworking nodes later on, so enforce a minimum node size. */
#define NODE_MIN_SIZE (4*1024*1024)

static int node_to_pxm(int n);
46 int pxm_to_node(int pxm)
48 if ((unsigned)pxm >= 256)
50 /* Extend 0xff to (int)-1 */
51 return (signed char)pxm2node[pxm];
54 static __init int setup_node(int pxm)
56 unsigned node = pxm2node[pxm];
58 if (nodes_weight(nodes_found) >= MAX_NUMNODES)
60 node = first_unset_node(nodes_found);
61 node_set(node, nodes_found);
67 static __init int conflicting_nodes(unsigned long start, unsigned long end)
70 for_each_node_mask(i, nodes_parsed) {
71 struct bootnode *nd = &nodes[i];
72 if (nd->start == nd->end)
74 if (nd->end > start && nd->start < end)
76 if (nd->end == end && nd->start == start)
82 static __init void cutoff_node(int i, unsigned long start, unsigned long end)
84 struct bootnode *nd = &nodes[i];
89 if (nd->start < start) {
91 if (nd->end < nd->start)
96 if (nd->start > nd->end)
101 static __init void bad_srat(void)
104 printk(KERN_ERR "SRAT: SRAT not used.\n");
106 for (i = 0; i < MAX_LOCAL_APIC; i++)
107 apicid_to_node[i] = NUMA_NO_NODE;
108 for (i = 0; i < MAX_NUMNODES; i++)
109 nodes_add[i].start = nodes[i].end = 0;
112 static __init inline int srat_disabled(void)
114 return numa_off || acpi_numa < 0;
118 * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
119 * up the NUMA heuristics which wants the local node to have a smaller
120 * distance than the others.
121 * Do some quick checks here and only use the SLIT if it passes.
123 static __init int slit_valid(struct acpi_table_slit *slit)
126 int d = slit->localities;
127 for (i = 0; i < d; i++) {
128 for (j = 0; j < d; j++) {
129 u8 val = slit->entry[d*i + j];
133 } else if (val <= 10)
140 /* Callback for SLIT parsing */
141 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
143 if (!slit_valid(slit)) {
144 printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
150 /* Callback for Proximity Domain -> LAPIC mapping */
152 acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
157 if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) { bad_srat();
160 if (pa->flags.enabled == 0)
162 pxm = pa->proximity_domain;
163 node = setup_node(pxm);
165 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
169 apicid_to_node[pa->apic_id] = node;
171 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
172 pxm, pa->apic_id, node);
175 #ifdef RESERVE_HOTADD
177 * Protect against too large hotadd areas that would fill up memory.
179 static int hotadd_enough_memory(struct bootnode *nd)
181 static unsigned long allocated;
182 static unsigned long last_area_end;
183 unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
184 long mem = pages * sizeof(struct page);
186 unsigned long allowed;
187 unsigned long oldpages = pages;
191 allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
192 allowed = (allowed / 100) * hotadd_percent;
193 if (allocated + mem > allowed) {
194 /* Give them at least part of their hotadd memory upto hotadd_percent
195 It would be better to spread the limit out
196 over multiple hotplug areas, but that is too complicated
198 if (allocated >= allowed)
200 pages = (allowed - allocated + mem) / sizeof(struct page);
201 mem = pages * sizeof(struct page);
202 nd->end = nd->start + pages*PAGE_SIZE;
204 /* Not completely fool proof, but a good sanity check */
205 addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
208 if (pages != oldpages)
209 printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
210 pages << PAGE_SHIFT);
211 last_area_end = addr + mem;
217 * It is fine to add this area to the nodes data it will be used later
218 * This code supports one contigious hot add area per node.
220 static int reserve_hotadd(int node, unsigned long start, unsigned long end)
222 unsigned long s_pfn = start >> PAGE_SHIFT;
223 unsigned long e_pfn = end >> PAGE_SHIFT;
225 struct bootnode *nd = &nodes_add[node];
227 /* I had some trouble with strange memory hotadd regions breaking
228 the boot. Be very strict here and reject anything unexpected.
229 If you want working memory hotadd write correct SRATs.
231 The node size check is a basic sanity check to guard against
233 if ((signed long)(end - start) < NODE_MIN_SIZE) {
234 printk(KERN_ERR "SRAT: Hotplug area too small\n");
238 /* This check might be a bit too strict, but I'm keeping it for now. */
239 if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
240 printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
244 if (!hotadd_enough_memory(&nodes_add[node])) {
245 printk(KERN_ERR "SRAT: Hotplug area too large\n");
252 if (nd->start == nd->end) {
257 if (nd->start == end) {
261 if (nd->end == start) {
266 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
269 if ((nd->end >> PAGE_SHIFT) > end_pfn)
270 end_pfn = nd->end >> PAGE_SHIFT;
273 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
278 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
280 acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
282 struct bootnode *nd, oldnode;
283 unsigned long start, end;
289 if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) {
293 if (ma->flags.enabled == 0)
295 if (ma->flags.hot_pluggable && hotadd_percent == 0)
297 start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
298 end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
299 pxm = ma->proximity_domain;
300 node = setup_node(pxm);
302 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
306 i = conflicting_nodes(start, end);
309 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
310 pxm, start, end, nodes[i].start, nodes[i].end);
313 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
314 pxm, start, end, node_to_pxm(i),
315 nodes[i].start, nodes[i].end);
321 if (!node_test_and_set(node, nodes_parsed)) {
325 if (start < nd->start)
331 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
334 #ifdef RESERVE_HOTADD
335 if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
336 /* Ignore hotadd region. Undo damage */
337 printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
339 if ((nd->start | nd->end) == 0)
340 node_clear(node, nodes_parsed);
345 /* Sanity check to catch more bad SRATs (they are amazingly common).
346 Make sure the PXMs cover all memory. */
347 static int nodes_cover_memory(void)
350 unsigned long pxmram, e820ram;
353 for_each_node_mask(i, nodes_parsed) {
354 unsigned long s = nodes[i].start >> PAGE_SHIFT;
355 unsigned long e = nodes[i].end >> PAGE_SHIFT;
357 pxmram -= e820_hole_size(s, e);
358 pxmram -= nodes_add[i].end - nodes_add[i].start;
359 if ((long)pxmram < 0)
363 e820ram = end_pfn - e820_hole_size(0, end_pfn);
364 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
365 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
367 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
368 (pxmram << PAGE_SHIFT) >> 20,
369 (e820ram << PAGE_SHIFT) >> 20);
375 static void unparse_node(int node)
378 node_clear(node, nodes_parsed);
379 for (i = 0; i < MAX_LOCAL_APIC; i++) {
380 if (apicid_to_node[i] == node)
381 apicid_to_node[i] = NUMA_NO_NODE;
/* Arch hook called after SRAT/SLIT parsing; nothing to fix up on x86-64. */
void __init acpi_numa_arch_fixup(void) {}
387 /* Use the information discovered above to actually set up the nodes. */
388 int __init acpi_scan_nodes(unsigned long start, unsigned long end)
392 /* First clean up the node list */
393 for (i = 0; i < MAX_NUMNODES; i++) {
394 cutoff_node(i, start, end);
395 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
402 if (!nodes_cover_memory()) {
407 memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
408 if (memnode_shift < 0) {
410 "SRAT: No NUMA node hash function found. Contact maintainer\n");
415 /* Finally register nodes */
416 for_each_node_mask(i, nodes_parsed)
417 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
418 /* Try again in case setup_node_bootmem missed one due
419 to missing bootmem */
420 for_each_node_mask(i, nodes_parsed)
422 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
424 for (i = 0; i < NR_CPUS; i++) {
425 if (cpu_to_node[i] == NUMA_NO_NODE)
427 if (!node_isset(cpu_to_node[i], nodes_parsed))
428 numa_set_node(i, NUMA_NO_NODE);
434 static int node_to_pxm(int n)
437 if (pxm2node[n] == n)
439 for (i = 0; i < 256; i++)
440 if (pxm2node[i] == n)
445 void __init srat_reserve_add_area(int nodeid)
447 if (found_add_area && nodes_add[nodeid].end) {
450 printk(KERN_INFO "SRAT: Reserving hot-add memory space "
451 "for node %d at %Lx-%Lx\n",
452 nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
453 total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
455 total_mb *= sizeof(struct page);
457 printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
458 "pre-allocated memory.\n", (unsigned long long)total_mb);
459 reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
460 nodes_add[nodeid].end - nodes_add[nodeid].start);
464 int __node_distance(int a, int b)
469 return a == b ? 10 : 20;
470 index = acpi_slit->localities * node_to_pxm(a);
471 return acpi_slit->entry[index + node_to_pxm(b)];
474 EXPORT_SYMBOL(__node_distance);