git.oblomov.eu Git - linux-2.6/blob - arch/x86/mm/numa_64.c

   1 /*
   2  * Generic VM initialization for x86-64 NUMA setups.
   3  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
   4  */
   5 #include <linux/kernel.h>
   6 #include <linux/mm.h>
   7 #include <linux/string.h>
   8 #include <linux/init.h>
   9 #include <linux/bootmem.h>
  10 #include <linux/mmzone.h>
  11 #include <linux/ctype.h>
  12 #include <linux/module.h>
  13 #include <linux/nodemask.h>
  14 #include <linux/sched.h>
  15
  16 #include <asm/e820.h>
  17 #include <asm/proto.h>
  18 #include <asm/dma.h>
  19 #include <asm/numa.h>
  20 #include <asm/acpi.h>
  21 #include <asm/k8.h>
  22
  23 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
  24 EXPORT_SYMBOL(node_data);
  25
  26 struct memnode memnode;
  27
  28 s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
  29         [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
  30 };
  31
  32 int numa_off __initdata;
  33 static unsigned long __initdata nodemap_addr;
  34 static unsigned long __initdata nodemap_size;
  35
  36 DEFINE_PER_CPU(int, node_number) = 0;
  37 EXPORT_PER_CPU_SYMBOL(node_number);
  38
  39 /*
  40  * Map cpu index to node index
  41  */
  42 DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
  43 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
  44
  45 /*
  46  * Given a shift value, try to populate memnodemap[]
  47  * Returns :
  48  * 1 if OK
  49  * 0 if memnodmap[] too small (of shift too small)
  50  * -1 if node overlap or lost ram (shift too big)
  51  */
  52 static int __init populate_memnodemap(const struct bootnode *nodes,
  53                                       int numnodes, int shift, int *nodeids)
  54 {
  55         unsigned long addr, end;
  56         int i, res = -1;
  57
  58         memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
  59         for (i = 0; i < numnodes; i++) {
  60                 addr = nodes[i].start;
  61                 end = nodes[i].end;
  62                 if (addr >= end)
  63                         continue;
  64                 if ((end >> shift) >= memnodemapsize)
  65                         return 0;
  66                 do {
  67                         if (memnodemap[addr >> shift] != NUMA_NO_NODE)
  68                                 return -1;
  69
  70                         if (!nodeids)
  71                                 memnodemap[addr >> shift] = i;
  72                         else
  73                                 memnodemap[addr >> shift] = nodeids[i];
  74
  75                         addr += (1UL << shift);
  76                 } while (addr < end);
  77                 res = 1;
  78         }
  79         return res;
  80 }
  81
  82 static int __init allocate_cachealigned_memnodemap(void)
  83 {
  84         unsigned long addr;
  85
  86         memnodemap = memnode.embedded_map;
  87         if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
  88                 return 0;
  89
  90         addr = 0x8000;
  91         nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
  92         nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
  93                                       nodemap_size, L1_CACHE_BYTES);
  94         if (nodemap_addr == -1UL) {
  95                 printk(KERN_ERR
  96                        "NUMA: Unable to allocate Memory to Node hash map\n");
  97                 nodemap_addr = nodemap_size = 0;
  98                 return -1;
  99         }
 100         memnodemap = phys_to_virt(nodemap_addr);
 101         reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
 102
 103         printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
 104                nodemap_addr, nodemap_addr + nodemap_size);
 105         return 0;
 106 }
 107
 108 /*
 109  * The LSB of all start and end addresses in the node map is the value of the
 110  * maximum possible shift.
 111  */
 112 static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
 113                                          int numnodes)
 114 {
 115         int i, nodes_used = 0;
 116         unsigned long start, end;
 117         unsigned long bitfield = 0, memtop = 0;
 118
 119         for (i = 0; i < numnodes; i++) {
 120                 start = nodes[i].start;
 121                 end = nodes[i].end;
 122                 if (start >= end)
 123                         continue;
 124                 bitfield |= start;
 125                 nodes_used++;
 126                 if (end > memtop)
 127                         memtop = end;
 128         }
 129         if (nodes_used <= 1)
 130                 i = 63;
 131         else
 132                 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
 133         memnodemapsize = (memtop >> i)+1;
 134         return i;
 135 }
 136
 137 int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
 138                               int *nodeids)
 139 {
 140         int shift;
 141
 142         shift = extract_lsb_from_nodes(nodes, numnodes);
 143         if (allocate_cachealigned_memnodemap())
 144                 return -1;
 145         printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
 146                 shift);
 147
 148         if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
 149                 printk(KERN_INFO "Your memory is not aligned you need to "
 150                        "rebuild your kernel with a bigger NODEMAPSIZE "
 151                        "shift=%d\n", shift);
 152                 return -1;
 153         }
 154         return shift;
 155 }
 156
 157 int __meminit  __early_pfn_to_nid(unsigned long pfn)
 158 {
 159         return phys_to_nid(pfn << PAGE_SHIFT);
 160 }
 161
 162 static void * __init early_node_mem(int nodeid, unsigned long start,
 163                                     unsigned long end, unsigned long size,
 164                                     unsigned long align)
 165 {
 166         unsigned long mem = find_e820_area(start, end, size, align);
 167         void *ptr;
 168
 169         if (mem != -1L)
 170                 return __va(mem);
 171
 172         ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
 173         if (ptr == NULL) {
 174                 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
 175                        size, nodeid);
 176                 return NULL;
 177         }
 178         return ptr;
 179 }
 180
 181 /* Initialize bootmem allocator for a node */
 182 void __init setup_node_bootmem(int nodeid, unsigned long start,
 183                                unsigned long end)
 184 {
 185         unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
 186         unsigned long bootmap_start, nodedata_phys;
 187         void *bootmap;
 188         const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
 189         int nid;
 190
 191         start = roundup(start, ZONE_ALIGN);
 192
 193         printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
 194                start, end);
 195
 196         start_pfn = start >> PAGE_SHIFT;
 197         last_pfn = end >> PAGE_SHIFT;
 198
 199         node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
 200                                            SMP_CACHE_BYTES);
 201         if (node_data[nodeid] == NULL)
 202                 return;
 203         nodedata_phys = __pa(node_data[nodeid]);
 204         printk(KERN_INFO "  NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
 205                 nodedata_phys + pgdat_size - 1);
 206
 207         memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
 208         NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
 209         NODE_DATA(nodeid)->node_start_pfn = start_pfn;
 210         NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
 211
 212         /*
 213          * Find a place for the bootmem map
 214          * nodedata_phys could be on other nodes by alloc_bootmem,
 215          * so need to sure bootmap_start not to be small, otherwise
 216          * early_node_mem will get that with find_e820_area instead
 217          * of alloc_bootmem, that could clash with reserved range
 218          */
 219         bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
 220         nid = phys_to_nid(nodedata_phys);
 221         if (nid == nodeid)
 222                 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
 223         else
 224                 bootmap_start = roundup(start, PAGE_SIZE);
 225         /*
 226          * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
 227          * to use that to align to PAGE_SIZE
 228          */
 229         bootmap = early_node_mem(nodeid, bootmap_start, end,
 230                                  bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
 231         if (bootmap == NULL)  {
 232                 if (nodedata_phys < start || nodedata_phys >= end)
 233                         free_bootmem(nodedata_phys, pgdat_size);
 234                 node_data[nodeid] = NULL;
 235                 return;
 236         }
 237         bootmap_start = __pa(bootmap);
 238
 239         bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
 240                                          bootmap_start >> PAGE_SHIFT,
 241                                          start_pfn, last_pfn);
 242
 243         printk(KERN_INFO "  bootmap [%016lx -  %016lx] pages %lx\n",
 244                  bootmap_start, bootmap_start + bootmap_size - 1,
 245                  bootmap_pages);
 246
 247         free_bootmem_with_active_regions(nodeid, end);
 248
 249         /*
 250          * convert early reserve to bootmem reserve earlier
 251          * otherwise early_node_mem could use early reserved mem
 252          * on previous node
 253          */
 254         early_res_to_bootmem(start, end);
 255
 256         /*
 257          * in some case early_node_mem could use alloc_bootmem
 258          * to get range on other node, don't reserve that again
 259          */
 260         if (nid != nodeid)
 261                 printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nodeid, nid);
 262         else
 263                 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
 264                                         pgdat_size, BOOTMEM_DEFAULT);
 265         nid = phys_to_nid(bootmap_start);
 266         if (nid != nodeid)
 267                 printk(KERN_INFO "    bootmap(%d) on node %d\n", nodeid, nid);
 268         else
 269                 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
 270                                  bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
 271
 272 #ifdef CONFIG_ACPI_NUMA
 273         srat_reserve_add_area(nodeid);
 274 #endif
 275         node_set_online(nodeid);
 276 }
 277
 278 /*
 279  * There are unfortunately some poorly designed mainboards around that
 280  * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 281  * mapping. To avoid this fill in the mapping for all possible CPUs,
 282  * as the number of CPUs is not known yet. We round robin the existing
 283  * nodes.
 284  */
 285 void __init numa_init_array(void)
 286 {
 287         int rr, i;
 288
 289         rr = first_node(node_online_map);
 290         for (i = 0; i < nr_cpu_ids; i++) {
 291                 if (early_cpu_to_node(i) != NUMA_NO_NODE)
 292                         continue;
 293                 numa_set_node(i, rr);
 294                 rr = next_node(rr, node_online_map);
 295                 if (rr == MAX_NUMNODES)
 296                         rr = first_node(node_online_map);
 297         }
 298 }
 299
 300 #ifdef CONFIG_NUMA_EMU
 301 /* Numa emulation */
 302 static char *cmdline __initdata;
 303
 304 /*
 305  * Setups up nid to range from addr to addr + size.  If the end
 306  * boundary is greater than max_addr, then max_addr is used instead.
 307  * The return value is 0 if there is additional memory left for
 308  * allocation past addr and -1 otherwise.  addr is adjusted to be at
 309  * the end of the node.
 310  */
 311 static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
 312                                    u64 size, u64 max_addr)
 313 {
 314         int ret = 0;
 315
 316         nodes[nid].start = *addr;
 317         *addr += size;
 318         if (*addr >= max_addr) {
 319                 *addr = max_addr;
 320                 ret = -1;
 321         }
 322         nodes[nid].end = *addr;
 323         node_set(nid, node_possible_map);
 324         printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
 325                nodes[nid].start, nodes[nid].end,
 326                (nodes[nid].end - nodes[nid].start) >> 20);
 327         return ret;
 328 }
 329
 330 /*
 331  * Splits num_nodes nodes up equally starting at node_start.  The return value
 332  * is the number of nodes split up and addr is adjusted to be at the end of the
 333  * last node allocated.
 334  */
 335 static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 336                                       u64 max_addr, int node_start,
 337                                       int num_nodes)
 338 {
 339         unsigned int big;
 340         u64 size;
 341         int i;
 342
 343         if (num_nodes <= 0)
 344                 return -1;
 345         if (num_nodes > MAX_NUMNODES)
 346                 num_nodes = MAX_NUMNODES;
 347         size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
 348                num_nodes;
 349         /*
 350          * Calculate the number of big nodes that can be allocated as a result
 351          * of consolidating the leftovers.
 352          */
 353         big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
 354               FAKE_NODE_MIN_SIZE;
 355
 356         /* Round down to nearest FAKE_NODE_MIN_SIZE. */
 357         size &= FAKE_NODE_MIN_HASH_MASK;
 358         if (!size) {
 359                 printk(KERN_ERR "Not enough memory for each node.  "
 360                        "NUMA emulation disabled.\n");
 361                 return -1;
 362         }
 363
 364         for (i = node_start; i < num_nodes + node_start; i++) {
 365                 u64 end = *addr + size;
 366
 367                 if (i < big)
 368                         end += FAKE_NODE_MIN_SIZE;
 369                 /*
 370                  * The final node can have the remaining system RAM.  Other
 371                  * nodes receive roughly the same amount of available pages.
 372                  */
 373                 if (i == num_nodes + node_start - 1)
 374                         end = max_addr;
 375                 else
 376                         while (end - *addr - e820_hole_size(*addr, end) <
 377                                size) {
 378                                 end += FAKE_NODE_MIN_SIZE;
 379                                 if (end > max_addr) {
 380                                         end = max_addr;
 381                                         break;
 382                                 }
 383                         }
 384                 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
 385                         break;
 386         }
 387         return i - node_start + 1;
 388 }
 389
 390 /*
 391  * Splits the remaining system RAM into chunks of size.  The remaining memory is
 392  * always assigned to a final node and can be asymmetric.  Returns the number of
 393  * nodes split.
 394  */
 395 static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
 396                                       u64 max_addr, int node_start, u64 size)
 397 {
 398         int i = node_start;
 399         size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
 400         while (!setup_node_range(i++, nodes, addr, size, max_addr))
 401                 ;
 402         return i - node_start;
 403 }
 404
 405 /*
 406  * Sets up the system RAM area from start_pfn to last_pfn according to the
 407  * numa=fake command-line option.
 408  */
 409 static struct bootnode nodes[MAX_NUMNODES] __initdata;
 410
 411 static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
 412 {
 413         u64 size, addr = start_pfn << PAGE_SHIFT;
 414         u64 max_addr = last_pfn << PAGE_SHIFT;
 415         int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
 416
 417         memset(&nodes, 0, sizeof(nodes));
 418         /*
 419          * If the numa=fake command-line is just a single number N, split the
 420          * system RAM into N fake nodes.
 421          */
 422         if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
 423                 long n = simple_strtol(cmdline, NULL, 0);
 424
 425                 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
 426                 if (num_nodes < 0)
 427                         return num_nodes;
 428                 goto out;
 429         }
 430
 431         /* Parse the command line. */
 432         for (coeff_flag = 0; ; cmdline++) {
 433                 if (*cmdline && isdigit(*cmdline)) {
 434                         num = num * 10 + *cmdline - '0';
 435                         continue;
 436                 }
 437                 if (*cmdline == '*') {
 438                         if (num > 0)
 439                                 coeff = num;
 440                         coeff_flag = 1;
 441                 }
 442                 if (!*cmdline || *cmdline == ',') {
 443                         if (!coeff_flag)
 444                                 coeff = 1;
 445                         /*
 446                          * Round down to the nearest FAKE_NODE_MIN_SIZE.
 447                          * Command-line coefficients are in megabytes.
 448                          */
 449                         size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
 450                         if (size)
 451                                 for (i = 0; i < coeff; i++, num_nodes++)
 452                                         if (setup_node_range(num_nodes, nodes,
 453                                                 &addr, size, max_addr) < 0)
 454                                                 goto done;
 455                         if (!*cmdline)
 456                                 break;
 457                         coeff_flag = 0;
 458                         coeff = -1;
 459                 }
 460                 num = 0;
 461         }
 462 done:
 463         if (!num_nodes)
 464                 return -1;
 465         /* Fill remainder of system RAM, if appropriate. */
 466         if (addr < max_addr) {
 467                 if (coeff_flag && coeff < 0) {
 468                         /* Split remaining nodes into num-sized chunks */
 469                         num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
 470                                                          num_nodes, num);
 471                         goto out;
 472                 }
 473                 switch (*(cmdline - 1)) {
 474                 case '*':
 475                         /* Split remaining nodes into coeff chunks */
 476                         if (coeff <= 0)
 477                                 break;
 478                         num_nodes += split_nodes_equally(nodes, &addr, max_addr,
 479                                                          num_nodes, coeff);
 480                         break;
 481                 case ',':
 482                         /* Do not allocate remaining system RAM */
 483                         break;
 484                 default:
 485                         /* Give one final node */
 486                         setup_node_range(num_nodes, nodes, &addr,
 487                                          max_addr - addr, max_addr);
 488                         num_nodes++;
 489                 }
 490         }
 491 out:
 492         memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
 493         if (memnode_shift < 0) {
 494                 memnode_shift = 0;
 495                 printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
 496                        "disabled.\n");
 497                 return -1;
 498         }
 499
 500         /*
 501          * We need to vacate all active ranges that may have been registered by
 502          * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
 503          * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
 504          */
 505         remove_all_active_ranges();
 506 #ifdef CONFIG_ACPI_NUMA
 507         acpi_numa = -1;
 508 #endif
 509         for_each_node_mask(i, node_possible_map) {
 510                 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 511                                                 nodes[i].end >> PAGE_SHIFT);
 512                 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 513         }
 514         acpi_fake_nodes(nodes, num_nodes);
 515         numa_init_array();
 516         return 0;
 517 }
 518 #endif /* CONFIG_NUMA_EMU */
 519
 520 void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
 521 {
 522         int i;
 523
 524         nodes_clear(node_possible_map);
 525         nodes_clear(node_online_map);
 526
 527 #ifdef CONFIG_NUMA_EMU
 528         if (cmdline && !numa_emulation(start_pfn, last_pfn))
 529                 return;
 530         nodes_clear(node_possible_map);
 531         nodes_clear(node_online_map);
 532 #endif
 533
 534 #ifdef CONFIG_ACPI_NUMA
 535         if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
 536                                           last_pfn << PAGE_SHIFT))
 537                 return;
 538         nodes_clear(node_possible_map);
 539         nodes_clear(node_online_map);
 540 #endif
 541
 542 #ifdef CONFIG_K8_NUMA
 543         if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
 544                                         last_pfn<<PAGE_SHIFT))
 545                 return;
 546         nodes_clear(node_possible_map);
 547         nodes_clear(node_online_map);
 548 #endif
 549         printk(KERN_INFO "%s\n",
 550                numa_off ? "NUMA turned off" : "No NUMA configuration found");
 551
 552         printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
 553                start_pfn << PAGE_SHIFT,
 554                last_pfn << PAGE_SHIFT);
 555         /* setup dummy node covering all memory */
 556         memnode_shift = 63;
 557         memnodemap = memnode.embedded_map;
 558         memnodemap[0] = 0;
 559         node_set_online(0);
 560         node_set(0, node_possible_map);
 561         for (i = 0; i < nr_cpu_ids; i++)
 562                 numa_set_node(i, 0);
 563         e820_register_active_regions(0, start_pfn, last_pfn);
 564         setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
 565 }
 566
 567 unsigned long __init numa_free_all_bootmem(void)
 568 {
 569         unsigned long pages = 0;
 570         int i;
 571
 572         for_each_online_node(i)
 573                 pages += free_all_bootmem_node(NODE_DATA(i));
 574
 575         return pages;
 576 }
 577
 578 void __init paging_init(void)
 579 {
 580         unsigned long max_zone_pfns[MAX_NR_ZONES];
 581
 582         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 583         max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 584         max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 585         max_zone_pfns[ZONE_NORMAL] = max_pfn;
 586
 587         sparse_memory_present_with_active_regions(MAX_NUMNODES);
 588         sparse_init();
 589
 590         free_area_init_nodes(max_zone_pfns);
 591 }
 592
 593 static __init int numa_setup(char *opt)
 594 {
 595         if (!opt)
 596                 return -EINVAL;
 597         if (!strncmp(opt, "off", 3))
 598                 numa_off = 1;
 599 #ifdef CONFIG_NUMA_EMU
 600         if (!strncmp(opt, "fake=", 5))
 601                 cmdline = opt + 5;
 602 #endif
 603 #ifdef CONFIG_ACPI_NUMA
 604         if (!strncmp(opt, "noacpi", 6))
 605                 acpi_numa = -1;
 606         if (!strncmp(opt, "hotadd=", 7))
 607                 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
 608 #endif
 609         return 0;
 610 }
 611 early_param("numa", numa_setup);
 612
 613 #ifdef CONFIG_NUMA
 614 /*
 615  * Setup early cpu_to_node.
 616  *
 617  * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
 618  * and apicid_to_node[] tables have valid entries for a CPU.
 619  * This means we skip cpu_to_node[] initialisation for NUMA
 620  * emulation and faking node case (when running a kernel compiled
 621  * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
 622  * is already initialized in a round robin manner at numa_init_array,
 623  * prior to this call, and this initialization is good enough
 624  * for the fake NUMA cases.
 625  *
 626  * Called before the per_cpu areas are setup.
 627  */
 628 void __init init_cpu_to_node(void)
 629 {
 630         int cpu;
 631         u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
 632
 633         BUG_ON(cpu_to_apicid == NULL);
 634
 635         for_each_possible_cpu(cpu) {
 636                 int node;
 637                 u16 apicid = cpu_to_apicid[cpu];
 638
 639                 if (apicid == BAD_APICID)
 640                         continue;
 641                 node = apicid_to_node[apicid];
 642                 if (node == NUMA_NO_NODE)
 643                         continue;
 644                 if (!node_online(node))
 645                         continue;
 646                 numa_set_node(cpu, node);
 647         }
 648 }
 649 #endif
 650
 651
 652 void __cpuinit numa_set_node(int cpu, int node)
 653 {
 654         int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
 655
 656         /* early setting, no percpu area yet */
 657         if (cpu_to_node_map) {
 658                 cpu_to_node_map[cpu] = node;
 659                 return;
 660         }
 661
 662 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 663         if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
 664                 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
 665                 dump_stack();
 666                 return;
 667         }
 668 #endif
 669         per_cpu(x86_cpu_to_node_map, cpu) = node;
 670
 671         if (node != NUMA_NO_NODE)
 672                 per_cpu(node_number, cpu) = node;
 673 }
 674
 675 void __cpuinit numa_clear_node(int cpu)
 676 {
 677         numa_set_node(cpu, NUMA_NO_NODE);
 678 }
 679
 680 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
 681
 682 void __cpuinit numa_add_cpu(int cpu)
 683 {
 684         cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
 685 }
 686
 687 void __cpuinit numa_remove_cpu(int cpu)
 688 {
 689         cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
 690 }
 691
 692 #else /* CONFIG_DEBUG_PER_CPU_MAPS */
 693
 694 /*
 695  * --------- debug versions of the numa functions ---------
 696  */
 697 static void __cpuinit numa_set_cpumask(int cpu, int enable)
 698 {
 699         int node = early_cpu_to_node(cpu);
 700         struct cpumask *mask;
 701         char buf[64];
 702
 703         mask = node_to_cpumask_map[node];
 704         if (mask == NULL) {
 705                 printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
 706                 dump_stack();
 707                 return;
 708         }
 709
 710         if (enable)
 711                 cpumask_set_cpu(cpu, mask);
 712         else
 713                 cpumask_clear_cpu(cpu, mask);
 714
 715         cpulist_scnprintf(buf, sizeof(buf), mask);
 716         printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
 717                 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
 718 }
 719
 720 void __cpuinit numa_add_cpu(int cpu)
 721 {
 722         numa_set_cpumask(cpu, 1);
 723 }
 724
 725 void __cpuinit numa_remove_cpu(int cpu)
 726 {
 727         numa_set_cpumask(cpu, 0);
 728 }
 729
 730 int cpu_to_node(int cpu)
 731 {
 732         if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
 733                 printk(KERN_WARNING
 734                         "cpu_to_node(%d): usage too early!\n", cpu);
 735                 dump_stack();
 736                 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
 737         }
 738         return per_cpu(x86_cpu_to_node_map, cpu);
 739 }
 740 EXPORT_SYMBOL(cpu_to_node);
 741
 742 /*
 743  * Same function as cpu_to_node() but used if called before the
 744  * per_cpu areas are setup.
 745  */
 746 int early_cpu_to_node(int cpu)
 747 {
 748         if (early_per_cpu_ptr(x86_cpu_to_node_map))
 749                 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
 750
 751         if (!cpu_possible(cpu)) {
 752                 printk(KERN_WARNING
 753                         "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
 754                 dump_stack();
 755                 return NUMA_NO_NODE;
 756         }
 757         return per_cpu(x86_cpu_to_node_map, cpu);
 758 }
 759
 760 /*
 761  * --------- end of debug versions of the numa functions ---------
 762  */
 763
 764 #endif /* CONFIG_DEBUG_PER_CPU_MAPS */