x86: cleanup e820_setup_gap(), v2
linux-2.6: arch/x86/mm/init_32.c
1 /*
2  *
3  *  Copyright (C) 1995  Linus Torvalds
4  *
5  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
6  */
7
8 #include <linux/module.h>
9 #include <linux/signal.h>
10 #include <linux/sched.h>
11 #include <linux/kernel.h>
12 #include <linux/errno.h>
13 #include <linux/string.h>
14 #include <linux/types.h>
15 #include <linux/ptrace.h>
16 #include <linux/mman.h>
17 #include <linux/mm.h>
18 #include <linux/hugetlb.h>
19 #include <linux/swap.h>
20 #include <linux/smp.h>
21 #include <linux/init.h>
22 #include <linux/highmem.h>
23 #include <linux/pagemap.h>
24 #include <linux/pfn.h>
25 #include <linux/poison.h>
26 #include <linux/bootmem.h>
27 #include <linux/slab.h>
28 #include <linux/proc_fs.h>
29 #include <linux/memory_hotplug.h>
30 #include <linux/initrd.h>
31 #include <linux/cpumask.h>
32
33 #include <asm/asm.h>
34 #include <asm/processor.h>
35 #include <asm/system.h>
36 #include <asm/uaccess.h>
37 #include <asm/pgtable.h>
38 #include <asm/dma.h>
39 #include <asm/fixmap.h>
40 #include <asm/e820.h>
41 #include <asm/apic.h>
42 #include <asm/bugs.h>
43 #include <asm/tlb.h>
44 #include <asm/tlbflush.h>
45 #include <asm/pgalloc.h>
46 #include <asm/sections.h>
47 #include <asm/paravirt.h>
48 #include <asm/setup.h>
49 #include <asm/cacheflush.h>
50
51 unsigned int __VMALLOC_RESERVE = 128 << 20;
52
53 unsigned long max_pfn_mapped;
54
55 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
56 unsigned long highstart_pfn, highend_pfn;
57
58 static noinline int do_test_wp_bit(void);
59
60
61 static unsigned long __initdata table_start;
62 static unsigned long __meminitdata table_end;
63 static unsigned long __meminitdata table_top;
64
65 static int __initdata after_init_bootmem;
66
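/*
 * Hand out one zeroed page for early page-table construction, taken from
 * the physically contiguous [table_start, table_top) window found by
 * find_early_table_space().  Used before the bootmem allocator is up;
 * returns the virtual address and reports the physical address in *phys.
 */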
67 static __init void *alloc_low_page(unsigned long *phys)
68 {
69         unsigned long pfn = table_end++;
70         void *adr;
71
72         if (pfn >= table_top)
73                 panic("alloc_low_page: ran out of memory");
74
75         adr = __va(pfn * PAGE_SIZE);
76         memset(adr, 0, PAGE_SIZE);
77         *phys  = pfn * PAGE_SIZE;
78         return adr;
79 }
80
81 /*
82  * Creates a middle page table and puts a pointer to it in the
83  * given global directory entry. In non-PAE builds this simply returns
84  * the pgd entry itself, since the middle (pmd) level is folded.
85  */
86 static pmd_t * __init one_md_table_init(pgd_t *pgd)
87 {
88         pud_t *pud;
89         pmd_t *pmd_table;
90
91 #ifdef CONFIG_X86_PAE
92         unsigned long phys;
93         if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
94                 if (after_init_bootmem)
95                         pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
96                 else
97                         pmd_table = (pmd_t *)alloc_low_page(&phys);
98                 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
99                 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
100                 pud = pud_offset(pgd, 0);
101                 BUG_ON(pmd_table != pmd_offset(pud, 0));
102         }
103 #endif
104         pud = pud_offset(pgd, 0);
105         pmd_table = pmd_offset(pud, 0);
106
107         return pmd_table;
108 }
109
110 /*
111  * Create a page table and place a pointer to it in a middle page
112  * directory entry:
113  */
114 static pte_t * __init one_page_table_init(pmd_t *pmd)
115 {
116         if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
117                 pte_t *page_table = NULL;
118
119                 if (after_init_bootmem) {
120 #ifdef CONFIG_DEBUG_PAGEALLOC
121                         page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
122 #endif
123                         if (!page_table)
124                                 page_table =
125                                 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
126                 } else {
127                         unsigned long phys;
128                         page_table = (pte_t *)alloc_low_page(&phys);
129                 }
130
131                 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
132                 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
133                 BUG_ON(page_table != pte_offset_kernel(pmd, 0));
134         }
135
136         return pte_offset_kernel(pmd, 0);
137 }
138
139 /*
140  * This function initializes a range of kernel virtual memory with new
141  * bootmem page tables, creating page tables wherever they are missing
142  * in the given range.
143  *
144  * NOTE: The page tables are allocated contiguously in physical memory,
145  * so we can cache the location of the first one and move around
146  * without checking the pgd every time.
147  */
148 static void __init
149 page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
150 {
151         int pgd_idx, pmd_idx;
152         unsigned long vaddr;
153         pgd_t *pgd;
154         pmd_t *pmd;
155
156         vaddr = start;
157         pgd_idx = pgd_index(vaddr);
158         pmd_idx = pmd_index(vaddr);
159         pgd = pgd_base + pgd_idx;
160
161         for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
162                 pmd = one_md_table_init(pgd);
163                 pmd = pmd + pmd_index(vaddr);
164                 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
165                                                         pmd++, pmd_idx++) {
166                         one_page_table_init(pmd);
167
168                         vaddr += PMD_SIZE;
169                 }
170                 pmd_idx = 0;
171         }
172 }
173
174 static inline int is_kernel_text(unsigned long addr)
175 {
176         if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
177                 return 1;
178         return 0;
179 }
180
181 /*
182  * This maps the physical memory to kernel virtual address space, a total
183  * of max_low_pfn pages, by creating page tables starting from address
184  * PAGE_OFFSET:
185  */
186 static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
187                                                 unsigned long start_pfn,
188                                                 unsigned long end_pfn,
189                                                 int use_pse)
190 {
191         int pgd_idx, pmd_idx, pte_ofs;
192         unsigned long pfn;
193         pgd_t *pgd;
194         pmd_t *pmd;
195         pte_t *pte;
196         unsigned pages_2m = 0, pages_4k = 0;
197
198         if (!cpu_has_pse)
199                 use_pse = 0;
200
201         pfn = start_pfn;
202         pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
203         pgd = pgd_base + pgd_idx;
204         for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
205                 pmd = one_md_table_init(pgd);
206
207                 if (pfn >= end_pfn)
208                         continue;
209 #ifdef CONFIG_X86_PAE
210                 pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
211                 pmd += pmd_idx;
212 #else
213                 pmd_idx = 0;
214 #endif
215                 for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
216                      pmd++, pmd_idx++) {
217                         unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
218
219                         /*
220                          * Map with big pages if possible, otherwise
221                          * create normal page tables:
222                          */
223                         if (use_pse) {
224                                 unsigned int addr2;
225                                 pgprot_t prot = PAGE_KERNEL_LARGE;
226
227                                 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
228                                         PAGE_OFFSET + PAGE_SIZE-1;
229
230                                 if (is_kernel_text(addr) ||
231                                     is_kernel_text(addr2))
232                                         prot = PAGE_KERNEL_LARGE_EXEC;
233
234                                 pages_2m++;
235                                 set_pmd(pmd, pfn_pmd(pfn, prot));
236
237                                 pfn += PTRS_PER_PTE;
238                                 continue;
239                         }
240                         pte = one_page_table_init(pmd);
241
242                         pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
243                         pte += pte_ofs;
244                         for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
245                              pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
246                                 pgprot_t prot = PAGE_KERNEL;
247
248                                 if (is_kernel_text(addr))
249                                         prot = PAGE_KERNEL_EXEC;
250
251                                 pages_4k++;
252                                 set_pte(pte, pfn_pte(pfn, prot));
253                         }
254                 }
255         }
256         update_page_count(PG_LEVEL_2M, pages_2m);
257         update_page_count(PG_LEVEL_4K, pages_4k);
258 }
259
260 /*
261  * devmem_is_allowed() checks to see if /dev/mem access to a certain address
262  * is valid. The argument is a physical page number.
263  *
264  *
265  * On x86, access has to be given to the first megabyte of RAM because that
266  * area contains BIOS code and data regions used by X and dosemu and similar
267  * apps. Access has to be given to non-kernel-RAM areas as well; these contain
268  * the PCI MMIO resources as well as potential BIOS/ACPI data regions.
269  */
270 int devmem_is_allowed(unsigned long pagenr)
271 {
272         if (pagenr <= 256)
273                 return 1;
274         if (!page_is_ram(pagenr))
275                 return 1;
276         return 0;
277 }
278
279 #ifdef CONFIG_HIGHMEM
280 pte_t *kmap_pte;
281 pgprot_t kmap_prot;
282
283 static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
284 {
285         return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
286                         vaddr), vaddr), vaddr);
287 }
288
289 static void __init kmap_init(void)
290 {
291         unsigned long kmap_vstart;
292
293         /*
294          * Cache the first kmap pte:
295          */
296         kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
297         kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
298
299         kmap_prot = PAGE_KERNEL;
300 }
301
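/*
 * Set up the page tables backing the persistent kmap (PKMAP) area and
 * cache a pointer to its pte page in pkmap_page_table.
 */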
302 static void __init permanent_kmaps_init(pgd_t *pgd_base)
303 {
304         unsigned long vaddr;
305         pgd_t *pgd;
306         pud_t *pud;
307         pmd_t *pmd;
308         pte_t *pte;
309
310         vaddr = PKMAP_BASE;
311         page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
312
313         pgd = swapper_pg_dir + pgd_index(vaddr);
314         pud = pud_offset(pgd, vaddr);
315         pmd = pmd_offset(pud, vaddr);
316         pte = pte_offset_kernel(pmd, vaddr);
317         pkmap_page_table = pte;
318 }
319
320 static void __init add_one_highpage_init(struct page *page, int pfn)
321 {
322         ClearPageReserved(page);
323         init_page_count(page);
324         __free_page(page);
325         totalhigh_pages++;
326 }
327
328 struct add_highpages_data {
329         unsigned long start_pfn;
330         unsigned long end_pfn;
331 };
332
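/*
 * work_with_active_regions() callback: hand every valid highmem page in
 * the intersection of the active region and the requested pfn range
 * (passed in via struct add_highpages_data) to the page allocator.
 */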
333 static int __init add_highpages_work_fn(unsigned long start_pfn,
334                                          unsigned long end_pfn, void *datax)
335 {
336         int node_pfn;
337         struct page *page;
338         unsigned long final_start_pfn, final_end_pfn;
339         struct add_highpages_data *data;
340
341         data = (struct add_highpages_data *)datax;
342
343         final_start_pfn = max(start_pfn, data->start_pfn);
344         final_end_pfn = min(end_pfn, data->end_pfn);
345         if (final_start_pfn >= final_end_pfn)
346                 return 0;
347
348         for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
349              node_pfn++) {
350                 if (!pfn_valid(node_pfn))
351                         continue;
352                 page = pfn_to_page(node_pfn);
353                 add_one_highpage_init(page, node_pfn);
354         }
355
356         return 0;
357
358 }
359
360 void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
361                                               unsigned long end_pfn)
362 {
363         struct add_highpages_data data;
364
365         data.start_pfn = start_pfn;
366         data.end_pfn = end_pfn;
367
368         work_with_active_regions(nid, add_highpages_work_fn, &data);
369 }
370
371 #ifndef CONFIG_NUMA
372 static void __init set_highmem_pages_init(void)
373 {
374         add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
375
376         totalram_pages += totalhigh_pages;
377 }
378 #endif /* !CONFIG_NUMA */
379
380 #else
381 # define kmap_init()                            do { } while (0)
382 # define permanent_kmaps_init(pgd_base)         do { } while (0)
383 # define set_highmem_pages_init()       do { } while (0)
384 #endif /* CONFIG_HIGHMEM */
385
386 pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
387 EXPORT_SYMBOL(__PAGE_KERNEL);
388
389 pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
390
391 void __init native_pagetable_setup_start(pgd_t *base)
392 {
393         unsigned long pfn, va;
394         pgd_t *pgd;
395         pud_t *pud;
396         pmd_t *pmd;
397         pte_t *pte;
398
399         /*
400          * Remove any mappings which extend past the end of physical
401          * memory from the boot time page table:
402          */
403         for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
404                 va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
405                 pgd = base + pgd_index(va);
406                 if (!pgd_present(*pgd))
407                         break;
408
409                 pud = pud_offset(pgd, va);
410                 pmd = pmd_offset(pud, va);
411                 if (!pmd_present(*pmd))
412                         break;
413
414                 pte = pte_offset_kernel(pmd, va);
415                 if (!pte_present(*pte))
416                         break;
417
418                 pte_clear(NULL, va, pte);
419         }
420         paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
421 }
422
423 void __init native_pagetable_setup_done(pgd_t *base)
424 {
425 }
426
427 /*
428  * Build a proper pagetable for the kernel mappings.  Up until this
429  * point, we've been running on some set of pagetables constructed by
430  * the boot process.
431  *
432  * If we're booting on native hardware, this will be a pagetable
433  * constructed in arch/x86/kernel/head_32.S.  The root of the
434  * pagetable will be swapper_pg_dir.
435  *
436  * If we're booting paravirtualized under a hypervisor, then there are
437  * more options: we may already be running PAE, and the pagetable may
438  * or may not be based in swapper_pg_dir.  In any case,
439  * paravirt_pagetable_setup_start() will set up swapper_pg_dir
440  * appropriately for the rest of the initialization to work.
441  *
442  * In general, pagetable_init() assumes that the pagetable may already
443  * be partially populated, and so it avoids stomping on any existing
444  * mappings.
445  */
446 static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
447 {
448         unsigned long vaddr, end;
449
450         /*
451          * Fixed mappings, only the page table structure has to be
452          * created - mappings will be set by set_fixmap():
453          */
454         early_ioremap_clear();
455         vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
456         end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
457         page_table_range_init(vaddr, end, pgd_base);
458         early_ioremap_reset();
459 }
460
461 static void __init pagetable_init(void)
462 {
463         pgd_t *pgd_base = swapper_pg_dir;
464
465         paravirt_pagetable_setup_start(pgd_base);
466
467         permanent_kmaps_init(pgd_base);
468
469         paravirt_pagetable_setup_done(pgd_base);
470 }
471
472 #ifdef CONFIG_ACPI_SLEEP
473 /*
474  * ACPI suspend needs this for resume, because things like the intel-agp
475  * driver might have split up a kernel 4MB mapping.
476  */
477 char swsusp_pg_dir[PAGE_SIZE]
478         __attribute__ ((aligned(PAGE_SIZE)));
479
480 static inline void save_pg_dir(void)
481 {
482         memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
483 }
484 #else /* !CONFIG_ACPI_SLEEP */
485 static inline void save_pg_dir(void)
486 {
487 }
488 #endif /* !CONFIG_ACPI_SLEEP */
489
490 void zap_low_mappings(void)
491 {
492         int i;
493
494         /*
495          * Zap initial low-memory mappings.
496          *
497          * Note that "pgd_clear()" doesn't do it for
498          * us, because pgd_clear() is a no-op on i386.
499          */
500         for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
501 #ifdef CONFIG_X86_PAE
502                 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
503 #else
504                 set_pgd(swapper_pg_dir+i, __pgd(0));
505 #endif
506         }
507         flush_tlb_all();
508 }
509
510 int nx_enabled;
511
512 pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
513 EXPORT_SYMBOL_GPL(__supported_pte_mask);
514
515 #ifdef CONFIG_X86_PAE
516
517 static int disable_nx __initdata;
518
519 /*
520  * noexec = on|off
521  *
522  * Control non executable mappings.
523  *
524  * on      Enable
525  * off     Disable
526  */
527 static int __init noexec_setup(char *str)
528 {
529         if (!str || !strcmp(str, "on")) {
530                 if (cpu_has_nx) {
531                         __supported_pte_mask |= _PAGE_NX;
532                         disable_nx = 0;
533                 }
534         } else {
535                 if (!strcmp(str, "off")) {
536                         disable_nx = 1;
537                         __supported_pte_mask &= ~_PAGE_NX;
538                 } else {
539                         return -EINVAL;
540                 }
541         }
542
543         return 0;
544 }
545 early_param("noexec", noexec_setup);
546
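/*
 * Enable the NX (no-execute) bit in EFER if the CPU advertises it via
 * CPUID leaf 0x80000001 (EDX bit 20) and "noexec=off" was not given.
 */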
547 static void __init set_nx(void)
548 {
549         unsigned int v[4], l, h;
550
551         if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
552                 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
553
554                 if ((v[3] & (1 << 20)) && !disable_nx) {
555                         rdmsr(MSR_EFER, l, h);
556                         l |= EFER_NX;
557                         wrmsr(MSR_EFER, l, h);
558                         nx_enabled = 1;
559                         __supported_pte_mask |= _PAGE_NX;
560                 }
561         }
562 }
563 #endif
564
565 /* user-defined highmem size */
566 static unsigned int highmem_pages = -1;
567
568 /*
569  * highmem=size forces highmem to be exactly 'size' bytes.
570  * This works even on boxes that have no highmem otherwise.
571  * This also works to reduce highmem size on bigger boxes.
572  */
573 static int __init parse_highmem(char *arg)
574 {
575         if (!arg)
576                 return -EINVAL;
577
578         highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
579         return 0;
580 }
581 early_param("highmem", parse_highmem);
582
583 /*
584  * Determine low and high memory ranges:
585  */
586 void __init find_low_pfn_range(void)
587 {
588         /* this function may lower max_pfn, see below */
589
590         /* max_low_pfn is still 0 here; early reservations go through early_res */
591
592         max_low_pfn = max_pfn;
593         if (max_low_pfn > MAXMEM_PFN) {
594                 if (highmem_pages == -1)
595                         highmem_pages = max_pfn - MAXMEM_PFN;
596                 if (highmem_pages + MAXMEM_PFN < max_pfn)
597                         max_pfn = MAXMEM_PFN + highmem_pages;
598                 if (highmem_pages + MAXMEM_PFN > max_pfn) {
599                         printk(KERN_WARNING "only %luMB highmem pages "
600                                 "available, ignoring highmem size of %uMB.\n",
601                                 pages_to_mb(max_pfn - MAXMEM_PFN),
602                                 pages_to_mb(highmem_pages));
603                         highmem_pages = 0;
604                 }
605                 max_low_pfn = MAXMEM_PFN;
606 #ifndef CONFIG_HIGHMEM
607                 /* Maximum memory usable is what is directly addressable */
608                 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
609                                         MAXMEM>>20);
610                 if (max_pfn > MAX_NONPAE_PFN)
611                         printk(KERN_WARNING
612                                  "Use a HIGHMEM64G enabled kernel.\n");
613                 else
614                         printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
615                 max_pfn = MAXMEM_PFN;
616 #else /* !CONFIG_HIGHMEM */
617 #ifndef CONFIG_HIGHMEM64G
618                 if (max_pfn > MAX_NONPAE_PFN) {
619                         max_pfn = MAX_NONPAE_PFN;
620                         printk(KERN_WARNING "Warning only 4GB will be used. "
621                                 "Use a HIGHMEM64G enabled kernel.\n");
622                 }
623 #endif /* !CONFIG_HIGHMEM64G */
624 #endif /* !CONFIG_HIGHMEM */
625         } else {
626                 if (highmem_pages == -1)
627                         highmem_pages = 0;
628 #ifdef CONFIG_HIGHMEM
629                 if (highmem_pages >= max_pfn) {
630                         printk(KERN_ERR "highmem size specified (%uMB) is "
631                                 "bigger than pages available (%luMB)!\n",
632                                 pages_to_mb(highmem_pages),
633                                 pages_to_mb(max_pfn));
634                         highmem_pages = 0;
635                 }
636                 if (highmem_pages) {
637                         if (max_low_pfn - highmem_pages <
638                             64*1024*1024/PAGE_SIZE){
639                                 printk(KERN_ERR "highmem size %uMB results in "
640                                 "less than 64MB of lowmem, ignoring it.\n"
641                                         , pages_to_mb(highmem_pages));
642                                 highmem_pages = 0;
643                         }
644                         max_low_pfn -= highmem_pages;
645                 }
646 #else
647                 if (highmem_pages)
648                         printk(KERN_ERR "ignoring highmem size on non-highmem"
649                                         " kernel!\n");
650 #endif
651         }
652 }
653
654 #ifndef CONFIG_NEED_MULTIPLE_NODES
655 void __init initmem_init(unsigned long start_pfn,
656                                   unsigned long end_pfn)
657 {
658 #ifdef CONFIG_HIGHMEM
659         highstart_pfn = highend_pfn = max_pfn;
660         if (max_pfn > max_low_pfn)
661                 highstart_pfn = max_low_pfn;
662         memory_present(0, 0, highend_pfn);
663         printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
664                 pages_to_mb(highend_pfn - highstart_pfn));
665         num_physpages = highend_pfn;
666         high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
667 #else
668         memory_present(0, 0, max_low_pfn);
669         num_physpages = max_low_pfn;
670         high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
671 #endif
672 #ifdef CONFIG_FLATMEM
673         max_mapnr = num_physpages;
674 #endif
675         printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
676                         pages_to_mb(max_low_pfn));
677
678         setup_bootmem_allocator();
679 }
680
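/*
 * Size the memory zones: ZONE_DMA up to MAX_DMA_ADDRESS, ZONE_NORMAL up
 * to max_low_pfn and, when configured, ZONE_HIGHMEM up to highend_pfn;
 * then register the e820 RAM ranges and let free_area_init_nodes() run.
 */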
681 void __init zone_sizes_init(void)
682 {
683         unsigned long max_zone_pfns[MAX_NR_ZONES];
684         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
685         max_zone_pfns[ZONE_DMA] =
686                 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
687         max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
688         remove_all_active_ranges();
689 #ifdef CONFIG_HIGHMEM
690         max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
691         e820_register_active_regions(0, 0, highend_pfn);
692 #else
693         e820_register_active_regions(0, 0, max_low_pfn);
694 #endif
695
696         free_area_init_nodes(max_zone_pfns);
697 }
698 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
699
700 void __init setup_bootmem_allocator(void)
701 {
702         int i;
703         unsigned long bootmap_size, bootmap;
704         /*
705          * Initialize the boot-time allocator (with low memory only):
706          */
707         bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
708         bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
709                                  max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
710                                  PAGE_SIZE);
711         if (bootmap == -1L)
712                 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
713         reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
714
715         /* don't touch min_low_pfn */
716         bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
717                                          min_low_pfn, max_low_pfn);
718         printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
719                  max_pfn_mapped<<PAGE_SHIFT);
720         printk(KERN_INFO "  low ram: %08lx - %08lx\n",
721                  min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
722         printk(KERN_INFO "  bootmap %08lx - %08lx\n",
723                  bootmap, bootmap + bootmap_size);
724         for_each_online_node(i)
725                 free_bootmem_with_active_regions(i, max_low_pfn);
726         early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
727
728         after_init_bootmem = 1;
729 }
730
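/*
 * Estimate the worst-case space needed for the kernel direct-mapping
 * page tables covering [0, end) (puds, pmds, ptes plus two pages for
 * the fixmap) and find a physically contiguous block for them via
 * find_e820_area(), recorded in table_start/table_end/table_top.
 */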
731 static void __init find_early_table_space(unsigned long end)
732 {
733         unsigned long puds, pmds, ptes, tables, start;
734
735         puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
736         tables = PAGE_ALIGN(puds * sizeof(pud_t));
737
738         pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
739         tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
740
741         if (cpu_has_pse) {
742                 unsigned long extra;
743
744                 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
745                 extra += PMD_SIZE;
746                 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
747         } else
748                 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
749
750         tables += PAGE_ALIGN(ptes * sizeof(pte_t));
751
752         /* for fixmap */
753         tables += PAGE_SIZE * 2;
754
755         /*
756          * RED-PEN putting page tables only on node 0 could
757          * cause a hotspot and fill up ZONE_DMA. The page tables
758          * need roughly 0.5KB per GB.
759          */
760         start = 0x7000;
761         table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
762                                         tables, PAGE_SIZE);
763         if (table_start == -1UL)
764                 panic("Cannot find space for the kernel page tables");
765
766         table_start >>= PAGE_SHIFT;
767         table_end = table_start;
768         table_top = table_start + (tables>>PAGE_SHIFT);
769
770         printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
771                 end, table_start << PAGE_SHIFT,
772                 (table_start << PAGE_SHIFT) + tables);
773 }
774
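/*
 * Set up the kernel direct mapping for the physical range [start, end):
 * the first 2/4MB and any not-big-page-aligned head/tail are mapped with
 * 4k pages, the aligned middle with big pages when PSE is available.
 * Finally switch to swapper_pg_dir and reserve the page-table pages.
 */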
775 unsigned long __init_refok init_memory_mapping(unsigned long start,
776                                                 unsigned long end)
777 {
778         pgd_t *pgd_base = swapper_pg_dir;
779         unsigned long start_pfn, end_pfn;
780         unsigned long big_page_start;
781
782         /*
783          * Find space for the kernel direct mapping tables.
784          */
785         if (!after_init_bootmem)
786                 find_early_table_space(end);
787
788 #ifdef CONFIG_X86_PAE
789         set_nx();
790         if (nx_enabled)
791                 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
792 #endif
793
794         /* Enable PSE if available */
795         if (cpu_has_pse)
796                 set_in_cr4(X86_CR4_PSE);
797
798         /* Enable PGE if available */
799         if (cpu_has_pge) {
800                 set_in_cr4(X86_CR4_PGE);
801                 __PAGE_KERNEL |= _PAGE_GLOBAL;
802                 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
803         }
804
805         /*
806          * Don't use a large page for the first 2/4MB of memory
807          * because there are often fixed size MTRRs in there
808          * and overlapping MTRRs into large pages can cause
809          * slowdowns.
810          */
811         big_page_start = PMD_SIZE;
812
813         if (start < big_page_start) {
814                 start_pfn = start >> PAGE_SHIFT;
815                 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
816         } else {
817                 /* head not big-page aligned? */
818                 start_pfn = start >> PAGE_SHIFT;
819                 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
820                                  << (PMD_SHIFT - PAGE_SHIFT);
821         }
822         if (start_pfn < end_pfn)
823                 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
824
825         /* big page range */
826         start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
827                          << (PMD_SHIFT - PAGE_SHIFT);
828         if (start_pfn < (big_page_start >> PAGE_SHIFT))
829                 start_pfn =  big_page_start >> PAGE_SHIFT;
830         end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
831         if (start_pfn < end_pfn)
832                 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
833                                                 cpu_has_pse);
834
835         /* tail not big-page aligned? */
836         start_pfn = end_pfn;
837         if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
838                 end_pfn = end >> PAGE_SHIFT;
839                 if (start_pfn < end_pfn)
840                         kernel_physical_mapping_init(pgd_base, start_pfn,
841                                                          end_pfn, 0);
842         }
843
844         early_ioremap_page_table_range_init(pgd_base);
845
846         load_cr3(swapper_pg_dir);
847
848         __flush_tlb_all();
849
850         if (!after_init_bootmem)
851                 reserve_early(table_start << PAGE_SHIFT,
852                                  table_end << PAGE_SHIFT, "PGTABLE");
853
854         return end >> PAGE_SHIFT;
855 }
856
857
858 /*
859  * paging_init() sets up the page tables - note that the first 8MB are
860  * already mapped by head.S.
861  *
862  * This routine also unmaps the page at virtual kernel address 0, so
863  * that we can trap those pesky NULL-reference errors in the kernel.
864  */
865 void __init paging_init(void)
866 {
867         pagetable_init();
868
869         __flush_tlb_all();
870
871         kmap_init();
872
873         /*
874          * NOTE: at this point the bootmem allocator is fully available.
875          */
876         sparse_init();
877         zone_sizes_init();
878
879         paravirt_post_allocator_init();
880 }
881
882 /*
883  * Test if the WP bit works in supervisor mode. It isn't supported on 386's
884  * and also on some strange 486's. All 586+'s are OK. This used to involve
885  * black magic jumps to work around some nasty CPU bugs, but fortunately the
886  * switch to using exceptions got rid of all that.
887  */
888 static void __init test_wp_bit(void)
889 {
890         printk(KERN_INFO
891   "Checking if this processor honours the WP bit even in supervisor mode...");
892
893         /* Any page-aligned address will do, the test is non-destructive */
894         __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
895         boot_cpu_data.wp_works_ok = do_test_wp_bit();
896         clear_fixmap(FIX_WP_TEST);
897
898         if (!boot_cpu_data.wp_works_ok) {
899                 printk(KERN_CONT "No.\n");
900 #ifdef CONFIG_X86_WP_WORKS_OK
901                 panic(
902   "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
903 #endif
904         } else {
905                 printk(KERN_CONT "Ok.\n");
906         }
907 }
908
909 static struct kcore_list kcore_mem, kcore_vmalloc;
910
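/*
 * Final memory setup: release all bootmem pages to the page allocator,
 * account reserved and highmem pages, print the memory and virtual
 * layout summary, run the WP-bit test and zap the low identity mappings.
 */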
911 void __init mem_init(void)
912 {
913         int codesize, reservedpages, datasize, initsize;
914         int tmp;
915
916 #ifdef CONFIG_FLATMEM
917         BUG_ON(!mem_map);
918 #endif
919         /* this will put all low memory onto the freelists */
920         totalram_pages += free_all_bootmem();
921
922         reservedpages = 0;
923         for (tmp = 0; tmp < max_low_pfn; tmp++)
924                 /*
925                  * Only count reserved RAM pages:
926                  */
927                 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
928                         reservedpages++;
929
930         set_highmem_pages_init();
931
932         codesize =  (unsigned long) &_etext - (unsigned long) &_text;
933         datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
934         initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
935
936         kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
937         kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
938                    VMALLOC_END-VMALLOC_START);
939
940         printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
941                         "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
942                 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
943                 num_physpages << (PAGE_SHIFT-10),
944                 codesize >> 10,
945                 reservedpages << (PAGE_SHIFT-10),
946                 datasize >> 10,
947                 initsize >> 10,
948                 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
949                );
950
951         printk(KERN_INFO "virtual kernel memory layout:\n"
952                 "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
953 #ifdef CONFIG_HIGHMEM
954                 "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
955 #endif
956                 "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
957                 "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
958                 "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
959                 "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
960                 "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
961                 FIXADDR_START, FIXADDR_TOP,
962                 (FIXADDR_TOP - FIXADDR_START) >> 10,
963
964 #ifdef CONFIG_HIGHMEM
965                 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
966                 (LAST_PKMAP*PAGE_SIZE) >> 10,
967 #endif
968
969                 VMALLOC_START, VMALLOC_END,
970                 (VMALLOC_END - VMALLOC_START) >> 20,
971
972                 (unsigned long)__va(0), (unsigned long)high_memory,
973                 ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
974
975                 (unsigned long)&__init_begin, (unsigned long)&__init_end,
976                 ((unsigned long)&__init_end -
977                  (unsigned long)&__init_begin) >> 10,
978
979                 (unsigned long)&_etext, (unsigned long)&_edata,
980                 ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
981
982                 (unsigned long)&_text, (unsigned long)&_etext,
983                 ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
984
985 #ifdef CONFIG_HIGHMEM
986         BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE        > FIXADDR_START);
987         BUG_ON(VMALLOC_END                              > PKMAP_BASE);
988 #endif
989         BUG_ON(VMALLOC_START                            > VMALLOC_END);
990         BUG_ON((unsigned long)high_memory               > VMALLOC_START);
991
992         if (boot_cpu_data.wp_works_ok < 0)
993                 test_wp_bit();
994
995         cpa_init();
996         save_pg_dir();
997         zap_low_mappings();
998 }
999
1000 #ifdef CONFIG_MEMORY_HOTPLUG
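/*
 * Memory hotplug: newly added memory goes into ZONE_HIGHMEM on 32-bit,
 * so just hand the pfn range of the new region to __add_pages().
 */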
1001 int arch_add_memory(int nid, u64 start, u64 size)
1002 {
1003         struct pglist_data *pgdata = NODE_DATA(nid);
1004         struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
1005         unsigned long start_pfn = start >> PAGE_SHIFT;
1006         unsigned long nr_pages = size >> PAGE_SHIFT;
1007
1008         return __add_pages(zone, start_pfn, nr_pages);
1009 }
1010 #endif
1011
1012 /*
1013  * This function cannot be __init, since exceptions don't work in that
1014  * section.  Put this after the callers, so that it cannot be inlined.
1015  */
1016 static noinline int do_test_wp_bit(void)
1017 {
1018         char tmp_reg;
1019         int flag;
1020
1021         __asm__ __volatile__(
1022                 "       movb %0, %1     \n"
1023                 "1:     movb %1, %0     \n"
1024                 "       xorl %2, %2     \n"
1025                 "2:                     \n"
1026                 _ASM_EXTABLE(1b,2b)
1027                 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
1028                  "=q" (tmp_reg),
1029                  "=r" (flag)
1030                 :"2" (1)
1031                 :"memory");
1032
1033         return flag;
1034 }
1035
1036 #ifdef CONFIG_DEBUG_RODATA
1037 const int rodata_test_data = 0xC3;
1038 EXPORT_SYMBOL_GPL(rodata_test_data);
1039
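/*
 * Write-protect the kernel text and the read-only data sections once
 * boot is done (and, with CONFIG_CPA_DEBUG, exercise CPA by toggling
 * the protections back and forth).
 */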
1040 void mark_rodata_ro(void)
1041 {
1042         unsigned long start = PFN_ALIGN(_text);
1043         unsigned long size = PFN_ALIGN(_etext) - start;
1044
1045         set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1046         printk(KERN_INFO "Write protecting the kernel text: %luk\n",
1047                 size >> 10);
1048
1049 #ifdef CONFIG_CPA_DEBUG
1050         printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
1051                 start, start+size);
1052         set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
1053
1054         printk(KERN_INFO "Testing CPA: write protecting again\n");
1055         set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
1056 #endif
1057         start += size;
1058         size = (unsigned long)__end_rodata - start;
1059         set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1060         printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
1061                 size >> 10);
1062         rodata_test();
1063
1064 #ifdef CONFIG_CPA_DEBUG
1065         printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
1066         set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
1067
1068         printk(KERN_INFO "Testing CPA: write protecting again\n");
1069         set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1070 #endif
1071 }
1072 #endif
1073
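/*
 * Free the pages between begin and end, poisoning them first so that
 * stray references are easier to spot.  With CONFIG_DEBUG_PAGEALLOC the
 * pages are only marked not-present instead of being freed.
 */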
1074 void free_init_pages(char *what, unsigned long begin, unsigned long end)
1075 {
1076 #ifdef CONFIG_DEBUG_PAGEALLOC
1077         /*
1078          * If debugging page accesses then do not free this memory but
1079          * mark them not present - any buggy init-section access will
1080          * create a kernel page fault:
1081          */
1082         printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
1083                 begin, PAGE_ALIGN(end));
1084         set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
1085 #else
1086         unsigned long addr;
1087
1088         /*
1089          * We just marked the kernel text read only above, now that
1090          * we are going to free part of that, we need to make that
1091          * writeable first.
1092          */
1093         set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
1094
1095         for (addr = begin; addr < end; addr += PAGE_SIZE) {
1096                 ClearPageReserved(virt_to_page(addr));
1097                 init_page_count(virt_to_page(addr));
1098                 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
1099                 free_page(addr);
1100                 totalram_pages++;
1101         }
1102         printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
1103 #endif
1104 }
1105
1106 void free_initmem(void)
1107 {
1108         free_init_pages("unused kernel memory",
1109                         (unsigned long)(&__init_begin),
1110                         (unsigned long)(&__init_end));
1111 }
1112
1113 #ifdef CONFIG_BLK_DEV_INITRD
1114 void free_initrd_mem(unsigned long start, unsigned long end)
1115 {
1116         free_init_pages("initrd memory", start, end);
1117 }
1118 #endif
1119
1120 int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1121                                    int flags)
1122 {
1123         return reserve_bootmem(phys, len, flags);
1124 }