/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical space, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */
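
/*
 * Print a summary of memory state: free areas, free swap, and per-node
 * counts of total, reserved, shared and swap-cached pages.
 */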
void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%lu pages of RAM\n", total);
        printk(KERN_INFO "%lu reserved pages\n", reserved);
        printk(KERN_INFO "%lu pages shared\n", shared);
        printk(KERN_INFO "%lu pages swap cached\n", cached);
}

/* References to section boundaries */
int after_bootmem;
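
/*
 * Allocate a zeroed page for page-table use: from the bootmem allocator
 * during early boot, from the page allocator once bootmem is gone.
 */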
static void *spp_getpage(void)
{
        void *ptr;

        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);
        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem ? "after bootmem" : "");
        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}
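
/*
 * Install a single kernel mapping of vaddr to phys with the given
 * protection, filling in any missing intermediate page-table levels.
 */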
static void set_pte_phys(unsigned long vaddr,
                         unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

/* NOTE: this is meant to be run only at boot */
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        set_pte_phys(address, phys, prot);
}

unsigned long __initdata table_start, table_end;

extern pmd_t temp_boot_pmds[];

static struct temp_map {
        pmd_t *pmd;
        void  *address;
        int    allocated;
} temp_mappings[] __initdata = {
        { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
        { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
        {}
};
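
/*
 * Grab the next free page for early page tables and make it addressable
 * through one of the temporary 2MB boot mappings declared above.
 */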
static __init void *alloc_low_page(int *index, unsigned long *phys)
{
        struct temp_map *ti;
        int i;
        unsigned long pfn = table_end++, paddr;
        void *adr;

        if (pfn >= end_pfn)
                panic("alloc_low_page: ran out of memory");
        for (i = 0; temp_mappings[i].allocated; i++) {
                if (!temp_mappings[i].pmd)
                        panic("alloc_low_page: ran out of temp mappings");
        }
        ti = &temp_mappings[i];
        paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
        set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
        ti->allocated = 1;
        __flush_tlb();
        adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
        *index = i;
        *phys = pfn * PAGE_SIZE;
        return adr;
}

static __init void unmap_low_page(int i)
{
        struct temp_map *ti = &temp_mappings[i];

        set_pmd(ti->pmd, __pmd(0));
        ti->allocated = 0;
}
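
/*
 * Fill one PUD's worth of the direct mapping with 2MB pages, skipping
 * ranges that do not appear in the e820 map at all.
 */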
static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
        long i, j;

        i = pud_index(address);
        pud = pud + i;
        for (; i < PTRS_PER_PUD; pud++, i++) {
                int map;
                unsigned long paddr, pmd_phys;
                pmd_t *pmd;

                paddr = address + i*PUD_SIZE;
                if (paddr >= end) {
                        for (; i < PTRS_PER_PUD; i++, pud++)
                                set_pud(pud, __pud(0));
                        break;
                }
                if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
                        set_pud(pud, __pud(0));
                        continue;
                }
                pmd = alloc_low_page(&map, &pmd_phys);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
                        unsigned long pe;

                        if (paddr >= end) {
                                for (; j < PTRS_PER_PMD; j++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                                break;
                        }
                        pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
                        pe &= __supported_pte_mask;
                        set_pmd(pmd, __pmd(pe));
                }
                unmap_low_page(map);
        }
        __flush_tlb();
}
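
/*
 * Estimate the space needed for the direct-mapping page tables and find
 * a free e820 area below the kernel text to hold them.
 */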
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
                 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

        table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        table_start >>= PAGE_SHIFT;
        table_end = table_start;
}

/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __init init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the
         * memory mapped. Unfortunately this is done currently before the
         * nodes are discovered.
         */
        find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                int map;
                unsigned long pud_phys;
                pud_t *pud = alloc_low_page(&map, &pud_phys);

                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
                unmap_low_page(map);
        }

        asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
        __flush_tlb_all();
        early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end,
                     table_start << PAGE_SHIFT,
                     table_end << PAGE_SHIFT);
}
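
/*
 * Remove the low identity mapping left over from early boot: the boot
 * CPU clears the first kernel PGD entry, APs simply reload CR3 with
 * init_level4_pgt.
 */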
void __cpuinit zap_low_mappings(int cpu)
{
        if (cpu == 0) {
                pgd_t *pgd = pgd_offset_k(0UL);
                pgd_clear(pgd);
        } else {
                /*
                 * For AP's, zap the low identity mappings by changing the cr3
                 * to init_level4_pgt and doing local flush tlb all
                 */
                asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
        }
        __flush_tlb_all();
}

/* Compute zone sizes for the DMA and DMA32 zones in a node. */
__init void
size_zones(unsigned long *z, unsigned long *h,
           unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
        unsigned long w;

        for (i = 0; i < MAX_NR_ZONES; i++)
                z[i] = 0;

        if (start_pfn < MAX_DMA_PFN)
                z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
        if (start_pfn < MAX_DMA32_PFN) {
                unsigned long dma32_pfn = MAX_DMA32_PFN;
                if (dma32_pfn > end_pfn)
                        dma32_pfn = end_pfn;
                z[ZONE_DMA32] = dma32_pfn - start_pfn;
        }
        z[ZONE_NORMAL] = end_pfn - start_pfn;

        /* Remove lower zones from higher ones. */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (z[i])
                        z[i] -= w;
                w += z[i];
        }

        /* Compute holes */
        w = start_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long s = w;
                w += z[i];
                h[i] = e820_hole_size(s, w);
        }

        /* Add the space needed for mem_map to the holes too. */
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

        /* The 16MB DMA zone has the kernel and other misc mappings.
           Account them too. */
        if (h[ZONE_DMA]) {
                h[ZONE_DMA] += dma_reserve;
                if (h[ZONE_DMA] >= z[ZONE_DMA]) {
                        printk(KERN_WARNING
                                "Kernel too large and filling up ZONE_DMA?\n");
                        h[ZONE_DMA] = z[ZONE_DMA];
                }
        }
}

void __init paging_init(void)
{
        unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];

        size_zones(zones, holes, 0, end_pfn);
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
}

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;

                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
                "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;
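
/*
 * Late memory setup: hand all bootmem pages to the buddy allocator,
 * register /proc/kcore regions and print the final memory summary.
 */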
void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;

#ifdef CONFIG_SWIOTLB
        if (!iommu_aperture &&
            (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
                swiotlb = 1;
        if (swiotlb)
                swiotlb_init();
#endif

        /* How many end-of-memory variables you have, grandma! */
        max_low_pfn = end_pfn;
        max_pfn = end_pfn;
        num_physpages = end_pfn;
        high_memory = (void *) __va(end_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                end_pfn << (PAGE_SHIFT-10),
                codesize >> 10,
                reservedpages << (PAGE_SHIFT-10),
                datasize >> 10,
                initsize >> 10);

#ifdef CONFIG_SMP
        /*
         * Sync boot_level4_pgt mappings with the init_level4_pgt
         * except for the low identity mappings which are already zapped
         * in init_level4_pgt. This sync-up is essential for AP's bringup.
         */
        memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
}
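
/*
 * Poison and free the pages holding the __init sections once boot has
 * finished with them.
 */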
void free_initmem(void)
{
        unsigned long addr;

        addr = (unsigned long)(&__init_begin);
        for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                set_page_count(virt_to_page(addr), 1);
                memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
                free_page(addr);
                totalram_pages++;
        }
        memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
        printk("Freeing unused kernel memory: %luk freed\n",
               (__init_end - __init_begin) >> 10);
}

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        if (start < (unsigned long)&_end)
                return;
        printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
        for (; start < end; start += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(start));
                set_page_count(virt_to_page(start), 1);
                free_page(start);
                totalram_pages++;
        }
}
#endif
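
/*
 * Reserve a physical range with the bootmem allocator and, if it lies
 * below the 16MB boundary, account it against the DMA zone.
 */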
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
                dma_reserve += len / PAGE_SIZE;
}
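
/*
 * Check whether a kernel virtual address is actually mapped by walking
 * the page tables by hand; 2MB large pages are handled explicitly.
 */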
int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;
        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;
        pud = pud_offset(pgd, addr);
        if (pud_none(*pud))
                return 0;
        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));
        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
        { 0, }
};

static ctl_table debug_root_table2[] = {
        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
          .child = debug_table2 },
        { 0 },
};

static __init int x8664_sysctl_init(void)
{
        register_sysctl_table(debug_root_table2, 1);
        return 0;
}
__initcall(x8664_sysctl_init);
#endif

/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */
static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_END,
        .vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);

        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}