/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
const struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
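
/*
 * Pages set aside in the DMA zone at boot; accounted via set_dma_reserve()
 * from reserve_bootmem_generic() below.
 */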
static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
/*
 * NOTE: pagetable_init() allocates all the fixmap page tables contiguously
 * in physical memory, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */
void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        struct pglist_data *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages << (PAGE_SHIFT-10));

        for_each_online_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        /* this loop can take a while with 256 GB and 4k pages
                           so update the NMI watchdog */
                        if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
                                touch_nmi_watchdog();

                        if (!pfn_valid(pgdat->node_start_pfn + i))
                                continue;

                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%lu pages of RAM\n", total);
        printk(KERN_INFO "%lu reserved pages\n", reserved);
        printk(KERN_INFO "%lu pages shared\n", shared);
        printk(KERN_INFO "%lu pages swap cached\n", cached);
}
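
/*
 * Allocate a zeroed page for an intermediate page-table level: from the
 * page allocator once after_bootmem is set, otherwise from the bootmem
 * allocator.
 */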
static __init void *spp_getpage(void)
{
        void *ptr;

        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);

        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n",
                      after_bootmem ? "after bootmem" : "");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}
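
/*
 * Install a single kernel PTE mapping 'phys' at 'vaddr' with protection
 * 'prot', allocating any missing intermediate levels with spp_getpage().
 * Used below by __set_fixmap().
 */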
static __init void set_pte_phys(unsigned long vaddr,
                                unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n",
                               pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}
/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        set_pte_phys(address, phys, prot);
}
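
/*
 * PFN range of the pages used for the early direct-mapping page tables;
 * handed out by alloc_low_page() and reserved at the end of
 * init_memory_mapping().
 */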
static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
static __meminit void *alloc_low_page(unsigned long *phys)
{
        unsigned long pfn = table_end++;
        void *adr;

        if (after_bootmem) {
                adr = (void *)get_zeroed_page(GFP_ATOMIC);
                *phys = __pa(adr);
                return adr;
        }

        if (pfn >= end_pfn)
                panic("alloc_low_page: ran out of memory");

        adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
        memset(adr, 0, PAGE_SIZE);
        *phys = pfn * PAGE_SIZE;
        return adr;
}
static __meminit void unmap_low_page(void *adr)
{
        if (after_bootmem)
                return;

        early_iounmap(adr, PAGE_SIZE);
}
/* Must run before zap_low_mappings */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
        unsigned long vaddr;
        pmd_t *pmd, *last_pmd;
        int i, pmds;

        pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
        vaddr = __START_KERNEL_map;
        pmd = level2_kernel_pgt;
        last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;

        for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
                for (i = 0; i < pmds; i++) {
                        if (pmd_present(pmd[i]))
                                goto next;
                }
                vaddr += addr & ~PMD_MASK;
                addr &= PMD_MASK;

                for (i = 0; i < pmds; i++, addr += PMD_SIZE)
                        set_pmd(pmd + i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
                __flush_tlb();
                return (void *)vaddr;
next:
                ;
        }
        printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
        return NULL;
}
/* To avoid virtual aliases later */
__meminit void early_iounmap(void *addr, unsigned long size)
{
        unsigned long vaddr;
        pmd_t *pmd;
        int i, pmds;

        vaddr = (unsigned long)addr;
        pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
        pmd = level2_kernel_pgt + pmd_index(vaddr);

        for (i = 0; i < pmds; i++)
                pmd_clear(pmd + i);
        __flush_tlb();
}
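
/*
 * Set up 2MB-page mappings in one pmd page for the physical range
 * [address, end); entries past 'end' are cleared.
 */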
static void __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
        int i = pmd_index(address);

        for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
                unsigned long entry;
                pmd_t *pmd = pmd_page + pmd_index(address);

                if (address >= end) {
                        if (!after_bootmem)
                                for (; i < PTRS_PER_PMD; i++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                        break;
                }

                if (pmd_val(*pmd))
                        continue;

                entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address;
                entry &= __supported_pte_mask;
                set_pmd(pmd, __pmd(entry));
        }
}
static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, 0);

        spin_lock(&init_mm.page_table_lock);
        phys_pmd_init(pmd, address, end);
        spin_unlock(&init_mm.page_table_lock);
        __flush_tlb_all();
}
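
/*
 * Fill one pud page with direct mappings for [addr, end), allocating pmd
 * pages with alloc_low_page() as needed and skipping ranges that the e820
 * map does not cover.
 */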
static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
        int i = pud_index(addr);

        for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
                unsigned long pmd_phys;
                pud_t *pud = pud_page + pud_index(addr);
                pmd_t *pmd;

                if (addr >= end)
                        break;

                if (!after_bootmem && !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
                        set_pud(pud, __pud(0));
                        continue;
                }

                if (pud_val(*pud)) {
                        phys_pmd_update(pud, addr, end);
                        continue;
                }

                pmd = alloc_low_page(&pmd_phys);
                spin_lock(&init_mm.page_table_lock);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                phys_pmd_init(pmd, addr, end);
                spin_unlock(&init_mm.page_table_lock);
                unmap_low_page(pmd);
        }
        __flush_tlb();
}
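
/*
 * Estimate how much memory the initial kernel page tables need for a
 * direct mapping up to 'end' and find a free e820 area to hold them.
 */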
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, tables, start;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
                 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

        /* RED-PEN putting page tables only on node 0 could
           cause a hotspot and fill up ZONE_DMA. The page tables
           need roughly 0.5KB per GB. */
        start = 0x8000;
        table_start = find_e820_area(start, end, tables);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        table_start >>= PAGE_SHIFT;
        table_end = table_start;

        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
                     end, table_start << PAGE_SHIFT,
                     (table_start << PAGE_SHIFT) + tables);
}
/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the
         * memory mapped. Unfortunately this is done currently before the
         * nodes are discovered.
         */
        if (!after_bootmem)
                find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                unsigned long pud_phys;
                pgd_t *pgd = pgd_offset_k(start);
                pud_t *pud;

                if (after_bootmem)
                        pud = pud_offset(pgd, start & PGDIR_MASK);
                else
                        pud = alloc_low_page(&pud_phys);

                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                if (!after_bootmem)
                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
                unmap_low_page(pud);
        }

        if (!after_bootmem)
                mmu_cr4_features = read_cr4();
        __flush_tlb_all();

        if (!after_bootmem)
                reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
}
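
/*
 * Set up the zone PFN limits and let the core VM build the zone data and
 * free lists from the early node map.
 */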
void __init paging_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES];

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
        max_zone_pfns[ZONE_NORMAL] = end_pfn;

        memory_present(0, 0, end_pfn);
        sparse_init();
        free_area_init_nodes(max_zone_pfns);
}
/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;

                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
        "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}
/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
        totalram_pages++;
        num_physpages++;
}
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
        struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;

        init_memory_mapping(start, (start + size - 1));

        ret = __add_pages(zone, start_pfn, nr_pages);
        if (ret)
                goto error;

        return ret;
error:
        printk("%s: Problem encountered in __add_pages!\n", __func__);
        return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
        return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
                         kcore_modules, kcore_vsyscall;
void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;

        pci_iommu_alloc();

        /* clear_bss() already clear the empty_zero_page */

        /* temporary debugging - double check it's true: */
        {
                int i;

                for (i = 0; i < 1024; i++)
                        WARN_ON_ONCE(empty_zero_page[i]);
        }

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        reservedpages = end_pfn - totalram_pages -
                                        absent_pages_in_range(0, end_pfn);
        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END - VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
               end_pfn << (PAGE_SHIFT-10),
               codesize >> 10,
               reservedpages << (PAGE_SHIFT-10),
               datasize >> 10,
               initsize >> 10);
}
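
/*
 * Free the pages in [begin, end) back to the page allocator, poisoning
 * them first to catch use-after-free of init memory.
 */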
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
        unsigned long addr;

        if (begin >= end)
                return;

        printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
                memset((void *)(addr & ~(PAGE_SIZE-1)),
                       POISON_FREE_INITMEM, PAGE_SIZE);
                if (addr >= __START_KERNEL_map)
                        change_page_attr_addr(addr, 1, __pgprot(0));
                free_page(addr);
                totalram_pages++;
        }
        if (addr > __START_KERNEL_map)
                global_flush_tlb();
}
void free_initmem(void)
{
        free_init_pages("unused kernel memory",
                        (unsigned long)(&__init_begin),
                        (unsigned long)(&__init_end));
}
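
/*
 * Write-protect the kernel read-only data. The start is pushed past the
 * text when SMP alternatives or kprobes may still need to patch it.
 */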
#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
        unsigned long start = (unsigned long)_stext, end;

#ifdef CONFIG_HOTPLUG_CPU
        /* It must still be possible to apply SMP alternatives. */
        if (num_possible_cpus() > 1)
                start = (unsigned long)_etext;
#endif

#ifdef CONFIG_KPROBES
        start = (unsigned long)__start_rodata;
#endif

        end = (unsigned long)__end_rodata;
        start = (start + PAGE_SIZE - 1) & PAGE_MASK;
        end &= PAGE_MASK;
        if (end <= start)
                return;

        change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);

        printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
               (end - start) >> 10);

        /*
         * change_page_attr_addr() requires a global_flush_tlb() call after it.
         * We do this after the printk so that if something went wrong in the
         * change, the printk gets out at least to give a better debug hint
         * of who is the culprit.
         */
        global_flush_tlb();
}
#endif
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        free_init_pages("initrd memory", start, end);
}
#endif
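
/*
 * Reserve a physical range with the bootmem allocator, crediting any part
 * that falls in the DMA zone to dma_reserve.
 */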
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
#endif
        unsigned long pfn = phys >> PAGE_SHIFT;

        if (pfn >= end_pfn) {
                /* This can happen with kdump kernels when accessing firmware
                   tables. */
                if (pfn < end_pfn_map)
                        return;
                printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
                       phys, len);
                return;
        }

        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
                dma_reserve += len / PAGE_SIZE;
                set_dma_reserve(dma_reserve);
        }
}
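
/*
 * Walk the kernel page tables and check whether 'addr' is backed by a
 * valid page, handling 2MB mappings at the pmd level.
 */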
int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}
/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */
static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
        .vm_page_prot = PAGE_READONLY_EXEC,
        .vm_flags = VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);

        if (!vma)
                return 0;

        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}
/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}
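
/* Report names for the special vdso and vsyscall mappings in /proc/<pid>/maps. */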
const char *arch_vma_name(struct vm_area_struct *vma)
{
        if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
                return "[vdso]";
        if (vma == &gate_vma)
                return "[vsyscall]";
        return NULL;
}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
int __meminit vmemmap_populate(struct page *start_page,
                               unsigned long size, int node)
{
        unsigned long addr = (unsigned long)start_page;
        unsigned long end = (unsigned long)(start_page + size);
        unsigned long next;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        for (; addr < end; addr = next) {
                next = pmd_addr_end(addr, end);

                pgd = vmemmap_pgd_populate(addr, node);
                if (!pgd)
                        return -ENOMEM;

                pud = vmemmap_pud_populate(pgd, addr, node);
                if (!pud)
                        return -ENOMEM;

                pmd = pmd_offset(pud, addr);
                if (pmd_none(*pmd)) {
                        pte_t entry;
                        void *p = vmemmap_alloc_block(PMD_SIZE, node);

                        if (!p)
                                return -ENOMEM;

                        entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL_LARGE);
                        set_pmd(pmd, __pmd(pte_val(entry)));

                        printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
                               addr, addr + PMD_SIZE - 1, p, node);
                } else
                        vmemmap_verify((pte_t *)pmd, node, addr, next);
        }

        return 0;
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */