/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#ifndef Dprintk
#define Dprintk(x...)
#endif

const struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical space, so we can cache the location of the first one and move
 * around without checking the pgd every time.
 */
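
/*
 * Dump a summary of memory usage to the console: free areas, free swap,
 * and per-node counts of RAM, reserved, shared and swap-cached pages.
 */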
void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	pg_data_t *pgdat;
	struct page *page;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/* this loop can take a while with 256 GB and 4k pages
			   so update the NMI watchdog */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
				touch_nmi_watchdog();
			if (!pfn_valid(pgdat->node_start_pfn + i))
				continue;
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%lu pages of RAM\n", total);
	printk(KERN_INFO "%lu reserved pages\n", reserved);
	printk(KERN_INFO "%lu pages shared\n", shared);
	printk(KERN_INFO "%lu pages swap cached\n", cached);
}
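
/*
 * after_bootmem is set once the bootmem allocator has been retired (see
 * mem_init() below); spp_getpage() then switches from alloc_bootmem_pages()
 * to get_zeroed_page() when allocating page-table pages.
 */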
int after_bootmem;

static __init void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
		panic("set_pte_phys: cannot allocate page data %s\n",
		      after_bootmem ? "after bootmem" : "");

	Dprintk("spp_getpage %p\n", ptr);
	return ptr;
}
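
/*
 * Install a single kernel PTE mapping vaddr to phys, allocating any missing
 * intermediate pud/pmd/pte tables via spp_getpage(). Used by __set_fixmap().
 */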
static __init void set_pte_phys(unsigned long vaddr,
				unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk("PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}
/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		printk("Invalid __set_fixmap\n");
		return;
	}
	set_pte_phys(address, phys, prot);
}
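
/*
 * table_start/table_end track the pfn window reserved for the early direct
 * mapping page tables. alloc_low_page() hands out one zeroed page at a time:
 * before bootmem is up it consumes this window (temporarily mapped with
 * early_ioremap()), afterwards it simply uses get_zeroed_page().
 */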
unsigned long __meminitdata table_start, table_end;

static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);
		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys = pfn * PAGE_SIZE;
	return adr;
}
static __meminit void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}
/* Must run before zap_low_mappings */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd, *last_pmd;
	int i, pmds;

	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	vaddr = __START_KERNEL_map;
	pmd = level2_kernel_pgt;
	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
		for (i = 0; i < pmds; i++) {
			if (pmd_present(pmd[i]))
				goto next;
		}
		vaddr += addr & ~PMD_MASK;
		addr &= PMD_MASK;
		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
			set_pmd(pmd + i, __pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
		__flush_tlb();
		return (void *)vaddr;
	next:
		;
	}
	printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
	return NULL;
}
/* To avoid virtual aliases later */
__meminit void early_iounmap(void *addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd;
	int i, pmds;

	vaddr = (unsigned long)addr;
	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	pmd = level2_kernel_pgt + pmd_index(vaddr);
	for (i = 0; i < pmds; i++)
		pmd_clear(pmd + i);
	__flush_tlb();
}
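
/*
 * Fill one pmd page with 2MB (PSE) kernel mappings for [address, end).
 * Entries past 'end' are cleared during the initial boot-time pass.
 */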
static void __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long entry;
		pmd_t *pmd = pmd_page + pmd_index(address);

		if (address >= end) {
			if (!after_bootmem)
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			break;
		}

		if (pmd_val(*pmd))
			continue;

		entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
		entry &= __supported_pte_mask;
		set_pmd(pmd, __pmd(entry));
	}
}
static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	spin_lock(&init_mm.page_table_lock);
	phys_pmd_init(pmd, address, end);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
}
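
/*
 * Fill one pud page for [addr, end), allocating a pmd page per 1GB slot and
 * skipping ranges the e820 map does not cover during the boot-time pass.
 */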
static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem && !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			phys_pmd_update(pud, addr, end);
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		phys_pmd_init(pmd, addr, end);
		spin_unlock(&init_mm.page_table_lock);
		unmap_low_page(pmd);
	}
	__flush_tlb();
}
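
/*
 * Worst-case sizing below: one pud entry per 1GB and one pmd entry per 2MB
 * of mapped memory. As a rough example (not in the original source), mapping
 * 64GB needs 64 pud entries and 32768 pmd entries; at 8 bytes each that is
 * one page of pud tables plus 256KB of pmd tables after page rounding.
 */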
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

	/* RED-PEN putting page tables only on node 0 could
	   cause a hotspot and fill up ZONE_DMA. The page tables
	   need roughly 0.5KB per GB. */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}
/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next;

	Dprintk("init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 * Later we should allocate these tables in the local node of the memory
	 * mapped. Unfortunately this is done currently before the nodes are
	 * discovered.
	 */
	if (!after_bootmem)
		find_early_table_space(end);

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		unsigned long pud_phys;
		pgd_t *pgd = pgd_offset_k(start);
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(pud);
	}

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();
}
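
/*
 * Non-NUMA zone setup: DMA below 16MB, DMA32 below 4GB, everything else in
 * ZONE_NORMAL up to end_pfn.
 */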
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	memory_present(0, 0, end_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif
/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
	unsigned long end = address + size;

	BUG_ON(address & ~LARGE_PAGE_MASK);
	BUG_ON(size & ~LARGE_PAGE_MASK);

	for (; address < end; address += LARGE_PAGE_SIZE) {
		pgd_t *pgd = pgd_offset_k(address);
		pud_t *pud;
		pmd_t *pmd;
		if (pgd_none(*pgd))
			continue;
		pud = pud_offset(pgd, address);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, address);
		if (!pmd || pmd_none(*pmd))
			continue;
		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
			/* Could handle this, but it should not happen currently. */
			printk(KERN_ERR
		"clear_kernel_mapping: mapping has been split. will leak memory\n");
			pmd_ERROR(*pmd);
		}
		set_pmd(pmd, __pmd(0));
	}
	__flush_tlb_all();
}
/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalram_pages++;
	num_physpages++;
}
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is always added to the NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	init_memory_mapping(start, (start + size - 1));

	ret = __add_pages(zone, start_pfn, nr_pages);
	if (ret)
		goto error;

	return ret;
error:
	printk("%s: Problem encountered in __add_pages!\n", __func__);
	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
/*
 * Memory hotadd without sparsemem: the mem_maps have been allocated in
 * advance, so just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
	int err = -EIO;
	unsigned long pfn;
	unsigned long total = 0, mem = 0;
	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
		if (pfn_valid(pfn)) {
			online_page(pfn_to_page(pfn));
			err = 0;
			mem++;
		}
		total++;
	}
	if (!err) {
		z->spanned_pages += total;
		z->present_pages += mem;
		z->zone_pgdat->node_spanned_pages += total;
		z->zone_pgdat->node_present_pages += mem;
	}
	return err;
}
#endif
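
/*
 * Late memory init: release the bootmem bitmap onto the freelists, account
 * for reserved pages, register /proc/kcore regions and print the final
 * memory summary.
 */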
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
			 kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear the zero-page */
	memset(empty_zero_page, 0, PAGE_SIZE);

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = end_pfn - totalram_pages -
					absent_pages_in_range(0, end_pfn);

	after_bootmem = 1;

	codesize = (unsigned long) &_etext - (unsigned long) &_text;
	datasize = (unsigned long) &_edata - (unsigned long) &_etext;
	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
		   VSYSCALL_END - VSYSCALL_START);

	printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}
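
/*
 * Return a range of init pages to the page allocator, poisoning them first
 * and dropping any kernel-text alias mapping above __START_KERNEL_map.
 */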
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr;

	if (begin >= end)
		return;

	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
	for (addr = begin; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		if (addr >= __START_KERNEL_map)
			change_page_attr_addr(addr, 1, __pgprot(0));
		free_page(addr);
		totalram_pages++;
	}
	if (addr > __START_KERNEL_map)
		global_flush_tlb();
}
void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}
#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
	unsigned long start = (unsigned long)_stext, end;

#ifdef CONFIG_HOTPLUG_CPU
	/* It must still be possible to apply SMP alternatives. */
	if (num_possible_cpus() > 1)
		start = (unsigned long)_etext;
#endif

#ifdef CONFIG_KPROBES
	start = (unsigned long)__start_rodata;
#endif

	end = (unsigned long)__end_rodata;
	start = (start + PAGE_SIZE - 1) & PAGE_MASK;
	end &= PAGE_MASK;
	if (end <= start)
		return;

	change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);

	/*
	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
	 * We do this after the printk so that if something went wrong in the
	 * change, the printk gets out at least to give a better debug hint
	 * of who is the culprit.
	 */
	global_flush_tlb();
}
#endif
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif
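
/*
 * Reserve a physical range with the bootmem allocator (per-node on NUMA) and
 * account any portion below the 16MB DMA limit in dma_reserve.
 */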
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
	int nid = phys_to_nid(phys);
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;
	if (pfn >= end_pfn) {
		/* This can happen with kdump kernels when accessing firmware
		   tables. */
		if (pfn < end_pfn_map)
			return;
		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
				phys, len);
		return;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
	reserve_bootmem(phys, len);
#endif
	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}
}
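
/*
 * Walk the kernel page tables to decide whether a virtual address is backed
 * by a valid pfn. Non-canonical addresses are rejected up front.
 */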
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
	return pfn_valid(pte_pfn(*pte));
}
/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64-bit vsyscall page now. 32-bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
	.vm_start = VSYSCALL_START,
	.vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
	.vm_page_prot = PAGE_READONLY_EXEC,
	.vm_flags = VM_READ | VM_EXEC
};
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);
	if (!vma)
		return 0;
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
int __meminit vmemmap_populate(struct page *start_page,
						unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;
		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			pte_t entry;
			void *p = vmemmap_alloc_block(PMD_SIZE, node);
			if (!p)
				return -ENOMEM;

			entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
			mk_pte_huge(entry);
			set_pmd(pmd, __pmd(pte_val(entry)));

			printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
				addr, addr + PMD_SIZE - 1, p, node);
		} else
			vmemmap_verify((pte_t *)pmd, node, addr, next);
	}

	return 0;
}
#endif