1 /*
2  *  linux/arch/i386/kernel/nmi.c
3  *
4  *  NMI watchdog support on APIC systems
5  *
6  *  Started by Ingo Molnar <mingo@redhat.com>
7  *
8  *  Fixes:
9  *  Mikael Pettersson   : AMD K7 support for local APIC NMI watchdog.
10  *  Mikael Pettersson   : Power Management for local APIC NMI watchdog.
11  *  Mikael Pettersson   : Pentium 4 support for local APIC NMI watchdog.
12  *  Pavel Machek and
13  *  Mikael Pettersson   : PM converted to driver model. Disable/enable API.
14  */
15
16 #include <linux/delay.h>
17 #include <linux/interrupt.h>
18 #include <linux/module.h>
19 #include <linux/nmi.h>
20 #include <linux/sysdev.h>
21 #include <linux/sysctl.h>
22 #include <linux/percpu.h>
23 #include <linux/dmi.h>
24 #include <linux/kprobes.h>
25 #include <linux/cpumask.h>
26
27 #include <asm/smp.h>
28 #include <asm/nmi.h>
29 #include <asm/kdebug.h>
30 #include <asm/intel_arch_perfmon.h>
31
32 #include "mach_traps.h"
33
34 int unknown_nmi_panic;
35 int nmi_watchdog_enabled;
36
37 /* perfctr_nmi_owner tracks the ownership of the perfctr registers:
38  * evntsel_nmi_owner tracks the ownership of the event selection registers
39  * - different performance counters/event selection registers may be
40  *   reserved by different subsystems; this reservation system just tries
41  *   to coordinate things a little
42  */
43 static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
44 static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
45
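/* CPUs whose bit is set here dump a backtrace from NMI context on their next
 * watchdog NMI; the mask is populated by __trigger_all_cpu_backtrace() below. */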
46 static cpumask_t backtrace_mask = CPU_MASK_NONE;
47
48 /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
49  * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
50  */
51 #define NMI_MAX_COUNTER_BITS 66
52
53 /* nmi_active:
54  * >0: the lapic NMI watchdog is active, but can be disabled
55  * <0: the lapic NMI watchdog has not been set up, and cannot
56  *     be enabled
57  *  0: the lapic NMI watchdog is disabled, but can be enabled
58  */
59 atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */
60
61 unsigned int nmi_watchdog = NMI_DEFAULT;
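/* nmi_hz starts at HZ for the boot-time self test; once the watchdog is known
 * to work, check_nmi_watchdog() lowers it to 1 Hz (or the lowest rate a 31-bit
 * ARCH_PERFMON counter can sustain). */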
62 static unsigned int nmi_hz = HZ;
63
64 struct nmi_watchdog_ctlblk {
65         int enabled;
66         u64 check_bit;     /* top bit of the armed perfctr: still set => no overflow yet */
67         unsigned int cccr_msr;     /* P4 only: CCCR paired with the perfctr (0 otherwise) */
68         unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
69         unsigned int evntsel_msr;  /* the MSR to select the events to handle */
70 };
71 static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
72
73 /* local prototypes */
74 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
75
76 extern void show_registers(struct pt_regs *regs);
77 extern int unknown_nmi_panic;
78
79 /* converts an msr to an appropriate reservation bit */
80 static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
81 {
82         /* returns the bit offset of the performance counter register */
83         switch (boot_cpu_data.x86_vendor) {
84         case X86_VENDOR_AMD:
85                 return (msr - MSR_K7_PERFCTR0);
86         case X86_VENDOR_INTEL:
87                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
88                         return (msr - MSR_ARCH_PERFMON_PERFCTR0);
89
90                 switch (boot_cpu_data.x86) {
91                 case 6:
92                         return (msr - MSR_P6_PERFCTR0);
93                 case 15:
94                         return (msr - MSR_P4_BPU_PERFCTR0);
95                 }
96         }
97         return 0;
98 }
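/*
 * Example (illustrative): on a K7, MSR_K7_PERFCTR2 maps to bit 2 of
 * perfctr_nmi_owner, so reserving that counter is just a test_and_set_bit()
 * on its offset in the per-cpu bitmask below.
 */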
99
100 /* converts an msr to an appropriate reservation bit */
101 static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
102 {
103         /* returns the bit offset of the event selection register */
104         switch (boot_cpu_data.x86_vendor) {
105         case X86_VENDOR_AMD:
106                 return (msr - MSR_K7_EVNTSEL0);
107         case X86_VENDOR_INTEL:
108                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
109                         return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
110
111                 switch (boot_cpu_data.x86) {
112                 case 6:
113                         return (msr - MSR_P6_EVNTSEL0);
114                 case 15:
115                         return (msr - MSR_P4_BSU_ESCR0);
116                 }
117         }
118         return 0;
119 }
120
121 /* checks whether a counter bit is available (hack for oprofile) */
122 int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
123 {
124         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
125
126         return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
127 }
128
129 /* checks an msr for availability */
130 int avail_to_resrv_perfctr_nmi(unsigned int msr)
131 {
132         unsigned int counter;
133
134         counter = nmi_perfctr_msr_to_bit(msr);
135         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
136
137         return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
138 }
139
140 int reserve_perfctr_nmi(unsigned int msr)
141 {
142         unsigned int counter;
143
144         counter = nmi_perfctr_msr_to_bit(msr);
145         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
146
147         if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
148                 return 1;
149         return 0;
150 }
151
152 void release_perfctr_nmi(unsigned int msr)
153 {
154         unsigned int counter;
155
156         counter = nmi_perfctr_msr_to_bit(msr);
157         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
158
159         clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
160 }
161
162 int reserve_evntsel_nmi(unsigned int msr)
163 {
164         unsigned int counter;
165
166         counter = nmi_evntsel_msr_to_bit(msr);
167         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
168
169         if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
170                 return 1;
171         return 0;
172 }
173
174 void release_evntsel_nmi(unsigned int msr)
175 {
176         unsigned int counter;
177
178         counter = nmi_evntsel_msr_to_bit(msr);
179         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
180
181         clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
182 }
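/*
 * Illustrative usage (not from this file) by a subsystem such as oprofile
 * that wants exclusive use of a counter/event-select pair:
 *
 *	if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0))
 *		return -EBUSY;
 *	if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) {
 *		release_perfctr_nmi(MSR_K7_PERFCTR0);
 *		return -EBUSY;
 *	}
 *	... program and use the counter ...
 *	release_evntsel_nmi(MSR_K7_EVNTSEL0);
 *	release_perfctr_nmi(MSR_K7_PERFCTR0);
 *
 * The reserve_*() routines return 1 on success and 0 if the MSR is already
 * owned by someone else.
 */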
183
184 static __cpuinit inline int nmi_known_cpu(void)
185 {
186         switch (boot_cpu_data.x86_vendor) {
187         case X86_VENDOR_AMD:
188                 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
189         case X86_VENDOR_INTEL:
190                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
191                         return 1;
192                 else
193                         return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
194         }
195         return 0;
196 }
197
198 #ifdef CONFIG_SMP
199 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
200  * the CPU is idle. To make sure the NMI watchdog really ticks on all
201  * CPUs during the test make them busy.
202  */
203 static __init void nmi_cpu_busy(void *data)
204 {
205         volatile int *endflag = data;
206         local_irq_enable_in_hardirq();
207         /* Intentionally don't use cpu_relax here. This is
208            to make sure that the performance counter really ticks,
209            even if there is a simulator or similar that catches the
210            pause instruction. On a real HT machine this is fine because
211            all other CPUs are busy with "useless" delay loops and don't
212            care if they get somewhat fewer cycles. */
213         while (*endflag == 0)
214                 barrier();
215 }
216 #endif
217
218 static int __init check_nmi_watchdog(void)
219 {
220         volatile int endflag = 0;
221         unsigned int *prev_nmi_count;
222         int cpu;
223
224         /* Enable NMI watchdog for newer systems.
225            Probably safe on most older systems too, but let's be careful.
226            IBM ThinkPads use INT10 inside SMM, which allows an early NMI inside SMM
227            to hang the system.  Disable the watchdog on all ThinkPads. */
228         if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004 &&
229                 !dmi_name_in_vendors("ThinkPad"))
230                 nmi_watchdog = NMI_LOCAL_APIC;
231
232         if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
233                 return 0;
234
235         if (!atomic_read(&nmi_active))
236                 return 0;
237
238         prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
239         if (!prev_nmi_count)
240                 return -1;
241
242         printk(KERN_INFO "Testing NMI watchdog ... ");
243
244         if (nmi_watchdog == NMI_LOCAL_APIC)
245                 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
246
247         for_each_possible_cpu(cpu)
248                 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
249         local_irq_enable();
250                 mdelay((10*1000)/nmi_hz); /* wait 10 ticks */
251
252         for_each_possible_cpu(cpu) {
253 #ifdef CONFIG_SMP
254                 /* Check cpu_callin_map here because that is set
255                    after the timer is started. */
256                 if (!cpu_isset(cpu, cpu_callin_map))
257                         continue;
258 #endif
259                 if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
260                         continue;
261                 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
262                         printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
263                                 cpu,
264                                 prev_nmi_count[cpu],
265                                 nmi_count(cpu));
266                         per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
267                         atomic_dec(&nmi_active);
268                 }
269         }
270         if (!atomic_read(&nmi_active)) {
271                 kfree(prev_nmi_count);
272                 atomic_set(&nmi_active, -1);
273                 return -1;
274         }
275         endflag = 1;
276         printk("OK.\n");
277
278         /* now that we know it works we can reduce NMI frequency to
279            something more reasonable; makes a difference in some configs */
280         if (nmi_watchdog == NMI_LOCAL_APIC) {
281                 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
282
283                 nmi_hz = 1;
284                 /*
285                  * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
286                  * are writable, with higher bits sign extending from bit 31.
287                  * So we can only program the counter with 31-bit values, and
288                  * bit 31 must be set so that bits 32..63 sign extend to 1.
289                  * Find the appropriate nmi_hz for that limit.
290                  */
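                /*
                 * Worked example: on a 3 GHz CPU, cpu_khz * 1000 = 3e9 >
                 * 0x7fffffff, so count = 3e9 / 0x7fffffff = 1 and nmi_hz
                 * is bumped to 2.
                 */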
291                 if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
292                         ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
293                         u64 count = (u64)cpu_khz * 1000;
294                         do_div(count, 0x7fffffffUL);
295                         nmi_hz = count + 1;
296                 }
297         }
298
299         kfree(prev_nmi_count);
300         return 0;
301 }
302 /* This needs to happen later in boot so counters are working */
303 late_initcall(check_nmi_watchdog);
304
305 static int __init setup_nmi_watchdog(char *str)
306 {
307         int nmi;
308
309         get_option(&str, &nmi);
310
311         if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
312                 return 0;
313         /*
314          * If any other x86 CPU has a local APIC, then
315          * please test the NMI stuff there and send me the
316          * missing bits. Right now Intel P6/P4 and AMD K7 only.
317          */
318         if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0))
319                 return 0;  /* no lapic support */
320         nmi_watchdog = nmi;
321         return 1;
322 }
323
324 __setup("nmi_watchdog=", setup_nmi_watchdog);
325
326 static void disable_lapic_nmi_watchdog(void)
327 {
328         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
329
330         if (atomic_read(&nmi_active) <= 0)
331                 return;
332
333         on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
334
335         BUG_ON(atomic_read(&nmi_active) != 0);
336 }
337
338 static void enable_lapic_nmi_watchdog(void)
339 {
340         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
341
342         /* are we already enabled? */
343         if (atomic_read(&nmi_active) != 0)
344                 return;
345
346         /* are we lapic aware? */
347         if (nmi_known_cpu() <= 0)
348                 return;
349
350         on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
351         touch_nmi_watchdog();
352 }
353
354 void disable_timer_nmi_watchdog(void)
355 {
356         BUG_ON(nmi_watchdog != NMI_IO_APIC);
357
358         if (atomic_read(&nmi_active) <= 0)
359                 return;
360
361         disable_irq(0);
362         on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
363
364         BUG_ON(atomic_read(&nmi_active) != 0);
365 }
366
367 void enable_timer_nmi_watchdog(void)
368 {
369         BUG_ON(nmi_watchdog != NMI_IO_APIC);
370
371         if (atomic_read(&nmi_active) == 0) {
372                 touch_nmi_watchdog();
373                 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
374                 enable_irq(0);
375         }
376 }
377
378 #ifdef CONFIG_PM
379
380 static int nmi_pm_active; /* nmi_active before suspend */
381
382 static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
383 {
384         /* only CPU0 goes here, other CPUs should be offline */
385         nmi_pm_active = atomic_read(&nmi_active);
386         stop_apic_nmi_watchdog(NULL);
387         BUG_ON(atomic_read(&nmi_active) != 0);
388         return 0;
389 }
390
391 static int lapic_nmi_resume(struct sys_device *dev)
392 {
393         /* only CPU0 goes here, other CPUs should be offline */
394         if (nmi_pm_active > 0) {
395                 setup_apic_nmi_watchdog(NULL);
396                 touch_nmi_watchdog();
397         }
398         return 0;
399 }
400
401
402 static struct sysdev_class nmi_sysclass = {
403         set_kset_name("lapic_nmi"),
404         .resume         = lapic_nmi_resume,
405         .suspend        = lapic_nmi_suspend,
406 };
407
408 static struct sys_device device_lapic_nmi = {
409         .id     = 0,
410         .cls    = &nmi_sysclass,
411 };
412
413 static int __init init_lapic_nmi_sysfs(void)
414 {
415         int error;
416
417         /* should really be a BUG_ON, but because this is an
418          * init call, it just doesn't work.  -dcz
419          */
420         if (nmi_watchdog != NMI_LOCAL_APIC)
421                 return 0;
422
423         if (atomic_read(&nmi_active) < 0)
424                 return 0;
425
426         error = sysdev_class_register(&nmi_sysclass);
427         if (!error)
428                 error = sysdev_register(&device_lapic_nmi);
429         return error;
430 }
431 /* must come after the local APIC's device_initcall() */
432 late_initcall(init_lapic_nmi_sysfs);
433
434 #endif  /* CONFIG_PM */
435
436 /*
437  * Activate the NMI watchdog via the local APIC.
438  * Original code written by Keith Owens.
439  */
440
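/*
 * Program the perfctr with -(cpu_khz * 1000 / nmi_hz): counting up from that
 * negative value, it overflows and raises the counter PMI (routed as an NMI
 * via LVTPC) roughly nmi_hz times per second.
 */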
441 static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
442 {
443         u64 count = (u64)cpu_khz * 1000;
444
445         do_div(count, nmi_hz);
446         if (descr)
447                 Dprintk("setting %s to -0x%08Lx\n", descr, count);
448         wrmsrl(perfctr_msr, 0 - count);
449 }
450
451 /* Note that these events don't tick when the CPU idles. This means
452    the frequency varies with CPU load. */
453
454 #define K7_EVNTSEL_ENABLE       (1 << 22)
455 #define K7_EVNTSEL_INT          (1 << 20)
456 #define K7_EVNTSEL_OS           (1 << 17)
457 #define K7_EVNTSEL_USR          (1 << 16)
458 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING    0x76
459 #define K7_NMI_EVENT            K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
460
461 static int setup_k7_watchdog(void)
462 {
463         unsigned int perfctr_msr, evntsel_msr;
464         unsigned int evntsel;
465         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
466
467         perfctr_msr = MSR_K7_PERFCTR0;
468         evntsel_msr = MSR_K7_EVNTSEL0;
469         if (!reserve_perfctr_nmi(perfctr_msr))
470                 goto fail;
471
472         if (!reserve_evntsel_nmi(evntsel_msr))
473                 goto fail1;
474
475         wrmsrl(perfctr_msr, 0UL);
476
477         evntsel = K7_EVNTSEL_INT
478                 | K7_EVNTSEL_OS
479                 | K7_EVNTSEL_USR
480                 | K7_NMI_EVENT;
481
482         /* setup the timer */
483         wrmsr(evntsel_msr, evntsel, 0);
484         write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
485         apic_write(APIC_LVTPC, APIC_DM_NMI);
486         evntsel |= K7_EVNTSEL_ENABLE;
487         wrmsr(evntsel_msr, evntsel, 0);
488
489         wd->perfctr_msr = perfctr_msr;
490         wd->evntsel_msr = evntsel_msr;
491         wd->cccr_msr = 0;  /* unused */
492         wd->check_bit = 1ULL<<63;
493         return 1;
494 fail1:
495         release_perfctr_nmi(perfctr_msr);
496 fail:
497         return 0;
498 }
499
500 static void stop_k7_watchdog(void)
501 {
502         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
503
504         wrmsr(wd->evntsel_msr, 0, 0);
505
506         release_evntsel_nmi(wd->evntsel_msr);
507         release_perfctr_nmi(wd->perfctr_msr);
508 }
509
510 #define P6_EVNTSEL0_ENABLE      (1 << 22)
511 #define P6_EVNTSEL_INT          (1 << 20)
512 #define P6_EVNTSEL_OS           (1 << 17)
513 #define P6_EVNTSEL_USR          (1 << 16)
514 #define P6_EVENT_CPU_CLOCKS_NOT_HALTED  0x79
515 #define P6_NMI_EVENT            P6_EVENT_CPU_CLOCKS_NOT_HALTED
516
517 static int setup_p6_watchdog(void)
518 {
519         unsigned int perfctr_msr, evntsel_msr;
520         unsigned int evntsel;
521         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
522
523         perfctr_msr = MSR_P6_PERFCTR0;
524         evntsel_msr = MSR_P6_EVNTSEL0;
525         if (!reserve_perfctr_nmi(perfctr_msr))
526                 goto fail;
527
528         if (!reserve_evntsel_nmi(evntsel_msr))
529                 goto fail1;
530
531         wrmsrl(perfctr_msr, 0UL);
532
533         evntsel = P6_EVNTSEL_INT
534                 | P6_EVNTSEL_OS
535                 | P6_EVNTSEL_USR
536                 | P6_NMI_EVENT;
537
538         /* setup the timer */
539         wrmsr(evntsel_msr, evntsel, 0);
540         write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
541         apic_write(APIC_LVTPC, APIC_DM_NMI);
542         evntsel |= P6_EVNTSEL0_ENABLE;
543         wrmsr(evntsel_msr, evntsel, 0);
544
545         wd->perfctr_msr = perfctr_msr;
546         wd->evntsel_msr = evntsel_msr;
547         wd->cccr_msr = 0;  /* unused */
548         wd->check_bit = 1ULL<<39;
549         return 1;
550 fail1:
551         release_perfctr_nmi(perfctr_msr);
552 fail:
553         return 0;
554 }
555
556 static void stop_p6_watchdog(void)
557 {
558         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
559
560         wrmsr(wd->evntsel_msr, 0, 0);
561
562         release_evntsel_nmi(wd->evntsel_msr);
563         release_perfctr_nmi(wd->perfctr_msr);
564 }
565
566 /* Note that these events don't tick when the CPU idles. This means
567    the frequency varies with CPU load. */
568
569 #define MSR_P4_MISC_ENABLE_PERF_AVAIL   (1<<7)
570 #define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
571 #define P4_ESCR_OS              (1<<3)
572 #define P4_ESCR_USR             (1<<2)
573 #define P4_CCCR_OVF_PMI0        (1<<26)
574 #define P4_CCCR_OVF_PMI1        (1<<27)
575 #define P4_CCCR_THRESHOLD(N)    ((N)<<20)
576 #define P4_CCCR_COMPLEMENT      (1<<19)
577 #define P4_CCCR_COMPARE         (1<<18)
578 #define P4_CCCR_REQUIRED        (3<<16)
579 #define P4_CCCR_ESCR_SELECT(N)  ((N)<<13)
580 #define P4_CCCR_ENABLE          (1<<12)
581 #define P4_CCCR_OVF             (1<<31)
582 /* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
583    CRU_ESCR0 (with any non-null event selector) through a complemented
584    max threshold. [IA32-Vol3, Section 14.9.9] */
585
586 static int setup_p4_watchdog(void)
587 {
588         unsigned int perfctr_msr, evntsel_msr, cccr_msr;
589         unsigned int evntsel, cccr_val;
590         unsigned int misc_enable, dummy;
591         unsigned int ht_num;
592         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
593
594         rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
595         if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
596                 return 0;
597
598 #ifdef CONFIG_SMP
599         /* detect which hyperthread we are on */
600         if (smp_num_siblings == 2) {
601                 unsigned int ebx, apicid;
602
603                 ebx = cpuid_ebx(1);
604                 apicid = (ebx >> 24) & 0xff;
605                 ht_num = apicid & 1;
606         } else
607 #endif
608                 ht_num = 0;
609
610         /* performance counters are shared resources, so
611          * assign each hyperthread its own set
612          * (re-use the ESCR0 register, seems safe
613          * and keeps the cccr_val the same)
614          */
615         if (!ht_num) {
616                 /* logical cpu 0 */
617                 perfctr_msr = MSR_P4_IQ_PERFCTR0;
618                 evntsel_msr = MSR_P4_CRU_ESCR0;
619                 cccr_msr = MSR_P4_IQ_CCCR0;
620                 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
621         } else {
622                 /* logical cpu 1 */
623                 perfctr_msr = MSR_P4_IQ_PERFCTR1;
624                 evntsel_msr = MSR_P4_CRU_ESCR0;
625                 cccr_msr = MSR_P4_IQ_CCCR1;
626                 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
627         }
628
629         if (!reserve_perfctr_nmi(perfctr_msr))
630                 goto fail;
631
632         if (!reserve_evntsel_nmi(evntsel_msr))
633                 goto fail1;
634
635         evntsel = P4_ESCR_EVENT_SELECT(0x3F)
636                 | P4_ESCR_OS
637                 | P4_ESCR_USR;
638
639         cccr_val |= P4_CCCR_THRESHOLD(15)
640                  | P4_CCCR_COMPLEMENT
641                  | P4_CCCR_COMPARE
642                  | P4_CCCR_REQUIRED;
643
644         wrmsr(evntsel_msr, evntsel, 0);
645         wrmsr(cccr_msr, cccr_val, 0);
646         write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
647         apic_write(APIC_LVTPC, APIC_DM_NMI);
648         cccr_val |= P4_CCCR_ENABLE;
649         wrmsr(cccr_msr, cccr_val, 0);
650         wd->perfctr_msr = perfctr_msr;
651         wd->evntsel_msr = evntsel_msr;
652         wd->cccr_msr = cccr_msr;
653         wd->check_bit = 1ULL<<39;
654         return 1;
655 fail1:
656         release_perfctr_nmi(perfctr_msr);
657 fail:
658         return 0;
659 }
660
661 static void stop_p4_watchdog(void)
662 {
663         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
664
665         wrmsr(wd->cccr_msr, 0, 0);
666         wrmsr(wd->evntsel_msr, 0, 0);
667
668         release_evntsel_nmi(wd->evntsel_msr);
669         release_perfctr_nmi(wd->perfctr_msr);
670 }
671
672 #define ARCH_PERFMON_NMI_EVENT_SEL      ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
673 #define ARCH_PERFMON_NMI_EVENT_UMASK    ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
674
675 static int setup_intel_arch_watchdog(void)
676 {
677         unsigned int ebx;
678         union cpuid10_eax eax;
679         unsigned int unused;
680         unsigned int perfctr_msr, evntsel_msr;
681         unsigned int evntsel;
682         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
683
684         /*
685          * Check whether the Architectural PerfMon supports
686          * Unhalted Core Cycles Event or not.
687          * NOTE: Corresponding bit = 0 in ebx indicates event present.
688          */
689         cpuid(10, &(eax.full), &ebx, &unused, &unused);
690         if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
691             (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
692                 goto fail;
693
694         perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
695         evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
696
697         if (!reserve_perfctr_nmi(perfctr_msr))
698                 goto fail;
699
700         if (!reserve_evntsel_nmi(evntsel_msr))
701                 goto fail1;
702
703         wrmsrl(perfctr_msr, 0UL);
704
705         evntsel = ARCH_PERFMON_EVENTSEL_INT
706                 | ARCH_PERFMON_EVENTSEL_OS
707                 | ARCH_PERFMON_EVENTSEL_USR
708                 | ARCH_PERFMON_NMI_EVENT_SEL
709                 | ARCH_PERFMON_NMI_EVENT_UMASK;
710
711         /* setup the timer */
712         wrmsr(evntsel_msr, evntsel, 0);
713         write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0");
714         apic_write(APIC_LVTPC, APIC_DM_NMI);
715         evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
716         wrmsr(evntsel_msr, evntsel, 0);
717
718         wd->perfctr_msr = perfctr_msr;
719         wd->evntsel_msr = evntsel_msr;
720         wd->cccr_msr = 0;  /* unused */
721         wd->check_bit = 1ULL << (eax.split.bit_width - 1);
722         return 1;
723 fail1:
724         release_perfctr_nmi(perfctr_msr);
725 fail:
726         return 0;
727 }
728
729 static void stop_intel_arch_watchdog(void)
730 {
731         unsigned int ebx;
732         union cpuid10_eax eax;
733         unsigned int unused;
734         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
735
736         /*
737          * Check whether the Architectural PerfMon supports
738          * Unhalted Core Cycles Event or not.
739          * NOTE: Corresponding bit = 0 in ebx indicates event present.
740          */
741         cpuid(10, &(eax.full), &ebx, &unused, &unused);
742         if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
743             (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
744                 return;
745
746         wrmsr(wd->evntsel_msr, 0, 0);
747         release_evntsel_nmi(wd->evntsel_msr);
748         release_perfctr_nmi(wd->perfctr_msr);
749 }
750
751 void setup_apic_nmi_watchdog (void *unused)
752 {
753         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
754
755         /* only support LOCAL and IO APICs for now */
756         if ((nmi_watchdog != NMI_LOCAL_APIC) &&
757             (nmi_watchdog != NMI_IO_APIC))
758                 return;
759
760         if (wd->enabled == 1)
761                 return;
762
763         /* cheap hack to support suspend/resume */
764         /* if cpu0 is not active, the other cpus should not be either */
765         if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
766                 return;
767
768         if (nmi_watchdog == NMI_LOCAL_APIC) {
769                 switch (boot_cpu_data.x86_vendor) {
770                 case X86_VENDOR_AMD:
771                         if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
772                                 return;
773                         if (!setup_k7_watchdog())
774                                 return;
775                         break;
776                 case X86_VENDOR_INTEL:
777                         if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
778                                 if (!setup_intel_arch_watchdog())
779                                         return;
780                                 break;
781                         }
782                         switch (boot_cpu_data.x86) {
783                         case 6:
784                                 if (boot_cpu_data.x86_model > 0xd)
785                                         return;
786
787                                 if (!setup_p6_watchdog())
788                                         return;
789                                 break;
790                         case 15:
791                                 if (boot_cpu_data.x86_model > 0x4)
792                                         return;
793
794                                 if (!setup_p4_watchdog())
795                                         return;
796                                 break;
797                         default:
798                                 return;
799                         }
800                         break;
801                 default:
802                         return;
803                 }
804         }
805         wd->enabled = 1;
806         atomic_inc(&nmi_active);
807 }
808
809 void stop_apic_nmi_watchdog(void *unused)
810 {
811         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
812
813         /* only support LOCAL and IO APICs for now */
814         if ((nmi_watchdog != NMI_LOCAL_APIC) &&
815             (nmi_watchdog != NMI_IO_APIC))
816                 return;
817
818         if (wd->enabled == 0)
819                 return;
820
821         if (nmi_watchdog == NMI_LOCAL_APIC) {
822                 switch (boot_cpu_data.x86_vendor) {
823                 case X86_VENDOR_AMD:
824                         stop_k7_watchdog();
825                         break;
826                 case X86_VENDOR_INTEL:
827                         if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
828                                 stop_intel_arch_watchdog();
829                                 break;
830                         }
831                         switch (boot_cpu_data.x86) {
832                         case 6:
833                                 if (boot_cpu_data.x86_model > 0xd)
834                                         break;
835                                 stop_p6_watchdog();
836                                 break;
837                         case 15:
838                                 if (boot_cpu_data.x86_model > 0x4)
839                                         break;
840                                 stop_p4_watchdog();
841                                 break;
842                         }
843                         break;
844                 default:
845                         return;
846                 }
847         }
848         wd->enabled = 0;
849         atomic_dec(&nmi_active);
850 }
851
852 /*
853  * the best way to detect whether a CPU has a 'hard lockup' problem
854  * is to check its local APIC timer IRQ counts. If they are not
855  * changing then that CPU has some problem.
856  *
857  * as these watchdog NMI IRQs are generated on every CPU, we only
858  * have to check the current processor.
859  *
860  * since NMIs don't listen to _any_ locks, we have to be extremely
861  * careful not to rely on unsafe variables. The printk might lock
862  * up though, so we have to break up any console locks first ...
863  * [if there are ever more tty-related locks, break them up
864  *  here too!]
865  */
866
867 static unsigned int
868         last_irq_sums [NR_CPUS],
869         alert_counter [NR_CPUS];
870
871 void touch_nmi_watchdog (void)
872 {
873         if (nmi_watchdog > 0) {
874                 unsigned cpu;
875
876                 /*
877                  * Just reset the alert counters, (other CPUs might be
878                  * spinning on locks we hold):
879                  */
880                 for_each_present_cpu (cpu)
881                         alert_counter[cpu] = 0;
882         }
883
884         /*
885          * Tickle the softlockup detector too:
886          */
887         touch_softlockup_watchdog();
888 }
889 EXPORT_SYMBOL(touch_nmi_watchdog);
890
891 extern void die_nmi(struct pt_regs *, const char *msg);
892
893 __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
894 {
895
896         /*
897          * Since current_thread_info()-> is always on the stack, and we
898          * always switch the stack NMI-atomically, it's safe to use
899          * smp_processor_id().
900          */
901         unsigned int sum;
902         int touched = 0;
903         int cpu = smp_processor_id();
904         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
905         u64 dummy;
906         int rc = 0;
907
908         /* check for other users first */
909         if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
910                         == NOTIFY_STOP) {
911                 rc = 1;
912                 touched = 1;
913         }
914
915         if (cpu_isset(cpu, backtrace_mask)) {
916                 static DEFINE_SPINLOCK(lock);   /* Serialise the printks */
917
918                 spin_lock(&lock);
919                 printk("NMI backtrace for cpu %d\n", cpu);
920                 dump_stack();
921                 spin_unlock(&lock);
922                 cpu_clear(cpu, backtrace_mask);
923         }
924
925         sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
926
927         /* if the apic timer isn't firing, this cpu isn't doing much */
928         if (!touched && last_irq_sums[cpu] == sum) {
929                 /*
930                  * Ayiee, looks like this CPU is stuck ...
931                  * wait a few IRQs (5 seconds) before doing the oops ...
932                  */
933                 alert_counter[cpu]++;
934                 if (alert_counter[cpu] == 5*nmi_hz)
935                         /*
936                          * die_nmi will return ONLY if NOTIFY_STOP happens..
937                          */
938                         die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
939         } else {
940                 last_irq_sums[cpu] = sum;
941                 alert_counter[cpu] = 0;
942         }
943         /* see if the nmi watchdog went off */
944         if (wd->enabled) {
945                 if (nmi_watchdog == NMI_LOCAL_APIC) {
946                         rdmsrl(wd->perfctr_msr, dummy);
947                         if (dummy & wd->check_bit) {
948                                 /* this wasn't a watchdog timer interrupt */
949                                 goto done;
950                         }
951
952                         /* only Intel P4 uses the cccr msr */
953                         if (wd->cccr_msr != 0) {
954                                 /*
955                                  * P4 quirks:
956                                  * - An overflown perfctr will assert its interrupt
957                                  *   until the OVF flag in its CCCR is cleared.
958                                  * - LVTPC is masked on interrupt and must be
959                                  *   unmasked by the LVTPC handler.
960                                  */
961                                 rdmsrl(wd->cccr_msr, dummy);
962                                 dummy &= ~P4_CCCR_OVF;
963                                 wrmsrl(wd->cccr_msr, dummy);
964                                 apic_write(APIC_LVTPC, APIC_DM_NMI);
965                         }
966                         else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
967                                  wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
968                                 /* P6 based Pentium M needs to re-unmask
969                                  * the apic vector, but it doesn't hurt
970                                  * other P6 variants.
971                                  * ArchPerfmon/Core Duo also needs this */
972                                 apic_write(APIC_LVTPC, APIC_DM_NMI);
973                         }
974                         /* start the cycle over again */
975                         write_watchdog_counter(wd->perfctr_msr, NULL);
976                         rc = 1;
977                 } else if (nmi_watchdog == NMI_IO_APIC) {
978                         /* don't know how to accurately check for this.
979                          * just assume it was a watchdog timer interrupt;
980                          * this matches the old behaviour.
981                          */
982                         rc = 1;
983                 }
984         }
985 done:
986         return rc;
987 }
988
989 int do_nmi_callback(struct pt_regs * regs, int cpu)
990 {
991 #ifdef CONFIG_SYSCTL
992         if (unknown_nmi_panic)
993                 return unknown_nmi_panic_callback(regs, cpu);
994 #endif
995         return 0;
996 }
997
998 #ifdef CONFIG_SYSCTL
999
1000 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
1001 {
1002         unsigned char reason = get_nmi_reason();
1003         char buf[64];
1004
1005         sprintf(buf, "NMI received for unknown reason %02x\n", reason);
1006         die_nmi(regs, buf);
1007         return 0;
1008 }
1009
1010 /*
1011  * proc handler for /proc/sys/kernel/nmi
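 * Writing a non-zero value enables the lapic NMI watchdog and zero disables
 * it; the written value lands in nmi_watchdog_enabled via proc_dointvec()
 * below, which then drives enable/disable_lapic_nmi_watchdog().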
1012  */
1013 int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
1014                         void __user *buffer, size_t *length, loff_t *ppos)
1015 {
1016         int old_state;
1017
1018         nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
1019         old_state = nmi_watchdog_enabled;
1020         proc_dointvec(table, write, file, buffer, length, ppos);
1021         if (!!old_state == !!nmi_watchdog_enabled)
1022                 return 0;
1023
1024         if (atomic_read(&nmi_active) < 0) {
1025                 printk(KERN_WARNING "NMI watchdog is permanently disabled\n");
1026                 return -EIO;
1027         }
1028
1029         if (nmi_watchdog == NMI_DEFAULT) {
1030                 if (nmi_known_cpu() > 0)
1031                         nmi_watchdog = NMI_LOCAL_APIC;
1032                 else
1033                         nmi_watchdog = NMI_IO_APIC;
1034         }
1035
1036         if (nmi_watchdog == NMI_LOCAL_APIC) {
1037                 if (nmi_watchdog_enabled)
1038                         enable_lapic_nmi_watchdog();
1039                 else
1040                         disable_lapic_nmi_watchdog();
1041         } else {
1042                 printk(KERN_WARNING
1043                         "NMI watchdog doesn't know what hardware to touch\n");
1044                 return -EIO;
1045         }
1046         return 0;
1047 }
1048
1049 #endif
1050
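/*
 * Ask every online CPU to dump its stack from NMI context: each CPU's
 * nmi_watchdog_tick() sees its bit in backtrace_mask, prints a backtrace
 * and clears the bit; we poll here for up to 10 seconds for that to happen.
 */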
1051 void __trigger_all_cpu_backtrace(void)
1052 {
1053         int i;
1054
1055         backtrace_mask = cpu_online_map;
1056         /* Wait for up to 10 seconds for all CPUs to do the backtrace */
1057         for (i = 0; i < 10 * 1000; i++) {
1058                 if (cpus_empty(backtrace_mask))
1059                         break;
1060                 mdelay(1);
1061         }
1062 }
1063
1064 EXPORT_SYMBOL(nmi_active);
1065 EXPORT_SYMBOL(nmi_watchdog);
1066 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
1067 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
1068 EXPORT_SYMBOL(reserve_perfctr_nmi);
1069 EXPORT_SYMBOL(release_perfctr_nmi);
1070 EXPORT_SYMBOL(reserve_evntsel_nmi);
1071 EXPORT_SYMBOL(release_evntsel_nmi);
1072 EXPORT_SYMBOL(disable_timer_nmi_watchdog);
1073 EXPORT_SYMBOL(enable_timer_nmi_watchdog);