[PATCH] i386: use smp_call_function_single()
arch/i386/kernel/nmi.c
1 /*
2  *  linux/arch/i386/nmi.c
3  *
4  *  NMI watchdog support on APIC systems
5  *
6  *  Started by Ingo Molnar <mingo@redhat.com>
7  *
8  *  Fixes:
9  *  Mikael Pettersson   : AMD K7 support for local APIC NMI watchdog.
10  *  Mikael Pettersson   : Power Management for local APIC NMI watchdog.
11  *  Mikael Pettersson   : Pentium 4 support for local APIC NMI watchdog.
12  *  Pavel Machek and
13  *  Mikael Pettersson   : PM converted to driver model. Disable/enable API.
14  */
15
16 #include <linux/delay.h>
17 #include <linux/interrupt.h>
18 #include <linux/module.h>
19 #include <linux/nmi.h>
20 #include <linux/sysdev.h>
21 #include <linux/sysctl.h>
22 #include <linux/percpu.h>
23 #include <linux/dmi.h>
24 #include <linux/kprobes.h>
25 #include <linux/cpumask.h>
26
27 #include <asm/smp.h>
28 #include <asm/nmi.h>
29 #include <asm/kdebug.h>
30 #include <asm/intel_arch_perfmon.h>
31
32 #include "mach_traps.h"
33
34 int unknown_nmi_panic;
35 int nmi_watchdog_enabled;
36
37 /* perfctr_nmi_owner tracks the ownership of the perfctr registers:
38  * evntsel_nmi_owner tracks the ownership of the event selection registers.
39  * Different performance counters/event selection registers may be reserved
40  * by different subsystems; this reservation system just tries to coordinate
41  * things a little.
42  */
43 static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
44 static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
45
46 static cpumask_t backtrace_mask = CPU_MASK_NONE;
47
48 /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
49  * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
50  */
51 #define NMI_MAX_COUNTER_BITS 66
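/* i.e. the offset of MSR_P4_CRU_ESCR5 from MSR_P4_BSU_ESCR0, the largest
 * value nmi_evntsel_msr_to_bit() below can return for a P4-family CPU;
 * the BUG_ON()s in the reservation helpers check indices against this bound.
 */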
52
53 /* nmi_active:
54  * >0: the lapic NMI watchdog is active, but can be disabled
55  * <0: the lapic NMI watchdog has not been set up, and cannot
56  *     be enabled
57  *  0: the lapic NMI watchdog is disabled, but can be enabled
58  */
59 atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */
60
61 unsigned int nmi_watchdog = NMI_DEFAULT;
62 static unsigned int nmi_hz = HZ;
63
64 struct nmi_watchdog_ctlblk {
65         int enabled;
66         u64 check_bit;
67         unsigned int cccr_msr;
68         unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
69         unsigned int evntsel_msr;  /* the MSR to select the events to handle */
70 };
71 static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
72
73 /* local prototypes */
74 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
75
76 extern void show_registers(struct pt_regs *regs);
77 extern int unknown_nmi_panic;
78
79 /* converts an msr to an appropriate reservation bit */
80 static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
81 {
82         /* returns the bit offset of the performance counter register */
83         switch (boot_cpu_data.x86_vendor) {
84         case X86_VENDOR_AMD:
85                 return (msr - MSR_K7_PERFCTR0);
86         case X86_VENDOR_INTEL:
87                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
88                         return (msr - MSR_ARCH_PERFMON_PERFCTR0);
89
90                 switch (boot_cpu_data.x86) {
91                 case 6:
92                         return (msr - MSR_P6_PERFCTR0);
93                 case 15:
94                         return (msr - MSR_P4_BPU_PERFCTR0);
95                 }
96         }
97         return 0;
98 }
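/* For example, on a family-6 Intel CPU MSR_P6_PERFCTR0 maps to bit 0 and
 * MSR_P6_PERFCTR1 to bit 1 of perfctr_nmi_owner; vendors/families not handled
 * above all fall through to bit 0.
 */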
99
100 /* converts an msr to an appropriate reservation bit */
101 static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
102 {
103         /* returns the bit offset of the event selection register */
104         switch (boot_cpu_data.x86_vendor) {
105         case X86_VENDOR_AMD:
106                 return (msr - MSR_K7_EVNTSEL0);
107         case X86_VENDOR_INTEL:
108                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
109                         return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
110
111                 switch (boot_cpu_data.x86) {
112                 case 6:
113                         return (msr - MSR_P6_EVNTSEL0);
114                 case 15:
115                         return (msr - MSR_P4_BSU_ESCR0);
116                 }
117         }
118         return 0;
119 }
120
121 /* checks whether a counter bit is available (hack for oprofile) */
122 int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
123 {
124         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
125
126         return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
127 }
128
129 /* checks an msr for availability */
130 int avail_to_resrv_perfctr_nmi(unsigned int msr)
131 {
132         unsigned int counter;
133
134         counter = nmi_perfctr_msr_to_bit(msr);
135         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
136
137         return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
138 }
139
140 int reserve_perfctr_nmi(unsigned int msr)
141 {
142         unsigned int counter;
143
144         counter = nmi_perfctr_msr_to_bit(msr);
145         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
146
147         if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
148                 return 1;
149         return 0;
150 }
151
152 void release_perfctr_nmi(unsigned int msr)
153 {
154         unsigned int counter;
155
156         counter = nmi_perfctr_msr_to_bit(msr);
157         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
158
159         clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
160 }
161
162 int reserve_evntsel_nmi(unsigned int msr)
163 {
164         unsigned int counter;
165
166         counter = nmi_evntsel_msr_to_bit(msr);
167         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
168
169         if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
170                 return 1;
171         return 0;
172 }
173
174 void release_evntsel_nmi(unsigned int msr)
175 {
176         unsigned int counter;
177
178         counter = nmi_evntsel_msr_to_bit(msr);
179         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
180
181         clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
182 }
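/* A sketch of the reserve/release pairing these helpers expect, mirroring
 * what the setup_*_watchdog()/stop_*_watchdog() functions below do:
 *
 *	if (!reserve_perfctr_nmi(perfctr_msr))
 *		goto fail;
 *	if (!reserve_evntsel_nmi(evntsel_msr))
 *		goto fail1;		(fail1 releases the perfctr again)
 *	... program the event select and counter MSRs ...
 *
 * and on teardown, in reverse order:
 *
 *	release_evntsel_nmi(evntsel_msr);
 *	release_perfctr_nmi(perfctr_msr);
 */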
183
184 static __cpuinit inline int nmi_known_cpu(void)
185 {
186         switch (boot_cpu_data.x86_vendor) {
187         case X86_VENDOR_AMD:
188                 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
189         case X86_VENDOR_INTEL:
190                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
191                         return 1;
192                 else
193                         return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
194         }
195         return 0;
196 }
197
198 static int endflag __initdata = 0;
199
200 #ifdef CONFIG_SMP
201 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
202  * the CPU is idle. To make sure the NMI watchdog really ticks on all
203  * CPUs during the test make them busy.
204  */
205 static __init void nmi_cpu_busy(void *data)
206 {
207         local_irq_enable_in_hardirq();
208         /* Intentionally don't use cpu_relax here. This is
209            to make sure that the performance counter really ticks,
210            even if there is a simulator or similar that catches the
211            pause instruction. On a real HT machine this is fine because
212            all other CPUs are busy with "useless" delay loops and don't
213            care if they get somewhat fewer cycles. */
214         while (endflag == 0)
215                 mb();
216 }
217 #endif
218
219 static unsigned int adjust_for_32bit_ctr(unsigned int hz)
220 {
221         u64 counter_val;
222         unsigned int retval = hz;
223
224         /*
225          * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
226          * are writable, with higher bits sign extending from bit 31.
227          * So we can only program the counter with 31-bit values; bit 31
228          * must be set so that sign extension keeps bits 32 and above set.
229          * Find an nmi_hz for which the programmed value fits.
230          */
231         counter_val = (u64)cpu_khz * 1000;
232         do_div(counter_val, retval);
233         if (counter_val > 0x7fffffffULL) {
234                 u64 count = (u64)cpu_khz * 1000;
235                 do_div(count, 0x7fffffffUL);
236                 retval = count + 1;
237         }
238         return retval;
239 }
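/* Worked example (hypothetical 3 GHz CPU, cpu_khz == 3000000, hz == 1):
 * counter_val = 3,000,000,000 exceeds 0x7fffffff, so the function returns
 * 3,000,000,000 / 0x7fffffff + 1 == 2, and the per-period count of
 * 1,500,000,000 cycles then fits in 31 bits.
 */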
240
241 static int __init check_nmi_watchdog(void)
242 {
243         unsigned int *prev_nmi_count;
244         int cpu;
245
246         /* Enable NMI watchdog for newer systems.
247            Probably safe on most older systems too, but let's be careful.
248            IBM ThinkPads use INT10 inside SMM, which allows an early NMI inside SMM
249            that hangs the system. Disable the watchdog for all ThinkPads. */
250         if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004 &&
251                 !dmi_name_in_vendors("ThinkPad"))
252                 nmi_watchdog = NMI_LOCAL_APIC;
253
254         if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
255                 return 0;
256
257         if (!atomic_read(&nmi_active))
258                 return 0;
259
260         prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
261         if (!prev_nmi_count)
262                 return -1;
263
264         printk(KERN_INFO "Testing NMI watchdog ... ");
265
266         if (nmi_watchdog == NMI_LOCAL_APIC)
267                 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
268
269         for_each_possible_cpu(cpu)
270                 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
271         local_irq_enable();
272         mdelay((10*1000)/nmi_hz); /* wait 10 ticks */
273
274         for_each_possible_cpu(cpu) {
275 #ifdef CONFIG_SMP
276                 /* Check cpu_callin_map here because that is set
277                    after the timer is started. */
278                 if (!cpu_isset(cpu, cpu_callin_map))
279                         continue;
280 #endif
281                 if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
282                         continue;
283                 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
284                         printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
285                                 cpu,
286                                 prev_nmi_count[cpu],
287                                 nmi_count(cpu));
288                         per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
289                         atomic_dec(&nmi_active);
290                 }
291         }
292         if (!atomic_read(&nmi_active)) {
                    /* let the nmi_cpu_busy() loops on the other CPUs terminate */
                    endflag = 1;
293                 kfree(prev_nmi_count);
294                 atomic_set(&nmi_active, -1);
295                 return -1;
296         }
297         endflag = 1;
298         printk("OK.\n");
299
300         /* now that we know it works we can reduce NMI frequency to
301            something more reasonable; makes a difference in some configs */
302         if (nmi_watchdog == NMI_LOCAL_APIC) {
303                 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
304
305                 nmi_hz = 1;
306
307                 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
308                     wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
309                         nmi_hz = adjust_for_32bit_ctr(nmi_hz);
310                 }
311         }
312
313         kfree(prev_nmi_count);
314         return 0;
315 }
316 /* This needs to happen later in boot so counters are working */
317 late_initcall(check_nmi_watchdog);
318
319 static int __init setup_nmi_watchdog(char *str)
320 {
321         int nmi;
322
323         get_option(&str, &nmi);
324
325         if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
326                 return 0;
327
328         nmi_watchdog = nmi;
329         return 1;
330 }
331
332 __setup("nmi_watchdog=", setup_nmi_watchdog);
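/* e.g. booting with "nmi_watchdog=2" requests NMI_LOCAL_APIC mode and
 * "nmi_watchdog=1" requests NMI_IO_APIC mode (the conventional <asm/nmi.h>
 * values); out-of-range values are rejected by the check above.
 */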
333
334 static void disable_lapic_nmi_watchdog(void)
335 {
336         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
337
338         if (atomic_read(&nmi_active) <= 0)
339                 return;
340
341         on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
342
343         BUG_ON(atomic_read(&nmi_active) != 0);
344 }
345
346 static void enable_lapic_nmi_watchdog(void)
347 {
348         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
349
350         /* are we already enabled */
351         if (atomic_read(&nmi_active) != 0)
352                 return;
353
354         /* are we lapic aware */
355         if (nmi_known_cpu() <= 0)
356                 return;
357
358         on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
359         touch_nmi_watchdog();
360 }
361
362 void disable_timer_nmi_watchdog(void)
363 {
364         BUG_ON(nmi_watchdog != NMI_IO_APIC);
365
366         if (atomic_read(&nmi_active) <= 0)
367                 return;
368
369         disable_irq(0);
370         on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
371
372         BUG_ON(atomic_read(&nmi_active) != 0);
373 }
374
375 void enable_timer_nmi_watchdog(void)
376 {
377         BUG_ON(nmi_watchdog != NMI_IO_APIC);
378
379         if (atomic_read(&nmi_active) == 0) {
380                 touch_nmi_watchdog();
381                 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
382                 enable_irq(0);
383         }
384 }
385
386 #ifdef CONFIG_PM
387
388 static int nmi_pm_active; /* nmi_active before suspend */
389
390 static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
391 {
392         /* only CPU0 goes here, other CPUs should be offline */
393         nmi_pm_active = atomic_read(&nmi_active);
394         stop_apic_nmi_watchdog(NULL);
395         BUG_ON(atomic_read(&nmi_active) != 0);
396         return 0;
397 }
398
399 static int lapic_nmi_resume(struct sys_device *dev)
400 {
401         /* only CPU0 goes here, other CPUs should be offline */
402         if (nmi_pm_active > 0) {
403                 setup_apic_nmi_watchdog(NULL);
404                 touch_nmi_watchdog();
405         }
406         return 0;
407 }
408
409
410 static struct sysdev_class nmi_sysclass = {
411         set_kset_name("lapic_nmi"),
412         .resume         = lapic_nmi_resume,
413         .suspend        = lapic_nmi_suspend,
414 };
415
416 static struct sys_device device_lapic_nmi = {
417         .id     = 0,
418         .cls    = &nmi_sysclass,
419 };
420
421 static int __init init_lapic_nmi_sysfs(void)
422 {
423         int error;
424
425         /* should really be a BUG_ON but because this is an
426          * init call, it just doesn't work.  -dcz
427          */
428         if (nmi_watchdog != NMI_LOCAL_APIC)
429                 return 0;
430
431         if (atomic_read(&nmi_active) < 0)
432                 return 0;
433
434         error = sysdev_class_register(&nmi_sysclass);
435         if (!error)
436                 error = sysdev_register(&device_lapic_nmi);
437         return error;
438 }
439 /* must come after the local APIC's device_initcall() */
440 late_initcall(init_lapic_nmi_sysfs);
441
442 #endif  /* CONFIG_PM */
443
444 /*
445  * Activate the NMI watchdog via the local APIC.
446  * Original code written by Keith Owens.
447  */
448
449 static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
450 {
451         u64 count = (u64)cpu_khz * 1000;
452
453         do_div(count, nmi_hz);
454         if (descr)
455                 Dprintk("setting %s to -0x%08Lx\n", descr, count);
456         wrmsrl(perfctr_msr, 0 - count);
457 }
458
459 static void write_watchdog_counter32(unsigned int perfctr_msr,
460                 const char *descr)
461 {
462         u64 count = (u64)cpu_khz * 1000;
463
464         do_div(count, nmi_hz);
465         if (descr)
466                 Dprintk("setting %s to -0x%08Lx\n", descr, count);
467         wrmsr(perfctr_msr, (u32)(-count), 0);
468 }
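/* Both helpers above program the counter to -(cpu_khz * 1000 / nmi_hz), so it
 * overflows after roughly 1/nmi_hz seconds of unhalted CPU time; that overflow
 * is what raises the watchdog NMI once APIC_LVTPC has been set to APIC_DM_NMI
 * by the setup functions below.
 */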
469
470 /* Note that these events don't tick when the CPU idles. This means
471    the frequency varies with CPU load. */
472
473 #define K7_EVNTSEL_ENABLE       (1 << 22)
474 #define K7_EVNTSEL_INT          (1 << 20)
475 #define K7_EVNTSEL_OS           (1 << 17)
476 #define K7_EVNTSEL_USR          (1 << 16)
477 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING    0x76
478 #define K7_NMI_EVENT            K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
479
480 static int setup_k7_watchdog(void)
481 {
482         unsigned int perfctr_msr, evntsel_msr;
483         unsigned int evntsel;
484         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
485
486         perfctr_msr = MSR_K7_PERFCTR0;
487         evntsel_msr = MSR_K7_EVNTSEL0;
488         if (!reserve_perfctr_nmi(perfctr_msr))
489                 goto fail;
490
491         if (!reserve_evntsel_nmi(evntsel_msr))
492                 goto fail1;
493
494         wrmsrl(perfctr_msr, 0UL);
495
496         evntsel = K7_EVNTSEL_INT
497                 | K7_EVNTSEL_OS
498                 | K7_EVNTSEL_USR
499                 | K7_NMI_EVENT;
500
501         /* setup the timer */
502         wrmsr(evntsel_msr, evntsel, 0);
503         write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
504         apic_write(APIC_LVTPC, APIC_DM_NMI);
505         evntsel |= K7_EVNTSEL_ENABLE;
506         wrmsr(evntsel_msr, evntsel, 0);
507
508         wd->perfctr_msr = perfctr_msr;
509         wd->evntsel_msr = evntsel_msr;
510         wd->cccr_msr = 0;  /* unused */
511         wd->check_bit = 1ULL<<63;
512         return 1;
513 fail1:
514         release_perfctr_nmi(perfctr_msr);
515 fail:
516         return 0;
517 }
518
519 static void stop_k7_watchdog(void)
520 {
521         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
522
523         wrmsr(wd->evntsel_msr, 0, 0);
524
525         release_evntsel_nmi(wd->evntsel_msr);
526         release_perfctr_nmi(wd->perfctr_msr);
527 }
528
529 #define P6_EVNTSEL0_ENABLE      (1 << 22)
530 #define P6_EVNTSEL_INT          (1 << 20)
531 #define P6_EVNTSEL_OS           (1 << 17)
532 #define P6_EVNTSEL_USR          (1 << 16)
533 #define P6_EVENT_CPU_CLOCKS_NOT_HALTED  0x79
534 #define P6_NMI_EVENT            P6_EVENT_CPU_CLOCKS_NOT_HALTED
535
536 static int setup_p6_watchdog(void)
537 {
538         unsigned int perfctr_msr, evntsel_msr;
539         unsigned int evntsel;
540         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
541
542         perfctr_msr = MSR_P6_PERFCTR0;
543         evntsel_msr = MSR_P6_EVNTSEL0;
544         if (!reserve_perfctr_nmi(perfctr_msr))
545                 goto fail;
546
547         if (!reserve_evntsel_nmi(evntsel_msr))
548                 goto fail1;
549
550         wrmsrl(perfctr_msr, 0UL);
551
552         evntsel = P6_EVNTSEL_INT
553                 | P6_EVNTSEL_OS
554                 | P6_EVNTSEL_USR
555                 | P6_NMI_EVENT;
556
557         /* setup the timer */
558         wrmsr(evntsel_msr, evntsel, 0);
559         nmi_hz = adjust_for_32bit_ctr(nmi_hz);
560         write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
561         apic_write(APIC_LVTPC, APIC_DM_NMI);
562         evntsel |= P6_EVNTSEL0_ENABLE;
563         wrmsr(evntsel_msr, evntsel, 0);
564
565         wd->perfctr_msr = perfctr_msr;
566         wd->evntsel_msr = evntsel_msr;
567         wd->cccr_msr = 0;  /* unused */
568         wd->check_bit = 1ULL<<39;
569         return 1;
570 fail1:
571         release_perfctr_nmi(perfctr_msr);
572 fail:
573         return 0;
574 }
575
576 static void stop_p6_watchdog(void)
577 {
578         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
579
580         wrmsr(wd->evntsel_msr, 0, 0);
581
582         release_evntsel_nmi(wd->evntsel_msr);
583         release_perfctr_nmi(wd->perfctr_msr);
584 }
585
586 /* Note that these events don't tick when the CPU idles. This means
587    the frequency varies with CPU load. */
588
589 #define MSR_P4_MISC_ENABLE_PERF_AVAIL   (1<<7)
590 #define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
591 #define P4_ESCR_OS              (1<<3)
592 #define P4_ESCR_USR             (1<<2)
593 #define P4_CCCR_OVF_PMI0        (1<<26)
594 #define P4_CCCR_OVF_PMI1        (1<<27)
595 #define P4_CCCR_THRESHOLD(N)    ((N)<<20)
596 #define P4_CCCR_COMPLEMENT      (1<<19)
597 #define P4_CCCR_COMPARE         (1<<18)
598 #define P4_CCCR_REQUIRED        (3<<16)
599 #define P4_CCCR_ESCR_SELECT(N)  ((N)<<13)
600 #define P4_CCCR_ENABLE          (1<<12)
601 #define P4_CCCR_OVF             (1<<31)
602 /* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
603    CRU_ESCR0 (with any non-null event selector) through a complemented
604    max threshold. [IA32-Vol3, Section 14.9.9] */
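/* With P4_CCCR_COMPARE and P4_CCCR_COMPLEMENT set and a threshold of 15, the
 * filter condition ("at most 15 events this cycle") should hold on every
 * cycle, so IQ_COUNTER0 effectively counts clock ticks (see the SDM section
 * cited above).
 */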
605
606 static int setup_p4_watchdog(void)
607 {
608         unsigned int perfctr_msr, evntsel_msr, cccr_msr;
609         unsigned int evntsel, cccr_val;
610         unsigned int misc_enable, dummy;
611         unsigned int ht_num;
612         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
613
614         rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
615         if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
616                 return 0;
617
618 #ifdef CONFIG_SMP
619         /* detect which hyperthread we are on */
620         if (smp_num_siblings == 2) {
621                 unsigned int ebx, apicid;
622
623                 ebx = cpuid_ebx(1);
624                 apicid = (ebx >> 24) & 0xff;
625                 ht_num = apicid & 1;
626         } else
627 #endif
628                 ht_num = 0;
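            /* e.g. with two siblings the CPU whose initial APIC ID is even gets
             * ht_num 0 and is given IQ_PERFCTR0/IQ_CCCR0 below, while its
             * sibling (odd APIC ID) gets IQ_PERFCTR1/IQ_CCCR1.
             */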
629
630         /* performance counters are shared resources, so assign
631          * each hyperthread its own set
632          * (re-use the ESCR0 register, seems safe
633          * and keeps the cccr_val the same)
634          */
635         if (!ht_num) {
636                 /* logical cpu 0 */
637                 perfctr_msr = MSR_P4_IQ_PERFCTR0;
638                 evntsel_msr = MSR_P4_CRU_ESCR0;
639                 cccr_msr = MSR_P4_IQ_CCCR0;
640                 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
641         } else {
642                 /* logical cpu 1 */
643                 perfctr_msr = MSR_P4_IQ_PERFCTR1;
644                 evntsel_msr = MSR_P4_CRU_ESCR0;
645                 cccr_msr = MSR_P4_IQ_CCCR1;
646                 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
647         }
648
649         if (!reserve_perfctr_nmi(perfctr_msr))
650                 goto fail;
651
652         if (!reserve_evntsel_nmi(evntsel_msr))
653                 goto fail1;
654
655         evntsel = P4_ESCR_EVENT_SELECT(0x3F)
656                 | P4_ESCR_OS
657                 | P4_ESCR_USR;
658
659         cccr_val |= P4_CCCR_THRESHOLD(15)
660                  | P4_CCCR_COMPLEMENT
661                  | P4_CCCR_COMPARE
662                  | P4_CCCR_REQUIRED;
663
664         wrmsr(evntsel_msr, evntsel, 0);
665         wrmsr(cccr_msr, cccr_val, 0);
666         write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
667         apic_write(APIC_LVTPC, APIC_DM_NMI);
668         cccr_val |= P4_CCCR_ENABLE;
669         wrmsr(cccr_msr, cccr_val, 0);
670         wd->perfctr_msr = perfctr_msr;
671         wd->evntsel_msr = evntsel_msr;
672         wd->cccr_msr = cccr_msr;
673         wd->check_bit = 1ULL<<39;
674         return 1;
675 fail1:
676         release_perfctr_nmi(perfctr_msr);
677 fail:
678         return 0;
679 }
680
681 static void stop_p4_watchdog(void)
682 {
683         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
684
685         wrmsr(wd->cccr_msr, 0, 0);
686         wrmsr(wd->evntsel_msr, 0, 0);
687
688         release_evntsel_nmi(wd->evntsel_msr);
689         release_perfctr_nmi(wd->perfctr_msr);
690 }
691
692 #define ARCH_PERFMON_NMI_EVENT_SEL      ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
693 #define ARCH_PERFMON_NMI_EVENT_UMASK    ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
694
695 static int setup_intel_arch_watchdog(void)
696 {
697         unsigned int ebx;
698         union cpuid10_eax eax;
699         unsigned int unused;
700         unsigned int perfctr_msr, evntsel_msr;
701         unsigned int evntsel;
702         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
703
704         /*
705          * Check whether the Architectural PerfMon supports
706          * Unhalted Core Cycles Event or not.
707          * NOTE: Corresponding bit = 0 in ebx indicates event present.
708          */
709         cpuid(10, &(eax.full), &ebx, &unused, &unused);
710         if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
711             (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
712                 goto fail;
713
714         perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
715         evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
716
717         if (!reserve_perfctr_nmi(perfctr_msr))
718                 goto fail;
719
720         if (!reserve_evntsel_nmi(evntsel_msr))
721                 goto fail1;
722
723         wrmsrl(perfctr_msr, 0UL);
724
725         evntsel = ARCH_PERFMON_EVENTSEL_INT
726                 | ARCH_PERFMON_EVENTSEL_OS
727                 | ARCH_PERFMON_EVENTSEL_USR
728                 | ARCH_PERFMON_NMI_EVENT_SEL
729                 | ARCH_PERFMON_NMI_EVENT_UMASK;
730
731         /* setup the timer */
732         wrmsr(evntsel_msr, evntsel, 0);
733         nmi_hz = adjust_for_32bit_ctr(nmi_hz);
734         write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
735         apic_write(APIC_LVTPC, APIC_DM_NMI);
736         evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
737         wrmsr(evntsel_msr, evntsel, 0);
738
739         wd->perfctr_msr = perfctr_msr;
740         wd->evntsel_msr = evntsel_msr;
741         wd->cccr_msr = 0;  /* unused */
742         wd->check_bit = 1ULL << (eax.split.bit_width - 1);
743         return 1;
744 fail1:
745         release_perfctr_nmi(perfctr_msr);
746 fail:
747         return 0;
748 }
749
750 static void stop_intel_arch_watchdog(void)
751 {
752         unsigned int ebx;
753         union cpuid10_eax eax;
754         unsigned int unused;
755         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
756
757         /*
758          * Check whether the Architectural PerfMon supports
759          * Unhalted Core Cycles Event or not.
760          * NOTE: Corresponding bit = 0 in ebx indicates event present.
761          */
762         cpuid(10, &(eax.full), &ebx, &unused, &unused);
763         if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
764             (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
765                 return;
766
767         wrmsr(wd->evntsel_msr, 0, 0);
768         release_evntsel_nmi(wd->evntsel_msr);
769         release_perfctr_nmi(wd->perfctr_msr);
770 }
771
772 void setup_apic_nmi_watchdog (void *unused)
773 {
774         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
775
776         /* only support LOCAL and IO APICs for now */
777         if ((nmi_watchdog != NMI_LOCAL_APIC) &&
778             (nmi_watchdog != NMI_IO_APIC))
779                 return;
780
781         if (wd->enabled == 1)
782                 return;
783
784         /* cheap hack to support suspend/resume */
785         /* if cpu0 is not active, neither should the other cpus be */
786         if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
787                 return;
788
789         if (nmi_watchdog == NMI_LOCAL_APIC) {
790                 switch (boot_cpu_data.x86_vendor) {
791                 case X86_VENDOR_AMD:
792                         if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
793                                 return;
794                         if (!setup_k7_watchdog())
795                                 return;
796                         break;
797                 case X86_VENDOR_INTEL:
798                         if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
799                                 if (!setup_intel_arch_watchdog())
800                                         return;
801                                 break;
802                         }
803                         switch (boot_cpu_data.x86) {
804                         case 6:
805                                 if (boot_cpu_data.x86_model > 0xd)
806                                         return;
807
808                                 if (!setup_p6_watchdog())
809                                         return;
810                                 break;
811                         case 15:
812                                 if (boot_cpu_data.x86_model > 0x4)
813                                         return;
814
815                                 if (!setup_p4_watchdog())
816                                         return;
817                                 break;
818                         default:
819                                 return;
820                         }
821                         break;
822                 default:
823                         return;
824                 }
825         }
826         wd->enabled = 1;
827         atomic_inc(&nmi_active);
828 }
829
830 void stop_apic_nmi_watchdog(void *unused)
831 {
832         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
833
834         /* only support LOCAL and IO APICs for now */
835         if ((nmi_watchdog != NMI_LOCAL_APIC) &&
836             (nmi_watchdog != NMI_IO_APIC))
837                 return;
838
839         if (wd->enabled == 0)
840                 return;
841
842         if (nmi_watchdog == NMI_LOCAL_APIC) {
843                 switch (boot_cpu_data.x86_vendor) {
844                 case X86_VENDOR_AMD:
845                         stop_k7_watchdog();
846                         break;
847                 case X86_VENDOR_INTEL:
848                         if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
849                                 stop_intel_arch_watchdog();
850                                 break;
851                         }
852                         switch (boot_cpu_data.x86) {
853                         case 6:
854                                 if (boot_cpu_data.x86_model > 0xd)
855                                         break;
856                                 stop_p6_watchdog();
857                                 break;
858                         case 15:
859                                 if (boot_cpu_data.x86_model > 0x4)
860                                         break;
861                                 stop_p4_watchdog();
862                                 break;
863                         }
864                         break;
865                 default:
866                         return;
867                 }
868         }
869         wd->enabled = 0;
870         atomic_dec(&nmi_active);
871 }
872
873 /*
874  * the best way to detect whether a CPU has a 'hard lockup' problem
875  * is to check its local APIC timer IRQ counts. If they are not
876  * changing then that CPU has some problem.
877  *
878  * as these watchdog NMI IRQs are generated on every CPU, we only
879  * have to check the current processor.
880  *
881  * since NMIs don't listen to _any_ locks, we have to be extremely
882  * careful not to rely on unsafe variables. The printk might lock
883  * up though, so we have to break up any console locks first ...
884  * [when more tty-related locks appear, break them up
885  *  here too!]
886  */
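/* Implementation note: last_irq_sums[] remembers the local APIC timer IRQ
 * count seen at the previous watchdog NMI and alert_counter[] counts the
 * consecutive NMIs for which that number did not change; nmi_watchdog_tick()
 * below declares a lockup once alert_counter reaches 5*nmi_hz (about five
 * seconds with no timer interrupts).
 */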
887
888 static unsigned int
889         last_irq_sums [NR_CPUS],
890         alert_counter [NR_CPUS];
891
892 void touch_nmi_watchdog (void)
893 {
894         if (nmi_watchdog > 0) {
895                 unsigned cpu;
896
897                 /*
898                  * Just reset the alert counters, (other CPUs might be
899                  * spinning on locks we hold):
900                  */
901                 for_each_present_cpu (cpu)
902                         alert_counter[cpu] = 0;
903         }
904
905         /*
906          * Tickle the softlockup detector too:
907          */
908         touch_softlockup_watchdog();
909 }
910 EXPORT_SYMBOL(touch_nmi_watchdog);
911
912 extern void die_nmi(struct pt_regs *, const char *msg);
913
914 __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
915 {
916
917         /*
918          * Since current_thread_info() is always on the stack, and we
919          * always switch the stack NMI-atomically, it's safe to use
920          * smp_processor_id().
921          */
922         unsigned int sum;
923         int touched = 0;
924         int cpu = smp_processor_id();
925         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
926         u64 dummy;
927         int rc = 0;
928
929         /* check for other users first */
930         if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
931                         == NOTIFY_STOP) {
932                 rc = 1;
933                 touched = 1;
934         }
935
936         if (cpu_isset(cpu, backtrace_mask)) {
937                 static DEFINE_SPINLOCK(lock);   /* Serialise the printks */
938
939                 spin_lock(&lock);
940                 printk("NMI backtrace for cpu %d\n", cpu);
941                 dump_stack();
942                 spin_unlock(&lock);
943                 cpu_clear(cpu, backtrace_mask);
944         }
945
946         sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
947
948         /* if the apic timer isn't firing, this cpu isn't doing much */
949         if (!touched && last_irq_sums[cpu] == sum) {
950                 /*
951                  * Ayiee, looks like this CPU is stuck ...
952                  * wait a few IRQs (5 seconds) before doing the oops ...
953                  */
954                 alert_counter[cpu]++;
955                 if (alert_counter[cpu] == 5*nmi_hz)
956                         /*
957                          * die_nmi will return ONLY if NOTIFY_STOP happens..
958                          */
959                         die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
960         } else {
961                 last_irq_sums[cpu] = sum;
962                 alert_counter[cpu] = 0;
963         }
964         /* see if the nmi watchdog went off */
965         if (wd->enabled) {
966                 if (nmi_watchdog == NMI_LOCAL_APIC) {
967                         rdmsrl(wd->perfctr_msr, dummy);
968                         if (dummy & wd->check_bit) {
969                                 /* this wasn't a watchdog timer interrupt */
970                                 goto done;
971                         }
972
973                         /* only Intel P4 uses the cccr msr */
974                         if (wd->cccr_msr != 0) {
975                                 /*
976                                  * P4 quirks:
977                                  * - An overflown perfctr will assert its interrupt
978                                  *   until the OVF flag in its CCCR is cleared.
979                                  * - LVTPC is masked on interrupt and must be
980                                  *   unmasked by the LVTPC handler.
981                                  */
982                                 rdmsrl(wd->cccr_msr, dummy);
983                                 dummy &= ~P4_CCCR_OVF;
984                                 wrmsrl(wd->cccr_msr, dummy);
985                                 apic_write(APIC_LVTPC, APIC_DM_NMI);
986                                 /* start the cycle over again */
987                                 write_watchdog_counter(wd->perfctr_msr, NULL);
988                         }
989                         else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
990                                  wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
991                                 /* P6-based Pentium M needs to re-unmask
992                                  * the apic vector, but it doesn't hurt
993                                  * other P6 variants.
994                                  * ArchPerfmon/Core Duo also needs this */
995                                 apic_write(APIC_LVTPC, APIC_DM_NMI);
996                                 /* P6/ARCH_PERFMON has 32 bit counter write */
997                                 write_watchdog_counter32(wd->perfctr_msr, NULL);
998                         } else {
999                                 /* start the cycle over again */
1000                                 write_watchdog_counter(wd->perfctr_msr, NULL);
1001                         }
1002                         rc = 1;
1003                 } else if (nmi_watchdog == NMI_IO_APIC) {
1004                         /* We don't know how to accurately check for this,
1005                          * so just assume it was a watchdog timer interrupt.
1006                          * This matches the old behaviour.
1007                          */
1008                         rc = 1;
1009                 }
1010         }
1011 done:
1012         return rc;
1013 }
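/* A return value of 1 tells the NMI handler the event was consumed (a
 * registered die notifier claimed it or it was a watchdog tick); 0 lets the
 * caller treat it as an unknown NMI (cf. do_nmi_callback() below).
 */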
1014
1015 int do_nmi_callback(struct pt_regs * regs, int cpu)
1016 {
1017 #ifdef CONFIG_SYSCTL
1018         if (unknown_nmi_panic)
1019                 return unknown_nmi_panic_callback(regs, cpu);
1020 #endif
1021         return 0;
1022 }
1023
1024 #ifdef CONFIG_SYSCTL
1025
1026 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
1027 {
1028         unsigned char reason = get_nmi_reason();
1029         char buf[64];
1030
1031         sprintf(buf, "NMI received for unknown reason %02x\n", reason);
1032         die_nmi(regs, buf);
1033         return 0;
1034 }
1035
1036 /*
1037  * proc handler for /proc/sys/kernel/nmi
1038  */
1039 int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
1040                         void __user *buffer, size_t *length, loff_t *ppos)
1041 {
1042         int old_state;
1043
1044         nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
1045         old_state = nmi_watchdog_enabled;
1046         proc_dointvec(table, write, file, buffer, length, ppos);
1047         if (!!old_state == !!nmi_watchdog_enabled)
1048                 return 0;
1049
1050         if (atomic_read(&nmi_active) < 0) {
1051                 printk(KERN_WARNING "NMI watchdog is permanently disabled\n");
1052                 return -EIO;
1053         }
1054
1055         if (nmi_watchdog == NMI_DEFAULT) {
1056                 if (nmi_known_cpu() > 0)
1057                         nmi_watchdog = NMI_LOCAL_APIC;
1058                 else
1059                         nmi_watchdog = NMI_IO_APIC;
1060         }
1061
1062         if (nmi_watchdog == NMI_LOCAL_APIC) {
1063                 if (nmi_watchdog_enabled)
1064                         enable_lapic_nmi_watchdog();
1065                 else
1066                         disable_lapic_nmi_watchdog();
1067         } else {
1068                 printk(KERN_WARNING
1069                         "NMI watchdog doesn't know what hardware to touch\n");
1070                 return -EIO;
1071         }
1072         return 0;
1073 }
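/* Runtime behaviour sketch: writing 1 to the proc file above ends up in
 * enable_lapic_nmi_watchdog() and writing 0 in disable_lapic_nmi_watchdog();
 * the handler returns -EIO if the watchdog was permanently disabled at boot
 * (nmi_active < 0) or if something other than the local APIC watchdog is in
 * use.
 */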
1074
1075 #endif
1076
1077 void __trigger_all_cpu_backtrace(void)
1078 {
1079         int i;
1080
1081         backtrace_mask = cpu_online_map;
1082         /* Wait for up to 10 seconds for all CPUs to do the backtrace */
1083         for (i = 0; i < 10 * 1000; i++) {
1084                 if (cpus_empty(backtrace_mask))
1085                         break;
1086                 mdelay(1);
1087         }
1088 }
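/* Each CPU notices its bit in backtrace_mask from nmi_watchdog_tick() above,
 * dumps its stack under a spinlock, and clears the bit; that is what the
 * cpus_empty() poll here waits for (for at most 10 seconds).
 */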
1089
1090 EXPORT_SYMBOL(nmi_active);
1091 EXPORT_SYMBOL(nmi_watchdog);
1092 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
1093 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
1094 EXPORT_SYMBOL(reserve_perfctr_nmi);
1095 EXPORT_SYMBOL(release_perfctr_nmi);
1096 EXPORT_SYMBOL(reserve_evntsel_nmi);
1097 EXPORT_SYMBOL(release_evntsel_nmi);
1098 EXPORT_SYMBOL(disable_timer_nmi_watchdog);
1099 EXPORT_SYMBOL(enable_timer_nmi_watchdog);