/*
 *  linux/arch/i386/nmi.c
 *
 *  NMI watchdog support on APIC systems
 *
 *  Started by Ingo Molnar <mingo@redhat.com>
 *
 *  Fixes:
 *  Mikael Pettersson   : AMD K7 support for local APIC NMI watchdog.
 *  Mikael Pettersson   : Power Management for local APIC NMI watchdog.
 *  Mikael Pettersson   : Pentium 4 support for local APIC NMI watchdog.
 *  Pavel Machek and
 *  Mikael Pettersson   : PM converted to driver model. Disable/enable API.
 */

#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/sysdev.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
#include <linux/dmi.h>
#include <linux/kprobes.h>

#include <asm/smp.h>
#include <asm/nmi.h>
#include <asm/kdebug.h>
#include <asm/intel_arch_perfmon.h>

#include "mach_traps.h"

int unknown_nmi_panic;
int nmi_watchdog_enabled;

/* perfctr_nmi_owner tracks the ownership of the perfctr registers;
 * evntsel_nmi_owner tracks the ownership of the event selection registers.
 * Different performance counters / event selection registers may be
 * reserved by different subsystems; this reservation system just tries
 * to coordinate things a little.
 */
static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);

/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
 * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
 */
#define NMI_MAX_COUNTER_BITS 66

/* nmi_active:
 * >0: the lapic NMI watchdog is active, but can be disabled
 * <0: the lapic NMI watchdog has not been set up, and cannot
 *     be enabled
 *  0: the lapic NMI watchdog is disabled, but can be enabled
 */
atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */

unsigned int nmi_watchdog = NMI_DEFAULT;
static unsigned int nmi_hz = HZ;

struct nmi_watchdog_ctlblk {
        int enabled;
        u64 check_bit;
        unsigned int cccr_msr;
        unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
        unsigned int evntsel_msr;  /* the MSR to select the events to handle */
};
static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);

/* local prototypes */
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);

extern void show_registers(struct pt_regs *regs);
extern int unknown_nmi_panic;

/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
        /* returns the bit offset of the performance counter register */
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_AMD:
                return (msr - MSR_K7_PERFCTR0);
        case X86_VENDOR_INTEL:
                if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                        return (msr - MSR_ARCH_PERFMON_PERFCTR0);

                switch (boot_cpu_data.x86) {
                case 6:
                        return (msr - MSR_P6_PERFCTR0);
                case 15:
                        return (msr - MSR_P4_BPU_PERFCTR0);
                }
        }
        return 0;
}

/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
{
        /* returns the bit offset of the event selection register */
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_AMD:
                return (msr - MSR_K7_EVNTSEL0);
        case X86_VENDOR_INTEL:
                if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                        return (msr - MSR_ARCH_PERFMON_EVENTSEL0);

                switch (boot_cpu_data.x86) {
                case 6:
                        return (msr - MSR_P6_EVNTSEL0);
                case 15:
                        return (msr - MSR_P4_BSU_ESCR0);
                }
        }
        return 0;
}

/* checks for a bit availability (hack for oprofile) */
int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
{
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
}

/* checks an msr for availability */
int avail_to_resrv_perfctr_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_perfctr_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
}

int reserve_perfctr_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_perfctr_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
                return 1;
        return 0;
}

void release_perfctr_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_perfctr_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
}

int reserve_evntsel_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_evntsel_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
                return 1;
        return 0;
}

void release_evntsel_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_evntsel_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
}

static __cpuinit inline int nmi_known_cpu(void)
{
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_AMD:
                return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
        case X86_VENDOR_INTEL:
                if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                        return 1;
                else
                        return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
        }
        return 0;
}

#ifdef CONFIG_SMP
/* The performance counters used by NMI_LOCAL_APIC don't trigger when
 * the CPU is idle. To make sure the NMI watchdog really ticks on all
 * CPUs during the test, make them busy.
 */
static __init void nmi_cpu_busy(void *data)
{
        volatile int *endflag = data;
        local_irq_enable_in_hardirq();
        /* Intentionally don't use cpu_relax here. This is
           to make sure that the performance counter really ticks,
           even if there is a simulator or similar that catches the
           pause instruction. On a real HT machine this is fine because
           all other CPUs are busy with "useless" delay loops and don't
           care if they get somewhat fewer cycles. */
        while (*endflag == 0)
                barrier();
}
#endif

static int __init check_nmi_watchdog(void)
{
        volatile int endflag = 0;
        unsigned int *prev_nmi_count;
        int cpu;

        /* Enable the NMI watchdog by default on newer systems.
           It should actually be safe on most systems before 2004 too,
           except for some IBM systems that corrupt registers when an NMI
           happens during SMM. Unfortunately we don't have more exact
           information on these, so use this coarse check. */
        if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004)
                nmi_watchdog = NMI_LOCAL_APIC;

        if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
                return 0;

        if (!atomic_read(&nmi_active))
                return 0;

        prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
        if (!prev_nmi_count)
                return -1;

        printk(KERN_INFO "Testing NMI watchdog ... ");

        if (nmi_watchdog == NMI_LOCAL_APIC)
                smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);

        for_each_possible_cpu(cpu)
                prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
        local_irq_enable();
        mdelay((10*1000)/nmi_hz); /* wait 10 ticks */

        for_each_possible_cpu(cpu) {
#ifdef CONFIG_SMP
                /* Check cpu_callin_map here because that is set
                   after the timer is started. */
                if (!cpu_isset(cpu, cpu_callin_map))
                        continue;
#endif
                if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
                        continue;
                if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
                        printk(KERN_WARNING "CPU#%d: NMI appears to be stuck (%d->%d)!\n",
                                cpu,
                                prev_nmi_count[cpu],
                                nmi_count(cpu));
                        per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
                        atomic_dec(&nmi_active);
                }
        }
        if (!atomic_read(&nmi_active)) {
                kfree(prev_nmi_count);
                atomic_set(&nmi_active, -1);
                return -1;
        }
        endflag = 1;
        printk("OK.\n");

        /* now that we know it works we can reduce NMI frequency to
           something more reasonable; makes a difference in some configs */
        if (nmi_watchdog == NMI_LOCAL_APIC) {
                struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

                nmi_hz = 1;
                /*
                 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
                 * are writable, with higher bits sign-extending from bit 31.
                 * So we can only program the counter with 31-bit values: bit
                 * 31 of the (negative) value must be set so that bits 32 and
                 * above sign-extend to 1.  Find an nmi_hz that keeps each
                 * period's count below 2^31.
                 */
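                /*
                 * For example, assuming a hypothetical 3 GHz CPU:
                 * cpu_khz * 1000 is 3e9, which exceeds 0x7fffffff (~2.1e9),
                 * so do_div() below yields count = 1 and nmi_hz becomes 2.
                 * Each period then programs roughly 1.5e9 into the counter,
                 * which fits in 31 bits.
                 */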
                if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
                        ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
                        u64 count = (u64)cpu_khz * 1000;
                        do_div(count, 0x7fffffffUL);
                        nmi_hz = count + 1;
                }
        }

        kfree(prev_nmi_count);
        return 0;
}
/* This needs to happen later in boot so counters are working */
late_initcall(check_nmi_watchdog);

static int __init setup_nmi_watchdog(char *str)
{
        int nmi;

        get_option(&str, &nmi);

        if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
                return 0;
        /*
         * If any other x86 CPU has a local APIC, then
         * please test the NMI stuff there and send me the
         * missing bits. Right now Intel P6/P4 and AMD K7 only.
         */
        if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0))
                return 0;  /* no lapic support */
        nmi_watchdog = nmi;
        return 1;
}

__setup("nmi_watchdog=", setup_nmi_watchdog);

static void disable_lapic_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);

        if (atomic_read(&nmi_active) <= 0)
                return;

        on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);

        BUG_ON(atomic_read(&nmi_active) != 0);
}

static void enable_lapic_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);

        /* are we already enabled */
        if (atomic_read(&nmi_active) != 0)
                return;

        /* are we lapic aware */
        if (nmi_known_cpu() <= 0)
                return;

        on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
        touch_nmi_watchdog();
}

void disable_timer_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_IO_APIC);

        if (atomic_read(&nmi_active) <= 0)
                return;

        disable_irq(0);
        on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);

        BUG_ON(atomic_read(&nmi_active) != 0);
}

void enable_timer_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_IO_APIC);

        if (atomic_read(&nmi_active) == 0) {
                touch_nmi_watchdog();
                on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
                enable_irq(0);
        }
}

#ifdef CONFIG_PM

static int nmi_pm_active; /* nmi_active before suspend */

static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
{
        /* only CPU0 goes here, other CPUs should be offline */
        nmi_pm_active = atomic_read(&nmi_active);
        stop_apic_nmi_watchdog(NULL);
        BUG_ON(atomic_read(&nmi_active) != 0);
        return 0;
}

static int lapic_nmi_resume(struct sys_device *dev)
{
        /* only CPU0 goes here, other CPUs should be offline */
        if (nmi_pm_active > 0) {
                setup_apic_nmi_watchdog(NULL);
                touch_nmi_watchdog();
        }
        return 0;
}


static struct sysdev_class nmi_sysclass = {
        set_kset_name("lapic_nmi"),
        .resume         = lapic_nmi_resume,
        .suspend        = lapic_nmi_suspend,
};

static struct sys_device device_lapic_nmi = {
        .id     = 0,
        .cls    = &nmi_sysclass,
};

static int __init init_lapic_nmi_sysfs(void)
{
        int error;

        /* should really be a BUG_ON, but because this is an
         * init call it just doesn't work.  -dcz
         */
        if (nmi_watchdog != NMI_LOCAL_APIC)
                return 0;

        if (atomic_read(&nmi_active) < 0)
                return 0;

        error = sysdev_class_register(&nmi_sysclass);
        if (!error)
                error = sysdev_register(&device_lapic_nmi);
        return error;
}
/* must come after the local APIC's device_initcall() */
late_initcall(init_lapic_nmi_sysfs);

#endif  /* CONFIG_PM */

/*
 * Activate the NMI watchdog via the local APIC.
 * Original code written by Keith Owens.
 */

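/*
 * write_watchdog_counter() arms the selected perfctr with the negated value
 * (cpu_khz * 1000) / nmi_hz, so the counter overflows (and the APIC LVTPC
 * entry delivers an NMI) roughly nmi_hz times per second while the programmed
 * event is ticking.  For example, assuming a hypothetical 2 GHz CPU and
 * nmi_hz == 1, the counter is loaded with about -2e9 and wraps after ~2e9
 * cycle events, i.e. about once per second when the CPU is busy.
 */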
static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
{
        u64 count = (u64)cpu_khz * 1000;

        do_div(count, nmi_hz);
        if (descr)
                Dprintk("setting %s to -0x%08Lx\n", descr, count);
        wrmsrl(perfctr_msr, 0 - count);
}

/* Note that these events don't tick when the CPU idles. This means
   the frequency varies with CPU load. */

#define K7_EVNTSEL_ENABLE       (1 << 22)
#define K7_EVNTSEL_INT          (1 << 20)
#define K7_EVNTSEL_OS           (1 << 17)
#define K7_EVNTSEL_USR          (1 << 16)
#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING    0x76
#define K7_NMI_EVENT            K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING

static int setup_k7_watchdog(void)
{
        unsigned int perfctr_msr, evntsel_msr;
        unsigned int evntsel;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        perfctr_msr = MSR_K7_PERFCTR0;
        evntsel_msr = MSR_K7_EVNTSEL0;
        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        wrmsrl(perfctr_msr, 0UL);

        evntsel = K7_EVNTSEL_INT
                | K7_EVNTSEL_OS
                | K7_EVNTSEL_USR
                | K7_NMI_EVENT;

        /* setup the timer */
        wrmsr(evntsel_msr, evntsel, 0);
        write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= K7_EVNTSEL_ENABLE;
        wrmsr(evntsel_msr, evntsel, 0);

        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = 0;  /* unused */
        wd->check_bit = 1ULL<<63;
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}

static void stop_k7_watchdog(void)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        wrmsr(wd->evntsel_msr, 0, 0);

        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}

#define P6_EVNTSEL0_ENABLE      (1 << 22)
#define P6_EVNTSEL_INT          (1 << 20)
#define P6_EVNTSEL_OS           (1 << 17)
#define P6_EVNTSEL_USR          (1 << 16)
#define P6_EVENT_CPU_CLOCKS_NOT_HALTED  0x79
#define P6_NMI_EVENT            P6_EVENT_CPU_CLOCKS_NOT_HALTED

static int setup_p6_watchdog(void)
{
        unsigned int perfctr_msr, evntsel_msr;
        unsigned int evntsel;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        perfctr_msr = MSR_P6_PERFCTR0;
        evntsel_msr = MSR_P6_EVNTSEL0;
        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        wrmsrl(perfctr_msr, 0UL);

        evntsel = P6_EVNTSEL_INT
                | P6_EVNTSEL_OS
                | P6_EVNTSEL_USR
                | P6_NMI_EVENT;

        /* setup the timer */
        wrmsr(evntsel_msr, evntsel, 0);
        write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= P6_EVNTSEL0_ENABLE;
        wrmsr(evntsel_msr, evntsel, 0);

        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = 0;  /* unused */
        wd->check_bit = 1ULL<<39;
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}

static void stop_p6_watchdog(void)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        wrmsr(wd->evntsel_msr, 0, 0);

        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}

/* Note that these events don't tick when the CPU idles. This means
   the frequency varies with CPU load. */

#define MSR_P4_MISC_ENABLE_PERF_AVAIL   (1<<7)
#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
#define P4_ESCR_OS              (1<<3)
#define P4_ESCR_USR             (1<<2)
#define P4_CCCR_OVF_PMI0        (1<<26)
#define P4_CCCR_OVF_PMI1        (1<<27)
#define P4_CCCR_THRESHOLD(N)    ((N)<<20)
#define P4_CCCR_COMPLEMENT      (1<<19)
#define P4_CCCR_COMPARE         (1<<18)
#define P4_CCCR_REQUIRED        (3<<16)
#define P4_CCCR_ESCR_SELECT(N)  ((N)<<13)
#define P4_CCCR_ENABLE          (1<<12)
#define P4_CCCR_OVF             (1<<31)
/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
   CRU_ESCR0 (with any non-null event selector) through a complemented
   max threshold. [IA32-Vol3, Section 14.9.9] */

static int setup_p4_watchdog(void)
{
        unsigned int perfctr_msr, evntsel_msr, cccr_msr;
        unsigned int evntsel, cccr_val;
        unsigned int misc_enable, dummy;
        unsigned int ht_num;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
        if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
                return 0;

#ifdef CONFIG_SMP
        /* detect which hyperthread we are on */
        if (smp_num_siblings == 2) {
                unsigned int ebx, apicid;

                ebx = cpuid_ebx(1);
                apicid = (ebx >> 24) & 0xff;
                ht_num = apicid & 1;
        } else
#endif
                ht_num = 0;

        /* Performance counters are shared resources; assign each
         * hyperthread its own set.  (Re-using the ESCR0 register
         * seems safe and keeps the cccr_val the same.)
         */
        if (!ht_num) {
                /* logical cpu 0 */
                perfctr_msr = MSR_P4_IQ_PERFCTR0;
                evntsel_msr = MSR_P4_CRU_ESCR0;
                cccr_msr = MSR_P4_IQ_CCCR0;
                cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
        } else {
                /* logical cpu 1 */
                perfctr_msr = MSR_P4_IQ_PERFCTR1;
                evntsel_msr = MSR_P4_CRU_ESCR0;
                cccr_msr = MSR_P4_IQ_CCCR1;
                cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
        }

        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        evntsel = P4_ESCR_EVENT_SELECT(0x3F)
                | P4_ESCR_OS
                | P4_ESCR_USR;

        cccr_val |= P4_CCCR_THRESHOLD(15)
                 | P4_CCCR_COMPLEMENT
                 | P4_CCCR_COMPARE
                 | P4_CCCR_REQUIRED;

        wrmsr(evntsel_msr, evntsel, 0);
        wrmsr(cccr_msr, cccr_val, 0);
        write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        cccr_val |= P4_CCCR_ENABLE;
        wrmsr(cccr_msr, cccr_val, 0);
        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = cccr_msr;
        wd->check_bit = 1ULL<<39;
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}

static void stop_p4_watchdog(void)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        wrmsr(wd->cccr_msr, 0, 0);
        wrmsr(wd->evntsel_msr, 0, 0);

        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}

#define ARCH_PERFMON_NMI_EVENT_SEL      ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
#define ARCH_PERFMON_NMI_EVENT_UMASK    ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK

static int setup_intel_arch_watchdog(void)
{
        unsigned int ebx;
        union cpuid10_eax eax;
        unsigned int unused;
        unsigned int perfctr_msr, evntsel_msr;
        unsigned int evntsel;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /*
         * Check whether the Architectural PerfMon supports
         * Unhalted Core Cycles Event or not.
         * NOTE: Corresponding bit = 0 in ebx indicates event present.
         */
        cpuid(10, &(eax.full), &ebx, &unused, &unused);
        if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
            (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
                goto fail;

        perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
        evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;

        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        wrmsrl(perfctr_msr, 0UL);

        evntsel = ARCH_PERFMON_EVENTSEL_INT
                | ARCH_PERFMON_EVENTSEL_OS
                | ARCH_PERFMON_EVENTSEL_USR
                | ARCH_PERFMON_NMI_EVENT_SEL
                | ARCH_PERFMON_NMI_EVENT_UMASK;

        /* setup the timer */
        wrmsr(evntsel_msr, evntsel, 0);
        write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
        wrmsr(evntsel_msr, evntsel, 0);

        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = 0;  /* unused */
        wd->check_bit = 1ULL << (eax.split.bit_width - 1);
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}

static void stop_intel_arch_watchdog(void)
{
        unsigned int ebx;
        union cpuid10_eax eax;
        unsigned int unused;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /*
         * Check whether the Architectural PerfMon supports
         * Unhalted Core Cycles Event or not.
         * NOTE: Corresponding bit = 0 in ebx indicates event present.
         */
        cpuid(10, &(eax.full), &ebx, &unused, &unused);
        if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
            (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
                return;

        wrmsr(wd->evntsel_msr, 0, 0);
        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}

void setup_apic_nmi_watchdog(void *unused)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /* only support LOCAL and IO APICs for now */
        if ((nmi_watchdog != NMI_LOCAL_APIC) &&
            (nmi_watchdog != NMI_IO_APIC))
                return;

        if (wd->enabled == 1)
                return;

        /* Cheap hack to support suspend/resume: if cpu0 is not active,
           neither should the other cpus be. */
        if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
                return;

        if (nmi_watchdog == NMI_LOCAL_APIC) {
                switch (boot_cpu_data.x86_vendor) {
                case X86_VENDOR_AMD:
                        if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
                                return;
                        if (!setup_k7_watchdog())
                                return;
                        break;
                case X86_VENDOR_INTEL:
                        if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
                                if (!setup_intel_arch_watchdog())
                                        return;
                                break;
                        }
                        switch (boot_cpu_data.x86) {
                        case 6:
                                if (boot_cpu_data.x86_model > 0xd)
                                        return;

                                if (!setup_p6_watchdog())
                                        return;
                                break;
                        case 15:
                                if (boot_cpu_data.x86_model > 0x4)
                                        return;

                                if (!setup_p4_watchdog())
                                        return;
                                break;
                        default:
                                return;
                        }
                        break;
                default:
                        return;
                }
        }
        wd->enabled = 1;
        atomic_inc(&nmi_active);
}

void stop_apic_nmi_watchdog(void *unused)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /* only support LOCAL and IO APICs for now */
        if ((nmi_watchdog != NMI_LOCAL_APIC) &&
            (nmi_watchdog != NMI_IO_APIC))
                return;

        if (wd->enabled == 0)
                return;

        if (nmi_watchdog == NMI_LOCAL_APIC) {
                switch (boot_cpu_data.x86_vendor) {
                case X86_VENDOR_AMD:
                        stop_k7_watchdog();
                        break;
                case X86_VENDOR_INTEL:
                        if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
                                stop_intel_arch_watchdog();
                                break;
                        }
                        switch (boot_cpu_data.x86) {
                        case 6:
                                if (boot_cpu_data.x86_model > 0xd)
                                        break;
                                stop_p6_watchdog();
                                break;
                        case 15:
                                if (boot_cpu_data.x86_model > 0x4)
                                        break;
                                stop_p4_watchdog();
                                break;
                        }
                        break;
                default:
                        return;
                }
        }
        wd->enabled = 0;
        atomic_dec(&nmi_active);
}

/*
 * The best way to detect whether a CPU has a 'hard lockup' problem
 * is to check its local APIC timer IRQ counts. If they are not
 * changing then that CPU has some problem.
 *
 * As these watchdog NMI IRQs are generated on every CPU, we only
 * have to check the current processor.
 *
 * Since NMIs don't listen to _any_ locks, we have to be extremely
 * careful not to rely on unsafe variables. The printk might lock
 * up though, so we have to break up any console locks first ...
 * [when there will be more tty-related locks, break them up
 *  here too!]
 */

static unsigned int
        last_irq_sums[NR_CPUS],
        alert_counter[NR_CPUS];

void touch_nmi_watchdog(void)
{
        int i;

        /*
         * Just reset the alert counters (other CPUs might be
         * spinning on locks we hold):
         */
        for_each_possible_cpu(i)
                alert_counter[i] = 0;

        /*
         * Tickle the softlockup detector too:
         */
        touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);

extern void die_nmi(struct pt_regs *, const char *msg);

__kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
{

        /*
         * Since current_thread_info()-> is always on the stack, and we
         * always switch the stack NMI-atomically, it's safe to use
         * smp_processor_id().
         */
        unsigned int sum;
        int touched = 0;
        int cpu = smp_processor_id();
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
        u64 dummy;
        int rc = 0;

        /* check for other users first */
        if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
                        == NOTIFY_STOP) {
                rc = 1;
                touched = 1;
        }

        sum = per_cpu(irq_stat, cpu).apic_timer_irqs;

        /* if the apic timer isn't firing, this cpu isn't doing much */
        if (!touched && last_irq_sums[cpu] == sum) {
                /*
                 * Ayiee, looks like this CPU is stuck ...
                 * wait a few IRQs (5 seconds) before doing the oops ...
                 */
                alert_counter[cpu]++;
                if (alert_counter[cpu] == 5*nmi_hz)
                        /*
                         * die_nmi will return ONLY if NOTIFY_STOP happens..
                         */
                        die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
        } else {
                last_irq_sums[cpu] = sum;
                alert_counter[cpu] = 0;
        }
        /* see if the nmi watchdog went off */
        if (wd->enabled) {
                if (nmi_watchdog == NMI_LOCAL_APIC) {
                        rdmsrl(wd->perfctr_msr, dummy);
                        if (dummy & wd->check_bit) {
                                /* this wasn't a watchdog timer interrupt */
                                goto done;
                        }

                        /* only Intel P4 uses the cccr msr */
                        if (wd->cccr_msr != 0) {
                                /*
                                 * P4 quirks:
                                 * - An overflown perfctr will assert its interrupt
                                 *   until the OVF flag in its CCCR is cleared.
                                 * - LVTPC is masked on interrupt and must be
                                 *   unmasked by the LVTPC handler.
                                 */
                                rdmsrl(wd->cccr_msr, dummy);
                                dummy &= ~P4_CCCR_OVF;
                                wrmsrl(wd->cccr_msr, dummy);
                                apic_write(APIC_LVTPC, APIC_DM_NMI);
                        }
                        else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
                                 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
                                /* P6-based Pentium M needs to re-unmask
                                 * the apic vector, but it doesn't hurt
                                 * other P6 variants.
                                 * ArchPerfmon/Core Duo also needs this. */
                                apic_write(APIC_LVTPC, APIC_DM_NMI);
                        }
                        /* start the cycle over again */
                        write_watchdog_counter(wd->perfctr_msr, NULL);
                        rc = 1;
                } else if (nmi_watchdog == NMI_IO_APIC) {
                        /* We don't know how to accurately check for this;
                         * just assume it was a watchdog timer interrupt.
                         * This matches the old behaviour.
                         */
                        rc = 1;
                }
        }
done:
        return rc;
}

int do_nmi_callback(struct pt_regs *regs, int cpu)
{
#ifdef CONFIG_SYSCTL
        if (unknown_nmi_panic)
                return unknown_nmi_panic_callback(regs, cpu);
#endif
        return 0;
}

#ifdef CONFIG_SYSCTL

static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
{
        unsigned char reason = get_nmi_reason();
        char buf[64];

        sprintf(buf, "NMI received for unknown reason %02x\n", reason);
        die_nmi(regs, buf);
        return 0;
}

/*
 * proc handler for /proc/sys/kernel/nmi
 */
int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
                        void __user *buffer, size_t *length, loff_t *ppos)
{
        int old_state;

        nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
        old_state = nmi_watchdog_enabled;
        proc_dointvec(table, write, file, buffer, length, ppos);
        if (!!old_state == !!nmi_watchdog_enabled)
                return 0;

        if (atomic_read(&nmi_active) < 0) {
                printk(KERN_WARNING "NMI watchdog is permanently disabled\n");
                return -EIO;
        }

        if (nmi_watchdog == NMI_DEFAULT) {
                if (nmi_known_cpu() > 0)
                        nmi_watchdog = NMI_LOCAL_APIC;
                else
                        nmi_watchdog = NMI_IO_APIC;
        }

        if (nmi_watchdog == NMI_LOCAL_APIC) {
                if (nmi_watchdog_enabled)
                        enable_lapic_nmi_watchdog();
                else
                        disable_lapic_nmi_watchdog();
        } else {
                printk(KERN_WARNING
                        "NMI watchdog doesn't know what hardware to touch\n");
                return -EIO;
        }
        return 0;
}

#endif

EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
EXPORT_SYMBOL(reserve_perfctr_nmi);
EXPORT_SYMBOL(release_perfctr_nmi);
EXPORT_SYMBOL(reserve_evntsel_nmi);
EXPORT_SYMBOL(release_evntsel_nmi);
EXPORT_SYMBOL(disable_timer_nmi_watchdog);
EXPORT_SYMBOL(enable_timer_nmi_watchdog);