/*
 *  linux/arch/i386/nmi.c
 *
 *  NMI watchdog support on APIC systems
 *
 *  Started by Ingo Molnar <mingo@redhat.com>
 *
 *  Fixes:
 *  Mikael Pettersson   : AMD K7 support for local APIC NMI watchdog.
 *  Mikael Pettersson   : Power Management for local APIC NMI watchdog.
 *  Mikael Pettersson   : Pentium 4 support for local APIC NMI watchdog.
 *  Pavel Machek and
 *  Mikael Pettersson   : PM converted to driver model. Disable/enable API.
 */

#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/sysdev.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
#include <linux/dmi.h>
#include <linux/kprobes.h>
#include <linux/cpumask.h>

#include <asm/smp.h>
#include <asm/nmi.h>
#include <asm/kdebug.h>
#include <asm/intel_arch_perfmon.h>

#include "mach_traps.h"

int unknown_nmi_panic;
int nmi_watchdog_enabled;

/* perfctr_nmi_owner tracks the ownership of the perfctr registers;
 * evntsel_nmi_owner tracks the ownership of the event selection registers.
 * Different performance counters/event selection registers may be reserved
 * by different subsystems; this reservation scheme just tries to coordinate
 * things a little.
 */
static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);

static cpumask_t backtrace_mask = CPU_MASK_NONE;

/* This number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
 * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now).
 */
#define NMI_MAX_COUNTER_BITS 66

/* nmi_active:
 * >0: the lapic NMI watchdog is active, but can be disabled
 * <0: the lapic NMI watchdog has not been set up, and cannot
 *     be enabled
 *  0: the lapic NMI watchdog is disabled, but can be enabled
 */
atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */

unsigned int nmi_watchdog = NMI_DEFAULT;
static unsigned int nmi_hz = HZ;

struct nmi_watchdog_ctlblk {
        int enabled;
        u64 check_bit;
        unsigned int cccr_msr;
        unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
        unsigned int evntsel_msr;  /* the MSR to select the events to handle */
};
static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);

/* local prototypes */
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);

extern void show_registers(struct pt_regs *regs);
extern int unknown_nmi_panic;

/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
        /* returns the bit offset of the performance counter register */
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_AMD:
                return (msr - MSR_K7_PERFCTR0);
        case X86_VENDOR_INTEL:
                if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                        return (msr - MSR_ARCH_PERFMON_PERFCTR0);

                switch (boot_cpu_data.x86) {
                case 6:
                        return (msr - MSR_P6_PERFCTR0);
                case 15:
                        return (msr - MSR_P4_BPU_PERFCTR0);
                }
        }
        return 0;
}

/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
{
        /* returns the bit offset of the event selection register */
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_AMD:
                return (msr - MSR_K7_EVNTSEL0);
        case X86_VENDOR_INTEL:
                if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                        return (msr - MSR_ARCH_PERFMON_EVENTSEL0);

                switch (boot_cpu_data.x86) {
                case 6:
                        return (msr - MSR_P6_EVNTSEL0);
                case 15:
                        return (msr - MSR_P4_BSU_ESCR0);
                }
        }
        return 0;
}

/* checks for bit availability (hack for oprofile) */
int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
{
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
}

/* checks an msr for availability */
int avail_to_resrv_perfctr_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_perfctr_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
}

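/*
 * Reservation API: reserve_perfctr_nmi()/reserve_evntsel_nmi() atomically
 * claim the per-CPU ownership bit for the given MSR and return 1 on
 * success, or 0 if another subsystem already owns it; the matching
 * release_*_nmi() calls clear the bit again.  A hypothetical caller that
 * wants a counter for itself would do roughly:
 *
 *	if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0))
 *		return -EBUSY;
 *	if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) {
 *		release_perfctr_nmi(MSR_K7_PERFCTR0);
 *		return -EBUSY;
 *	}
 *	... program the counter ...
 *	release_evntsel_nmi(MSR_K7_EVNTSEL0);
 *	release_perfctr_nmi(MSR_K7_PERFCTR0);
 */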
int reserve_perfctr_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_perfctr_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
                return 1;
        return 0;
}

void release_perfctr_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_perfctr_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
}

int reserve_evntsel_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_evntsel_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
                return 1;
        return 0;
}

void release_evntsel_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_evntsel_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
}

static __cpuinit inline int nmi_known_cpu(void)
{
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_AMD:
                return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
        case X86_VENDOR_INTEL:
                if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                        return 1;
                else
                        return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
        }
        return 0;
}

static int endflag __initdata = 0;

#ifdef CONFIG_SMP
/* The performance counters used by NMI_LOCAL_APIC don't trigger when
 * the CPU is idle. To make sure the NMI watchdog really ticks on all
 * CPUs during the test, make them busy.
 */
static __init void nmi_cpu_busy(void *data)
{
        local_irq_enable_in_hardirq();
        /* Intentionally don't use cpu_relax here. This is
           to make sure that the performance counter really ticks,
           even if there is a simulator or similar that catches the
           pause instruction. On a real HT machine this is fine because
           all other CPUs are busy with "useless" delay loops and don't
           care if they get somewhat fewer cycles. */
        while (endflag == 0)
                mb();
}
#endif

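/*
 * Boot-time self-test: snapshot each CPU's NMI count, let the watchdog
 * run for roughly ten of its periods, and disable it on any CPU whose
 * count failed to advance (the "NMI appears to be stuck" case below).
 * Only after the test passes is nmi_hz lowered to its steady-state rate.
 */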
static int __init check_nmi_watchdog(void)
{
        unsigned int *prev_nmi_count;
        int cpu;

        /* Enable NMI watchdog for newer systems.
           Probably safe on most older systems too, but let's be careful.
           IBM ThinkPads use INT10 inside SMM and that allows early NMI inside SMM,
           which hangs the system. Disable the watchdog for all ThinkPads. */
        if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004 &&
                !dmi_name_in_vendors("ThinkPad"))
                nmi_watchdog = NMI_LOCAL_APIC;

        if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
                return 0;

        if (!atomic_read(&nmi_active))
                return 0;

        prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
        if (!prev_nmi_count)
                return -1;

        printk(KERN_INFO "Testing NMI watchdog ... ");

        if (nmi_watchdog == NMI_LOCAL_APIC)
                smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);

        for_each_possible_cpu(cpu)
                prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
        local_irq_enable();
        mdelay((10*1000)/nmi_hz); /* wait 10 ticks */

        for_each_possible_cpu(cpu) {
#ifdef CONFIG_SMP
                /* Check cpu_callin_map here because that is set
                   after the timer is started. */
                if (!cpu_isset(cpu, cpu_callin_map))
                        continue;
#endif
                if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
                        continue;
                if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
                        printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
                                cpu,
                                prev_nmi_count[cpu],
                                nmi_count(cpu));
                        per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
                        atomic_dec(&nmi_active);
                }
        }
        if (!atomic_read(&nmi_active)) {
                kfree(prev_nmi_count);
                atomic_set(&nmi_active, -1);
                return -1;
        }
        endflag = 1;
        printk("OK.\n");

        /* now that we know it works we can reduce the NMI frequency to
           something more reasonable; makes a difference in some configs */
        if (nmi_watchdog == NMI_LOCAL_APIC) {
                struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

                nmi_hz = 1;
                /*
                 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
                 * are writable, with higher bits sign extending from bit 31.
                 * So we can only program the counter with 31-bit (negative)
                 * values; bit 31 must be set so the sign extension keeps the
                 * upper bits at 1.  Find the appropriate nmi_hz.
                 */
                if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
                        ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
                        u64 count = (u64)cpu_khz * 1000;
                        do_div(count, 0x7fffffffUL);
                        nmi_hz = count + 1;
                }
        }

        kfree(prev_nmi_count);
        return 0;
}
/* This needs to happen later in boot so counters are working */
late_initcall(check_nmi_watchdog);

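/*
 * "nmi_watchdog=" boot parameter: the value is accepted only if it lies
 * between NMI_NONE and NMI_INVALID (exclusive), and it selects which
 * watchdog variant (local APIC or I/O APIC timer based) is used.
 */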
static int __init setup_nmi_watchdog(char *str)
{
        int nmi;

        get_option(&str, &nmi);

        if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
                return 0;

        nmi_watchdog = nmi;
        return 1;
}

__setup("nmi_watchdog=", setup_nmi_watchdog);

static void disable_lapic_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);

        if (atomic_read(&nmi_active) <= 0)
                return;

        on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);

        BUG_ON(atomic_read(&nmi_active) != 0);
}

static void enable_lapic_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);

        /* are we already enabled? */
        if (atomic_read(&nmi_active) != 0)
                return;

        /* are we lapic aware? */
        if (nmi_known_cpu() <= 0)
                return;

        on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
        touch_nmi_watchdog();
}

void disable_timer_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_IO_APIC);

        if (atomic_read(&nmi_active) <= 0)
                return;

        disable_irq(0);
        on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);

        BUG_ON(atomic_read(&nmi_active) != 0);
}

void enable_timer_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_IO_APIC);

        if (atomic_read(&nmi_active) == 0) {
                touch_nmi_watchdog();
                on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
                enable_irq(0);
        }
}

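/*
 * Power management: the local APIC NMI watchdog is stopped across suspend
 * and re-armed on resume (only if it was active before) via a "lapic_nmi"
 * sysdev; only CPU0 is expected to reach these callbacks, the other CPUs
 * having been taken offline already.
 */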
#ifdef CONFIG_PM

static int nmi_pm_active; /* nmi_active before suspend */

static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
{
        /* only CPU0 goes here, other CPUs should be offline */
        nmi_pm_active = atomic_read(&nmi_active);
        stop_apic_nmi_watchdog(NULL);
        BUG_ON(atomic_read(&nmi_active) != 0);
        return 0;
}

static int lapic_nmi_resume(struct sys_device *dev)
{
        /* only CPU0 goes here, other CPUs should be offline */
        if (nmi_pm_active > 0) {
                setup_apic_nmi_watchdog(NULL);
                touch_nmi_watchdog();
        }
        return 0;
}


static struct sysdev_class nmi_sysclass = {
        set_kset_name("lapic_nmi"),
        .resume         = lapic_nmi_resume,
        .suspend        = lapic_nmi_suspend,
};

static struct sys_device device_lapic_nmi = {
        .id     = 0,
        .cls    = &nmi_sysclass,
};

static int __init init_lapic_nmi_sysfs(void)
{
        int error;

        /* should really be a BUG_ON, but because this is an
         * init call, it just doesn't work.  -dcz
         */
        if (nmi_watchdog != NMI_LOCAL_APIC)
                return 0;

        if (atomic_read(&nmi_active) < 0)
                return 0;

        error = sysdev_class_register(&nmi_sysclass);
        if (!error)
                error = sysdev_register(&device_lapic_nmi);
        return error;
}
/* must come after the local APIC's device_initcall() */
late_initcall(init_lapic_nmi_sysfs);

#endif  /* CONFIG_PM */

/*
 * Activate the NMI watchdog via the local APIC.
 * Original code written by Keith Owens.
 */

static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
{
        u64 count = (u64)cpu_khz * 1000;

        do_div(count, nmi_hz);
        if (descr)
                Dprintk("setting %s to -0x%08Lx\n", descr, count);
        wrmsrl(perfctr_msr, 0 - count);
}

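/*
 * The counter is programmed with -(cpu_khz * 1000 / nmi_hz), so it counts
 * up and overflows roughly nmi_hz times per second; each overflow raises
 * the performance-counter interrupt, which the LVTPC entry delivers as an
 * NMI.
 */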
/* Note that these events don't tick when the CPU idles. This means
   the frequency varies with CPU load. */

#define K7_EVNTSEL_ENABLE       (1 << 22)
#define K7_EVNTSEL_INT          (1 << 20)
#define K7_EVNTSEL_OS           (1 << 17)
#define K7_EVNTSEL_USR          (1 << 16)
#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING    0x76
#define K7_NMI_EVENT            K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING

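/*
 * All of the setup_*_watchdog() variants below follow the same pattern:
 * reserve the counter and event-select MSRs, program a cycles-style event
 * with OS+USR counting and interrupt on overflow, load the counter via
 * write_watchdog_counter(), point LVTPC at NMI delivery, and finally set
 * the enable bit.  The chosen MSRs and the overflow check_bit are recorded
 * in the per-CPU nmi_watchdog_ctlblk for the NMI handler.
 */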
static int setup_k7_watchdog(void)
{
        unsigned int perfctr_msr, evntsel_msr;
        unsigned int evntsel;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        perfctr_msr = MSR_K7_PERFCTR0;
        evntsel_msr = MSR_K7_EVNTSEL0;
        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        wrmsrl(perfctr_msr, 0UL);

        evntsel = K7_EVNTSEL_INT
                | K7_EVNTSEL_OS
                | K7_EVNTSEL_USR
                | K7_NMI_EVENT;

        /* setup the timer */
        wrmsr(evntsel_msr, evntsel, 0);
        write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= K7_EVNTSEL_ENABLE;
        wrmsr(evntsel_msr, evntsel, 0);

        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = 0;  /* unused */
        wd->check_bit = 1ULL<<63;
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}

static void stop_k7_watchdog(void)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        wrmsr(wd->evntsel_msr, 0, 0);

        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}

#define P6_EVNTSEL0_ENABLE      (1 << 22)
#define P6_EVNTSEL_INT          (1 << 20)
#define P6_EVNTSEL_OS           (1 << 17)
#define P6_EVNTSEL_USR          (1 << 16)
#define P6_EVENT_CPU_CLOCKS_NOT_HALTED  0x79
#define P6_NMI_EVENT            P6_EVENT_CPU_CLOCKS_NOT_HALTED

static int setup_p6_watchdog(void)
{
        unsigned int perfctr_msr, evntsel_msr;
        unsigned int evntsel;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        perfctr_msr = MSR_P6_PERFCTR0;
        evntsel_msr = MSR_P6_EVNTSEL0;
        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        wrmsrl(perfctr_msr, 0UL);

        evntsel = P6_EVNTSEL_INT
                | P6_EVNTSEL_OS
                | P6_EVNTSEL_USR
                | P6_NMI_EVENT;

        /* setup the timer */
        wrmsr(evntsel_msr, evntsel, 0);
        write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= P6_EVNTSEL0_ENABLE;
        wrmsr(evntsel_msr, evntsel, 0);

        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = 0;  /* unused */
        wd->check_bit = 1ULL<<39;
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}

static void stop_p6_watchdog(void)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        wrmsr(wd->evntsel_msr, 0, 0);

        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}

/* Note that these events don't tick when the CPU idles. This means
   the frequency varies with CPU load. */

#define MSR_P4_MISC_ENABLE_PERF_AVAIL   (1<<7)
#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
#define P4_ESCR_OS              (1<<3)
#define P4_ESCR_USR             (1<<2)
#define P4_CCCR_OVF_PMI0        (1<<26)
#define P4_CCCR_OVF_PMI1        (1<<27)
#define P4_CCCR_THRESHOLD(N)    ((N)<<20)
#define P4_CCCR_COMPLEMENT      (1<<19)
#define P4_CCCR_COMPARE         (1<<18)
#define P4_CCCR_REQUIRED        (3<<16)
#define P4_CCCR_ESCR_SELECT(N)  ((N)<<13)
#define P4_CCCR_ENABLE          (1<<12)
#define P4_CCCR_OVF             (1<<31)
/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
   CRU_ESCR0 (with any non-null event selector) through a complemented
   max threshold. [IA32-Vol3, Section 14.9.9] */

static int setup_p4_watchdog(void)
{
        unsigned int perfctr_msr, evntsel_msr, cccr_msr;
        unsigned int evntsel, cccr_val;
        unsigned int misc_enable, dummy;
        unsigned int ht_num;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
        if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
                return 0;

#ifdef CONFIG_SMP
        /* detect which hyperthread we are on */
        if (smp_num_siblings == 2) {
                unsigned int ebx, apicid;

                ebx = cpuid_ebx(1);
                apicid = (ebx >> 24) & 0xff;
                ht_num = apicid & 1;
        } else
#endif
                ht_num = 0;

        /* performance counters are shared resources, so
         * assign each hyperthread its own set
         * (re-using the ESCR0 register seems safe
         * and keeps the cccr_val the same)
         */
        if (!ht_num) {
                /* logical cpu 0 */
                perfctr_msr = MSR_P4_IQ_PERFCTR0;
                evntsel_msr = MSR_P4_CRU_ESCR0;
                cccr_msr = MSR_P4_IQ_CCCR0;
                cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
        } else {
                /* logical cpu 1 */
                perfctr_msr = MSR_P4_IQ_PERFCTR1;
                evntsel_msr = MSR_P4_CRU_ESCR0;
                cccr_msr = MSR_P4_IQ_CCCR1;
                cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
        }

        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        evntsel = P4_ESCR_EVENT_SELECT(0x3F)
                | P4_ESCR_OS
                | P4_ESCR_USR;

        cccr_val |= P4_CCCR_THRESHOLD(15)
                 | P4_CCCR_COMPLEMENT
                 | P4_CCCR_COMPARE
                 | P4_CCCR_REQUIRED;

        wrmsr(evntsel_msr, evntsel, 0);
        wrmsr(cccr_msr, cccr_val, 0);
        write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        cccr_val |= P4_CCCR_ENABLE;
        wrmsr(cccr_msr, cccr_val, 0);
        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = cccr_msr;
        wd->check_bit = 1ULL<<39;
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}

static void stop_p4_watchdog(void)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        wrmsr(wd->cccr_msr, 0, 0);
        wrmsr(wd->evntsel_msr, 0, 0);

        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}

#define ARCH_PERFMON_NMI_EVENT_SEL      ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
#define ARCH_PERFMON_NMI_EVENT_UMASK    ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK

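/*
 * The architectural-perfmon variant probes CPUID leaf 0xA for the
 * Unhalted Core Cycles event (note the inverted sense: a clear bit in
 * EBX means the event is available) and derives check_bit from the
 * counter bit width reported there, so overflow detection matches
 * whatever width the hardware implements.
 */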
static int setup_intel_arch_watchdog(void)
{
        unsigned int ebx;
        union cpuid10_eax eax;
        unsigned int unused;
        unsigned int perfctr_msr, evntsel_msr;
        unsigned int evntsel;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /*
         * Check whether the Architectural PerfMon supports
         * Unhalted Core Cycles Event or not.
         * NOTE: Corresponding bit = 0 in ebx indicates event present.
         */
        cpuid(10, &(eax.full), &ebx, &unused, &unused);
        if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
            (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
                goto fail;

        perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
        evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;

        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        wrmsrl(perfctr_msr, 0UL);

        evntsel = ARCH_PERFMON_EVENTSEL_INT
                | ARCH_PERFMON_EVENTSEL_OS
                | ARCH_PERFMON_EVENTSEL_USR
                | ARCH_PERFMON_NMI_EVENT_SEL
                | ARCH_PERFMON_NMI_EVENT_UMASK;

        /* setup the timer */
        wrmsr(evntsel_msr, evntsel, 0);
        write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
        wrmsr(evntsel_msr, evntsel, 0);

        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = 0;  /* unused */
        wd->check_bit = 1ULL << (eax.split.bit_width - 1);
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}

static void stop_intel_arch_watchdog(void)
{
        unsigned int ebx;
        union cpuid10_eax eax;
        unsigned int unused;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /*
         * Check whether the Architectural PerfMon supports
         * Unhalted Core Cycles Event or not.
         * NOTE: Corresponding bit = 0 in ebx indicates event present.
         */
        cpuid(10, &(eax.full), &ebx, &unused, &unused);
        if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
            (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
                return;

        wrmsr(wd->evntsel_msr, 0, 0);
        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}

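/*
 * setup_apic_nmi_watchdog()/stop_apic_nmi_watchdog() run on each CPU (via
 * on_each_cpu() from the enable/disable paths above): they pick the
 * vendor/family-specific setup routine, flip the per-CPU ->enabled flag,
 * and keep the global nmi_active count in step.
 */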
void setup_apic_nmi_watchdog (void *unused)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /* only support LOCAL and IO APICs for now */
        if ((nmi_watchdog != NMI_LOCAL_APIC) &&
            (nmi_watchdog != NMI_IO_APIC))
                return;

        if (wd->enabled == 1)
                return;

        /* cheap hack to support suspend/resume */
        /* if cpu0 is not active, neither should the other cpus be */
        if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
                return;

        if (nmi_watchdog == NMI_LOCAL_APIC) {
                switch (boot_cpu_data.x86_vendor) {
                case X86_VENDOR_AMD:
                        if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
                                return;
                        if (!setup_k7_watchdog())
                                return;
                        break;
                case X86_VENDOR_INTEL:
                        if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
                                if (!setup_intel_arch_watchdog())
                                        return;
                                break;
                        }
                        switch (boot_cpu_data.x86) {
                        case 6:
                                if (boot_cpu_data.x86_model > 0xd)
                                        return;

                                if (!setup_p6_watchdog())
                                        return;
                                break;
                        case 15:
                                if (boot_cpu_data.x86_model > 0x4)
                                        return;

                                if (!setup_p4_watchdog())
                                        return;
                                break;
                        default:
                                return;
                        }
                        break;
                default:
                        return;
                }
        }
        wd->enabled = 1;
        atomic_inc(&nmi_active);
}

void stop_apic_nmi_watchdog(void *unused)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /* only support LOCAL and IO APICs for now */
        if ((nmi_watchdog != NMI_LOCAL_APIC) &&
            (nmi_watchdog != NMI_IO_APIC))
                return;

        if (wd->enabled == 0)
                return;

        if (nmi_watchdog == NMI_LOCAL_APIC) {
                switch (boot_cpu_data.x86_vendor) {
                case X86_VENDOR_AMD:
                        stop_k7_watchdog();
                        break;
                case X86_VENDOR_INTEL:
                        if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
                                stop_intel_arch_watchdog();
                                break;
                        }
                        switch (boot_cpu_data.x86) {
                        case 6:
                                if (boot_cpu_data.x86_model > 0xd)
                                        break;
                                stop_p6_watchdog();
                                break;
                        case 15:
                                if (boot_cpu_data.x86_model > 0x4)
                                        break;
                                stop_p4_watchdog();
                                break;
                        }
                        break;
                default:
                        return;
                }
        }
        wd->enabled = 0;
        atomic_dec(&nmi_active);
}

/*
 * The best way to detect whether a CPU has a 'hard lockup' problem
 * is to check its local APIC timer IRQ counts. If they are not
 * changing, then that CPU has some problem.
 *
 * As these watchdog NMI IRQs are generated on every CPU, we only
 * have to check the current processor.
 *
 * Since NMIs don't listen to _any_ locks, we have to be extremely
 * careful not to rely on unsafe variables. The printk might lock
 * up though, so we have to break up any console locks first ...
 * [when there will be more tty-related locks, break them up
 *  here too!]
 */

static unsigned int
        last_irq_sums [NR_CPUS],
        alert_counter [NR_CPUS];

void touch_nmi_watchdog (void)
{
        if (nmi_watchdog > 0) {
                unsigned cpu;

                /*
                 * Just reset the alert counters, (other CPUs might be
                 * spinning on locks we hold):
                 */
                for_each_present_cpu (cpu)
                        alert_counter[cpu] = 0;
        }

        /*
         * Tickle the softlockup detector too:
         */
        touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);

extern void die_nmi(struct pt_regs *, const char *msg);

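/*
 * nmi_watchdog_tick() returns nonzero when this NMI was recognized here
 * (another die-chain user claimed it, a watchdog perfctr overflowed, or
 * the I/O APIC timer mode is assumed responsible); a zero return lets the
 * caller treat the NMI as unknown (see do_nmi_callback() below).
 */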
__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
{

        /*
         * Since current_thread_info() is always on the stack, and we
         * always switch the stack NMI-atomically, it's safe to use
         * smp_processor_id().
         */
        unsigned int sum;
        int touched = 0;
        int cpu = smp_processor_id();
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
        u64 dummy;
        int rc = 0;

        /* check for other users first */
        if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
                        == NOTIFY_STOP) {
                rc = 1;
                touched = 1;
        }

        if (cpu_isset(cpu, backtrace_mask)) {
                static DEFINE_SPINLOCK(lock);   /* Serialise the printks */

                spin_lock(&lock);
                printk("NMI backtrace for cpu %d\n", cpu);
                dump_stack();
                spin_unlock(&lock);
                cpu_clear(cpu, backtrace_mask);
        }

        sum = per_cpu(irq_stat, cpu).apic_timer_irqs;

        /* if the apic timer isn't firing, this cpu isn't doing much */
        if (!touched && last_irq_sums[cpu] == sum) {
                /*
                 * Ayiee, looks like this CPU is stuck ...
                 * wait a few IRQs (5 seconds) before doing the oops ...
                 */
                alert_counter[cpu]++;
                if (alert_counter[cpu] == 5*nmi_hz)
                        /*
                         * die_nmi will return ONLY if NOTIFY_STOP happens..
                         */
                        die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
        } else {
                last_irq_sums[cpu] = sum;
                alert_counter[cpu] = 0;
        }
        /* see if the nmi watchdog went off */
        if (wd->enabled) {
                if (nmi_watchdog == NMI_LOCAL_APIC) {
                        rdmsrl(wd->perfctr_msr, dummy);
                        if (dummy & wd->check_bit) {
                                /* this wasn't a watchdog timer interrupt */
                                goto done;
                        }

                        /* only Intel P4 uses the cccr msr */
                        if (wd->cccr_msr != 0) {
                                /*
                                 * P4 quirks:
                                 * - An overflown perfctr will assert its interrupt
                                 *   until the OVF flag in its CCCR is cleared.
                                 * - LVTPC is masked on interrupt and must be
                                 *   unmasked by the LVTPC handler.
                                 */
                                rdmsrl(wd->cccr_msr, dummy);
                                dummy &= ~P4_CCCR_OVF;
                                wrmsrl(wd->cccr_msr, dummy);
                                apic_write(APIC_LVTPC, APIC_DM_NMI);
                        }
                        else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
                                 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
                                /* P6-based Pentium M needs to re-unmask
                                 * the apic vector, but it doesn't hurt
                                 * other P6 variants.
                                 * ArchPerfmon/Core Duo also needs this. */
                                apic_write(APIC_LVTPC, APIC_DM_NMI);
                        }
                        /* start the cycle over again */
                        write_watchdog_counter(wd->perfctr_msr, NULL);
                        rc = 1;
                } else if (nmi_watchdog == NMI_IO_APIC) {
                        /* don't know how to accurately check for this;
                         * just assume it was a watchdog timer interrupt.
                         * This matches the old behaviour.
                         */
                        rc = 1;
                }
        }
done:
        return rc;
}

int do_nmi_callback(struct pt_regs * regs, int cpu)
{
#ifdef CONFIG_SYSCTL
        if (unknown_nmi_panic)
                return unknown_nmi_panic_callback(regs, cpu);
#endif
        return 0;
}

#ifdef CONFIG_SYSCTL

static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
{
        unsigned char reason = get_nmi_reason();
        char buf[64];

        sprintf(buf, "NMI received for unknown reason %02x\n", reason);
        die_nmi(regs, buf);
        return 0;
}

/*
 * proc handler for /proc/sys/kernel/nmi
 */
int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
                        void __user *buffer, size_t *length, loff_t *ppos)
{
        int old_state;

        nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
        old_state = nmi_watchdog_enabled;
        proc_dointvec(table, write, file, buffer, length, ppos);
        if (!!old_state == !!nmi_watchdog_enabled)
                return 0;

        if (atomic_read(&nmi_active) < 0) {
                printk(KERN_WARNING "NMI watchdog is permanently disabled\n");
                return -EIO;
        }

        if (nmi_watchdog == NMI_DEFAULT) {
                if (nmi_known_cpu() > 0)
                        nmi_watchdog = NMI_LOCAL_APIC;
                else
                        nmi_watchdog = NMI_IO_APIC;
        }

        if (nmi_watchdog == NMI_LOCAL_APIC) {
                if (nmi_watchdog_enabled)
                        enable_lapic_nmi_watchdog();
                else
                        disable_lapic_nmi_watchdog();
        } else {
                printk(KERN_WARNING
                        "NMI watchdog doesn't know what hardware to touch\n");
                return -EIO;
        }
        return 0;
}

#endif

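/*
 * Trigger an NMI-driven backtrace on every online CPU: each CPU prints
 * its stack from nmi_watchdog_tick() when it sees its bit set in
 * backtrace_mask, then clears the bit; we poll for up to 10 seconds for
 * all of them to finish.
 */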
void __trigger_all_cpu_backtrace(void)
{
        int i;

        backtrace_mask = cpu_online_map;
        /* Wait for up to 10 seconds for all CPUs to do the backtrace */
        for (i = 0; i < 10 * 1000; i++) {
                if (cpus_empty(backtrace_mask))
                        break;
                mdelay(1);
        }
}

EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
EXPORT_SYMBOL(reserve_perfctr_nmi);
EXPORT_SYMBOL(release_perfctr_nmi);
EXPORT_SYMBOL(reserve_evntsel_nmi);
EXPORT_SYMBOL(release_evntsel_nmi);
EXPORT_SYMBOL(disable_timer_nmi_watchdog);
EXPORT_SYMBOL(enable_timer_nmi_watchdog);