Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
[linux-2.6] / arch / x86 / kernel / cpu / perfctr-watchdog.c
1 /*
2  * local apic based NMI watchdog for various CPUs.
3  *
4  * This file also handles reservation of performance counters for coordination
5  * with other users (like oprofile).
6  *
7  * Note that these events normally don't tick when the CPU idles. This means
8  * the frequency varies with CPU load.
9  *
10  * Original code for K7/P6 written by Keith Owens
11  *
12  */
13
14 #include <linux/percpu.h>
15 #include <linux/module.h>
16 #include <linux/kernel.h>
17 #include <linux/bitops.h>
18 #include <linux/smp.h>
19 #include <linux/nmi.h>
20 #include <linux/kprobes.h>
21
22 #include <asm/apic.h>
23 #include <asm/perf_counter.h>
24
/*
 * Per-CPU record of the MSRs the watchdog programmed on that CPU.
 * Filled in by the setup_*_watchdog() routines and read back by the
 * stop and rearm handlers.
 */
struct nmi_watchdog_ctlblk {
        /* CCCR controlling the counter; only the P4 code sets this, others store 0 */
        unsigned int cccr_msr;
        /* the MSR to reset in NMI handler */
        unsigned int perfctr_msr;
        /* the MSR to select the events to handle */
        unsigned int evntsel_msr;
};
30
31 /* Interface defining a CPU specific perfctr watchdog */
32 struct wd_ops {
33         int (*reserve)(void);
34         void (*unreserve)(void);
35         int (*setup)(unsigned nmi_hz);
36         void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
37         void (*stop)(void);
38         unsigned perfctr;
39         unsigned evntsel;
40         u64 checkbit;
41 };
42
43 static const struct wd_ops *wd_ops;
44
45 /*
46  * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
47  * offset from MSR_P4_BSU_ESCR0.
48  *
49  * It will be the max for all platforms (for now)
50  */
51 #define NMI_MAX_COUNTER_BITS 66
52
53 /*
54  * perfctr_nmi_owner tracks the ownership of the perfctr registers:
55  * evtsel_nmi_owner tracks the ownership of the event selection
56  * - different performance counters/ event selection may be reserved for
57  *   different subsystems this reservation system just tries to coordinate
58  *   things a little
59  */
60 static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
61 static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
62
63 static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
64
65 /* converts an msr to an appropriate reservation bit */
66 static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
67 {
68         /* returns the bit offset of the performance counter register */
69         switch (boot_cpu_data.x86_vendor) {
70         case X86_VENDOR_AMD:
71                 return (msr - MSR_K7_PERFCTR0);
72         case X86_VENDOR_INTEL:
73                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
74                         return (msr - MSR_ARCH_PERFMON_PERFCTR0);
75
76                 switch (boot_cpu_data.x86) {
77                 case 6:
78                         return (msr - MSR_P6_PERFCTR0);
79                 case 15:
80                         return (msr - MSR_P4_BPU_PERFCTR0);
81                 }
82         }
83         return 0;
84 }
85
86 /*
87  * converts an msr to an appropriate reservation bit
88  * returns the bit offset of the event selection register
89  */
90 static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
91 {
92         /* returns the bit offset of the event selection register */
93         switch (boot_cpu_data.x86_vendor) {
94         case X86_VENDOR_AMD:
95                 return (msr - MSR_K7_EVNTSEL0);
96         case X86_VENDOR_INTEL:
97                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
98                         return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
99
100                 switch (boot_cpu_data.x86) {
101                 case 6:
102                         return (msr - MSR_P6_EVNTSEL0);
103                 case 15:
104                         return (msr - MSR_P4_BSU_ESCR0);
105                 }
106         }
107         return 0;
108
109 }
110
111 /* checks for a bit availability (hack for oprofile) */
112 int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
113 {
114         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
115
116         return (!test_bit(counter, perfctr_nmi_owner));
117 }
118
119 /* checks the an msr for availability */
120 int avail_to_resrv_perfctr_nmi(unsigned int msr)
121 {
122         unsigned int counter;
123
124         counter = nmi_perfctr_msr_to_bit(msr);
125         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126
127         return (!test_bit(counter, perfctr_nmi_owner));
128 }
129 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
130
131 int reserve_perfctr_nmi(unsigned int msr)
132 {
133         unsigned int counter;
134
135         counter = nmi_perfctr_msr_to_bit(msr);
136         /* register not managed by the allocator? */
137         if (counter > NMI_MAX_COUNTER_BITS)
138                 return 1;
139
140         if (!test_and_set_bit(counter, perfctr_nmi_owner))
141                 return 1;
142         return 0;
143 }
144 EXPORT_SYMBOL(reserve_perfctr_nmi);
145
146 void release_perfctr_nmi(unsigned int msr)
147 {
148         unsigned int counter;
149
150         counter = nmi_perfctr_msr_to_bit(msr);
151         /* register not managed by the allocator? */
152         if (counter > NMI_MAX_COUNTER_BITS)
153                 return;
154
155         clear_bit(counter, perfctr_nmi_owner);
156 }
157 EXPORT_SYMBOL(release_perfctr_nmi);
158
159 int reserve_evntsel_nmi(unsigned int msr)
160 {
161         unsigned int counter;
162
163         counter = nmi_evntsel_msr_to_bit(msr);
164         /* register not managed by the allocator? */
165         if (counter > NMI_MAX_COUNTER_BITS)
166                 return 1;
167
168         if (!test_and_set_bit(counter, evntsel_nmi_owner))
169                 return 1;
170         return 0;
171 }
172 EXPORT_SYMBOL(reserve_evntsel_nmi);
173
174 void release_evntsel_nmi(unsigned int msr)
175 {
176         unsigned int counter;
177
178         counter = nmi_evntsel_msr_to_bit(msr);
179         /* register not managed by the allocator? */
180         if (counter > NMI_MAX_COUNTER_BITS)
181                 return;
182
183         clear_bit(counter, evntsel_nmi_owner);
184 }
185 EXPORT_SYMBOL(release_evntsel_nmi);
186
187 void disable_lapic_nmi_watchdog(void)
188 {
189         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
190
191         if (atomic_read(&nmi_active) <= 0)
192                 return;
193
194         on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
195
196         if (wd_ops)
197                 wd_ops->unreserve();
198
199         BUG_ON(atomic_read(&nmi_active) != 0);
200 }
201
202 void enable_lapic_nmi_watchdog(void)
203 {
204         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
205
206         /* are we already enabled */
207         if (atomic_read(&nmi_active) != 0)
208                 return;
209
210         /* are we lapic aware */
211         if (!wd_ops)
212                 return;
213         if (!wd_ops->reserve()) {
214                 printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
215                 return;
216         }
217
218         on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
219         touch_nmi_watchdog();
220 }
221
222 /*
223  * Activate the NMI watchdog via the local APIC.
224  */
225
226 static unsigned int adjust_for_32bit_ctr(unsigned int hz)
227 {
228         u64 counter_val;
229         unsigned int retval = hz;
230
231         /*
232          * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
233          * are writable, with higher bits sign extending from bit 31.
234          * So, we can only program the counter with 31 bit values and
235          * 32nd bit should be 1, for 33.. to be 1.
236          * Find the appropriate nmi_hz
237          */
238         counter_val = (u64)cpu_khz * 1000;
239         do_div(counter_val, retval);
240         if (counter_val > 0x7fffffffULL) {
241                 u64 count = (u64)cpu_khz * 1000;
242                 do_div(count, 0x7fffffffUL);
243                 retval = count + 1;
244         }
245         return retval;
246 }
247
248 static void write_watchdog_counter(unsigned int perfctr_msr,
249                                 const char *descr, unsigned nmi_hz)
250 {
251         u64 count = (u64)cpu_khz * 1000;
252
253         do_div(count, nmi_hz);
254         if(descr)
255                 pr_debug("setting %s to -0x%08Lx\n", descr, count);
256         wrmsrl(perfctr_msr, 0 - count);
257 }
258
259 static void write_watchdog_counter32(unsigned int perfctr_msr,
260                                 const char *descr, unsigned nmi_hz)
261 {
262         u64 count = (u64)cpu_khz * 1000;
263
264         do_div(count, nmi_hz);
265         if(descr)
266                 pr_debug("setting %s to -0x%08Lx\n", descr, count);
267         wrmsr(perfctr_msr, (u32)(-count), 0);
268 }
269
/*
 * AMD K7/K8/Family10h/Family11h support.
 * AMD keeps this interface nicely stable so there is not much variety.
 *
 * Event-select register bits and the event counted by the watchdog.
 */
#define K7_EVNTSEL_ENABLE                       (1 << 22)
#define K7_EVNTSEL_INT                          (1 << 20)
#define K7_EVNTSEL_OS                           (1 << 17)
#define K7_EVNTSEL_USR                          (1 << 16)
#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING    0x76
/* the watchdog counts cycles in which the processor is running */
#define K7_NMI_EVENT                            K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
280
281 static int setup_k7_watchdog(unsigned nmi_hz)
282 {
283         unsigned int perfctr_msr, evntsel_msr;
284         unsigned int evntsel;
285         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
286
287         perfctr_msr = wd_ops->perfctr;
288         evntsel_msr = wd_ops->evntsel;
289
290         wrmsrl(perfctr_msr, 0UL);
291
292         evntsel = K7_EVNTSEL_INT
293                 | K7_EVNTSEL_OS
294                 | K7_EVNTSEL_USR
295                 | K7_NMI_EVENT;
296
297         /* setup the timer */
298         wrmsr(evntsel_msr, evntsel, 0);
299         write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
300
301         /* initialize the wd struct before enabling */
302         wd->perfctr_msr = perfctr_msr;
303         wd->evntsel_msr = evntsel_msr;
304         wd->cccr_msr = 0;  /* unused */
305
306         /* ok, everything is initialized, announce that we're set */
307         cpu_nmi_set_wd_enabled();
308
309         apic_write(APIC_LVTPC, APIC_DM_NMI);
310         evntsel |= K7_EVNTSEL_ENABLE;
311         wrmsr(evntsel_msr, evntsel, 0);
312
313         return 1;
314 }
315
316 static void single_msr_stop_watchdog(void)
317 {
318         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
319
320         wrmsr(wd->evntsel_msr, 0, 0);
321 }
322
323 static int single_msr_reserve(void)
324 {
325         if (!reserve_perfctr_nmi(wd_ops->perfctr))
326                 return 0;
327
328         if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
329                 release_perfctr_nmi(wd_ops->perfctr);
330                 return 0;
331         }
332         return 1;
333 }
334
335 static void single_msr_unreserve(void)
336 {
337         release_evntsel_nmi(wd_ops->evntsel);
338         release_perfctr_nmi(wd_ops->perfctr);
339 }
340
341 static void __kprobes
342 single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
343 {
344         /* start the cycle over again */
345         write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
346 }
347
348 static const struct wd_ops k7_wd_ops = {
349         .reserve        = single_msr_reserve,
350         .unreserve      = single_msr_unreserve,
351         .setup          = setup_k7_watchdog,
352         .rearm          = single_msr_rearm,
353         .stop           = single_msr_stop_watchdog,
354         .perfctr        = MSR_K7_PERFCTR0,
355         .evntsel        = MSR_K7_EVNTSEL0,
356         .checkbit       = 1ULL << 47,
357 };
358
/*
 * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
 *
 * Event-select register bits and the event counted by the watchdog.
 */
#define P6_EVNTSEL0_ENABLE              (1 << 22)
#define P6_EVNTSEL_INT                  (1 << 20)
#define P6_EVNTSEL_OS                   (1 << 17)
#define P6_EVNTSEL_USR                  (1 << 16)
#define P6_EVENT_CPU_CLOCKS_NOT_HALTED  0x79
/* the watchdog counts CPU clocks while not halted */
#define P6_NMI_EVENT                    P6_EVENT_CPU_CLOCKS_NOT_HALTED
368
369 static int setup_p6_watchdog(unsigned nmi_hz)
370 {
371         unsigned int perfctr_msr, evntsel_msr;
372         unsigned int evntsel;
373         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
374
375         perfctr_msr = wd_ops->perfctr;
376         evntsel_msr = wd_ops->evntsel;
377
378         /* KVM doesn't implement this MSR */
379         if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
380                 return 0;
381
382         evntsel = P6_EVNTSEL_INT
383                 | P6_EVNTSEL_OS
384                 | P6_EVNTSEL_USR
385                 | P6_NMI_EVENT;
386
387         /* setup the timer */
388         wrmsr(evntsel_msr, evntsel, 0);
389         nmi_hz = adjust_for_32bit_ctr(nmi_hz);
390         write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
391
392         /* initialize the wd struct before enabling */
393         wd->perfctr_msr = perfctr_msr;
394         wd->evntsel_msr = evntsel_msr;
395         wd->cccr_msr = 0;  /* unused */
396
397         /* ok, everything is initialized, announce that we're set */
398         cpu_nmi_set_wd_enabled();
399
400         apic_write(APIC_LVTPC, APIC_DM_NMI);
401         evntsel |= P6_EVNTSEL0_ENABLE;
402         wrmsr(evntsel_msr, evntsel, 0);
403
404         return 1;
405 }
406
407 static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
408 {
409         /*
410          * P6 based Pentium M need to re-unmask
411          * the apic vector but it doesn't hurt
412          * other P6 variant.
413          * ArchPerfom/Core Duo also needs this
414          */
415         apic_write(APIC_LVTPC, APIC_DM_NMI);
416
417         /* P6/ARCH_PERFMON has 32 bit counter write */
418         write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
419 }
420
421 static const struct wd_ops p6_wd_ops = {
422         .reserve        = single_msr_reserve,
423         .unreserve      = single_msr_unreserve,
424         .setup          = setup_p6_watchdog,
425         .rearm          = p6_rearm,
426         .stop           = single_msr_stop_watchdog,
427         .perfctr        = MSR_P6_PERFCTR0,
428         .evntsel        = MSR_P6_EVNTSEL0,
429         .checkbit       = 1ULL << 39,
430 };
431
/*
 * Intel P4 performance counters.
 * By far the most complicated of all.
 *
 * Bit definitions for IA32_MISC_ENABLE, the ESCR (event selection) and the
 * CCCR (counter configuration) registers as used by the watchdog.
 */
#define MSR_P4_MISC_ENABLE_PERF_AVAIL   (1 << 7)
#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
#define P4_ESCR_OS              (1 << 3)
#define P4_ESCR_USR             (1 << 2)
#define P4_CCCR_OVF_PMI0        (1 << 26)
#define P4_CCCR_OVF_PMI1        (1 << 27)
#define P4_CCCR_THRESHOLD(N)    ((N) << 20)
#define P4_CCCR_COMPLEMENT      (1 << 19)
#define P4_CCCR_COMPARE         (1 << 18)
#define P4_CCCR_REQUIRED        (3 << 16)
#define P4_CCCR_ESCR_SELECT(N)  ((N) << 13)
#define P4_CCCR_ENABLE          (1 << 12)
/*
 * Bit 31 must be built from an unsigned constant: shifting a signed 1 into
 * the sign bit is undefined in ISO C and yields a negative value.
 */
#define P4_CCCR_OVF             (1U << 31)
449
450 #define P4_CONTROLS 18
451 static unsigned int p4_controls[18] = {
452         MSR_P4_BPU_CCCR0,
453         MSR_P4_BPU_CCCR1,
454         MSR_P4_BPU_CCCR2,
455         MSR_P4_BPU_CCCR3,
456         MSR_P4_MS_CCCR0,
457         MSR_P4_MS_CCCR1,
458         MSR_P4_MS_CCCR2,
459         MSR_P4_MS_CCCR3,
460         MSR_P4_FLAME_CCCR0,
461         MSR_P4_FLAME_CCCR1,
462         MSR_P4_FLAME_CCCR2,
463         MSR_P4_FLAME_CCCR3,
464         MSR_P4_IQ_CCCR0,
465         MSR_P4_IQ_CCCR1,
466         MSR_P4_IQ_CCCR2,
467         MSR_P4_IQ_CCCR3,
468         MSR_P4_IQ_CCCR4,
469         MSR_P4_IQ_CCCR5,
470 };
471 /*
472  * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
473  * CRU_ESCR0 (with any non-null event selector) through a complemented
474  * max threshold. [IA32-Vol3, Section 14.9.9]
475  */
476 static int setup_p4_watchdog(unsigned nmi_hz)
477 {
478         unsigned int perfctr_msr, evntsel_msr, cccr_msr;
479         unsigned int evntsel, cccr_val;
480         unsigned int misc_enable, dummy;
481         unsigned int ht_num;
482         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
483
484         rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
485         if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
486                 return 0;
487
488 #ifdef CONFIG_SMP
489         /* detect which hyperthread we are on */
490         if (smp_num_siblings == 2) {
491                 unsigned int ebx, apicid;
492
493                 ebx = cpuid_ebx(1);
494                 apicid = (ebx >> 24) & 0xff;
495                 ht_num = apicid & 1;
496         } else
497 #endif
498                 ht_num = 0;
499
500         /*
501          * performance counters are shared resources
502          * assign each hyperthread its own set
503          * (re-use the ESCR0 register, seems safe
504          * and keeps the cccr_val the same)
505          */
506         if (!ht_num) {
507                 /* logical cpu 0 */
508                 perfctr_msr = MSR_P4_IQ_PERFCTR0;
509                 evntsel_msr = MSR_P4_CRU_ESCR0;
510                 cccr_msr = MSR_P4_IQ_CCCR0;
511                 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
512
513                 /*
514                  * If we're on the kdump kernel or other situation, we may
515                  * still have other performance counter registers set to
516                  * interrupt and they'll keep interrupting forever because
517                  * of the P4_CCCR_OVF quirk. So we need to ACK all the
518                  * pending interrupts and disable all the registers here,
519                  * before reenabling the NMI delivery. Refer to p4_rearm()
520                  * about the P4_CCCR_OVF quirk.
521                  */
522                 if (reset_devices) {
523                         unsigned int low, high;
524                         int i;
525
526                         for (i = 0; i < P4_CONTROLS; i++) {
527                                 rdmsr(p4_controls[i], low, high);
528                                 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
529                                 wrmsr(p4_controls[i], low, high);
530                         }
531                 }
532         } else {
533                 /* logical cpu 1 */
534                 perfctr_msr = MSR_P4_IQ_PERFCTR1;
535                 evntsel_msr = MSR_P4_CRU_ESCR0;
536                 cccr_msr = MSR_P4_IQ_CCCR1;
537
538                 /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
539                 if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
540                         cccr_val = P4_CCCR_OVF_PMI0;
541                 else
542                         cccr_val = P4_CCCR_OVF_PMI1;
543                 cccr_val |= P4_CCCR_ESCR_SELECT(4);
544         }
545
546         evntsel = P4_ESCR_EVENT_SELECT(0x3F)
547                 | P4_ESCR_OS
548                 | P4_ESCR_USR;
549
550         cccr_val |= P4_CCCR_THRESHOLD(15)
551                  | P4_CCCR_COMPLEMENT
552                  | P4_CCCR_COMPARE
553                  | P4_CCCR_REQUIRED;
554
555         wrmsr(evntsel_msr, evntsel, 0);
556         wrmsr(cccr_msr, cccr_val, 0);
557         write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
558
559         wd->perfctr_msr = perfctr_msr;
560         wd->evntsel_msr = evntsel_msr;
561         wd->cccr_msr = cccr_msr;
562
563         /* ok, everything is initialized, announce that we're set */
564         cpu_nmi_set_wd_enabled();
565
566         apic_write(APIC_LVTPC, APIC_DM_NMI);
567         cccr_val |= P4_CCCR_ENABLE;
568         wrmsr(cccr_msr, cccr_val, 0);
569         return 1;
570 }
571
572 static void stop_p4_watchdog(void)
573 {
574         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
575         wrmsr(wd->cccr_msr, 0, 0);
576         wrmsr(wd->evntsel_msr, 0, 0);
577 }
578
579 static int p4_reserve(void)
580 {
581         if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
582                 return 0;
583 #ifdef CONFIG_SMP
584         if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
585                 goto fail1;
586 #endif
587         if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
588                 goto fail2;
589         /* RED-PEN why is ESCR1 not reserved here? */
590         return 1;
591  fail2:
592 #ifdef CONFIG_SMP
593         if (smp_num_siblings > 1)
594                 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
595  fail1:
596 #endif
597         release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
598         return 0;
599 }
600
601 static void p4_unreserve(void)
602 {
603 #ifdef CONFIG_SMP
604         if (smp_num_siblings > 1)
605                 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
606 #endif
607         release_evntsel_nmi(MSR_P4_CRU_ESCR0);
608         release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
609 }
610
611 static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
612 {
613         unsigned dummy;
614         /*
615          * P4 quirks:
616          * - An overflown perfctr will assert its interrupt
617          *   until the OVF flag in its CCCR is cleared.
618          * - LVTPC is masked on interrupt and must be
619          *   unmasked by the LVTPC handler.
620          */
621         rdmsrl(wd->cccr_msr, dummy);
622         dummy &= ~P4_CCCR_OVF;
623         wrmsrl(wd->cccr_msr, dummy);
624         apic_write(APIC_LVTPC, APIC_DM_NMI);
625         /* start the cycle over again */
626         write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
627 }
628
629 static const struct wd_ops p4_wd_ops = {
630         .reserve        = p4_reserve,
631         .unreserve      = p4_unreserve,
632         .setup          = setup_p4_watchdog,
633         .rearm          = p4_rearm,
634         .stop           = stop_p4_watchdog,
635         /* RED-PEN this is wrong for the other sibling */
636         .perfctr        = MSR_P4_BPU_PERFCTR0,
637         .evntsel        = MSR_P4_BSU_ESCR0,
638         .checkbit       = 1ULL << 39,
639 };
640
641 /*
642  * Watchdog using the Intel architected PerfMon.
643  * Used for Core2 and hopefully all future Intel CPUs.
644  */
645 #define ARCH_PERFMON_NMI_EVENT_SEL      ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
646 #define ARCH_PERFMON_NMI_EVENT_UMASK    ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
647
648 static struct wd_ops intel_arch_wd_ops;
649
650 static int setup_intel_arch_watchdog(unsigned nmi_hz)
651 {
652         unsigned int ebx;
653         union cpuid10_eax eax;
654         unsigned int unused;
655         unsigned int perfctr_msr, evntsel_msr;
656         unsigned int evntsel;
657         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
658
659         /*
660          * Check whether the Architectural PerfMon supports
661          * Unhalted Core Cycles Event or not.
662          * NOTE: Corresponding bit = 0 in ebx indicates event present.
663          */
664         cpuid(10, &(eax.full), &ebx, &unused, &unused);
665         if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
666             (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
667                 return 0;
668
669         perfctr_msr = wd_ops->perfctr;
670         evntsel_msr = wd_ops->evntsel;
671
672         wrmsrl(perfctr_msr, 0UL);
673
674         evntsel = ARCH_PERFMON_EVENTSEL_INT
675                 | ARCH_PERFMON_EVENTSEL_OS
676                 | ARCH_PERFMON_EVENTSEL_USR
677                 | ARCH_PERFMON_NMI_EVENT_SEL
678                 | ARCH_PERFMON_NMI_EVENT_UMASK;
679
680         /* setup the timer */
681         wrmsr(evntsel_msr, evntsel, 0);
682         nmi_hz = adjust_for_32bit_ctr(nmi_hz);
683         write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
684
685         wd->perfctr_msr = perfctr_msr;
686         wd->evntsel_msr = evntsel_msr;
687         wd->cccr_msr = 0;  /* unused */
688
689         /* ok, everything is initialized, announce that we're set */
690         cpu_nmi_set_wd_enabled();
691
692         apic_write(APIC_LVTPC, APIC_DM_NMI);
693         evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
694         wrmsr(evntsel_msr, evntsel, 0);
695         intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
696         return 1;
697 }
698
699 static struct wd_ops intel_arch_wd_ops __read_mostly = {
700         .reserve        = single_msr_reserve,
701         .unreserve      = single_msr_unreserve,
702         .setup          = setup_intel_arch_watchdog,
703         .rearm          = p6_rearm,
704         .stop           = single_msr_stop_watchdog,
705         .perfctr        = MSR_ARCH_PERFMON_PERFCTR1,
706         .evntsel        = MSR_ARCH_PERFMON_EVENTSEL1,
707 };
708
709 static void probe_nmi_watchdog(void)
710 {
711         switch (boot_cpu_data.x86_vendor) {
712         case X86_VENDOR_AMD:
713                 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
714                     boot_cpu_data.x86 != 16)
715                         return;
716                 wd_ops = &k7_wd_ops;
717                 break;
718         case X86_VENDOR_INTEL:
719                 /* Work around where perfctr1 doesn't have a working enable
720                  * bit as described in the following errata:
721                  * AE49 Core Duo and Intel Core Solo 65 nm
722                  * AN49 Intel Pentium Dual-Core
723                  * AF49 Dual-Core Intel Xeon Processor LV
724                  */
725                 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
726                     ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
727                      boot_cpu_data.x86_mask == 4))) {
728                         intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
729                         intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
730                 }
731                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
732                         wd_ops = &intel_arch_wd_ops;
733                         break;
734                 }
735                 switch (boot_cpu_data.x86) {
736                 case 6:
737                         if (boot_cpu_data.x86_model > 13)
738                                 return;
739
740                         wd_ops = &p6_wd_ops;
741                         break;
742                 case 15:
743                         wd_ops = &p4_wd_ops;
744                         break;
745                 default:
746                         return;
747                 }
748                 break;
749         }
750 }
751
752 /* Interface to nmi.c */
753
754 int lapic_watchdog_init(unsigned nmi_hz)
755 {
756         if (!wd_ops) {
757                 probe_nmi_watchdog();
758                 if (!wd_ops) {
759                         printk(KERN_INFO "NMI watchdog: CPU not supported\n");
760                         return -1;
761                 }
762
763                 if (!wd_ops->reserve()) {
764                         printk(KERN_ERR
765                                 "NMI watchdog: cannot reserve perfctrs\n");
766                         return -1;
767                 }
768         }
769
770         if (!(wd_ops->setup(nmi_hz))) {
771                 printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
772                        raw_smp_processor_id());
773                 return -1;
774         }
775
776         return 0;
777 }
778
779 void lapic_watchdog_stop(void)
780 {
781         if (wd_ops)
782                 wd_ops->stop();
783 }
784
785 unsigned lapic_adjust_nmi_hz(unsigned hz)
786 {
787         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
788         if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
789             wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
790                 hz = adjust_for_32bit_ctr(hz);
791         return hz;
792 }
793
794 int __kprobes lapic_wd_event(unsigned nmi_hz)
795 {
796         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
797         u64 ctr;
798
799         rdmsrl(wd->perfctr_msr, ctr);
800         if (ctr & wd_ops->checkbit) /* perfctr still running? */
801                 return 0;
802
803         wd_ops->rearm(wd, nmi_hz);
804         return 1;
805 }