perfcounters/powerpc: fix oops with multiple counters in a group
[linux-2.6] / arch / powerpc / kernel / perf_counter.c
1 /*
2  * Performance counter support - powerpc architecture code
3  *
4  * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version
9  * 2 of the License, or (at your option) any later version.
10  */
11 #include <linux/kernel.h>
12 #include <linux/sched.h>
13 #include <linux/perf_counter.h>
14 #include <linux/percpu.h>
15 #include <linux/hardirq.h>
16 #include <asm/reg.h>
17 #include <asm/pmc.h>
18 #include <asm/machdep.h>
19 #include <asm/firmware.h>
20
21 struct cpu_hw_counters {
22         int n_counters;
23         int n_percpu;
24         int disabled;
25         int n_added;
26         struct perf_counter *counter[MAX_HWCOUNTERS];
27         unsigned int events[MAX_HWCOUNTERS];
28         u64 mmcr[3];
29         u8 pmcs_enabled;
30 };
31 DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
32
33 struct power_pmu *ppmu;
34
35 /*
36  * Normally, to ignore kernel events we set the FCS (freeze counters
37  * in supervisor mode) bit in MMCR0, but if the kernel runs with the
38  * hypervisor bit set in the MSR, or if we are running on a processor
39  * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
40  * then we need to use the FCHV bit to ignore kernel events.
41  */
42 static unsigned int freeze_counters_kernel = MMCR0_FCS;
43
/* Debug hook; this implementation deliberately prints nothing. */
void perf_counter_print_debug(void)
{
}
47
48 /*
49  * Read one performance monitor counter (PMC).
50  */
51 static unsigned long read_pmc(int idx)
52 {
53         unsigned long val;
54
55         switch (idx) {
56         case 1:
57                 val = mfspr(SPRN_PMC1);
58                 break;
59         case 2:
60                 val = mfspr(SPRN_PMC2);
61                 break;
62         case 3:
63                 val = mfspr(SPRN_PMC3);
64                 break;
65         case 4:
66                 val = mfspr(SPRN_PMC4);
67                 break;
68         case 5:
69                 val = mfspr(SPRN_PMC5);
70                 break;
71         case 6:
72                 val = mfspr(SPRN_PMC6);
73                 break;
74         case 7:
75                 val = mfspr(SPRN_PMC7);
76                 break;
77         case 8:
78                 val = mfspr(SPRN_PMC8);
79                 break;
80         default:
81                 printk(KERN_ERR "oops trying to read PMC%d\n", idx);
82                 val = 0;
83         }
84         return val;
85 }
86
87 /*
88  * Write one PMC.
89  */
90 static void write_pmc(int idx, unsigned long val)
91 {
92         switch (idx) {
93         case 1:
94                 mtspr(SPRN_PMC1, val);
95                 break;
96         case 2:
97                 mtspr(SPRN_PMC2, val);
98                 break;
99         case 3:
100                 mtspr(SPRN_PMC3, val);
101                 break;
102         case 4:
103                 mtspr(SPRN_PMC4, val);
104                 break;
105         case 5:
106                 mtspr(SPRN_PMC5, val);
107                 break;
108         case 6:
109                 mtspr(SPRN_PMC6, val);
110                 break;
111         case 7:
112                 mtspr(SPRN_PMC7, val);
113                 break;
114         case 8:
115                 mtspr(SPRN_PMC8, val);
116                 break;
117         default:
118                 printk(KERN_ERR "oops trying to write PMC%d\n", idx);
119         }
120 }
121
122 /*
123  * Check if a set of events can all go on the PMU at once.
124  * If they can't, this will look at alternative codes for the events
125  * and see if any combination of alternative codes is feasible.
126  * The feasible set is returned in event[].
127  */
128 static int power_check_constraints(unsigned int event[], int n_ev)
129 {
130         u64 mask, value, nv;
131         unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
132         u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
133         u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
134         u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
135         int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
136         int i, j;
137         u64 addf = ppmu->add_fields;
138         u64 tadd = ppmu->test_adder;
139
140         if (n_ev > ppmu->n_counter)
141                 return -1;
142
143         /* First see if the events will go on as-is */
144         for (i = 0; i < n_ev; ++i) {
145                 alternatives[i][0] = event[i];
146                 if (ppmu->get_constraint(event[i], &amasks[i][0],
147                                          &avalues[i][0]))
148                         return -1;
149                 choice[i] = 0;
150         }
151         value = mask = 0;
152         for (i = 0; i < n_ev; ++i) {
153                 nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
154                 if ((((nv + tadd) ^ value) & mask) != 0 ||
155                     (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
156                         break;
157                 value = nv;
158                 mask |= amasks[i][0];
159         }
160         if (i == n_ev)
161                 return 0;       /* all OK */
162
163         /* doesn't work, gather alternatives... */
164         if (!ppmu->get_alternatives)
165                 return -1;
166         for (i = 0; i < n_ev; ++i) {
167                 n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
168                 for (j = 1; j < n_alt[i]; ++j)
169                         ppmu->get_constraint(alternatives[i][j],
170                                              &amasks[i][j], &avalues[i][j]);
171         }
172
173         /* enumerate all possibilities and see if any will work */
174         i = 0;
175         j = -1;
176         value = mask = nv = 0;
177         while (i < n_ev) {
178                 if (j >= 0) {
179                         /* we're backtracking, restore context */
180                         value = svalues[i];
181                         mask = smasks[i];
182                         j = choice[i];
183                 }
184                 /*
185                  * See if any alternative k for event i,
186                  * where k > j, will satisfy the constraints.
187                  */
188                 while (++j < n_alt[i]) {
189                         nv = (value | avalues[i][j]) +
190                                 (value & avalues[i][j] & addf);
191                         if ((((nv + tadd) ^ value) & mask) == 0 &&
192                             (((nv + tadd) ^ avalues[i][j])
193                              & amasks[i][j]) == 0)
194                                 break;
195                 }
196                 if (j >= n_alt[i]) {
197                         /*
198                          * No feasible alternative, backtrack
199                          * to event i-1 and continue enumerating its
200                          * alternatives from where we got up to.
201                          */
202                         if (--i < 0)
203                                 return -1;
204                 } else {
205                         /*
206                          * Found a feasible alternative for event i,
207                          * remember where we got up to with this event,
208                          * go on to the next event, and start with
209                          * the first alternative for it.
210                          */
211                         choice[i] = j;
212                         svalues[i] = value;
213                         smasks[i] = mask;
214                         value = nv;
215                         mask |= amasks[i][j];
216                         ++i;
217                         j = -1;
218                 }
219         }
220
221         /* OK, we have a feasible combination, tell the caller the solution */
222         for (i = 0; i < n_ev; ++i)
223                 event[i] = alternatives[i][choice[i]];
224         return 0;
225 }
226
227 /*
228  * Check if newly-added counters have consistent settings for
229  * exclude_{user,kernel,hv} with each other and any previously
230  * added counters.
231  */
232 static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
233 {
234         int eu, ek, eh;
235         int i, n;
236         struct perf_counter *counter;
237
238         n = n_prev + n_new;
239         if (n <= 1)
240                 return 0;
241
242         eu = ctrs[0]->hw_event.exclude_user;
243         ek = ctrs[0]->hw_event.exclude_kernel;
244         eh = ctrs[0]->hw_event.exclude_hv;
245         if (n_prev == 0)
246                 n_prev = 1;
247         for (i = n_prev; i < n; ++i) {
248                 counter = ctrs[i];
249                 if (counter->hw_event.exclude_user != eu ||
250                     counter->hw_event.exclude_kernel != ek ||
251                     counter->hw_event.exclude_hv != eh)
252                         return -EAGAIN;
253         }
254         return 0;
255 }
256
257 static void power_perf_read(struct perf_counter *counter)
258 {
259         long val, delta, prev;
260
261         if (!counter->hw.idx)
262                 return;
263         /*
264          * Performance monitor interrupts come even when interrupts
265          * are soft-disabled, as long as interrupts are hard-enabled.
266          * Therefore we treat them like NMIs.
267          */
268         do {
269                 prev = atomic64_read(&counter->hw.prev_count);
270                 barrier();
271                 val = read_pmc(counter->hw.idx);
272         } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
273
274         /* The counters are only 32 bits wide */
275         delta = (val - prev) & 0xfffffffful;
276         atomic64_add(delta, &counter->count);
277         atomic64_sub(delta, &counter->hw.period_left);
278 }
279
280 /*
281  * Disable all counters to prevent PMU interrupts and to allow
282  * counters to be added or removed.
283  */
284 u64 hw_perf_save_disable(void)
285 {
286         struct cpu_hw_counters *cpuhw;
287         unsigned long ret;
288         unsigned long flags;
289
290         local_irq_save(flags);
291         cpuhw = &__get_cpu_var(cpu_hw_counters);
292
293         ret = cpuhw->disabled;
294         if (!ret) {
295                 cpuhw->disabled = 1;
296                 cpuhw->n_added = 0;
297
298                 /*
299                  * Check if we ever enabled the PMU on this cpu.
300                  */
301                 if (!cpuhw->pmcs_enabled) {
302                         if (ppc_md.enable_pmcs)
303                                 ppc_md.enable_pmcs();
304                         cpuhw->pmcs_enabled = 1;
305                 }
306
307                 /*
308                  * Set the 'freeze counters' bit.
309                  * The barrier is to make sure the mtspr has been
310                  * executed and the PMU has frozen the counters
311                  * before we return.
312                  */
313                 mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
314                 mb();
315         }
316         local_irq_restore(flags);
317         return ret;
318 }
319
320 /*
321  * Re-enable all counters if disable == 0.
322  * If we were previously disabled and counters were added, then
323  * put the new config on the PMU.
324  */
325 void hw_perf_restore(u64 disable)
326 {
327         struct perf_counter *counter;
328         struct cpu_hw_counters *cpuhw;
329         unsigned long flags;
330         long i;
331         unsigned long val;
332         s64 left;
333         unsigned int hwc_index[MAX_HWCOUNTERS];
334
335         if (disable)
336                 return;
337         local_irq_save(flags);
338         cpuhw = &__get_cpu_var(cpu_hw_counters);
339         cpuhw->disabled = 0;
340
341         /*
342          * If we didn't change anything, or only removed counters,
343          * no need to recalculate MMCR* settings and reset the PMCs.
344          * Just reenable the PMU with the current MMCR* settings
345          * (possibly updated for removal of counters).
346          */
347         if (!cpuhw->n_added) {
348                 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
349                 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
350                 mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
351                 if (cpuhw->n_counters == 0)
352                         get_lppaca()->pmcregs_in_use = 0;
353                 goto out;
354         }
355
356         /*
357          * Compute MMCR* values for the new set of counters
358          */
359         if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
360                                cpuhw->mmcr)) {
361                 /* shouldn't ever get here */
362                 printk(KERN_ERR "oops compute_mmcr failed\n");
363                 goto out;
364         }
365
366         /*
367          * Add in MMCR0 freeze bits corresponding to the
368          * hw_event.exclude_* bits for the first counter.
369          * We have already checked that all counters have the
370          * same values for these bits as the first counter.
371          */
372         counter = cpuhw->counter[0];
373         if (counter->hw_event.exclude_user)
374                 cpuhw->mmcr[0] |= MMCR0_FCP;
375         if (counter->hw_event.exclude_kernel)
376                 cpuhw->mmcr[0] |= freeze_counters_kernel;
377         if (counter->hw_event.exclude_hv)
378                 cpuhw->mmcr[0] |= MMCR0_FCHV;
379
380         /*
381          * Write the new configuration to MMCR* with the freeze
382          * bit set and set the hardware counters to their initial values.
383          * Then unfreeze the counters.
384          */
385         get_lppaca()->pmcregs_in_use = 1;
386         mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
387         mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
388         mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
389                                 | MMCR0_FC);
390
391         /*
392          * Read off any pre-existing counters that need to move
393          * to another PMC.
394          */
395         for (i = 0; i < cpuhw->n_counters; ++i) {
396                 counter = cpuhw->counter[i];
397                 if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
398                         power_perf_read(counter);
399                         write_pmc(counter->hw.idx, 0);
400                         counter->hw.idx = 0;
401                 }
402         }
403
404         /*
405          * Initialize the PMCs for all the new and moved counters.
406          */
407         for (i = 0; i < cpuhw->n_counters; ++i) {
408                 counter = cpuhw->counter[i];
409                 if (counter->hw.idx)
410                         continue;
411                 val = 0;
412                 if (counter->hw_event.irq_period) {
413                         left = atomic64_read(&counter->hw.period_left);
414                         if (left < 0x80000000L)
415                                 val = 0x80000000L - left;
416                 }
417                 atomic64_set(&counter->hw.prev_count, val);
418                 counter->hw.idx = hwc_index[i] + 1;
419                 write_pmc(counter->hw.idx, val);
420         }
421         mb();
422         cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
423         mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
424
425  out:
426         local_irq_restore(flags);
427 }
428
429 static int collect_events(struct perf_counter *group, int max_count,
430                           struct perf_counter *ctrs[], unsigned int *events)
431 {
432         int n = 0;
433         struct perf_counter *counter;
434
435         if (!is_software_counter(group)) {
436                 if (n >= max_count)
437                         return -1;
438                 ctrs[n] = group;
439                 events[n++] = group->hw.config;
440         }
441         list_for_each_entry(counter, &group->sibling_list, list_entry) {
442                 if (!is_software_counter(counter) &&
443                     counter->state != PERF_COUNTER_STATE_OFF) {
444                         if (n >= max_count)
445                                 return -1;
446                         ctrs[n] = counter;
447                         events[n++] = counter->hw.config;
448                 }
449         }
450         return n;
451 }
452
453 static void counter_sched_in(struct perf_counter *counter, int cpu)
454 {
455         counter->state = PERF_COUNTER_STATE_ACTIVE;
456         counter->oncpu = cpu;
457         if (is_software_counter(counter))
458                 counter->hw_ops->enable(counter);
459 }
460
461 /*
462  * Called to enable a whole group of counters.
463  * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
464  * Assumes the caller has disabled interrupts and has
465  * frozen the PMU with hw_perf_save_disable.
466  */
467 int hw_perf_group_sched_in(struct perf_counter *group_leader,
468                struct perf_cpu_context *cpuctx,
469                struct perf_counter_context *ctx, int cpu)
470 {
471         struct cpu_hw_counters *cpuhw;
472         long i, n, n0;
473         struct perf_counter *sub;
474
475         cpuhw = &__get_cpu_var(cpu_hw_counters);
476         n0 = cpuhw->n_counters;
477         n = collect_events(group_leader, ppmu->n_counter - n0,
478                            &cpuhw->counter[n0], &cpuhw->events[n0]);
479         if (n < 0)
480                 return -EAGAIN;
481         if (check_excludes(cpuhw->counter, n0, n))
482                 return -EAGAIN;
483         if (power_check_constraints(cpuhw->events, n + n0))
484                 return -EAGAIN;
485         cpuhw->n_counters = n0 + n;
486         cpuhw->n_added += n;
487
488         /*
489          * OK, this group can go on; update counter states etc.,
490          * and enable any software counters
491          */
492         for (i = n0; i < n0 + n; ++i)
493                 cpuhw->counter[i]->hw.config = cpuhw->events[i];
494         cpuctx->active_oncpu += n;
495         n = 1;
496         counter_sched_in(group_leader, cpu);
497         list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
498                 if (sub->state != PERF_COUNTER_STATE_OFF) {
499                         counter_sched_in(sub, cpu);
500                         ++n;
501                 }
502         }
503         ctx->nr_active += n;
504
505         return 1;
506 }
507
508 /*
509  * Add a counter to the PMU.
510  * If all counters are not already frozen, then we disable and
511  * re-enable the PMU in order to get hw_perf_restore to do the
512  * actual work of reconfiguring the PMU.
513  */
514 static int power_perf_enable(struct perf_counter *counter)
515 {
516         struct cpu_hw_counters *cpuhw;
517         unsigned long flags;
518         u64 pmudis;
519         int n0;
520         int ret = -EAGAIN;
521
522         local_irq_save(flags);
523         pmudis = hw_perf_save_disable();
524
525         /*
526          * Add the counter to the list (if there is room)
527          * and check whether the total set is still feasible.
528          */
529         cpuhw = &__get_cpu_var(cpu_hw_counters);
530         n0 = cpuhw->n_counters;
531         if (n0 >= ppmu->n_counter)
532                 goto out;
533         cpuhw->counter[n0] = counter;
534         cpuhw->events[n0] = counter->hw.config;
535         if (check_excludes(cpuhw->counter, n0, 1))
536                 goto out;
537         if (power_check_constraints(cpuhw->events, n0 + 1))
538                 goto out;
539
540         counter->hw.config = cpuhw->events[n0];
541         ++cpuhw->n_counters;
542         ++cpuhw->n_added;
543
544         ret = 0;
545  out:
546         hw_perf_restore(pmudis);
547         local_irq_restore(flags);
548         return ret;
549 }
550
551 /*
552  * Remove a counter from the PMU.
553  */
554 static void power_perf_disable(struct perf_counter *counter)
555 {
556         struct cpu_hw_counters *cpuhw;
557         long i;
558         u64 pmudis;
559         unsigned long flags;
560
561         local_irq_save(flags);
562         pmudis = hw_perf_save_disable();
563
564         power_perf_read(counter);
565
566         cpuhw = &__get_cpu_var(cpu_hw_counters);
567         for (i = 0; i < cpuhw->n_counters; ++i) {
568                 if (counter == cpuhw->counter[i]) {
569                         while (++i < cpuhw->n_counters)
570                                 cpuhw->counter[i-1] = cpuhw->counter[i];
571                         --cpuhw->n_counters;
572                         ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
573                         write_pmc(counter->hw.idx, 0);
574                         counter->hw.idx = 0;
575                         break;
576                 }
577         }
578         if (cpuhw->n_counters == 0) {
579                 /* disable exceptions if no counters are running */
580                 cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
581         }
582
583         hw_perf_restore(pmudis);
584         local_irq_restore(flags);
585 }
586
587 struct hw_perf_counter_ops power_perf_ops = {
588         .enable = power_perf_enable,
589         .disable = power_perf_disable,
590         .read = power_perf_read
591 };
592
593 const struct hw_perf_counter_ops *
594 hw_perf_counter_init(struct perf_counter *counter)
595 {
596         unsigned long ev;
597         struct perf_counter *ctrs[MAX_HWCOUNTERS];
598         unsigned int events[MAX_HWCOUNTERS];
599         int n;
600
601         if (!ppmu)
602                 return NULL;
603         if ((s64)counter->hw_event.irq_period < 0)
604                 return NULL;
605         ev = counter->hw_event.type;
606         if (!counter->hw_event.raw) {
607                 if (ev >= ppmu->n_generic ||
608                     ppmu->generic_events[ev] == 0)
609                         return NULL;
610                 ev = ppmu->generic_events[ev];
611         }
612         counter->hw.config_base = ev;
613         counter->hw.idx = 0;
614
615         /*
616          * If we are not running on a hypervisor, force the
617          * exclude_hv bit to 0 so that we don't care what
618          * the user set it to.
619          */
620         if (!firmware_has_feature(FW_FEATURE_LPAR))
621                 counter->hw_event.exclude_hv = 0;
622         
623         /*
624          * If this is in a group, check if it can go on with all the
625          * other hardware counters in the group.  We assume the counter
626          * hasn't been linked into its leader's sibling list at this point.
627          */
628         n = 0;
629         if (counter->group_leader != counter) {
630                 n = collect_events(counter->group_leader, ppmu->n_counter - 1,
631                                    ctrs, events);
632                 if (n < 0)
633                         return NULL;
634         }
635         events[n] = ev;
636         ctrs[n] = counter;
637         if (check_excludes(ctrs, n, 1))
638                 return NULL;
639         if (power_check_constraints(events, n + 1))
640                 return NULL;
641
642         counter->hw.config = events[n];
643         atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
644         return &power_perf_ops;
645 }
646
647 /*
648  * Handle wakeups.
649  */
650 void perf_counter_do_pending(void)
651 {
652         int i;
653         struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
654         struct perf_counter *counter;
655
656         set_perf_counter_pending(0);
657         for (i = 0; i < cpuhw->n_counters; ++i) {
658                 counter = cpuhw->counter[i];
659                 if (counter && counter->wakeup_pending) {
660                         counter->wakeup_pending = 0;
661                         wake_up(&counter->waitq);
662                 }
663         }
664 }
665
666 /*
667  * Record data for an irq counter.
668  * This function was lifted from the x86 code; maybe it should
669  * go in the core?
670  */
671 static void perf_store_irq_data(struct perf_counter *counter, u64 data)
672 {
673         struct perf_data *irqdata = counter->irqdata;
674
675         if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
676                 irqdata->overrun++;
677         } else {
678                 u64 *p = (u64 *) &irqdata->data[irqdata->len];
679
680                 *p = data;
681                 irqdata->len += sizeof(u64);
682         }
683 }
684
685 /*
686  * Record all the values of the counters in a group
687  */
688 static void perf_handle_group(struct perf_counter *counter)
689 {
690         struct perf_counter *leader, *sub;
691
692         leader = counter->group_leader;
693         list_for_each_entry(sub, &leader->sibling_list, list_entry) {
694                 if (sub != counter)
695                         sub->hw_ops->read(sub);
696                 perf_store_irq_data(counter, sub->hw_event.type);
697                 perf_store_irq_data(counter, atomic64_read(&sub->count));
698         }
699 }
700
701 /*
702  * A counter has overflowed; update its count and record
703  * things if requested.  Note that interrupts are hard-disabled
704  * here so there is no possibility of being interrupted.
705  */
706 static void record_and_restart(struct perf_counter *counter, long val,
707                                struct pt_regs *regs)
708 {
709         s64 prev, delta, left;
710         int record = 0;
711
712         /* we don't have to worry about interrupts here */
713         prev = atomic64_read(&counter->hw.prev_count);
714         delta = (val - prev) & 0xfffffffful;
715         atomic64_add(delta, &counter->count);
716
717         /*
718          * See if the total period for this counter has expired,
719          * and update for the next period.
720          */
721         val = 0;
722         left = atomic64_read(&counter->hw.period_left) - delta;
723         if (counter->hw_event.irq_period) {
724                 if (left <= 0) {
725                         left += counter->hw_event.irq_period;
726                         if (left <= 0)
727                                 left = counter->hw_event.irq_period;
728                         record = 1;
729                 }
730                 if (left < 0x80000000L)
731                         val = 0x80000000L - left;
732         }
733         write_pmc(counter->hw.idx, val);
734         atomic64_set(&counter->hw.prev_count, val);
735         atomic64_set(&counter->hw.period_left, left);
736
737         /*
738          * Finally record data if requested.
739          */
740         if (record) {
741                 switch (counter->hw_event.record_type) {
742                 case PERF_RECORD_SIMPLE:
743                         break;
744                 case PERF_RECORD_IRQ:
745                         perf_store_irq_data(counter, instruction_pointer(regs));
746                         counter->wakeup_pending = 1;
747                         break;
748                 case PERF_RECORD_GROUP:
749                         perf_handle_group(counter);
750                         counter->wakeup_pending = 1;
751                         break;
752                 }
753         }
754 }
755
756 /*
757  * Performance monitor interrupt stuff
758  */
759 static void perf_counter_interrupt(struct pt_regs *regs)
760 {
761         int i;
762         struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
763         struct perf_counter *counter;
764         long val;
765         int need_wakeup = 0, found = 0;
766
767         for (i = 0; i < cpuhw->n_counters; ++i) {
768                 counter = cpuhw->counter[i];
769                 val = read_pmc(counter->hw.idx);
770                 if ((int)val < 0) {
771                         /* counter has overflowed */
772                         found = 1;
773                         record_and_restart(counter, val, regs);
774                         if (counter->wakeup_pending)
775                                 need_wakeup = 1;
776                 }
777         }
778
779         /*
780          * In case we didn't find and reset the counter that caused
781          * the interrupt, scan all counters and reset any that are
782          * negative, to avoid getting continual interrupts.
783          * Any that we processed in the previous loop will not be negative.
784          */
785         if (!found) {
786                 for (i = 0; i < ppmu->n_counter; ++i) {
787                         val = read_pmc(i + 1);
788                         if ((int)val < 0)
789                                 write_pmc(i + 1, 0);
790                 }
791         }
792
793         /*
794          * Reset MMCR0 to its normal value.  This will set PMXE and
795          * clear FC (freeze counters) and PMAO (perf mon alert occurred)
796          * and thus allow interrupts to occur again.
797          * XXX might want to use MSR.PM to keep the counters frozen until
798          * we get back out of this interrupt.
799          */
800         mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
801
802         /*
803          * If we need a wakeup, check whether interrupts were soft-enabled
804          * when we took the interrupt.  If they were, we can wake stuff up
805          * immediately; otherwise we'll have to set a flag and do the
806          * wakeup when interrupts get soft-enabled.
807          */
808         if (need_wakeup) {
809                 if (regs->softe) {
810                         irq_enter();
811                         perf_counter_do_pending();
812                         irq_exit();
813                 } else {
814                         set_perf_counter_pending(1);
815                 }
816         }
817 }
818
819 void hw_perf_counter_setup(int cpu)
820 {
821         struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
822
823         memset(cpuhw, 0, sizeof(*cpuhw));
824         cpuhw->mmcr[0] = MMCR0_FC;
825 }
826
/* PMU descriptions defined elsewhere; init_perf_counters() picks one */
extern struct power_pmu ppc970_pmu;
extern struct power_pmu power5_pmu;
extern struct power_pmu power6_pmu;
830
831 static int init_perf_counters(void)
832 {
833         unsigned long pvr;
834
835         if (reserve_pmc_hardware(perf_counter_interrupt)) {
836                 printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
837                 return -EBUSY;
838         }
839
840         /* XXX should get this from cputable */
841         pvr = mfspr(SPRN_PVR);
842         switch (PVR_VER(pvr)) {
843         case PV_970:
844         case PV_970FX:
845         case PV_970MP:
846                 ppmu = &ppc970_pmu;
847                 break;
848         case PV_POWER5:
849                 ppmu = &power5_pmu;
850                 break;
851         case 0x3e:
852                 ppmu = &power6_pmu;
853                 break;
854         }
855
856         /*
857          * Use FCHV to ignore kernel events if MSR.HV is set.
858          */
859         if (mfmsr() & MSR_HV)
860                 freeze_counters_kernel = MMCR0_FCHV;
861
862         return 0;
863 }
864
865 arch_initcall(init_perf_counters);