perf_counter: avoid recursion
[linux-2.6] / kernel / perf_counter.c
1 /*
2  * Performance counter core code
3  *
5  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
6  *  Copyright (C) 2008 Red Hat, Inc., Ingo Molnar
6  *
7  *  For licensing details see kernel-base/COPYING
8  */
9
10 #include <linux/fs.h>
11 #include <linux/cpu.h>
12 #include <linux/smp.h>
13 #include <linux/file.h>
14 #include <linux/poll.h>
15 #include <linux/sysfs.h>
16 #include <linux/ptrace.h>
17 #include <linux/percpu.h>
18 #include <linux/uaccess.h>
19 #include <linux/syscalls.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/kernel_stat.h>
22 #include <linux/perf_counter.h>
23 #include <linux/mm.h>
24 #include <linux/vmstat.h>
25 #include <linux/rculist.h>
26 #include <linux/hardirq.h>
27
28 #include <asm/irq_regs.h>
29
30 /*
31  * Each CPU has a list of per CPU counters:
32  */
33 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
34
35 int perf_max_counters __read_mostly = 1;
36 static int perf_reserved_percpu __read_mostly;
37 static int perf_overcommit __read_mostly = 1;
38
39 /*
40  * Mutex for (sysadmin-configurable) counter reservations:
41  */
42 static DEFINE_MUTEX(perf_resource_mutex);
43
44 /*
45  * Architecture provided APIs - weak aliases:
46  */
47 extern __weak const struct hw_perf_counter_ops *
48 hw_perf_counter_init(struct perf_counter *counter)
49 {
50         return NULL;
51 }
52
53 u64 __weak hw_perf_save_disable(void)           { return 0; }
54 void __weak hw_perf_restore(u64 ctrl)           { barrier(); }
55 void __weak hw_perf_counter_setup(int cpu)      { barrier(); }
56 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
57                struct perf_cpu_context *cpuctx,
58                struct perf_counter_context *ctx, int cpu)
59 {
60         return 0;
61 }
62
63 void __weak perf_counter_print_debug(void)      { }
64
65 static void
66 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
67 {
68         struct perf_counter *group_leader = counter->group_leader;
69
70         /*
71          * Depending on whether it is a standalone or sibling counter,
72          * add it straight to the context's counter list, or to the group
73          * leader's sibling list:
74          */
75         if (counter->group_leader == counter)
76                 list_add_tail(&counter->list_entry, &ctx->counter_list);
77         else
78                 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
79
80         list_add_rcu(&counter->event_entry, &ctx->event_list);
81 }
82
83 static void
84 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
85 {
86         struct perf_counter *sibling, *tmp;
87
88         list_del_init(&counter->list_entry);
89         list_del_rcu(&counter->event_entry);
90
91         /*
92          * If this was a group counter with sibling counters then
93          * upgrade the siblings to singleton counters by adding them
94          * to the context list directly:
95          */
96         list_for_each_entry_safe(sibling, tmp,
97                                  &counter->sibling_list, list_entry) {
98
99                 list_move_tail(&sibling->list_entry, &ctx->counter_list);
100                 sibling->group_leader = sibling;
101         }
102 }
103
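/*
 * Deactivate a single counter: mark it INACTIVE, stop it at the
 * hardware level and drop the context's and CPU's active bookkeeping.
 */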
104 static void
105 counter_sched_out(struct perf_counter *counter,
106                   struct perf_cpu_context *cpuctx,
107                   struct perf_counter_context *ctx)
108 {
109         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
110                 return;
111
112         counter->state = PERF_COUNTER_STATE_INACTIVE;
113         counter->hw_ops->disable(counter);
114         counter->oncpu = -1;
115
116         if (!is_software_counter(counter))
117                 cpuctx->active_oncpu--;
118         ctx->nr_active--;
119         if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
120                 cpuctx->exclusive = 0;
121 }
122
123 static void
124 group_sched_out(struct perf_counter *group_counter,
125                 struct perf_cpu_context *cpuctx,
126                 struct perf_counter_context *ctx)
127 {
128         struct perf_counter *counter;
129
130         if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
131                 return;
132
133         counter_sched_out(group_counter, cpuctx, ctx);
134
135         /*
136          * Schedule out siblings (if any):
137          */
138         list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
139                 counter_sched_out(counter, cpuctx, ctx);
140
141         if (group_counter->hw_event.exclusive)
142                 cpuctx->exclusive = 0;
143 }
144
145 /*
146  * Cross CPU call to remove a performance counter
147  *
148  * We disable the counter on the hardware level first. After that we
149  * remove it from the context list.
150  */
151 static void __perf_counter_remove_from_context(void *info)
152 {
153         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
154         struct perf_counter *counter = info;
155         struct perf_counter_context *ctx = counter->ctx;
156         unsigned long flags;
157         u64 perf_flags;
158
159         /*
160          * If this is a task context, we need to check whether it is
161          * the current task context of this cpu. If not it has been
162          * scheduled out before the smp call arrived.
163          */
164         if (ctx->task && cpuctx->task_ctx != ctx)
165                 return;
166
167         curr_rq_lock_irq_save(&flags);
168         spin_lock(&ctx->lock);
169
170         counter_sched_out(counter, cpuctx, ctx);
171
172         counter->task = NULL;
173         ctx->nr_counters--;
174
175         /*
176          * Protect the list operation against NMI by disabling the
177          * counters on a global level. NOP for non NMI based counters.
178          */
179         perf_flags = hw_perf_save_disable();
180         list_del_counter(counter, ctx);
181         hw_perf_restore(perf_flags);
182
183         if (!ctx->task) {
184                 /*
185                  * Allow more per task counters with respect to the
186                  * reservation:
187                  */
188                 cpuctx->max_pertask =
189                         min(perf_max_counters - ctx->nr_counters,
190                             perf_max_counters - perf_reserved_percpu);
191         }
192
193         spin_unlock(&ctx->lock);
194         curr_rq_unlock_irq_restore(&flags);
195 }
196
197
198 /*
199  * Remove the counter from a task's (or a CPU's) list of counters.
200  *
201  * Must be called with counter->mutex and ctx->mutex held.
202  *
203  * CPU counters are removed with an smp call. For task counters we only
204  * call when the task is on a CPU.
205  */
206 static void perf_counter_remove_from_context(struct perf_counter *counter)
207 {
208         struct perf_counter_context *ctx = counter->ctx;
209         struct task_struct *task = ctx->task;
210
211         if (!task) {
212                 /*
213                  * Per cpu counters are removed via an smp call and
214                  * the removal is always sucessful.
215                  */
216                 smp_call_function_single(counter->cpu,
217                                          __perf_counter_remove_from_context,
218                                          counter, 1);
219                 return;
220         }
221
222 retry:
223         task_oncpu_function_call(task, __perf_counter_remove_from_context,
224                                  counter);
225
226         spin_lock_irq(&ctx->lock);
227         /*
228          * If the context is active we need to retry the smp call.
229          */
230         if (ctx->nr_active && !list_empty(&counter->list_entry)) {
231                 spin_unlock_irq(&ctx->lock);
232                 goto retry;
233         }
234
235         /*
236          * The lock prevents this context from being scheduled in, so we
237          * can remove the counter safely if the call above did not
238          * succeed.
239          */
240         if (!list_empty(&counter->list_entry)) {
241                 ctx->nr_counters--;
242                 list_del_counter(counter, ctx);
243                 counter->task = NULL;
244         }
245         spin_unlock_irq(&ctx->lock);
246 }
247
248 /*
249  * Cross CPU call to disable a performance counter
250  */
251 static void __perf_counter_disable(void *info)
252 {
253         struct perf_counter *counter = info;
254         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
255         struct perf_counter_context *ctx = counter->ctx;
256         unsigned long flags;
257
258         /*
259          * If this is a per-task counter, need to check whether this
260          * counter's task is the current task on this cpu.
261          */
262         if (ctx->task && cpuctx->task_ctx != ctx)
263                 return;
264
265         curr_rq_lock_irq_save(&flags);
266         spin_lock(&ctx->lock);
267
268         /*
269          * If the counter is on, turn it off.
270          * If it is in error state, leave it in error state.
271          */
272         if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
273                 if (counter == counter->group_leader)
274                         group_sched_out(counter, cpuctx, ctx);
275                 else
276                         counter_sched_out(counter, cpuctx, ctx);
277                 counter->state = PERF_COUNTER_STATE_OFF;
278         }
279
280         spin_unlock(&ctx->lock);
281         curr_rq_unlock_irq_restore(&flags);
282 }
283
284 /*
285  * Disable a counter.
286  */
287 static void perf_counter_disable(struct perf_counter *counter)
288 {
289         struct perf_counter_context *ctx = counter->ctx;
290         struct task_struct *task = ctx->task;
291
292         if (!task) {
293                 /*
294                  * Disable the counter on the cpu that it's on
295                  */
296                 smp_call_function_single(counter->cpu, __perf_counter_disable,
297                                          counter, 1);
298                 return;
299         }
300
301  retry:
302         task_oncpu_function_call(task, __perf_counter_disable, counter);
303
304         spin_lock_irq(&ctx->lock);
305         /*
306          * If the counter is still active, we need to retry the cross-call.
307          */
308         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
309                 spin_unlock_irq(&ctx->lock);
310                 goto retry;
311         }
312
313         /*
314          * Since we have the lock this context can't be scheduled
315          * in, so we can change the state safely.
316          */
317         if (counter->state == PERF_COUNTER_STATE_INACTIVE)
318                 counter->state = PERF_COUNTER_STATE_OFF;
319
320         spin_unlock_irq(&ctx->lock);
321 }
322
323 /*
324  * Disable a counter and all its children.
325  */
326 static void perf_counter_disable_family(struct perf_counter *counter)
327 {
328         struct perf_counter *child;
329
330         perf_counter_disable(counter);
331
332         /*
333          * Lock the mutex to protect the list of children
334          */
335         mutex_lock(&counter->mutex);
336         list_for_each_entry(child, &counter->child_list, child_list)
337                 perf_counter_disable(child);
338         mutex_unlock(&counter->mutex);
339 }
340
341 static int
342 counter_sched_in(struct perf_counter *counter,
343                  struct perf_cpu_context *cpuctx,
344                  struct perf_counter_context *ctx,
345                  int cpu)
346 {
347         if (counter->state <= PERF_COUNTER_STATE_OFF)
348                 return 0;
349
350         counter->state = PERF_COUNTER_STATE_ACTIVE;
351         counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
352         /*
353          * The new state must be visible before we turn it on in the hardware:
354          */
355         smp_wmb();
356
357         if (counter->hw_ops->enable(counter)) {
358                 counter->state = PERF_COUNTER_STATE_INACTIVE;
359                 counter->oncpu = -1;
360                 return -EAGAIN;
361         }
362
363         if (!is_software_counter(counter))
364                 cpuctx->active_oncpu++;
365         ctx->nr_active++;
366
367         if (counter->hw_event.exclusive)
368                 cpuctx->exclusive = 1;
369
370         return 0;
371 }
372
373 /*
374  * Return 1 for a group consisting entirely of software counters,
375  * 0 if the group contains any hardware counters.
376  */
377 static int is_software_only_group(struct perf_counter *leader)
378 {
379         struct perf_counter *counter;
380
381         if (!is_software_counter(leader))
382                 return 0;
383         list_for_each_entry(counter, &leader->sibling_list, list_entry)
384                 if (!is_software_counter(counter))
385                         return 0;
386         return 1;
387 }
388
389 /*
390  * Work out whether we can put this counter group on the CPU now.
391  */
392 static int group_can_go_on(struct perf_counter *counter,
393                            struct perf_cpu_context *cpuctx,
394                            int can_add_hw)
395 {
396         /*
397          * Groups consisting entirely of software counters can always go on.
398          */
399         if (is_software_only_group(counter))
400                 return 1;
401         /*
402          * If an exclusive group is already on, no other hardware
403          * counters can go on.
404          */
405         if (cpuctx->exclusive)
406                 return 0;
407         /*
408          * If this group is exclusive and there are already
409          * counters on the CPU, it can't go on.
410          */
411         if (counter->hw_event.exclusive && cpuctx->active_oncpu)
412                 return 0;
413         /*
414          * Otherwise, try to add it if all previous groups were able
415          * to go on.
416          */
417         return can_add_hw;
418 }
419
420 /*
421  * Cross CPU call to install and enable a performance counter
422  */
423 static void __perf_install_in_context(void *info)
424 {
425         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
426         struct perf_counter *counter = info;
427         struct perf_counter_context *ctx = counter->ctx;
428         struct perf_counter *leader = counter->group_leader;
429         int cpu = smp_processor_id();
430         unsigned long flags;
431         u64 perf_flags;
432         int err;
433
434         /*
435          * If this is a task context, we need to check whether it is
436          * the current task context of this cpu. If not it has been
437          * scheduled out before the smp call arrived.
438          */
439         if (ctx->task && cpuctx->task_ctx != ctx)
440                 return;
441
442         curr_rq_lock_irq_save(&flags);
443         spin_lock(&ctx->lock);
444
445         /*
446          * Protect the list operation against NMI by disabling the
447          * counters on a global level. NOP for non NMI based counters.
448          */
449         perf_flags = hw_perf_save_disable();
450
451         list_add_counter(counter, ctx);
452         ctx->nr_counters++;
453         counter->prev_state = PERF_COUNTER_STATE_OFF;
454
455         /*
456          * Don't put the counter on if it is disabled or if
457          * it is in a group and the group isn't on.
458          */
459         if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
460             (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
461                 goto unlock;
462
463         /*
464          * An exclusive counter can't go on if there are already active
465          * hardware counters, and no hardware counter can go on if there
466          * is already an exclusive counter on.
467          */
468         if (!group_can_go_on(counter, cpuctx, 1))
469                 err = -EEXIST;
470         else
471                 err = counter_sched_in(counter, cpuctx, ctx, cpu);
472
473         if (err) {
474                 /*
475                  * This counter couldn't go on.  If it is in a group
476                  * then we have to pull the whole group off.
477                  * If the counter group is pinned then put it in error state.
478                  */
479                 if (leader != counter)
480                         group_sched_out(leader, cpuctx, ctx);
481                 if (leader->hw_event.pinned)
482                         leader->state = PERF_COUNTER_STATE_ERROR;
483         }
484
485         if (!err && !ctx->task && cpuctx->max_pertask)
486                 cpuctx->max_pertask--;
487
488  unlock:
489         hw_perf_restore(perf_flags);
490
491         spin_unlock(&ctx->lock);
492         curr_rq_unlock_irq_restore(&flags);
493 }
494
495 /*
496  * Attach a performance counter to a context
497  *
498  * First we add the counter to the list with the hardware enable bit
499  * in counter->hw_config cleared.
500  *
501  * If the counter is attached to a task which is on a CPU we use a smp
502  * call to enable it in the task context. The task might have been
503  * scheduled away, but we check this in the smp call again.
504  *
505  * Must be called with ctx->mutex held.
506  */
507 static void
508 perf_install_in_context(struct perf_counter_context *ctx,
509                         struct perf_counter *counter,
510                         int cpu)
511 {
512         struct task_struct *task = ctx->task;
513
514         if (!task) {
515                 /*
516                  * Per cpu counters are installed via an smp call and
517                  * the install is always successful.
518                  */
519                 smp_call_function_single(cpu, __perf_install_in_context,
520                                          counter, 1);
521                 return;
522         }
523
524         counter->task = task;
525 retry:
526         task_oncpu_function_call(task, __perf_install_in_context,
527                                  counter);
528
529         spin_lock_irq(&ctx->lock);
530         /*
531          * If the context is active we need to retry the smp call.
532          */
533         if (ctx->is_active && list_empty(&counter->list_entry)) {
534                 spin_unlock_irq(&ctx->lock);
535                 goto retry;
536         }
537
538         /*
539          * The lock prevents this context from being scheduled in, so we
540          * can add the counter safely if the call above did not
541          * succeed.
542          */
543         if (list_empty(&counter->list_entry)) {
544                 list_add_counter(counter, ctx);
545                 ctx->nr_counters++;
546         }
547         spin_unlock_irq(&ctx->lock);
548 }
549
550 /*
551  * Cross CPU call to enable a performance counter
552  */
553 static void __perf_counter_enable(void *info)
554 {
555         struct perf_counter *counter = info;
556         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
557         struct perf_counter_context *ctx = counter->ctx;
558         struct perf_counter *leader = counter->group_leader;
559         unsigned long flags;
560         int err;
561
562         /*
563          * If this is a per-task counter, need to check whether this
564          * counter's task is the current task on this cpu.
565          */
566         if (ctx->task && cpuctx->task_ctx != ctx)
567                 return;
568
569         curr_rq_lock_irq_save(&flags);
570         spin_lock(&ctx->lock);
571
572         counter->prev_state = counter->state;
573         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
574                 goto unlock;
575         counter->state = PERF_COUNTER_STATE_INACTIVE;
576
577         /*
578          * If the counter is in a group and isn't the group leader,
579          * then don't put it on unless the group is on.
580          */
581         if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
582                 goto unlock;
583
584         if (!group_can_go_on(counter, cpuctx, 1))
585                 err = -EEXIST;
586         else
587                 err = counter_sched_in(counter, cpuctx, ctx,
588                                        smp_processor_id());
589
590         if (err) {
591                 /*
592                  * If this counter can't go on and it's part of a
593                  * group, then the whole group has to come off.
594                  */
595                 if (leader != counter)
596                         group_sched_out(leader, cpuctx, ctx);
597                 if (leader->hw_event.pinned)
598                         leader->state = PERF_COUNTER_STATE_ERROR;
599         }
600
601  unlock:
602         spin_unlock(&ctx->lock);
603         curr_rq_unlock_irq_restore(&flags);
604 }
605
606 /*
607  * Enable a counter.
608  */
609 static void perf_counter_enable(struct perf_counter *counter)
610 {
611         struct perf_counter_context *ctx = counter->ctx;
612         struct task_struct *task = ctx->task;
613
614         if (!task) {
615                 /*
616                  * Enable the counter on the cpu that it's on
617                  */
618                 smp_call_function_single(counter->cpu, __perf_counter_enable,
619                                          counter, 1);
620                 return;
621         }
622
623         spin_lock_irq(&ctx->lock);
624         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
625                 goto out;
626
627         /*
628          * If the counter is in error state, clear that first.
629          * That way, if we see the counter in error state below, we
630          * know that it has gone back into error state, as distinct
631          * from the task having been scheduled away before the
632          * cross-call arrived.
633          */
634         if (counter->state == PERF_COUNTER_STATE_ERROR)
635                 counter->state = PERF_COUNTER_STATE_OFF;
636
637  retry:
638         spin_unlock_irq(&ctx->lock);
639         task_oncpu_function_call(task, __perf_counter_enable, counter);
640
641         spin_lock_irq(&ctx->lock);
642
643         /*
644          * If the context is active and the counter is still off,
645          * we need to retry the cross-call.
646          */
647         if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
648                 goto retry;
649
650         /*
651          * Since we have the lock this context can't be scheduled
652          * in, so we can change the state safely.
653          */
654         if (counter->state == PERF_COUNTER_STATE_OFF)
655                 counter->state = PERF_COUNTER_STATE_INACTIVE;
656  out:
657         spin_unlock_irq(&ctx->lock);
658 }
659
660 /*
661  * Enable a counter and all its children.
662  */
663 static void perf_counter_enable_family(struct perf_counter *counter)
664 {
665         struct perf_counter *child;
666
667         perf_counter_enable(counter);
668
669         /*
670          * Lock the mutex to protect the list of children
671          */
672         mutex_lock(&counter->mutex);
673         list_for_each_entry(child, &counter->child_list, child_list)
674                 perf_counter_enable(child);
675         mutex_unlock(&counter->mutex);
676 }
677
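/*
 * Schedule out every active counter group in the context, with the
 * hardware counters globally disabled around the list walk, and mark
 * the context inactive.
 */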
678 void __perf_counter_sched_out(struct perf_counter_context *ctx,
679                               struct perf_cpu_context *cpuctx)
680 {
681         struct perf_counter *counter;
682         u64 flags;
683
684         spin_lock(&ctx->lock);
685         ctx->is_active = 0;
686         if (likely(!ctx->nr_counters))
687                 goto out;
688
689         flags = hw_perf_save_disable();
690         if (ctx->nr_active) {
691                 list_for_each_entry(counter, &ctx->counter_list, list_entry)
692                         group_sched_out(counter, cpuctx, ctx);
693         }
694         hw_perf_restore(flags);
695  out:
696         spin_unlock(&ctx->lock);
697 }
698
699 /*
700  * Called from scheduler to remove the counters of the current task,
701  * with interrupts disabled.
702  *
703  * We stop each counter and update the counter value in counter->count.
704  *
705  * This does not protect us against NMI, but disable()
706  * sets the disabled bit in the control field of counter _before_
707  * accessing the counter control register. If a NMI hits, then it will
708  * not restart the counter.
709  */
710 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
711 {
712         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
713         struct perf_counter_context *ctx = &task->perf_counter_ctx;
714         struct pt_regs *regs;
715
716         if (likely(!cpuctx->task_ctx))
717                 return;
718
719         regs = task_pt_regs(task);
720         perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);
721         __perf_counter_sched_out(ctx, cpuctx);
722
723         cpuctx->task_ctx = NULL;
724 }
725
726 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
727 {
728         __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
729 }
730
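/*
 * Schedule in a counter group as one unit: hw_perf_group_sched_in()
 * gives the architecture a chance to handle (or refuse) the whole
 * group at once; otherwise the leader and its siblings are scheduled
 * in one by one, and any partially scheduled group is rolled back.
 */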
731 static int
732 group_sched_in(struct perf_counter *group_counter,
733                struct perf_cpu_context *cpuctx,
734                struct perf_counter_context *ctx,
735                int cpu)
736 {
737         struct perf_counter *counter, *partial_group;
738         int ret;
739
740         if (group_counter->state == PERF_COUNTER_STATE_OFF)
741                 return 0;
742
743         ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
744         if (ret)
745                 return ret < 0 ? ret : 0;
746
747         group_counter->prev_state = group_counter->state;
748         if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
749                 return -EAGAIN;
750
751         /*
752          * Schedule in siblings as one group (if any):
753          */
754         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
755                 counter->prev_state = counter->state;
756                 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
757                         partial_group = counter;
758                         goto group_error;
759                 }
760         }
761
762         return 0;
763
764 group_error:
765         /*
766          * Groups can be scheduled in as one unit only, so undo any
767          * partial group before returning:
768          */
769         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
770                 if (counter == partial_group)
771                         break;
772                 counter_sched_out(counter, cpuctx, ctx);
773         }
774         counter_sched_out(group_counter, cpuctx, ctx);
775
776         return -EAGAIN;
777 }
778
779 static void
780 __perf_counter_sched_in(struct perf_counter_context *ctx,
781                         struct perf_cpu_context *cpuctx, int cpu)
782 {
783         struct perf_counter *counter;
784         u64 flags;
785         int can_add_hw = 1;
786
787         spin_lock(&ctx->lock);
788         ctx->is_active = 1;
789         if (likely(!ctx->nr_counters))
790                 goto out;
791
792         flags = hw_perf_save_disable();
793
794         /*
795          * First go through the list and put on any pinned groups
796          * in order to give them the best chance of going on.
797          */
798         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
799                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
800                     !counter->hw_event.pinned)
801                         continue;
802                 if (counter->cpu != -1 && counter->cpu != cpu)
803                         continue;
804
805                 if (group_can_go_on(counter, cpuctx, 1))
806                         group_sched_in(counter, cpuctx, ctx, cpu);
807
808                 /*
809                  * If this pinned group hasn't been scheduled,
810                  * put it in error state.
811                  */
812                 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
813                         counter->state = PERF_COUNTER_STATE_ERROR;
814         }
815
816         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
817                 /*
818                  * Ignore counters in OFF or ERROR state, and
819                  * ignore pinned counters since we did them already.
820                  */
821                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
822                     counter->hw_event.pinned)
823                         continue;
824
825                 /*
826                  * Listen to the 'cpu' scheduling filter constraint
827                  * of counters:
828                  */
829                 if (counter->cpu != -1 && counter->cpu != cpu)
830                         continue;
831
832                 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
833                         if (group_sched_in(counter, cpuctx, ctx, cpu))
834                                 can_add_hw = 0;
835                 }
836         }
837         hw_perf_restore(flags);
838  out:
839         spin_unlock(&ctx->lock);
840 }
841
842 /*
843  * Called from scheduler to add the counters of the current task
844  * with interrupts disabled.
845  *
846  * We restore the counter value and then enable it.
847  *
848  * This does not protect us against NMI, but enable()
849  * sets the enabled bit in the control field of counter _before_
850  * accessing the counter control register. If a NMI hits, then it will
851  * keep the counter running.
852  */
853 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
854 {
855         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
856         struct perf_counter_context *ctx = &task->perf_counter_ctx;
857
858         __perf_counter_sched_in(ctx, cpuctx, cpu);
859         cpuctx->task_ctx = ctx;
860 }
861
862 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
863 {
864         struct perf_counter_context *ctx = &cpuctx->ctx;
865
866         __perf_counter_sched_in(ctx, cpuctx, cpu);
867 }
868
869 int perf_counter_task_disable(void)
870 {
871         struct task_struct *curr = current;
872         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
873         struct perf_counter *counter;
874         unsigned long flags;
875         u64 perf_flags;
876         int cpu;
877
878         if (likely(!ctx->nr_counters))
879                 return 0;
880
881         curr_rq_lock_irq_save(&flags);
882         cpu = smp_processor_id();
883
884         /* force the update of the task clock: */
885         __task_delta_exec(curr, 1);
886
887         perf_counter_task_sched_out(curr, cpu);
888
889         spin_lock(&ctx->lock);
890
891         /*
892          * Disable all the counters:
893          */
894         perf_flags = hw_perf_save_disable();
895
896         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
897                 if (counter->state != PERF_COUNTER_STATE_ERROR)
898                         counter->state = PERF_COUNTER_STATE_OFF;
899         }
900
901         hw_perf_restore(perf_flags);
902
903         spin_unlock(&ctx->lock);
904
905         curr_rq_unlock_irq_restore(&flags);
906
907         return 0;
908 }
909
910 int perf_counter_task_enable(void)
911 {
912         struct task_struct *curr = current;
913         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
914         struct perf_counter *counter;
915         unsigned long flags;
916         u64 perf_flags;
917         int cpu;
918
919         if (likely(!ctx->nr_counters))
920                 return 0;
921
922         curr_rq_lock_irq_save(&flags);
923         cpu = smp_processor_id();
924
925         /* force the update of the task clock: */
926         __task_delta_exec(curr, 1);
927
928         perf_counter_task_sched_out(curr, cpu);
929
930         spin_lock(&ctx->lock);
931
932         /*
933          * Enable all the counters:
934          */
935         perf_flags = hw_perf_save_disable();
936
937         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
938                 if (counter->state > PERF_COUNTER_STATE_OFF)
939                         continue;
940                 counter->state = PERF_COUNTER_STATE_INACTIVE;
941                 counter->hw_event.disabled = 0;
942         }
943         hw_perf_restore(perf_flags);
944
945         spin_unlock(&ctx->lock);
946
947         perf_counter_task_sched_in(curr, cpu);
948
949         curr_rq_unlock_irq_restore(&flags);
950
951         return 0;
952 }
953
954 /*
955  * Round-robin a context's counters:
956  */
957 static void rotate_ctx(struct perf_counter_context *ctx)
958 {
959         struct perf_counter *counter;
960         u64 perf_flags;
961
962         if (!ctx->nr_counters)
963                 return;
964
965         spin_lock(&ctx->lock);
966         /*
967          * Rotate the first entry last (works just fine for group counters too):
968          */
969         perf_flags = hw_perf_save_disable();
970         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
971                 list_move_tail(&counter->list_entry, &ctx->counter_list);
972                 break;
973         }
974         hw_perf_restore(perf_flags);
975
976         spin_unlock(&ctx->lock);
977 }
978
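/*
 * Called from the scheduler tick: schedule the current task's counters
 * out, rotate the counter list so that every group eventually gets CPU
 * time, and schedule the counters back in. Per-CPU context rotation is
 * currently compiled out (rotate_percpu == 0).
 */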
979 void perf_counter_task_tick(struct task_struct *curr, int cpu)
980 {
981         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
982         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
983         const int rotate_percpu = 0;
984
985         if (rotate_percpu)
986                 perf_counter_cpu_sched_out(cpuctx);
987         perf_counter_task_sched_out(curr, cpu);
988
989         if (rotate_percpu)
990                 rotate_ctx(&cpuctx->ctx);
991         rotate_ctx(ctx);
992
993         if (rotate_percpu)
994                 perf_counter_cpu_sched_in(cpuctx, cpu);
995         perf_counter_task_sched_in(curr, cpu);
996 }
997
998 /*
999  * Cross CPU call to read the hardware counter
1000  */
1001 static void __read(void *info)
1002 {
1003         struct perf_counter *counter = info;
1004         unsigned long flags;
1005
1006         curr_rq_lock_irq_save(&flags);
1007         counter->hw_ops->read(counter);
1008         curr_rq_unlock_irq_restore(&flags);
1009 }
1010
1011 static u64 perf_counter_read(struct perf_counter *counter)
1012 {
1013         /*
1014          * If counter is enabled and currently active on a CPU, update the
1015          * value in the counter structure:
1016          */
1017         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1018                 smp_call_function_single(counter->oncpu,
1019                                          __read, counter, 1);
1020         }
1021
1022         return atomic64_read(&counter->count);
1023 }
1024
1025 /*
1026  * Cross CPU call to switch performance data pointers
1027  */
1028 static void __perf_switch_irq_data(void *info)
1029 {
1030         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1031         struct perf_counter *counter = info;
1032         struct perf_counter_context *ctx = counter->ctx;
1033         struct perf_data *oldirqdata = counter->irqdata;
1034
1035         /*
1036          * If this is a task context, we need to check whether it is
1037          * the current task context of this cpu. If not it has been
1038          * scheduled out before the smp call arrived.
1039          */
1040         if (ctx->task) {
1041                 if (cpuctx->task_ctx != ctx)
1042                         return;
1043                 spin_lock(&ctx->lock);
1044         }
1045
1046         /* Change the pointer in an NMI-safe way */
1047         atomic_long_set((atomic_long_t *)&counter->irqdata,
1048                         (unsigned long) counter->usrdata);
1049         counter->usrdata = oldirqdata;
1050
1051         if (ctx->task)
1052                 spin_unlock(&ctx->lock);
1053 }
1054
1055 static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
1056 {
1057         struct perf_counter_context *ctx = counter->ctx;
1058         struct perf_data *oldirqdata = counter->irqdata;
1059         struct task_struct *task = ctx->task;
1060
1061         if (!task) {
1062                 smp_call_function_single(counter->cpu,
1063                                          __perf_switch_irq_data,
1064                                          counter, 1);
1065                 return counter->usrdata;
1066         }
1067
1068 retry:
1069         spin_lock_irq(&ctx->lock);
1070         if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
1071                 counter->irqdata = counter->usrdata;
1072                 counter->usrdata = oldirqdata;
1073                 spin_unlock_irq(&ctx->lock);
1074                 return oldirqdata;
1075         }
1076         spin_unlock_irq(&ctx->lock);
1077         task_oncpu_function_call(task, __perf_switch_irq_data, counter);
1078         /* Might have failed, because task was scheduled out */
1079         if (counter->irqdata == oldirqdata)
1080                 goto retry;
1081
1082         return counter->usrdata;
1083 }
1084
1085 static void put_context(struct perf_counter_context *ctx)
1086 {
1087         if (ctx->task)
1088                 put_task_struct(ctx->task);
1089 }
1090
1091 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1092 {
1093         struct perf_cpu_context *cpuctx;
1094         struct perf_counter_context *ctx;
1095         struct task_struct *task;
1096
1097         /*
1098          * If cpu is not a wildcard then this is a percpu counter:
1099          */
1100         if (cpu != -1) {
1101                 /* Must be root to operate on a CPU counter: */
1102                 if (!capable(CAP_SYS_ADMIN))
1103                         return ERR_PTR(-EACCES);
1104
1105                 if (cpu < 0 || cpu > num_possible_cpus())
1106                         return ERR_PTR(-EINVAL);
1107
1108                 /*
1109                  * We could be clever and allow attaching a counter to an
1110                  * offline CPU and activate it when the CPU comes up, but
1111                  * that's for later.
1112                  */
1113                 if (!cpu_isset(cpu, cpu_online_map))
1114                         return ERR_PTR(-ENODEV);
1115
1116                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1117                 ctx = &cpuctx->ctx;
1118
1119                 return ctx;
1120         }
1121
1122         rcu_read_lock();
1123         if (!pid)
1124                 task = current;
1125         else
1126                 task = find_task_by_vpid(pid);
1127         if (task)
1128                 get_task_struct(task);
1129         rcu_read_unlock();
1130
1131         if (!task)
1132                 return ERR_PTR(-ESRCH);
1133
1134         ctx = &task->perf_counter_ctx;
1135         ctx->task = task;
1136
1137         /* Reuse ptrace permission checks for now. */
1138         if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1139                 put_context(ctx);
1140                 return ERR_PTR(-EACCES);
1141         }
1142
1143         return ctx;
1144 }
1145
1146 static void free_counter_rcu(struct rcu_head *head)
1147 {
1148         struct perf_counter *counter;
1149
1150         counter = container_of(head, struct perf_counter, rcu_head);
1151         kfree(counter);
1152 }
1153
1154 static void free_counter(struct perf_counter *counter)
1155 {
1156         if (counter->destroy)
1157                 counter->destroy(counter);
1158
1159         call_rcu(&counter->rcu_head, free_counter_rcu);
1160 }
1161
1162 /*
1163  * Called when the last reference to the file is gone.
1164  */
1165 static int perf_release(struct inode *inode, struct file *file)
1166 {
1167         struct perf_counter *counter = file->private_data;
1168         struct perf_counter_context *ctx = counter->ctx;
1169
1170         file->private_data = NULL;
1171
1172         mutex_lock(&ctx->mutex);
1173         mutex_lock(&counter->mutex);
1174
1175         perf_counter_remove_from_context(counter);
1176
1177         mutex_unlock(&counter->mutex);
1178         mutex_unlock(&ctx->mutex);
1179
1180         free_counter(counter);
1181         put_context(ctx);
1182
1183         return 0;
1184 }
1185
1186 /*
1187  * Read the performance counter - simple non blocking version for now
1188  */
1189 static ssize_t
1190 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1191 {
1192         u64 cntval;
1193
1194         if (count != sizeof(cntval))
1195                 return -EINVAL;
1196
1197         /*
1198          * Return end-of-file for a read on a counter that is in
1199          * error state (i.e. because it was pinned but it couldn't be
1200          * scheduled on to the CPU at some point).
1201          */
1202         if (counter->state == PERF_COUNTER_STATE_ERROR)
1203                 return 0;
1204
1205         mutex_lock(&counter->mutex);
1206         cntval = perf_counter_read(counter);
1207         mutex_unlock(&counter->mutex);
1208
1209         return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
1210 }
1211
1212 static ssize_t
1213 perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
1214 {
1215         if (!usrdata->len)
1216                 return 0;
1217
1218         count = min(count, (size_t)usrdata->len);
1219         if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
1220                 return -EFAULT;
1221
1222         /* Adjust the counters */
1223         usrdata->len -= count;
1224         if (!usrdata->len)
1225                 usrdata->rd_idx = 0;
1226         else
1227                 usrdata->rd_idx += count;
1228
1229         return count;
1230 }
1231
1232 static ssize_t
1233 perf_read_irq_data(struct perf_counter  *counter,
1234                    char __user          *buf,
1235                    size_t               count,
1236                    int                  nonblocking)
1237 {
1238         struct perf_data *irqdata, *usrdata;
1239         DECLARE_WAITQUEUE(wait, current);
1240         ssize_t res, res2;
1241
1242         irqdata = counter->irqdata;
1243         usrdata = counter->usrdata;
1244
1245         if (usrdata->len + irqdata->len >= count)
1246                 goto read_pending;
1247
1248         if (nonblocking)
1249                 return -EAGAIN;
1250
1251         spin_lock_irq(&counter->waitq.lock);
1252         __add_wait_queue(&counter->waitq, &wait);
1253         for (;;) {
1254                 set_current_state(TASK_INTERRUPTIBLE);
1255                 if (usrdata->len + irqdata->len >= count)
1256                         break;
1257
1258                 if (signal_pending(current))
1259                         break;
1260
1261                 if (counter->state == PERF_COUNTER_STATE_ERROR)
1262                         break;
1263
1264                 spin_unlock_irq(&counter->waitq.lock);
1265                 schedule();
1266                 spin_lock_irq(&counter->waitq.lock);
1267         }
1268         __remove_wait_queue(&counter->waitq, &wait);
1269         __set_current_state(TASK_RUNNING);
1270         spin_unlock_irq(&counter->waitq.lock);
1271
1272         if (usrdata->len + irqdata->len < count &&
1273             counter->state != PERF_COUNTER_STATE_ERROR)
1274                 return -ERESTARTSYS;
1275 read_pending:
1276         mutex_lock(&counter->mutex);
1277
1278         /* Drain pending data first: */
1279         res = perf_copy_usrdata(usrdata, buf, count);
1280         if (res < 0 || res == count)
1281                 goto out;
1282
1283         /* Switch irq buffer: */
1284         usrdata = perf_switch_irq_data(counter);
1285         res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
1286         if (res2 < 0) {
1287                 if (!res)
1288                         res = -EFAULT;
1289         } else {
1290                 res += res2;
1291         }
1292 out:
1293         mutex_unlock(&counter->mutex);
1294
1295         return res;
1296 }
1297
1298 static ssize_t
1299 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1300 {
1301         struct perf_counter *counter = file->private_data;
1302
1303         switch (counter->hw_event.record_type) {
1304         case PERF_RECORD_SIMPLE:
1305                 return perf_read_hw(counter, buf, count);
1306
1307         case PERF_RECORD_IRQ:
1308         case PERF_RECORD_GROUP:
1309                 return perf_read_irq_data(counter, buf, count,
1310                                           file->f_flags & O_NONBLOCK);
1311         }
1312         return -EINVAL;
1313 }
1314
1315 static unsigned int perf_poll(struct file *file, poll_table *wait)
1316 {
1317         struct perf_counter *counter = file->private_data;
1318         unsigned int events = 0;
1319         unsigned long flags;
1320
1321         poll_wait(file, &counter->waitq, wait);
1322
1323         spin_lock_irqsave(&counter->waitq.lock, flags);
1324         if (counter->usrdata->len || counter->irqdata->len)
1325                 events |= POLLIN;
1326         spin_unlock_irqrestore(&counter->waitq.lock, flags);
1327
1328         return events;
1329 }
1330
1331 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1332 {
1333         struct perf_counter *counter = file->private_data;
1334         int err = 0;
1335
1336         switch (cmd) {
1337         case PERF_COUNTER_IOC_ENABLE:
1338                 perf_counter_enable_family(counter);
1339                 break;
1340         case PERF_COUNTER_IOC_DISABLE:
1341                 perf_counter_disable_family(counter);
1342                 break;
1343         default:
1344                 err = -ENOTTY;
1345         }
1346         return err;
1347 }
1348
1349 static const struct file_operations perf_fops = {
1350         .release                = perf_release,
1351         .read                   = perf_read,
1352         .poll                   = perf_poll,
1353         .unlocked_ioctl         = perf_ioctl,
1354         .compat_ioctl           = perf_ioctl,
1355 };
1356
1357 /*
1358  * Output
1359  */
1360
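/*
 * Append one u64 record to the counter's irqdata buffer, or count an
 * overrun if the buffer is full.
 */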
1361 static void perf_counter_store_irq(struct perf_counter *counter, u64 data)
1362 {
1363         struct perf_data *irqdata = counter->irqdata;
1364
1365         if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
1366                 irqdata->overrun++;
1367         } else {
1368                 u64 *p = (u64 *) &irqdata->data[irqdata->len];
1369
1370                 *p = data;
1371                 irqdata->len += sizeof(u64);
1372         }
1373 }
1374
1375 static void perf_counter_handle_group(struct perf_counter *counter)
1376 {
1377         struct perf_counter *leader, *sub;
1378
1379         leader = counter->group_leader;
1380         list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1381                 if (sub != counter)
1382                         sub->hw_ops->read(sub);
1383                 perf_counter_store_irq(counter, sub->hw_event.config);
1384                 perf_counter_store_irq(counter, atomic64_read(&sub->count));
1385         }
1386 }
1387
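/*
 * Emit an output record according to the counter's record_type: nothing
 * for PERF_RECORD_SIMPLE, the instruction pointer for PERF_RECORD_IRQ,
 * and the whole group's values for PERF_RECORD_GROUP. Readers are then
 * woken up; from NMI context the wakeup is deferred via
 * set_perf_counter_pending().
 */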
1388 void perf_counter_output(struct perf_counter *counter,
1389                          int nmi, struct pt_regs *regs)
1390 {
1391         switch (counter->hw_event.record_type) {
1392         case PERF_RECORD_SIMPLE:
1393                 return;
1394
1395         case PERF_RECORD_IRQ:
1396                 perf_counter_store_irq(counter, instruction_pointer(regs));
1397                 break;
1398
1399         case PERF_RECORD_GROUP:
1400                 perf_counter_handle_group(counter);
1401                 break;
1402         }
1403
1404         if (nmi) {
1405                 counter->wakeup_pending = 1;
1406                 set_perf_counter_pending();
1407         } else
1408                 wake_up(&counter->waitq);
1409 }
1410
1411 /*
1412  * Generic software counter infrastructure
1413  */
1414
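/*
 * Fold the delta of hw.count since the last update into counter->count
 * and period_left; the cmpxchg loop keeps prev_count consistent against
 * concurrent updates.
 */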
1415 static void perf_swcounter_update(struct perf_counter *counter)
1416 {
1417         struct hw_perf_counter *hwc = &counter->hw;
1418         u64 prev, now;
1419         s64 delta;
1420
1421 again:
1422         prev = atomic64_read(&hwc->prev_count);
1423         now = atomic64_read(&hwc->count);
1424         if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
1425                 goto again;
1426
1427         delta = now - prev;
1428
1429         atomic64_add(delta, &counter->count);
1430         atomic64_sub(delta, &hwc->period_left);
1431 }
1432
1433 static void perf_swcounter_set_period(struct perf_counter *counter)
1434 {
1435         struct hw_perf_counter *hwc = &counter->hw;
1436         s64 left = atomic64_read(&hwc->period_left);
1437         s64 period = hwc->irq_period;
1438
1439         if (unlikely(left <= -period)) {
1440                 left = period;
1441                 atomic64_set(&hwc->period_left, left);
1442         }
1443
1444         if (unlikely(left <= 0)) {
1445                 left += period;
1446                 atomic64_add(period, &hwc->period_left);
1447         }
1448
1449         atomic64_set(&hwc->prev_count, -left);
1450         atomic64_set(&hwc->count, -left);
1451 }
1452
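/*
 * Sampling timer for the clock based software counters: read the
 * counter, emit an output record (falling back to the user-mode regs
 * when kernel IPs are excluded or no interrupt regs are available) and
 * re-arm the timer.
 */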
1453 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
1454 {
1455         struct perf_counter *counter;
1456         struct pt_regs *regs;
1457
1458         counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
1459         counter->hw_ops->read(counter);
1460
1461         regs = get_irq_regs();
1462         /*
1463          * In case we exclude kernel IPs or are somehow not in interrupt
1464          * context, provide the next best thing, the user IP.
1465          */
1466         if ((counter->hw_event.exclude_kernel || !regs) &&
1467                         !counter->hw_event.exclude_user)
1468                 regs = task_pt_regs(current);
1469
1470         if (regs)
1471                 perf_counter_output(counter, 0, regs);
1472
1473         hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
1474
1475         return HRTIMER_RESTART;
1476 }
1477
1478 static void perf_swcounter_overflow(struct perf_counter *counter,
1479                                     int nmi, struct pt_regs *regs)
1480 {
1481         perf_swcounter_update(counter);
1482         perf_swcounter_set_period(counter);
1483         perf_counter_output(counter, nmi, regs);
1484 }
1485
1486 static int perf_swcounter_match(struct perf_counter *counter,
1487                                 enum perf_event_types type,
1488                                 u32 event, struct pt_regs *regs)
1489 {
1490         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1491                 return 0;
1492
1493         if (perf_event_raw(&counter->hw_event))
1494                 return 0;
1495
1496         if (perf_event_type(&counter->hw_event) != type)
1497                 return 0;
1498
1499         if (perf_event_id(&counter->hw_event) != event)
1500                 return 0;
1501
1502         if (counter->hw_event.exclude_user && user_mode(regs))
1503                 return 0;
1504
1505         if (counter->hw_event.exclude_kernel && !user_mode(regs))
1506                 return 0;
1507
1508         return 1;
1509 }
1510
1511 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
1512                                int nmi, struct pt_regs *regs)
1513 {
1514         int neg = atomic64_add_negative(nr, &counter->hw.count);
1515         if (counter->hw.irq_period && !neg)
1516                 perf_swcounter_overflow(counter, nmi, regs);
1517 }
1518
1519 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
1520                                      enum perf_event_types type, u32 event,
1521                                      u64 nr, int nmi, struct pt_regs *regs)
1522 {
1523         struct perf_counter *counter;
1524
1525         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
1526                 return;
1527
1528         rcu_read_lock();
1529         list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
1530                 if (perf_swcounter_match(counter, type, event, regs))
1531                         perf_swcounter_add(counter, nr, nmi, regs);
1532         }
1533         rcu_read_unlock();
1534 }
1535
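/*
 * Pick the recursion counter that matches the current context level:
 * task, softirq, hardirq or NMI.
 */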
1536 static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
1537 {
1538         if (in_nmi())
1539                 return &cpuctx->recursion[3];
1540
1541         if (in_irq())
1542                 return &cpuctx->recursion[2];
1543
1544         if (in_softirq())
1545                 return &cpuctx->recursion[1];
1546
1547         return &cpuctx->recursion[0];
1548 }
1549
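/*
 * The per-CPU, per-context-level recursion counters are what avoids
 * recursion here: a software counter event raised while another one is
 * already being processed at the same context level on this CPU is
 * simply dropped instead of nesting.
 */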
1550 static void __perf_swcounter_event(enum perf_event_types type, u32 event,
1551                                    u64 nr, int nmi, struct pt_regs *regs)
1552 {
1553         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
1554         int *recursion = perf_swcounter_recursion_context(cpuctx);
1555
1556         if (*recursion)
1557                 goto out;
1558
1559         (*recursion)++;
1560         barrier();
1561
1562         perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
1563         if (cpuctx->task_ctx) {
1564                 perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
1565                                 nr, nmi, regs);
1566         }
1567
1568         barrier();
1569         (*recursion)--;
1570
1571 out:
1572         put_cpu_var(perf_cpu_context);
1573 }
1574
1575 void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)
1576 {
1577         __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs);
1578 }
1579
1580 static void perf_swcounter_read(struct perf_counter *counter)
1581 {
1582         perf_swcounter_update(counter);
1583 }
1584
1585 static int perf_swcounter_enable(struct perf_counter *counter)
1586 {
1587         perf_swcounter_set_period(counter);
1588         return 0;
1589 }
1590
1591 static void perf_swcounter_disable(struct perf_counter *counter)
1592 {
1593         perf_swcounter_update(counter);
1594 }
1595
1596 static const struct hw_perf_counter_ops perf_ops_generic = {
1597         .enable         = perf_swcounter_enable,
1598         .disable        = perf_swcounter_disable,
1599         .read           = perf_swcounter_read,
1600 };
1601
1602 /*
1603  * Software counter: cpu wall time clock
1604  */
1605
1606 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
1607 {
1608         int cpu = raw_smp_processor_id();
1609         s64 prev;
1610         u64 now;
1611
1612         now = cpu_clock(cpu);
1613         prev = atomic64_read(&counter->hw.prev_count);
1614         atomic64_set(&counter->hw.prev_count, now);
1615         atomic64_add(now - prev, &counter->count);
1616 }
1617
1618 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
1619 {
1620         struct hw_perf_counter *hwc = &counter->hw;
1621         int cpu = raw_smp_processor_id();
1622
1623         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
1624         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1625         hwc->hrtimer.function = perf_swcounter_hrtimer;
1626         if (hwc->irq_period) {
1627                 __hrtimer_start_range_ns(&hwc->hrtimer,
1628                                 ns_to_ktime(hwc->irq_period), 0,
1629                                 HRTIMER_MODE_REL, 0);
1630         }
1631
1632         return 0;
1633 }
1634
1635 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
1636 {
1637         hrtimer_cancel(&counter->hw.hrtimer);
1638         cpu_clock_perf_counter_update(counter);
1639 }
1640
1641 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
1642 {
1643         cpu_clock_perf_counter_update(counter);
1644 }
1645
1646 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
1647         .enable         = cpu_clock_perf_counter_enable,
1648         .disable        = cpu_clock_perf_counter_disable,
1649         .read           = cpu_clock_perf_counter_read,
1650 };
1651
1652 /*
1653  * Software counter: task time clock
1654  */
1655
1656 /*
1657  * Called from within the scheduler:
1658  */
1659 static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
1660 {
1661         struct task_struct *curr = counter->task;
1662         u64 delta;
1663
1664         delta = __task_delta_exec(curr, update);
1665
1666         return curr->se.sum_exec_runtime + delta;
1667 }
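/*
 * I.e. the task clock is the task's accumulated CPU time:
 * sum_exec_runtime plus whatever runtime __task_delta_exec() reports
 * as not yet folded in; the 'update' argument is simply passed
 * through to that helper.
 */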
1668
1669 static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
1670 {
1671         u64 prev;
1672         s64 delta;
1673
1674         prev = atomic64_read(&counter->hw.prev_count);
1675
1676         atomic64_set(&counter->hw.prev_count, now);
1677
1678         delta = now - prev;
1679
1680         atomic64_add(delta, &counter->count);
1681 }
1682
1683 static int task_clock_perf_counter_enable(struct perf_counter *counter)
1684 {
1685         struct hw_perf_counter *hwc = &counter->hw;
1686
1687         atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0));
1688         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1689         hwc->hrtimer.function = perf_swcounter_hrtimer;
1690         if (hwc->irq_period) {
1691                 __hrtimer_start_range_ns(&hwc->hrtimer,
1692                                 ns_to_ktime(hwc->irq_period), 0,
1693                                 HRTIMER_MODE_REL, 0);
1694         }
1695
1696         return 0;
1697 }
1698
1699 static void task_clock_perf_counter_disable(struct perf_counter *counter)
1700 {
1701         hrtimer_cancel(&counter->hw.hrtimer);
1702         task_clock_perf_counter_update(counter,
1703                         task_clock_perf_counter_val(counter, 0));
1704 }
1705
1706 static void task_clock_perf_counter_read(struct perf_counter *counter)
1707 {
1708         task_clock_perf_counter_update(counter,
1709                         task_clock_perf_counter_val(counter, 1));
1710 }
1711
1712 static const struct hw_perf_counter_ops perf_ops_task_clock = {
1713         .enable         = task_clock_perf_counter_enable,
1714         .disable        = task_clock_perf_counter_disable,
1715         .read           = task_clock_perf_counter_read,
1716 };
1717
1718 /*
1719  * Software counter: cpu migrations
1720  */
1721
1722 static inline u64 get_cpu_migrations(struct perf_counter *counter)
1723 {
1724         struct task_struct *curr = counter->ctx->task;
1725
1726         if (curr)
1727                 return curr->se.nr_migrations;
1728         return cpu_nr_migrations(smp_processor_id());
1729 }
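/*
 * Per-task counters (ctx->task set) read the task's own migration
 * count straight from its scheduler entity; per-cpu counters fall
 * back to the CPU-wide cpu_nr_migrations() statistic instead.
 */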
1730
1731 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1732 {
1733         u64 prev, now;
1734         s64 delta;
1735
1736         prev = atomic64_read(&counter->hw.prev_count);
1737         now = get_cpu_migrations(counter);
1738
1739         atomic64_set(&counter->hw.prev_count, now);
1740
1741         delta = now - prev;
1742
1743         atomic64_add(delta, &counter->count);
1744 }
1745
1746 static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
1747 {
1748         cpu_migrations_perf_counter_update(counter);
1749 }
1750
1751 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1752 {
1753         if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
1754                 atomic64_set(&counter->hw.prev_count,
1755                              get_cpu_migrations(counter));
1756         return 0;
1757 }
1758
1759 static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
1760 {
1761         cpu_migrations_perf_counter_update(counter);
1762 }
1763
1764 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1765         .enable         = cpu_migrations_perf_counter_enable,
1766         .disable        = cpu_migrations_perf_counter_disable,
1767         .read           = cpu_migrations_perf_counter_read,
1768 };
1769
1770 #ifdef CONFIG_EVENT_PROFILE
1771 void perf_tpcounter_event(int event_id)
1772 {
1773         struct pt_regs *regs = get_irq_regs();
1774
1775         if (!regs)
1776                 regs = task_pt_regs(current);
1777
1778         __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs);
1779 }
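/*
 * perf_tpcounter_event() is the entry point the tracepoint profiling
 * code is expected to call with a tracepoint id; every hit is fed
 * into the software counter machinery as a PERF_TYPE_TRACEPOINT
 * event with nr == 1 and nmi == 1.
 */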
1780
1781 extern int ftrace_profile_enable(int);
1782 extern void ftrace_profile_disable(int);
1783
1784 static void tp_perf_counter_destroy(struct perf_counter *counter)
1785 {
1786         ftrace_profile_disable(perf_event_id(&counter->hw_event));
1787 }
1788
1789 static const struct hw_perf_counter_ops *
1790 tp_perf_counter_init(struct perf_counter *counter)
1791 {
1792         int event_id = perf_event_id(&counter->hw_event);
1793         int ret;
1794
1795         ret = ftrace_profile_enable(event_id);
1796         if (ret)
1797                 return NULL;
1798
1799         counter->destroy = tp_perf_counter_destroy;
1800         counter->hw.irq_period = counter->hw_event.irq_period;
1801
1802         return &perf_ops_generic;
1803 }
1804 #else
1805 static const struct hw_perf_counter_ops *
1806 tp_perf_counter_init(struct perf_counter *counter)
1807 {
1808         return NULL;
1809 }
1810 #endif
1811
1812 static const struct hw_perf_counter_ops *
1813 sw_perf_counter_init(struct perf_counter *counter)
1814 {
1815         struct perf_counter_hw_event *hw_event = &counter->hw_event;
1816         const struct hw_perf_counter_ops *hw_ops = NULL;
1817         struct hw_perf_counter *hwc = &counter->hw;
1818
1819         /*
1820          * Software counters (currently) can't in general distinguish
1821          * between user, kernel and hypervisor events.
1822          * However, context switches and cpu migrations are considered
1823          * to be kernel events, and page faults are never hypervisor
1824          * events.
1825          */
1826         switch (perf_event_id(&counter->hw_event)) {
1827         case PERF_COUNT_CPU_CLOCK:
1828                 hw_ops = &perf_ops_cpu_clock;
1829
1830                 if (hw_event->irq_period && hw_event->irq_period < 10000)
1831                         hw_event->irq_period = 10000;
1832                 break;
1833         case PERF_COUNT_TASK_CLOCK:
1834                 /*
1835                  * If the user instantiates this as a per-cpu counter,
1836                  * use the cpu_clock counter instead.
1837                  */
1838                 if (counter->ctx->task)
1839                         hw_ops = &perf_ops_task_clock;
1840                 else
1841                         hw_ops = &perf_ops_cpu_clock;
1842
1843                 if (hw_event->irq_period && hw_event->irq_period < 10000)
1844                         hw_event->irq_period = 10000;
1845                 break;
1846         case PERF_COUNT_PAGE_FAULTS:
1847         case PERF_COUNT_PAGE_FAULTS_MIN:
1848         case PERF_COUNT_PAGE_FAULTS_MAJ:
1849         case PERF_COUNT_CONTEXT_SWITCHES:
1850                 hw_ops = &perf_ops_generic;
1851                 break;
1852         case PERF_COUNT_CPU_MIGRATIONS:
1853                 if (!counter->hw_event.exclude_kernel)
1854                         hw_ops = &perf_ops_cpu_migrations;
1855                 break;
1856         }
1857
1858         if (hw_ops)
1859                 hwc->irq_period = hw_event->irq_period;
1860
1861         return hw_ops;
1862 }
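/*
 * Note the clamping above: for the two clock counters any requested
 * irq_period below 10000 ns is raised to 10000 ns, i.e. the hrtimer
 * based sampling interval has a floor of roughly 10 usecs.  Also,
 * cpu-migration counters honour exclude_kernel by not offering any
 * ops at all, so creating such a counter simply fails (migrations
 * are considered kernel events, see the comment at the top of this
 * function).
 */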
1863
1864 /*
1865  * Allocate and initialize a counter structure
1866  */
1867 static struct perf_counter *
1868 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1869                    int cpu,
1870                    struct perf_counter_context *ctx,
1871                    struct perf_counter *group_leader,
1872                    gfp_t gfpflags)
1873 {
1874         const struct hw_perf_counter_ops *hw_ops;
1875         struct perf_counter *counter;
1876
1877         counter = kzalloc(sizeof(*counter), gfpflags);
1878         if (!counter)
1879                 return NULL;
1880
1881         /*
1882          * Single counters are their own group leaders, with an
1883          * empty sibling list:
1884          */
1885         if (!group_leader)
1886                 group_leader = counter;
1887
1888         mutex_init(&counter->mutex);
1889         INIT_LIST_HEAD(&counter->list_entry);
1890         INIT_LIST_HEAD(&counter->event_entry);
1891         INIT_LIST_HEAD(&counter->sibling_list);
1892         init_waitqueue_head(&counter->waitq);
1893
1894         INIT_LIST_HEAD(&counter->child_list);
1895
1896         counter->irqdata                = &counter->data[0];
1897         counter->usrdata                = &counter->data[1];
1898         counter->cpu                    = cpu;
1899         counter->hw_event               = *hw_event;
1900         counter->wakeup_pending         = 0;
1901         counter->group_leader           = group_leader;
1902         counter->hw_ops                 = NULL;
1903         counter->ctx                    = ctx;
1904
1905         counter->state = PERF_COUNTER_STATE_INACTIVE;
1906         if (hw_event->disabled)
1907                 counter->state = PERF_COUNTER_STATE_OFF;
1908
1909         hw_ops = NULL;
1910
1911         if (perf_event_raw(hw_event)) {
1912                 hw_ops = hw_perf_counter_init(counter);
1913                 goto done;
1914         }
1915
1916         switch (perf_event_type(hw_event)) {
1917         case PERF_TYPE_HARDWARE:
1918                 hw_ops = hw_perf_counter_init(counter);
1919                 break;
1920
1921         case PERF_TYPE_SOFTWARE:
1922                 hw_ops = sw_perf_counter_init(counter);
1923                 break;
1924
1925         case PERF_TYPE_TRACEPOINT:
1926                 hw_ops = tp_perf_counter_init(counter);
1927                 break;
1928         }
1929
1930         if (!hw_ops) {
1931                 kfree(counter);
1932                 return NULL;
1933         }
1934 done:
1935         counter->hw_ops = hw_ops;
1936
1937         return counter;
1938 }
1939
1940 /**
1941  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
1942  *
1943  * @hw_event_uptr:      event type attributes for monitoring/sampling
1944  * @pid:                target pid
1945  * @cpu:                target cpu
1946  * @group_fd:           group leader counter fd
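 * @flags:              reserved for future use, must be 0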
1947  */
1948 SYSCALL_DEFINE5(perf_counter_open,
1949                 const struct perf_counter_hw_event __user *, hw_event_uptr,
1950                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
1951 {
1952         struct perf_counter *counter, *group_leader;
1953         struct perf_counter_hw_event hw_event;
1954         struct perf_counter_context *ctx;
1955         struct file *counter_file = NULL;
1956         struct file *group_file = NULL;
1957         int fput_needed = 0;
1958         int fput_needed2 = 0;
1959         int ret;
1960
1961         /* for future expandability... */
1962         if (flags)
1963                 return -EINVAL;
1964
1965         if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1966                 return -EFAULT;
1967
1968         /*
1969          * Get the target context (task or percpu):
1970          */
1971         ctx = find_get_context(pid, cpu);
1972         if (IS_ERR(ctx))
1973                 return PTR_ERR(ctx);
1974
1975         /*
1976          * Look up the group leader (we will attach this counter to it):
1977          */
1978         group_leader = NULL;
1979         if (group_fd != -1) {
1980                 ret = -EINVAL;
1981                 group_file = fget_light(group_fd, &fput_needed);
1982                 if (!group_file)
1983                         goto err_put_context;
1984                 if (group_file->f_op != &perf_fops)
1985                         goto err_put_context;
1986
1987                 group_leader = group_file->private_data;
1988                 /*
1989                  * Do not allow a recursive hierarchy (the group leader we
1990                  * attach to must itself be a leader, not a sibling):
1991                  */
1992                 if (group_leader->group_leader != group_leader)
1993                         goto err_put_context;
1994                 /*
1995                  * Do not allow attaching to a group in a different
1996                  * task or CPU context:
1997                  */
1998                 if (group_leader->ctx != ctx)
1999                         goto err_put_context;
2000                 /*
2001                  * Only a group leader can be exclusive or pinned
2002                  */
2003                 if (hw_event.exclusive || hw_event.pinned)
2004                         goto err_put_context;
2005         }
2006
2007         ret = -EINVAL;
2008         counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
2009                                      GFP_KERNEL);
2010         if (!counter)
2011                 goto err_put_context;
2012
2013         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
2014         if (ret < 0)
2015                 goto err_free_put_context;
2016
2017         counter_file = fget_light(ret, &fput_needed2);
2018         if (!counter_file)
2019                 goto err_free_put_context;
2020
2021         counter->filp = counter_file;
2022         mutex_lock(&ctx->mutex);
2023         perf_install_in_context(ctx, counter, cpu);
2024         mutex_unlock(&ctx->mutex);
2025
2026         fput_light(counter_file, fput_needed2);
2027
2028 out_fput:
2029         fput_light(group_file, fput_needed);
2030
2031         return ret;
2032
2033 err_free_put_context:
2034         kfree(counter);
2035
2036 err_put_context:
2037         put_context(ctx);
2038
2039         goto out_fput;
2040 }
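/*
 * For illustration, a minimal user-space caller (assuming the
 * architecture wires this up as __NR_perf_counter_open) might look
 * roughly like:
 *
 *	struct perf_counter_hw_event hw_event;
 *	int fd;
 *
 *	memset(&hw_event, 0, sizeof(hw_event));
 *	hw_event.disabled = 1;
 *	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0);
 *
 * i.e. a counter for the calling task (pid 0) on any CPU (cpu -1),
 * with no group leader (group_fd -1) and flags 0, created in the
 * disabled state so that it can be enabled later.
 */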
2041
2042 /*
2043  * Initialize the perf_counter context in a task_struct:
2044  */
2045 static void
2046 __perf_counter_init_context(struct perf_counter_context *ctx,
2047                             struct task_struct *task)
2048 {
2049         memset(ctx, 0, sizeof(*ctx));
2050         spin_lock_init(&ctx->lock);
2051         mutex_init(&ctx->mutex);
2052         INIT_LIST_HEAD(&ctx->counter_list);
2053         INIT_LIST_HEAD(&ctx->event_list);
2054         ctx->task = task;
2055 }
2056
2057 /*
2058  * inherit a counter from parent task to child task:
2059  */
2060 static struct perf_counter *
2061 inherit_counter(struct perf_counter *parent_counter,
2062               struct task_struct *parent,
2063               struct perf_counter_context *parent_ctx,
2064               struct task_struct *child,
2065               struct perf_counter *group_leader,
2066               struct perf_counter_context *child_ctx)
2067 {
2068         struct perf_counter *child_counter;
2069
2070         /*
2071          * Instead of creating recursive hierarchies of counters,
2072          * we link inherited counters back to the original parent,
2073          * which is guaranteed to have a filp whose reference count
2074          * we use:
2075          */
2076         if (parent_counter->parent)
2077                 parent_counter = parent_counter->parent;
2078
2079         child_counter = perf_counter_alloc(&parent_counter->hw_event,
2080                                            parent_counter->cpu, child_ctx,
2081                                            group_leader, GFP_KERNEL);
2082         if (!child_counter)
2083                 return NULL;
2084
2085         /*
2086          * Link it up in the child's context:
2087          */
2088         child_counter->task = child;
2089         list_add_counter(child_counter, child_ctx);
2090         child_ctx->nr_counters++;
2091
2092         child_counter->parent = parent_counter;
2093         /*
2094          * inherit into child's child as well:
2095          */
2096         child_counter->hw_event.inherit = 1;
2097
2098         /*
2099          * Get a reference to the parent filp - we will fput it
2100          * when the child counter exits. This is safe to do because
2101          * we are in the parent and we know that the filp still
2102          * exists and has a nonzero count:
2103          */
2104         atomic_long_inc(&parent_counter->filp->f_count);
2105
2106         /*
2107          * Link this into the parent counter's child list
2108          */
2109         mutex_lock(&parent_counter->mutex);
2110         list_add_tail(&child_counter->child_list, &parent_counter->child_list);
2111
2112         /*
2113          * Make the child state follow the state of the parent counter,
2114          * not its hw_event.disabled bit.  We hold the parent's mutex,
2115          * so we won't race with perf_counter_{en,dis}able_family.
2116          */
2117         if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
2118                 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
2119         else
2120                 child_counter->state = PERF_COUNTER_STATE_OFF;
2121
2122         mutex_unlock(&parent_counter->mutex);
2123
2124         return child_counter;
2125 }
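/*
 * The filp reference taken above is what keeps the parent counter
 * alive while inherited children exist; sync_child_counter() below
 * drops it again once the child's count has been folded back into
 * the parent.
 */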
2126
2127 static int inherit_group(struct perf_counter *parent_counter,
2128               struct task_struct *parent,
2129               struct perf_counter_context *parent_ctx,
2130               struct task_struct *child,
2131               struct perf_counter_context *child_ctx)
2132 {
2133         struct perf_counter *leader;
2134         struct perf_counter *sub;
2135
2136         leader = inherit_counter(parent_counter, parent, parent_ctx,
2137                                  child, NULL, child_ctx);
2138         if (!leader)
2139                 return -ENOMEM;
2140         list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
2141                 if (!inherit_counter(sub, parent, parent_ctx,
2142                                      child, leader, child_ctx))
2143                         return -ENOMEM;
2144         }
2145         return 0;
2146 }
2147
2148 static void sync_child_counter(struct perf_counter *child_counter,
2149                                struct perf_counter *parent_counter)
2150 {
2151         u64 parent_val, child_val;
2152
2153         parent_val = atomic64_read(&parent_counter->count);
2154         child_val = atomic64_read(&child_counter->count);
2155
2156         /*
2157          * Add back the child's count to the parent's count:
2158          */
2159         atomic64_add(child_val, &parent_counter->count);
2160
2161         /*
2162          * Remove this counter from the parent's list
2163          */
2164         mutex_lock(&parent_counter->mutex);
2165         list_del_init(&child_counter->child_list);
2166         mutex_unlock(&parent_counter->mutex);
2167
2168         /*
2169          * Release the parent counter, if this was the last
2170          * reference to it.
2171          */
2172         fput(parent_counter->filp);
2173 }
2174
2175 static void
2176 __perf_counter_exit_task(struct task_struct *child,
2177                          struct perf_counter *child_counter,
2178                          struct perf_counter_context *child_ctx)
2179 {
2180         struct perf_counter *parent_counter;
2181         struct perf_counter *sub, *tmp;
2182
2183         /*
2184          * If we do not self-reap then we have to wait for the
2185          * child task to unschedule (it will happen for sure),
2186          * so that its counter is at its final count. (This
2187          * condition triggers rarely - child tasks usually get
2188          * off their CPU before the parent has a chance to
2189          * get this far into the reaping action)
2190          */
2191         if (child != current) {
2192                 wait_task_inactive(child, 0);
2193                 list_del_init(&child_counter->list_entry);
2194         } else {
2195                 struct perf_cpu_context *cpuctx;
2196                 unsigned long flags;
2197                 u64 perf_flags;
2198
2199                 /*
2200                  * Disable and unlink this counter.
2201                  *
2202                  * Be careful about zapping the list - IRQ/NMI context
2203                  * could still be processing it:
2204                  */
2205                 curr_rq_lock_irq_save(&flags);
2206                 perf_flags = hw_perf_save_disable();
2207
2208                 cpuctx = &__get_cpu_var(perf_cpu_context);
2209
2210                 group_sched_out(child_counter, cpuctx, child_ctx);
2211
2212                 list_del_init(&child_counter->list_entry);
2213
2214                 child_ctx->nr_counters--;
2215
2216                 hw_perf_restore(perf_flags);
2217                 curr_rq_unlock_irq_restore(&flags);
2218         }
2219
2220         parent_counter = child_counter->parent;
2221         /*
2222          * It can happen that parent exits first, and has counters
2223          * that are still around due to the child reference. These
2224          * counters need to be zapped - but otherwise linger.
2225          */
2226         if (parent_counter) {
2227                 sync_child_counter(child_counter, parent_counter);
2228                 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
2229                                          list_entry) {
2230                         if (sub->parent) {
2231                                 sync_child_counter(sub, sub->parent);
2232                                 free_counter(sub);
2233                         }
2234                 }
2235                 free_counter(child_counter);
2236         }
2237 }
2238
2239 /*
2240  * When a child task exits, feed back counter values to parent counters.
2241  *
2242  * Note: we may be running in child context, but the PID is not hashed
2243  * anymore so new counters will not be added.
2244  */
2245 void perf_counter_exit_task(struct task_struct *child)
2246 {
2247         struct perf_counter *child_counter, *tmp;
2248         struct perf_counter_context *child_ctx;
2249
2250         child_ctx = &child->perf_counter_ctx;
2251
2252         if (likely(!child_ctx->nr_counters))
2253                 return;
2254
2255         list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
2256                                  list_entry)
2257                 __perf_counter_exit_task(child, child_counter, child_ctx);
2258 }
2259
2260 /*
2261  * Initialize the perf_counter context in task_struct
2262  */
2263 void perf_counter_init_task(struct task_struct *child)
2264 {
2265         struct perf_counter_context *child_ctx, *parent_ctx;
2266         struct perf_counter *counter;
2267         struct task_struct *parent = current;
2268
2269         child_ctx  =  &child->perf_counter_ctx;
2270         parent_ctx = &parent->perf_counter_ctx;
2271
2272         __perf_counter_init_context(child_ctx, child);
2273
2274         /*
2275          * This is executed from the parent task context, so inherit
2276          * counters that have been marked for cloning:
2277          */
2278
2279         if (likely(!parent_ctx->nr_counters))
2280                 return;
2281
2282         /*
2283          * Lock the parent list. No need to lock the child - not PID
2284          * hashed yet and not running, so nobody can access it.
2285          */
2286         mutex_lock(&parent_ctx->mutex);
2287
2288         /*
2289          * We don't have to disable NMIs - we are only looking at
2290          * the list, not manipulating it:
2291          */
2292         list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
2293                 if (!counter->hw_event.inherit)
2294                         continue;
2295
2296                 if (inherit_group(counter, parent,
2297                                   parent_ctx, child, child_ctx))
2298                         break;
2299         }
2300
2301         mutex_unlock(&parent_ctx->mutex);
2302 }
2303
2304 static void __cpuinit perf_counter_init_cpu(int cpu)
2305 {
2306         struct perf_cpu_context *cpuctx;
2307
2308         cpuctx = &per_cpu(perf_cpu_context, cpu);
2309         __perf_counter_init_context(&cpuctx->ctx, NULL);
2310
2311         mutex_lock(&perf_resource_mutex);
2312         cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
2313         mutex_unlock(&perf_resource_mutex);
2314
2315         hw_perf_counter_setup(cpu);
2316 }
2317
2318 #ifdef CONFIG_HOTPLUG_CPU
2319 static void __perf_counter_exit_cpu(void *info)
2320 {
2321         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
2322         struct perf_counter_context *ctx = &cpuctx->ctx;
2323         struct perf_counter *counter, *tmp;
2324
2325         list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
2326                 __perf_counter_remove_from_context(counter);
2327 }
2328 static void perf_counter_exit_cpu(int cpu)
2329 {
2330         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
2331         struct perf_counter_context *ctx = &cpuctx->ctx;
2332
2333         mutex_lock(&ctx->mutex);
2334         smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
2335         mutex_unlock(&ctx->mutex);
2336 }
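/*
 * Tearing down the per-cpu context goes through
 * smp_call_function_single() so that __perf_counter_exit_cpu() runs
 * on the CPU that owns the context (__get_cpu_var() above), with the
 * ctx mutex held across the cross-call.
 */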
2337 #else
2338 static inline void perf_counter_exit_cpu(int cpu) { }
2339 #endif
2340
2341 static int __cpuinit
2342 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
2343 {
2344         unsigned int cpu = (long)hcpu;
2345
2346         switch (action) {
2347
2348         case CPU_UP_PREPARE:
2349         case CPU_UP_PREPARE_FROZEN:
2350                 perf_counter_init_cpu(cpu);
2351                 break;
2352
2353         case CPU_DOWN_PREPARE:
2354         case CPU_DOWN_PREPARE_FROZEN:
2355                 perf_counter_exit_cpu(cpu);
2356                 break;
2357
2358         default:
2359                 break;
2360         }
2361
2362         return NOTIFY_OK;
2363 }
2364
2365 static struct notifier_block __cpuinitdata perf_cpu_nb = {
2366         .notifier_call          = perf_cpu_notify,
2367 };
2368
2369 static int __init perf_counter_init(void)
2370 {
2371         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
2372                         (void *)(long)smp_processor_id());
2373         register_cpu_notifier(&perf_cpu_nb);
2374
2375         return 0;
2376 }
2377 early_initcall(perf_counter_init);
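/*
 * The boot CPU is already up when this early initcall runs and would
 * never see a CPU_UP_PREPARE notification, so perf_counter_init()
 * feeds the notifier a synthetic event for it by hand before
 * registering the notifier for the remaining CPUs.
 */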
2378
2379 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
2380 {
2381         return sprintf(buf, "%d\n", perf_reserved_percpu);
2382 }
2383
2384 static ssize_t
2385 perf_set_reserve_percpu(struct sysdev_class *class,
2386                         const char *buf,
2387                         size_t count)
2388 {
2389         struct perf_cpu_context *cpuctx;
2390         unsigned long val;
2391         int err, cpu, mpt;
2392
2393         err = strict_strtoul(buf, 10, &val);
2394         if (err)
2395                 return err;
2396         if (val > perf_max_counters)
2397                 return -EINVAL;
2398
2399         mutex_lock(&perf_resource_mutex);
2400         perf_reserved_percpu = val;
2401         for_each_online_cpu(cpu) {
2402                 cpuctx = &per_cpu(perf_cpu_context, cpu);
2403                 spin_lock_irq(&cpuctx->ctx.lock);
2404                 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
2405                           perf_max_counters - perf_reserved_percpu);
2406                 cpuctx->max_pertask = mpt;
2407                 spin_unlock_irq(&cpuctx->ctx.lock);
2408         }
2409         mutex_unlock(&perf_resource_mutex);
2410
2411         return count;
2412 }
2413
2414 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
2415 {
2416         return sprintf(buf, "%d\n", perf_overcommit);
2417 }
2418
2419 static ssize_t
2420 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
2421 {
2422         unsigned long val;
2423         int err;
2424
2425         err = strict_strtoul(buf, 10, &val);
2426         if (err)
2427                 return err;
2428         if (val > 1)
2429                 return -EINVAL;
2430
2431         mutex_lock(&perf_resource_mutex);
2432         perf_overcommit = val;
2433         mutex_unlock(&perf_resource_mutex);
2434
2435         return count;
2436 }
2437
2438 static SYSDEV_CLASS_ATTR(
2439                                 reserve_percpu,
2440                                 0644,
2441                                 perf_show_reserve_percpu,
2442                                 perf_set_reserve_percpu
2443                         );
2444
2445 static SYSDEV_CLASS_ATTR(
2446                                 overcommit,
2447                                 0644,
2448                                 perf_show_overcommit,
2449                                 perf_set_overcommit
2450                         );
2451
2452 static struct attribute *perfclass_attrs[] = {
2453         &attr_reserve_percpu.attr,
2454         &attr_overcommit.attr,
2455         NULL
2456 };
2457
2458 static struct attribute_group perfclass_attr_group = {
2459         .attrs                  = perfclass_attrs,
2460         .name                   = "perf_counters",
2461 };
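/*
 * These attributes are registered against the cpu sysdev class, so
 * they typically show up as
 * /sys/devices/system/cpu/perf_counters/reserve_percpu and
 * /sys/devices/system/cpu/perf_counters/overcommit; reserve_percpu
 * takes the number of counters to set aside on every CPU (at most
 * perf_max_counters) and overcommit accepts 0 or 1.
 */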
2462
2463 static int __init perf_counter_sysfs_init(void)
2464 {
2465         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
2466                                   &perfclass_attr_group);
2467 }
2468 device_initcall(perf_counter_sysfs_init);