git.oblomov.eu Git - linux-2.6/blob - kernel/perf_counter.c

   1 /*
   2  * Performance counter core code
   3  *
   4  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
   6  *
   7  *  For licencing details see kernel-base/COPYING
   8  */
   9
  10 #include <linux/fs.h>
  11 #include <linux/cpu.h>
  12 #include <linux/smp.h>
  13 #include <linux/file.h>
  14 #include <linux/poll.h>
  15 #include <linux/sysfs.h>
  16 #include <linux/ptrace.h>
  17 #include <linux/percpu.h>
  18 #include <linux/uaccess.h>
  19 #include <linux/syscalls.h>
  20 #include <linux/anon_inodes.h>
  21 #include <linux/kernel_stat.h>
  22 #include <linux/perf_counter.h>
  23 #include <linux/mm.h>
  24 #include <linux/vmstat.h>
  25 #include <linux/rculist.h>
  26 #include <linux/hardirq.h>
  27
  28 #include <asm/irq_regs.h>
  29
  30 /*
  31  * Each CPU has a list of per CPU counters:
  32  */
  33 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
  34
  35 int perf_max_counters __read_mostly = 1;
  36 static int perf_reserved_percpu __read_mostly;
  37 static int perf_overcommit __read_mostly = 1;
  38
  39 /*
  40  * Mutex for (sysadmin-configurable) counter reservations:
  41  */
  42 static DEFINE_MUTEX(perf_resource_mutex);
  43
  44 /*
  45  * Architecture provided APIs - weak aliases:
  46  */
  47 extern __weak const struct hw_perf_counter_ops *
  48 hw_perf_counter_init(struct perf_counter *counter)
  49 {
  50         return NULL;
  51 }
  52
  53 u64 __weak hw_perf_save_disable(void)           { return 0; }
  54 void __weak hw_perf_restore(u64 ctrl)           { barrier(); }
  55 void __weak hw_perf_counter_setup(int cpu)      { barrier(); }
  56 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
  57                struct perf_cpu_context *cpuctx,
  58                struct perf_counter_context *ctx, int cpu)
  59 {
  60         return 0;
  61 }
  62
  63 void __weak perf_counter_print_debug(void)      { }
  64
  65 static void
  66 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  67 {
  68         struct perf_counter *group_leader = counter->group_leader;
  69
  70         /*
  71          * Depending on whether it is a standalone or sibling counter,
  72          * add it straight to the context's counter list, or to the group
  73          * leader's sibling list:
  74          */
  75         if (counter->group_leader == counter)
  76                 list_add_tail(&counter->list_entry, &ctx->counter_list);
  77         else
  78                 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
  79
  80         list_add_rcu(&counter->event_entry, &ctx->event_list);
  81 }
  82
  83 static void
  84 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  85 {
  86         struct perf_counter *sibling, *tmp;
  87
  88         list_del_init(&counter->list_entry);
  89         list_del_rcu(&counter->event_entry);
  90
  91         /*
  92          * If this was a group counter with sibling counters then
  93          * upgrade the siblings to singleton counters by adding them
  94          * to the context list directly:
  95          */
  96         list_for_each_entry_safe(sibling, tmp,
  97                                  &counter->sibling_list, list_entry) {
  98
  99                 list_move_tail(&sibling->list_entry, &ctx->counter_list);
 100                 sibling->group_leader = sibling;
 101         }
 102 }
 103
 104 static void
 105 counter_sched_out(struct perf_counter *counter,
 106                   struct perf_cpu_context *cpuctx,
 107                   struct perf_counter_context *ctx)
 108 {
 109         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 110                 return;
 111
 112         counter->state = PERF_COUNTER_STATE_INACTIVE;
 113         counter->hw_ops->disable(counter);
 114         counter->oncpu = -1;
 115
 116         if (!is_software_counter(counter))
 117                 cpuctx->active_oncpu--;
 118         ctx->nr_active--;
 119         if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
 120                 cpuctx->exclusive = 0;
 121 }
 122
 123 static void
 124 group_sched_out(struct perf_counter *group_counter,
 125                 struct perf_cpu_context *cpuctx,
 126                 struct perf_counter_context *ctx)
 127 {
 128         struct perf_counter *counter;
 129
 130         if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
 131                 return;
 132
 133         counter_sched_out(group_counter, cpuctx, ctx);
 134
 135         /*
 136          * Schedule out siblings (if any):
 137          */
 138         list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
 139                 counter_sched_out(counter, cpuctx, ctx);
 140
 141         if (group_counter->hw_event.exclusive)
 142                 cpuctx->exclusive = 0;
 143 }
 144
 145 /*
 146  * Cross CPU call to remove a performance counter
 147  *
 148  * We disable the counter on the hardware level first. After that we
 149  * remove it from the context list.
 150  */
 151 static void __perf_counter_remove_from_context(void *info)
 152 {
 153         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 154         struct perf_counter *counter = info;
 155         struct perf_counter_context *ctx = counter->ctx;
 156         unsigned long flags;
 157         u64 perf_flags;
 158
 159         /*
 160          * If this is a task context, we need to check whether it is
 161          * the current task context of this cpu. If not it has been
 162          * scheduled out before the smp call arrived.
 163          */
 164         if (ctx->task && cpuctx->task_ctx != ctx)
 165                 return;
 166
 167         curr_rq_lock_irq_save(&flags);
 168         spin_lock(&ctx->lock);
 169
 170         counter_sched_out(counter, cpuctx, ctx);
 171
 172         counter->task = NULL;
 173         ctx->nr_counters--;
 174
 175         /*
 176          * Protect the list operation against NMI by disabling the
 177          * counters on a global level. NOP for non NMI based counters.
 178          */
 179         perf_flags = hw_perf_save_disable();
 180         list_del_counter(counter, ctx);
 181         hw_perf_restore(perf_flags);
 182
 183         if (!ctx->task) {
 184                 /*
 185                  * Allow more per task counters with respect to the
 186                  * reservation:
 187                  */
 188                 cpuctx->max_pertask =
 189                         min(perf_max_counters - ctx->nr_counters,
 190                             perf_max_counters - perf_reserved_percpu);
 191         }
 192
 193         spin_unlock(&ctx->lock);
 194         curr_rq_unlock_irq_restore(&flags);
 195 }
 196
 197
 198 /*
 199  * Remove the counter from a task's (or a CPU's) list of counters.
 200  *
 201  * Must be called with counter->mutex and ctx->mutex held.
 202  *
 203  * CPU counters are removed with a smp call. For task counters we only
 204  * call when the task is on a CPU.
 205  */
 206 static void perf_counter_remove_from_context(struct perf_counter *counter)
 207 {
 208         struct perf_counter_context *ctx = counter->ctx;
 209         struct task_struct *task = ctx->task;
 210
 211         if (!task) {
 212                 /*
 213                  * Per cpu counters are removed via an smp call and
 214                  * the removal is always sucessful.
 215                  */
 216                 smp_call_function_single(counter->cpu,
 217                                          __perf_counter_remove_from_context,
 218                                          counter, 1);
 219                 return;
 220         }
 221
 222 retry:
 223         task_oncpu_function_call(task, __perf_counter_remove_from_context,
 224                                  counter);
 225
 226         spin_lock_irq(&ctx->lock);
 227         /*
 228          * If the context is active we need to retry the smp call.
 229          */
 230         if (ctx->nr_active && !list_empty(&counter->list_entry)) {
 231                 spin_unlock_irq(&ctx->lock);
 232                 goto retry;
 233         }
 234
 235         /*
 236          * The lock prevents that this context is scheduled in so we
 237          * can remove the counter safely, if the call above did not
 238          * succeed.
 239          */
 240         if (!list_empty(&counter->list_entry)) {
 241                 ctx->nr_counters--;
 242                 list_del_counter(counter, ctx);
 243                 counter->task = NULL;
 244         }
 245         spin_unlock_irq(&ctx->lock);
 246 }
 247
 248 /*
 249  * Cross CPU call to disable a performance counter
 250  */
 251 static void __perf_counter_disable(void *info)
 252 {
 253         struct perf_counter *counter = info;
 254         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 255         struct perf_counter_context *ctx = counter->ctx;
 256         unsigned long flags;
 257
 258         /*
 259          * If this is a per-task counter, need to check whether this
 260          * counter's task is the current task on this cpu.
 261          */
 262         if (ctx->task && cpuctx->task_ctx != ctx)
 263                 return;
 264
 265         curr_rq_lock_irq_save(&flags);
 266         spin_lock(&ctx->lock);
 267
 268         /*
 269          * If the counter is on, turn it off.
 270          * If it is in error state, leave it in error state.
 271          */
 272         if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
 273                 if (counter == counter->group_leader)
 274                         group_sched_out(counter, cpuctx, ctx);
 275                 else
 276                         counter_sched_out(counter, cpuctx, ctx);
 277                 counter->state = PERF_COUNTER_STATE_OFF;
 278         }
 279
 280         spin_unlock(&ctx->lock);
 281         curr_rq_unlock_irq_restore(&flags);
 282 }
 283
 284 /*
 285  * Disable a counter.
 286  */
 287 static void perf_counter_disable(struct perf_counter *counter)
 288 {
 289         struct perf_counter_context *ctx = counter->ctx;
 290         struct task_struct *task = ctx->task;
 291
 292         if (!task) {
 293                 /*
 294                  * Disable the counter on the cpu that it's on
 295                  */
 296                 smp_call_function_single(counter->cpu, __perf_counter_disable,
 297                                          counter, 1);
 298                 return;
 299         }
 300
 301  retry:
 302         task_oncpu_function_call(task, __perf_counter_disable, counter);
 303
 304         spin_lock_irq(&ctx->lock);
 305         /*
 306          * If the counter is still active, we need to retry the cross-call.
 307          */
 308         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 309                 spin_unlock_irq(&ctx->lock);
 310                 goto retry;
 311         }
 312
 313         /*
 314          * Since we have the lock this context can't be scheduled
 315          * in, so we can change the state safely.
 316          */
 317         if (counter->state == PERF_COUNTER_STATE_INACTIVE)
 318                 counter->state = PERF_COUNTER_STATE_OFF;
 319
 320         spin_unlock_irq(&ctx->lock);
 321 }
 322
 323 /*
 324  * Disable a counter and all its children.
 325  */
 326 static void perf_counter_disable_family(struct perf_counter *counter)
 327 {
 328         struct perf_counter *child;
 329
 330         perf_counter_disable(counter);
 331
 332         /*
 333          * Lock the mutex to protect the list of children
 334          */
 335         mutex_lock(&counter->mutex);
 336         list_for_each_entry(child, &counter->child_list, child_list)
 337                 perf_counter_disable(child);
 338         mutex_unlock(&counter->mutex);
 339 }
 340
 341 static int
 342 counter_sched_in(struct perf_counter *counter,
 343                  struct perf_cpu_context *cpuctx,
 344                  struct perf_counter_context *ctx,
 345                  int cpu)
 346 {
 347         if (counter->state <= PERF_COUNTER_STATE_OFF)
 348                 return 0;
 349
 350         counter->state = PERF_COUNTER_STATE_ACTIVE;
 351         counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
 352         /*
 353          * The new state must be visible before we turn it on in the hardware:
 354          */
 355         smp_wmb();
 356
 357         if (counter->hw_ops->enable(counter)) {
 358                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 359                 counter->oncpu = -1;
 360                 return -EAGAIN;
 361         }
 362
 363         if (!is_software_counter(counter))
 364                 cpuctx->active_oncpu++;
 365         ctx->nr_active++;
 366
 367         if (counter->hw_event.exclusive)
 368                 cpuctx->exclusive = 1;
 369
 370         return 0;
 371 }
 372
 373 /*
 374  * Return 1 for a group consisting entirely of software counters,
 375  * 0 if the group contains any hardware counters.
 376  */
 377 static int is_software_only_group(struct perf_counter *leader)
 378 {
 379         struct perf_counter *counter;
 380
 381         if (!is_software_counter(leader))
 382                 return 0;
 383         list_for_each_entry(counter, &leader->sibling_list, list_entry)
 384                 if (!is_software_counter(counter))
 385                         return 0;
 386         return 1;
 387 }
 388
 389 /*
 390  * Work out whether we can put this counter group on the CPU now.
 391  */
 392 static int group_can_go_on(struct perf_counter *counter,
 393                            struct perf_cpu_context *cpuctx,
 394                            int can_add_hw)
 395 {
 396         /*
 397          * Groups consisting entirely of software counters can always go on.
 398          */
 399         if (is_software_only_group(counter))
 400                 return 1;
 401         /*
 402          * If an exclusive group is already on, no other hardware
 403          * counters can go on.
 404          */
 405         if (cpuctx->exclusive)
 406                 return 0;
 407         /*
 408          * If this group is exclusive and there are already
 409          * counters on the CPU, it can't go on.
 410          */
 411         if (counter->hw_event.exclusive && cpuctx->active_oncpu)
 412                 return 0;
 413         /*
 414          * Otherwise, try to add it if all previous groups were able
 415          * to go on.
 416          */
 417         return can_add_hw;
 418 }
 419
 420 /*
 421  * Cross CPU call to install and enable a performance counter
 422  */
 423 static void __perf_install_in_context(void *info)
 424 {
 425         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 426         struct perf_counter *counter = info;
 427         struct perf_counter_context *ctx = counter->ctx;
 428         struct perf_counter *leader = counter->group_leader;
 429         int cpu = smp_processor_id();
 430         unsigned long flags;
 431         u64 perf_flags;
 432         int err;
 433
 434         /*
 435          * If this is a task context, we need to check whether it is
 436          * the current task context of this cpu. If not it has been
 437          * scheduled out before the smp call arrived.
 438          */
 439         if (ctx->task && cpuctx->task_ctx != ctx)
 440                 return;
 441
 442         curr_rq_lock_irq_save(&flags);
 443         spin_lock(&ctx->lock);
 444
 445         /*
 446          * Protect the list operation against NMI by disabling the
 447          * counters on a global level. NOP for non NMI based counters.
 448          */
 449         perf_flags = hw_perf_save_disable();
 450
 451         list_add_counter(counter, ctx);
 452         ctx->nr_counters++;
 453         counter->prev_state = PERF_COUNTER_STATE_OFF;
 454
 455         /*
 456          * Don't put the counter on if it is disabled or if
 457          * it is in a group and the group isn't on.
 458          */
 459         if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
 460             (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
 461                 goto unlock;
 462
 463         /*
 464          * An exclusive counter can't go on if there are already active
 465          * hardware counters, and no hardware counter can go on if there
 466          * is already an exclusive counter on.
 467          */
 468         if (!group_can_go_on(counter, cpuctx, 1))
 469                 err = -EEXIST;
 470         else
 471                 err = counter_sched_in(counter, cpuctx, ctx, cpu);
 472
 473         if (err) {
 474                 /*
 475                  * This counter couldn't go on.  If it is in a group
 476                  * then we have to pull the whole group off.
 477                  * If the counter group is pinned then put it in error state.
 478                  */
 479                 if (leader != counter)
 480                         group_sched_out(leader, cpuctx, ctx);
 481                 if (leader->hw_event.pinned)
 482                         leader->state = PERF_COUNTER_STATE_ERROR;
 483         }
 484
 485         if (!err && !ctx->task && cpuctx->max_pertask)
 486                 cpuctx->max_pertask--;
 487
 488  unlock:
 489         hw_perf_restore(perf_flags);
 490
 491         spin_unlock(&ctx->lock);
 492         curr_rq_unlock_irq_restore(&flags);
 493 }
 494
 495 /*
 496  * Attach a performance counter to a context
 497  *
 498  * First we add the counter to the list with the hardware enable bit
 499  * in counter->hw_config cleared.
 500  *
 501  * If the counter is attached to a task which is on a CPU we use a smp
 502  * call to enable it in the task context. The task might have been
 503  * scheduled away, but we check this in the smp call again.
 504  *
 505  * Must be called with ctx->mutex held.
 506  */
 507 static void
 508 perf_install_in_context(struct perf_counter_context *ctx,
 509                         struct perf_counter *counter,
 510                         int cpu)
 511 {
 512         struct task_struct *task = ctx->task;
 513
 514         if (!task) {
 515                 /*
 516                  * Per cpu counters are installed via an smp call and
 517                  * the install is always sucessful.
 518                  */
 519                 smp_call_function_single(cpu, __perf_install_in_context,
 520                                          counter, 1);
 521                 return;
 522         }
 523
 524         counter->task = task;
 525 retry:
 526         task_oncpu_function_call(task, __perf_install_in_context,
 527                                  counter);
 528
 529         spin_lock_irq(&ctx->lock);
 530         /*
 531          * we need to retry the smp call.
 532          */
 533         if (ctx->is_active && list_empty(&counter->list_entry)) {
 534                 spin_unlock_irq(&ctx->lock);
 535                 goto retry;
 536         }
 537
 538         /*
 539          * The lock prevents that this context is scheduled in so we
 540          * can add the counter safely, if it the call above did not
 541          * succeed.
 542          */
 543         if (list_empty(&counter->list_entry)) {
 544                 list_add_counter(counter, ctx);
 545                 ctx->nr_counters++;
 546         }
 547         spin_unlock_irq(&ctx->lock);
 548 }
 549
 550 /*
 551  * Cross CPU call to enable a performance counter
 552  */
 553 static void __perf_counter_enable(void *info)
 554 {
 555         struct perf_counter *counter = info;
 556         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 557         struct perf_counter_context *ctx = counter->ctx;
 558         struct perf_counter *leader = counter->group_leader;
 559         unsigned long flags;
 560         int err;
 561
 562         /*
 563          * If this is a per-task counter, need to check whether this
 564          * counter's task is the current task on this cpu.
 565          */
 566         if (ctx->task && cpuctx->task_ctx != ctx)
 567                 return;
 568
 569         curr_rq_lock_irq_save(&flags);
 570         spin_lock(&ctx->lock);
 571
 572         counter->prev_state = counter->state;
 573         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 574                 goto unlock;
 575         counter->state = PERF_COUNTER_STATE_INACTIVE;
 576
 577         /*
 578          * If the counter is in a group and isn't the group leader,
 579          * then don't put it on unless the group is on.
 580          */
 581         if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
 582                 goto unlock;
 583
 584         if (!group_can_go_on(counter, cpuctx, 1))
 585                 err = -EEXIST;
 586         else
 587                 err = counter_sched_in(counter, cpuctx, ctx,
 588                                        smp_processor_id());
 589
 590         if (err) {
 591                 /*
 592                  * If this counter can't go on and it's part of a
 593                  * group, then the whole group has to come off.
 594                  */
 595                 if (leader != counter)
 596                         group_sched_out(leader, cpuctx, ctx);
 597                 if (leader->hw_event.pinned)
 598                         leader->state = PERF_COUNTER_STATE_ERROR;
 599         }
 600
 601  unlock:
 602         spin_unlock(&ctx->lock);
 603         curr_rq_unlock_irq_restore(&flags);
 604 }
 605
 606 /*
 607  * Enable a counter.
 608  */
 609 static void perf_counter_enable(struct perf_counter *counter)
 610 {
 611         struct perf_counter_context *ctx = counter->ctx;
 612         struct task_struct *task = ctx->task;
 613
 614         if (!task) {
 615                 /*
 616                  * Enable the counter on the cpu that it's on
 617                  */
 618                 smp_call_function_single(counter->cpu, __perf_counter_enable,
 619                                          counter, 1);
 620                 return;
 621         }
 622
 623         spin_lock_irq(&ctx->lock);
 624         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 625                 goto out;
 626
 627         /*
 628          * If the counter is in error state, clear that first.
 629          * That way, if we see the counter in error state below, we
 630          * know that it has gone back into error state, as distinct
 631          * from the task having been scheduled away before the
 632          * cross-call arrived.
 633          */
 634         if (counter->state == PERF_COUNTER_STATE_ERROR)
 635                 counter->state = PERF_COUNTER_STATE_OFF;
 636
 637  retry:
 638         spin_unlock_irq(&ctx->lock);
 639         task_oncpu_function_call(task, __perf_counter_enable, counter);
 640
 641         spin_lock_irq(&ctx->lock);
 642
 643         /*
 644          * If the context is active and the counter is still off,
 645          * we need to retry the cross-call.
 646          */
 647         if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
 648                 goto retry;
 649
 650         /*
 651          * Since we have the lock this context can't be scheduled
 652          * in, so we can change the state safely.
 653          */
 654         if (counter->state == PERF_COUNTER_STATE_OFF)
 655                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 656  out:
 657         spin_unlock_irq(&ctx->lock);
 658 }
 659
 660 /*
 661  * Enable a counter and all its children.
 662  */
 663 static void perf_counter_enable_family(struct perf_counter *counter)
 664 {
 665         struct perf_counter *child;
 666
 667         perf_counter_enable(counter);
 668
 669         /*
 670          * Lock the mutex to protect the list of children
 671          */
 672         mutex_lock(&counter->mutex);
 673         list_for_each_entry(child, &counter->child_list, child_list)
 674                 perf_counter_enable(child);
 675         mutex_unlock(&counter->mutex);
 676 }
 677
 678 void __perf_counter_sched_out(struct perf_counter_context *ctx,
 679                               struct perf_cpu_context *cpuctx)
 680 {
 681         struct perf_counter *counter;
 682         u64 flags;
 683
 684         spin_lock(&ctx->lock);
 685         ctx->is_active = 0;
 686         if (likely(!ctx->nr_counters))
 687                 goto out;
 688
 689         flags = hw_perf_save_disable();
 690         if (ctx->nr_active) {
 691                 list_for_each_entry(counter, &ctx->counter_list, list_entry)
 692                         group_sched_out(counter, cpuctx, ctx);
 693         }
 694         hw_perf_restore(flags);
 695  out:
 696         spin_unlock(&ctx->lock);
 697 }
 698
 699 /*
 700  * Called from scheduler to remove the counters of the current task,
 701  * with interrupts disabled.
 702  *
 703  * We stop each counter and update the counter value in counter->count.
 704  *
 705  * This does not protect us against NMI, but disable()
 706  * sets the disabled bit in the control field of counter _before_
 707  * accessing the counter control register. If a NMI hits, then it will
 708  * not restart the counter.
 709  */
 710 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 711 {
 712         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 713         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 714         struct pt_regs *regs;
 715
 716         if (likely(!cpuctx->task_ctx))
 717                 return;
 718
 719         regs = task_pt_regs(task);
 720         perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);
 721         __perf_counter_sched_out(ctx, cpuctx);
 722
 723         cpuctx->task_ctx = NULL;
 724 }
 725
 726 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 727 {
 728         __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
 729 }
 730
 731 static int
 732 group_sched_in(struct perf_counter *group_counter,
 733                struct perf_cpu_context *cpuctx,
 734                struct perf_counter_context *ctx,
 735                int cpu)
 736 {
 737         struct perf_counter *counter, *partial_group;
 738         int ret;
 739
 740         if (group_counter->state == PERF_COUNTER_STATE_OFF)
 741                 return 0;
 742
 743         ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
 744         if (ret)
 745                 return ret < 0 ? ret : 0;
 746
 747         group_counter->prev_state = group_counter->state;
 748         if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
 749                 return -EAGAIN;
 750
 751         /*
 752          * Schedule in siblings as one group (if any):
 753          */
 754         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 755                 counter->prev_state = counter->state;
 756                 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
 757                         partial_group = counter;
 758                         goto group_error;
 759                 }
 760         }
 761
 762         return 0;
 763
 764 group_error:
 765         /*
 766          * Groups can be scheduled in as one unit only, so undo any
 767          * partial group before returning:
 768          */
 769         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 770                 if (counter == partial_group)
 771                         break;
 772                 counter_sched_out(counter, cpuctx, ctx);
 773         }
 774         counter_sched_out(group_counter, cpuctx, ctx);
 775
 776         return -EAGAIN;
 777 }
 778
 779 static void
 780 __perf_counter_sched_in(struct perf_counter_context *ctx,
 781                         struct perf_cpu_context *cpuctx, int cpu)
 782 {
 783         struct perf_counter *counter;
 784         u64 flags;
 785         int can_add_hw = 1;
 786
 787         spin_lock(&ctx->lock);
 788         ctx->is_active = 1;
 789         if (likely(!ctx->nr_counters))
 790                 goto out;
 791
 792         flags = hw_perf_save_disable();
 793
 794         /*
 795          * First go through the list and put on any pinned groups
 796          * in order to give them the best chance of going on.
 797          */
 798         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 799                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
 800                     !counter->hw_event.pinned)
 801                         continue;
 802                 if (counter->cpu != -1 && counter->cpu != cpu)
 803                         continue;
 804
 805                 if (group_can_go_on(counter, cpuctx, 1))
 806                         group_sched_in(counter, cpuctx, ctx, cpu);
 807
 808                 /*
 809                  * If this pinned group hasn't been scheduled,
 810                  * put it in error state.
 811                  */
 812                 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
 813                         counter->state = PERF_COUNTER_STATE_ERROR;
 814         }
 815
 816         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 817                 /*
 818                  * Ignore counters in OFF or ERROR state, and
 819                  * ignore pinned counters since we did them already.
 820                  */
 821                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
 822                     counter->hw_event.pinned)
 823                         continue;
 824
 825                 /*
 826                  * Listen to the 'cpu' scheduling filter constraint
 827                  * of counters:
 828                  */
 829                 if (counter->cpu != -1 && counter->cpu != cpu)
 830                         continue;
 831
 832                 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
 833                         if (group_sched_in(counter, cpuctx, ctx, cpu))
 834                                 can_add_hw = 0;
 835                 }
 836         }
 837         hw_perf_restore(flags);
 838  out:
 839         spin_unlock(&ctx->lock);
 840 }
 841
 842 /*
 843  * Called from scheduler to add the counters of the current task
 844  * with interrupts disabled.
 845  *
 846  * We restore the counter value and then enable it.
 847  *
 848  * This does not protect us against NMI, but enable()
 849  * sets the enabled bit in the control field of counter _before_
 850  * accessing the counter control register. If a NMI hits, then it will
 851  * keep the counter running.
 852  */
 853 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 854 {
 855         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 856         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 857
 858         __perf_counter_sched_in(ctx, cpuctx, cpu);
 859         cpuctx->task_ctx = ctx;
 860 }
 861
 862 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 863 {
 864         struct perf_counter_context *ctx = &cpuctx->ctx;
 865
 866         __perf_counter_sched_in(ctx, cpuctx, cpu);
 867 }
 868
 869 int perf_counter_task_disable(void)
 870 {
 871         struct task_struct *curr = current;
 872         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 873         struct perf_counter *counter;
 874         unsigned long flags;
 875         u64 perf_flags;
 876         int cpu;
 877
 878         if (likely(!ctx->nr_counters))
 879                 return 0;
 880
 881         curr_rq_lock_irq_save(&flags);
 882         cpu = smp_processor_id();
 883
 884         /* force the update of the task clock: */
 885         __task_delta_exec(curr, 1);
 886
 887         perf_counter_task_sched_out(curr, cpu);
 888
 889         spin_lock(&ctx->lock);
 890
 891         /*
 892          * Disable all the counters:
 893          */
 894         perf_flags = hw_perf_save_disable();
 895
 896         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 897                 if (counter->state != PERF_COUNTER_STATE_ERROR)
 898                         counter->state = PERF_COUNTER_STATE_OFF;
 899         }
 900
 901         hw_perf_restore(perf_flags);
 902
 903         spin_unlock(&ctx->lock);
 904
 905         curr_rq_unlock_irq_restore(&flags);
 906
 907         return 0;
 908 }
 909
 910 int perf_counter_task_enable(void)
 911 {
 912         struct task_struct *curr = current;
 913         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 914         struct perf_counter *counter;
 915         unsigned long flags;
 916         u64 perf_flags;
 917         int cpu;
 918
 919         if (likely(!ctx->nr_counters))
 920                 return 0;
 921
 922         curr_rq_lock_irq_save(&flags);
 923         cpu = smp_processor_id();
 924
 925         /* force the update of the task clock: */
 926         __task_delta_exec(curr, 1);
 927
 928         perf_counter_task_sched_out(curr, cpu);
 929
 930         spin_lock(&ctx->lock);
 931
 932         /*
 933          * Disable all the counters:
 934          */
 935         perf_flags = hw_perf_save_disable();
 936
 937         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 938                 if (counter->state > PERF_COUNTER_STATE_OFF)
 939                         continue;
 940                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 941                 counter->hw_event.disabled = 0;
 942         }
 943         hw_perf_restore(perf_flags);
 944
 945         spin_unlock(&ctx->lock);
 946
 947         perf_counter_task_sched_in(curr, cpu);
 948
 949         curr_rq_unlock_irq_restore(&flags);
 950
 951         return 0;
 952 }
 953
 954 /*
 955  * Round-robin a context's counters:
 956  */
 957 static void rotate_ctx(struct perf_counter_context *ctx)
 958 {
 959         struct perf_counter *counter;
 960         u64 perf_flags;
 961
 962         if (!ctx->nr_counters)
 963                 return;
 964
 965         spin_lock(&ctx->lock);
 966         /*
 967          * Rotate the first entry last (works just fine for group counters too):
 968          */
 969         perf_flags = hw_perf_save_disable();
 970         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 971                 list_move_tail(&counter->list_entry, &ctx->counter_list);
 972                 break;
 973         }
 974         hw_perf_restore(perf_flags);
 975
 976         spin_unlock(&ctx->lock);
 977 }
 978
 979 void perf_counter_task_tick(struct task_struct *curr, int cpu)
 980 {
 981         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 982         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 983         const int rotate_percpu = 0;
 984
 985         if (rotate_percpu)
 986                 perf_counter_cpu_sched_out(cpuctx);
 987         perf_counter_task_sched_out(curr, cpu);
 988
 989         if (rotate_percpu)
 990                 rotate_ctx(&cpuctx->ctx);
 991         rotate_ctx(ctx);
 992
 993         if (rotate_percpu)
 994                 perf_counter_cpu_sched_in(cpuctx, cpu);
 995         perf_counter_task_sched_in(curr, cpu);
 996 }
 997
 998 /*
 999  * Cross CPU call to read the hardware counter
1000  */
1001 static void __read(void *info)
1002 {
1003         struct perf_counter *counter = info;
1004         unsigned long flags;
1005
1006         curr_rq_lock_irq_save(&flags);
1007         counter->hw_ops->read(counter);
1008         curr_rq_unlock_irq_restore(&flags);
1009 }
1010
1011 static u64 perf_counter_read(struct perf_counter *counter)
1012 {
1013         /*
1014          * If counter is enabled and currently active on a CPU, update the
1015          * value in the counter structure:
1016          */
1017         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1018                 smp_call_function_single(counter->oncpu,
1019                                          __read, counter, 1);
1020         }
1021
1022         return atomic64_read(&counter->count);
1023 }
1024
1025 /*
1026  * Cross CPU call to switch performance data pointers
1027  */
1028 static void __perf_switch_irq_data(void *info)
1029 {
1030         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1031         struct perf_counter *counter = info;
1032         struct perf_counter_context *ctx = counter->ctx;
1033         struct perf_data *oldirqdata = counter->irqdata;
1034
1035         /*
1036          * If this is a task context, we need to check whether it is
1037          * the current task context of this cpu. If not it has been
1038          * scheduled out before the smp call arrived.
1039          */
1040         if (ctx->task) {
1041                 if (cpuctx->task_ctx != ctx)
1042                         return;
1043                 spin_lock(&ctx->lock);
1044         }
1045
1046         /* Change the pointer NMI safe */
1047         atomic_long_set((atomic_long_t *)&counter->irqdata,
1048                         (unsigned long) counter->usrdata);
1049         counter->usrdata = oldirqdata;
1050
1051         if (ctx->task)
1052                 spin_unlock(&ctx->lock);
1053 }
1054
1055 static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
1056 {
1057         struct perf_counter_context *ctx = counter->ctx;
1058         struct perf_data *oldirqdata = counter->irqdata;
1059         struct task_struct *task = ctx->task;
1060
1061         if (!task) {
1062                 smp_call_function_single(counter->cpu,
1063                                          __perf_switch_irq_data,
1064                                          counter, 1);
1065                 return counter->usrdata;
1066         }
1067
1068 retry:
1069         spin_lock_irq(&ctx->lock);
1070         if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
1071                 counter->irqdata = counter->usrdata;
1072                 counter->usrdata = oldirqdata;
1073                 spin_unlock_irq(&ctx->lock);
1074                 return oldirqdata;
1075         }
1076         spin_unlock_irq(&ctx->lock);
1077         task_oncpu_function_call(task, __perf_switch_irq_data, counter);
1078         /* Might have failed, because task was scheduled out */
1079         if (counter->irqdata == oldirqdata)
1080                 goto retry;
1081
1082         return counter->usrdata;
1083 }
1084
1085 static void put_context(struct perf_counter_context *ctx)
1086 {
1087         if (ctx->task)
1088                 put_task_struct(ctx->task);
1089 }
1090
1091 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1092 {
1093         struct perf_cpu_context *cpuctx;
1094         struct perf_counter_context *ctx;
1095         struct task_struct *task;
1096
1097         /*
1098          * If cpu is not a wildcard then this is a percpu counter:
1099          */
1100         if (cpu != -1) {
1101                 /* Must be root to operate on a CPU counter: */
1102                 if (!capable(CAP_SYS_ADMIN))
1103                         return ERR_PTR(-EACCES);
1104
1105                 if (cpu < 0 || cpu > num_possible_cpus())
1106                         return ERR_PTR(-EINVAL);
1107
1108                 /*
1109                  * We could be clever and allow to attach a counter to an
1110                  * offline CPU and activate it when the CPU comes up, but
1111                  * that's for later.
1112                  */
1113                 if (!cpu_isset(cpu, cpu_online_map))
1114                         return ERR_PTR(-ENODEV);
1115
1116                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1117                 ctx = &cpuctx->ctx;
1118
1119                 return ctx;
1120         }
1121
1122         rcu_read_lock();
1123         if (!pid)
1124                 task = current;
1125         else
1126                 task = find_task_by_vpid(pid);
1127         if (task)
1128                 get_task_struct(task);
1129         rcu_read_unlock();
1130
1131         if (!task)
1132                 return ERR_PTR(-ESRCH);
1133
1134         ctx = &task->perf_counter_ctx;
1135         ctx->task = task;
1136
1137         /* Reuse ptrace permission checks for now. */
1138         if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1139                 put_context(ctx);
1140                 return ERR_PTR(-EACCES);
1141         }
1142
1143         return ctx;
1144 }
1145
1146 static void free_counter_rcu(struct rcu_head *head)
1147 {
1148         struct perf_counter *counter;
1149
1150         counter = container_of(head, struct perf_counter, rcu_head);
1151         kfree(counter);
1152 }
1153
1154 static void free_counter(struct perf_counter *counter)
1155 {
1156         if (counter->destroy)
1157                 counter->destroy(counter);
1158
1159         call_rcu(&counter->rcu_head, free_counter_rcu);
1160 }
1161
1162 /*
1163  * Called when the last reference to the file is gone.
1164  */
1165 static int perf_release(struct inode *inode, struct file *file)
1166 {
1167         struct perf_counter *counter = file->private_data;
1168         struct perf_counter_context *ctx = counter->ctx;
1169
1170         file->private_data = NULL;
1171
1172         mutex_lock(&ctx->mutex);
1173         mutex_lock(&counter->mutex);
1174
1175         perf_counter_remove_from_context(counter);
1176
1177         mutex_unlock(&counter->mutex);
1178         mutex_unlock(&ctx->mutex);
1179
1180         free_counter(counter);
1181         put_context(ctx);
1182
1183         return 0;
1184 }
1185
1186 /*
1187  * Read the performance counter - simple non blocking version for now
1188  */
1189 static ssize_t
1190 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1191 {
1192         u64 cntval;
1193
1194         if (count != sizeof(cntval))
1195                 return -EINVAL;
1196
1197         /*
1198          * Return end-of-file for a read on a counter that is in
1199          * error state (i.e. because it was pinned but it couldn't be
1200          * scheduled on to the CPU at some point).
1201          */
1202         if (counter->state == PERF_COUNTER_STATE_ERROR)
1203                 return 0;
1204
1205         mutex_lock(&counter->mutex);
1206         cntval = perf_counter_read(counter);
1207         mutex_unlock(&counter->mutex);
1208
1209         return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
1210 }
1211
1212 static ssize_t
1213 perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
1214 {
1215         if (!usrdata->len)
1216                 return 0;
1217
1218         count = min(count, (size_t)usrdata->len);
1219         if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
1220                 return -EFAULT;
1221
1222         /* Adjust the counters */
1223         usrdata->len -= count;
1224         if (!usrdata->len)
1225                 usrdata->rd_idx = 0;
1226         else
1227                 usrdata->rd_idx += count;
1228
1229         return count;
1230 }
1231
1232 static ssize_t
1233 perf_read_irq_data(struct perf_counter  *counter,
1234                    char __user          *buf,
1235                    size_t               count,
1236                    int                  nonblocking)
1237 {
1238         struct perf_data *irqdata, *usrdata;
1239         DECLARE_WAITQUEUE(wait, current);
1240         ssize_t res, res2;
1241
1242         irqdata = counter->irqdata;
1243         usrdata = counter->usrdata;
1244
1245         if (usrdata->len + irqdata->len >= count)
1246                 goto read_pending;
1247
1248         if (nonblocking)
1249                 return -EAGAIN;
1250
1251         spin_lock_irq(&counter->waitq.lock);
1252         __add_wait_queue(&counter->waitq, &wait);
1253         for (;;) {
1254                 set_current_state(TASK_INTERRUPTIBLE);
1255                 if (usrdata->len + irqdata->len >= count)
1256                         break;
1257
1258                 if (signal_pending(current))
1259                         break;
1260
1261                 if (counter->state == PERF_COUNTER_STATE_ERROR)
1262                         break;
1263
1264                 spin_unlock_irq(&counter->waitq.lock);
1265                 schedule();
1266                 spin_lock_irq(&counter->waitq.lock);
1267         }
1268         __remove_wait_queue(&counter->waitq, &wait);
1269         __set_current_state(TASK_RUNNING);
1270         spin_unlock_irq(&counter->waitq.lock);
1271
1272         if (usrdata->len + irqdata->len < count &&
1273             counter->state != PERF_COUNTER_STATE_ERROR)
1274                 return -ERESTARTSYS;
1275 read_pending:
1276         mutex_lock(&counter->mutex);
1277
1278         /* Drain pending data first: */
1279         res = perf_copy_usrdata(usrdata, buf, count);
1280         if (res < 0 || res == count)
1281                 goto out;
1282
1283         /* Switch irq buffer: */
1284         usrdata = perf_switch_irq_data(counter);
1285         res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
1286         if (res2 < 0) {
1287                 if (!res)
1288                         res = -EFAULT;
1289         } else {
1290                 res += res2;
1291         }
1292 out:
1293         mutex_unlock(&counter->mutex);
1294
1295         return res;
1296 }
1297
1298 static ssize_t
1299 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1300 {
1301         struct perf_counter *counter = file->private_data;
1302
1303         switch (counter->hw_event.record_type) {
1304         case PERF_RECORD_SIMPLE:
1305                 return perf_read_hw(counter, buf, count);
1306
1307         case PERF_RECORD_IRQ:
1308         case PERF_RECORD_GROUP:
1309                 return perf_read_irq_data(counter, buf, count,
1310                                           file->f_flags & O_NONBLOCK);
1311         }
1312         return -EINVAL;
1313 }
1314
1315 static unsigned int perf_poll(struct file *file, poll_table *wait)
1316 {
1317         struct perf_counter *counter = file->private_data;
1318         unsigned int events = 0;
1319         unsigned long flags;
1320
1321         poll_wait(file, &counter->waitq, wait);
1322
1323         spin_lock_irqsave(&counter->waitq.lock, flags);
1324         if (counter->usrdata->len || counter->irqdata->len)
1325                 events |= POLLIN;
1326         spin_unlock_irqrestore(&counter->waitq.lock, flags);
1327
1328         return events;
1329 }
1330
1331 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1332 {
1333         struct perf_counter *counter = file->private_data;
1334         int err = 0;
1335
1336         switch (cmd) {
1337         case PERF_COUNTER_IOC_ENABLE:
1338                 perf_counter_enable_family(counter);
1339                 break;
1340         case PERF_COUNTER_IOC_DISABLE:
1341                 perf_counter_disable_family(counter);
1342                 break;
1343         default:
1344                 err = -ENOTTY;
1345         }
1346         return err;
1347 }
1348
1349 static const struct file_operations perf_fops = {
1350         .release                = perf_release,
1351         .read                   = perf_read,
1352         .poll                   = perf_poll,
1353         .unlocked_ioctl         = perf_ioctl,
1354         .compat_ioctl           = perf_ioctl,
1355 };
1356
1357 /*
1358  * Output
1359  */
1360
1361 static void perf_counter_store_irq(struct perf_counter *counter, u64 data)
1362 {
1363         struct perf_data *irqdata = counter->irqdata;
1364
1365         if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
1366                 irqdata->overrun++;
1367         } else {
1368                 u64 *p = (u64 *) &irqdata->data[irqdata->len];
1369
1370                 *p = data;
1371                 irqdata->len += sizeof(u64);
1372         }
1373 }
1374
1375 static void perf_counter_handle_group(struct perf_counter *counter)
1376 {
1377         struct perf_counter *leader, *sub;
1378
1379         leader = counter->group_leader;
1380         list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1381                 if (sub != counter)
1382                         sub->hw_ops->read(sub);
1383                 perf_counter_store_irq(counter, sub->hw_event.config);
1384                 perf_counter_store_irq(counter, atomic64_read(&sub->count));
1385         }
1386 }
1387
1388 void perf_counter_output(struct perf_counter *counter,
1389                          int nmi, struct pt_regs *regs)
1390 {
1391         switch (counter->hw_event.record_type) {
1392         case PERF_RECORD_SIMPLE:
1393                 return;
1394
1395         case PERF_RECORD_IRQ:
1396                 perf_counter_store_irq(counter, instruction_pointer(regs));
1397                 break;
1398
1399         case PERF_RECORD_GROUP:
1400                 perf_counter_handle_group(counter);
1401                 break;
1402         }
1403
1404         if (nmi) {
1405                 counter->wakeup_pending = 1;
1406                 set_perf_counter_pending();
1407         } else
1408                 wake_up(&counter->waitq);
1409 }
1410
1411 /*
1412  * Generic software counter infrastructure
1413  */
1414
1415 static void perf_swcounter_update(struct perf_counter *counter)
1416 {
1417         struct hw_perf_counter *hwc = &counter->hw;
1418         u64 prev, now;
1419         s64 delta;
1420
1421 again:
1422         prev = atomic64_read(&hwc->prev_count);
1423         now = atomic64_read(&hwc->count);
1424         if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
1425                 goto again;
1426
1427         delta = now - prev;
1428
1429         atomic64_add(delta, &counter->count);
1430         atomic64_sub(delta, &hwc->period_left);
1431 }
1432
1433 static void perf_swcounter_set_period(struct perf_counter *counter)
1434 {
1435         struct hw_perf_counter *hwc = &counter->hw;
1436         s64 left = atomic64_read(&hwc->period_left);
1437         s64 period = hwc->irq_period;
1438
1439         if (unlikely(left <= -period)) {
1440                 left = period;
1441                 atomic64_set(&hwc->period_left, left);
1442         }
1443
1444         if (unlikely(left <= 0)) {
1445                 left += period;
1446                 atomic64_add(period, &hwc->period_left);
1447         }
1448
1449         atomic64_set(&hwc->prev_count, -left);
1450         atomic64_set(&hwc->count, -left);
1451 }
1452
1453 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
1454 {
1455         struct perf_counter *counter;
1456         struct pt_regs *regs;
1457
1458         counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
1459         counter->hw_ops->read(counter);
1460
1461         regs = get_irq_regs();
1462         /*
1463          * In case we exclude kernel IPs or are somehow not in interrupt
1464          * context, provide the next best thing, the user IP.
1465          */
1466         if ((counter->hw_event.exclude_kernel || !regs) &&
1467                         !counter->hw_event.exclude_user)
1468                 regs = task_pt_regs(current);
1469
1470         if (regs)
1471                 perf_counter_output(counter, 0, regs);
1472
1473         hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
1474
1475         return HRTIMER_RESTART;
1476 }
1477
1478 static void perf_swcounter_overflow(struct perf_counter *counter,
1479                                     int nmi, struct pt_regs *regs)
1480 {
1481         perf_swcounter_update(counter);
1482         perf_swcounter_set_period(counter);
1483         perf_counter_output(counter, nmi, regs);
1484 }
1485
1486 static int perf_swcounter_match(struct perf_counter *counter,
1487                                 enum perf_event_types type,
1488                                 u32 event, struct pt_regs *regs)
1489 {
1490         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1491                 return 0;
1492
1493         if (perf_event_raw(&counter->hw_event))
1494                 return 0;
1495
1496         if (perf_event_type(&counter->hw_event) != type)
1497                 return 0;
1498
1499         if (perf_event_id(&counter->hw_event) != event)
1500                 return 0;
1501
1502         if (counter->hw_event.exclude_user && user_mode(regs))
1503                 return 0;
1504
1505         if (counter->hw_event.exclude_kernel && !user_mode(regs))
1506                 return 0;
1507
1508         return 1;
1509 }
1510
1511 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
1512                                int nmi, struct pt_regs *regs)
1513 {
1514         int neg = atomic64_add_negative(nr, &counter->hw.count);
1515         if (counter->hw.irq_period && !neg)
1516                 perf_swcounter_overflow(counter, nmi, regs);
1517 }
1518
1519 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
1520                                      enum perf_event_types type, u32 event,
1521                                      u64 nr, int nmi, struct pt_regs *regs)
1522 {
1523         struct perf_counter *counter;
1524
1525         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
1526                 return;
1527
1528         rcu_read_lock();
1529         list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
1530                 if (perf_swcounter_match(counter, type, event, regs))
1531                         perf_swcounter_add(counter, nr, nmi, regs);
1532         }
1533         rcu_read_unlock();
1534 }
1535
1536 static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
1537 {
1538         if (in_nmi())
1539                 return &cpuctx->recursion[3];
1540
1541         if (in_irq())
1542                 return &cpuctx->recursion[2];
1543
1544         if (in_softirq())
1545                 return &cpuctx->recursion[1];
1546
1547         return &cpuctx->recursion[0];
1548 }
1549
1550 static void __perf_swcounter_event(enum perf_event_types type, u32 event,
1551                                    u64 nr, int nmi, struct pt_regs *regs)
1552 {
1553         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
1554         int *recursion = perf_swcounter_recursion_context(cpuctx);
1555
1556         if (*recursion)
1557                 goto out;
1558
1559         (*recursion)++;
1560         barrier();
1561
1562         perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
1563         if (cpuctx->task_ctx) {
1564                 perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
1565                                 nr, nmi, regs);
1566         }
1567
1568         barrier();
1569         (*recursion)--;
1570
1571 out:
1572         put_cpu_var(perf_cpu_context);
1573 }
1574
1575 void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)
1576 {
1577         __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs);
1578 }
1579
1580 static void perf_swcounter_read(struct perf_counter *counter)
1581 {
1582         perf_swcounter_update(counter);
1583 }
1584
1585 static int perf_swcounter_enable(struct perf_counter *counter)
1586 {
1587         perf_swcounter_set_period(counter);
1588         return 0;
1589 }
1590
1591 static void perf_swcounter_disable(struct perf_counter *counter)
1592 {
1593         perf_swcounter_update(counter);
1594 }
1595
1596 static const struct hw_perf_counter_ops perf_ops_generic = {
1597         .enable         = perf_swcounter_enable,
1598         .disable        = perf_swcounter_disable,
1599         .read           = perf_swcounter_read,
1600 };
1601
1602 /*
1603  * Software counter: cpu wall time clock
1604  */
1605
1606 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
1607 {
1608         int cpu = raw_smp_processor_id();
1609         s64 prev;
1610         u64 now;
1611
1612         now = cpu_clock(cpu);
1613         prev = atomic64_read(&counter->hw.prev_count);
1614         atomic64_set(&counter->hw.prev_count, now);
1615         atomic64_add(now - prev, &counter->count);
1616 }
1617
1618 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
1619 {
1620         struct hw_perf_counter *hwc = &counter->hw;
1621         int cpu = raw_smp_processor_id();
1622
1623         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
1624         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1625         hwc->hrtimer.function = perf_swcounter_hrtimer;
1626         if (hwc->irq_period) {
1627                 __hrtimer_start_range_ns(&hwc->hrtimer,
1628                                 ns_to_ktime(hwc->irq_period), 0,
1629                                 HRTIMER_MODE_REL, 0);
1630         }
1631
1632         return 0;
1633 }
1634
1635 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
1636 {
1637         hrtimer_cancel(&counter->hw.hrtimer);
1638         cpu_clock_perf_counter_update(counter);
1639 }
1640
1641 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
1642 {
1643         cpu_clock_perf_counter_update(counter);
1644 }
1645
1646 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
1647         .enable         = cpu_clock_perf_counter_enable,
1648         .disable        = cpu_clock_perf_counter_disable,
1649         .read           = cpu_clock_perf_counter_read,
1650 };
1651
1652 /*
1653  * Software counter: task time clock
1654  */
1655
1656 /*
1657  * Called from within the scheduler:
1658  */
1659 static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
1660 {
1661         struct task_struct *curr = counter->task;
1662         u64 delta;
1663
1664         delta = __task_delta_exec(curr, update);
1665
1666         return curr->se.sum_exec_runtime + delta;
1667 }
1668
1669 static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
1670 {
1671         u64 prev;
1672         s64 delta;
1673
1674         prev = atomic64_read(&counter->hw.prev_count);
1675
1676         atomic64_set(&counter->hw.prev_count, now);
1677
1678         delta = now - prev;
1679
1680         atomic64_add(delta, &counter->count);
1681 }
1682
1683 static int task_clock_perf_counter_enable(struct perf_counter *counter)
1684 {
1685         struct hw_perf_counter *hwc = &counter->hw;
1686
1687         atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0));
1688         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1689         hwc->hrtimer.function = perf_swcounter_hrtimer;
1690         if (hwc->irq_period) {
1691                 __hrtimer_start_range_ns(&hwc->hrtimer,
1692                                 ns_to_ktime(hwc->irq_period), 0,
1693                                 HRTIMER_MODE_REL, 0);
1694         }
1695
1696         return 0;
1697 }
1698
1699 static void task_clock_perf_counter_disable(struct perf_counter *counter)
1700 {
1701         hrtimer_cancel(&counter->hw.hrtimer);
1702         task_clock_perf_counter_update(counter,
1703                         task_clock_perf_counter_val(counter, 0));
1704 }
1705
1706 static void task_clock_perf_counter_read(struct perf_counter *counter)
1707 {
1708         task_clock_perf_counter_update(counter,
1709                         task_clock_perf_counter_val(counter, 1));
1710 }
1711
1712 static const struct hw_perf_counter_ops perf_ops_task_clock = {
1713         .enable         = task_clock_perf_counter_enable,
1714         .disable        = task_clock_perf_counter_disable,
1715         .read           = task_clock_perf_counter_read,
1716 };
1717
1718 /*
1719  * Software counter: cpu migrations
1720  */
1721
1722 static inline u64 get_cpu_migrations(struct perf_counter *counter)
1723 {
1724         struct task_struct *curr = counter->ctx->task;
1725
1726         if (curr)
1727                 return curr->se.nr_migrations;
1728         return cpu_nr_migrations(smp_processor_id());
1729 }
1730
1731 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1732 {
1733         u64 prev, now;
1734         s64 delta;
1735
1736         prev = atomic64_read(&counter->hw.prev_count);
1737         now = get_cpu_migrations(counter);
1738
1739         atomic64_set(&counter->hw.prev_count, now);
1740
1741         delta = now - prev;
1742
1743         atomic64_add(delta, &counter->count);
1744 }
1745
1746 static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
1747 {
1748         cpu_migrations_perf_counter_update(counter);
1749 }
1750
1751 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1752 {
1753         if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
1754                 atomic64_set(&counter->hw.prev_count,
1755                              get_cpu_migrations(counter));
1756         return 0;
1757 }
1758
1759 static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
1760 {
1761         cpu_migrations_perf_counter_update(counter);
1762 }
1763
1764 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1765         .enable         = cpu_migrations_perf_counter_enable,
1766         .disable        = cpu_migrations_perf_counter_disable,
1767         .read           = cpu_migrations_perf_counter_read,
1768 };
1769
1770 #ifdef CONFIG_EVENT_PROFILE
1771 void perf_tpcounter_event(int event_id)
1772 {
1773         struct pt_regs *regs = get_irq_regs();
1774
1775         if (!regs)
1776                 regs = task_pt_regs(current);
1777
1778         __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs);
1779 }
1780
1781 extern int ftrace_profile_enable(int);
1782 extern void ftrace_profile_disable(int);
1783
1784 static void tp_perf_counter_destroy(struct perf_counter *counter)
1785 {
1786         ftrace_profile_disable(perf_event_id(&counter->hw_event));
1787 }
1788
1789 static const struct hw_perf_counter_ops *
1790 tp_perf_counter_init(struct perf_counter *counter)
1791 {
1792         int event_id = perf_event_id(&counter->hw_event);
1793         int ret;
1794
1795         ret = ftrace_profile_enable(event_id);
1796         if (ret)
1797                 return NULL;
1798
1799         counter->destroy = tp_perf_counter_destroy;
1800         counter->hw.irq_period = counter->hw_event.irq_period;
1801
1802         return &perf_ops_generic;
1803 }
1804 #else
1805 static const struct hw_perf_counter_ops *
1806 tp_perf_counter_init(struct perf_counter *counter)
1807 {
1808         return NULL;
1809 }
1810 #endif
1811
1812 static const struct hw_perf_counter_ops *
1813 sw_perf_counter_init(struct perf_counter *counter)
1814 {
1815         struct perf_counter_hw_event *hw_event = &counter->hw_event;
1816         const struct hw_perf_counter_ops *hw_ops = NULL;
1817         struct hw_perf_counter *hwc = &counter->hw;
1818
1819         /*
1820          * Software counters (currently) can't in general distinguish
1821          * between user, kernel and hypervisor events.
1822          * However, context switches and cpu migrations are considered
1823          * to be kernel events, and page faults are never hypervisor
1824          * events.
1825          */
1826         switch (perf_event_id(&counter->hw_event)) {
1827         case PERF_COUNT_CPU_CLOCK:
1828                 hw_ops = &perf_ops_cpu_clock;
1829
1830                 if (hw_event->irq_period && hw_event->irq_period < 10000)
1831                         hw_event->irq_period = 10000;
1832                 break;
1833         case PERF_COUNT_TASK_CLOCK:
1834                 /*
1835                  * If the user instantiates this as a per-cpu counter,
1836                  * use the cpu_clock counter instead.
1837                  */
1838                 if (counter->ctx->task)
1839                         hw_ops = &perf_ops_task_clock;
1840                 else
1841                         hw_ops = &perf_ops_cpu_clock;
1842
1843                 if (hw_event->irq_period && hw_event->irq_period < 10000)
1844                         hw_event->irq_period = 10000;
1845                 break;
1846         case PERF_COUNT_PAGE_FAULTS:
1847         case PERF_COUNT_PAGE_FAULTS_MIN:
1848         case PERF_COUNT_PAGE_FAULTS_MAJ:
1849         case PERF_COUNT_CONTEXT_SWITCHES:
1850                 hw_ops = &perf_ops_generic;
1851                 break;
1852         case PERF_COUNT_CPU_MIGRATIONS:
1853                 if (!counter->hw_event.exclude_kernel)
1854                         hw_ops = &perf_ops_cpu_migrations;
1855                 break;
1856         }
1857
1858         if (hw_ops)
1859                 hwc->irq_period = hw_event->irq_period;
1860
1861         return hw_ops;
1862 }
1863
1864 /*
1865  * Allocate and initialize a counter structure
1866  */
1867 static struct perf_counter *
1868 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1869                    int cpu,
1870                    struct perf_counter_context *ctx,
1871                    struct perf_counter *group_leader,
1872                    gfp_t gfpflags)
1873 {
1874         const struct hw_perf_counter_ops *hw_ops;
1875         struct perf_counter *counter;
1876
1877         counter = kzalloc(sizeof(*counter), gfpflags);
1878         if (!counter)
1879                 return NULL;
1880
1881         /*
1882          * Single counters are their own group leaders, with an
1883          * empty sibling list:
1884          */
1885         if (!group_leader)
1886                 group_leader = counter;
1887
1888         mutex_init(&counter->mutex);
1889         INIT_LIST_HEAD(&counter->list_entry);
1890         INIT_LIST_HEAD(&counter->event_entry);
1891         INIT_LIST_HEAD(&counter->sibling_list);
1892         init_waitqueue_head(&counter->waitq);
1893
1894         INIT_LIST_HEAD(&counter->child_list);
1895
1896         counter->irqdata                = &counter->data[0];
1897         counter->usrdata                = &counter->data[1];
1898         counter->cpu                    = cpu;
1899         counter->hw_event               = *hw_event;
1900         counter->wakeup_pending         = 0;
1901         counter->group_leader           = group_leader;
1902         counter->hw_ops                 = NULL;
1903         counter->ctx                    = ctx;
1904
1905         counter->state = PERF_COUNTER_STATE_INACTIVE;
1906         if (hw_event->disabled)
1907                 counter->state = PERF_COUNTER_STATE_OFF;
1908
1909         hw_ops = NULL;
1910
1911         if (perf_event_raw(hw_event)) {
1912                 hw_ops = hw_perf_counter_init(counter);
1913                 goto done;
1914         }
1915
1916         switch (perf_event_type(hw_event)) {
1917         case PERF_TYPE_HARDWARE:
1918                 hw_ops = hw_perf_counter_init(counter);
1919                 break;
1920
1921         case PERF_TYPE_SOFTWARE:
1922                 hw_ops = sw_perf_counter_init(counter);
1923                 break;
1924
1925         case PERF_TYPE_TRACEPOINT:
1926                 hw_ops = tp_perf_counter_init(counter);
1927                 break;
1928         }
1929
1930         if (!hw_ops) {
1931                 kfree(counter);
1932                 return NULL;
1933         }
1934 done:
1935         counter->hw_ops = hw_ops;
1936
1937         return counter;
1938 }
1939
1940 /**
1941  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
1942  *
1943  * @hw_event_uptr:      event type attributes for monitoring/sampling
1944  * @pid:                target pid
1945  * @cpu:                target cpu
1946  * @group_fd:           group leader counter fd
1947  */
1948 SYSCALL_DEFINE5(perf_counter_open,
1949                 const struct perf_counter_hw_event __user *, hw_event_uptr,
1950                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
1951 {
1952         struct perf_counter *counter, *group_leader;
1953         struct perf_counter_hw_event hw_event;
1954         struct perf_counter_context *ctx;
1955         struct file *counter_file = NULL;
1956         struct file *group_file = NULL;
1957         int fput_needed = 0;
1958         int fput_needed2 = 0;
1959         int ret;
1960
1961         /* for future expandability... */
1962         if (flags)
1963                 return -EINVAL;
1964
1965         if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1966                 return -EFAULT;
1967
1968         /*
1969          * Get the target context (task or percpu):
1970          */
1971         ctx = find_get_context(pid, cpu);
1972         if (IS_ERR(ctx))
1973                 return PTR_ERR(ctx);
1974
1975         /*
1976          * Look up the group leader (we will attach this counter to it):
1977          */
1978         group_leader = NULL;
1979         if (group_fd != -1) {
1980                 ret = -EINVAL;
1981                 group_file = fget_light(group_fd, &fput_needed);
1982                 if (!group_file)
1983                         goto err_put_context;
1984                 if (group_file->f_op != &perf_fops)
1985                         goto err_put_context;
1986
1987                 group_leader = group_file->private_data;
1988                 /*
1989                  * Do not allow a recursive hierarchy (this new sibling
1990                  * becoming part of another group-sibling):
1991                  */
1992                 if (group_leader->group_leader != group_leader)
1993                         goto err_put_context;
1994                 /*
1995                  * Do not allow to attach to a group in a different
1996                  * task or CPU context:
1997                  */
1998                 if (group_leader->ctx != ctx)
1999                         goto err_put_context;
2000                 /*
2001                  * Only a group leader can be exclusive or pinned
2002                  */
2003                 if (hw_event.exclusive || hw_event.pinned)
2004                         goto err_put_context;
2005         }
2006
2007         ret = -EINVAL;
2008         counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
2009                                      GFP_KERNEL);
2010         if (!counter)
2011                 goto err_put_context;
2012
2013         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
2014         if (ret < 0)
2015                 goto err_free_put_context;
2016
2017         counter_file = fget_light(ret, &fput_needed2);
2018         if (!counter_file)
2019                 goto err_free_put_context;
2020
2021         counter->filp = counter_file;
2022         mutex_lock(&ctx->mutex);
2023         perf_install_in_context(ctx, counter, cpu);
2024         mutex_unlock(&ctx->mutex);
2025
2026         fput_light(counter_file, fput_needed2);
2027
2028 out_fput:
2029         fput_light(group_file, fput_needed);
2030
2031         return ret;
2032
2033 err_free_put_context:
2034         kfree(counter);
2035
2036 err_put_context:
2037         put_context(ctx);
2038
2039         goto out_fput;
2040 }
2041
2042 /*
2043  * Initialize the perf_counter context in a task_struct:
2044  */
2045 static void
2046 __perf_counter_init_context(struct perf_counter_context *ctx,
2047                             struct task_struct *task)
2048 {
2049         memset(ctx, 0, sizeof(*ctx));
2050         spin_lock_init(&ctx->lock);
2051         mutex_init(&ctx->mutex);
2052         INIT_LIST_HEAD(&ctx->counter_list);
2053         INIT_LIST_HEAD(&ctx->event_list);
2054         ctx->task = task;
2055 }
2056
2057 /*
2058  * inherit a counter from parent task to child task:
2059  */
2060 static struct perf_counter *
2061 inherit_counter(struct perf_counter *parent_counter,
2062               struct task_struct *parent,
2063               struct perf_counter_context *parent_ctx,
2064               struct task_struct *child,
2065               struct perf_counter *group_leader,
2066               struct perf_counter_context *child_ctx)
2067 {
2068         struct perf_counter *child_counter;
2069
2070         /*
2071          * Instead of creating recursive hierarchies of counters,
2072          * we link inherited counters back to the original parent,
2073          * which has a filp for sure, which we use as the reference
2074          * count:
2075          */
2076         if (parent_counter->parent)
2077                 parent_counter = parent_counter->parent;
2078
2079         child_counter = perf_counter_alloc(&parent_counter->hw_event,
2080                                            parent_counter->cpu, child_ctx,
2081                                            group_leader, GFP_KERNEL);
2082         if (!child_counter)
2083                 return NULL;
2084
2085         /*
2086          * Link it up in the child's context:
2087          */
2088         child_counter->task = child;
2089         list_add_counter(child_counter, child_ctx);
2090         child_ctx->nr_counters++;
2091
2092         child_counter->parent = parent_counter;
2093         /*
2094          * inherit into child's child as well:
2095          */
2096         child_counter->hw_event.inherit = 1;
2097
2098         /*
2099          * Get a reference to the parent filp - we will fput it
2100          * when the child counter exits. This is safe to do because
2101          * we are in the parent and we know that the filp still
2102          * exists and has a nonzero count:
2103          */
2104         atomic_long_inc(&parent_counter->filp->f_count);
2105
2106         /*
2107          * Link this into the parent counter's child list
2108          */
2109         mutex_lock(&parent_counter->mutex);
2110         list_add_tail(&child_counter->child_list, &parent_counter->child_list);
2111
2112         /*
2113          * Make the child state follow the state of the parent counter,
2114          * not its hw_event.disabled bit.  We hold the parent's mutex,
2115          * so we won't race with perf_counter_{en,dis}able_family.
2116          */
2117         if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
2118                 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
2119         else
2120                 child_counter->state = PERF_COUNTER_STATE_OFF;
2121
2122         mutex_unlock(&parent_counter->mutex);
2123
2124         return child_counter;
2125 }
2126
2127 static int inherit_group(struct perf_counter *parent_counter,
2128               struct task_struct *parent,
2129               struct perf_counter_context *parent_ctx,
2130               struct task_struct *child,
2131               struct perf_counter_context *child_ctx)
2132 {
2133         struct perf_counter *leader;
2134         struct perf_counter *sub;
2135
2136         leader = inherit_counter(parent_counter, parent, parent_ctx,
2137                                  child, NULL, child_ctx);
2138         if (!leader)
2139                 return -ENOMEM;
2140         list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
2141                 if (!inherit_counter(sub, parent, parent_ctx,
2142                                      child, leader, child_ctx))
2143                         return -ENOMEM;
2144         }
2145         return 0;
2146 }
2147
2148 static void sync_child_counter(struct perf_counter *child_counter,
2149                                struct perf_counter *parent_counter)
2150 {
2151         u64 parent_val, child_val;
2152
2153         parent_val = atomic64_read(&parent_counter->count);
2154         child_val = atomic64_read(&child_counter->count);
2155
2156         /*
2157          * Add back the child's count to the parent's count:
2158          */
2159         atomic64_add(child_val, &parent_counter->count);
2160
2161         /*
2162          * Remove this counter from the parent's list
2163          */
2164         mutex_lock(&parent_counter->mutex);
2165         list_del_init(&child_counter->child_list);
2166         mutex_unlock(&parent_counter->mutex);
2167
2168         /*
2169          * Release the parent counter, if this was the last
2170          * reference to it.
2171          */
2172         fput(parent_counter->filp);
2173 }
2174
2175 static void
2176 __perf_counter_exit_task(struct task_struct *child,
2177                          struct perf_counter *child_counter,
2178                          struct perf_counter_context *child_ctx)
2179 {
2180         struct perf_counter *parent_counter;
2181         struct perf_counter *sub, *tmp;
2182
2183         /*
2184          * If we do not self-reap then we have to wait for the
2185          * child task to unschedule (it will happen for sure),
2186          * so that its counter is at its final count. (This
2187          * condition triggers rarely - child tasks usually get
2188          * off their CPU before the parent has a chance to
2189          * get this far into the reaping action)
2190          */
2191         if (child != current) {
2192                 wait_task_inactive(child, 0);
2193                 list_del_init(&child_counter->list_entry);
2194         } else {
2195                 struct perf_cpu_context *cpuctx;
2196                 unsigned long flags;
2197                 u64 perf_flags;
2198
2199                 /*
2200                  * Disable and unlink this counter.
2201                  *
2202                  * Be careful about zapping the list - IRQ/NMI context
2203                  * could still be processing it:
2204                  */
2205                 curr_rq_lock_irq_save(&flags);
2206                 perf_flags = hw_perf_save_disable();
2207
2208                 cpuctx = &__get_cpu_var(perf_cpu_context);
2209
2210                 group_sched_out(child_counter, cpuctx, child_ctx);
2211
2212                 list_del_init(&child_counter->list_entry);
2213
2214                 child_ctx->nr_counters--;
2215
2216                 hw_perf_restore(perf_flags);
2217                 curr_rq_unlock_irq_restore(&flags);
2218         }
2219
2220         parent_counter = child_counter->parent;
2221         /*
2222          * It can happen that parent exits first, and has counters
2223          * that are still around due to the child reference. These
2224          * counters need to be zapped - but otherwise linger.
2225          */
2226         if (parent_counter) {
2227                 sync_child_counter(child_counter, parent_counter);
2228                 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
2229                                          list_entry) {
2230                         if (sub->parent) {
2231                                 sync_child_counter(sub, sub->parent);
2232                                 free_counter(sub);
2233                         }
2234                 }
2235                 free_counter(child_counter);
2236         }
2237 }
2238
2239 /*
2240  * When a child task exits, feed back counter values to parent counters.
2241  *
2242  * Note: we may be running in child context, but the PID is not hashed
2243  * anymore so new counters will not be added.
2244  */
2245 void perf_counter_exit_task(struct task_struct *child)
2246 {
2247         struct perf_counter *child_counter, *tmp;
2248         struct perf_counter_context *child_ctx;
2249
2250         child_ctx = &child->perf_counter_ctx;
2251
2252         if (likely(!child_ctx->nr_counters))
2253                 return;
2254
2255         list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
2256                                  list_entry)
2257                 __perf_counter_exit_task(child, child_counter, child_ctx);
2258 }
2259
2260 /*
2261  * Initialize the perf_counter context in task_struct
2262  */
2263 void perf_counter_init_task(struct task_struct *child)
2264 {
2265         struct perf_counter_context *child_ctx, *parent_ctx;
2266         struct perf_counter *counter;
2267         struct task_struct *parent = current;
2268
2269         child_ctx  =  &child->perf_counter_ctx;
2270         parent_ctx = &parent->perf_counter_ctx;
2271
2272         __perf_counter_init_context(child_ctx, child);
2273
2274         /*
2275          * This is executed from the parent task context, so inherit
2276          * counters that have been marked for cloning:
2277          */
2278
2279         if (likely(!parent_ctx->nr_counters))
2280                 return;
2281
2282         /*
2283          * Lock the parent list. No need to lock the child - not PID
2284          * hashed yet and not running, so nobody can access it.
2285          */
2286         mutex_lock(&parent_ctx->mutex);
2287
2288         /*
2289          * We dont have to disable NMIs - we are only looking at
2290          * the list, not manipulating it:
2291          */
2292         list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
2293                 if (!counter->hw_event.inherit)
2294                         continue;
2295
2296                 if (inherit_group(counter, parent,
2297                                   parent_ctx, child, child_ctx))
2298                         break;
2299         }
2300
2301         mutex_unlock(&parent_ctx->mutex);
2302 }
2303
2304 static void __cpuinit perf_counter_init_cpu(int cpu)
2305 {
2306         struct perf_cpu_context *cpuctx;
2307
2308         cpuctx = &per_cpu(perf_cpu_context, cpu);
2309         __perf_counter_init_context(&cpuctx->ctx, NULL);
2310
2311         mutex_lock(&perf_resource_mutex);
2312         cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
2313         mutex_unlock(&perf_resource_mutex);
2314
2315         hw_perf_counter_setup(cpu);
2316 }
2317
2318 #ifdef CONFIG_HOTPLUG_CPU
2319 static void __perf_counter_exit_cpu(void *info)
2320 {
2321         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
2322         struct perf_counter_context *ctx = &cpuctx->ctx;
2323         struct perf_counter *counter, *tmp;
2324
2325         list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
2326                 __perf_counter_remove_from_context(counter);
2327 }
2328 static void perf_counter_exit_cpu(int cpu)
2329 {
2330         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
2331         struct perf_counter_context *ctx = &cpuctx->ctx;
2332
2333         mutex_lock(&ctx->mutex);
2334         smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
2335         mutex_unlock(&ctx->mutex);
2336 }
2337 #else
2338 static inline void perf_counter_exit_cpu(int cpu) { }
2339 #endif
2340
2341 static int __cpuinit
2342 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
2343 {
2344         unsigned int cpu = (long)hcpu;
2345
2346         switch (action) {
2347
2348         case CPU_UP_PREPARE:
2349         case CPU_UP_PREPARE_FROZEN:
2350                 perf_counter_init_cpu(cpu);
2351                 break;
2352
2353         case CPU_DOWN_PREPARE:
2354         case CPU_DOWN_PREPARE_FROZEN:
2355                 perf_counter_exit_cpu(cpu);
2356                 break;
2357
2358         default:
2359                 break;
2360         }
2361
2362         return NOTIFY_OK;
2363 }
2364
2365 static struct notifier_block __cpuinitdata perf_cpu_nb = {
2366         .notifier_call          = perf_cpu_notify,
2367 };
2368
2369 static int __init perf_counter_init(void)
2370 {
2371         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
2372                         (void *)(long)smp_processor_id());
2373         register_cpu_notifier(&perf_cpu_nb);
2374
2375         return 0;
2376 }
2377 early_initcall(perf_counter_init);
2378
2379 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
2380 {
2381         return sprintf(buf, "%d\n", perf_reserved_percpu);
2382 }
2383
2384 static ssize_t
2385 perf_set_reserve_percpu(struct sysdev_class *class,
2386                         const char *buf,
2387                         size_t count)
2388 {
2389         struct perf_cpu_context *cpuctx;
2390         unsigned long val;
2391         int err, cpu, mpt;
2392
2393         err = strict_strtoul(buf, 10, &val);
2394         if (err)
2395                 return err;
2396         if (val > perf_max_counters)
2397                 return -EINVAL;
2398
2399         mutex_lock(&perf_resource_mutex);
2400         perf_reserved_percpu = val;
2401         for_each_online_cpu(cpu) {
2402                 cpuctx = &per_cpu(perf_cpu_context, cpu);
2403                 spin_lock_irq(&cpuctx->ctx.lock);
2404                 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
2405                           perf_max_counters - perf_reserved_percpu);
2406                 cpuctx->max_pertask = mpt;
2407                 spin_unlock_irq(&cpuctx->ctx.lock);
2408         }
2409         mutex_unlock(&perf_resource_mutex);
2410
2411         return count;
2412 }
2413
2414 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
2415 {
2416         return sprintf(buf, "%d\n", perf_overcommit);
2417 }
2418
2419 static ssize_t
2420 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
2421 {
2422         unsigned long val;
2423         int err;
2424
2425         err = strict_strtoul(buf, 10, &val);
2426         if (err)
2427                 return err;
2428         if (val > 1)
2429                 return -EINVAL;
2430
2431         mutex_lock(&perf_resource_mutex);
2432         perf_overcommit = val;
2433         mutex_unlock(&perf_resource_mutex);
2434
2435         return count;
2436 }
2437
2438 static SYSDEV_CLASS_ATTR(
2439                                 reserve_percpu,
2440                                 0644,
2441                                 perf_show_reserve_percpu,
2442                                 perf_set_reserve_percpu
2443                         );
2444
2445 static SYSDEV_CLASS_ATTR(
2446                                 overcommit,
2447                                 0644,
2448                                 perf_show_overcommit,
2449                                 perf_set_overcommit
2450                         );
2451
2452 static struct attribute *perfclass_attrs[] = {
2453         &attr_reserve_percpu.attr,
2454         &attr_overcommit.attr,
2455         NULL
2456 };
2457
2458 static struct attribute_group perfclass_attr_group = {
2459         .attrs                  = perfclass_attrs,
2460         .name                   = "perf_counters",
2461 };
2462
2463 static int __init perf_counter_sysfs_init(void)
2464 {
2465         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
2466                                   &perfclass_attr_group);
2467 }
2468 device_initcall(perf_counter_sysfs_init);