perf_counter: Collapse inherit on read()
[linux-2.6] / kernel / perf_counter.c
1 /*
2  * Performance counter core code
3  *
4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8  *
9  *  For licensing details see kernel-base/COPYING
10  */
11
12 #include <linux/fs.h>
13 #include <linux/mm.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/file.h>
17 #include <linux/poll.h>
18 #include <linux/sysfs.h>
19 #include <linux/dcache.h>
20 #include <linux/percpu.h>
21 #include <linux/ptrace.h>
22 #include <linux/vmstat.h>
23 #include <linux/hardirq.h>
24 #include <linux/rculist.h>
25 #include <linux/uaccess.h>
26 #include <linux/syscalls.h>
27 #include <linux/anon_inodes.h>
28 #include <linux/kernel_stat.h>
29 #include <linux/perf_counter.h>
30
31 #include <asm/irq_regs.h>
32
33 /*
34  * Each CPU has a list of per CPU counters:
35  */
36 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38 int perf_max_counters __read_mostly = 1;
39 static int perf_reserved_percpu __read_mostly;
40 static int perf_overcommit __read_mostly = 1;
41
42 static atomic_t nr_counters __read_mostly;
43 static atomic_t nr_mmap_counters __read_mostly;
44 static atomic_t nr_comm_counters __read_mostly;
45
46 /*
47  * perf counter paranoia level:
48  *  0 - not paranoid
49  *  1 - disallow cpu counters to unpriv
50  *  2 - disallow kernel profiling to unpriv
51  */
52 int sysctl_perf_counter_paranoid __read_mostly;
53
54 static inline bool perf_paranoid_cpu(void)
55 {
56         return sysctl_perf_counter_paranoid > 0;
57 }
58
59 static inline bool perf_paranoid_kernel(void)
60 {
61         return sysctl_perf_counter_paranoid > 1;
62 }
63
64 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
65
66 /*
67  * max perf counter sample rate
68  */
69 int sysctl_perf_counter_sample_rate __read_mostly = 100000;
70
71 static atomic64_t perf_counter_id;
72
73 /*
74  * Lock for (sysadmin-configurable) counter reservations:
75  */
76 static DEFINE_SPINLOCK(perf_resource_lock);
77
78 /*
79  * Architecture provided APIs - weak aliases:
80  */
81 extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
82 {
83         return NULL;
84 }
85
86 void __weak hw_perf_disable(void)               { barrier(); }
87 void __weak hw_perf_enable(void)                { barrier(); }
88
89 void __weak hw_perf_counter_setup(int cpu)      { barrier(); }
90
91 int __weak
92 hw_perf_group_sched_in(struct perf_counter *group_leader,
93                struct perf_cpu_context *cpuctx,
94                struct perf_counter_context *ctx, int cpu)
95 {
96         return 0;
97 }
98
99 void __weak perf_counter_print_debug(void)      { }
100
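/*
 * disable_count is a per-cpu nesting count for perf_disable()/perf_enable():
 * hw_perf_disable() is invoked on every perf_disable(), but hw_perf_enable()
 * only runs once the outermost perf_enable() brings the count back to zero.
 */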
101 static DEFINE_PER_CPU(int, disable_count);
102
103 void __perf_disable(void)
104 {
105         __get_cpu_var(disable_count)++;
106 }
107
108 bool __perf_enable(void)
109 {
110         return !--__get_cpu_var(disable_count);
111 }
112
113 void perf_disable(void)
114 {
115         __perf_disable();
116         hw_perf_disable();
117 }
118
119 void perf_enable(void)
120 {
121         if (__perf_enable())
122                 hw_perf_enable();
123 }
124
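/*
 * Context reference counting: get_ctx() takes an extra reference on a
 * context that is already live (hence the WARN if the count was zero),
 * while put_ctx() drops one and, on the final put, releases the parent
 * context and task references and frees the context after an RCU grace
 * period so lock-free readers under rcu_read_lock() stay safe.
 */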
125 static void get_ctx(struct perf_counter_context *ctx)
126 {
127         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
128 }
129
130 static void free_ctx(struct rcu_head *head)
131 {
132         struct perf_counter_context *ctx;
133
134         ctx = container_of(head, struct perf_counter_context, rcu_head);
135         kfree(ctx);
136 }
137
138 static void put_ctx(struct perf_counter_context *ctx)
139 {
140         if (atomic_dec_and_test(&ctx->refcount)) {
141                 if (ctx->parent_ctx)
142                         put_ctx(ctx->parent_ctx);
143                 if (ctx->task)
144                         put_task_struct(ctx->task);
145                 call_rcu(&ctx->rcu_head, free_ctx);
146         }
147 }
148
149 static void unclone_ctx(struct perf_counter_context *ctx)
150 {
151         if (ctx->parent_ctx) {
152                 put_ctx(ctx->parent_ctx);
153                 ctx->parent_ctx = NULL;
154         }
155 }
156
157 /*
158  * If we inherit counters we want to return the parent counter id
159  * to userspace.
160  */
161 static u64 primary_counter_id(struct perf_counter *counter)
162 {
163         u64 id = counter->id;
164
165         if (counter->parent)
166                 id = counter->parent->id;
167
168         return id;
169 }
170
171 /*
172  * Get the perf_counter_context for a task and lock it.
173  * This has to cope with the fact that until it is locked,
174  * the context could get moved to another task.
175  */
176 static struct perf_counter_context *
177 perf_lock_task_context(struct task_struct *task, unsigned long *flags)
178 {
179         struct perf_counter_context *ctx;
180
181         rcu_read_lock();
182  retry:
183         ctx = rcu_dereference(task->perf_counter_ctxp);
184         if (ctx) {
185                 /*
186                  * If this context is a clone of another, it might
187                  * get swapped for another underneath us by
188                  * perf_counter_task_sched_out, though the
189                  * rcu_read_lock() protects us from any context
190                  * getting freed.  Lock the context and check if it
191                  * got swapped before we could get the lock, and retry
192                  * if so.  If we locked the right context, then it
193                  * can't get swapped on us any more.
194                  */
195                 spin_lock_irqsave(&ctx->lock, *flags);
196                 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
197                         spin_unlock_irqrestore(&ctx->lock, *flags);
198                         goto retry;
199                 }
200
201                 if (!atomic_inc_not_zero(&ctx->refcount)) {
202                         spin_unlock_irqrestore(&ctx->lock, *flags);
203                         ctx = NULL;
204                 }
205         }
206         rcu_read_unlock();
207         return ctx;
208 }
209
210 /*
211  * Get the context for a task and increment its pin_count so it
212  * can't get swapped to another task.  This also increments its
213  * reference count so that the context can't get freed.
214  */
215 static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
216 {
217         struct perf_counter_context *ctx;
218         unsigned long flags;
219
220         ctx = perf_lock_task_context(task, &flags);
221         if (ctx) {
222                 ++ctx->pin_count;
223                 spin_unlock_irqrestore(&ctx->lock, flags);
224         }
225         return ctx;
226 }
227
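/*
 * Undo perf_pin_task_context(): drop the pin_count and the reference it took.
 */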
228 static void perf_unpin_context(struct perf_counter_context *ctx)
229 {
230         unsigned long flags;
231
232         spin_lock_irqsave(&ctx->lock, flags);
233         --ctx->pin_count;
234         spin_unlock_irqrestore(&ctx->lock, flags);
235         put_ctx(ctx);
236 }
237
238 /*
239  * Add a counter to the lists for its context.
240  * Must be called with ctx->mutex and ctx->lock held.
241  */
242 static void
243 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
244 {
245         struct perf_counter *group_leader = counter->group_leader;
246
247         /*
248          * Depending on whether it is a standalone or sibling counter,
249          * add it straight to the context's counter list, or to the group
250          * leader's sibling list:
251          */
252         if (group_leader == counter)
253                 list_add_tail(&counter->list_entry, &ctx->counter_list);
254         else {
255                 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
256                 group_leader->nr_siblings++;
257         }
258
259         list_add_rcu(&counter->event_entry, &ctx->event_list);
260         ctx->nr_counters++;
261         if (counter->attr.inherit_stat)
262                 ctx->nr_stat++;
263 }
264
265 /*
266  * Remove a counter from the lists for its context.
267  * Must be called with ctx->mutex and ctx->lock held.
268  */
269 static void
270 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
271 {
272         struct perf_counter *sibling, *tmp;
273
274         if (list_empty(&counter->list_entry))
275                 return;
276         ctx->nr_counters--;
277         if (counter->attr.inherit_stat)
278                 ctx->nr_stat--;
279
280         list_del_init(&counter->list_entry);
281         list_del_rcu(&counter->event_entry);
282
283         if (counter->group_leader != counter)
284                 counter->group_leader->nr_siblings--;
285
286         /*
287          * If this was a group counter with sibling counters then
288          * upgrade the siblings to singleton counters by adding them
289          * to the context list directly:
290          */
291         list_for_each_entry_safe(sibling, tmp,
292                                  &counter->sibling_list, list_entry) {
293
294                 list_move_tail(&sibling->list_entry, &ctx->counter_list);
295                 sibling->group_leader = sibling;
296         }
297 }
298
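/*
 * Take a single counter off the PMU: mark it INACTIVE, record the time it
 * stopped, call the pmu ->disable() method and update the cpu context's
 * active/exclusive bookkeeping.
 */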
299 static void
300 counter_sched_out(struct perf_counter *counter,
301                   struct perf_cpu_context *cpuctx,
302                   struct perf_counter_context *ctx)
303 {
304         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
305                 return;
306
307         counter->state = PERF_COUNTER_STATE_INACTIVE;
308         counter->tstamp_stopped = ctx->time;
309         counter->pmu->disable(counter);
310         counter->oncpu = -1;
311
312         if (!is_software_counter(counter))
313                 cpuctx->active_oncpu--;
314         ctx->nr_active--;
315         if (counter->attr.exclusive || !cpuctx->active_oncpu)
316                 cpuctx->exclusive = 0;
317 }
318
319 static void
320 group_sched_out(struct perf_counter *group_counter,
321                 struct perf_cpu_context *cpuctx,
322                 struct perf_counter_context *ctx)
323 {
324         struct perf_counter *counter;
325
326         if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
327                 return;
328
329         counter_sched_out(group_counter, cpuctx, ctx);
330
331         /*
332          * Schedule out siblings (if any):
333          */
334         list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
335                 counter_sched_out(counter, cpuctx, ctx);
336
337         if (group_counter->attr.exclusive)
338                 cpuctx->exclusive = 0;
339 }
340
341 /*
342  * Cross CPU call to remove a performance counter
343  *
344  * We disable the counter on the hardware level first. After that we
345  * remove it from the context list.
346  */
347 static void __perf_counter_remove_from_context(void *info)
348 {
349         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
350         struct perf_counter *counter = info;
351         struct perf_counter_context *ctx = counter->ctx;
352
353         /*
354          * If this is a task context, we need to check whether it is
355          * the current task context of this cpu. If not it has been
356          * scheduled out before the smp call arrived.
357          */
358         if (ctx->task && cpuctx->task_ctx != ctx)
359                 return;
360
361         spin_lock(&ctx->lock);
362         /*
363          * Protect the list operation against NMI by disabling the
364          * counters on a global level.
365          */
366         perf_disable();
367
368         counter_sched_out(counter, cpuctx, ctx);
369
370         list_del_counter(counter, ctx);
371
372         if (!ctx->task) {
373                 /*
374                  * Allow more per task counters with respect to the
375                  * reservation:
376                  */
377                 cpuctx->max_pertask =
378                         min(perf_max_counters - ctx->nr_counters,
379                             perf_max_counters - perf_reserved_percpu);
380         }
381
382         perf_enable();
383         spin_unlock(&ctx->lock);
384 }
385
386
387 /*
388  * Remove the counter from a task's (or a CPU's) list of counters.
389  *
390  * Must be called with ctx->mutex held.
391  *
392  * CPU counters are removed with a smp call. For task counters we only
393  * call when the task is on a CPU.
394  *
395  * If counter->ctx is a cloned context, callers must make sure that
396  * every task struct that counter->ctx->task could possibly point to
397  * remains valid.  This is OK when called from perf_release since
398  * that only calls us on the top-level context, which can't be a clone.
399  * When called from perf_counter_exit_task, it's OK because the
400  * context has been detached from its task.
401  */
402 static void perf_counter_remove_from_context(struct perf_counter *counter)
403 {
404         struct perf_counter_context *ctx = counter->ctx;
405         struct task_struct *task = ctx->task;
406
407         if (!task) {
408                 /*
409                  * Per cpu counters are removed via an smp call and
410          * the removal is always successful.
411                  */
412                 smp_call_function_single(counter->cpu,
413                                          __perf_counter_remove_from_context,
414                                          counter, 1);
415                 return;
416         }
417
418 retry:
419         task_oncpu_function_call(task, __perf_counter_remove_from_context,
420                                  counter);
421
422         spin_lock_irq(&ctx->lock);
423         /*
424          * If the context is active we need to retry the smp call.
425          */
426         if (ctx->nr_active && !list_empty(&counter->list_entry)) {
427                 spin_unlock_irq(&ctx->lock);
428                 goto retry;
429         }
430
431         /*
432          * The lock prevents this context from being scheduled in, so we
433          * can remove the counter safely if the call above did not
434          * succeed.
435          */
436         if (!list_empty(&counter->list_entry)) {
437                 list_del_counter(counter, ctx);
438         }
439         spin_unlock_irq(&ctx->lock);
440 }
441
442 static inline u64 perf_clock(void)
443 {
444         return cpu_clock(smp_processor_id());
445 }
446
447 /*
448  * Update the record of the current time in a context.
449  */
450 static void update_context_time(struct perf_counter_context *ctx)
451 {
452         u64 now = perf_clock();
453
454         ctx->time += now - ctx->timestamp;
455         ctx->timestamp = now;
456 }
457
458 /*
459  * Update the total_time_enabled and total_time_running fields for a counter.
460  */
461 static void update_counter_times(struct perf_counter *counter)
462 {
463         struct perf_counter_context *ctx = counter->ctx;
464         u64 run_end;
465
466         if (counter->state < PERF_COUNTER_STATE_INACTIVE)
467                 return;
468
469         counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
470
471         if (counter->state == PERF_COUNTER_STATE_INACTIVE)
472                 run_end = counter->tstamp_stopped;
473         else
474                 run_end = ctx->time;
475
476         counter->total_time_running = run_end - counter->tstamp_running;
477 }
478
479 /*
480  * Update total_time_enabled and total_time_running for all counters in a group.
481  */
482 static void update_group_times(struct perf_counter *leader)
483 {
484         struct perf_counter *counter;
485
486         update_counter_times(leader);
487         list_for_each_entry(counter, &leader->sibling_list, list_entry)
488                 update_counter_times(counter);
489 }
490
491 /*
492  * Cross CPU call to disable a performance counter
493  */
494 static void __perf_counter_disable(void *info)
495 {
496         struct perf_counter *counter = info;
497         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
498         struct perf_counter_context *ctx = counter->ctx;
499
500         /*
501          * If this is a per-task counter, we need to check whether this
502          * counter's task is the current task on this cpu.
503          */
504         if (ctx->task && cpuctx->task_ctx != ctx)
505                 return;
506
507         spin_lock(&ctx->lock);
508
509         /*
510          * If the counter is on, turn it off.
511          * If it is in error state, leave it in error state.
512          */
513         if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
514                 update_context_time(ctx);
515                 update_counter_times(counter);
516                 if (counter == counter->group_leader)
517                         group_sched_out(counter, cpuctx, ctx);
518                 else
519                         counter_sched_out(counter, cpuctx, ctx);
520                 counter->state = PERF_COUNTER_STATE_OFF;
521         }
522
523         spin_unlock(&ctx->lock);
524 }
525
526 /*
527  * Disable a counter.
528  *
529  * If counter->ctx is a cloned context, callers must make sure that
530  * every task struct that counter->ctx->task could possibly point to
531  * remains valid.  This condition is satisfied when called through
532  * perf_counter_for_each_child or perf_counter_for_each because they
533  * hold the top-level counter's child_mutex, so any descendant that
534  * goes to exit will block in sync_child_counter.
535  * When called from perf_pending_counter it's OK because counter->ctx
536  * is the current context on this CPU and preemption is disabled,
537  * hence we can't get into perf_counter_task_sched_out for this context.
538  */
539 static void perf_counter_disable(struct perf_counter *counter)
540 {
541         struct perf_counter_context *ctx = counter->ctx;
542         struct task_struct *task = ctx->task;
543
544         if (!task) {
545                 /*
546                  * Disable the counter on the cpu that it's on
547                  */
548                 smp_call_function_single(counter->cpu, __perf_counter_disable,
549                                          counter, 1);
550                 return;
551         }
552
553  retry:
554         task_oncpu_function_call(task, __perf_counter_disable, counter);
555
556         spin_lock_irq(&ctx->lock);
557         /*
558          * If the counter is still active, we need to retry the cross-call.
559          */
560         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
561                 spin_unlock_irq(&ctx->lock);
562                 goto retry;
563         }
564
565         /*
566          * Since we have the lock this context can't be scheduled
567          * in, so we can change the state safely.
568          */
569         if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
570                 update_counter_times(counter);
571                 counter->state = PERF_COUNTER_STATE_OFF;
572         }
573
574         spin_unlock_irq(&ctx->lock);
575 }
576
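/*
 * Put a single counter on the PMU: mark it ACTIVE, record which cpu it runs
 * on and call the pmu ->enable() method.  Returns -EAGAIN (and falls back to
 * INACTIVE) if the hardware refuses the counter.
 */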
577 static int
578 counter_sched_in(struct perf_counter *counter,
579                  struct perf_cpu_context *cpuctx,
580                  struct perf_counter_context *ctx,
581                  int cpu)
582 {
583         if (counter->state <= PERF_COUNTER_STATE_OFF)
584                 return 0;
585
586         counter->state = PERF_COUNTER_STATE_ACTIVE;
587         counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
588         /*
589          * The new state must be visible before we turn it on in the hardware:
590          */
591         smp_wmb();
592
593         if (counter->pmu->enable(counter)) {
594                 counter->state = PERF_COUNTER_STATE_INACTIVE;
595                 counter->oncpu = -1;
596                 return -EAGAIN;
597         }
598
599         counter->tstamp_running += ctx->time - counter->tstamp_stopped;
600
601         if (!is_software_counter(counter))
602                 cpuctx->active_oncpu++;
603         ctx->nr_active++;
604
605         if (counter->attr.exclusive)
606                 cpuctx->exclusive = 1;
607
608         return 0;
609 }
610
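/*
 * Schedule a counter group in as a single unit: the leader first, then all
 * of its siblings (unless the architecture's hw_perf_group_sched_in() hook
 * handles the group itself).  If any member fails, the partial group is
 * torn down again and -EAGAIN is returned, since groups must go on
 * completely or not at all.
 */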
611 static int
612 group_sched_in(struct perf_counter *group_counter,
613                struct perf_cpu_context *cpuctx,
614                struct perf_counter_context *ctx,
615                int cpu)
616 {
617         struct perf_counter *counter, *partial_group;
618         int ret;
619
620         if (group_counter->state == PERF_COUNTER_STATE_OFF)
621                 return 0;
622
623         ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
624         if (ret)
625                 return ret < 0 ? ret : 0;
626
627         if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
628                 return -EAGAIN;
629
630         /*
631          * Schedule in siblings as one group (if any):
632          */
633         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
634                 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
635                         partial_group = counter;
636                         goto group_error;
637                 }
638         }
639
640         return 0;
641
642 group_error:
643         /*
644          * Groups can be scheduled in as one unit only, so undo any
645          * partial group before returning:
646          */
647         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
648                 if (counter == partial_group)
649                         break;
650                 counter_sched_out(counter, cpuctx, ctx);
651         }
652         counter_sched_out(group_counter, cpuctx, ctx);
653
654         return -EAGAIN;
655 }
656
657 /*
658  * Return 1 for a group consisting entirely of software counters,
659  * 0 if the group contains any hardware counters.
660  */
661 static int is_software_only_group(struct perf_counter *leader)
662 {
663         struct perf_counter *counter;
664
665         if (!is_software_counter(leader))
666                 return 0;
667
668         list_for_each_entry(counter, &leader->sibling_list, list_entry)
669                 if (!is_software_counter(counter))
670                         return 0;
671
672         return 1;
673 }
674
675 /*
676  * Work out whether we can put this counter group on the CPU now.
677  */
678 static int group_can_go_on(struct perf_counter *counter,
679                            struct perf_cpu_context *cpuctx,
680                            int can_add_hw)
681 {
682         /*
683          * Groups consisting entirely of software counters can always go on.
684          */
685         if (is_software_only_group(counter))
686                 return 1;
687         /*
688          * If an exclusive group is already on, no other hardware
689          * counters can go on.
690          */
691         if (cpuctx->exclusive)
692                 return 0;
693         /*
694          * If this group is exclusive and there are already
695          * counters on the CPU, it can't go on.
696          */
697         if (counter->attr.exclusive && cpuctx->active_oncpu)
698                 return 0;
699         /*
700          * Otherwise, try to add it if all previous groups were able
701          * to go on.
702          */
703         return can_add_hw;
704 }
705
706 static void add_counter_to_ctx(struct perf_counter *counter,
707                                struct perf_counter_context *ctx)
708 {
709         list_add_counter(counter, ctx);
710         counter->tstamp_enabled = ctx->time;
711         counter->tstamp_running = ctx->time;
712         counter->tstamp_stopped = ctx->time;
713 }
714
715 /*
716  * Cross CPU call to install and enable a performance counter
717  *
718  * Must be called with ctx->mutex held
719  */
720 static void __perf_install_in_context(void *info)
721 {
722         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
723         struct perf_counter *counter = info;
724         struct perf_counter_context *ctx = counter->ctx;
725         struct perf_counter *leader = counter->group_leader;
726         int cpu = smp_processor_id();
727         int err;
728
729         /*
730          * If this is a task context, we need to check whether it is
731          * the current task context of this cpu. If not it has been
732          * scheduled out before the smp call arrived.
733          * Or possibly this is the right context but it isn't
734          * on this cpu because it had no counters.
735          */
736         if (ctx->task && cpuctx->task_ctx != ctx) {
737                 if (cpuctx->task_ctx || ctx->task != current)
738                         return;
739                 cpuctx->task_ctx = ctx;
740         }
741
742         spin_lock(&ctx->lock);
743         ctx->is_active = 1;
744         update_context_time(ctx);
745
746         /*
747          * Protect the list operation against NMI by disabling the
748          * counters on a global level. NOP for non NMI based counters.
749          */
750         perf_disable();
751
752         add_counter_to_ctx(counter, ctx);
753
754         /*
755          * Don't put the counter on if it is disabled or if
756          * it is in a group and the group isn't on.
757          */
758         if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
759             (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
760                 goto unlock;
761
762         /*
763          * An exclusive counter can't go on if there are already active
764          * hardware counters, and no hardware counter can go on if there
765          * is already an exclusive counter on.
766          */
767         if (!group_can_go_on(counter, cpuctx, 1))
768                 err = -EEXIST;
769         else
770                 err = counter_sched_in(counter, cpuctx, ctx, cpu);
771
772         if (err) {
773                 /*
774                  * This counter couldn't go on.  If it is in a group
775                  * then we have to pull the whole group off.
776                  * If the counter group is pinned then put it in error state.
777                  */
778                 if (leader != counter)
779                         group_sched_out(leader, cpuctx, ctx);
780                 if (leader->attr.pinned) {
781                         update_group_times(leader);
782                         leader->state = PERF_COUNTER_STATE_ERROR;
783                 }
784         }
785
786         if (!err && !ctx->task && cpuctx->max_pertask)
787                 cpuctx->max_pertask--;
788
789  unlock:
790         perf_enable();
791
792         spin_unlock(&ctx->lock);
793 }
794
795 /*
796  * Attach a performance counter to a context
797  *
798  * First we add the counter to the list with the hardware enable bit
799  * in counter->hw_config cleared.
800  *
801  * If the counter is attached to a task which is on a CPU we use a smp
802  * call to enable it in the task context. The task might have been
803  * scheduled away, but we check this in the smp call again.
804  *
805  * Must be called with ctx->mutex held.
806  */
807 static void
808 perf_install_in_context(struct perf_counter_context *ctx,
809                         struct perf_counter *counter,
810                         int cpu)
811 {
812         struct task_struct *task = ctx->task;
813
814         if (!task) {
815                 /*
816                  * Per cpu counters are installed via an smp call and
817          * the install is always successful.
818                  */
819                 smp_call_function_single(cpu, __perf_install_in_context,
820                                          counter, 1);
821                 return;
822         }
823
824 retry:
825         task_oncpu_function_call(task, __perf_install_in_context,
826                                  counter);
827
828         spin_lock_irq(&ctx->lock);
829         /*
830          * If the context is active we need to retry the smp call.
831          */
832         if (ctx->is_active && list_empty(&counter->list_entry)) {
833                 spin_unlock_irq(&ctx->lock);
834                 goto retry;
835         }
836
837         /*
838          * The lock prevents this context from being scheduled in, so we
839          * can add the counter safely if the call above did not
840          * succeed.
841          */
842         if (list_empty(&counter->list_entry))
843                 add_counter_to_ctx(counter, ctx);
844         spin_unlock_irq(&ctx->lock);
845 }
846
847 /*
848  * Cross CPU call to enable a performance counter
849  */
850 static void __perf_counter_enable(void *info)
851 {
852         struct perf_counter *counter = info;
853         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
854         struct perf_counter_context *ctx = counter->ctx;
855         struct perf_counter *leader = counter->group_leader;
856         int err;
857
858         /*
859          * If this is a per-task counter, we need to check whether this
860          * counter's task is the current task on this cpu.
861          */
862         if (ctx->task && cpuctx->task_ctx != ctx) {
863                 if (cpuctx->task_ctx || ctx->task != current)
864                         return;
865                 cpuctx->task_ctx = ctx;
866         }
867
868         spin_lock(&ctx->lock);
869         ctx->is_active = 1;
870         update_context_time(ctx);
871
872         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
873                 goto unlock;
874         counter->state = PERF_COUNTER_STATE_INACTIVE;
875         counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
876
877         /*
878          * If the counter is in a group and isn't the group leader,
879          * then don't put it on unless the group is on.
880          */
881         if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
882                 goto unlock;
883
884         if (!group_can_go_on(counter, cpuctx, 1)) {
885                 err = -EEXIST;
886         } else {
887                 perf_disable();
888                 if (counter == leader)
889                         err = group_sched_in(counter, cpuctx, ctx,
890                                              smp_processor_id());
891                 else
892                         err = counter_sched_in(counter, cpuctx, ctx,
893                                                smp_processor_id());
894                 perf_enable();
895         }
896
897         if (err) {
898                 /*
899                  * If this counter can't go on and it's part of a
900                  * group, then the whole group has to come off.
901                  */
902                 if (leader != counter)
903                         group_sched_out(leader, cpuctx, ctx);
904                 if (leader->attr.pinned) {
905                         update_group_times(leader);
906                         leader->state = PERF_COUNTER_STATE_ERROR;
907                 }
908         }
909
910  unlock:
911         spin_unlock(&ctx->lock);
912 }
913
914 /*
915  * Enable a counter.
916  *
917  * If counter->ctx is a cloned context, callers must make sure that
918  * every task struct that counter->ctx->task could possibly point to
919  * remains valid.  This condition is satisfied when called through
920  * perf_counter_for_each_child or perf_counter_for_each as described
921  * for perf_counter_disable.
922  */
923 static void perf_counter_enable(struct perf_counter *counter)
924 {
925         struct perf_counter_context *ctx = counter->ctx;
926         struct task_struct *task = ctx->task;
927
928         if (!task) {
929                 /*
930                  * Enable the counter on the cpu that it's on
931                  */
932                 smp_call_function_single(counter->cpu, __perf_counter_enable,
933                                          counter, 1);
934                 return;
935         }
936
937         spin_lock_irq(&ctx->lock);
938         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
939                 goto out;
940
941         /*
942          * If the counter is in error state, clear that first.
943          * That way, if we see the counter in error state below, we
944          * know that it has gone back into error state, as distinct
945          * from the task having been scheduled away before the
946          * cross-call arrived.
947          */
948         if (counter->state == PERF_COUNTER_STATE_ERROR)
949                 counter->state = PERF_COUNTER_STATE_OFF;
950
951  retry:
952         spin_unlock_irq(&ctx->lock);
953         task_oncpu_function_call(task, __perf_counter_enable, counter);
954
955         spin_lock_irq(&ctx->lock);
956
957         /*
958          * If the context is active and the counter is still off,
959          * we need to retry the cross-call.
960          */
961         if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
962                 goto retry;
963
964         /*
965          * Since we have the lock this context can't be scheduled
966          * in, so we can change the state safely.
967          */
968         if (counter->state == PERF_COUNTER_STATE_OFF) {
969                 counter->state = PERF_COUNTER_STATE_INACTIVE;
970                 counter->tstamp_enabled =
971                         ctx->time - counter->total_time_enabled;
972         }
973  out:
974         spin_unlock_irq(&ctx->lock);
975 }
976
977 static int perf_counter_refresh(struct perf_counter *counter, int refresh)
978 {
979         /*
980          * not supported on inherited counters
981          */
982         if (counter->attr.inherit)
983                 return -EINVAL;
984
985         atomic_add(refresh, &counter->event_limit);
986         perf_counter_enable(counter);
987
988         return 0;
989 }
990
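/*
 * Deschedule all counters in @ctx: mark the context inactive, update its
 * time, and schedule out every active group and counter under perf_disable().
 */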
991 void __perf_counter_sched_out(struct perf_counter_context *ctx,
992                               struct perf_cpu_context *cpuctx)
993 {
994         struct perf_counter *counter;
995
996         spin_lock(&ctx->lock);
997         ctx->is_active = 0;
998         if (likely(!ctx->nr_counters))
999                 goto out;
1000         update_context_time(ctx);
1001
1002         perf_disable();
1003         if (ctx->nr_active) {
1004                 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1005                         if (counter != counter->group_leader)
1006                                 counter_sched_out(counter, cpuctx, ctx);
1007                         else
1008                                 group_sched_out(counter, cpuctx, ctx);
1009                 }
1010         }
1011         perf_enable();
1012  out:
1013         spin_unlock(&ctx->lock);
1014 }
1015
1016 /*
1017  * Test whether two contexts are equivalent, i.e. whether they
1018  * have both been cloned from the same version of the same context
1019  * and they both have the same number of enabled counters.
1020  * If the number of enabled counters is the same, then the set
1021  * of enabled counters should be the same, because these are both
1022  * inherited contexts, therefore we can't access individual counters
1023  * in them directly with an fd; we can only enable/disable all
1024  * counters via prctl, or enable/disable all counters in a family
1025  * via ioctl, which will have the same effect on both contexts.
1026  */
1027 static int context_equiv(struct perf_counter_context *ctx1,
1028                          struct perf_counter_context *ctx2)
1029 {
1030         return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1031                 && ctx1->parent_gen == ctx2->parent_gen
1032                 && !ctx1->pin_count && !ctx2->pin_count;
1033 }
1034
1035 static void __perf_counter_read(void *counter);
1036
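/*
 * For inherit_stat counters, keep per-task statistics correct across an
 * optimized context switch: bring the count up to date, then swap the
 * accumulated count and times with the corresponding counter in the
 * context we are switching to.
 */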
1037 static void __perf_counter_sync_stat(struct perf_counter *counter,
1038                                      struct perf_counter *next_counter)
1039 {
1040         u64 value;
1041
1042         if (!counter->attr.inherit_stat)
1043                 return;
1044
1045         /*
1046          * Update the counter value.  We cannot use perf_counter_read()
1047          * because we're in the middle of a context switch and have IRQs
1048          * disabled, which upsets smp_call_function_single(); however,
1049          * we know the counter must be on the current CPU, so we don't
1050          * need it.
1051          */
1052         switch (counter->state) {
1053         case PERF_COUNTER_STATE_ACTIVE:
1054                 __perf_counter_read(counter);
1055                 break;
1056
1057         case PERF_COUNTER_STATE_INACTIVE:
1058                 update_counter_times(counter);
1059                 break;
1060
1061         default:
1062                 break;
1063         }
1064
1065         /*
1066          * In order to keep per-task stats reliable we need to flip the counter
1067          * values when we flip the contexts.
1068          */
1069         value = atomic64_read(&next_counter->count);
1070         value = atomic64_xchg(&counter->count, value);
1071         atomic64_set(&next_counter->count, value);
1072
1073         swap(counter->total_time_enabled, next_counter->total_time_enabled);
1074         swap(counter->total_time_running, next_counter->total_time_running);
1075
1076         /*
1077          * Since we swizzled the values, update the user visible data too.
1078          */
1079         perf_counter_update_userpage(counter);
1080         perf_counter_update_userpage(next_counter);
1081 }
1082
1083 #define list_next_entry(pos, member) \
1084         list_entry(pos->member.next, typeof(*pos), member)
1085
1086 static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1087                                    struct perf_counter_context *next_ctx)
1088 {
1089         struct perf_counter *counter, *next_counter;
1090
1091         if (!ctx->nr_stat)
1092                 return;
1093
1094         counter = list_first_entry(&ctx->event_list,
1095                                    struct perf_counter, event_entry);
1096
1097         next_counter = list_first_entry(&next_ctx->event_list,
1098                                         struct perf_counter, event_entry);
1099
1100         while (&counter->event_entry != &ctx->event_list &&
1101                &next_counter->event_entry != &next_ctx->event_list) {
1102
1103                 __perf_counter_sync_stat(counter, next_counter);
1104
1105                 counter = list_next_entry(counter, event_entry);
1106                 next_counter = list_next_entry(next_counter, event_entry);
1107         }
1108 }
1109
1110 /*
1111  * Called from scheduler to remove the counters of the current task,
1112  * with interrupts disabled.
1113  *
1114  * We stop each counter and update the counter value in counter->count.
1115  *
1116  * This does not protect us against NMI, but disable()
1117  * sets the disabled bit in the control field of counter _before_
1118  * accessing the counter control register. If a NMI hits, then it will
1119  * not restart the counter.
1120  */
1121 void perf_counter_task_sched_out(struct task_struct *task,
1122                                  struct task_struct *next, int cpu)
1123 {
1124         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1125         struct perf_counter_context *ctx = task->perf_counter_ctxp;
1126         struct perf_counter_context *next_ctx;
1127         struct perf_counter_context *parent;
1128         struct pt_regs *regs;
1129         int do_switch = 1;
1130
1131         regs = task_pt_regs(task);
1132         perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1133
1134         if (likely(!ctx || !cpuctx->task_ctx))
1135                 return;
1136
1137         update_context_time(ctx);
1138
1139         rcu_read_lock();
1140         parent = rcu_dereference(ctx->parent_ctx);
1141         next_ctx = next->perf_counter_ctxp;
1142         if (parent && next_ctx &&
1143             rcu_dereference(next_ctx->parent_ctx) == parent) {
1144                 /*
1145                  * Looks like the two contexts are clones, so we might be
1146                  * able to optimize the context switch.  We lock both
1147                  * contexts and check that they are clones under the
1148                  * lock (including re-checking that neither has been
1149                  * uncloned in the meantime).  It doesn't matter which
1150                  * order we take the locks because no other cpu could
1151                  * be trying to lock both of these tasks.
1152                  */
1153                 spin_lock(&ctx->lock);
1154                 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1155                 if (context_equiv(ctx, next_ctx)) {
1156                         /*
1157                          * XXX do we need a memory barrier of sorts
1158                          * wrt to rcu_dereference() of perf_counter_ctxp
1159                          */
1160                         task->perf_counter_ctxp = next_ctx;
1161                         next->perf_counter_ctxp = ctx;
1162                         ctx->task = next;
1163                         next_ctx->task = task;
1164                         do_switch = 0;
1165
1166                         perf_counter_sync_stat(ctx, next_ctx);
1167                 }
1168                 spin_unlock(&next_ctx->lock);
1169                 spin_unlock(&ctx->lock);
1170         }
1171         rcu_read_unlock();
1172
1173         if (do_switch) {
1174                 __perf_counter_sched_out(ctx, cpuctx);
1175                 cpuctx->task_ctx = NULL;
1176         }
1177 }
1178
1179 /*
1180  * Called with IRQs disabled
1181  */
1182 static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1183 {
1184         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1185
1186         if (!cpuctx->task_ctx)
1187                 return;
1188
1189         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1190                 return;
1191
1192         __perf_counter_sched_out(ctx, cpuctx);
1193         cpuctx->task_ctx = NULL;
1194 }
1195
1196 /*
1197  * Called with IRQs disabled
1198  */
1199 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1200 {
1201         __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1202 }
1203
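/*
 * Schedule a context's counters onto the PMU: pinned groups go on first and
 * are moved to ERROR state if they cannot, then the remaining groups are
 * added for as long as there is room on the hardware.
 */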
1204 static void
1205 __perf_counter_sched_in(struct perf_counter_context *ctx,
1206                         struct perf_cpu_context *cpuctx, int cpu)
1207 {
1208         struct perf_counter *counter;
1209         int can_add_hw = 1;
1210
1211         spin_lock(&ctx->lock);
1212         ctx->is_active = 1;
1213         if (likely(!ctx->nr_counters))
1214                 goto out;
1215
1216         ctx->timestamp = perf_clock();
1217
1218         perf_disable();
1219
1220         /*
1221          * First go through the list and put on any pinned groups
1222          * in order to give them the best chance of going on.
1223          */
1224         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1225                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1226                     !counter->attr.pinned)
1227                         continue;
1228                 if (counter->cpu != -1 && counter->cpu != cpu)
1229                         continue;
1230
1231                 if (counter != counter->group_leader)
1232                         counter_sched_in(counter, cpuctx, ctx, cpu);
1233                 else {
1234                         if (group_can_go_on(counter, cpuctx, 1))
1235                                 group_sched_in(counter, cpuctx, ctx, cpu);
1236                 }
1237
1238                 /*
1239                  * If this pinned group hasn't been scheduled,
1240                  * put it in error state.
1241                  */
1242                 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1243                         update_group_times(counter);
1244                         counter->state = PERF_COUNTER_STATE_ERROR;
1245                 }
1246         }
1247
1248         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1249                 /*
1250                  * Ignore counters in OFF or ERROR state, and
1251                  * ignore pinned counters since we did them already.
1252                  */
1253                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1254                     counter->attr.pinned)
1255                         continue;
1256
1257                 /*
1258                  * Listen to the 'cpu' scheduling filter constraint
1259                  * of counters:
1260                  */
1261                 if (counter->cpu != -1 && counter->cpu != cpu)
1262                         continue;
1263
1264                 if (counter != counter->group_leader) {
1265                         if (counter_sched_in(counter, cpuctx, ctx, cpu))
1266                                 can_add_hw = 0;
1267                 } else {
1268                         if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1269                                 if (group_sched_in(counter, cpuctx, ctx, cpu))
1270                                         can_add_hw = 0;
1271                         }
1272                 }
1273         }
1274         perf_enable();
1275  out:
1276         spin_unlock(&ctx->lock);
1277 }
1278
1279 /*
1280  * Called from scheduler to add the counters of the current task
1281  * with interrupts disabled.
1282  *
1283  * We restore the counter value and then enable it.
1284  *
1285  * This does not protect us against NMI, but enable()
1286  * sets the enabled bit in the control field of counter _before_
1287  * accessing the counter control register. If a NMI hits, then it will
1288  * keep the counter running.
1289  */
1290 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1291 {
1292         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1293         struct perf_counter_context *ctx = task->perf_counter_ctxp;
1294
1295         if (likely(!ctx))
1296                 return;
1297         if (cpuctx->task_ctx == ctx)
1298                 return;
1299         __perf_counter_sched_in(ctx, cpuctx, cpu);
1300         cpuctx->task_ctx = ctx;
1301 }
1302
1303 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1304 {
1305         struct perf_counter_context *ctx = &cpuctx->ctx;
1306
1307         __perf_counter_sched_in(ctx, cpuctx, cpu);
1308 }
1309
1310 #define MAX_INTERRUPTS (~0ULL)
1311
1312 static void perf_log_throttle(struct perf_counter *counter, int enable);
1313
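/*
 * Re-estimate the sample period needed to hit attr.sample_freq given the
 * number of events seen in the last interval, and move hwc->sample_period
 * an eighth of the way towards it (a simple low-pass filter).
 */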
1314 static void perf_adjust_period(struct perf_counter *counter, u64 events)
1315 {
1316         struct hw_perf_counter *hwc = &counter->hw;
1317         u64 period, sample_period;
1318         s64 delta;
1319
1320         events *= hwc->sample_period;
1321         period = div64_u64(events, counter->attr.sample_freq);
1322
1323         delta = (s64)(period - hwc->sample_period);
1324         delta = (delta + 7) / 8; /* low pass filter */
1325
1326         sample_period = hwc->sample_period + delta;
1327
1328         if (!sample_period)
1329                 sample_period = 1;
1330
1331         hwc->sample_period = sample_period;
1332 }
1333
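/*
 * Per-tick housekeeping for a context: unthrottle counters that hit the
 * interrupt limit, and for freq-based counters recompute the sample period
 * from the number of interrupts seen since the last adjustment.
 */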
1334 static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1335 {
1336         struct perf_counter *counter;
1337         struct hw_perf_counter *hwc;
1338         u64 interrupts, freq;
1339
1340         spin_lock(&ctx->lock);
1341         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1342                 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1343                         continue;
1344
1345                 hwc = &counter->hw;
1346
1347                 interrupts = hwc->interrupts;
1348                 hwc->interrupts = 0;
1349
1350                 /*
1351                  * unthrottle counters on the tick
1352                  */
1353                 if (interrupts == MAX_INTERRUPTS) {
1354                         perf_log_throttle(counter, 1);
1355                         counter->pmu->unthrottle(counter);
1356                         interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1357                 }
1358
1359                 if (!counter->attr.freq || !counter->attr.sample_freq)
1360                         continue;
1361
1362                 /*
1363                  * if the specified freq < HZ then we need to skip ticks
1364                  */
1365                 if (counter->attr.sample_freq < HZ) {
1366                         freq = counter->attr.sample_freq;
1367
1368                         hwc->freq_count += freq;
1369                         hwc->freq_interrupts += interrupts;
1370
1371                         if (hwc->freq_count < HZ)
1372                                 continue;
1373
1374                         interrupts = hwc->freq_interrupts;
1375                         hwc->freq_interrupts = 0;
1376                         hwc->freq_count -= HZ;
1377                 } else
1378                         freq = HZ;
1379
1380                 perf_adjust_period(counter, freq * interrupts);
1381
1382                 /*
1383                  * In order to avoid being stalled by an (accidental) huge
1384                  * sample period, force reset the sample period if we didn't
1385                  * get any events in this freq period.
1386                  */
1387                 if (!interrupts) {
1388                         perf_disable();
1389                         counter->pmu->disable(counter);
1390                         atomic64_set(&hwc->period_left, 0);
1391                         counter->pmu->enable(counter);
1392                         perf_enable();
1393                 }
1394         }
1395         spin_unlock(&ctx->lock);
1396 }
1397
1398 /*
1399  * Round-robin a context's counters:
1400  */
1401 static void rotate_ctx(struct perf_counter_context *ctx)
1402 {
1403         struct perf_counter *counter;
1404
1405         if (!ctx->nr_counters)
1406                 return;
1407
1408         spin_lock(&ctx->lock);
1409         /*
1410          * Rotate the first entry last (works just fine for group counters too):
1411          */
1412         perf_disable();
1413         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1414                 list_move_tail(&counter->list_entry, &ctx->counter_list);
1415                 break;
1416         }
1417         perf_enable();
1418
1419         spin_unlock(&ctx->lock);
1420 }
1421
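/*
 * Scheduler tick hook: adjust sampling frequencies, then schedule the cpu
 * and task contexts out, rotate their counter lists and schedule them back
 * in, so that counters which could not all fit on the PMU get their turn.
 */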
1422 void perf_counter_task_tick(struct task_struct *curr, int cpu)
1423 {
1424         struct perf_cpu_context *cpuctx;
1425         struct perf_counter_context *ctx;
1426
1427         if (!atomic_read(&nr_counters))
1428                 return;
1429
1430         cpuctx = &per_cpu(perf_cpu_context, cpu);
1431         ctx = curr->perf_counter_ctxp;
1432
1433         perf_ctx_adjust_freq(&cpuctx->ctx);
1434         if (ctx)
1435                 perf_ctx_adjust_freq(ctx);
1436
1437         perf_counter_cpu_sched_out(cpuctx);
1438         if (ctx)
1439                 __perf_counter_task_sched_out(ctx);
1440
1441         rotate_ctx(&cpuctx->ctx);
1442         if (ctx)
1443                 rotate_ctx(ctx);
1444
1445         perf_counter_cpu_sched_in(cpuctx, cpu);
1446         if (ctx)
1447                 perf_counter_task_sched_in(curr, cpu);
1448 }
1449
1450 /*
1451  * Enable all of a task's counters that have been marked enable-on-exec.
1452  * This expects task == current.
1453  */
1454 static void perf_counter_enable_on_exec(struct task_struct *task)
1455 {
1456         struct perf_counter_context *ctx;
1457         struct perf_counter *counter;
1458         unsigned long flags;
1459         int enabled = 0;
1460
1461         local_irq_save(flags);
1462         ctx = task->perf_counter_ctxp;
1463         if (!ctx || !ctx->nr_counters)
1464                 goto out;
1465
1466         __perf_counter_task_sched_out(ctx);
1467
1468         spin_lock(&ctx->lock);
1469
1470         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1471                 if (!counter->attr.enable_on_exec)
1472                         continue;
1473                 counter->attr.enable_on_exec = 0;
1474                 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1475                         continue;
1476                 counter->state = PERF_COUNTER_STATE_INACTIVE;
1477                 counter->tstamp_enabled =
1478                         ctx->time - counter->total_time_enabled;
1479                 enabled = 1;
1480         }
1481
1482         /*
1483          * Unclone this context if we enabled any counter.
1484          */
1485         if (enabled)
1486                 unclone_ctx(ctx);
1487
1488         spin_unlock(&ctx->lock);
1489
1490         perf_counter_task_sched_in(task, smp_processor_id());
1491  out:
1492         local_irq_restore(flags);
1493 }
1494
1495 /*
1496  * Cross CPU call to read the hardware counter
1497  */
1498 static void __perf_counter_read(void *info)
1499 {
1500         struct perf_counter *counter = info;
1501         struct perf_counter_context *ctx = counter->ctx;
1502         unsigned long flags;
1503
1504         local_irq_save(flags);
1505         if (ctx->is_active)
1506                 update_context_time(ctx);
1507         counter->pmu->read(counter);
1508         update_counter_times(counter);
1509         local_irq_restore(flags);
1510 }
1511
1512 static u64 perf_counter_read(struct perf_counter *counter)
1513 {
1514         /*
1515          * If counter is enabled and currently active on a CPU, update the
1516          * value in the counter structure:
1517          */
1518         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1519                 smp_call_function_single(counter->oncpu,
1520                                          __perf_counter_read, counter, 1);
1521         } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1522                 update_counter_times(counter);
1523         }
1524
1525         return atomic64_read(&counter->count);
1526 }
1527
1528 /*
1529  * Initialize the perf_counter context in a task_struct:
1530  */
1531 static void
1532 __perf_counter_init_context(struct perf_counter_context *ctx,
1533                             struct task_struct *task)
1534 {
1535         memset(ctx, 0, sizeof(*ctx));
1536         spin_lock_init(&ctx->lock);
1537         mutex_init(&ctx->mutex);
1538         INIT_LIST_HEAD(&ctx->counter_list);
1539         INIT_LIST_HEAD(&ctx->event_list);
1540         atomic_set(&ctx->refcount, 1);
1541         ctx->task = task;
1542 }
1543
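/*
 * Look up the context a new counter should be attached to: the per-cpu
 * context if cpu != -1, otherwise the context of the target task (current
 * for pid == 0), allocating and installing one if the task has none yet.
 * Returns with a reference on the context held.
 */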
1544 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1545 {
1546         struct perf_counter_context *ctx;
1547         struct perf_cpu_context *cpuctx;
1548         struct task_struct *task;
1549         unsigned long flags;
1550         int err;
1551
1552         /*
1553          * If cpu is not a wildcard then this is a percpu counter:
1554          */
1555         if (cpu != -1) {
1556                 /* Must be root to operate on a CPU counter: */
1557                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1558                         return ERR_PTR(-EACCES);
1559
1560                 if (cpu < 0 || cpu > num_possible_cpus())
1561                         return ERR_PTR(-EINVAL);
1562
1563                 /*
1564                  * We could be clever and allow attaching a counter to an
1565                  * offline CPU and activate it when the CPU comes up, but
1566                  * that's for later.
1567                  */
1568                 if (!cpu_isset(cpu, cpu_online_map))
1569                         return ERR_PTR(-ENODEV);
1570
1571                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1572                 ctx = &cpuctx->ctx;
1573                 get_ctx(ctx);
1574
1575                 return ctx;
1576         }
1577
1578         rcu_read_lock();
1579         if (!pid)
1580                 task = current;
1581         else
1582                 task = find_task_by_vpid(pid);
1583         if (task)
1584                 get_task_struct(task);
1585         rcu_read_unlock();
1586
1587         if (!task)
1588                 return ERR_PTR(-ESRCH);
1589
1590         /*
1591          * Can't attach counters to a dying task.
1592          */
1593         err = -ESRCH;
1594         if (task->flags & PF_EXITING)
1595                 goto errout;
1596
1597         /* Reuse ptrace permission checks for now. */
1598         err = -EACCES;
1599         if (!ptrace_may_access(task, PTRACE_MODE_READ))
1600                 goto errout;
1601
1602  retry:
1603         ctx = perf_lock_task_context(task, &flags);
1604         if (ctx) {
1605                 unclone_ctx(ctx);
1606                 spin_unlock_irqrestore(&ctx->lock, flags);
1607         }
1608
1609         if (!ctx) {
1610                 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1611                 err = -ENOMEM;
1612                 if (!ctx)
1613                         goto errout;
1614                 __perf_counter_init_context(ctx, task);
1615                 get_ctx(ctx);
1616                 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1617                         /*
1618                          * We raced with some other task; use
1619                          * the context they set.
1620                          */
1621                         kfree(ctx);
1622                         goto retry;
1623                 }
1624                 get_task_struct(task);
1625         }
1626
1627         put_task_struct(task);
1628         return ctx;
1629
1630  errout:
1631         put_task_struct(task);
1632         return ERR_PTR(err);
1633 }
1634
1635 static void free_counter_rcu(struct rcu_head *head)
1636 {
1637         struct perf_counter *counter;
1638
1639         counter = container_of(head, struct perf_counter, rcu_head);
1640         if (counter->ns)
1641                 put_pid_ns(counter->ns);
1642         kfree(counter);
1643 }
1644
1645 static void perf_pending_sync(struct perf_counter *counter);
1646
1647 static void free_counter(struct perf_counter *counter)
1648 {
1649         perf_pending_sync(counter);
1650
1651         if (!counter->parent) {
1652                 atomic_dec(&nr_counters);
1653                 if (counter->attr.mmap)
1654                         atomic_dec(&nr_mmap_counters);
1655                 if (counter->attr.comm)
1656                         atomic_dec(&nr_comm_counters);
1657         }
1658
1659         if (counter->destroy)
1660                 counter->destroy(counter);
1661
1662         put_ctx(counter->ctx);
1663         call_rcu(&counter->rcu_head, free_counter_rcu);
1664 }
1665
1666 /*
1667  * Called when the last reference to the file is gone.
1668  */
1669 static int perf_release(struct inode *inode, struct file *file)
1670 {
1671         struct perf_counter *counter = file->private_data;
1672         struct perf_counter_context *ctx = counter->ctx;
1673
1674         file->private_data = NULL;
1675
1676         WARN_ON_ONCE(ctx->parent_ctx);
1677         mutex_lock(&ctx->mutex);
1678         perf_counter_remove_from_context(counter);
1679         mutex_unlock(&ctx->mutex);
1680
1681         mutex_lock(&counter->owner->perf_counter_mutex);
1682         list_del_init(&counter->owner_entry);
1683         mutex_unlock(&counter->owner->perf_counter_mutex);
1684         put_task_struct(counter->owner);
1685
1686         free_counter(counter);
1687
1688         return 0;
1689 }
1690
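/*
 * Sum the value of a counter and all of its inherited child counters.
 * The caller holds counter->child_mutex, which keeps the child list
 * stable while we walk it.
 */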
1691 static u64 perf_counter_read_tree(struct perf_counter *counter)
1692 {
1693         struct perf_counter *child;
1694         u64 total = 0;
1695
1696         total += perf_counter_read(counter);
1697         list_for_each_entry(child, &counter->child_list, child_list)
1698                 total += perf_counter_read(child);
1699
1700         return total;
1701 }
1702
1703 /*
1704  * Read the performance counter - simple non blocking version for now
1705  */
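/*
 * The record layout follows attr.read_format:
 *
 *	u64 value;
 *	{ u64 time_enabled; }	&& PERF_FORMAT_TOTAL_TIME_ENABLED
 *	{ u64 time_running; }	&& PERF_FORMAT_TOTAL_TIME_RUNNING
 *	{ u64 id;           }	&& PERF_FORMAT_ID
 */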
1706 static ssize_t
1707 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1708 {
1709         u64 values[4];
1710         int n;
1711
1712         /*
1713          * Return end-of-file for a read on a counter that is in
1714          * error state (i.e. because it was pinned but it couldn't be
1715          * scheduled on to the CPU at some point).
1716          */
1717         if (counter->state == PERF_COUNTER_STATE_ERROR)
1718                 return 0;
1719
1720         WARN_ON_ONCE(counter->ctx->parent_ctx);
1721         mutex_lock(&counter->child_mutex);
1722         values[0] = perf_counter_read_tree(counter);
1723         n = 1;
1724         if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1725                 values[n++] = counter->total_time_enabled +
1726                         atomic64_read(&counter->child_total_time_enabled);
1727         if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1728                 values[n++] = counter->total_time_running +
1729                         atomic64_read(&counter->child_total_time_running);
1730         if (counter->attr.read_format & PERF_FORMAT_ID)
1731                 values[n++] = primary_counter_id(counter);
1732         mutex_unlock(&counter->child_mutex);
1733
1734         if (count < n * sizeof(u64))
1735                 return -EINVAL;
1736         count = n * sizeof(u64);
1737
1738         if (copy_to_user(buf, values, count))
1739                 return -EFAULT;
1740
1741         return count;
1742 }
1743
1744 static ssize_t
1745 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1746 {
1747         struct perf_counter *counter = file->private_data;
1748
1749         return perf_read_hw(counter, buf, count);
1750 }
1751
1752 static unsigned int perf_poll(struct file *file, poll_table *wait)
1753 {
1754         struct perf_counter *counter = file->private_data;
1755         struct perf_mmap_data *data;
1756         unsigned int events = POLLHUP;  /* no buffer mapped */
1757
1758         rcu_read_lock();
1759         data = rcu_dereference(counter->data);
1760         if (data)
1761                 events = atomic_xchg(&data->poll, 0);
1762         rcu_read_unlock();
1763
1764         poll_wait(file, &counter->waitq, wait);
1765
1766         return events;
1767 }
1768
1769 static void perf_counter_reset(struct perf_counter *counter)
1770 {
1771         (void)perf_counter_read(counter);
1772         atomic64_set(&counter->count, 0);
1773         perf_counter_update_userpage(counter);
1774 }
1775
1776 /*
1777  * Holding the top-level counter's child_mutex means that any
1778  * descendant process that has inherited this counter will block
1779  * in sync_child_counter if it goes to exit, thus satisfying the
1780  * task existence requirements of perf_counter_enable/disable.
1781  */
1782 static void perf_counter_for_each_child(struct perf_counter *counter,
1783                                         void (*func)(struct perf_counter *))
1784 {
1785         struct perf_counter *child;
1786
1787         WARN_ON_ONCE(counter->ctx->parent_ctx);
1788         mutex_lock(&counter->child_mutex);
1789         func(counter);
1790         list_for_each_entry(child, &counter->child_list, child_list)
1791                 func(child);
1792         mutex_unlock(&counter->child_mutex);
1793 }
1794
1795 static void perf_counter_for_each(struct perf_counter *counter,
1796                                   void (*func)(struct perf_counter *))
1797 {
1798         struct perf_counter_context *ctx = counter->ctx;
1799         struct perf_counter *sibling;
1800
1801         WARN_ON_ONCE(ctx->parent_ctx);
1802         mutex_lock(&ctx->mutex);
1803         counter = counter->group_leader;
1804
1805         perf_counter_for_each_child(counter, func);
1806         /* the leader was handled above; now handle each sibling */
1807         list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1808                 perf_counter_for_each_child(sibling, func);
1809         mutex_unlock(&ctx->mutex);
1810 }
1811
1812 static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1813 {
1814         struct perf_counter_context *ctx = counter->ctx;
1815         unsigned long size;
1816         int ret = 0;
1817         u64 value;
1818
1819         if (!counter->attr.sample_period)
1820                 return -EINVAL;
1821
1822         size = copy_from_user(&value, arg, sizeof(value));
1823         if (size)                       /* returns bytes *not* copied */
1824                 return -EFAULT;
1825
1826         if (!value)
1827                 return -EINVAL;
1828
1829         spin_lock_irq(&ctx->lock);
1830         if (counter->attr.freq) {
1831                 if (value > sysctl_perf_counter_sample_rate) {
1832                         ret = -EINVAL;
1833                         goto unlock;
1834                 }
1835
1836                 counter->attr.sample_freq = value;
1837         } else {
1838                 counter->attr.sample_period = value;
1839                 counter->hw.sample_period = value;
1840         }
1841 unlock:
1842         spin_unlock_irq(&ctx->lock);
1843
1844         return ret;
1845 }
1846
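/*
 * Userspace sketch (not part of this file): given a counter fd returned
 * by sys_perf_counter_open(), an entire group can be reset and re-armed
 * by targeting the group leader with PERF_IOC_FLAG_GROUP set in arg:
 *
 *	ioctl(fd, PERF_COUNTER_IOC_RESET,  PERF_IOC_FLAG_GROUP);
 *	ioctl(fd, PERF_COUNTER_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
 */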
1847 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1848 {
1849         struct perf_counter *counter = file->private_data;
1850         void (*func)(struct perf_counter *);
1851         u32 flags = arg;
1852
1853         switch (cmd) {
1854         case PERF_COUNTER_IOC_ENABLE:
1855                 func = perf_counter_enable;
1856                 break;
1857         case PERF_COUNTER_IOC_DISABLE:
1858                 func = perf_counter_disable;
1859                 break;
1860         case PERF_COUNTER_IOC_RESET:
1861                 func = perf_counter_reset;
1862                 break;
1863
1864         case PERF_COUNTER_IOC_REFRESH:
1865                 return perf_counter_refresh(counter, arg);
1866
1867         case PERF_COUNTER_IOC_PERIOD:
1868                 return perf_counter_period(counter, (u64 __user *)arg);
1869
1870         default:
1871                 return -ENOTTY;
1872         }
1873
1874         if (flags & PERF_IOC_FLAG_GROUP)
1875                 perf_counter_for_each(counter, func);
1876         else
1877                 perf_counter_for_each_child(counter, func);
1878
1879         return 0;
1880 }
1881
1882 int perf_counter_task_enable(void)
1883 {
1884         struct perf_counter *counter;
1885
1886         mutex_lock(&current->perf_counter_mutex);
1887         list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
1888                 perf_counter_for_each_child(counter, perf_counter_enable);
1889         mutex_unlock(&current->perf_counter_mutex);
1890
1891         return 0;
1892 }
1893
1894 int perf_counter_task_disable(void)
1895 {
1896         struct perf_counter *counter;
1897
1898         mutex_lock(&current->perf_counter_mutex);
1899         list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
1900                 perf_counter_for_each_child(counter, perf_counter_disable);
1901         mutex_unlock(&current->perf_counter_mutex);
1902
1903         return 0;
1904 }
1905
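/*
 * Hardware counter index to advertise in the mmap()ed user page:
 * 1-based while the counter is active, 0 when direct user-space reads
 * of the hardware counter are not possible.
 */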
1906 static int perf_counter_index(struct perf_counter *counter)
1907 {
1908         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1909                 return 0;
1910
1911         return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
1912 }
1913
1914 /*
1915  * Callers need to ensure there can be no nesting of this function, otherwise
1916  * the seqlock logic goes bad. We can not serialize this because the arch
1917  * code calls this from NMI context.
1918  */
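/*
 * User-space readers are expected to treat ->lock as a seqcount,
 * roughly (sketch, pc being a pointer to the mapped page):
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		... read pc->index, pc->offset, pc->time_enabled, ...
 *		barrier();
 *	} while (pc->lock != seq);
 */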
1919 void perf_counter_update_userpage(struct perf_counter *counter)
1920 {
1921         struct perf_counter_mmap_page *userpg;
1922         struct perf_mmap_data *data;
1923
1924         rcu_read_lock();
1925         data = rcu_dereference(counter->data);
1926         if (!data)
1927                 goto unlock;
1928
1929         userpg = data->user_page;
1930
1931         /*
1932          * Disable preemption so as to not let the corresponding user-space
1933          * spin too long if we get preempted.
1934          */
1935         preempt_disable();
1936         ++userpg->lock;
1937         barrier();
1938         userpg->index = perf_counter_index(counter);
1939         userpg->offset = atomic64_read(&counter->count);
1940         if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1941                 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1942
1943         userpg->time_enabled = counter->total_time_enabled +
1944                         atomic64_read(&counter->child_total_time_enabled);
1945
1946         userpg->time_running = counter->total_time_running +
1947                         atomic64_read(&counter->child_total_time_running);
1948
1949         barrier();
1950         ++userpg->lock;
1951         preempt_enable();
1952 unlock:
1953         rcu_read_unlock();
1954 }
1955
1956 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1957 {
1958         struct perf_counter *counter = vma->vm_file->private_data;
1959         struct perf_mmap_data *data;
1960         int ret = VM_FAULT_SIGBUS;
1961
1962         if (vmf->flags & FAULT_FLAG_MKWRITE) {
1963                 if (vmf->pgoff == 0)
1964                         ret = 0;
1965                 return ret;
1966         }
1967
1968         rcu_read_lock();
1969         data = rcu_dereference(counter->data);
1970         if (!data)
1971                 goto unlock;
1972
1973         if (vmf->pgoff == 0) {
1974                 vmf->page = virt_to_page(data->user_page);
1975         } else {
1976                 int nr = vmf->pgoff - 1;
1977
1978                 if ((unsigned)nr >= data->nr_pages)
1979                         goto unlock;
1980
1981                 if (vmf->flags & FAULT_FLAG_WRITE)
1982                         goto unlock;
1983
1984                 vmf->page = virt_to_page(data->data_pages[nr]);
1985         }
1986
1987         get_page(vmf->page);
1988         vmf->page->mapping = vma->vm_file->f_mapping;
1989         vmf->page->index   = vmf->pgoff;
1990
1991         ret = 0;
1992 unlock:
1993         rcu_read_unlock();
1994
1995         return ret;
1996 }
1997
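/*
 * Allocate the buffer backing the mmap() area: one zeroed control page
 * (the perf_counter_mmap_page) plus nr_pages zeroed data pages, each
 * tracked individually so perf_mmap_fault() can map them on demand.
 */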
1998 static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1999 {
2000         struct perf_mmap_data *data;
2001         unsigned long size;
2002         int i;
2003
2004         WARN_ON(atomic_read(&counter->mmap_count));
2005
2006         size = sizeof(struct perf_mmap_data);
2007         size += nr_pages * sizeof(void *);
2008
2009         data = kzalloc(size, GFP_KERNEL);
2010         if (!data)
2011                 goto fail;
2012
2013         data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2014         if (!data->user_page)
2015                 goto fail_user_page;
2016
2017         for (i = 0; i < nr_pages; i++) {
2018                 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2019                 if (!data->data_pages[i])
2020                         goto fail_data_pages;
2021         }
2022
2023         data->nr_pages = nr_pages;
2024         atomic_set(&data->lock, -1);
2025
2026         rcu_assign_pointer(counter->data, data);
2027
2028         return 0;
2029
2030 fail_data_pages:
2031         for (i--; i >= 0; i--)
2032                 free_page((unsigned long)data->data_pages[i]);
2033
2034         free_page((unsigned long)data->user_page);
2035
2036 fail_user_page:
2037         kfree(data);
2038
2039 fail:
2040         return -ENOMEM;
2041 }
2042
2043 static void perf_mmap_free_page(unsigned long addr)
2044 {
2045         struct page *page = virt_to_page((void *)addr);
2046
2047         page->mapping = NULL;
2048         __free_page(page);
2049 }
2050
2051 static void __perf_mmap_data_free(struct rcu_head *rcu_head)
2052 {
2053         struct perf_mmap_data *data;
2054         int i;
2055
2056         data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2057
2058         perf_mmap_free_page((unsigned long)data->user_page);
2059         for (i = 0; i < data->nr_pages; i++)
2060                 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2061
2062         kfree(data);
2063 }
2064
2065 static void perf_mmap_data_free(struct perf_counter *counter)
2066 {
2067         struct perf_mmap_data *data = counter->data;
2068
2069         WARN_ON(atomic_read(&counter->mmap_count));
2070
2071         rcu_assign_pointer(counter->data, NULL);
2072         call_rcu(&data->rcu_head, __perf_mmap_data_free);
2073 }
2074
2075 static void perf_mmap_open(struct vm_area_struct *vma)
2076 {
2077         struct perf_counter *counter = vma->vm_file->private_data;
2078
2079         atomic_inc(&counter->mmap_count);
2080 }
2081
2082 static void perf_mmap_close(struct vm_area_struct *vma)
2083 {
2084         struct perf_counter *counter = vma->vm_file->private_data;
2085
2086         WARN_ON_ONCE(counter->ctx->parent_ctx);
2087         if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
2088                 struct user_struct *user = current_user();
2089
2090                 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
2091                 vma->vm_mm->locked_vm -= counter->data->nr_locked;
2092                 perf_mmap_data_free(counter);
2093                 mutex_unlock(&counter->mmap_mutex);
2094         }
2095 }
2096
2097 static struct vm_operations_struct perf_mmap_vmops = {
2098         .open           = perf_mmap_open,
2099         .close          = perf_mmap_close,
2100         .fault          = perf_mmap_fault,
2101         .page_mkwrite   = perf_mmap_fault,
2102 };
2103
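/*
 * Userspace sketch (not part of this file): the mapping must be shared,
 * start at offset 0 and cover the control page plus a power-of-two
 * number of data pages, e.g.:
 *
 *	len  = (1 + 8) * page_size;
 *	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */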
2104 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2105 {
2106         struct perf_counter *counter = file->private_data;
2107         unsigned long user_locked, user_lock_limit;
2108         struct user_struct *user = current_user();
2109         unsigned long locked, lock_limit;
2110         unsigned long vma_size;
2111         unsigned long nr_pages;
2112         long user_extra, extra;
2113         int ret = 0;
2114
2115         if (!(vma->vm_flags & VM_SHARED))
2116                 return -EINVAL;
2117
2118         vma_size = vma->vm_end - vma->vm_start;
2119         nr_pages = (vma_size / PAGE_SIZE) - 1;
2120
2121         /*
2122          * If we have data pages, ensure their count is a power of two so we
2123          * can use bitmasks instead of modulo.
2124          */
2125         if (nr_pages != 0 && !is_power_of_2(nr_pages))
2126                 return -EINVAL;
2127
2128         if (vma_size != PAGE_SIZE * (1 + nr_pages))
2129                 return -EINVAL;
2130
2131         if (vma->vm_pgoff != 0)
2132                 return -EINVAL;
2133
2134         WARN_ON_ONCE(counter->ctx->parent_ctx);
2135         mutex_lock(&counter->mmap_mutex);
2136         if (atomic_inc_not_zero(&counter->mmap_count)) {
2137                 if (nr_pages != counter->data->nr_pages)
2138                         ret = -EINVAL;
2139                 goto unlock;
2140         }
2141
2142         user_extra = nr_pages + 1;
2143         user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
2144
2145         /*
2146          * Increase the limit linearly with more CPUs:
2147          */
2148         user_lock_limit *= num_online_cpus();
2149
2150         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2151
2152         extra = 0;
2153         if (user_locked > user_lock_limit)
2154                 extra = user_locked - user_lock_limit;
2155
2156         lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2157         lock_limit >>= PAGE_SHIFT;
2158         locked = vma->vm_mm->locked_vm + extra;
2159
2160         if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
2161                 ret = -EPERM;
2162                 goto unlock;
2163         }
2164
2165         WARN_ON(counter->data);
2166         ret = perf_mmap_data_alloc(counter, nr_pages);
2167         if (ret)
2168                 goto unlock;
2169
2170         atomic_set(&counter->mmap_count, 1);
2171         atomic_long_add(user_extra, &user->locked_vm);
2172         vma->vm_mm->locked_vm += extra;
2173         counter->data->nr_locked = extra;
2174         if (vma->vm_flags & VM_WRITE)
2175                 counter->data->writable = 1;
2176
2177 unlock:
2178         mutex_unlock(&counter->mmap_mutex);
2179
2180         vma->vm_flags |= VM_RESERVED;
2181         vma->vm_ops = &perf_mmap_vmops;
2182
2183         return ret;
2184 }
2185
2186 static int perf_fasync(int fd, struct file *filp, int on)
2187 {
2188         struct inode *inode = filp->f_path.dentry->d_inode;
2189         struct perf_counter *counter = filp->private_data;
2190         int retval;
2191
2192         mutex_lock(&inode->i_mutex);
2193         retval = fasync_helper(fd, filp, on, &counter->fasync);
2194         mutex_unlock(&inode->i_mutex);
2195
2196         if (retval < 0)
2197                 return retval;
2198
2199         return 0;
2200 }
2201
2202 static const struct file_operations perf_fops = {
2203         .release                = perf_release,
2204         .read                   = perf_read,
2205         .poll                   = perf_poll,
2206         .unlocked_ioctl         = perf_ioctl,
2207         .compat_ioctl           = perf_ioctl,
2208         .mmap                   = perf_mmap,
2209         .fasync                 = perf_fasync,
2210 };
2211
2212 /*
2213  * Perf counter wakeup
2214  *
2215  * If there's data, ensure we set the poll() state and publish everything
2216  * to user-space before waking everybody up.
2217  */
2218
2219 void perf_counter_wakeup(struct perf_counter *counter)
2220 {
2221         wake_up_all(&counter->waitq);
2222
2223         if (counter->pending_kill) {
2224                 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2225                 counter->pending_kill = 0;
2226         }
2227 }
2228
2229 /*
2230  * Pending wakeups
2231  *
2232  * Handle the case where we need to wake up from NMI (or rq->lock) context.
2233  *
2234  * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2235  * singly-linked list and use cmpxchg() to add entries locklessly.
2236  */
2237
2238 static void perf_pending_counter(struct perf_pending_entry *entry)
2239 {
2240         struct perf_counter *counter = container_of(entry,
2241                         struct perf_counter, pending);
2242
2243         if (counter->pending_disable) {
2244                 counter->pending_disable = 0;
2245                 perf_counter_disable(counter);
2246         }
2247
2248         if (counter->pending_wakeup) {
2249                 counter->pending_wakeup = 0;
2250                 perf_counter_wakeup(counter);
2251         }
2252 }
2253
2254 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2255
2256 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2257         PENDING_TAIL,
2258 };
2259
2260 static void perf_pending_queue(struct perf_pending_entry *entry,
2261                                void (*func)(struct perf_pending_entry *))
2262 {
2263         struct perf_pending_entry **head;
2264
2265         if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2266                 return;
2267
2268         entry->func = func;
2269
2270         head = &get_cpu_var(perf_pending_head);
2271
2272         do {
2273                 entry->next = *head;
2274         } while (cmpxchg(head, entry->next, entry) != entry->next);
2275
2276         set_perf_counter_pending();
2277
2278         put_cpu_var(perf_pending_head);
2279 }
2280
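/*
 * Splice the per-CPU pending list off with xchg() and run every queued
 * callback; returns the number of entries processed.
 */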
2281 static int __perf_pending_run(void)
2282 {
2283         struct perf_pending_entry *list;
2284         int nr = 0;
2285
2286         list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2287         while (list != PENDING_TAIL) {
2288                 void (*func)(struct perf_pending_entry *);
2289                 struct perf_pending_entry *entry = list;
2290
2291                 list = list->next;
2292
2293                 func = entry->func;
2294                 entry->next = NULL;
2295                 /*
2296                  * Ensure we observe the unqueue before we issue the wakeup,
2297                  * so that we won't be waiting forever.
2298                  * -- see perf_not_pending().
2299                  */
2300                 smp_wmb();
2301
2302                 func(entry);
2303                 nr++;
2304         }
2305
2306         return nr;
2307 }
2308
2309 static inline int perf_not_pending(struct perf_counter *counter)
2310 {
2311         /*
2312          * If we flush on whatever cpu we run, there is a chance we don't
2313          * need to wait.
2314          */
2315         get_cpu();
2316         __perf_pending_run();
2317         put_cpu();
2318
2319         /*
2320          * Ensure we see the proper queue state before going to sleep
2321          * so that we do not miss the wakeup. -- see __perf_pending_run()
2322          */
2323         smp_rmb();
2324         return counter->pending.next == NULL;
2325 }
2326
2327 static void perf_pending_sync(struct perf_counter *counter)
2328 {
2329         wait_event(counter->waitq, perf_not_pending(counter));
2330 }
2331
2332 void perf_counter_do_pending(void)
2333 {
2334         __perf_pending_run();
2335 }
2336
2337 /*
2338  * Callchain support -- arch specific
2339  */
2340
2341 __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2342 {
2343         return NULL;
2344 }
2345
2346 /*
2347  * Output
2348  */
2349
2350 struct perf_output_handle {
2351         struct perf_counter     *counter;
2352         struct perf_mmap_data   *data;
2353         unsigned long           head;
2354         unsigned long           offset;
2355         int                     nmi;
2356         int                     sample;
2357         int                     locked;
2358         unsigned long           flags;
2359 };
2360
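/*
 * For a writable buffer (user-space updates data_tail) check whether
 * writing the range [offset, head) would overwrite data the consumer
 * has not read yet; read-only buffers may always be overwritten.
 */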
2361 static bool perf_output_space(struct perf_mmap_data *data,
2362                               unsigned int offset, unsigned int head)
2363 {
2364         unsigned long tail;
2365         unsigned long mask;
2366
2367         if (!data->writable)
2368                 return true;
2369
2370         mask = (data->nr_pages << PAGE_SHIFT) - 1;
2371         /*
2372          * Userspace should issue an mb() before updating the tail pointer,
2373          * so that all of its reads of the buffer complete before the new
2374          * tail value becomes visible.
2375          */
2376         tail = ACCESS_ONCE(data->user_page->data_tail);
2377         smp_rmb();
2378
2379         offset = (offset - tail) & mask;
2380         head   = (head   - tail) & mask;
2381
2382         if ((int)(head - offset) < 0)
2383                 return false;
2384
2385         return true;
2386 }
2387
2388 static void perf_output_wakeup(struct perf_output_handle *handle)
2389 {
2390         atomic_set(&handle->data->poll, POLL_IN);
2391
2392         if (handle->nmi) {
2393                 handle->counter->pending_wakeup = 1;
2394                 perf_pending_queue(&handle->counter->pending,
2395                                    perf_pending_counter);
2396         } else
2397                 perf_counter_wakeup(handle->counter);
2398 }
2399
2400 /*
2401  * Curious locking construct.
2402  *
2403  * We need to ensure a later event doesn't publish a head when a former
2404  * event isn't done writing. However since we need to deal with NMIs we
2405  * cannot fully serialize things.
2406  *
2407  * What we do is serialize between CPUs so we only have to deal with NMI
2408  * nesting on a single CPU.
2409  *
2410  * We only publish the head (and generate a wakeup) when the outer-most
2411  * event completes.
2412  */
2413 static void perf_output_lock(struct perf_output_handle *handle)
2414 {
2415         struct perf_mmap_data *data = handle->data;
2416         int cpu;
2417
2418         handle->locked = 0;
2419
2420         local_irq_save(handle->flags);
2421         cpu = smp_processor_id();
2422
2423         if (in_nmi() && atomic_read(&data->lock) == cpu)
2424                 return;
2425
2426         while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2427                 cpu_relax();
2428
2429         handle->locked = 1;
2430 }
2431
2432 static void perf_output_unlock(struct perf_output_handle *handle)
2433 {
2434         struct perf_mmap_data *data = handle->data;
2435         unsigned long head;
2436         int cpu;
2437
2438         data->done_head = data->head;
2439
2440         if (!handle->locked)
2441                 goto out;
2442
2443 again:
2444         /*
2445          * The xchg implies a full barrier that ensures all writes are done
2446          * before we publish the new head, matched by a rmb() in userspace when
2447          * reading this position.
2448          */
2449         while ((head = atomic_long_xchg(&data->done_head, 0)))
2450                 data->user_page->data_head = head;
2451
2452         /*
2453          * NMI can happen here, which means we can miss a done_head update.
2454          */
2455
2456         cpu = atomic_xchg(&data->lock, -1);
2457         WARN_ON_ONCE(cpu != smp_processor_id());
2458
2459         /*
2460          * Therefore we have to check that we did not indeed miss one.
2461          */
2462         if (unlikely(atomic_long_read(&data->done_head))) {
2463                 /*
2464                  * Since we had it locked, we can lock it again.
2465                  */
2466                 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2467                         cpu_relax();
2468
2469                 goto again;
2470         }
2471
2472         if (atomic_xchg(&data->wakeup, 0))
2473                 perf_output_wakeup(handle);
2474 out:
2475         local_irq_restore(handle->flags);
2476 }
2477
2478 static void perf_output_copy(struct perf_output_handle *handle,
2479                              const void *buf, unsigned int len)
2480 {
2481         unsigned int pages_mask;
2482         unsigned int offset;
2483         unsigned int size;
2484         void **pages;
2485
2486         offset          = handle->offset;
2487         pages_mask      = handle->data->nr_pages - 1;
2488         pages           = handle->data->data_pages;
2489
2490         do {
2491                 unsigned int page_offset;
2492                 int nr;
2493
2494                 nr          = (offset >> PAGE_SHIFT) & pages_mask;
2495                 page_offset = offset & (PAGE_SIZE - 1);
2496                 size        = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2497
2498                 memcpy(pages[nr] + page_offset, buf, size);
2499
2500                 len         -= size;
2501                 buf         += size;
2502                 offset      += size;
2503         } while (len);
2504
2505         handle->offset = offset;
2506
2507         /*
2508          * Check we didn't copy past our reservation window, taking the
2509          * possible unsigned int wrap into account.
2510          */
2511         WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2512 }
2513
2514 #define perf_output_put(handle, x) \
2515         perf_output_copy((handle), &(x), sizeof(x))
2516
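/*
 * Reserve @size bytes of buffer space for @counter.  On success the
 * caller writes the record with perf_output_put()/perf_output_copy()
 * and finishes with perf_output_end().  When no space is available the
 * event is accounted in data->lost and a PERF_EVENT_LOST record is
 * emitted ahead of the next record that does fit.
 */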
2517 static int perf_output_begin(struct perf_output_handle *handle,
2518                              struct perf_counter *counter, unsigned int size,
2519                              int nmi, int sample)
2520 {
2521         struct perf_mmap_data *data;
2522         unsigned int offset, head;
2523         int have_lost;
2524         struct {
2525                 struct perf_event_header header;
2526                 u64                      id;
2527                 u64                      lost;
2528         } lost_event;
2529
2530         /*
2531          * For inherited counters we send all the output towards the parent.
2532          */
2533         if (counter->parent)
2534                 counter = counter->parent;
2535
2536         rcu_read_lock();
2537         data = rcu_dereference(counter->data);
2538         if (!data)
2539                 goto out;
2540
2541         handle->data    = data;
2542         handle->counter = counter;
2543         handle->nmi     = nmi;
2544         handle->sample  = sample;
2545
2546         if (!data->nr_pages)
2547                 goto fail;
2548
2549         have_lost = atomic_read(&data->lost);
2550         if (have_lost)
2551                 size += sizeof(lost_event);
2552
2553         perf_output_lock(handle);
2554
2555         do {
2556                 offset = head = atomic_long_read(&data->head);
2557                 head += size;
2558                 if (unlikely(!perf_output_space(data, offset, head)))
2559                         goto fail;
2560         } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2561
2562         handle->offset  = offset;
2563         handle->head    = head;
2564
2565         if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2566                 atomic_set(&data->wakeup, 1);
2567
2568         if (have_lost) {
2569                 lost_event.header.type = PERF_EVENT_LOST;
2570                 lost_event.header.misc = 0;
2571                 lost_event.header.size = sizeof(lost_event);
2572                 lost_event.id          = counter->id;
2573                 lost_event.lost        = atomic_xchg(&data->lost, 0);
2574
2575                 perf_output_put(handle, lost_event);
2576         }
2577
2578         return 0;
2579
2580 fail:
2581         atomic_inc(&data->lost);
2582         perf_output_unlock(handle);
2583 out:
2584         rcu_read_unlock();
2585
2586         return -ENOSPC;
2587 }
2588
2589 static void perf_output_end(struct perf_output_handle *handle)
2590 {
2591         struct perf_counter *counter = handle->counter;
2592         struct perf_mmap_data *data = handle->data;
2593
2594         int wakeup_events = counter->attr.wakeup_events;
2595
2596         if (handle->sample && wakeup_events) {
2597                 int events = atomic_inc_return(&data->events);
2598                 if (events >= wakeup_events) {
2599                         atomic_sub(wakeup_events, &data->events);
2600                         atomic_set(&data->wakeup, 1);
2601                 }
2602         }
2603
2604         perf_output_unlock(handle);
2605         rcu_read_unlock();
2606 }
2607
2608 static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2609 {
2610         /*
2611          * only top level counters have the pid namespace they were created in
2612          */
2613         if (counter->parent)
2614                 counter = counter->parent;
2615
2616         return task_tgid_nr_ns(p, counter->ns);
2617 }
2618
2619 static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2620 {
2621         /*
2622          * only top level counters have the pid namespace they were created in
2623          */
2624         if (counter->parent)
2625                 counter = counter->parent;
2626
2627         return task_pid_nr_ns(p, counter->ns);
2628 }
2629
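/*
 * Assemble and emit a PERF_EVENT_SAMPLE record.  The fields are sized
 * in a first pass, driven by attr.sample_type, and then written out in
 * the same order in a second pass.
 */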
2630 static void perf_counter_output(struct perf_counter *counter, int nmi,
2631                                 struct perf_sample_data *data)
2632 {
2633         int ret;
2634         u64 sample_type = counter->attr.sample_type;
2635         struct perf_output_handle handle;
2636         struct perf_event_header header;
2637         u64 ip;
2638         struct {
2639                 u32 pid, tid;
2640         } tid_entry;
2641         struct {
2642                 u64 id;
2643                 u64 counter;
2644         } group_entry;
2645         struct perf_callchain_entry *callchain = NULL;
2646         int callchain_size = 0;
2647         u64 time;
2648         struct {
2649                 u32 cpu, reserved;
2650         } cpu_entry;
2651
2652         header.type = PERF_EVENT_SAMPLE;
2653         header.size = sizeof(header);
2654
2655         header.misc = 0;
2656         header.misc |= perf_misc_flags(data->regs);
2657
2658         if (sample_type & PERF_SAMPLE_IP) {
2659                 ip = perf_instruction_pointer(data->regs);
2660                 header.size += sizeof(ip);
2661         }
2662
2663         if (sample_type & PERF_SAMPLE_TID) {
2664                 /* namespace issues */
2665                 tid_entry.pid = perf_counter_pid(counter, current);
2666                 tid_entry.tid = perf_counter_tid(counter, current);
2667
2668                 header.size += sizeof(tid_entry);
2669         }
2670
2671         if (sample_type & PERF_SAMPLE_TIME) {
2672                 /*
2673                  * Maybe do better on x86 and provide cpu_clock_nmi()
2674                  */
2675                 time = sched_clock();
2676
2677                 header.size += sizeof(u64);
2678         }
2679
2680         if (sample_type & PERF_SAMPLE_ADDR)
2681                 header.size += sizeof(u64);
2682
2683         if (sample_type & PERF_SAMPLE_ID)
2684                 header.size += sizeof(u64);
2685
2686         if (sample_type & PERF_SAMPLE_STREAM_ID)
2687                 header.size += sizeof(u64);
2688
2689         if (sample_type & PERF_SAMPLE_CPU) {
2690                 header.size += sizeof(cpu_entry);
2691
2692                 cpu_entry.cpu = raw_smp_processor_id();
2693                 cpu_entry.reserved = 0;
2694         }
2695
2696         if (sample_type & PERF_SAMPLE_PERIOD)
2697                 header.size += sizeof(u64);
2698
2699         if (sample_type & PERF_SAMPLE_GROUP) {
2700                 header.size += sizeof(u64) +
2701                         counter->nr_siblings * sizeof(group_entry);
2702         }
2703
2704         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2705                 callchain = perf_callchain(data->regs);
2706
2707                 if (callchain) {
2708                         callchain_size = (1 + callchain->nr) * sizeof(u64);
2709                         header.size += callchain_size;
2710                 } else
2711                         header.size += sizeof(u64);
2712         }
2713
2714         ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2715         if (ret)
2716                 return;
2717
2718         perf_output_put(&handle, header);
2719
2720         if (sample_type & PERF_SAMPLE_IP)
2721                 perf_output_put(&handle, ip);
2722
2723         if (sample_type & PERF_SAMPLE_TID)
2724                 perf_output_put(&handle, tid_entry);
2725
2726         if (sample_type & PERF_SAMPLE_TIME)
2727                 perf_output_put(&handle, time);
2728
2729         if (sample_type & PERF_SAMPLE_ADDR)
2730                 perf_output_put(&handle, data->addr);
2731
2732         if (sample_type & PERF_SAMPLE_ID) {
2733                 u64 id = primary_counter_id(counter);
2734
2735                 perf_output_put(&handle, id);
2736         }
2737
2738         if (sample_type & PERF_SAMPLE_STREAM_ID)
2739                 perf_output_put(&handle, counter->id);
2740
2741         if (sample_type & PERF_SAMPLE_CPU)
2742                 perf_output_put(&handle, cpu_entry);
2743
2744         if (sample_type & PERF_SAMPLE_PERIOD)
2745                 perf_output_put(&handle, data->period);
2746
2747         /*
2748          * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
2749          */
2750         if (sample_type & PERF_SAMPLE_GROUP) {
2751                 struct perf_counter *leader, *sub;
2752                 u64 nr = counter->nr_siblings;
2753
2754                 perf_output_put(&handle, nr);
2755
2756                 leader = counter->group_leader;
2757                 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2758                         if (sub != counter)
2759                                 sub->pmu->read(sub);
2760
2761                         group_entry.id = primary_counter_id(sub);
2762                         group_entry.counter = atomic64_read(&sub->count);
2763
2764                         perf_output_put(&handle, group_entry);
2765                 }
2766         }
2767
2768         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2769                 if (callchain)
2770                         perf_output_copy(&handle, callchain, callchain_size);
2771                 else {
2772                         u64 nr = 0;
2773                         perf_output_put(&handle, nr);
2774                 }
2775         }
2776
2777         perf_output_end(&handle);
2778 }
2779
2780 /*
2781  * read event
2782  */
2783
2784 struct perf_read_event {
2785         struct perf_event_header        header;
2786
2787         u32                             pid;
2788         u32                             tid;
2789         u64                             value;
2790         u64                             format[3];
2791 };
2792
2793 static void
2794 perf_counter_read_event(struct perf_counter *counter,
2795                         struct task_struct *task)
2796 {
2797         struct perf_output_handle handle;
2798         struct perf_read_event event = {
2799                 .header = {
2800                         .type = PERF_EVENT_READ,
2801                         .misc = 0,
2802                         .size = sizeof(event) - sizeof(event.format),
2803                 },
2804                 .pid = perf_counter_pid(counter, task),
2805                 .tid = perf_counter_tid(counter, task),
2806                 .value = atomic64_read(&counter->count),
2807         };
2808         int ret, i = 0;
2809
2810         if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2811                 event.header.size += sizeof(u64);
2812                 event.format[i++] = counter->total_time_enabled;
2813         }
2814
2815         if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2816                 event.header.size += sizeof(u64);
2817                 event.format[i++] = counter->total_time_running;
2818         }
2819
2820         if (counter->attr.read_format & PERF_FORMAT_ID) {
2821                 event.header.size += sizeof(u64);
2822                 event.format[i++] = primary_counter_id(counter);
2823         }
2824
2825         ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
2826         if (ret)
2827                 return;
2828
2829         perf_output_copy(&handle, &event, event.header.size);
2830         perf_output_end(&handle);
2831 }
2832
2833 /*
2834  * fork tracking
2835  */
2836
2837 struct perf_fork_event {
2838         struct task_struct      *task;
2839
2840         struct {
2841                 struct perf_event_header        header;
2842
2843                 u32                             pid;
2844                 u32                             ppid;
2845         } event;
2846 };
2847
2848 static void perf_counter_fork_output(struct perf_counter *counter,
2849                                      struct perf_fork_event *fork_event)
2850 {
2851         struct perf_output_handle handle;
2852         int size = fork_event->event.header.size;
2853         struct task_struct *task = fork_event->task;
2854         int ret = perf_output_begin(&handle, counter, size, 0, 0);
2855
2856         if (ret)
2857                 return;
2858
2859         fork_event->event.pid = perf_counter_pid(counter, task);
2860         fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
2861
2862         perf_output_put(&handle, fork_event->event);
2863         perf_output_end(&handle);
2864 }
2865
2866 static int perf_counter_fork_match(struct perf_counter *counter)
2867 {
2868         if (counter->attr.comm || counter->attr.mmap)
2869                 return 1;
2870
2871         return 0;
2872 }
2873
2874 static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2875                                   struct perf_fork_event *fork_event)
2876 {
2877         struct perf_counter *counter;
2878
2879         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2880                 return;
2881
2882         rcu_read_lock();
2883         list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2884                 if (perf_counter_fork_match(counter))
2885                         perf_counter_fork_output(counter, fork_event);
2886         }
2887         rcu_read_unlock();
2888 }
2889
2890 static void perf_counter_fork_event(struct perf_fork_event *fork_event)
2891 {
2892         struct perf_cpu_context *cpuctx;
2893         struct perf_counter_context *ctx;
2894
2895         cpuctx = &get_cpu_var(perf_cpu_context);
2896         perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
2897         put_cpu_var(perf_cpu_context);
2898
2899         rcu_read_lock();
2900         /*
2901          * It doesn't really matter which of the child contexts the
2902          * event ends up in.
2903          */
2904         ctx = rcu_dereference(current->perf_counter_ctxp);
2905         if (ctx)
2906                 perf_counter_fork_ctx(ctx, fork_event);
2907         rcu_read_unlock();
2908 }
2909
2910 void perf_counter_fork(struct task_struct *task)
2911 {
2912         struct perf_fork_event fork_event;
2913
2914         if (!atomic_read(&nr_comm_counters) &&
2915             !atomic_read(&nr_mmap_counters))
2916                 return;
2917
2918         fork_event = (struct perf_fork_event){
2919                 .task   = task,
2920                 .event  = {
2921                         .header = {
2922                                 .type = PERF_EVENT_FORK,
2923                                 .misc = 0,
2924                                 .size = sizeof(fork_event.event),
2925                         },
2926                         /* .pid  */
2927                         /* .ppid */
2928                 },
2929         };
2930
2931         perf_counter_fork_event(&fork_event);
2932 }
2933
2934 /*
2935  * comm tracking
2936  */
2937
2938 struct perf_comm_event {
2939         struct task_struct      *task;
2940         char                    *comm;
2941         int                     comm_size;
2942
2943         struct {
2944                 struct perf_event_header        header;
2945
2946                 u32                             pid;
2947                 u32                             tid;
2948         } event;
2949 };
2950
2951 static void perf_counter_comm_output(struct perf_counter *counter,
2952                                      struct perf_comm_event *comm_event)
2953 {
2954         struct perf_output_handle handle;
2955         int size = comm_event->event.header.size;
2956         int ret = perf_output_begin(&handle, counter, size, 0, 0);
2957
2958         if (ret)
2959                 return;
2960
2961         comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
2962         comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
2963
2964         perf_output_put(&handle, comm_event->event);
2965         perf_output_copy(&handle, comm_event->comm,
2966                                    comm_event->comm_size);
2967         perf_output_end(&handle);
2968 }
2969
2970 static int perf_counter_comm_match(struct perf_counter *counter)
2971 {
2972         if (counter->attr.comm)
2973                 return 1;
2974
2975         return 0;
2976 }
2977
2978 static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
2979                                   struct perf_comm_event *comm_event)
2980 {
2981         struct perf_counter *counter;
2982
2983         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2984                 return;
2985
2986         rcu_read_lock();
2987         list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2988                 if (perf_counter_comm_match(counter))
2989                         perf_counter_comm_output(counter, comm_event);
2990         }
2991         rcu_read_unlock();
2992 }
2993
2994 static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2995 {
2996         struct perf_cpu_context *cpuctx;
2997         struct perf_counter_context *ctx;
2998         unsigned int size;
2999         char comm[TASK_COMM_LEN];
3000
3001         memset(comm, 0, sizeof(comm));
3002         strncpy(comm, comm_event->task->comm, sizeof(comm));
3003         size = ALIGN(strlen(comm)+1, sizeof(u64));
3004
3005         comm_event->comm = comm;
3006         comm_event->comm_size = size;
3007
3008         comm_event->event.header.size = sizeof(comm_event->event) + size;
3009
3010         cpuctx = &get_cpu_var(perf_cpu_context);
3011         perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
3012         put_cpu_var(perf_cpu_context);
3013
3014         rcu_read_lock();
3015         /*
3016          * It doesn't really matter which of the child contexts the
3017          * event ends up in.
3018          */
3019         ctx = rcu_dereference(current->perf_counter_ctxp);
3020         if (ctx)
3021                 perf_counter_comm_ctx(ctx, comm_event);
3022         rcu_read_unlock();
3023 }
3024
3025 void perf_counter_comm(struct task_struct *task)
3026 {
3027         struct perf_comm_event comm_event;
3028
3029         if (task->perf_counter_ctxp)
3030                 perf_counter_enable_on_exec(task);
3031
3032         if (!atomic_read(&nr_comm_counters))
3033                 return;
3034
3035         comm_event = (struct perf_comm_event){
3036                 .task   = task,
3037                 /* .comm      */
3038                 /* .comm_size */
3039                 .event  = {
3040                         .header = {
3041                                 .type = PERF_EVENT_COMM,
3042                                 .misc = 0,
3043                                 /* .size */
3044                         },
3045                         /* .pid */
3046                         /* .tid */
3047                 },
3048         };
3049
3050         perf_counter_comm_event(&comm_event);
3051 }
3052
3053 /*
3054  * mmap tracking
3055  */
3056
3057 struct perf_mmap_event {
3058         struct vm_area_struct   *vma;
3059
3060         const char              *file_name;
3061         int                     file_size;
3062
3063         struct {
3064                 struct perf_event_header        header;
3065
3066                 u32                             pid;
3067                 u32                             tid;
3068                 u64                             start;
3069                 u64                             len;
3070                 u64                             pgoff;
3071         } event;
3072 };
3073
3074 static void perf_counter_mmap_output(struct perf_counter *counter,
3075                                      struct perf_mmap_event *mmap_event)
3076 {
3077         struct perf_output_handle handle;
3078         int size = mmap_event->event.header.size;
3079         int ret = perf_output_begin(&handle, counter, size, 0, 0);
3080
3081         if (ret)
3082                 return;
3083
3084         mmap_event->event.pid = perf_counter_pid(counter, current);
3085         mmap_event->event.tid = perf_counter_tid(counter, current);
3086
3087         perf_output_put(&handle, mmap_event->event);
3088         perf_output_copy(&handle, mmap_event->file_name,
3089                                    mmap_event->file_size);
3090         perf_output_end(&handle);
3091 }
3092
3093 static int perf_counter_mmap_match(struct perf_counter *counter,
3094                                    struct perf_mmap_event *mmap_event)
3095 {
3096         if (counter->attr.mmap)
3097                 return 1;
3098
3099         return 0;
3100 }
3101
3102 static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
3103                                   struct perf_mmap_event *mmap_event)
3104 {
3105         struct perf_counter *counter;
3106
3107         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3108                 return;
3109
3110         rcu_read_lock();
3111         list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3112                 if (perf_counter_mmap_match(counter, mmap_event))
3113                         perf_counter_mmap_output(counter, mmap_event);
3114         }
3115         rcu_read_unlock();
3116 }
3117
3118 static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
3119 {
3120         struct perf_cpu_context *cpuctx;
3121         struct perf_counter_context *ctx;
3122         struct vm_area_struct *vma = mmap_event->vma;
3123         struct file *file = vma->vm_file;
3124         unsigned int size;
3125         char tmp[16];
3126         char *buf = NULL;
3127         const char *name;
3128
3129         memset(tmp, 0, sizeof(tmp));
3130
3131         if (file) {
3132                 /*
3133                  * d_path works from the end of the buffer backwards, so we
3134                  * need to add enough zero bytes after the string to handle
3135                  * the 64bit alignment we do later.
3136                  */
3137                 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3138                 if (!buf) {
3139                         name = strncpy(tmp, "//enomem", sizeof(tmp));
3140                         goto got_name;
3141                 }
3142                 name = d_path(&file->f_path, buf, PATH_MAX);
3143                 if (IS_ERR(name)) {
3144                         name = strncpy(tmp, "//toolong", sizeof(tmp));
3145                         goto got_name;
3146                 }
3147         } else {
3148                 if (arch_vma_name(mmap_event->vma)) {
3149                         name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3150                                        sizeof(tmp));
3151                         goto got_name;
3152                 }
3153
3154                 if (!vma->vm_mm) {
3155                         name = strncpy(tmp, "[vdso]", sizeof(tmp));
3156                         goto got_name;
3157                 }
3158
3159                 name = strncpy(tmp, "//anon", sizeof(tmp));
3160                 goto got_name;
3161         }
3162
3163 got_name:
3164         size = ALIGN(strlen(name)+1, sizeof(u64));
3165
3166         mmap_event->file_name = name;
3167         mmap_event->file_size = size;
3168
3169         mmap_event->event.header.size = sizeof(mmap_event->event) + size;
3170
3171         cpuctx = &get_cpu_var(perf_cpu_context);
3172         perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
3173         put_cpu_var(perf_cpu_context);
3174
3175         rcu_read_lock();
3176         /*
3177          * It doesn't really matter which of the child contexts the
3178          * event ends up in.
3179          */
3180         ctx = rcu_dereference(current->perf_counter_ctxp);
3181         if (ctx)
3182                 perf_counter_mmap_ctx(ctx, mmap_event);
3183         rcu_read_unlock();
3184
3185         kfree(buf);
3186 }
3187
3188 void __perf_counter_mmap(struct vm_area_struct *vma)
3189 {
3190         struct perf_mmap_event mmap_event;
3191
3192         if (!atomic_read(&nr_mmap_counters))
3193                 return;
3194
3195         mmap_event = (struct perf_mmap_event){
3196                 .vma    = vma,
3197                 /* .file_name */
3198                 /* .file_size */
3199                 .event  = {
3200                         .header = {
3201                                 .type = PERF_EVENT_MMAP,
3202                                 .misc = 0,
3203                                 /* .size */
3204                         },
3205                         /* .pid */
3206                         /* .tid */
3207                         .start  = vma->vm_start,
3208                         .len    = vma->vm_end - vma->vm_start,
3209                         .pgoff  = vma->vm_pgoff,
3210                 },
3211         };
3212
3213         perf_counter_mmap_event(&mmap_event);
3214 }
3215
3216 /*
3217  * IRQ throttle logging
3218  */
3219
3220 static void perf_log_throttle(struct perf_counter *counter, int enable)
3221 {
3222         struct perf_output_handle handle;
3223         int ret;
3224
3225         struct {
3226                 struct perf_event_header        header;
3227                 u64                             time;
3228                 u64                             id;
3229                 u64                             stream_id;
3230         } throttle_event = {
3231                 .header = {
3232                         .type = PERF_EVENT_THROTTLE,
3233                         .misc = 0,
3234                         .size = sizeof(throttle_event),
3235                 },
3236                 .time           = sched_clock(),
3237                 .id             = primary_counter_id(counter),
3238                 .stream_id      = counter->id,
3239         };
3240
3241         if (enable)
3242                 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3243
3244         ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3245         if (ret)
3246                 return;
3247
3248         perf_output_put(&handle, throttle_event);
3249         perf_output_end(&handle);
3250 }
3251
3252 /*
3253  * Generic counter overflow handling, sampling.
3254  */
3255
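/*
 * Returns 1 when the counter should be stopped (its interrupt rate had
 * to be throttled or event_limit was reached), 0 otherwise.
 */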
3256 int perf_counter_overflow(struct perf_counter *counter, int nmi,
3257                           struct perf_sample_data *data)
3258 {
3259         int events = atomic_read(&counter->event_limit);
3260         int throttle = counter->pmu->unthrottle != NULL;
3261         struct hw_perf_counter *hwc = &counter->hw;
3262         int ret = 0;
3263
3264         if (!throttle) {
3265                 hwc->interrupts++;
3266         } else {
3267                 if (hwc->interrupts != MAX_INTERRUPTS) {
3268                         hwc->interrupts++;
3269                         if (HZ * hwc->interrupts >
3270                                         (u64)sysctl_perf_counter_sample_rate) {
3271                                 hwc->interrupts = MAX_INTERRUPTS;
3272                                 perf_log_throttle(counter, 0);
3273                                 ret = 1;
3274                         }
3275                 } else {
3276                         /*
3277                          * Keep re-disabling the counter even though we disabled it
3278                          * on the previous pass - just in case we raced with a
3279                          * sched-in and it got enabled again:
3280                          */
3281                         ret = 1;
3282                 }
3283         }
3284
3285         if (counter->attr.freq) {
3286                 u64 now = sched_clock();
3287                 s64 delta = now - hwc->freq_stamp;
3288
3289                 hwc->freq_stamp = now;
3290
3291                 if (delta > 0 && delta < TICK_NSEC)
3292                         perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3293         }
3294
3295         /*
3296          * XXX event_limit might not quite work as expected on inherited
3297          * counters
3298          */
3299
3300         counter->pending_kill = POLL_IN;
3301         if (events && atomic_dec_and_test(&counter->event_limit)) {
3302                 ret = 1;
3303                 counter->pending_kill = POLL_HUP;
3304                 if (nmi) {
3305                         counter->pending_disable = 1;
3306                         perf_pending_queue(&counter->pending,
3307                                            perf_pending_counter);
3308                 } else
3309                         perf_counter_disable(counter);
3310         }
3311
3312         perf_counter_output(counter, nmi, data);
3313         return ret;
3314 }
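
/*
 * A worked example of the throttle test above: with HZ = 1000 and the
 * default sysctl_perf_counter_sample_rate of 100000, the condition
 * "HZ * hwc->interrupts > sample_rate" trips once a counter has taken
 * more than 100 interrupts within one tick.  From then on hwc->interrupts
 * stays at MAX_INTERRUPTS and every call keeps returning ret = 1
 * ("please throttle me") until the counter is unthrottled again, at which
 * point perf_log_throttle(counter, 1) records a PERF_EVENT_UNTHROTTLE.
 */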
3315
3316 /*
3317  * Generic software counter infrastructure
3318  */
3319
3320 static void perf_swcounter_update(struct perf_counter *counter)
3321 {
3322         struct hw_perf_counter *hwc = &counter->hw;
3323         u64 prev, now;
3324         s64 delta;
3325
3326 again:
3327         prev = atomic64_read(&hwc->prev_count);
3328         now = atomic64_read(&hwc->count);
3329         if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
3330                 goto again;
3331
3332         delta = now - prev;
3333
3334         atomic64_add(delta, &counter->count);
3335         atomic64_sub(delta, &hwc->period_left);
3336 }
3337
3338 static void perf_swcounter_set_period(struct perf_counter *counter)
3339 {
3340         struct hw_perf_counter *hwc = &counter->hw;
3341         s64 left = atomic64_read(&hwc->period_left);
3342         s64 period = hwc->sample_period;
3343
3344         if (unlikely(left <= -period)) {
3345                 left = period;
3346                 atomic64_set(&hwc->period_left, left);
3347                 hwc->last_period = period;
3348         }
3349
3350         if (unlikely(left <= 0)) {
3351                 left += period;
3352                 atomic64_add(period, &hwc->period_left);
3353                 hwc->last_period = period;
3354         }
3355
3356         atomic64_set(&hwc->prev_count, -left);
3357         atomic64_set(&hwc->count, -left);
3358 }
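
/*
 * A small worked example of the arithmetic above, assuming
 * hwc->sample_period = 100 on a freshly enabled counter: period_left is
 * 100, so both prev_count and count start out at -100.  Each
 * perf_swcounter_add() then moves count towards zero, and the increment
 * that reaches zero makes atomic64_add_negative() return false, which is
 * what fires perf_swcounter_overflow(); perf_swcounter_update() folds the
 * 100 events into counter->count before the next period is armed.
 */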
3359
3360 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3361 {
3362         enum hrtimer_restart ret = HRTIMER_RESTART;
3363         struct perf_sample_data data;
3364         struct perf_counter *counter;
3365         u64 period;
3366
3367         counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3368         counter->pmu->read(counter);
3369
3370         data.addr = 0;
3371         data.regs = get_irq_regs();
3372         /*
3373          * In case we exclude kernel IPs or are somehow not in interrupt
3374          * context, provide the next best thing, the user IP.
3375          */
3376         if ((counter->attr.exclude_kernel || !data.regs) &&
3377                         !counter->attr.exclude_user)
3378                 data.regs = task_pt_regs(current);
3379
3380         if (data.regs) {
3381                 if (perf_counter_overflow(counter, 0, &data))
3382                         ret = HRTIMER_NORESTART;
3383         }
3384
3385         period = max_t(u64, 10000, counter->hw.sample_period);
3386         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3387
3388         return ret;
3389 }
3390
3391 static void perf_swcounter_overflow(struct perf_counter *counter,
3392                                     int nmi, struct perf_sample_data *data)
3393 {
3394         data->period = counter->hw.last_period;
3395
3396         perf_swcounter_update(counter);
3397         perf_swcounter_set_period(counter);
3398         if (perf_counter_overflow(counter, nmi, data))
3399                 /* soft-disable the counter */
3400                 ;
3401 }
3402
3403 static int perf_swcounter_is_counting(struct perf_counter *counter)
3404 {
3405         struct perf_counter_context *ctx;
3406         unsigned long flags;
3407         int count;
3408
3409         if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3410                 return 1;
3411
3412         if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3413                 return 0;
3414
3415         /*
3416          * If the counter is inactive, it could be just because
3417          * its task is scheduled out, or because it's in a group
3418          * which could not go on the PMU.  We want to count in
3419          * the first case but not the second.  If the context is
3420          * currently active then an inactive software counter must
3421          * be the second case.  If it's not currently active then
3422          * we need to know whether the counter was active when the
3423          * context was last active, which we can determine by
3424          * comparing counter->tstamp_stopped with ctx->time.
3425          *
3426          * We are within an RCU read-side critical section,
3427          * which protects the existence of *ctx.
3428          */
3429         ctx = counter->ctx;
3430         spin_lock_irqsave(&ctx->lock, flags);
3431         count = 1;
3432         /* Re-check the state now that we have the lock */
3433         if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
3434             counter->ctx->is_active ||
3435             counter->tstamp_stopped < ctx->time)
3436                 count = 0;
3437         spin_unlock_irqrestore(&ctx->lock, flags);
3438         return count;
3439 }
3440
3441 static int perf_swcounter_match(struct perf_counter *counter,
3442                                 enum perf_type_id type,
3443                                 u32 event, struct pt_regs *regs)
3444 {
3445         if (!perf_swcounter_is_counting(counter))
3446                 return 0;
3447
3448         if (counter->attr.type != type)
3449                 return 0;
3450         if (counter->attr.config != event)
3451                 return 0;
3452
3453         if (regs) {
3454                 if (counter->attr.exclude_user && user_mode(regs))
3455                         return 0;
3456
3457                 if (counter->attr.exclude_kernel && !user_mode(regs))
3458                         return 0;
3459         }
3460
3461         return 1;
3462 }
3463
3464 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3465                                int nmi, struct perf_sample_data *data)
3466 {
3467         int neg = atomic64_add_negative(nr, &counter->hw.count);
3468
3469         if (counter->hw.sample_period && !neg && data->regs)
3470                 perf_swcounter_overflow(counter, nmi, data);
3471 }
3472
3473 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3474                                      enum perf_type_id type,
3475                                      u32 event, u64 nr, int nmi,
3476                                      struct perf_sample_data *data)
3477 {
3478         struct perf_counter *counter;
3479
3480         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3481                 return;
3482
3483         rcu_read_lock();
3484         list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3485                 if (perf_swcounter_match(counter, type, event, data->regs))
3486                         perf_swcounter_add(counter, nr, nmi, data);
3487         }
3488         rcu_read_unlock();
3489 }
3490
3491 static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3492 {
3493         if (in_nmi())
3494                 return &cpuctx->recursion[3];
3495
3496         if (in_irq())
3497                 return &cpuctx->recursion[2];
3498
3499         if (in_softirq())
3500                 return &cpuctx->recursion[1];
3501
3502         return &cpuctx->recursion[0];
3503 }
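
/*
 * The four levels mirror the contexts an event can fire from (task,
 * softirq, hardirq, NMI).  An event raised from, say, hardirq context
 * while a task-context event is still being processed on the same CPU
 * uses a different recursion slot and is therefore still counted; only a
 * second event from the *same* context is dropped by the recursion check
 * in do_perf_swcounter_event() below.
 */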
3504
3505 static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3506                                     u64 nr, int nmi,
3507                                     struct perf_sample_data *data)
3508 {
3509         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3510         int *recursion = perf_swcounter_recursion_context(cpuctx);
3511         struct perf_counter_context *ctx;
3512
3513         if (*recursion)
3514                 goto out;
3515
3516         (*recursion)++;
3517         barrier();
3518
3519         perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3520                                  nr, nmi, data);
3521         rcu_read_lock();
3522         /*
3523          * It doesn't really matter which of the child contexts the
3524          * event ends up in.
3525          */
3526         ctx = rcu_dereference(current->perf_counter_ctxp);
3527         if (ctx)
3528                 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3529         rcu_read_unlock();
3530
3531         barrier();
3532         (*recursion)--;
3533
3534 out:
3535         put_cpu_var(perf_cpu_context);
3536 }
3537
3538 void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3539                             struct pt_regs *regs, u64 addr)
3540 {
3541         struct perf_sample_data data = {
3542                 .regs = regs,
3543                 .addr = addr,
3544         };
3545
3546         do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3547 }
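
/*
 * A hedged sketch of a call site: users of this go through the
 * perf_swcounter_event() wrapper in the perf_counter header, which checks
 * perf_swcounter_enabled[] (declared below) so that disabled events cost
 * little more than an atomic_read().  A fault handler, for instance,
 * might do:
 *
 *	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 *
 * i.e. count one event, not in NMI context, with the faulting registers
 * and address attached as sample data.
 */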
3548
3549 static void perf_swcounter_read(struct perf_counter *counter)
3550 {
3551         perf_swcounter_update(counter);
3552 }
3553
3554 static int perf_swcounter_enable(struct perf_counter *counter)
3555 {
3556         perf_swcounter_set_period(counter);
3557         return 0;
3558 }
3559
3560 static void perf_swcounter_disable(struct perf_counter *counter)
3561 {
3562         perf_swcounter_update(counter);
3563 }
3564
3565 static const struct pmu perf_ops_generic = {
3566         .enable         = perf_swcounter_enable,
3567         .disable        = perf_swcounter_disable,
3568         .read           = perf_swcounter_read,
3569 };
3570
3571 /*
3572  * Software counter: cpu wall time clock
3573  */
3574
3575 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3576 {
3577         int cpu = raw_smp_processor_id();
3578         s64 prev;
3579         u64 now;
3580
3581         now = cpu_clock(cpu);
3582         prev = atomic64_read(&counter->hw.prev_count);
3583         atomic64_set(&counter->hw.prev_count, now);
3584         atomic64_add(now - prev, &counter->count);
3585 }
3586
3587 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3588 {
3589         struct hw_perf_counter *hwc = &counter->hw;
3590         int cpu = raw_smp_processor_id();
3591
3592         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3593         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3594         hwc->hrtimer.function = perf_swcounter_hrtimer;
3595         if (hwc->sample_period) {
3596                 u64 period = max_t(u64, 10000, hwc->sample_period);
3597                 __hrtimer_start_range_ns(&hwc->hrtimer,
3598                                 ns_to_ktime(period), 0,
3599                                 HRTIMER_MODE_REL, 0);
3600         }
3601
3602         return 0;
3603 }
3604
3605 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3606 {
3607         if (counter->hw.sample_period)
3608                 hrtimer_cancel(&counter->hw.hrtimer);
3609         cpu_clock_perf_counter_update(counter);
3610 }
3611
3612 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
3613 {
3614         cpu_clock_perf_counter_update(counter);
3615 }
3616
3617 static const struct pmu perf_ops_cpu_clock = {
3618         .enable         = cpu_clock_perf_counter_enable,
3619         .disable        = cpu_clock_perf_counter_disable,
3620         .read           = cpu_clock_perf_counter_read,
3621 };
3622
3623 /*
3624  * Software counter: task time clock
3625  */
3626
3627 static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3628 {
3629         u64 prev;
3630         s64 delta;
3631
3632         prev = atomic64_xchg(&counter->hw.prev_count, now);
3633         delta = now - prev;
3634         atomic64_add(delta, &counter->count);
3635 }
3636
3637 static int task_clock_perf_counter_enable(struct perf_counter *counter)
3638 {
3639         struct hw_perf_counter *hwc = &counter->hw;
3640         u64 now;
3641
3642         now = counter->ctx->time;
3643
3644         atomic64_set(&hwc->prev_count, now);
3645         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3646         hwc->hrtimer.function = perf_swcounter_hrtimer;
3647         if (hwc->sample_period) {
3648                 u64 period = max_t(u64, 10000, hwc->sample_period);
3649                 __hrtimer_start_range_ns(&hwc->hrtimer,
3650                                 ns_to_ktime(period), 0,
3651                                 HRTIMER_MODE_REL, 0);
3652         }
3653
3654         return 0;
3655 }
3656
3657 static void task_clock_perf_counter_disable(struct perf_counter *counter)
3658 {
3659         if (counter->hw.sample_period)
3660                 hrtimer_cancel(&counter->hw.hrtimer);
3661         task_clock_perf_counter_update(counter, counter->ctx->time);
3662
3663 }
3664
3665 static void task_clock_perf_counter_read(struct perf_counter *counter)
3666 {
3667         u64 time;
3668
3669         if (!in_nmi()) {
3670                 update_context_time(counter->ctx);
3671                 time = counter->ctx->time;
3672         } else {
3673                 u64 now = perf_clock();
3674                 u64 delta = now - counter->ctx->timestamp;
3675                 time = counter->ctx->time + delta;
3676         }
3677
3678         task_clock_perf_counter_update(counter, time);
3679 }
3680
3681 static const struct pmu perf_ops_task_clock = {
3682         .enable         = task_clock_perf_counter_enable,
3683         .disable        = task_clock_perf_counter_disable,
3684         .read           = task_clock_perf_counter_read,
3685 };
3686
3687 #ifdef CONFIG_EVENT_PROFILE
3688 void perf_tpcounter_event(int event_id)
3689 {
3690         struct perf_sample_data data = {
3691                 .regs = get_irq_regs(),
3692                 .addr = 0,
3693         };
3694
3695         if (!data.regs)
3696                 data.regs = task_pt_regs(current);
3697
3698         do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
3699 }
3700 EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3701
3702 extern int ftrace_profile_enable(int);
3703 extern void ftrace_profile_disable(int);
3704
3705 static void tp_perf_counter_destroy(struct perf_counter *counter)
3706 {
3707         ftrace_profile_disable(counter->attr.config);
3708 }
3709
3710 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3711 {
3712         if (ftrace_profile_enable(counter->attr.config))
3713                 return NULL;
3714
3715         counter->destroy = tp_perf_counter_destroy;
3716
3717         return &perf_ops_generic;
3718 }
3719 #else
3720 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3721 {
3722         return NULL;
3723 }
3724 #endif
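
/*
 * To sum up the tracepoint glue: such a counter is requested with
 * attr.type = PERF_TYPE_TRACEPOINT and attr.config set to the ftrace
 * event id; tp_perf_counter_init() enables profiling for that event and
 * reuses the generic software pmu, so each hit arrives through
 * perf_tpcounter_event() above.
 */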
3725
3726 atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3727
3728 static void sw_perf_counter_destroy(struct perf_counter *counter)
3729 {
3730         u64 event = counter->attr.config;
3731
3732         WARN_ON(counter->parent);
3733
3734         atomic_dec(&perf_swcounter_enabled[event]);
3735 }
3736
3737 static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3738 {
3739         const struct pmu *pmu = NULL;
3740         u64 event = counter->attr.config;
3741
3742         /*
3743          * Software counters (currently) can't in general distinguish
3744          * between user, kernel and hypervisor events.
3745          * However, context switches and cpu migrations are considered
3746          * to be kernel events, and page faults are never hypervisor
3747          * events.
3748          */
3749         switch (event) {
3750         case PERF_COUNT_SW_CPU_CLOCK:
3751                 pmu = &perf_ops_cpu_clock;
3752
3753                 break;
3754         case PERF_COUNT_SW_TASK_CLOCK:
3755                 /*
3756                  * If the user instantiates this as a per-cpu counter,
3757                  * use the cpu_clock counter instead.
3758                  */
3759                 if (counter->ctx->task)
3760                         pmu = &perf_ops_task_clock;
3761                 else
3762                         pmu = &perf_ops_cpu_clock;
3763
3764                 break;
3765         case PERF_COUNT_SW_PAGE_FAULTS:
3766         case PERF_COUNT_SW_PAGE_FAULTS_MIN:
3767         case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3768         case PERF_COUNT_SW_CONTEXT_SWITCHES:
3769         case PERF_COUNT_SW_CPU_MIGRATIONS:
3770                 if (!counter->parent) {
3771                         atomic_inc(&perf_swcounter_enabled[event]);
3772                         counter->destroy = sw_perf_counter_destroy;
3773                 }
3774                 pmu = &perf_ops_generic;
3775                 break;
3776         }
3777
3778         return pmu;
3779 }
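
/*
 * For example, a counter opened with attr.type = PERF_TYPE_SOFTWARE and
 * attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES lands on perf_ops_generic
 * and bumps perf_swcounter_enabled[PERF_COUNT_SW_CONTEXT_SWITCHES], at
 * which point the context-switch accounting elsewhere in this file starts
 * feeding it through perf_swcounter_event().
 */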
3780
3781 /*
3782  * Allocate and initialize a counter structure
3783  */
3784 static struct perf_counter *
3785 perf_counter_alloc(struct perf_counter_attr *attr,
3786                    int cpu,
3787                    struct perf_counter_context *ctx,
3788                    struct perf_counter *group_leader,
3789                    struct perf_counter *parent_counter,
3790                    gfp_t gfpflags)
3791 {
3792         const struct pmu *pmu;
3793         struct perf_counter *counter;
3794         struct hw_perf_counter *hwc;
3795         long err;
3796
3797         counter = kzalloc(sizeof(*counter), gfpflags);
3798         if (!counter)
3799                 return ERR_PTR(-ENOMEM);
3800
3801         /*
3802          * Single counters are their own group leaders, with an
3803          * empty sibling list:
3804          */
3805         if (!group_leader)
3806                 group_leader = counter;
3807
3808         mutex_init(&counter->child_mutex);
3809         INIT_LIST_HEAD(&counter->child_list);
3810
3811         INIT_LIST_HEAD(&counter->list_entry);
3812         INIT_LIST_HEAD(&counter->event_entry);
3813         INIT_LIST_HEAD(&counter->sibling_list);
3814         init_waitqueue_head(&counter->waitq);
3815
3816         mutex_init(&counter->mmap_mutex);
3817
3818         counter->cpu            = cpu;
3819         counter->attr           = *attr;
3820         counter->group_leader   = group_leader;
3821         counter->pmu            = NULL;
3822         counter->ctx            = ctx;
3823         counter->oncpu          = -1;
3824
3825         counter->parent         = parent_counter;
3826
3827         counter->ns             = get_pid_ns(current->nsproxy->pid_ns);
3828         counter->id             = atomic64_inc_return(&perf_counter_id);
3829
3830         counter->state          = PERF_COUNTER_STATE_INACTIVE;
3831
3832         if (attr->disabled)
3833                 counter->state = PERF_COUNTER_STATE_OFF;
3834
3835         pmu = NULL;
3836
3837         hwc = &counter->hw;
3838         hwc->sample_period = attr->sample_period;
3839         if (attr->freq && attr->sample_freq)
3840                 hwc->sample_period = 1;
3841
3842         atomic64_set(&hwc->period_left, hwc->sample_period);
3843
3844         /*
3845          * we currently do not support PERF_SAMPLE_GROUP on inherited counters
3846          */
3847         if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
3848                 goto done;
3849
3850         switch (attr->type) {
3851         case PERF_TYPE_RAW:
3852         case PERF_TYPE_HARDWARE:
3853         case PERF_TYPE_HW_CACHE:
3854                 pmu = hw_perf_counter_init(counter);
3855                 break;
3856
3857         case PERF_TYPE_SOFTWARE:
3858                 pmu = sw_perf_counter_init(counter);
3859                 break;
3860
3861         case PERF_TYPE_TRACEPOINT:
3862                 pmu = tp_perf_counter_init(counter);
3863                 break;
3864
3865         default:
3866                 break;
3867         }
3868 done:
3869         err = 0;
3870         if (!pmu)
3871                 err = -EINVAL;
3872         else if (IS_ERR(pmu))
3873                 err = PTR_ERR(pmu);
3874
3875         if (err) {
3876                 if (counter->ns)
3877                         put_pid_ns(counter->ns);
3878                 kfree(counter);
3879                 return ERR_PTR(err);
3880         }
3881
3882         counter->pmu = pmu;
3883
3884         if (!counter->parent) {
3885                 atomic_inc(&nr_counters);
3886                 if (counter->attr.mmap)
3887                         atomic_inc(&nr_mmap_counters);
3888                 if (counter->attr.comm)
3889                         atomic_inc(&nr_comm_counters);
3890         }
3891
3892         return counter;
3893 }
3894
3895 static int perf_copy_attr(struct perf_counter_attr __user *uattr,
3896                           struct perf_counter_attr *attr)
3897 {
3898         int ret;
3899         u32 size;
3900
3901         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
3902                 return -EFAULT;
3903
3904         /*
3905          * zero the full structure, so that a short copy leaves the tail zeroed.
3906          */
3907         memset(attr, 0, sizeof(*attr));
3908
3909         ret = get_user(size, &uattr->size);
3910         if (ret)
3911                 return ret;
3912
3913         if (size > PAGE_SIZE)   /* silly large */
3914                 goto err_size;
3915
3916         if (!size)              /* abi compat */
3917                 size = PERF_ATTR_SIZE_VER0;
3918
3919         if (size < PERF_ATTR_SIZE_VER0)
3920                 goto err_size;
3921
3922         /*
3923          * If we're handed a bigger struct than we know of,
3924          * ensure all the unknown bits are 0.
3925          */
3926         if (size > sizeof(*attr)) {
3927                 unsigned long val;
3928                 unsigned long __user *addr;
3929                 unsigned long __user *end;
3930
3931                 addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
3932                                 sizeof(unsigned long));
3933                 end  = PTR_ALIGN((void __user *)uattr + size,
3934                                 sizeof(unsigned long));
3935
3936                 for (; addr < end; addr++) {
3937                         ret = get_user(val, addr);
3938                         if (ret)
3939                                 return ret;
3940                         if (val)
3941                                 goto err_size;
3942                 }
3943         }
3944
3945         ret = copy_from_user(attr, uattr, size);
3946         if (ret)
3947                 return -EFAULT;
3948
3949         /*
3950          * If the type exists, the corresponding creation will verify
3951          * the attr->config.
3952          */
3953         if (attr->type >= PERF_TYPE_MAX)
3954                 return -EINVAL;
3955
3956         if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
3957                 return -EINVAL;
3958
3959         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
3960                 return -EINVAL;
3961
3962         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
3963                 return -EINVAL;
3964
3965 out:
3966         return ret;
3967
3968 err_size:
3969         put_user(sizeof(*attr), &uattr->size);
3970         ret = -E2BIG;
3971         goto out;
3972 }
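
/*
 * A worked example of the size handling above: user space built against
 * an older ABI can pass size = PERF_ATTR_SIZE_VER0 and every newer field
 * simply reads as zero thanks to the memset().  User space passing a
 * *larger* struct than this kernel knows about is fine as long as all the
 * extra bytes are zero; otherwise the kernel writes its own sizeof(*attr)
 * back into uattr->size and fails with -E2BIG, telling the caller how
 * much of the struct it actually understands.
 */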
3973
3974 /**
3975  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
3976  *
3977  * @attr_uptr:  event type attributes for monitoring/sampling
3978  * @pid:                target pid
3979  * @cpu:                target cpu
3980  * @group_fd:           group leader counter fd
3981  */
3982 SYSCALL_DEFINE5(perf_counter_open,
3983                 struct perf_counter_attr __user *, attr_uptr,
3984                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
3985 {
3986         struct perf_counter *counter, *group_leader;
3987         struct perf_counter_attr attr;
3988         struct perf_counter_context *ctx;
3989         struct file *counter_file = NULL;
3990         struct file *group_file = NULL;
3991         int fput_needed = 0;
3992         int fput_needed2 = 0;
3993         int ret;
3994
3995         /* for future expandability... */
3996         if (flags)
3997                 return -EINVAL;
3998
3999         ret = perf_copy_attr(attr_uptr, &attr);
4000         if (ret)
4001                 return ret;
4002
4003         if (!attr.exclude_kernel) {
4004                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4005                         return -EACCES;
4006         }
4007
4008         if (attr.freq) {
4009                 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
4010                         return -EINVAL;
4011         }
4012
4013         /*
4014          * Get the target context (task or percpu):
4015          */
4016         ctx = find_get_context(pid, cpu);
4017         if (IS_ERR(ctx))
4018                 return PTR_ERR(ctx);
4019
4020         /*
4021          * Look up the group leader (we will attach this counter to it):
4022          */
4023         group_leader = NULL;
4024         if (group_fd != -1) {
4025                 ret = -EINVAL;
4026                 group_file = fget_light(group_fd, &fput_needed);
4027                 if (!group_file)
4028                         goto err_put_context;
4029                 if (group_file->f_op != &perf_fops)
4030                         goto err_put_context;
4031
4032                 group_leader = group_file->private_data;
4033                 /*
4034                  * Do not allow a recursive hierarchy (this new sibling
4035                  * becoming part of another group-sibling):
4036                  */
4037                 if (group_leader->group_leader != group_leader)
4038                         goto err_put_context;
4039                 /*
4040                  * Do not allow to attach to a group in a different
4041                  * task or CPU context:
4042                  */
4043                 if (group_leader->ctx != ctx)
4044                         goto err_put_context;
4045                 /*
4046                  * Only a group leader can be exclusive or pinned
4047                  */
4048                 if (attr.exclusive || attr.pinned)
4049                         goto err_put_context;
4050         }
4051
4052         counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
4053                                      NULL, GFP_KERNEL);
4054         ret = PTR_ERR(counter);
4055         if (IS_ERR(counter))
4056                 goto err_put_context;
4057
4058         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
4059         if (ret < 0)
4060                 goto err_free_put_context;
4061
4062         counter_file = fget_light(ret, &fput_needed2);
4063         if (!counter_file)
4064                 goto err_free_put_context;
4065
4066         counter->filp = counter_file;
4067         WARN_ON_ONCE(ctx->parent_ctx);
4068         mutex_lock(&ctx->mutex);
4069         perf_install_in_context(ctx, counter, cpu);
4070         ++ctx->generation;
4071         mutex_unlock(&ctx->mutex);
4072
4073         counter->owner = current;
4074         get_task_struct(current);
4075         mutex_lock(&current->perf_counter_mutex);
4076         list_add_tail(&counter->owner_entry, &current->perf_counter_list);
4077         mutex_unlock(&current->perf_counter_mutex);
4078
4079         fput_light(counter_file, fput_needed2);
4080
4081 out_fput:
4082         fput_light(group_file, fput_needed);
4083
4084         return ret;
4085
4086 err_free_put_context:
4087         kfree(counter);
4088
4089 err_put_context:
4090         put_ctx(ctx);
4091
4092         goto out_fput;
4093 }
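
/*
 * A minimal user-space sketch of how this syscall is meant to be used
 * (illustrative only: the __NR_perf_counter_open number, the ioctl names
 * and the exact attr fields come from the user-visible perf_counter
 * header and the architecture's unistd.h, so treat this as a sketch
 * rather than a reference):
 *
 *	struct perf_counter_attr attr;
 *	unsigned long long count;
 *	int fd;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.size = sizeof(attr);
 *	attr.type = PERF_TYPE_HARDWARE;
 *	attr.config = PERF_COUNT_HW_CPU_CYCLES;
 *	attr.disabled = 1;
 *	attr.exclude_kernel = 1;
 *
 *	fd = syscall(__NR_perf_counter_open, &attr, 0, -1, -1, 0);
 *	ioctl(fd, PERF_COUNTER_IOC_ENABLE, 0);
 *	... run the workload ...
 *	ioctl(fd, PERF_COUNTER_IOC_DISABLE, 0);
 *	read(fd, &count, sizeof(count));
 *
 * pid 0 targets the calling task, cpu -1 means "any cpu the task runs on",
 * group_fd -1 creates a new group leader, and flags must be 0 (see the
 * check at the top of the function).
 */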
4094
4095 /*
4096  * inherit a counter from parent task to child task:
4097  */
4098 static struct perf_counter *
4099 inherit_counter(struct perf_counter *parent_counter,
4100               struct task_struct *parent,
4101               struct perf_counter_context *parent_ctx,
4102               struct task_struct *child,
4103               struct perf_counter *group_leader,
4104               struct perf_counter_context *child_ctx)
4105 {
4106         struct perf_counter *child_counter;
4107
4108         /*
4109          * Instead of creating recursive hierarchies of counters,
4110          * we link inherited counters back to the original parent,
4111          * which has a filp for sure, which we use as the reference
4112          * count:
4113          */
4114         if (parent_counter->parent)
4115                 parent_counter = parent_counter->parent;
4116
4117         child_counter = perf_counter_alloc(&parent_counter->attr,
4118                                            parent_counter->cpu, child_ctx,
4119                                            group_leader, parent_counter,
4120                                            GFP_KERNEL);
4121         if (IS_ERR(child_counter))
4122                 return child_counter;
4123         get_ctx(child_ctx);
4124
4125         /*
4126          * Make the child state follow the state of the parent counter,
4127          * not its attr.disabled bit.  We hold the parent's mutex,
4128          * so we won't race with perf_counter_{en, dis}able_family.
4129          */
4130         if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
4131                 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
4132         else
4133                 child_counter->state = PERF_COUNTER_STATE_OFF;
4134
4135         if (parent_counter->attr.freq)
4136                 child_counter->hw.sample_period = parent_counter->hw.sample_period;
4137
4138         /*
4139          * Link it up in the child's context:
4140          */
4141         add_counter_to_ctx(child_counter, child_ctx);
4142
4143         /*
4144          * Get a reference to the parent filp - we will fput it
4145          * when the child counter exits. This is safe to do because
4146          * we are in the parent and we know that the filp still
4147          * exists and has a nonzero count:
4148          */
4149         atomic_long_inc(&parent_counter->filp->f_count);
4150
4151         /*
4152          * Link this into the parent counter's child list
4153          */
4154         WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4155         mutex_lock(&parent_counter->child_mutex);
4156         list_add_tail(&child_counter->child_list, &parent_counter->child_list);
4157         mutex_unlock(&parent_counter->child_mutex);
4158
4159         return child_counter;
4160 }
4161
4162 static int inherit_group(struct perf_counter *parent_counter,
4163               struct task_struct *parent,
4164               struct perf_counter_context *parent_ctx,
4165               struct task_struct *child,
4166               struct perf_counter_context *child_ctx)
4167 {
4168         struct perf_counter *leader;
4169         struct perf_counter *sub;
4170         struct perf_counter *child_ctr;
4171
4172         leader = inherit_counter(parent_counter, parent, parent_ctx,
4173                                  child, NULL, child_ctx);
4174         if (IS_ERR(leader))
4175                 return PTR_ERR(leader);
4176         list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
4177                 child_ctr = inherit_counter(sub, parent, parent_ctx,
4178                                             child, leader, child_ctx);
4179                 if (IS_ERR(child_ctr))
4180                         return PTR_ERR(child_ctr);
4181         }
4182         return 0;
4183 }
4184
4185 static void sync_child_counter(struct perf_counter *child_counter,
4186                                struct task_struct *child)
4187 {
4188         struct perf_counter *parent_counter = child_counter->parent;
4189         u64 child_val;
4190
4191         if (child_counter->attr.inherit_stat)
4192                 perf_counter_read_event(child_counter, child);
4193
4194         child_val = atomic64_read(&child_counter->count);
4195
4196         /*
4197          * Add back the child's count to the parent's count:
4198          */
4199         atomic64_add(child_val, &parent_counter->count);
4200         atomic64_add(child_counter->total_time_enabled,
4201                      &parent_counter->child_total_time_enabled);
4202         atomic64_add(child_counter->total_time_running,
4203                      &parent_counter->child_total_time_running);
4204
4205         /*
4206          * Remove this counter from the parent's list
4207          */
4208         WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4209         mutex_lock(&parent_counter->child_mutex);
4210         list_del_init(&child_counter->child_list);
4211         mutex_unlock(&parent_counter->child_mutex);
4212
4213         /*
4214          * Release the parent counter, if this was the last
4215          * reference to it.
4216          */
4217         fput(parent_counter->filp);
4218 }
4219
4220 static void
4221 __perf_counter_exit_task(struct perf_counter *child_counter,
4222                          struct perf_counter_context *child_ctx,
4223                          struct task_struct *child)
4224 {
4225         struct perf_counter *parent_counter;
4226
4227         update_counter_times(child_counter);
4228         perf_counter_remove_from_context(child_counter);
4229
4230         parent_counter = child_counter->parent;
4231         /*
4232          * It can happen that the parent exits first, and has counters
4233          * that are still around due to the child reference. These
4234          * counters need to be zapped - but otherwise linger.
4235          */
4236         if (parent_counter) {
4237                 sync_child_counter(child_counter, child);
4238                 free_counter(child_counter);
4239         }
4240 }
4241
4242 /*
4243  * When a child task exits, feed back counter values to parent counters.
4244  */
4245 void perf_counter_exit_task(struct task_struct *child)
4246 {
4247         struct perf_counter *child_counter, *tmp;
4248         struct perf_counter_context *child_ctx;
4249         unsigned long flags;
4250
4251         if (likely(!child->perf_counter_ctxp))
4252                 return;
4253
4254         local_irq_save(flags);
4255         /*
4256          * We can't reschedule here because interrupts are disabled,
4257          * and either child is current or it is a task that can't be
4258          * scheduled, so we are now safe from rescheduling changing
4259          * our context.
4260          */
4261         child_ctx = child->perf_counter_ctxp;
4262         __perf_counter_task_sched_out(child_ctx);
4263
4264         /*
4265          * Take the context lock here so that if find_get_context is
4266          * reading child->perf_counter_ctxp, we wait until it has
4267          * incremented the context's refcount before we do put_ctx below.
4268          */
4269         spin_lock(&child_ctx->lock);
4270         child->perf_counter_ctxp = NULL;
4271         /*
4272          * If this context is a clone, unclone it so it can't get
4273          * swapped to another process while we're removing all
4274          * the counters from it.
4275          */
4276         unclone_ctx(child_ctx);
4277         spin_unlock(&child_ctx->lock);
4278         local_irq_restore(flags);
4279
4280         /*
4281          * We can recurse on the same lock type through:
4282          *
4283          *   __perf_counter_exit_task()
4284          *     sync_child_counter()
4285          *       fput(parent_counter->filp)
4286          *         perf_release()
4287          *           mutex_lock(&ctx->mutex)
4288          *
4289          * But since it's the parent context it won't be the same instance.
4290          */
4291         mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4292
4293 again:
4294         list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4295                                  list_entry)
4296                 __perf_counter_exit_task(child_counter, child_ctx, child);
4297
4298         /*
4299          * If the last counter was a group counter, it will have appended all
4300          * its siblings to the list, but we obtained 'tmp' before that which
4301          * will still point to the list head terminating the iteration.
4302          */
4303         if (!list_empty(&child_ctx->counter_list))
4304                 goto again;
4305
4306         mutex_unlock(&child_ctx->mutex);
4307
4308         put_ctx(child_ctx);
4309 }
4310
4311 /*
4312  * Free an unexposed, unused context, as created by inheritance in
4313  * perf_counter_init_task() below; used by fork() in case of failure.
4314  */
4315 void perf_counter_free_task(struct task_struct *task)
4316 {
4317         struct perf_counter_context *ctx = task->perf_counter_ctxp;
4318         struct perf_counter *counter, *tmp;
4319
4320         if (!ctx)
4321                 return;
4322
4323         mutex_lock(&ctx->mutex);
4324 again:
4325         list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
4326                 struct perf_counter *parent = counter->parent;
4327
4328                 if (WARN_ON_ONCE(!parent))
4329                         continue;
4330
4331                 mutex_lock(&parent->child_mutex);
4332                 list_del_init(&counter->child_list);
4333                 mutex_unlock(&parent->child_mutex);
4334
4335                 fput(parent->filp);
4336
4337                 list_del_counter(counter, ctx);
4338                 free_counter(counter);
4339         }
4340
4341         if (!list_empty(&ctx->counter_list))
4342                 goto again;
4343
4344         mutex_unlock(&ctx->mutex);
4345
4346         put_ctx(ctx);
4347 }
4348
4349 /*
4350  * Initialize the perf_counter context in task_struct
4351  */
4352 int perf_counter_init_task(struct task_struct *child)
4353 {
4354         struct perf_counter_context *child_ctx, *parent_ctx;
4355         struct perf_counter_context *cloned_ctx;
4356         struct perf_counter *counter;
4357         struct task_struct *parent = current;
4358         int inherited_all = 1;
4359         int ret = 0;
4360
4361         child->perf_counter_ctxp = NULL;
4362
4363         mutex_init(&child->perf_counter_mutex);
4364         INIT_LIST_HEAD(&child->perf_counter_list);
4365
4366         if (likely(!parent->perf_counter_ctxp))
4367                 return 0;
4368
4369         /*
4370          * This is executed from the parent task context, so inherit
4371          * counters that have been marked for cloning.
4372          * First allocate and initialize a context for the child.
4373          */
4374
4375         child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4376         if (!child_ctx)
4377                 return -ENOMEM;
4378
4379         __perf_counter_init_context(child_ctx, child);
4380         child->perf_counter_ctxp = child_ctx;
4381         get_task_struct(child);
4382
4383         /*
4384          * If the parent's context is a clone, pin it so it won't get
4385          * swapped under us.
4386          */
4387         parent_ctx = perf_pin_task_context(parent);
4388
4389         /*
4390          * No need to check if parent_ctx != NULL here; since we saw
4391          * it non-NULL earlier, the only reason for it to become NULL
4392          * is if we exit, and since we're currently in the middle of
4393          * a fork we can't be exiting at the same time.
4394          */
4395
4396         /*
4397          * Lock the parent list. No need to lock the child - not PID
4398          * hashed yet and not running, so nobody can access it.
4399          */
4400         mutex_lock(&parent_ctx->mutex);
4401
4402         /*
4403          * We don't have to disable NMIs - we are only looking at
4404          * the list, not manipulating it:
4405          */
4406         list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
4407                 if (counter != counter->group_leader)
4408                         continue;
4409
4410                 if (!counter->attr.inherit) {
4411                         inherited_all = 0;
4412                         continue;
4413                 }
4414
4415                 ret = inherit_group(counter, parent, parent_ctx,
4416                                              child, child_ctx);
4417                 if (ret) {
4418                         inherited_all = 0;
4419                         break;
4420                 }
4421         }
4422
4423         if (inherited_all) {
4424                 /*
4425                  * Mark the child context as a clone of the parent
4426                  * context, or of whatever the parent is a clone of.
4427                  * Note that if the parent is a clone, it could get
4428                  * uncloned at any point, but that doesn't matter
4429                  * because the list of counters and the generation
4430                  * count can't have changed since we took the mutex.
4431                  */
4432                 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4433                 if (cloned_ctx) {
4434                         child_ctx->parent_ctx = cloned_ctx;
4435                         child_ctx->parent_gen = parent_ctx->parent_gen;
4436                 } else {
4437                         child_ctx->parent_ctx = parent_ctx;
4438                         child_ctx->parent_gen = parent_ctx->generation;
4439                 }
4440                 get_ctx(child_ctx->parent_ctx);
4441         }
4442
4443         mutex_unlock(&parent_ctx->mutex);
4444
4445         perf_unpin_context(parent_ctx);
4446
4447         return ret;
4448 }
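
/*
 * Putting the inheritance pieces together: a counter created with
 * attr.inherit = 1 is cloned into each child's context here at fork time
 * (inherit_group() -> inherit_counter()), and when the child exits,
 * sync_child_counter() folds the child's count and times back into the
 * parent counter.  Counters without attr.inherit are simply not copied,
 * which also keeps the child context from being marked as a clone of the
 * parent's.
 */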
4449
4450 static void __cpuinit perf_counter_init_cpu(int cpu)
4451 {
4452         struct perf_cpu_context *cpuctx;
4453
4454         cpuctx = &per_cpu(perf_cpu_context, cpu);
4455         __perf_counter_init_context(&cpuctx->ctx, NULL);
4456
4457         spin_lock(&perf_resource_lock);
4458         cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4459         spin_unlock(&perf_resource_lock);
4460
4461         hw_perf_counter_setup(cpu);
4462 }
4463
4464 #ifdef CONFIG_HOTPLUG_CPU
4465 static void __perf_counter_exit_cpu(void *info)
4466 {
4467         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4468         struct perf_counter_context *ctx = &cpuctx->ctx;
4469         struct perf_counter *counter, *tmp;
4470
4471         list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
4472                 __perf_counter_remove_from_context(counter);
4473 }
4474 static void perf_counter_exit_cpu(int cpu)
4475 {
4476         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4477         struct perf_counter_context *ctx = &cpuctx->ctx;
4478
4479         mutex_lock(&ctx->mutex);
4480         smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
4481         mutex_unlock(&ctx->mutex);
4482 }
4483 #else
4484 static inline void perf_counter_exit_cpu(int cpu) { }
4485 #endif
4486
4487 static int __cpuinit
4488 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4489 {
4490         unsigned int cpu = (long)hcpu;
4491
4492         switch (action) {
4493
4494         case CPU_UP_PREPARE:
4495         case CPU_UP_PREPARE_FROZEN:
4496                 perf_counter_init_cpu(cpu);
4497                 break;
4498
4499         case CPU_DOWN_PREPARE:
4500         case CPU_DOWN_PREPARE_FROZEN:
4501                 perf_counter_exit_cpu(cpu);
4502                 break;
4503
4504         default:
4505                 break;
4506         }
4507
4508         return NOTIFY_OK;
4509 }
4510
4511 /*
4512  * This has to have a higher priority than migration_notifier in sched.c.
4513  */
4514 static struct notifier_block __cpuinitdata perf_cpu_nb = {
4515         .notifier_call          = perf_cpu_notify,
4516         .priority               = 20,
4517 };
4518
4519 void __init perf_counter_init(void)
4520 {
4521         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4522                         (void *)(long)smp_processor_id());
4523         register_cpu_notifier(&perf_cpu_nb);
4524 }
4525
4526 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4527 {
4528         return sprintf(buf, "%d\n", perf_reserved_percpu);
4529 }
4530
4531 static ssize_t
4532 perf_set_reserve_percpu(struct sysdev_class *class,
4533                         const char *buf,
4534                         size_t count)
4535 {
4536         struct perf_cpu_context *cpuctx;
4537         unsigned long val;
4538         int err, cpu, mpt;
4539
4540         err = strict_strtoul(buf, 10, &val);
4541         if (err)
4542                 return err;
4543         if (val > perf_max_counters)
4544                 return -EINVAL;
4545
4546         spin_lock(&perf_resource_lock);
4547         perf_reserved_percpu = val;
4548         for_each_online_cpu(cpu) {
4549                 cpuctx = &per_cpu(perf_cpu_context, cpu);
4550                 spin_lock_irq(&cpuctx->ctx.lock);
4551                 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4552                           perf_max_counters - perf_reserved_percpu);
4553                 cpuctx->max_pertask = mpt;
4554                 spin_unlock_irq(&cpuctx->ctx.lock);
4555         }
4556         spin_unlock(&perf_resource_lock);
4557
4558         return count;
4559 }
4560
4561 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4562 {
4563         return sprintf(buf, "%d\n", perf_overcommit);
4564 }
4565
4566 static ssize_t
4567 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4568 {
4569         unsigned long val;
4570         int err;
4571
4572         err = strict_strtoul(buf, 10, &val);
4573         if (err)
4574                 return err;
4575         if (val > 1)
4576                 return -EINVAL;
4577
4578         spin_lock(&perf_resource_lock);
4579         perf_overcommit = val;
4580         spin_unlock(&perf_resource_lock);
4581
4582         return count;
4583 }
4584
4585 static SYSDEV_CLASS_ATTR(
4586                                 reserve_percpu,
4587                                 0644,
4588                                 perf_show_reserve_percpu,
4589                                 perf_set_reserve_percpu
4590                         );
4591
4592 static SYSDEV_CLASS_ATTR(
4593                                 overcommit,
4594                                 0644,
4595                                 perf_show_overcommit,
4596                                 perf_set_overcommit
4597                         );
4598
4599 static struct attribute *perfclass_attrs[] = {
4600         &attr_reserve_percpu.attr,
4601         &attr_overcommit.attr,
4602         NULL
4603 };
4604
4605 static struct attribute_group perfclass_attr_group = {
4606         .attrs                  = perfclass_attrs,
4607         .name                   = "perf_counters",
4608 };
4609
4610 static int __init perf_counter_sysfs_init(void)
4611 {
4612         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4613                                   &perfclass_attr_group);
4614 }
4615 device_initcall(perf_counter_sysfs_init);
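
/*
 * With the group registered on the cpu sysdev class, the two knobs above
 * should show up as /sys/devices/system/cpu/perf_counters/reserve_percpu
 * and /sys/devices/system/cpu/perf_counters/overcommit (assuming the
 * usual sysfs mount point); writing a decimal value to either one takes
 * effect immediately through the store handlers above.
 */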