Merge branch 'core/signal' into perfcounters/core
[linux-2.6] / Documentation / perf_counter / builtin-stat.c
1 /*
2  * kerneltop.c: show top kernel functions - performance counters showcase
3
4    Build with:
5
6      cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8    Sample output:
9
10 ------------------------------------------------------------------------------
11  KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
12 ------------------------------------------------------------------------------
13
14              weight         RIP          kernel function
15              ______   ________________   _______________
16
17               35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18               33.00 - ffffffff804cb740 : sock_alloc_send_skb
19               31.26 - ffffffff804ce808 : skb_push
20               22.43 - ffffffff80510004 : tcp_established_options
21               19.00 - ffffffff8027d250 : find_get_page
22               15.76 - ffffffff804e4fc9 : eth_type_trans
23               15.20 - ffffffff804d8baa : dst_release
24               14.86 - ffffffff804cf5d8 : skb_release_head_state
25               14.00 - ffffffff802217d5 : read_hpet
26               12.00 - ffffffff804ffb7f : __ip_local_out
27               11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28                8.54 - ffffffff805001a3 : ip_queue_xmit
29  */
30
31 /*
32  * perfstat:  /usr/bin/time -alike performance counter statistics utility
33
34           It summarizes the counter events of all tasks (and child tasks),
35           covering all CPUs that the command (or workload) executes on.
36           It only counts the per-task events of the workload started,
37           independent of how many other tasks run on those CPUs.
38
39    Sample output:
40
41    $ ./perfstat -e instructions -e cache-misses -e branch-misses ls -lR /usr/include/ >/dev/null
42
43    Performance counter stats for 'ls':
44
45            163516953 instructions
46                 2295 cache-misses
47              2855182 branch-misses
48  */
49
50  /*
51   * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52   *
53   * Improvements and fixes by:
54   *
55   *   Arjan van de Ven <arjan@linux.intel.com>
56   *   Yanmin Zhang <yanmin.zhang@intel.com>
57   *   Wu Fengguang <fengguang.wu@intel.com>
58   *   Mike Galbraith <efault@gmx.de>
59   *   Paul Mackerras <paulus@samba.org>
60   *
61   * Released under the GPL v2. (and only v2, not any later version)
62   */
63
64 #include "util/util.h"
65
66 #include <getopt.h>
67 #include <assert.h>
68 #include <fcntl.h>
69 #include <stdio.h>
70 #include <errno.h>
71 #include <time.h>
72 #include <sched.h>
73 #include <pthread.h>
74
75 #include <sys/syscall.h>
76 #include <sys/ioctl.h>
77 #include <sys/poll.h>
78 #include <sys/prctl.h>
79 #include <sys/wait.h>
80 #include <sys/uio.h>
81 #include <sys/mman.h>
82
83 #include <linux/unistd.h>
84 #include <linux/types.h>
85
86 #include "../../include/linux/perf_counter.h"
87
88
89 /*
90  * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
91  * counters in the current task.
92  */
93 #define PR_TASK_PERF_COUNTERS_DISABLE   31
94 #define PR_TASK_PERF_COUNTERS_ENABLE    32
95
/*
 * Read CLOCK_MONOTONIC and yield the timestamp as a single nanosecond
 * count (seconds * 10^9 + nanoseconds).
 */
#define rdclock()						\
({								\
	struct timespec now;					\
								\
	clock_gettime(CLOCK_MONOTONIC, &now);			\
	now.tv_sec * 1000000000ULL + now.tv_nsec;		\
})
103
104 /*
105  * Pick up some kernel type conventions:
106  */
107 #define __user
108 #define asmlinkage
109
/*
 * Per-architecture glue: the perf_counter_open syscall number, a read
 * memory barrier (rmb) and a busy-wait hint (cpu_relax).
 *
 * The macros expand to one asm statement WITHOUT a trailing semicolon, so
 * that "if (x) cpu_relax(); else ..." parses as intended; the caller
 * supplies the semicolon.  __asm__/__volatile__ are used instead of the
 * plain keywords so the macros also build in strict ISO modes
 * (-std=c99, -std=c11), where "asm" is not a keyword.
 */
#ifdef __x86_64__
#define __NR_perf_counter_open 295
#define rmb()		__asm__ __volatile__("lfence" ::: "memory")
#define cpu_relax()	__asm__ __volatile__("rep; nop" ::: "memory")
#endif

#ifdef __i386__
#define __NR_perf_counter_open 333
#define rmb()		__asm__ __volatile__("lfence" ::: "memory")
#define cpu_relax()	__asm__ __volatile__("rep; nop" ::: "memory")
#endif

#ifdef __powerpc__
#define __NR_perf_counter_open 319
#define rmb()		__asm__ __volatile__("sync" ::: "memory")
#define cpu_relax()	__asm__ __volatile__("" ::: "memory")
#endif
127
128 #define unlikely(x)     __builtin_expect(!!(x), 0)
/*
 * Type-checked minimum of two values; each argument is evaluated exactly
 * once.  The pointer comparison has no runtime effect: it only makes the
 * compiler complain when x and y have different types.
 *
 * __typeof__ is spelled with underscores so the macro also compiles in
 * strict ISO modes (e.g. -std=c11), where plain "typeof" is not a keyword.
 */
#define min(x, y) ({				\
	__typeof__(x) _min1 = (x);		\
	__typeof__(y) _min2 = (y);		\
	(void) (&_min1 == &_min2);		\
	_min1 < _min2 ? _min1 : _min2; })
134
135 extern asmlinkage int sys_perf_counter_open(
136         struct perf_counter_hw_event    *hw_event_uptr          __user,
137         pid_t                           pid,
138         int                             cpu,
139         int                             group_fd,
140         unsigned long                   flags);
141
142 #define MAX_COUNTERS                    64
143 #define MAX_NR_CPUS                     256
144
145 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
146
147 static int                      system_wide                     =  0;
148
149 static int                      nr_counters                     =  0;
150 static __u64                    event_id[MAX_COUNTERS]          = {
151         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
152         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
153         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
154         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
155
156         EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
157         EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
158         EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
159         EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
160 };
161 static int                      default_interval = 100000;
162 static int                      event_count[MAX_COUNTERS];
163 static int                      fd[MAX_NR_CPUS][MAX_COUNTERS];
164
165 static int                      tid                             = -1;
166 static int                      profile_cpu                     = -1;
167 static int                      nr_cpus                         =  0;
168 static int                      nmi                             =  1;
169 static int                      group                           =  0;
170 static unsigned int             page_size;
171
172 static int                      zero;
173
174 static int                      scale                           =  1;
175
/*
 * Table of six default counts, indexed 0..5.
 * NOTE(review): nothing in the visible part of this file reads it.
 */
static const unsigned int default_count[] = {
	[0] = 1000000,
	[1] = 1000000,
	[2] =   10000,
	[3] =   10000,
	[4] = 1000000,
	[5] =   10000,
};
184
/*
 * Display labels for hardware events; event_name() indexes this table
 * with the event id once it has checked id < PERF_HW_EVENTS_MAX.
 */
static char *hw_event_names[] = {
	[0] = "CPU cycles",
	[1] = "instructions",
	[2] = "cache references",
	[3] = "cache misses",
	[4] = "branches",
	[5] = "branch misses",
	[6] = "bus cycles",
};
194
/*
 * Display labels for software events; event_name() indexes this table
 * with the event id once it has checked id < PERF_SW_EVENTS_MAX.
 */
static char *sw_event_names[] = {
	[0] = "cpu clock ticks",
	[1] = "task clock ticks",
	[2] = "pagefaults",
	[3] = "context switches",
	[4] = "CPU migrations",
	[5] = "minor faults",
	[6] = "major faults",
};
204
205 struct event_symbol {
206         __u64 event;
207         char *symbol;
208 };
209
210 static struct event_symbol event_symbols[] = {
211         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),                "cpu-cycles",           },
212         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),                "cycles",               },
213         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),              "instructions",         },
214         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),          "cache-references",     },
215         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),              "cache-misses",         },
216         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),       "branch-instructions",  },
217         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),       "branches",             },
218         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),             "branch-misses",        },
219         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),                "bus-cycles",           },
220
221         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),                 "cpu-clock",            },
222         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),                "task-clock",           },
223         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),               "page-faults",          },
224         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),               "faults",               },
225         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),           "minor-faults",         },
226         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),           "major-faults",         },
227         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),          "context-switches",     },
228         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),          "cs",                   },
229         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),            "cpu-migrations",       },
230         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),            "migrations",           },
231 };
232
/*
 * Extract one bit-field from a packed counter config value: mask with
 * PERF_COUNTER_<name>_MASK, then shift right by PERF_COUNTER_<name>_SHIFT.
 *
 * The argument is parenthesized so that an expression such as "a | b" is
 * masked as a whole rather than only its last operand.
 */
#define __PERF_COUNTER_FIELD(config, name) \
	(((config) & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

/* Accessors for the RAW, CONFIG, TYPE and EVENT fields. */
#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
240
241 static void display_events_help(void)
242 {
243         unsigned int i;
244         __u64 e;
245
246         printf(
247         " -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
248
249         for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
250                 int type, id;
251
252                 e = event_symbols[i].event;
253                 type = PERF_COUNTER_TYPE(e);
254                 id = PERF_COUNTER_ID(e);
255
256                 printf("\n                             %d:%d: %-20s",
257                                 type, id, event_symbols[i].symbol);
258         }
259
260         printf("\n"
261         "                           rNNN: raw PMU events (eventsel+umask)\n\n");
262 }
263
264 static void display_help(void)
265 {
266         printf(
267         "Usage: perfstat [<events...>] <cmd...>\n\n"
268         "PerfStat Options (up to %d event types can be specified):\n\n",
269                  MAX_COUNTERS);
270
271         display_events_help();
272
273         printf(
274         " -l                           # scale counter values\n"
275         " -a                           # system-wide collection\n");
276         exit(0);
277 }
278
279 static char *event_name(int ctr)
280 {
281         __u64 config = event_id[ctr];
282         int type = PERF_COUNTER_TYPE(config);
283         int id = PERF_COUNTER_ID(config);
284         static char buf[32];
285
286         if (PERF_COUNTER_RAW(config)) {
287                 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
288                 return buf;
289         }
290
291         switch (type) {
292         case PERF_TYPE_HARDWARE:
293                 if (id < PERF_HW_EVENTS_MAX)
294                         return hw_event_names[id];
295                 return "unknown-hardware";
296
297         case PERF_TYPE_SOFTWARE:
298                 if (id < PERF_SW_EVENTS_MAX)
299                         return sw_event_names[id];
300                 return "unknown-software";
301
302         default:
303                 break;
304         }
305
306         return "unknown";
307 }
308
309 /*
310  * Each event can have multiple symbolic names.
311  * Symbolic names are (almost) exactly matched.
312  */
313 static __u64 match_event_symbols(char *str)
314 {
315         __u64 config, id;
316         int type;
317         unsigned int i;
318
319         if (sscanf(str, "r%llx", &config) == 1)
320                 return config | PERF_COUNTER_RAW_MASK;
321
322         if (sscanf(str, "%d:%llu", &type, &id) == 2)
323                 return EID(type, id);
324
325         for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
326                 if (!strncmp(str, event_symbols[i].symbol,
327                              strlen(event_symbols[i].symbol)))
328                         return event_symbols[i].event;
329         }
330
331         return ~0ULL;
332 }
333
334 static int parse_events(char *str)
335 {
336         __u64 config;
337
338 again:
339         if (nr_counters == MAX_COUNTERS)
340                 return -1;
341
342         config = match_event_symbols(str);
343         if (config == ~0ULL)
344                 return -1;
345
346         event_id[nr_counters] = config;
347         nr_counters++;
348
349         str = strstr(str, ",");
350         if (str) {
351                 str++;
352                 goto again;
353         }
354
355         return 0;
356 }
357
358
359 /*
360  * perfstat
361  */
362
/*
 * 1000000-byte global buffer with external linkage.
 * NOTE(review): nothing in the visible part of this file references it;
 * confirm no other translation unit uses it before removing.
 */
char fault_here[1000000];
364
365 static void create_perfstat_counter(int counter)
366 {
367         struct perf_counter_hw_event hw_event;
368
369         memset(&hw_event, 0, sizeof(hw_event));
370         hw_event.config         = event_id[counter];
371         hw_event.record_type    = 0;
372         hw_event.nmi            = 0;
373         if (scale)
374                 hw_event.read_format    = PERF_FORMAT_TOTAL_TIME_ENABLED |
375                                           PERF_FORMAT_TOTAL_TIME_RUNNING;
376
377         if (system_wide) {
378                 int cpu;
379                 for (cpu = 0; cpu < nr_cpus; cpu ++) {
380                         fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
381                         if (fd[cpu][counter] < 0) {
382                                 printf("perfstat error: syscall returned with %d (%s)\n",
383                                                 fd[cpu][counter], strerror(errno));
384                                 exit(-1);
385                         }
386                 }
387         } else {
388                 hw_event.inherit        = 1;
389                 hw_event.disabled       = 1;
390
391                 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
392                 if (fd[0][counter] < 0) {
393                         printf("perfstat error: syscall returned with %d (%s)\n",
394                                         fd[0][counter], strerror(errno));
395                         exit(-1);
396                 }
397         }
398 }
399
400 int do_perfstat(int argc, char *argv[])
401 {
402         unsigned long long t0, t1;
403         int counter;
404         ssize_t res;
405         int status;
406         int pid;
407
408         if (!system_wide)
409                 nr_cpus = 1;
410
411         for (counter = 0; counter < nr_counters; counter++)
412                 create_perfstat_counter(counter);
413
414         argc -= optind;
415         argv += optind;
416
417         if (!argc)
418                 display_help();
419
420         /*
421          * Enable counters and exec the command:
422          */
423         t0 = rdclock();
424         prctl(PR_TASK_PERF_COUNTERS_ENABLE);
425
426         if ((pid = fork()) < 0)
427                 perror("failed to fork");
428         if (!pid) {
429                 if (execvp(argv[0], argv)) {
430                         perror(argv[0]);
431                         exit(-1);
432                 }
433         }
434         while (wait(&status) >= 0)
435                 ;
436         prctl(PR_TASK_PERF_COUNTERS_DISABLE);
437         t1 = rdclock();
438
439         fflush(stdout);
440
441         fprintf(stderr, "\n");
442         fprintf(stderr, " Performance counter stats for \'%s\':\n",
443                 argv[0]);
444         fprintf(stderr, "\n");
445
446         for (counter = 0; counter < nr_counters; counter++) {
447                 int cpu, nv;
448                 __u64 count[3], single_count[3];
449                 int scaled;
450
451                 count[0] = count[1] = count[2] = 0;
452                 nv = scale ? 3 : 1;
453                 for (cpu = 0; cpu < nr_cpus; cpu ++) {
454                         res = read(fd[cpu][counter],
455                                    single_count, nv * sizeof(__u64));
456                         assert(res == nv * sizeof(__u64));
457
458                         count[0] += single_count[0];
459                         if (scale) {
460                                 count[1] += single_count[1];
461                                 count[2] += single_count[2];
462                         }
463                 }
464
465                 scaled = 0;
466                 if (scale) {
467                         if (count[2] == 0) {
468                                 fprintf(stderr, " %14s  %-20s\n",
469                                         "<not counted>", event_name(counter));
470                                 continue;
471                         }
472                         if (count[2] < count[1]) {
473                                 scaled = 1;
474                                 count[0] = (unsigned long long)
475                                         ((double)count[0] * count[1] / count[2] + 0.5);
476                         }
477                 }
478
479                 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
480                     event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
481
482                         double msecs = (double)count[0] / 1000000;
483
484                         fprintf(stderr, " %14.6f  %-20s (msecs)",
485                                 msecs, event_name(counter));
486                 } else {
487                         fprintf(stderr, " %14Ld  %-20s (events)",
488                                 count[0], event_name(counter));
489                 }
490                 if (scaled)
491                         fprintf(stderr, "  (scaled from %.2f%%)",
492                                 (double) count[2] / count[1] * 100);
493                 fprintf(stderr, "\n");
494         }
495         fprintf(stderr, "\n");
496         fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
497                         (double)(t1-t0)/1e6);
498         fprintf(stderr, "\n");
499
500         return 0;
501 }
502
503 static void process_options(int argc, char **argv)
504 {
505         int error = 0, counter;
506
507         for (;;) {
508                 int option_index = 0;
509                 /** Options for getopt */
510                 static struct option long_options[] = {
511                         {"count",       required_argument,      NULL, 'c'},
512                         {"cpu",         required_argument,      NULL, 'C'},
513                         {"delay",       required_argument,      NULL, 'd'},
514                         {"dump_symtab", no_argument,            NULL, 'D'},
515                         {"event",       required_argument,      NULL, 'e'},
516                         {"filter",      required_argument,      NULL, 'f'},
517                         {"group",       required_argument,      NULL, 'g'},
518                         {"help",        no_argument,            NULL, 'h'},
519                         {"nmi",         required_argument,      NULL, 'n'},
520                         {"munmap_info", no_argument,            NULL, 'U'},
521                         {"pid",         required_argument,      NULL, 'p'},
522                         {"realtime",    required_argument,      NULL, 'r'},
523                         {"scale",       no_argument,            NULL, 'l'},
524                         {"symbol",      required_argument,      NULL, 's'},
525                         {"stat",        no_argument,            NULL, 'S'},
526                         {"vmlinux",     required_argument,      NULL, 'x'},
527                         {"zero",        no_argument,            NULL, 'z'},
528                         {NULL,          0,                      NULL,  0 }
529                 };
530                 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
531                                     long_options, &option_index);
532                 if (c == -1)
533                         break;
534
535                 switch (c) {
536                 case 'a': system_wide                   =              1; break;
537                 case 'c': default_interval              =   atoi(optarg); break;
538                 case 'C':
539                         /* CPU and PID are mutually exclusive */
540                         if (tid != -1) {
541                                 printf("WARNING: CPU switch overriding PID\n");
542                                 sleep(1);
543                                 tid = -1;
544                         }
545                         profile_cpu                     =   atoi(optarg); break;
546
547                 case 'e': error                         = parse_events(optarg); break;
548
549                 case 'g': group                         =   atoi(optarg); break;
550                 case 'h':                                 display_help(); break;
551                 case 'l': scale                         =              1; break;
552                 case 'n': nmi                           =   atoi(optarg); break;
553                 case 'p':
554                         /* CPU and PID are mutually exclusive */
555                         if (profile_cpu != -1) {
556                                 printf("WARNING: PID switch overriding CPU\n");
557                                 sleep(1);
558                                 profile_cpu = -1;
559                         }
560                         tid                             =   atoi(optarg); break;
561                 case 'z': zero                          =              1; break;
562                 default: error = 1; break;
563                 }
564         }
565         if (error)
566                 display_help();
567
568         if (!nr_counters) {
569                 nr_counters = 8;
570         }
571
572         for (counter = 0; counter < nr_counters; counter++) {
573                 if (event_count[counter])
574                         continue;
575
576                 event_count[counter] = default_interval;
577         }
578 }
579
580 int cmd_stat(int argc, char **argv, const char *prefix)
581 {
582         page_size = sysconf(_SC_PAGE_SIZE);
583
584         process_options(argc, argv);
585
586         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
587         assert(nr_cpus <= MAX_NR_CPUS);
588         assert(nr_cpus >= 0);
589
590         return do_perfstat(argc, argv);
591 }