2 * kerneltop.c: show top kernel functions - performance counters showcase
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
10 ------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12 ------------------------------------------------------------------------------
14 weight RIP kernel function
15 ______ ________________ _______________
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
43 Performance counter stats for 'ls':
45 163516953 instructions
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
53 * Improvements and fixes by:
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
61 * Released under the GPL v2. (and only v2, not any later version)
65 #include "util/util.h"
76 #include <sys/syscall.h>
77 #include <sys/ioctl.h>
79 #include <sys/prctl.h>
84 #include <linux/unistd.h>
85 #include <linux/types.h>
/*
 * File-scope configuration and state for the stat tool.
 * NOTE(review): this listing carries the original line numbers and omits
 * some lines (for example the closing brace of the event_id initializer and
 * the bodies of the tables at the end), so the comments here describe only
 * what is visible.
 */
/* Bits kept in event_mask[]; set from the k and u letters of an event spec. */
87 #define EVENT_MASK_KERNEL 1
88 #define EVENT_MASK_USER 2
/* Set to 1 by the -a option (described in the help text as system-wide collection). */
90 static int system_wide = 0;
/*
 * Index of the event_id[] slot that parse_events fills next; also the loop
 * bound used when counters are created and read back.
 */
92 static int nr_counters = 0;
/* Event configs; preloaded with four software and four hardware events. */
93 static __u64 event_id[MAX_COUNTERS] = {
94 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
95 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
96 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
97 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
99 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
100 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
101 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
102 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
/* Overwritten by the -c option; copied into event_count[] by process_options. */
104 static int default_interval = 100000;
105 static int event_count[MAX_COUNTERS];
/* Counter file descriptors, indexed [cpu][counter]. */
106 static int fd[MAX_NR_CPUS][MAX_COUNTERS];
/* Per-event EVENT_MASK_* bits, filled in by match_event_symbols. */
107 static int event_mask[MAX_COUNTERS];
/* -1 means no CPU was selected; process_options compares against -1. */
110 static int profile_cpu = -1;
/* Filled from sysconf(_SC_NPROCESSORS_ONLN) in cmd_stat. */
111 static int nr_cpus = 0;
/* Set from the -g option argument. */
113 static int group = 0;
/* Filled from sysconf(_SC_PAGE_SIZE) in cmd_stat. */
114 static unsigned int page_size;
/*
 * NOTE(review): the default is already 1 and the -l handler also assigns 1,
 * so as far as this excerpt shows the option does not change anything.
 */
118 static int scale = 1;
/* NOTE(review): the contents of the three tables below are not visible here. */
120 static const unsigned int default_count[] = {
/* Indexed by hardware event id in event_name. */
129 static char *hw_event_names[] = {
/* Indexed by software event id in event_name. */
139 static char *sw_event_names[] = {
/*
 * One entry of the symbolic event table.
 * NOTE(review): the field declarations are not visible in this excerpt;
 * other code reads the members named event and symbol.
 */
149 struct event_symbol {
/*
 * Maps event configs to the names accepted by -e. Several names alias the
 * same config (for example cpu-cycles and cycles, or context-switches and cs).
 * match_event_symbols scans this table in order and returns the first hit.
 */
154 static struct event_symbol event_symbols[] = {
155 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
156 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
157 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
158 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
159 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
160 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
161 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
162 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
163 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
165 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
166 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
167 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
168 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
169 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
170 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
171 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
172 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
173 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
174 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
/*
 * Extract one named bit field from an event config: AND with the
 * PERF_COUNTER_<name>_MASK constant, then shift right by
 * PERF_COUNTER_<name>_SHIFT. The mask and shift constants are defined
 * outside this excerpt.
 * NOTE(review): the config argument is not parenthesized in the expansion,
 * so callers should pass a simple expression.
 */
177 #define __PERF_COUNTER_FIELD(config, name) \
178 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
/* Convenience accessors for the RAW, CONFIG, TYPE and EVENT fields. */
180 #define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
181 #define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
182 #define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
183 #define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
/*
 * Print the list of events accepted by -e: for every entry of
 * event_symbols, its numeric type:id pair followed by the symbolic name,
 * and finally a line describing the rNNN raw event syntax.
 * NOTE(review): local declarations and the surrounding print calls are
 * not visible in this excerpt.
 */
185 static void display_events_help(void)
191 " -e EVENT --event=EVENT # symbolic-name abbreviations");
193 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
/* Split the stored config back into its type and event id for display. */
196 e = event_symbols[i].event;
197 type = PERF_COUNTER_TYPE(e);
198 id = PERF_COUNTER_ID(e);
200 printf("\n %d:%d: %-20s",
201 type, id, event_symbols[i].symbol);
205 " rNNN: raw PMU events (eventsel+umask)\n\n");
/*
 * Print the usage text: the command synopsis, the event list (through
 * display_events_help) and the -l and -a switches.
 * NOTE(review): the argument supplied for the %d in the header line and
 * anything done after printing are not visible in this excerpt.
 */
208 static void display_help(void)
211 "Usage: perfstat [<events...>] <cmd...>\n\n"
212 "PerfStat Options (up to %d event types can be specified):\n\n",
215 display_events_help();
218 " -l # scale counter values\n"
219 " -a # system-wide collection\n");
/*
 * Return a printable name for the event stored in event_id[ctr].
 *
 * Raw events are formatted as raw 0x<config> into buf; hardware and
 * software events are looked up in hw_event_names / sw_event_names when the
 * id is below the corresponding *_EVENTS_MAX bound, otherwise a fixed
 * unknown-hardware / unknown-software string is returned.
 * NOTE(review): the declaration of buf, the switch header and the final
 * fallback return are not visible in this excerpt. sprintf is unbounded, so
 * buf must be large enough for the prefix plus 16 hex digits - confirm.
 */
223 static char *event_name(int ctr)
225 __u64 config = event_id[ctr];
226 int type = PERF_COUNTER_TYPE(config);
227 int id = PERF_COUNTER_ID(config);
230 if (PERF_COUNTER_RAW(config)) {
231 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
236 case PERF_TYPE_HARDWARE:
/* Bounds check before indexing the name table. */
237 if (id < PERF_HW_EVENTS_MAX)
238 return hw_event_names[id];
239 return "unknown-hardware";
241 case PERF_TYPE_SOFTWARE:
242 if (id < PERF_SW_EVENTS_MAX)
243 return sw_event_names[id];
244 return "unknown-software";
254 * Each event can have multiple symbolic names.
255 * Symbolic names are (almost) exactly matched.
/*
 * Translate one event specification string into an event config.
 *
 * Three forms are tried in order:
 *   1. r<hex>          - raw event; the parsed value is returned with
 *                        PERF_COUNTER_RAW_MASK set.
 *   2. type:id[:mask]  - numeric form; a mask string containing u and/or k
 *                        sets the matching EVENT_MASK_* bit for the event
 *                        currently being parsed (index nr_counters).
 *   3. symbolic name   - first entry of event_symbols whose symbol is a
 *                        prefix of str.
 * NOTE(review): the case labels of the switch and the value returned when
 * nothing matches are not visible in this excerpt.
 */
257 static __u64 match_event_symbols(char *str)
264 if (sscanf(str, "r%llx", &config) == 1)
265 return config | PERF_COUNTER_RAW_MASK;
/* The switch is on the number of fields sscanf managed to convert. */
267 switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
269 if (strchr(mask_str, 'u'))
270 event_mask[nr_counters] |= EVENT_MASK_USER;
271 if (strchr(mask_str, 'k'))
272 event_mask[nr_counters] |= EVENT_MASK_KERNEL;
274 return EID(type, id);
/*
 * Only strlen(symbol) characters are compared, so any str that merely
 * begins with a symbol matches it (for instance anything starting with cs).
 * This lets a name be followed by further text such as a comma-separated
 * next event.
 */
280 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
281 if (!strncmp(str, event_symbols[i].symbol,
282 strlen(event_symbols[i].symbol)))
283 return event_symbols[i].event;
/*
 * Parse the argument of one -e option and record its event config(s) in
 * event_id[], starting at slot nr_counters.
 * NOTE(review): most of the body is missing from this excerpt. What is
 * visible: a check for the table being full (nr_counters == MAX_COUNTERS),
 * a call to match_event_symbols, the store into event_id[nr_counters], and
 * a strstr search for a comma, presumably to step to the next event in a
 * comma-separated list - confirm against the full source. The return value
 * is used by process_options as an error flag.
 */
289 static int parse_events(char *str)
294 if (nr_counters == MAX_COUNTERS)
297 config = match_event_symbols(str);
301 event_id[nr_counters] = config;
304 str = strstr(str, ",");
/*
 * NOTE(review): no use of this 1000000-byte buffer is visible in this
 * excerpt. It is declared without static, so it has external linkage;
 * confirm whether it is still needed.
 */
318 char fault_here[1000000];
/*
 * Open the kernel counter(s) for event slot `counter` and store the file
 * descriptors in fd[][counter].
 *
 * Two open paths are visible: a loop that opens one counter per CPU
 * (pid -1, cpu = loop index) and a single open into fd[0][counter] with
 * pid 0, cpu -1, inherit and disabled set. Each failure prints the
 * returned value and strerror(errno).
 * NOTE(review): the condition choosing between the two paths, the handling
 * after a failed open, and the condition guarding read_format are not
 * visible in this excerpt.
 */
320 static void create_perfstat_counter(int counter)
322 struct perf_counter_hw_event hw_event;
324 memset(&hw_event, 0, sizeof(hw_event));
325 hw_event.config = event_id[counter];
326 hw_event.record_type = 0;
/*
 * The k letter of an event spec sets EVENT_MASK_KERNEL and ends up in
 * exclude_kernel; the u letter sets EVENT_MASK_USER and ends up in
 * exclude_user. In other words the letters name what is excluded.
 * NOTE(review): EVENT_MASK_USER is 2, so the second expression yields 0 or
 * 2. If exclude_user is a one-bit field, 2 would truncate to 0 and the
 * exclusion would be lost - confirm against the struct definition.
 */
328 hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
329 hw_event.exclude_user = event_mask[counter] & EVENT_MASK_USER;
/* Ask read() to also return the enabled and running times. */
332 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
333 PERF_FORMAT_TOTAL_TIME_RUNNING;
/* Per-CPU path: one counter on every CPU, not bound to a task. */
337 for (cpu = 0; cpu < nr_cpus; cpu ++) {
338 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
339 if (fd[cpu][counter] < 0) {
340 printf("perfstat error: syscall returned with %d (%s)\n",
341 fd[cpu][counter], strerror(errno));
/*
 * Per-task path: counter on the calling task (pid 0), any CPU, created
 * disabled and inherited by children.
 */
346 hw_event.inherit = 1;
347 hw_event.disabled = 1;
349 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
350 if (fd[0][counter] < 0) {
351 printf("perfstat error: syscall returned with %d (%s)\n",
352 fd[0][counter], strerror(errno));
/*
 * Run the command in argv under the configured counters and print the
 * results to stderr.
 *
 * Visible steps: create every counter, enable counters with prctl, fork
 * and execvp the command, wait for children, disable counters, then for
 * each counter sum the values read from fd[cpu][counter] over all CPUs and
 * print one line, followed by the elapsed wall-clock time.
 * NOTE(review): local declarations (pid, status, cpu, res, nv), the code
 * that sets t0 and t1, the conditions around several branches and the
 * return statement are not visible in this excerpt.
 */
358 int do_perfstat(int argc, char *argv[])
360 unsigned long long t0, t1;
369 for (counter = 0; counter < nr_counters; counter++)
370 create_perfstat_counter(counter);
379 * Enable counters and exec the command:
382 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
/* NOTE(review): only perror is visible on fork failure; no exit is shown. */
384 if ((pid = fork()) < 0)
385 perror("failed to fork");
387 if (execvp(argv[0], argv)) {
/* Reap children until wait reports that none are left. */
392 while (wait(&status) >= 0)
394 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
399 fprintf(stderr, "\n");
400 fprintf(stderr, " Performance counter stats for \'%s\':\n",
402 fprintf(stderr, "\n");
404 for (counter = 0; counter < nr_counters; counter++) {
/*
 * count[] accumulates over CPUs. Slot 0 is the event count; slots 1 and 2
 * are presumably the enabled and running times requested via read_format
 * in create_perfstat_counter - confirm the order.
 */
406 __u64 count[3], single_count[3];
409 count[0] = count[1] = count[2] = 0;
411 for (cpu = 0; cpu < nr_cpus; cpu ++) {
/* A short read is treated as a fatal programming error. */
412 res = read(fd[cpu][counter],
413 single_count, nv * sizeof(__u64));
414 assert(res == nv * sizeof(__u64));
416 count[0] += single_count[0];
418 count[1] += single_count[1];
419 count[2] += single_count[2];
426 fprintf(stderr, " %14s %-20s\n",
427 "<not counted>", event_name(counter));
/*
 * When slot 2 is smaller than slot 1, scale the count up by the ratio
 * count[1] / count[2], rounding to nearest via the + 0.5.
 */
430 if (count[2] < count[1]) {
432 count[0] = (unsigned long long)
433 ((double)count[0] * count[1] / count[2] + 0.5);
/* The two clock events are divided by 1000000 and printed as msecs. */
437 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
438 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
440 double msecs = (double)count[0] / 1000000;
442 fprintf(stderr, " %14.6f %-20s (msecs)",
443 msecs, event_name(counter));
/* NOTE(review): %Ld is a non-standard spelling; %llu would match the type. */
445 fprintf(stderr, " %14Ld %-20s (events)",
446 count[0], event_name(counter));
449 fprintf(stderr, " (scaled from %.2f%%)",
450 (double) count[2] / count[1] * 100);
451 fprintf(stderr, "\n");
453 fprintf(stderr, "\n");
/* t1 - t0 is divided by 1e6 and labelled msecs. */
454 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
455 (double)(t1-t0)/1e6);
456 fprintf(stderr, "\n");
/*
 * Parse the command-line switches with getopt_long and store the results
 * in the file-scope settings. After parsing, every event_count[] entry
 * that the visible check does not skip is assigned default_interval.
 * NOTE(review): the option loop header, several case labels, the handling
 * of the error flag and the declarations of nmi, tid and zero are not
 * visible in this excerpt. The long option table and the short option
 * string list more switches than the visible cases handle.
 */
461 static void process_options(int argc, char **argv)
463 int error = 0, counter;
466 int option_index = 0;
467 /** Options for getopt */
468 static struct option long_options[] = {
469 {"count", required_argument, NULL, 'c'},
470 {"cpu", required_argument, NULL, 'C'},
471 {"delay", required_argument, NULL, 'd'},
472 {"dump_symtab", no_argument, NULL, 'D'},
473 {"event", required_argument, NULL, 'e'},
474 {"filter", required_argument, NULL, 'f'},
475 {"group", required_argument, NULL, 'g'},
476 {"help", no_argument, NULL, 'h'},
477 {"nmi", required_argument, NULL, 'n'},
478 {"munmap_info", no_argument, NULL, 'U'},
479 {"pid", required_argument, NULL, 'p'},
480 {"realtime", required_argument, NULL, 'r'},
481 {"scale", no_argument, NULL, 'l'},
482 {"symbol", required_argument, NULL, 's'},
483 {"stat", no_argument, NULL, 'S'},
484 {"vmlinux", required_argument, NULL, 'x'},
485 {"zero", no_argument, NULL, 'z'},
/*
 * In the short option string, the leading + stops parsing at the first
 * non-option argument (so the switches of the profiled command are left
 * alone) and the leading : makes getopt return : for a missing argument.
 */
488 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
489 long_options, &option_index);
494 case 'a': system_wide = 1; break;
/* NOTE(review): atoi gives no error indication; bad input becomes 0. */
495 case 'c': default_interval = atoi(optarg); break;
497 /* CPU and PID are mutually exclusive */
499 printf("WARNING: CPU switch overriding PID\n");
503 profile_cpu = atoi(optarg); break;
505 case 'e': error = parse_events(optarg); break;
507 case 'g': group = atoi(optarg); break;
508 case 'h': display_help(); break;
/* scale already defaults to 1, so this assignment changes nothing. */
509 case 'l': scale = 1; break;
510 case 'n': nmi = atoi(optarg); break;
512 /* CPU and PID are mutually exclusive */
513 if (profile_cpu != -1) {
514 printf("WARNING: PID switch overriding CPU\n");
518 tid = atoi(optarg); break;
519 case 'z': zero = 1; break;
520 default: error = 1; break;
/*
 * NOTE(review): the statement controlled by the if below is not visible;
 * presumably entries that are already non-zero are skipped - confirm.
 */
530 for (counter = 0; counter < nr_counters; counter++) {
531 if (event_count[counter])
534 event_count[counter] = default_interval;
/*
 * Signal handler installed by cmd_stat for SIGINT, SIGALRM and SIGABRT.
 * NOTE(review): the body is not visible in this excerpt.
 */
538 static void skip_signal(int signo)
/*
 * Entry point of the stat command: record the page size, parse the
 * options, determine the number of online CPUs, install skip_signal for
 * SIGINT, SIGALRM and SIGABRT, then hand over to do_perfstat and return
 * its result.
 * NOTE(review): prefix is not used in the visible lines. argc and argv are
 * passed to do_perfstat as shown; any adjustment past the parsed options
 * would be in lines missing from this excerpt.
 */
542 int cmd_stat(int argc, char **argv, const char *prefix)
546 page_size = sysconf(_SC_PAGE_SIZE);
548 process_options(argc, argv);
/*
 * NOTE(review): sysconf returns -1 on error; the second assert rejects
 * that, but it accepts 0, which would make every per-CPU loop a no-op.
 * Both checks also disappear when NDEBUG is defined.
 */
550 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
551 assert(nr_cpus <= MAX_NR_CPUS);
552 assert(nr_cpus >= 0);
555 * We dont want to block the signals - that would cause
556 * child tasks to inherit that and Ctrl-C would not work.
557 * What we want is for Ctrl-C to work in the exec()-ed
558 * task, but being ignored by perf stat itself:
560 signal(SIGINT, skip_signal);
561 signal(SIGALRM, skip_signal);
562 signal(SIGABRT, skip_signal);
564 return do_perfstat(argc, argv);