2 * kerneltop.c: show top kernel functions - performance counters showcase
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
10 ------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12 ------------------------------------------------------------------------------
14 weight RIP kernel function
15 ______ ________________ _______________
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
32 * perfstat: a /usr/bin/time-like performance counter statistics utility
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
43 Performance counter stats for 'ls':
45 163516953 instructions
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
53 * Improvements and fixes by:
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
61 * Released under the GPL v2. (and only v2, not any later version)
64 #include "util/util.h"
75 #include <sys/syscall.h>
76 #include <sys/ioctl.h>
78 #include <sys/prctl.h>
83 #include <linux/unistd.h>
84 #include <linux/types.h>
86 #include "../../include/linux/perf_counter.h"
90 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
91 * counters in the current task.
/*
 * prctl() option codes used by do_perfstat(): ENABLE is issued before the
 * fork, DISABLE after the wait() loop has finished.
 */
93 #define PR_TASK_PERF_COUNTERS_DISABLE 31
94 #define PR_TASK_PERF_COUNTERS_ENABLE 32
100 clock_gettime(CLOCK_MONOTONIC, &ts); \
101 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
105 * Pick up some kernel type conventions:
/*
 * Three alternative sets of the perf_counter_open syscall number, rmb() and
 * cpu_relax(). The preprocessor conditionals that pick one set are not
 * visible in this excerpt; the instruction mnemonics suggest two x86 variants
 * and a PowerPC one -- TODO confirm.
 *
 * rmb() and cpu_relax() both carry a memory clobber, so the compiler may not
 * move memory accesses across them; the third cpu_relax() emits no
 * instruction at all.
 *
 * NOTE(review): every cpu_relax() definition ends in a semicolon, so a call
 * written with its own semicolon expands to two statements and cannot sit in
 * an unbraced if/else.
 */
111 #define __NR_perf_counter_open 295
112 #define rmb() asm volatile("lfence" ::: "memory")
113 #define cpu_relax() asm volatile("rep; nop" ::: "memory");
117 #define __NR_perf_counter_open 333
118 #define rmb() asm volatile("lfence" ::: "memory")
119 #define cpu_relax() asm volatile("rep; nop" ::: "memory");
123 #define __NR_perf_counter_open 319
124 #define rmb() asm volatile ("sync" ::: "memory")
125 #define cpu_relax() asm volatile ("" ::: "memory");
/* Branch hint: tells the compiler that the condition x is expected to be false. */
128 #define unlikely(x) __builtin_expect(!!(x), 0)
/*
 * Minimum of x and y, with each argument evaluated exactly once into a
 * temporary. The pointer comparison of the two temporaries is discarded; it
 * is there so the compiler complains when x and y have different types.
 * Relies on the GNU statement-expression and typeof extensions.
 */
129 #define min(x, y) ({ \
130 typeof(x) _min1 = (x); \
131 typeof(y) _min2 = (y); \
132 (void) (&_min1 == &_min2); \
133 _min1 < _min2 ? _min1 : _min2; })
/*
 * Declaration of the perf_counter_open entry point used by
 * create_perfstat_counter(), which calls it with five arguments: a pointer to
 * the event description, three integers and a flags value. The three middle
 * parameters are not visible in this excerpt -- presumably pid, cpu and group
 * descriptor; TODO confirm against the full prototype.
 */
135 extern asmlinkage int sys_perf_counter_open(
136 struct perf_counter_hw_event *hw_event_uptr __user,
140 unsigned long flags);
/*
 * Array capacities: MAX_COUNTERS sizes event_id[], event_count[] and the
 * second dimension of fd[][]; MAX_NR_CPUS sizes the first dimension of fd[][]
 * and is the upper bound asserted on nr_cpus in cmd_stat().
 */
142 #define MAX_COUNTERS 64
143 #define MAX_NR_CPUS 256
/* Pack an event type and id into one 64-bit config: type shifted left by PERF_COUNTER_TYPE_SHIFT, OR-ed with id. */
145 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
/* Set to 1 by the -a option (described in the help text as system-wide collection). */
147 static int system_wide = 0;
/* Index of the next free slot in event_id[]; parse_events() stores event_id[nr_counters]. */
149 static int nr_counters = 0;
/*
 * Event configuration per counter slot. The initializer fills the leading
 * slots with four software events followed by four hardware events.
 */
150 static __u64 event_id[MAX_COUNTERS] = {
151 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
152 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
153 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
154 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
156 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
157 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
158 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
159 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
/* Value process_options() writes into event_count[] entries; replaced by the -c argument. */
161 static int default_interval = 100000;
/* Per-counter interval, filled in at the end of process_options(). */
162 static int event_count[MAX_COUNTERS];
/* Descriptors returned by sys_perf_counter_open(), indexed as fd[cpu][counter]. */
163 static int fd[MAX_NR_CPUS][MAX_COUNTERS];
/* CPU chosen on the command line; -1 is what process_options() treats as no CPU chosen. */
166 static int profile_cpu = -1;
/* Set in cmd_stat() from sysconf(_SC_NPROCESSORS_ONLN); bounds the per-CPU loops. */
167 static int nr_cpus = 0;
/* Set from the -g argument; no other visible line reads it. */
169 static int group = 0;
/* Set in cmd_stat() from sysconf(_SC_PAGE_SIZE). */
170 static unsigned int page_size;
/*
 * Enables count scaling in do_perfstat().
 * NOTE(review): the default is already 1 and -l also assigns 1, so the
 * option has no effect as written.
 */
174 static int scale = 1;
/* Contents not visible in this excerpt; no visible line refers to this table. */
176 static const unsigned int default_count[] = {
/* Display names of hardware events; event_name() indexes it with the event id. */
185 static char *hw_event_names[] = {
/* Display names of software events; event_name() indexes it with the event id. */
195 static char *sw_event_names[] = {
/*
 * Pairs an event config with a command-line name. The member declarations
 * are not visible in this excerpt; the code reads the members event and
 * symbol.
 */
205 struct event_symbol {
/*
 * Names accepted by the -e option. Several events have two spellings (for
 * example cpu-cycles and cycles). match_event_symbols() walks this table in
 * order and returns the event of the first matching entry.
 */
210 static struct event_symbol event_symbols[] = {
211 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
212 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
213 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
214 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
215 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
216 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
217 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
218 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
219 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
221 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
222 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
223 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
224 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
225 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
226 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
227 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
228 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
229 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
230 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
/*
 * Extract one bit-field from a counter config: mask with
 * PERF_COUNTER_<name>_MASK, then shift right by PERF_COUNTER_<name>_SHIFT.
 * NOTE(review): the config argument is not parenthesized in the expansion, so
 * callers should pass a simple expression.
 */
233 #define __PERF_COUNTER_FIELD(config, name) \
234 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
/* Accessors for the RAW, CONFIG, TYPE and EVENT fields of a config value. */
236 #define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
237 #define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
238 #define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
239 #define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
/*
 * Print the help for the -e option: a heading, one type:id: name line for
 * every event_symbols[] entry, and finally the rNNN raw-event syntax.
 */
241 static void display_events_help(void)
247 " -e EVENT --event=EVENT # symbolic-name abbreviations");
249 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
/* Split the packed config back into its type and event id for display. */
252 e = event_symbols[i].event;
253 type = PERF_COUNTER_TYPE(e);
254 id = PERF_COUNTER_ID(e);
256 printf("\n %d:%d: %-20s",
257 type, id, event_symbols[i].symbol);
261 " rNNN: raw PMU events (eventsel+umask)\n\n");
/*
 * Print the usage banner, the event list from display_events_help(), and the
 * descriptions of -l and -a. process_options() calls it for -h.
 * The argument consumed by the %d in the banner is not visible in this
 * excerpt, nor is whatever follows the last print.
 */
264 static void display_help(void)
267 "Usage: perfstat [<events...>] <cmd...>\n\n"
268 "PerfStat Options (up to %d event types can be specified):\n\n",
271 display_events_help();
274 " -l # scale counter values\n"
275 " -a # system-wide collection\n");
/*
 * Return a printable name for counter slot ctr, based on event_id[ctr].
 *
 * A config with the RAW field set is formatted as raw 0x<config> into buf;
 * the declaration of buf is not visible here -- TODO confirm its size and
 * storage duration. Otherwise the type selects hw_event_names[] or
 * sw_event_names[], and an id at or beyond the corresponding *_EVENTS_MAX
 * limit yields unknown-hardware or unknown-software.
 */
279 static char *event_name(int ctr)
281 __u64 config = event_id[ctr];
282 int type = PERF_COUNTER_TYPE(config);
283 int id = PERF_COUNTER_ID(config);
286 if (PERF_COUNTER_RAW(config)) {
287 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
292 case PERF_TYPE_HARDWARE:
293 if (id < PERF_HW_EVENTS_MAX)
294 return hw_event_names[id];
295 return "unknown-hardware";
297 case PERF_TYPE_SOFTWARE:
298 if (id < PERF_SW_EVENTS_MAX)
299 return sw_event_names[id];
300 return "unknown-software";
310 * Each event can have multiple symbolic names.
311 * Symbolic names are (almost) exactly matched.
/*
 * Translate one event specifier into a config value. Forms tried, in order:
 *   rNNN    - hexadecimal raw event, returned with PERF_COUNTER_RAW_MASK set
 *   type:id - two decimal numbers, packed with EID()
 *   name    - compared against each event_symbols[] entry over the length of
 *             the symbol only, so any text after the name is ignored
 * What is returned when nothing matches is not visible in this excerpt.
 */
313 static __u64 match_event_symbols(char *str)
319 if (sscanf(str, "r%llx", &config) == 1)
320 return config | PERF_COUNTER_RAW_MASK;
322 if (sscanf(str, "%d:%llu", &type, &id) == 2)
323 return EID(type, id);
325 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
326 if (!strncmp(str, event_symbols[i].symbol,
327 strlen(event_symbols[i].symbol)))
328 return event_symbols[i].event;
/*
 * Handle the argument of the -e option. Visible steps: test whether
 * MAX_COUNTERS events are already registered, resolve str through
 * match_event_symbols(), store the config in event_id[nr_counters], then
 * search str for the next comma.
 * NOTE(review): the loop structure, the increment of nr_counters and the
 * return values are not visible here; process_options() assigns the result
 * to its error flag.
 */
334 static int parse_events(char *str)
339 if (nr_counters == MAX_COUNTERS)
342 config = match_event_symbols(str);
346 event_id[nr_counters] = config;
349 str = strstr(str, ",");
/* Global buffer of 1000000 bytes with external linkage; no visible line refers to it. */
363 char fault_here[1000000];
/*
 * Open the counter(s) for slot counter and keep the descriptors in fd[][].
 *
 * The event description is zeroed and configured from event_id[counter].
 * Two open paths are visible: a loop over all CPUs that passes -1 and the cpu
 * index as second and third argument, and a single open that passes 0 and -1,
 * stores into fd[0][counter] and sets the inherit and disabled flags first.
 * The condition choosing between the paths is not visible -- presumably
 * system_wide; TODO confirm. A negative result is reported with printf and
 * strerror(errno); what happens afterwards is not visible.
 */
365 static void create_perfstat_counter(int counter)
367 struct perf_counter_hw_event hw_event;
369 memset(&hw_event, 0, sizeof(hw_event));
370 hw_event.config = event_id[counter];
371 hw_event.record_type = 0;
/* Ask for two time values next to the count; presumably the extra values do_perfstat() reads for scaling. */
374 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
375 PERF_FORMAT_TOTAL_TIME_RUNNING;
379 for (cpu = 0; cpu < nr_cpus; cpu ++) {
380 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
381 if (fd[cpu][counter] < 0) {
382 printf("perfstat error: syscall returned with %d (%s)\n",
383 fd[cpu][counter], strerror(errno));
388 hw_event.inherit = 1;
389 hw_event.disabled = 1;
391 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
392 if (fd[0][counter] < 0) {
393 printf("perfstat error: syscall returned with %d (%s)\n",
394 fd[0][counter], strerror(errno));
/*
 * Run the command with counters attached and print the totals to stderr.
 *
 * Visible sequence: create the counters for every event slot, enable them
 * through prctl(), fork and execvp(argv[0], argv), wait until no child is
 * left, disable the counters, then for every event add up the per-CPU
 * readings and print either milliseconds (cpu-clock and task-clock) or an
 * event count. The wall-clock time derived from t0 and t1 is printed last;
 * the lines that take those two timestamps are not visible in this excerpt.
 */
400 int do_perfstat(int argc, char *argv[])
402 unsigned long long t0, t1;
411 for (counter = 0; counter < nr_counters; counter++)
412 create_perfstat_counter(counter);
421 * Enable counters and exec the command:
424 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
/* NOTE(review): a failed fork is only reported with perror() in the visible lines; confirm that continuing is intended. */
426 if ((pid = fork()) < 0)
427 perror("failed to fork");
429 if (execvp(argv[0], argv)) {
434 while (wait(&status) >= 0)
436 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
441 fprintf(stderr, "\n");
442 fprintf(stderr, " Performance counter stats for \'%s\':\n",
444 fprintf(stderr, "\n");
446 for (counter = 0; counter < nr_counters; counter++) {
448 __u64 count[3], single_count[3];
451 count[0] = count[1] = count[2] = 0;
453 for (cpu = 0; cpu < nr_cpus; cpu ++) {
/* Each descriptor must deliver exactly nv 64-bit values; a short read aborts through the assert. */
454 res = read(fd[cpu][counter],
455 single_count, nv * sizeof(__u64));
456 assert(res == nv * sizeof(__u64));
458 count[0] += single_count[0];
460 count[1] += single_count[1];
461 count[2] += single_count[2];
468 fprintf(stderr, " %14s %-20s\n",
469 "<not counted>", event_name(counter));
/*
 * count[2] below count[1] (presumably running time below enabled time):
 * extrapolate the count by count[1] / count[2], rounded to nearest.
 */
472 if (count[2] < count[1]) {
474 count[0] = (unsigned long long)
475 ((double)count[0] * count[1] / count[2] + 0.5);
479 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
480 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
/* Clock events are divided by 10^6 and labelled msecs. */
482 double msecs = (double)count[0] / 1000000;
484 fprintf(stderr, " %14.6f %-20s (msecs)",
485 msecs, event_name(counter));
/*
 * NOTE(review): the L length modifier is not defined for integer conversions
 * in ISO C, and count[0] is an unsigned __u64 printed with a signed
 * conversion; an unsigned long long conversion would match the type.
 */
487 fprintf(stderr, " %14Ld %-20s (events)",
488 count[0], event_name(counter));
491 fprintf(stderr, " (scaled from %.2f%%)",
492 (double) count[2] / count[1] * 100);
493 fprintf(stderr, "\n");
495 fprintf(stderr, "\n");
/* t1 - t0 is divided by 10^6 and labelled msecs. */
496 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
497 (double)(t1-t0)/1e6);
498 fprintf(stderr, "\n");
/*
 * Parse the command line with getopt_long() into the file-scope settings,
 * then assign default_interval to event_count[] entries.
 *
 * NOTE(review): the long-option table and the option string accept more
 * switches (for example filter, symbol, vmlinux) than there are visible case
 * labels; such switches would reach the default label and set the error flag.
 * Numeric arguments go through atoi(), which cannot report malformed input.
 */
503 static void process_options(int argc, char **argv)
505 int error = 0, counter;
508 int option_index = 0;
509 /** Options for getopt */
510 static struct option long_options[] = {
511 {"count", required_argument, NULL, 'c'},
512 {"cpu", required_argument, NULL, 'C'},
513 {"delay", required_argument, NULL, 'd'},
514 {"dump_symtab", no_argument, NULL, 'D'},
515 {"event", required_argument, NULL, 'e'},
516 {"filter", required_argument, NULL, 'f'},
517 {"group", required_argument, NULL, 'g'},
518 {"help", no_argument, NULL, 'h'},
519 {"nmi", required_argument, NULL, 'n'},
520 {"munmap_info", no_argument, NULL, 'U'},
521 {"pid", required_argument, NULL, 'p'},
522 {"realtime", required_argument, NULL, 'r'},
523 {"scale", no_argument, NULL, 'l'},
524 {"symbol", required_argument, NULL, 's'},
525 {"stat", no_argument, NULL, 'S'},
526 {"vmlinux", required_argument, NULL, 'x'},
527 {"zero", no_argument, NULL, 'z'},
/*
 * The leading plus sign in the option string makes getopt_long() stop at the
 * first non-option argument, which leaves the switches of the command to be
 * run untouched; the colon after it selects a colon as the return value for a
 * missing option argument.
 */
530 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
531 long_options, &option_index);
536 case 'a': system_wide = 1; break;
537 case 'c': default_interval = atoi(optarg); break;
539 /* CPU and PID are mutually exclusive */
541 printf("WARNING: CPU switch overriding PID\n");
545 profile_cpu = atoi(optarg); break;
547 case 'e': error = parse_events(optarg); break;
549 case 'g': group = atoi(optarg); break;
550 case 'h': display_help(); break;
/* scale already starts out as 1, so this assignment changes nothing. */
551 case 'l': scale = 1; break;
552 case 'n': nmi = atoi(optarg); break;
554 /* CPU and PID are mutually exclusive */
555 if (profile_cpu != -1) {
556 printf("WARNING: PID switch overriding CPU\n");
560 tid = atoi(optarg); break;
561 case 'z': zero = 1; break;
562 default: error = 1; break;
/*
 * Apply default_interval to the counters; the statement guarded by the
 * non-zero test is not visible -- presumably it skips entries that are
 * already set; TODO confirm.
 */
572 for (counter = 0; counter < nr_counters; counter++) {
573 if (event_count[counter])
576 event_count[counter] = default_interval;
/*
 * Entry point of the stat command: record the page size, parse the options,
 * query the number of online CPUs and return the result of do_perfstat().
 * The prefix parameter is not used in the visible lines.
 *
 * NOTE(review): the CPU count is validated only with assert(), which
 * disappears when NDEBUG is defined, and the lower bound still admits 0.
 */
580 int cmd_stat(int argc, char **argv, const char *prefix)
582 page_size = sysconf(_SC_PAGE_SIZE);
584 process_options(argc, argv);
586 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
587 assert(nr_cpus <= MAX_NR_CPUS);
588 assert(nr_cpus >= 0);
590 return do_perfstat(argc, argv);