2 * kerneltop.c: show top kernel functions - performance counters showcase
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
10 ------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12 ------------------------------------------------------------------------------
14 weight RIP kernel function
15 ______ ________________ _______________
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
43 Performance counter stats for 'ls':
45 163516953 instructions
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
53 * Improvements and fixes by:
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
61 * Released under the GPL v2. (and only v2, not any later version)
65 #include <sys/types.h>
81 #include <sys/syscall.h>
82 #include <sys/ioctl.h>
84 #include <sys/prctl.h>
89 #include <linux/unistd.h>
90 #include <linux/types.h>
92 #include "../../include/linux/perf_counter.h"
96 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
97 * counters in the current task.
99 #define PR_TASK_PERF_COUNTERS_DISABLE 31
100 #define PR_TASK_PERF_COUNTERS_ENABLE 32
102 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
106 struct timespec ts; \
108 clock_gettime(CLOCK_MONOTONIC, &ts); \
109 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
113 * Pick up some kernel type conventions:
119 #define __NR_perf_counter_open 295
120 #define rmb() asm volatile("lfence" ::: "memory")
121 #define cpu_relax() asm volatile("rep; nop" ::: "memory");
125 #define __NR_perf_counter_open 333
126 #define rmb() asm volatile("lfence" ::: "memory")
127 #define cpu_relax() asm volatile("rep; nop" ::: "memory");
131 #define __NR_perf_counter_open 319
132 #define rmb() asm volatile ("sync" ::: "memory")
133 #define cpu_relax() asm volatile ("" ::: "memory");
136 #define unlikely(x) __builtin_expect(!!(x), 0)
138 asmlinkage int sys_perf_counter_open(
139 struct perf_counter_hw_event *hw_event_uptr __user,
146 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
149 #define MAX_COUNTERS 64
150 #define MAX_NR_CPUS 256
152 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
154 static int run_perfstat = 0;
155 static int system_wide = 0;
157 static int nr_counters = 0;
158 static __u64 event_id[MAX_COUNTERS] = {
159 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
160 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
161 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
162 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
164 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
165 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
166 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
167 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
169 static int default_interval = 100000;
170 static int event_count[MAX_COUNTERS];
171 static int fd[MAX_NR_CPUS][MAX_COUNTERS];
173 static __u64 count_filter = 100;
176 static int profile_cpu = -1;
177 static int nr_cpus = 0;
179 static int group = 0;
180 static unsigned int page_size;
182 static char *vmlinux;
184 static char *sym_filter;
185 static unsigned long filter_start;
186 static unsigned long filter_end;
188 static int delay_secs = 2;
190 static int dump_symtab;
196 struct source_line *next;
199 static struct source_line *lines;
200 static struct source_line **lines_tail;
202 const unsigned int default_count[] = {
211 static char *hw_event_names[] = {
221 static char *sw_event_names[] = {
231 struct event_symbol {
236 static struct event_symbol event_symbols[] = {
237 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
238 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
239 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
240 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
241 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
242 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
243 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
244 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
245 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
247 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
248 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
249 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
250 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
251 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
252 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
253 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
254 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
255 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
256 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
259 #define __PERF_COUNTER_FIELD(config, name) \
260 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
262 #define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
263 #define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
264 #define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
265 #define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
/*
 * Print the recognized symbolic event names, one "type:id: name" row
 * per event_symbols[] entry, plus the raw "rNNN" syntax hint.
 * NOTE(review): this listing is sparse - the printf prologue and the
 * local declarations are not visible in this excerpt.
 */
267 static void display_events_help(void)
273 " -e EVENT --event=EVENT # symbolic-name abbreviations");
275 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
278 e = event_symbols[i].event;
279 type = PERF_COUNTER_TYPE(e);
280 id = PERF_COUNTER_ID(e);
282 printf("\n %d:%d: %-20s",
283 type, id, event_symbols[i].symbol);
287 " rNNN: raw PMU events (eventsel+umask)\n\n");
/*
 * Usage text for the perfstat personality; shares the event list with
 * kerneltop via display_events_help().
 */
290 static void display_perfstat_help(void)
293 "Usage: perfstat [<events...>] <cmd...>\n\n"
294 "PerfStat Options (up to %d event types can be specified):\n\n",
297 display_events_help();
300 " -a # system-wide collection\n");
/*
 * Usage text for the kerneltop personality; delegates to
 * display_perfstat_help() when running in perfstat mode.
 */
304 static void display_help(void)
307 return display_perfstat_help();
310 "Usage: kerneltop [<options>]\n"
311 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
312 "KernelTop Options (up to %d event types can be specified at once):\n\n",
315 display_events_help();
318 " -S --stat # perfstat COMMAND\n"
319 " -a # system-wide collection (for perfstat)\n\n"
320 " -c CNT --count=CNT # event period to sample\n\n"
321 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
322 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
323 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
324 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
325 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
326 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
327 " -z --zero # zero counts after display\n"
328 " -D --dump_symtab # dump symbol table to stderr on startup\n"
/*
 * Human-readable name of counter 'ctr': raw events are formatted as
 * "raw 0x..." into a buffer, known hardware/software ids map through
 * the hw_event_names[]/sw_event_names[] tables, everything else is
 * reported as unknown.
 */
334 static char *event_name(int ctr)
336 __u64 config = event_id[ctr];
337 int type = PERF_COUNTER_TYPE(config);
338 int id = PERF_COUNTER_ID(config);
341 if (PERF_COUNTER_RAW(config)) {
/* 'buf' is a static buffer declared outside this excerpt - TODO confirm */
342 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
/* dispatch on the counter type field (switch header not visible here): */
347 case PERF_TYPE_HARDWARE:
348 if (id < PERF_HW_EVENTS_MAX)
349 return hw_event_names[id];
350 return "unknown-hardware";
352 case PERF_TYPE_SOFTWARE:
353 if (id < PERF_SW_EVENTS_MAX)
354 return sw_event_names[id];
355 return "unknown-software";
365 * Each event can have multiple symbolic names.
366 * Symbolic names are (almost) exactly matched.
/*
 * Map an event string to an event-id config value:
 *   "rNNN"    -> raw hex config with the RAW bit set
 *   "type:id" -> explicit numeric type/id pair
 *   otherwise -> match against the event_symbols[] names
 * NOTE(review): strncmp() with the symbol's own length is a prefix
 * match, so longer strings that merely start with a known name also
 * match - presumably intended for comma-separated lists; confirm.
 */
368 static __u64 match_event_symbols(char *str)
374 if (sscanf(str, "r%llx", &config) == 1)
375 return config | PERF_COUNTER_RAW_MASK;
377 if (sscanf(str, "%d:%llu", &type, &id) == 2)
378 return EID(type, id);
380 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
381 if (!strncmp(str, event_symbols[i].symbol,
382 strlen(event_symbols[i].symbol)))
383 return event_symbols[i].event;
/*
 * Parse a (possibly comma-separated) -e argument, appending each
 * resolved event id to event_id[] up to MAX_COUNTERS entries.
 */
389 static int parse_events(char *str)
394 if (nr_counters == MAX_COUNTERS)
397 config = match_event_symbols(str);
401 event_id[nr_counters] = config;
/* advance past the next comma to parse the following event, if any: */
404 str = strstr(str, ",");
418 char fault_here[1000000];
/*
 * Open one perfstat counter: with -a (system_wide) a per-CPU counter
 * on every online CPU; otherwise a single per-task counter that is
 * inherited by children and starts disabled (it is enabled via prctl()
 * just before exec in do_perfstat()).
 */
420 static void create_perfstat_counter(int counter)
422 struct perf_counter_hw_event hw_event;
424 memset(&hw_event, 0, sizeof(hw_event));
425 hw_event.config = event_id[counter];
426 hw_event.record_type = PERF_RECORD_SIMPLE;
/* system-wide branch: one fd per CPU, pid == -1: */
431 for (cpu = 0; cpu < nr_cpus; cpu ++) {
432 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
433 if (fd[cpu][counter] < 0) {
434 printf("perfstat error: syscall returned with %d (%s)\n",
435 fd[cpu][counter], strerror(errno));
/* per-task branch: inherited, created disabled, pid == 0 (self): */
440 hw_event.inherit = 1;
441 hw_event.disabled = 1;
443 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
444 if (fd[0][counter] < 0) {
445 printf("perfstat error: syscall returned with %d (%s)\n",
446 fd[0][counter], strerror(errno));
/*
 * perfstat mode: open all counters, fork+exec the target command with
 * counters enabled, wait for it, then read each counter fd and print
 * the totals.  Clock-type events (cpu-clock/task-clock) count
 * nanoseconds and are reported as msecs; all others as event counts.
 */
452 int do_perfstat(int argc, char *argv[])
454 unsigned long long t0, t1;
463 for (counter = 0; counter < nr_counters; counter++)
464 create_perfstat_counter(counter);
473 * Enable counters and exec the command:
476 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
478 if ((pid = fork()) < 0)
479 perror("failed to fork");
/* child: exec the workload; counters were inherited across fork: */
481 if (execvp(argv[0], argv)) {
/* parent: reap all children, then stop counting: */
486 while (wait(&status) >= 0)
488 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
493 fprintf(stderr, "\n");
494 fprintf(stderr, " Performance counter stats for \'%s\':\n",
496 fprintf(stderr, "\n");
498 for (counter = 0; counter < nr_counters; counter++) {
500 __u64 count, single_count;
/* NOTE(review): 'count' must be zeroed before this per-cpu sum; the
   initializer is outside this excerpt - confirm it exists. */
503 for (cpu = 0; cpu < nr_cpus; cpu ++) {
504 res = read(fd[cpu][counter],
505 (char *) &single_count, sizeof(single_count));
506 assert(res == sizeof(single_count));
507 count += single_count;
510 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
511 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
/* clock counters tick in nanoseconds -> convert to milliseconds: */
513 double msecs = (double)count / 1000000;
515 fprintf(stderr, " %14.6f %-20s (msecs)\n",
516 msecs, event_name(counter));
518 fprintf(stderr, " %14Ld %-20s (events)\n",
519 count, event_name(counter));
522 fprintf(stderr, "\n");
523 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
524 (double)(t1-t0)/1e6);
525 fprintf(stderr, "\n");
534 static uint64_t min_ip;
535 static uint64_t max_ip = -1ll;
538 unsigned long long addr;
540 unsigned long count[MAX_COUNTERS];
542 struct source_line *source;
545 #define MAX_SYMS 100000
547 static int sym_table_count;
549 struct sym_entry *sym_filter_entry;
551 static struct sym_entry sym_table[MAX_SYMS];
553 static void show_details(struct sym_entry *sym);
556 * Ordering weight: count-1 * count-2 * ... / count-n
558 static double sym_weight(const struct sym_entry *sym)
563 weight = sym->count[0];
565 for (counter = 1; counter < nr_counters-1; counter++)
566 weight *= sym->count[counter];
568 weight /= (sym->count[counter] + 1);
/*
 * qsort() comparator: order symbols by descending weight.
 *
 * Fix: the old body returned the raw result of '<' (only 0 or 1),
 * which is not a valid three-way comparison result and hands qsort()
 * an inconsistent ordering.  Return -1/0/+1 instead.
 */
static int compare(const void *__sym1, const void *__sym2)
{
	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
	double w1 = sym_weight(sym1);
	double w2 = sym_weight(sym2);

	if (w1 < w2)
		return 1;	/* lighter symbol sorts later */
	if (w1 > w2)
		return -1;
	return 0;
}
580 static time_t last_refresh;
582 static long userspace_events;
583 static const char CONSOLE_CLEAR[] = "
\e[H
\e[2J";
585 static struct sym_entry tmp[MAX_SYMS];
/*
 * Refresh the top-style display: snapshot sym_table into tmp[], sort
 * by weight, clear the screen, print the header (irq rate, kernel%,
 * active events, tid/cpu scope) and the per-symbol rows above
 * count_filter, apply 7/8 decay (or zero with -z) to the live counts,
 * optionally show annotated details, and exit on any keypress.
 */
587 static void print_sym_table(void)
591 float events_per_sec = events/delay_secs;
592 float kevents_per_sec = (events-userspace_events)/delay_secs;
/* sort a snapshot so the live table keeps accumulating meanwhile: */
594 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
595 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
597 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
600 "------------------------------------------------------------------------------\n");
601 printf( " KernelTop:%8.0f irqs/sec kernel:%3.1f%% [%s, ",
603 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
604 nmi ? "NMI" : "IRQ");
606 if (nr_counters == 1)
607 printf("%d ", event_count[0]);
609 for (counter = 0; counter < nr_counters; counter++) {
613 printf("%s", event_name(counter));
619 printf(" (tid: %d", tid);
623 if (profile_cpu != -1)
624 printf(", cpu: %d)\n", profile_cpu);
629 printf(", %d CPUs)\n", nr_cpus);
632 printf("------------------------------------------------------------------------------\n\n");
634 if (nr_counters == 1)
637 printf(" weight events");
639 printf(" RIP kernel function\n"
640 " ______ ______ ________________ _______________\n\n"
644 for (i = 0; i < sym_table_count; i++) {
647 if (nr_counters == 1) {
649 tmp[i].count[0] >= count_filter) {
650 printf("%19.2f - %016llx : %s\n",
651 sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
656 tmp[i].count[0] >= count_filter) {
657 printf("%8.1f %10ld - %016llx : %s\n",
660 tmp[i].addr, tmp[i].sym);
665 * Add decay to the counts:
667 for (count = 0; count < nr_counters; count++)
668 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
671 if (sym_filter_entry)
672 show_details(sym_filter_entry);
674 last_refresh = time(NULL);
/* non-blocking poll of stdin: any keypress terminates the display loop: */
677 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
679 if (poll(&stdin_poll, 1, 0) == 1) {
680 printf("key pressed - exiting.\n");
686 static int read_symbol(FILE *in, struct sym_entry *s)
688 static int filter_match = 0;
693 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
699 /* skip until end of line: */
703 if (rc == '\n' || rc == EOF || pos >= 499)
712 /* Filter out known duplicates and non-text symbols. */
713 if (!strcmp(sym, "_text"))
715 if (!min_ip && !strcmp(sym, "_stext"))
717 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
719 if (stype != 'T' && stype != 't')
721 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
723 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
726 s->sym = malloc(strlen(str));
729 strcpy((char *)s->sym, str);
732 /* Tag events to be skipped. */
733 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
735 if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
738 if (filter_match == 1) {
739 filter_end = s->addr;
741 if (filter_end - filter_start > 10000) {
742 printf("hm, too large filter symbol <%s> - skipping.\n",
744 printf("symbol filter start: %016lx\n", filter_start);
745 printf(" end: %016lx\n", filter_end);
746 filter_end = filter_start = 0;
751 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
753 filter_start = s->addr;
759 int compare_addr(const void *__sym1, const void *__sym2)
761 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
763 return sym1->addr > sym2->addr;
766 static void sort_symbol_table(void)
771 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
772 for (i = 0, dups = 0; i < sym_table_count; i++) {
773 if (sym_table[i].addr == sym_table[i+1].addr) {
774 sym_table[i+1].addr = -1ll;
778 sym_table_count -= dups;
/*
 * Load the kernel symbol table from /proc/kallsyms into sym_table[],
 * record the [min_ip, max_ip] text range, resolve the -s filter symbol
 * into sym_filter_entry, and optionally (-D) dump the table to stderr.
 */
782 static void parse_symbols(void)
784 struct sym_entry *last;
786 FILE *kallsyms = fopen("/proc/kallsyms", "r");
789 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
793 while (!feof(kallsyms)) {
794 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
796 assert(sym_table_count <= MAX_SYMS);
/* table is sorted here (outside excerpt); endpoints define the ip range: */
801 min_ip = sym_table[0].addr;
802 max_ip = sym_table[sym_table_count-1].addr;
803 last = sym_table + sym_table_count++;
810 for (count=0; count < sym_table_count; count ++) {
811 if (!strcmp(sym_table[count].sym, sym_filter)) {
812 sym_filter_entry = &sym_table[count];
820 for (i = 0; i < sym_table_count; i++)
821 fprintf(stderr, "%llx %s\n",
822 sym_table[i].addr, sym_table[i].sym);
/*
 * Run objdump -dS over the filter symbol's address range and collect
 * every output line into the 'lines' list; lines that start with a
 * hex address get their EIP recorded for later sample matching.
 */
830 static void parse_vmlinux(char *filename)
833 char command[PATH_MAX*2];
837 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
839 file = popen(command, "r");
844 while (!feof(file)) {
845 struct source_line *src;
849 src = malloc(sizeof(struct source_line));
851 memset(src, 0, sizeof(struct source_line));
853 if (getline(&src->line, &dummy, file) < 0)
/* strip the trailing newline: */
858 c = strchr(src->line, '\n');
864 lines_tail = &src->next;
866 if (strlen(src->line)>8 && src->line[8] == ':')
867 src->EIP = strtoull(src->line, NULL, 16);
/* NOTE(review): line[16] is read under a strlen>8 guard - lines of
   length 9..16 are read out of bounds; guard should likely be >16. */
868 if (strlen(src->line)>8 && src->line[16] == ':')
869 src->EIP = strtoull(src->line, NULL, 16);
874 static void record_precise_ip(uint64_t ip)
876 struct source_line *line;
878 for (line = lines; line; line = line->next) {
886 static void lookup_sym_in_vmlinux(struct sym_entry *sym)
888 struct source_line *line;
889 char pattern[PATH_MAX];
890 sprintf(pattern, "<%s>:", sym->sym);
892 for (line = lines; line; line = line->next) {
893 if (strstr(line->line, pattern)) {
900 static void show_lines(struct source_line *line_queue, int line_queue_count)
903 struct source_line *line;
906 for (i = 0; i < line_queue_count; i++) {
907 printf("%8li\t%s\n", line->count, line->line);
912 #define TRACE_COUNT 3
/*
 * Annotated one-shot display for the -s filter symbol: walk the parsed
 * objdump lines from the symbol's header, keeping a sliding queue of
 * up to TRACE_COUNT context lines, and print a window whenever a line
 * has at least count_filter samples.  Stops at the next symbol header.
 */
914 static void show_details(struct sym_entry *sym)
916 struct source_line *line;
917 struct source_line *line_queue = NULL;
919 int line_queue_count = 0;
922 lookup_sym_in_vmlinux(sym);
926 printf("Showing details for %s\n", sym->sym);
/* a second ">:" header means we ran into the next symbol - stop: */
930 if (displayed && strstr(line->line, ">:"))
933 if (!line_queue_count)
937 if (line->count >= count_filter) {
938 show_lines(line_queue, line_queue_count);
939 line_queue_count = 0;
/* queue full: drop the oldest context line: */
941 } else if (line_queue_count > TRACE_COUNT) {
942 line_queue = line_queue->next;
955 * Binary search in the histogram table and record the hit:
957 static void record_ip(uint64_t ip, int counter)
959 int left_idx, middle_idx, right_idx, idx;
960 unsigned long left, middle, right;
962 record_precise_ip(ip);
965 right_idx = sym_table_count-1;
966 assert(ip <= max_ip && ip >= min_ip);
968 while (left_idx + 1 < right_idx) {
969 middle_idx = (left_idx + right_idx) / 2;
971 left = sym_table[ left_idx].addr;
972 middle = sym_table[middle_idx].addr;
973 right = sym_table[ right_idx].addr;
975 if (!(left <= middle && middle <= right)) {
976 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
977 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
979 assert(left <= middle && middle <= right);
980 if (!(left <= ip && ip <= right)) {
981 printf(" left: %016lx\n", left);
982 printf(" ip: %016lx\n", (unsigned long)ip);
983 printf("right: %016lx\n", right);
985 assert(left <= ip && ip <= right);
987 * [ left .... target .... middle .... right ]
991 right_idx = middle_idx;
995 * [ left .... middle ... target ... right ]
998 left_idx = middle_idx;
1003 if (!sym_table[idx].skip)
1004 sym_table[idx].count[counter]++;
/*
 * Account one sampled ip: addresses outside the kernel text range
 * [min_ip, max_ip] are counted as userspace hits (handling outside
 * this excerpt), everything else goes into the symbol histogram.
 */
1008 static void process_event(uint64_t ip, int counter)
1012 if (ip < min_ip || ip > max_ip) {
1017 record_ip(ip, counter);
/*
 * Parse command-line options with getopt_long(); selects perfstat mode
 * when invoked via a "perfstat" argv[0], enforces CPU/PID mutual
 * exclusion, and finally fills in the default sampling interval for
 * every counter the user did not configure explicitly.
 */
1020 static void process_options(int argc, char *argv[])
1022 int error = 0, counter;
/* perfstat personality is selected by the executable name too: */
1024 if (strstr(argv[0], "perfstat"))
1028 int option_index = 0;
1029 /** Options for getopt */
1030 static struct option long_options[] = {
1031 {"count", required_argument, NULL, 'c'},
1032 {"cpu", required_argument, NULL, 'C'},
1033 {"delay", required_argument, NULL, 'd'},
1034 {"dump_symtab", no_argument, NULL, 'D'},
1035 {"event", required_argument, NULL, 'e'},
1036 {"filter", required_argument, NULL, 'f'},
1037 {"group", required_argument, NULL, 'g'},
1038 {"help", no_argument, NULL, 'h'},
1039 {"nmi", required_argument, NULL, 'n'},
1040 {"pid", required_argument, NULL, 'p'},
1041 {"vmlinux", required_argument, NULL, 'x'},
1042 {"symbol", required_argument, NULL, 's'},
1043 {"stat", no_argument, NULL, 'S'},
1044 {"zero", no_argument, NULL, 'z'},
1047 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
1048 long_options, &option_index);
1053 case 'a': system_wide = 1; break;
1054 case 'c': default_interval = atoi(optarg); break;
1056 /* CPU and PID are mutually exclusive */
1058 printf("WARNING: CPU switch overriding PID\n");
1062 profile_cpu = atoi(optarg); break;
1063 case 'd': delay_secs = atoi(optarg); break;
1064 case 'D': dump_symtab = 1; break;
1066 case 'e': error = parse_events(optarg); break;
1068 case 'f': count_filter = atoi(optarg); break;
1069 case 'g': group = atoi(optarg); break;
1070 case 'h': display_help(); break;
1071 case 'n': nmi = atoi(optarg); break;
1073 /* CPU and PID are mutually exclusive */
1074 if (profile_cpu != -1) {
1075 printf("WARNING: PID switch overriding CPU\n");
1079 tid = atoi(optarg); break;
1080 case 's': sym_filter = strdup(optarg); break;
1081 case 'S': run_perfstat = 1; break;
1082 case 'x': vmlinux = strdup(optarg); break;
1083 case 'z': zero = 1; break;
1084 default: error = 1; break;
/* any counter left at 0 gets the default sampling period: */
1099 for (counter = 0; counter < nr_counters; counter++) {
1100 if (event_count[counter])
1103 event_count[counter] = default_interval;
1114 static unsigned int mmap_read_head(struct mmap_data *md)
1116 struct perf_counter_mmap_page *pc = md->base;
1117 unsigned int seq, head;
1123 if (unlikely(seq & 1)) {
1128 head = pc->data_head;
1131 if (pc->lock != seq)
1137 static void mmap_read(struct mmap_data *md)
1139 unsigned int head = mmap_read_head(md);
1140 unsigned int old = md->prev;
1141 unsigned char *data = md->base + page_size;
1143 if (head - old > md->mask) {
1144 printf("ERROR: failed to keep up with mmap data\n");
1148 for (; old != head;) {
1149 __u64 *ptr = (__u64 *)&data[old & md->mask];
1150 old += sizeof(__u64);
1152 process_event(*ptr, md->counter);
1158 int main(int argc, char *argv[])
1160 struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
1161 struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1162 struct perf_counter_hw_event hw_event;
1163 int i, counter, group_fd;
1167 page_size = sysconf(_SC_PAGE_SIZE);
1169 process_options(argc, argv);
1171 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1172 assert(nr_cpus <= MAX_NR_CPUS);
1173 assert(nr_cpus >= 0);
1176 return do_perfstat(argc, argv);
1178 if (tid != -1 || profile_cpu != -1)
1182 if (vmlinux && sym_filter_entry)
1183 parse_vmlinux(vmlinux);
1185 for (i = 0; i < nr_cpus; i++) {
1187 for (counter = 0; counter < nr_counters; counter++) {
1190 if (tid == -1 && profile_cpu == -1)
1193 memset(&hw_event, 0, sizeof(hw_event));
1194 hw_event.config = event_id[counter];
1195 hw_event.irq_period = event_count[counter];
1196 hw_event.record_type = PERF_RECORD_IRQ;
1199 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1200 if (fd[i][counter] < 0) {
1202 printf("kerneltop error: syscall returned with %d (%s)\n",
1203 fd[i][counter], strerror(err));
1205 printf("Are you root?\n");
1208 assert(fd[i][counter] >= 0);
1209 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1212 * First counter acts as the group leader:
1214 if (group && group_fd == -1)
1215 group_fd = fd[i][counter];
1217 event_array[i][counter].fd = fd[i][counter];
1218 event_array[i][counter].events = POLLIN;
1220 mmap_array[i][counter].counter = counter;
1221 mmap_array[i][counter].prev = 0;
1222 mmap_array[i][counter].mask = 2*page_size - 1;
1223 mmap_array[i][counter].base = mmap(NULL, 3*page_size,
1224 PROT_READ, MAP_SHARED, fd[i][counter], 0);
1225 if (mmap_array[i][counter].base == MAP_FAILED) {
1226 printf("kerneltop error: failed to mmap with %d (%s)\n",
1227 errno, strerror(errno));
1233 printf("KernelTop refresh period: %d seconds\n", delay_secs);
1234 last_refresh = time(NULL);
1239 for (i = 0; i < nr_cpus; i++) {
1240 for (counter = 0; counter < nr_counters; counter++)
1241 mmap_read(&mmap_array[i][counter]);
1244 if (time(NULL) >= last_refresh + delay_secs) {
1246 events = userspace_events = 0;
1250 ret = poll(event_array[0], nr_cpus, 1000);