2 * kerneltop.c: show top kernel functions - performance counters showcase
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
10 ------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12 ------------------------------------------------------------------------------
14 weight RIP kernel function
15 ______ ________________ _______________
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
43 Performance counter stats for 'ls':
45 163516953 instructions
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
53 * Improvements and fixes by:
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
61 * Released under the GPL v2. (and only v2, not any later version)
65 #include <sys/types.h>
81 #include <sys/syscall.h>
82 #include <sys/ioctl.h>
84 #include <sys/prctl.h>
89 #include <linux/unistd.h>
90 #include <linux/types.h>
92 #include "../../include/linux/perf_counter.h"
/*
 * prctl() knobs to cheaply enable/disable all perf counters of the
 * current task (used around the measured workload in do_perfstat()).
 */
96 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
97 * counters in the current task.
99 #define PR_TASK_PERF_COUNTERS_DISABLE 31
100 #define PR_TASK_PERF_COUNTERS_ENABLE 32
102 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
/* Body of a "current time in nanoseconds" macro using CLOCK_MONOTONIC
 * (the surrounding macro definition lines are elided in this excerpt). */
106 struct timespec ts; \
108 clock_gettime(CLOCK_MONOTONIC, &ts); \
109 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
113 * Pick up some kernel type conventions:
/*
 * Per-architecture perf_counter_open syscall number plus read-barrier
 * and cpu_relax primitives.
 * NOTE(review): the #if/#elif arch guards are not visible in this
 * excerpt; the "lfence"/"rep; nop" variants look like x86 and the
 * "sync" variant like powerpc -- confirm against the full file.
 */
119 #define __NR_perf_counter_open 295
120 #define rmb() asm volatile("lfence" ::: "memory")
121 #define cpu_relax() asm volatile("rep; nop" ::: "memory");
125 #define __NR_perf_counter_open 333
126 #define rmb() asm volatile("lfence" ::: "memory")
127 #define cpu_relax() asm volatile("rep; nop" ::: "memory");
131 #define __NR_perf_counter_open 319
132 #define rmb() asm volatile ("sync" ::: "memory")
133 #define cpu_relax() asm volatile ("" ::: "memory");
/* GCC branch-prediction hint and a multiple-evaluation-safe min(). */
136 #define unlikely(x) __builtin_expect(!!(x), 0)
137 #define min(x, y) ({ \
138 typeof(x) _min1 = (x); \
139 typeof(y) _min2 = (y); \
140 (void) (&_min1 == &_min2); \
141 _min1 < _min2 ? _min1 : _min2; })
/*
 * Thin userspace wrapper: invokes the raw perf_counter_open syscall,
 * forwarding the hw_event attributes plus pid/cpu target, group-leader
 * fd and flags. (Parameter list and syscall() line partially elided.)
 */
143 asmlinkage int sys_perf_counter_open(
144 struct perf_counter_hw_event *hw_event_uptr __user,
151 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
/* Hard limits sizing the fd[] matrix and event arrays below. */
154 #define MAX_COUNTERS 64
155 #define MAX_NR_CPUS 256
/* Pack an event (type, id) pair into a single __u64 config value. */
157 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
/* Mode flags: perfstat vs kerneltop, per-task vs system-wide. */
159 static int run_perfstat = 0;
160 static int system_wide = 0;
/* Default event set used when no -e options are given. */
162 static int nr_counters = 0;
163 static __u64 event_id[MAX_COUNTERS] = {
164 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
165 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
166 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
167 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
169 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
170 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
171 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
172 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
/* Sampling period, per-counter periods, and one fd per (cpu, counter). */
174 static int default_interval = 100000;
175 static int event_count[MAX_COUNTERS];
176 static int fd[MAX_NR_CPUS][MAX_COUNTERS];
/* Minimum event count for a symbol to be displayed (-f). */
178 static __u64 count_filter = 100;
181 static int profile_cpu = -1;
182 static int nr_cpus = 0;
184 static int group = 0;
185 static unsigned int page_size;
186 static unsigned int mmap_pages = 16;
/* -x vmlinux path and -s symbol annotation filter (address range). */
188 static char *vmlinux;
190 static char *sym_filter;
191 static unsigned long filter_start;
192 static unsigned long filter_end;
194 static int delay_secs = 2;
196 static int dump_symtab;
/* Singly-linked list of objdump source lines for the annotated symbol
 * (struct source_line's other fields are elided in this excerpt). */
202 struct source_line *next;
205 static struct source_line *lines;
206 static struct source_line **lines_tail;
/* Default sample periods and human-readable event-name tables
 * (initializer contents elided in this excerpt). */
208 const unsigned int default_count[] = {
217 static char *hw_event_names[] = {
227 static char *sw_event_names[] = {
/* Maps a symbolic event name (and its aliases) to a packed event id. */
237 struct event_symbol {
/* NOTE: several events appear twice on purpose -- long name first,
 * short alias second (e.g. "cycles", "branches", "faults", "cs"). */
242 static struct event_symbol event_symbols[] = {
243 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
244 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
245 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
246 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
247 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
248 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
249 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
250 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
251 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
253 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
254 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
255 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
256 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
257 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
258 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
259 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
260 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
261 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
262 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
/* Extract a bitfield from a packed config word using the kernel's
 * PERF_COUNTER_*_MASK / _SHIFT definitions. */
265 #define __PERF_COUNTER_FIELD(config, name) \
266 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
268 #define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
269 #define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
270 #define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
271 #define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
/* Print the known symbolic event names, one "type:id: name" per line,
 * followed by the raw-event ("rNNN") usage hint. */
273 static void display_events_help(void)
279 " -e EVENT --event=EVENT # symbolic-name abbreviations");
281 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
284 e = event_symbols[i].event;
285 type = PERF_COUNTER_TYPE(e);
286 id = PERF_COUNTER_ID(e);
288 printf("\n %d:%d: %-20s",
289 type, id, event_symbols[i].symbol);
293 " rNNN: raw PMU events (eventsel+umask)\n\n");
/* Usage text for perfstat mode, then the shared event help. */
296 static void display_perfstat_help(void)
299 "Usage: perfstat [<events...>] <cmd...>\n\n"
300 "PerfStat Options (up to %d event types can be specified):\n\n",
303 display_events_help();
306 " -a # system-wide collection\n");
/* Usage text for kerneltop mode; delegates to display_perfstat_help()
 * when running as perfstat (run_perfstat check is on an elided line). */
310 static void display_help(void)
313 return display_perfstat_help();
316 "Usage: kerneltop [<options>]\n"
317 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
318 "KernelTop Options (up to %d event types can be specified at once):\n\n",
321 display_events_help();
324 " -S --stat # perfstat COMMAND\n"
325 " -a # system-wide collection (for perfstat)\n\n"
326 " -c CNT --count=CNT # event period to sample\n\n"
327 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
328 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
329 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
330 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
331 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
332 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
333 " -z --zero # zero counts after display\n"
334 " -D --dump_symtab # dump symbol table to stderr on startup\n"
335 " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
/*
 * Return a human-readable name for counter 'ctr': the formatted raw
 * config for raw events, or a lookup in the hw/sw name tables.
 * NOTE(review): the raw path sprintf()s into 'buf', whose declaration
 * is elided here -- presumably a static buffer, so the returned pointer
 * is only valid until the next call; verify in the full file.
 */
341 static char *event_name(int ctr)
343 __u64 config = event_id[ctr];
344 int type = PERF_COUNTER_TYPE(config);
345 int id = PERF_COUNTER_ID(config);
348 if (PERF_COUNTER_RAW(config)) {
349 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
354 case PERF_TYPE_HARDWARE:
355 if (id < PERF_HW_EVENTS_MAX)
356 return hw_event_names[id];
357 return "unknown-hardware";
359 case PERF_TYPE_SOFTWARE:
360 if (id < PERF_SW_EVENTS_MAX)
361 return sw_event_names[id];
362 return "unknown-software";
372 * Each event can have multiple symbolic names.
373 * Symbolic names are (almost) exactly matched.
/*
 * Parse one event spec: "rNNN" (raw hex config), "type:id" (numeric),
 * or a symbolic name matched against event_symbols[].
 * NOTE(review): the symbolic match is strncmp() over strlen(symbol),
 * i.e. the table entry only needs to be a *prefix* of the input --
 * table order therefore decides which alias wins for shared prefixes.
 */
375 static __u64 match_event_symbols(char *str)
381 if (sscanf(str, "r%llx", &config) == 1)
382 return config | PERF_COUNTER_RAW_MASK;
384 if (sscanf(str, "%d:%llu", &type, &id) == 2)
385 return EID(type, id);
387 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
388 if (!strncmp(str, event_symbols[i].symbol,
389 strlen(event_symbols[i].symbol)))
390 return event_symbols[i].event;
/*
 * Parse a (possibly comma-separated) -e event list into event_id[],
 * up to MAX_COUNTERS entries; iterates via strstr(str, ",").
 */
396 static int parse_events(char *str)
401 if (nr_counters == MAX_COUNTERS)
404 config = match_event_symbols(str);
408 event_id[nr_counters] = config;
411 str = strstr(str, ",");
/* Scratch buffer the tool can touch to generate page-fault events. */
425 char fault_here[1000000];
/*
 * Open counter 'counter' for perfstat mode: one fd per online CPU when
 * system-wide (pid -1), otherwise a single inherited, initially
 * disabled per-task counter on fd[0] (pid 0 = current task).
 */
427 static void create_perfstat_counter(int counter)
429 struct perf_counter_hw_event hw_event;
431 memset(&hw_event, 0, sizeof(hw_event));
432 hw_event.config = event_id[counter];
433 hw_event.record_type = PERF_RECORD_SIMPLE;
438 for (cpu = 0; cpu < nr_cpus; cpu ++) {
439 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
440 if (fd[cpu][counter] < 0) {
441 printf("perfstat error: syscall returned with %d (%s)\n",
442 fd[cpu][counter], strerror(errno));
/* Per-task path: inherit into children, start disabled so the
 * workload is only measured after prctl(...ENABLE). */
447 hw_event.inherit = 1;
448 hw_event.disabled = 1;
450 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
451 if (fd[0][counter] < 0) {
452 printf("perfstat error: syscall returned with %d (%s)\n",
453 fd[0][counter], strerror(errno));
/*
 * perfstat mode: open all counters, fork+exec the target command with
 * counters enabled, wait for it, then print per-counter totals and the
 * elapsed wall-clock time to stderr.
 */
459 int do_perfstat(int argc, char *argv[])
461 unsigned long long t0, t1;
470 for (counter = 0; counter < nr_counters; counter++)
471 create_perfstat_counter(counter);
480 * Enable counters and exec the command:
483 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
485 if ((pid = fork()) < 0)
486 perror("failed to fork");
488 if (execvp(argv[0], argv)) {
493 while (wait(&status) >= 0)
495 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
500 fprintf(stderr, "\n");
501 fprintf(stderr, " Performance counter stats for \'%s\':\n",
503 fprintf(stderr, "\n");
505 for (counter = 0; counter < nr_counters; counter++) {
/* NOTE(review): 'count' accumulates below but its zero-initialization
 * is not visible in this excerpt -- verify it is reset per counter. */
507 __u64 count, single_count;
510 for (cpu = 0; cpu < nr_cpus; cpu ++) {
511 res = read(fd[cpu][counter],
512 (char *) &single_count, sizeof(single_count));
513 assert(res == sizeof(single_count));
514 count += single_count;
/* Clock counters are printed as milliseconds (value / 1e6),
 * everything else as a plain event count. */
517 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
518 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
520 double msecs = (double)count / 1000000;
522 fprintf(stderr, " %14.6f %-20s (msecs)\n",
523 msecs, event_name(counter));
525 fprintf(stderr, " %14Ld %-20s (events)\n",
526 count, event_name(counter));
529 fprintf(stderr, "\n");
530 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
531 (double)(t1-t0)/1e6);
532 fprintf(stderr, "\n");
/* Valid kernel-text address range, filled in by parse_symbols(). */
541 static uint64_t min_ip;
542 static uint64_t max_ip = -1ll;
/* Per-symbol histogram entry (other struct fields elided here). */
545 unsigned long long addr;
547 unsigned long count[MAX_COUNTERS];
549 struct source_line *source;
/* Fixed-size symbol table sorted by address; sym_filter_entry points
 * at the -s annotated symbol, if any. */
552 #define MAX_SYMS 100000
554 static int sym_table_count;
556 struct sym_entry *sym_filter_entry;
558 static struct sym_entry sym_table[MAX_SYMS];
560 static void show_details(struct sym_entry *sym);
563 * Ordering weight: count-1 * count-2 * ... / count-n
/*
 * Compute a symbol's sort weight: the product of its first n-1 counter
 * counts divided by the last counter's count (+1 to avoid div-by-zero).
 * Relies on 'counter' retaining its post-loop value for the division.
 */
565 static double sym_weight(const struct sym_entry *sym)
570 weight = sym->count[0];
572 for (counter = 1; counter < nr_counters-1; counter++)
573 weight *= sym->count[counter];
575 weight /= (sym->count[counter] + 1);
/*
 * qsort comparator sorting symbols by descending weight.
 * NOTE(review): this returns only 0 or 1, never a negative value,
 * which violates qsort(3)'s three-way comparator contract (negative /
 * zero / positive); some qsort implementations may mis-sort. Consider
 * returning -1/0/1 explicitly.
 */
580 static int compare(const void *__sym1, const void *__sym2)
582 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
584 return sym_weight(sym1) < sym_weight(sym2);
/* Time of last screen refresh and count of non-kernel-text samples. */
587 static time_t last_refresh;
589 static long userspace_events;
/* ANSI escape sequence: cursor home + clear screen (literal ESC bytes). */
590 static const char CONSOLE_CLEAR[] = "
\e[H
\e[2J";
/* Scratch copy of sym_table so sorting does not disturb the live data. */
592 static struct sym_entry tmp[MAX_SYMS];
/*
 * Redraw the top-symbols screen: sort a snapshot of sym_table by
 * weight, print the header (irq rate, kernel %, event list, target),
 * then one line per symbol above count_filter; decay or zero the live
 * counts afterwards, and bail out if a key was pressed on stdin.
 */
594 static void print_sym_table(void)
598 float events_per_sec = events/delay_secs;
599 float kevents_per_sec = (events-userspace_events)/delay_secs;
/* Sort a snapshot so the live histogram keeps updating meanwhile. */
601 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
602 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
604 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
607 "------------------------------------------------------------------------------\n");
608 printf( " KernelTop:%8.0f irqs/sec kernel:%3.1f%% [%s, ",
610 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
611 nmi ? "NMI" : "IRQ");
613 if (nr_counters == 1)
614 printf("%d ", event_count[0]);
616 for (counter = 0; counter < nr_counters; counter++) {
620 printf("%s", event_name(counter));
626 printf(" (tid: %d", tid);
630 if (profile_cpu != -1)
631 printf(", cpu: %d)\n", profile_cpu);
636 printf(", %d CPUs)\n", nr_cpus);
639 printf("------------------------------------------------------------------------------\n\n");
/* Column headers differ for single- vs multi-counter runs. */
641 if (nr_counters == 1)
644 printf(" weight events");
646 printf(" RIP kernel function\n"
647 " ______ ______ ________________ _______________\n\n"
651 for (i = 0; i < sym_table_count; i++) {
654 if (nr_counters == 1) {
656 tmp[i].count[0] >= count_filter) {
657 printf("%19.2f - %016llx : %s\n",
658 sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
663 tmp[i].count[0] >= count_filter) {
664 printf("%8.1f %10ld - %016llx : %s\n",
667 tmp[i].addr, tmp[i].sym);
672 * Add decay to the counts:
/* Exponential decay (x7/8) per refresh, or hard zero with -z. */
674 for (count = 0; count < nr_counters; count++)
675 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
678 if (sym_filter_entry)
679 show_details(sym_filter_entry);
681 last_refresh = time(NULL);
/* Non-blocking check: any keypress on stdin terminates the run. */
684 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
686 if (poll(&stdin_poll, 1, 0) == 1) {
687 printf("key pressed - exiting.\n");
/*
 * Parse one /proc/kallsyms line into *s: address, type char and name.
 * Skips duplicates/non-text symbols, tags idle-loop symbols as
 * skippable, and tracks the -s sym_filter's [filter_start, filter_end)
 * address range via the static filter_match state machine.
 */
693 static int read_symbol(FILE *in, struct sym_entry *s)
695 static int filter_match = 0;
700 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
706 /* skip until end of line: */
710 if (rc == '\n' || rc == EOF || pos >= 499)
719 /* Filter out known duplicates and non-text symbols. */
720 if (!strcmp(sym, "_text"))
722 if (!min_ip && !strcmp(sym, "_stext"))
724 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
726 if (stype != 'T' && stype != 't')
728 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
730 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
/* NOTE(review): malloc(strlen(str)) does not reserve space for the
 * terminating NUL, yet strcpy() below writes strlen(str)+1 bytes --
 * a one-byte heap overflow. Should be malloc(strlen(str) + 1). */
733 s->sym = malloc(strlen(str));
736 strcpy((char *)s->sym, str);
739 /* Tag events to be skipped. */
740 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
742 else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
744 else if (!strcmp("mwait_idle", s->sym))
/* The symbol *after* the filtered one closes the address range. */
747 if (filter_match == 1) {
748 filter_end = s->addr;
750 if (filter_end - filter_start > 10000) {
751 printf("hm, too large filter symbol <%s> - skipping.\n",
753 printf("symbol filter start: %016lx\n", filter_start);
754 printf(" end: %016lx\n", filter_end);
755 filter_end = filter_start = 0;
760 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
762 filter_start = s->addr;
/*
 * qsort comparator ordering symbols by ascending address.
 * NOTE(review): like compare(), this returns only 0 or 1 and never a
 * negative value, which technically violates qsort(3)'s three-way
 * comparator contract.
 */
768 int compare_addr(const void *__sym1, const void *__sym2)
770 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
772 return sym1->addr > sym2->addr;
/*
 * Sort sym_table by address and drop duplicate-address entries by
 * tagging them with addr = -1 (presumably compacted afterwards on
 * elided lines; dups are subtracted from sym_table_count).
 * NOTE(review): the loop reads sym_table[i+1] when i is the last valid
 * index -- within the static array's bounds, but it compares against
 * an entry past the counted symbols; consider looping to count-1.
 */
775 static void sort_symbol_table(void)
780 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
781 for (i = 0, dups = 0; i < sym_table_count; i++) {
782 if (sym_table[i].addr == sym_table[i+1].addr) {
783 sym_table[i+1].addr = -1ll;
787 sym_table_count -= dups;
/*
 * Load the kernel symbol table from /proc/kallsyms into sym_table[],
 * record the [min_ip, max_ip] text range, locate the -s filter symbol,
 * and optionally dump the table to stderr (-D).
 */
791 static void parse_symbols(void)
793 struct sym_entry *last;
795 FILE *kallsyms = fopen("/proc/kallsyms", "r");
798 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
802 while (!feof(kallsyms)) {
803 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
805 assert(sym_table_count <= MAX_SYMS);
810 min_ip = sym_table[0].addr;
811 max_ip = sym_table[sym_table_count-1].addr;
812 last = sym_table + sym_table_count++;
/* Resolve the -s symbol name to its sym_table entry, if present. */
819 for (count=0; count < sym_table_count; count ++) {
820 if (!strcmp(sym_table[count].sym, sym_filter)) {
821 sym_filter_entry = &sym_table[count];
829 for (i = 0; i < sym_table_count; i++)
830 fprintf(stderr, "%llx %s\n",
831 sym_table[i].addr, sym_table[i].sym);
/*
 * Run objdump -dS over the filter symbol's address range in vmlinux
 * and collect each output line into the 'lines' linked list, parsing
 * the leading hex address into src->EIP where present.
 */
839 static void parse_vmlinux(char *filename)
842 char command[PATH_MAX*2];
846 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
848 file = popen(command, "r");
853 while (!feof(file)) {
854 struct source_line *src;
858 src = malloc(sizeof(struct source_line));
860 memset(src, 0, sizeof(struct source_line));
862 if (getline(&src->line, &dummy, file) < 0)
/* Strip the trailing newline from the captured line. */
867 c = strchr(src->line, '\n');
873 lines_tail = &src->next;
/* NOTE(review): the second check guards strlen > 8 but then indexes
 * src->line[16] -- the guard does not cover index 16 (potential
 * out-of-bounds read for lines of length 9..16); it likely should be
 * strlen(src->line) > 16. */
875 if (strlen(src->line)>8 && src->line[8] == ':')
876 src->EIP = strtoull(src->line, NULL, 16);
877 if (strlen(src->line)>8 && src->line[16] == ':')
878 src->EIP = strtoull(src->line, NULL, 16);
/*
 * Attribute a sampled instruction pointer to its objdump source line
 * by scanning the 'lines' list (matching/counting lines are elided).
 */
883 static void record_precise_ip(uint64_t ip)
885 struct source_line *line;
887 for (line = lines; line; line = line->next) {
/*
 * Find the objdump output line marking the start of 'sym' by searching
 * the 'lines' list for the "<name>:" label pattern.
 */
895 static void lookup_sym_in_vmlinux(struct sym_entry *sym)
897 struct source_line *line;
898 char pattern[PATH_MAX];
899 sprintf(pattern, "<%s>:", sym->sym);
901 for (line = lines; line; line = line->next) {
902 if (strstr(line->line, pattern)) {
/*
 * Print 'line_queue_count' queued source lines with their hit counts.
 * NOTE(review): 'line' is printed in the loop but its initialization
 * from line_queue / advance step are on elided lines -- verify.
 */
909 static void show_lines(struct source_line *line_queue, int line_queue_count)
912 struct source_line *line;
915 for (i = 0; i < line_queue_count; i++) {
916 printf("%8li\t%s\n", line->count, line->line);
/* Number of leading context lines kept queued before a hot line. */
921 #define TRACE_COUNT 3
/*
 * Annotated one-shot display for the -s symbol: walk its objdump
 * lines, keeping a sliding queue of up to TRACE_COUNT context lines,
 * and flush the queue whenever a line's hit count passes count_filter.
 * Stops at the next function label (">:") once output has started.
 */
923 static void show_details(struct sym_entry *sym)
925 struct source_line *line;
926 struct source_line *line_queue = NULL;
928 int line_queue_count = 0;
931 lookup_sym_in_vmlinux(sym);
935 printf("Showing details for %s\n", sym->sym);
939 if (displayed && strstr(line->line, ">:"))
942 if (!line_queue_count)
946 if (line->count >= count_filter) {
947 show_lines(line_queue, line_queue_count);
948 line_queue_count = 0;
950 } else if (line_queue_count > TRACE_COUNT) {
951 line_queue = line_queue->next;
964 * Binary search in the histogram table and record the hit:
/*
 * Map a sampled ip to its symbol via binary search over the
 * address-sorted sym_table and bump that symbol's per-counter count
 * (unless the symbol is tagged skip, e.g. idle loops). The asserts
 * and diagnostic printfs guard against an unsorted table or an ip
 * outside [min_ip, max_ip].
 */
966 static void record_ip(uint64_t ip, int counter)
968 int left_idx, middle_idx, right_idx, idx;
969 unsigned long left, middle, right;
971 record_precise_ip(ip);
974 right_idx = sym_table_count-1;
975 assert(ip <= max_ip && ip >= min_ip);
977 while (left_idx + 1 < right_idx) {
978 middle_idx = (left_idx + right_idx) / 2;
980 left = sym_table[ left_idx].addr;
981 middle = sym_table[middle_idx].addr;
982 right = sym_table[ right_idx].addr;
984 if (!(left <= middle && middle <= right)) {
985 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
986 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
988 assert(left <= middle && middle <= right);
989 if (!(left <= ip && ip <= right)) {
990 printf(" left: %016lx\n", left);
991 printf(" ip: %016lx\n", (unsigned long)ip);
992 printf("right: %016lx\n", right);
994 assert(left <= ip && ip <= right);
996 * [ left .... target .... middle .... right ]
1000 right_idx = middle_idx;
1004 * [ left .... middle ... target ... right ]
1007 left_idx = middle_idx;
1012 if (!sym_table[idx].skip)
1013 sym_table[idx].count[counter]++;
/*
 * Dispatch one sampled ip: ips outside the kernel-text range are
 * counted as userspace events (on elided lines), the rest are
 * attributed to a kernel symbol via record_ip().
 */
1017 static void process_event(uint64_t ip, int counter)
1021 if (ip < min_ip || ip > max_ip) {
1026 record_ip(ip, counter);
/*
 * Parse command-line options with getopt_long(); selects perfstat mode
 * when invoked via a "perfstat"-named binary, enforces the CPU/PID
 * mutual exclusion, and finally fills any unset event_count[] slots
 * with default_interval.
 */
1029 static void process_options(int argc, char *argv[])
1031 int error = 0, counter;
/* argv[0] containing "perfstat" selects perfstat mode implicitly. */
1033 if (strstr(argv[0], "perfstat"))
1037 int option_index = 0;
1038 /** Options for getopt */
1039 static struct option long_options[] = {
1040 {"count", required_argument, NULL, 'c'},
1041 {"cpu", required_argument, NULL, 'C'},
1042 {"delay", required_argument, NULL, 'd'},
1043 {"dump_symtab", no_argument, NULL, 'D'},
1044 {"event", required_argument, NULL, 'e'},
1045 {"filter", required_argument, NULL, 'f'},
1046 {"group", required_argument, NULL, 'g'},
1047 {"help", no_argument, NULL, 'h'},
1048 {"nmi", required_argument, NULL, 'n'},
1049 {"pid", required_argument, NULL, 'p'},
1050 {"vmlinux", required_argument, NULL, 'x'},
1051 {"symbol", required_argument, NULL, 's'},
1052 {"stat", no_argument, NULL, 'S'},
1053 {"zero", no_argument, NULL, 'z'},
1054 {"mmap_pages", required_argument, NULL, 'm'},
1057 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:z",
1058 long_options, &option_index);
1063 case 'a': system_wide = 1; break;
1064 case 'c': default_interval = atoi(optarg); break;
1066 /* CPU and PID are mutually exclusive */
1068 printf("WARNING: CPU switch overriding PID\n");
1072 profile_cpu = atoi(optarg); break;
1073 case 'd': delay_secs = atoi(optarg); break;
1074 case 'D': dump_symtab = 1; break;
1076 case 'e': error = parse_events(optarg); break;
1078 case 'f': count_filter = atoi(optarg); break;
1079 case 'g': group = atoi(optarg); break;
1080 case 'h': display_help(); break;
1081 case 'n': nmi = atoi(optarg); break;
1083 /* CPU and PID are mutually exclusive */
1084 if (profile_cpu != -1) {
1085 printf("WARNING: PID switch overriding CPU\n");
1089 tid = atoi(optarg); break;
1090 case 's': sym_filter = strdup(optarg); break;
1091 case 'S': run_perfstat = 1; break;
1092 case 'x': vmlinux = strdup(optarg); break;
1093 case 'z': zero = 1; break;
1094 case 'm': mmap_pages = atoi(optarg); break;
1095 default: error = 1; break;
/* Counters without an explicit -c period get the default interval. */
1110 for (counter = 0; counter < nr_counters; counter++) {
1111 if (event_count[counter])
1114 event_count[counter] = default_interval;
/*
 * Read the kernel's current data_head from the mmap'ed control page
 * using a seqlock-style protocol: retry while the sequence count is
 * odd (writer active) or changed across the read of data_head.
 */
1125 static unsigned int mmap_read_head(struct mmap_data *md)
1127 struct perf_counter_mmap_page *pc = md->base;
1128 unsigned int seq, head;
1134 if (unlikely(seq & 1)) {
1139 head = pc->data_head;
1142 if (pc->lock != seq)
/* Timestamps of the previous/current mmap_read(), for overrun warnings. */
1148 struct timeval last_read, this_read;
/*
 * Drain new events from one counter's mmap ring buffer: detect and
 * report overruns (restart at head), then walk events from md->prev to
 * head, copying any event that straddles the ring-buffer wrap into a
 * bounce buffer before dispatching it to process_event().
 */
1150 static void mmap_read(struct mmap_data *md)
1152 unsigned int head = mmap_read_head(md);
1153 unsigned int old = md->prev;
/* Data area starts one page past the control page. */
1154 unsigned char *data = md->base + page_size;
1157 gettimeofday(&this_read, NULL);
1160 * If we're further behind than half the buffer, there's a chance
1161 * the writer will bite our tail and screw up the events under us.
1163 * If we somehow ended up ahead of the head, we got messed up.
1165 * In either case, truncate and restart at head.
1168 if (diff > md->mask / 2 || diff < 0) {
1170 unsigned long msecs;
1172 timersub(&this_read, &last_read, &iv);
1173 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
1175 fprintf(stderr, "WARNING: failed to keep up with mmap data."
1176 " Last read %lu msecs ago.\n", msecs);
1179 * head points to a known good entry, start there.
1184 last_read = this_read;
1186 for (; old != head;) {
1187 struct event_struct {
1188 struct perf_event_header header;
1191 } *event = (struct event_struct *)&data[old & md->mask];
1192 struct event_struct event_copy;
1194 unsigned int size = event->header.size;
1197 * Event straddles the mmap boundary -- header should always
1198 * be inside due to u64 alignment of output.
1200 if ((old & md->mask) + size != ((old + size) & md->mask)) {
1201 unsigned int offset = old;
1202 unsigned int len = sizeof(*event), cpy;
1203 void *dst = &event_copy;
1206 cpy = min(md->mask + 1 - (offset & md->mask), len);
1207 memcpy(dst, &data[offset & md->mask], cpy);
1213 event = &event_copy;
1218 switch (event->header.type) {
1220 case PERF_EVENT_IP | __PERF_EVENT_TID:
1221 process_event(event->ip, md->counter);
/*
 * Entry point: parse options; in perfstat mode delegate to
 * do_perfstat(). Otherwise load kernel symbols (and vmlinux annotation
 * data for -s), open one sampling counter per (cpu, counter) with an
 * mmap ring buffer each, then loop: drain all ring buffers, refresh
 * the display every delay_secs, and poll() the counter fds for data.
 */
1229 int main(int argc, char *argv[])
1231 struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
1232 struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1233 struct perf_counter_hw_event hw_event;
1234 int i, counter, group_fd, nr_poll = 0;
1238 page_size = sysconf(_SC_PAGE_SIZE);
1240 process_options(argc, argv);
1242 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1243 assert(nr_cpus <= MAX_NR_CPUS);
1244 assert(nr_cpus >= 0);
1247 return do_perfstat(argc, argv);
1249 if (tid != -1 || profile_cpu != -1)
1253 if (vmlinux && sym_filter_entry)
1254 parse_vmlinux(vmlinux);
1256 for (i = 0; i < nr_cpus; i++) {
1258 for (counter = 0; counter < nr_counters; counter++) {
1261 if (tid == -1 && profile_cpu == -1)
1264 memset(&hw_event, 0, sizeof(hw_event));
1265 hw_event.config = event_id[counter];
1266 hw_event.irq_period = event_count[counter];
1267 hw_event.record_type = PERF_RECORD_IRQ;
1269 hw_event.include_tid = 1;
1271 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1272 if (fd[i][counter] < 0) {
1274 printf("kerneltop error: syscall returned with %d (%s)\n",
1275 fd[i][counter], strerror(err));
1277 printf("Are you root?\n");
1280 assert(fd[i][counter] >= 0);
1281 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1284 * First counter acts as the group leader:
1286 if (group && group_fd == -1)
1287 group_fd = fd[i][counter];
1289 event_array[nr_poll].fd = fd[i][counter];
1290 event_array[nr_poll].events = POLLIN;
/* mmap_pages data pages + 1 control page per counter fd. */
1293 mmap_array[i][counter].counter = counter;
1294 mmap_array[i][counter].prev = 0;
1295 mmap_array[i][counter].mask = mmap_pages*page_size - 1;
1296 mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
1297 PROT_READ, MAP_SHARED, fd[i][counter], 0);
1298 if (mmap_array[i][counter].base == MAP_FAILED) {
1299 printf("kerneltop error: failed to mmap with %d (%s)\n",
1300 errno, strerror(errno));
1306 printf("KernelTop refresh period: %d seconds\n", delay_secs);
1307 last_refresh = time(NULL);
1312 for (i = 0; i < nr_cpus; i++) {
1313 for (counter = 0; counter < nr_counters; counter++)
1314 mmap_read(&mmap_array[i][counter]);
1317 if (time(NULL) >= last_refresh + delay_secs) {
1319 events = userspace_events = 0;
/* Block up to 1s waiting for new counter data. */
1323 ret = poll(event_array, nr_poll, 1000);