perf_counter: kerneltop: output event support
[linux-2.6] / Documentation / perf_counter / kerneltop.c
1 /*
2  * kerneltop.c: show top kernel functions - performance counters showcase
3
4    Build with:
5
6      cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8    Sample output:
9
10 ------------------------------------------------------------------------------
11  KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
12 ------------------------------------------------------------------------------
13
14              weight         RIP          kernel function
15              ______   ________________   _______________
16
17               35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18               33.00 - ffffffff804cb740 : sock_alloc_send_skb
19               31.26 - ffffffff804ce808 : skb_push
20               22.43 - ffffffff80510004 : tcp_established_options
21               19.00 - ffffffff8027d250 : find_get_page
22               15.76 - ffffffff804e4fc9 : eth_type_trans
23               15.20 - ffffffff804d8baa : dst_release
24               14.86 - ffffffff804cf5d8 : skb_release_head_state
25               14.00 - ffffffff802217d5 : read_hpet
26               12.00 - ffffffff804ffb7f : __ip_local_out
27               11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28                8.54 - ffffffff805001a3 : ip_queue_xmit
29  */
30
31 /*
32  * perfstat:  /usr/bin/time -alike performance counter statistics utility
33
34           It summarizes the counter events of all tasks (and child tasks),
35           covering all CPUs that the command (or workload) executes on.
36           It only counts the per-task events of the workload started,
37           independent of how many other tasks run on those CPUs.
38
39    Sample output:
40
41    $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43    Performance counter stats for 'ls':
44
45            163516953 instructions
46                 2295 cache-misses
47              2855182 branch-misses
48  */
49
50  /*
51   * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52   *
53   * Improvements and fixes by:
54   *
55   *   Arjan van de Ven <arjan@linux.intel.com>
56   *   Yanmin Zhang <yanmin.zhang@intel.com>
57   *   Wu Fengguang <fengguang.wu@intel.com>
58   *   Mike Galbraith <efault@gmx.de>
59   *   Paul Mackerras <paulus@samba.org>
60   *
61   * Released under the GPL v2. (and only v2, not any later version)
62   */
63
64 #define _GNU_SOURCE
65 #include <sys/types.h>
66 #include <sys/stat.h>
67 #include <sys/time.h>
68 #include <unistd.h>
69 #include <stdint.h>
70 #include <stdlib.h>
71 #include <string.h>
72 #include <limits.h>
73 #include <getopt.h>
74 #include <assert.h>
75 #include <fcntl.h>
76 #include <stdio.h>
77 #include <errno.h>
78 #include <ctype.h>
79 #include <time.h>
80
81 #include <sys/syscall.h>
82 #include <sys/ioctl.h>
83 #include <sys/poll.h>
84 #include <sys/prctl.h>
85 #include <sys/wait.h>
86 #include <sys/uio.h>
87 #include <sys/mman.h>
88
89 #include <linux/unistd.h>
90 #include <linux/types.h>
91
92 #include "../../include/linux/perf_counter.h"
93
94
95 /*
96  * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
97  * counters in the current task.
98  */
99 #define PR_TASK_PERF_COUNTERS_DISABLE   31
100 #define PR_TASK_PERF_COUNTERS_ENABLE    32
101
/* Element count of a true array (must not be applied to a pointer). */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

/*
 * Sample CLOCK_MONOTONIC and yield the timestamp in nanoseconds.
 * Implemented as a GNU statement expression so it can be used as a value.
 */
#define rdclock()                                               \
({                                                              \
        struct timespec now;                                    \
                                                                \
        clock_gettime(CLOCK_MONOTONIC, &now);                   \
        now.tv_sec * 1000000000ULL + now.tv_nsec;               \
})
111
112 /*
113  * Pick up some kernel type conventions:
114  */
115 #define __user
116 #define asmlinkage
117
/*
 * Per-architecture glue: the perf_counter_open syscall number, a read
 * memory barrier and a busy-wait hint.
 *
 * The macros expand to a single statement WITHOUT a trailing semicolon so
 * that "if (x) cpu_relax(); else ..." parses correctly, and use the
 * __asm__/__volatile__ spellings so they also build in strict ISO modes.
 */
#ifdef __x86_64__
#define __NR_perf_counter_open 295
#define rmb()           __asm__ __volatile__("lfence" ::: "memory")
#define cpu_relax()     __asm__ __volatile__("rep; nop" ::: "memory")
#endif

#ifdef __i386__
#define __NR_perf_counter_open 333
#define rmb()           __asm__ __volatile__("lfence" ::: "memory")
#define cpu_relax()     __asm__ __volatile__("rep; nop" ::: "memory")
#endif

#ifdef __powerpc__
#define __NR_perf_counter_open 319
#define rmb()           __asm__ __volatile__("sync" ::: "memory")
#define cpu_relax()     __asm__ __volatile__("" ::: "memory")
#endif
135
/* Branch hint: tell the compiler the condition is expected to be false. */
#define unlikely(x)     __builtin_expect(!!(x), 0)

/*
 * Type-checked minimum that evaluates each argument exactly once.
 * The pointer comparison exists only so the compiler warns when x and y
 * have different types.  __typeof__ is used instead of typeof so the macro
 * also compiles under strict ISO modes (-std=c99/-std=c11).
 */
#define min(x, y) ({                            \
        __typeof__(x) _min1 = (x);              \
        __typeof__(y) _min2 = (y);              \
        (void) (&_min1 == &_min2);              \
        _min1 < _min2 ? _min1 : _min2; })
142
143 asmlinkage int sys_perf_counter_open(
144         struct perf_counter_hw_event    *hw_event_uptr          __user,
145         pid_t                           pid,
146         int                             cpu,
147         int                             group_fd,
148         unsigned long                   flags)
149 {
150         return syscall(
151                 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
152 }
153
/* Capacity limits for the static per-counter / per-CPU tables below. */
#define MAX_COUNTERS                    64
#define MAX_NR_CPUS                     256

/* Pack an event type and an event id into a single 64-bit config value. */
#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))

/* Non-zero: behave as perfstat (display_help() then shows perfstat usage). */
static int                      run_perfstat                    =  0;
/* Non-zero: perfstat opens one counter per CPU instead of one per task. */
static int                      system_wide                     =  0;

/* Number of event_id[] slots filled in by parse_events(). */
static int                      nr_counters                     =  0;
/*
 * Event configs, one per counter.  The initializer provides a default set
 * (four software and four hardware events); parse_events() overwrites
 * entries starting at index nr_counters.
 */
static __u64                    event_id[MAX_COUNTERS]          = {
        EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
        EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
        EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
        EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),

        EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
        EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
        EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
        EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
};
/* NOTE(review): presumably the default sampling period; its use is not visible here. */
static int                      default_interval = 100000;
/* Per-counter value; print_sym_table() shows event_count[0] in the header. */
static int                      event_count[MAX_COUNTERS];
/* Counter file descriptors, indexed [cpu][counter]. */
static int                      fd[MAX_NR_CPUS][MAX_COUNTERS];

/* Minimum count a symbol/source line needs before it is displayed. */
static __u64                    count_filter                   = 100;

/* Profiled task and CPU as shown in the header; -1 is displayed as "all". */
static int                      tid                             = -1;
static int                      profile_cpu                     = -1;
static int                      nr_cpus                         =  0;
/* Non-zero: header reports "NMI" sampling, otherwise "IRQ". */
static int                      nmi                             =  1;
/* NOTE(review): presumably enables counter grouping; not used in the visible code. */
static int                      group                           =  0;
static unsigned int             page_size;
/* Number of mmap data pages (see the -m option in display_help()). */
static unsigned int             mmap_pages                      =  16;

/* Path of the vmlinux binary (see the -x option in display_help()). */
static char                     *vmlinux;

/*
 * Symbol selected for annotation and its address range.  read_symbol()
 * sets filter_start at the matching symbol and filter_end at the next one.
 */
static char                     *sym_filter;
static unsigned long            filter_start;
static unsigned long            filter_end;

/* Seconds between display refreshes; divisor for the per-second rates. */
static int                      delay_secs                      =  2;
/* Non-zero: reset symbol counts after each display instead of decaying them. */
static int                      zero;
/* Non-zero: parse_symbols() dumps the symbol table to stderr. */
static int                      dump_symtab;
197
/*
 * One line of objdump output for the annotated symbol, kept in a singly
 * linked list in the order the lines were read.
 */
struct source_line {
        uint64_t                EIP;    /* address parsed from the line; stays 0 when none was parsed */
        unsigned long           count;  /* hits recorded by record_precise_ip() */
        char                    *line;  /* text of the line, newline stripped */
        struct source_line      *next;
};

/* Head of the list, and the link where parse_vmlinux() appends the next node. */
static struct source_line       *lines;
static struct source_line       **lines_tail;
207
/* NOTE(review): presumably per-event default sample periods; no use is visible in this part of the file. */
const unsigned int default_count[] = {
        1000000,
        1000000,
          10000,
          10000,
        1000000,
          10000,
};

/* Display names for hardware events; event_name() indexes this by event id. */
static char *hw_event_names[] = {
        "CPU cycles",
        "instructions",
        "cache references",
        "cache misses",
        "branches",
        "branch misses",
        "bus cycles",
};

/* Display names for software events; event_name() indexes this by event id. */
static char *sw_event_names[] = {
        "cpu clock ticks",
        "task clock ticks",
        "pagefaults",
        "context switches",
        "CPU migrations",
        "minor faults",
        "major faults",
};
236
/* Maps one command-line event name to its packed event config. */
struct event_symbol {
        __u64 event;
        char *symbol;
};

/*
 * Names accepted by -e.  An event may appear more than once under
 * different names (e.g. "cpu-cycles" and "cycles").  match_event_symbols()
 * walks this table in order and returns the first entry whose name is a
 * prefix of the user's string.
 */
static struct event_symbol event_symbols[] = {
        {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),                "cpu-cycles",           },
        {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),                "cycles",               },
        {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),              "instructions",         },
        {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),          "cache-references",     },
        {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),              "cache-misses",         },
        {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),       "branch-instructions",  },
        {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),       "branches",             },
        {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),             "branch-misses",        },
        {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),                "bus-cycles",           },

        {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),                 "cpu-clock",            },
        {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),                "task-clock",           },
        {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),               "page-faults",          },
        {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),               "faults",               },
        {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),           "minor-faults",         },
        {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),           "major-faults",         },
        {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),          "context-switches",     },
        {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),          "cs",                   },
        {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),            "cpu-migrations",       },
        {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),            "migrations",           },
};
264
/*
 * Extract one bit-field from a packed event config: mask it with
 * PERF_COUNTER_<name>_MASK and shift it down by PERF_COUNTER_<name>_SHIFT.
 * The argument is parenthesized so that expressions such as "a | b" are
 * masked as a whole.
 */
#define __PERF_COUNTER_FIELD(config, name) \
        (((config) & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

/* Accessors for the individual fields of a packed event config. */
#define PERF_COUNTER_RAW(config)        __PERF_COUNTER_FIELD(config, RAW)
#define PERF_COUNTER_CONFIG(config)     __PERF_COUNTER_FIELD(config, CONFIG)
#define PERF_COUNTER_TYPE(config)       __PERF_COUNTER_FIELD(config, TYPE)
#define PERF_COUNTER_ID(config)         __PERF_COUNTER_FIELD(config, EVENT)
272
273 static void display_events_help(void)
274 {
275         unsigned int i;
276         __u64 e;
277
278         printf(
279         " -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
280
281         for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
282                 int type, id;
283
284                 e = event_symbols[i].event;
285                 type = PERF_COUNTER_TYPE(e);
286                 id = PERF_COUNTER_ID(e);
287
288                 printf("\n                             %d:%d: %-20s",
289                                 type, id, event_symbols[i].symbol);
290         }
291
292         printf("\n"
293         "                           rNNN: raw PMU events (eventsel+umask)\n\n");
294 }
295
296 static void display_perfstat_help(void)
297 {
298         printf(
299         "Usage: perfstat [<events...>] <cmd...>\n\n"
300         "PerfStat Options (up to %d event types can be specified):\n\n",
301                  MAX_COUNTERS);
302
303         display_events_help();
304
305         printf(
306         " -a                           # system-wide collection\n");
307         exit(0);
308 }
309
310 static void display_help(void)
311 {
312         if (run_perfstat)
313                 return display_perfstat_help();
314
315         printf(
316         "Usage: kerneltop [<options>]\n"
317         "   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
318         "KernelTop Options (up to %d event types can be specified at once):\n\n",
319                  MAX_COUNTERS);
320
321         display_events_help();
322
323         printf(
324         " -S        --stat             # perfstat COMMAND\n"
325         " -a                           # system-wide collection (for perfstat)\n\n"
326         " -c CNT    --count=CNT        # event period to sample\n\n"
327         " -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
328         " -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
329         " -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
330         " -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
331         " -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
332         " -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
333         " -z        --zero             # zero counts after display\n"
334         " -D        --dump_symtab      # dump symbol table to stderr on startup\n"
335         " -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
336         );
337
338         exit(0);
339 }
340
341 static char *event_name(int ctr)
342 {
343         __u64 config = event_id[ctr];
344         int type = PERF_COUNTER_TYPE(config);
345         int id = PERF_COUNTER_ID(config);
346         static char buf[32];
347
348         if (PERF_COUNTER_RAW(config)) {
349                 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
350                 return buf;
351         }
352
353         switch (type) {
354         case PERF_TYPE_HARDWARE:
355                 if (id < PERF_HW_EVENTS_MAX)
356                         return hw_event_names[id];
357                 return "unknown-hardware";
358
359         case PERF_TYPE_SOFTWARE:
360                 if (id < PERF_SW_EVENTS_MAX)
361                         return sw_event_names[id];
362                 return "unknown-software";
363
364         default:
365                 break;
366         }
367
368         return "unknown";
369 }
370
371 /*
372  * Each event can have multiple symbolic names.
373  * Symbolic names are (almost) exactly matched.
374  */
375 static __u64 match_event_symbols(char *str)
376 {
377         __u64 config, id;
378         int type;
379         unsigned int i;
380
381         if (sscanf(str, "r%llx", &config) == 1)
382                 return config | PERF_COUNTER_RAW_MASK;
383
384         if (sscanf(str, "%d:%llu", &type, &id) == 2)
385                 return EID(type, id);
386
387         for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
388                 if (!strncmp(str, event_symbols[i].symbol,
389                              strlen(event_symbols[i].symbol)))
390                         return event_symbols[i].event;
391         }
392
393         return ~0ULL;
394 }
395
396 static int parse_events(char *str)
397 {
398         __u64 config;
399
400 again:
401         if (nr_counters == MAX_COUNTERS)
402                 return -1;
403
404         config = match_event_symbols(str);
405         if (config == ~0ULL)
406                 return -1;
407
408         event_id[nr_counters] = config;
409         nr_counters++;
410
411         str = strstr(str, ",");
412         if (str) {
413                 str++;
414                 goto again;
415         }
416
417         return 0;
418 }
419
420
421 /*
422  * perfstat
423  */
424
425 char fault_here[1000000];
426
427 static void create_perfstat_counter(int counter)
428 {
429         struct perf_counter_hw_event hw_event;
430
431         memset(&hw_event, 0, sizeof(hw_event));
432         hw_event.config         = event_id[counter];
433         hw_event.record_type    = PERF_RECORD_SIMPLE;
434         hw_event.nmi            = 0;
435
436         if (system_wide) {
437                 int cpu;
438                 for (cpu = 0; cpu < nr_cpus; cpu ++) {
439                         fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
440                         if (fd[cpu][counter] < 0) {
441                                 printf("perfstat error: syscall returned with %d (%s)\n",
442                                                 fd[cpu][counter], strerror(errno));
443                                 exit(-1);
444                         }
445                 }
446         } else {
447                 hw_event.inherit        = 1;
448                 hw_event.disabled       = 1;
449
450                 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
451                 if (fd[0][counter] < 0) {
452                         printf("perfstat error: syscall returned with %d (%s)\n",
453                                         fd[0][counter], strerror(errno));
454                         exit(-1);
455                 }
456         }
457 }
458
459 int do_perfstat(int argc, char *argv[])
460 {
461         unsigned long long t0, t1;
462         int counter;
463         ssize_t res;
464         int status;
465         int pid;
466
467         if (!system_wide)
468                 nr_cpus = 1;
469
470         for (counter = 0; counter < nr_counters; counter++)
471                 create_perfstat_counter(counter);
472
473         argc -= optind;
474         argv += optind;
475
476         if (!argc)
477                 display_help();
478
479         /*
480          * Enable counters and exec the command:
481          */
482         t0 = rdclock();
483         prctl(PR_TASK_PERF_COUNTERS_ENABLE);
484
485         if ((pid = fork()) < 0)
486                 perror("failed to fork");
487         if (!pid) {
488                 if (execvp(argv[0], argv)) {
489                         perror(argv[0]);
490                         exit(-1);
491                 }
492         }
493         while (wait(&status) >= 0)
494                 ;
495         prctl(PR_TASK_PERF_COUNTERS_DISABLE);
496         t1 = rdclock();
497
498         fflush(stdout);
499
500         fprintf(stderr, "\n");
501         fprintf(stderr, " Performance counter stats for \'%s\':\n",
502                 argv[0]);
503         fprintf(stderr, "\n");
504
505         for (counter = 0; counter < nr_counters; counter++) {
506                 int cpu;
507                 __u64 count, single_count;
508
509                 count = 0;
510                 for (cpu = 0; cpu < nr_cpus; cpu ++) {
511                         res = read(fd[cpu][counter],
512                                         (char *) &single_count, sizeof(single_count));
513                         assert(res == sizeof(single_count));
514                         count += single_count;
515                 }
516
517                 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
518                     event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
519
520                         double msecs = (double)count / 1000000;
521
522                         fprintf(stderr, " %14.6f  %-20s (msecs)\n",
523                                 msecs, event_name(counter));
524                 } else {
525                         fprintf(stderr, " %14Ld  %-20s (events)\n",
526                                 count, event_name(counter));
527                 }
528         }
529         fprintf(stderr, "\n");
530         fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
531                         (double)(t1-t0)/1e6);
532         fprintf(stderr, "\n");
533
534         return 0;
535 }
536
537 /*
538  * Symbols
539  */
540
/*
 * Address range covered by the symbol table; parse_symbols() sets these
 * to the first and last address of the sorted table.
 */
static uint64_t                 min_ip;
static uint64_t                 max_ip = -1ll;

/* One symbol read from /proc/kallsyms, plus its per-counter statistics. */
struct sym_entry {
        unsigned long long      addr;                   /* address from kallsyms */
        char                    *sym;                   /* symbol name (heap copy, or a literal for the end marker) */
        unsigned long           count[MAX_COUNTERS];    /* counts, one slot per counter */
        int                     skip;                   /* set by read_symbol() for the idle functions it tags */
        struct source_line      *source;                /* objdump line found by lookup_sym_in_vmlinux() */
};

/* Capacity of sym_table[] and tmp[]. */
#define MAX_SYMS                100000

/* Number of valid entries in sym_table[]. */
static int sym_table_count;

/* Table entry whose name equals sym_filter; set by parse_symbols(). */
struct sym_entry                *sym_filter_entry;

static struct sym_entry         sym_table[MAX_SYMS];

/* Defined further down; print_sym_table() calls it for the filtered symbol. */
static void show_details(struct sym_entry *sym);
559
560 static void show_details(struct sym_entry *sym);
561
562 /*
563  * Ordering weight: count-1 * count-2 * ... / count-n
564  */
565 static double sym_weight(const struct sym_entry *sym)
566 {
567         double weight;
568         int counter;
569
570         weight = sym->count[0];
571
572         for (counter = 1; counter < nr_counters-1; counter++)
573                 weight *= sym->count[counter];
574
575         weight /= (sym->count[counter] + 1);
576
577         return weight;
578 }
579
/*
 * qsort() comparator ordering symbols by descending weight.
 *
 * Returns a negative, zero or positive value as qsort() requires; a
 * comparator that only ever returns 0 or 1 reports "equal" for half of
 * the unequal pairs and leaves the resulting order unspecified.
 */
static int compare(const void *__sym1, const void *__sym2)
{
        const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
        double w1 = sym_weight(sym1);
        double w2 = sym_weight(sym2);

        /* Heavier symbols sort first. */
        return (w1 < w2) - (w1 > w2);
}
586
/* Wall-clock time of the last table redraw; set at the end of print_sym_table(). */
static time_t                   last_refresh;
/*
 * Event totals used by print_sym_table() for the rate and kernel-share
 * figures.  NOTE(review): they are updated outside the visible code.
 */
static long                     events;
static long                     userspace_events;
/* Escape sequence written to fd 1 before each table redraw. */
static const char               CONSOLE_CLEAR[] = "\e[H\e[2J";

/* Scratch copy of sym_table[] that print_sym_table() sorts by weight. */
static struct sym_entry         tmp[MAX_SYMS];
593
594 static void print_sym_table(void)
595 {
596         int i, printed;
597         int counter;
598         float events_per_sec = events/delay_secs;
599         float kevents_per_sec = (events-userspace_events)/delay_secs;
600
601         memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
602         qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
603
604         write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
605
606         printf(
607 "------------------------------------------------------------------------------\n");
608         printf( " KernelTop:%8.0f irqs/sec  kernel:%3.1f%% [%s, ",
609                 events_per_sec,
610                 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
611                 nmi ? "NMI" : "IRQ");
612
613         if (nr_counters == 1)
614                 printf("%d ", event_count[0]);
615
616         for (counter = 0; counter < nr_counters; counter++) {
617                 if (counter)
618                         printf("/");
619
620                 printf("%s", event_name(counter));
621         }
622
623         printf( "], ");
624
625         if (tid != -1)
626                 printf(" (tid: %d", tid);
627         else
628                 printf(" (all");
629
630         if (profile_cpu != -1)
631                 printf(", cpu: %d)\n", profile_cpu);
632         else {
633                 if (tid != -1)
634                         printf(")\n");
635                 else
636                         printf(", %d CPUs)\n", nr_cpus);
637         }
638
639         printf("------------------------------------------------------------------------------\n\n");
640
641         if (nr_counters == 1)
642                 printf("             events");
643         else
644                 printf("  weight     events");
645
646         printf("         RIP          kernel function\n"
647                        "  ______     ______   ________________   _______________\n\n"
648         );
649
650         printed = 0;
651         for (i = 0; i < sym_table_count; i++) {
652                 int count;
653
654                 if (nr_counters == 1) {
655                         if (printed <= 18 &&
656                                         tmp[i].count[0] >= count_filter) {
657                                 printf("%19.2f - %016llx : %s\n",
658                                   sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
659                                 printed++;
660                         }
661                 } else {
662                         if (printed <= 18 &&
663                                         tmp[i].count[0] >= count_filter) {
664                                 printf("%8.1f %10ld - %016llx : %s\n",
665                                   sym_weight(tmp + i),
666                                   tmp[i].count[0],
667                                   tmp[i].addr, tmp[i].sym);
668                                 printed++;
669                         }
670                 }
671                 /*
672                  * Add decay to the counts:
673                  */
674                 for (count = 0; count < nr_counters; count++)
675                         sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
676         }
677
678         if (sym_filter_entry)
679                 show_details(sym_filter_entry);
680
681         last_refresh = time(NULL);
682
683         {
684                 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
685
686                 if (poll(&stdin_poll, 1, 0) == 1) {
687                         printf("key pressed - exiting.\n");
688                         exit(0);
689                 }
690         }
691 }
692
693 static int read_symbol(FILE *in, struct sym_entry *s)
694 {
695         static int filter_match = 0;
696         char *sym, stype;
697         char str[500];
698         int rc, pos;
699
700         rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
701         if (rc == EOF)
702                 return -1;
703
704         assert(rc == 3);
705
706         /* skip until end of line: */
707         pos = strlen(str);
708         do {
709                 rc = fgetc(in);
710                 if (rc == '\n' || rc == EOF || pos >= 499)
711                         break;
712                 str[pos] = rc;
713                 pos++;
714         } while (1);
715         str[pos] = 0;
716
717         sym = str;
718
719         /* Filter out known duplicates and non-text symbols. */
720         if (!strcmp(sym, "_text"))
721                 return 1;
722         if (!min_ip && !strcmp(sym, "_stext"))
723                 return 1;
724         if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
725                 return 1;
726         if (stype != 'T' && stype != 't')
727                 return 1;
728         if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
729                 return 1;
730         if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
731                 return 1;
732
733         s->sym = malloc(strlen(str));
734         assert(s->sym);
735
736         strcpy((char *)s->sym, str);
737         s->skip = 0;
738
739         /* Tag events to be skipped. */
740         if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
741                 s->skip = 1;
742         else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
743                 s->skip = 1;
744         else if (!strcmp("mwait_idle", s->sym))
745                 s->skip = 1;
746
747         if (filter_match == 1) {
748                 filter_end = s->addr;
749                 filter_match = -1;
750                 if (filter_end - filter_start > 10000) {
751                         printf("hm, too large filter symbol <%s> - skipping.\n",
752                                 sym_filter);
753                         printf("symbol filter start: %016lx\n", filter_start);
754                         printf("                end: %016lx\n", filter_end);
755                         filter_end = filter_start = 0;
756                         sym_filter = NULL;
757                         sleep(1);
758                 }
759         }
760         if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
761                 filter_match = 1;
762                 filter_start = s->addr;
763         }
764
765         return 0;
766 }
767
768 int compare_addr(const void *__sym1, const void *__sym2)
769 {
770         const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
771
772         return sym1->addr > sym2->addr;
773 }
774
775 static void sort_symbol_table(void)
776 {
777         int i, dups;
778
779         do {
780                 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
781                 for (i = 0, dups = 0; i < sym_table_count; i++) {
782                         if (sym_table[i].addr == sym_table[i+1].addr) {
783                                 sym_table[i+1].addr = -1ll;
784                                 dups++;
785                         }
786                 }
787                 sym_table_count -= dups;
788         } while(dups);
789 }
790
791 static void parse_symbols(void)
792 {
793         struct sym_entry *last;
794
795         FILE *kallsyms = fopen("/proc/kallsyms", "r");
796
797         if (!kallsyms) {
798                 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
799                 exit(-1);
800         }
801
802         while (!feof(kallsyms)) {
803                 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
804                         sym_table_count++;
805                         assert(sym_table_count <= MAX_SYMS);
806                 }
807         }
808
809         sort_symbol_table();
810         min_ip = sym_table[0].addr;
811         max_ip = sym_table[sym_table_count-1].addr;
812         last = sym_table + sym_table_count++;
813
814         last->addr = -1ll;
815         last->sym = "<end>";
816
817         if (filter_end) {
818                 int count;
819                 for (count=0; count < sym_table_count; count ++) {
820                         if (!strcmp(sym_table[count].sym, sym_filter)) {
821                                 sym_filter_entry = &sym_table[count];
822                                 break;
823                         }
824                 }
825         }
826         if (dump_symtab) {
827                 int i;
828
829                 for (i = 0; i < sym_table_count; i++)
830                         fprintf(stderr, "%llx %s\n",
831                                 sym_table[i].addr, sym_table[i].sym);
832         }
833 }
834
835 /*
836  * Source lines
837  */
838
839 static void parse_vmlinux(char *filename)
840 {
841         FILE *file;
842         char command[PATH_MAX*2];
843         if (!filename)
844                 return;
845
846         sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
847
848         file = popen(command, "r");
849         if (!file)
850                 return;
851
852         lines_tail = &lines;
853         while (!feof(file)) {
854                 struct source_line *src;
855                 size_t dummy = 0;
856                 char *c;
857
858                 src = malloc(sizeof(struct source_line));
859                 assert(src != NULL);
860                 memset(src, 0, sizeof(struct source_line));
861
862                 if (getline(&src->line, &dummy, file) < 0)
863                         break;
864                 if (!src->line)
865                         break;
866
867                 c = strchr(src->line, '\n');
868                 if (c)
869                         *c = 0;
870
871                 src->next = NULL;
872                 *lines_tail = src;
873                 lines_tail = &src->next;
874
875                 if (strlen(src->line)>8 && src->line[8] == ':')
876                         src->EIP = strtoull(src->line, NULL, 16);
877                 if (strlen(src->line)>8 && src->line[16] == ':')
878                         src->EIP = strtoull(src->line, NULL, 16);
879         }
880         pclose(file);
881 }
882
883 static void record_precise_ip(uint64_t ip)
884 {
885         struct source_line *line;
886
887         for (line = lines; line; line = line->next) {
888                 if (line->EIP == ip)
889                         line->count++;
890                 if (line->EIP > ip)
891                         break;
892         }
893 }
894
895 static void lookup_sym_in_vmlinux(struct sym_entry *sym)
896 {
897         struct source_line *line;
898         char pattern[PATH_MAX];
899         sprintf(pattern, "<%s>:", sym->sym);
900
901         for (line = lines; line; line = line->next) {
902                 if (strstr(line->line, pattern)) {
903                         sym->source = line;
904                         break;
905                 }
906         }
907 }
908
909 static void show_lines(struct source_line *line_queue, int line_queue_count)
910 {
911         int i;
912         struct source_line *line;
913
914         line = line_queue;
915         for (i = 0; i < line_queue_count; i++) {
916                 printf("%8li\t%s\n", line->count, line->line);
917                 line = line->next;
918         }
919 }
920
921 #define TRACE_COUNT     3
922
923 static void show_details(struct sym_entry *sym)
924 {
925         struct source_line *line;
926         struct source_line *line_queue = NULL;
927         int displayed = 0;
928         int line_queue_count = 0;
929
930         if (!sym->source)
931                 lookup_sym_in_vmlinux(sym);
932         if (!sym->source)
933                 return;
934
935         printf("Showing details for %s\n", sym->sym);
936
937         line = sym->source;
938         while (line) {
939                 if (displayed && strstr(line->line, ">:"))
940                         break;
941
942                 if (!line_queue_count)
943                         line_queue = line;
944                 line_queue_count ++;
945
946                 if (line->count >= count_filter) {
947                         show_lines(line_queue, line_queue_count);
948                         line_queue_count = 0;
949                         line_queue = NULL;
950                 } else if (line_queue_count > TRACE_COUNT) {
951                         line_queue = line_queue->next;
952                         line_queue_count --;
953                 }
954
955                 line->count = 0;
956                 displayed++;
957                 if (displayed > 300)
958                         break;
959                 line = line->next;
960         }
961 }
962
963 /*
964  * Binary search in the histogram table and record the hit:
965  */
966 static void record_ip(uint64_t ip, int counter)
967 {
968         int left_idx, middle_idx, right_idx, idx;
969         unsigned long left, middle, right;
970
971         record_precise_ip(ip);
972
973         left_idx = 0;
974         right_idx = sym_table_count-1;
975         assert(ip <= max_ip && ip >= min_ip);
976
977         while (left_idx + 1 < right_idx) {
978                 middle_idx = (left_idx + right_idx) / 2;
979
980                 left   = sym_table[  left_idx].addr;
981                 middle = sym_table[middle_idx].addr;
982                 right  = sym_table[ right_idx].addr;
983
984                 if (!(left <= middle && middle <= right)) {
985                         printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
986                         printf("%d %d %d\n", left_idx, middle_idx, right_idx);
987                 }
988                 assert(left <= middle && middle <= right);
989                 if (!(left <= ip && ip <= right)) {
990                         printf(" left: %016lx\n", left);
991                         printf("   ip: %016lx\n", (unsigned long)ip);
992                         printf("right: %016lx\n", right);
993                 }
994                 assert(left <= ip && ip <= right);
995                 /*
996                  * [ left .... target .... middle .... right ]
997                  *   => right := middle
998                  */
999                 if (ip < middle) {
1000                         right_idx = middle_idx;
1001                         continue;
1002                 }
1003                 /*
1004                  * [ left .... middle ... target ... right ]
1005                  *   => left := middle
1006                  */
1007                 left_idx = middle_idx;
1008         }
1009
1010         idx = left_idx;
1011
1012         if (!sym_table[idx].skip)
1013                 sym_table[idx].count[counter]++;
1014         else events--;
1015 }
1016
1017 static void process_event(uint64_t ip, int counter)
1018 {
1019         events++;
1020
1021         if (ip < min_ip || ip > max_ip) {
1022                 userspace_events++;
1023                 return;
1024         }
1025
1026         record_ip(ip, counter);
1027 }
1028
1029 static void process_options(int argc, char *argv[])
1030 {
1031         int error = 0, counter;
1032
1033         if (strstr(argv[0], "perfstat"))
1034                 run_perfstat = 1;
1035
1036         for (;;) {
1037                 int option_index = 0;
1038                 /** Options for getopt */
1039                 static struct option long_options[] = {
1040                         {"count",       required_argument,      NULL, 'c'},
1041                         {"cpu",         required_argument,      NULL, 'C'},
1042                         {"delay",       required_argument,      NULL, 'd'},
1043                         {"dump_symtab", no_argument,            NULL, 'D'},
1044                         {"event",       required_argument,      NULL, 'e'},
1045                         {"filter",      required_argument,      NULL, 'f'},
1046                         {"group",       required_argument,      NULL, 'g'},
1047                         {"help",        no_argument,            NULL, 'h'},
1048                         {"nmi",         required_argument,      NULL, 'n'},
1049                         {"pid",         required_argument,      NULL, 'p'},
1050                         {"vmlinux",     required_argument,      NULL, 'x'},
1051                         {"symbol",      required_argument,      NULL, 's'},
1052                         {"stat",        no_argument,            NULL, 'S'},
1053                         {"zero",        no_argument,            NULL, 'z'},
1054                         {"mmap_pages",  required_argument,      NULL, 'm'},
1055                         {NULL,          0,                      NULL,  0 }
1056                 };
1057                 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:z",
1058                                     long_options, &option_index);
1059                 if (c == -1)
1060                         break;
1061
1062                 switch (c) {
1063                 case 'a': system_wide                   =              1; break;
1064                 case 'c': default_interval              =   atoi(optarg); break;
1065                 case 'C':
1066                         /* CPU and PID are mutually exclusive */
1067                         if (tid != -1) {
1068                                 printf("WARNING: CPU switch overriding PID\n");
1069                                 sleep(1);
1070                                 tid = -1;
1071                         }
1072                         profile_cpu                     =   atoi(optarg); break;
1073                 case 'd': delay_secs                    =   atoi(optarg); break;
1074                 case 'D': dump_symtab                   =              1; break;
1075
1076                 case 'e': error                         = parse_events(optarg); break;
1077
1078                 case 'f': count_filter                  =   atoi(optarg); break;
1079                 case 'g': group                         =   atoi(optarg); break;
1080                 case 'h':                                 display_help(); break;
1081                 case 'n': nmi                           =   atoi(optarg); break;
1082                 case 'p':
1083                         /* CPU and PID are mutually exclusive */
1084                         if (profile_cpu != -1) {
1085                                 printf("WARNING: PID switch overriding CPU\n");
1086                                 sleep(1);
1087                                 profile_cpu = -1;
1088                         }
1089                         tid                             =   atoi(optarg); break;
1090                 case 's': sym_filter                    = strdup(optarg); break;
1091                 case 'S': run_perfstat                  =              1; break;
1092                 case 'x': vmlinux                       = strdup(optarg); break;
1093                 case 'z': zero                          =              1; break;
1094                 case 'm': mmap_pages                    =   atoi(optarg); break;
1095                 default: error = 1; break;
1096                 }
1097         }
1098         if (error)
1099                 display_help();
1100
1101         if (!nr_counters) {
1102                 if (run_perfstat)
1103                         nr_counters = 8;
1104                 else {
1105                         nr_counters = 1;
1106                         event_id[0] = 0;
1107                 }
1108         }
1109
1110         for (counter = 0; counter < nr_counters; counter++) {
1111                 if (event_count[counter])
1112                         continue;
1113
1114                 event_count[counter] = default_interval;
1115         }
1116 }
1117
/*
 * Reader-side state for one mmap()ed counter buffer.
 */
struct mmap_data {
        int counter;            /* counter index passed on to process_event() */
        void *base;             /* start of the mapping; event data begins one page in */
        unsigned int mask;      /* data area size minus one, used to wrap offsets */
        unsigned int prev;      /* offset up to which events were already consumed */
};
1124
1125 static unsigned int mmap_read_head(struct mmap_data *md)
1126 {
1127         struct perf_counter_mmap_page *pc = md->base;
1128         unsigned int seq, head;
1129
1130 repeat:
1131         rmb();
1132         seq = pc->lock;
1133
1134         if (unlikely(seq & 1)) {
1135                 cpu_relax();
1136                 goto repeat;
1137         }
1138
1139         head = pc->data_head;
1140
1141         rmb();
1142         if (pc->lock != seq)
1143                 goto repeat;
1144
1145         return head;
1146 }
1147
1148 struct timeval last_read, this_read;
1149
1150 static void mmap_read(struct mmap_data *md)
1151 {
1152         unsigned int head = mmap_read_head(md);
1153         unsigned int old = md->prev;
1154         unsigned char *data = md->base + page_size;
1155         int diff;
1156
1157         gettimeofday(&this_read, NULL);
1158
1159         /*
1160          * If we're further behind than half the buffer, there's a chance
1161          * the writer will bite our tail and screw up the events under us.
1162          *
1163          * If we somehow ended up ahead of the head, we got messed up.
1164          *
1165          * In either case, truncate and restart at head.
1166          */
1167         diff = head - old;
1168         if (diff > md->mask / 2 || diff < 0) {
1169                 struct timeval iv;
1170                 unsigned long msecs;
1171
1172                 timersub(&this_read, &last_read, &iv);
1173                 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
1174
1175                 fprintf(stderr, "WARNING: failed to keep up with mmap data."
1176                                 "  Last read %lu msecs ago.\n", msecs);
1177
1178                 /*
1179                  * head points to a known good entry, start there.
1180                  */
1181                 old = head;
1182         }
1183
1184         last_read = this_read;
1185
1186         for (; old != head;) {
1187                 struct event_struct {
1188                         struct perf_event_header header;
1189                         __u64 ip;
1190                         __u32 pid, tid;
1191                 } *event = (struct event_struct *)&data[old & md->mask];
1192                 struct event_struct event_copy;
1193
1194                 unsigned int size = event->header.size;
1195
1196                 /*
1197                  * Event straddles the mmap boundary -- header should always
1198                  * be inside due to u64 alignment of output.
1199                  */
1200                 if ((old & md->mask) + size != ((old + size) & md->mask)) {
1201                         unsigned int offset = old;
1202                         unsigned int len = sizeof(*event), cpy;
1203                         void *dst = &event_copy;
1204
1205                         do {
1206                                 cpy = min(md->mask + 1 - (offset & md->mask), len);
1207                                 memcpy(dst, &data[offset & md->mask], cpy);
1208                                 offset += cpy;
1209                                 dst += cpy;
1210                                 len -= cpy;
1211                         } while (len);
1212
1213                         event = &event_copy;
1214                 }
1215
1216                 old += size;
1217
1218                 switch (event->header.type) {
1219                 case PERF_EVENT_IP:
1220                 case PERF_EVENT_IP | __PERF_EVENT_TID:
1221                         process_event(event->ip, md->counter);
1222                         break;
1223                 }
1224         }
1225
1226         md->prev = old;
1227 }
1228
1229 int main(int argc, char *argv[])
1230 {
1231         struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
1232         struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1233         struct perf_counter_hw_event hw_event;
1234         int i, counter, group_fd, nr_poll = 0;
1235         unsigned int cpu;
1236         int ret;
1237
1238         page_size = sysconf(_SC_PAGE_SIZE);
1239
1240         process_options(argc, argv);
1241
1242         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1243         assert(nr_cpus <= MAX_NR_CPUS);
1244         assert(nr_cpus >= 0);
1245
1246         if (run_perfstat)
1247                 return do_perfstat(argc, argv);
1248
1249         if (tid != -1 || profile_cpu != -1)
1250                 nr_cpus = 1;
1251
1252         parse_symbols();
1253         if (vmlinux && sym_filter_entry)
1254                 parse_vmlinux(vmlinux);
1255
1256         for (i = 0; i < nr_cpus; i++) {
1257                 group_fd = -1;
1258                 for (counter = 0; counter < nr_counters; counter++) {
1259
1260                         cpu     = profile_cpu;
1261                         if (tid == -1 && profile_cpu == -1)
1262                                 cpu = i;
1263
1264                         memset(&hw_event, 0, sizeof(hw_event));
1265                         hw_event.config         = event_id[counter];
1266                         hw_event.irq_period     = event_count[counter];
1267                         hw_event.record_type    = PERF_RECORD_IRQ;
1268                         hw_event.nmi            = nmi;
1269                         hw_event.include_tid    = 1;
1270
1271                         fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1272                         if (fd[i][counter] < 0) {
1273                                 int err = errno;
1274                                 printf("kerneltop error: syscall returned with %d (%s)\n",
1275                                         fd[i][counter], strerror(err));
1276                                 if (err == EPERM)
1277                                         printf("Are you root?\n");
1278                                 exit(-1);
1279                         }
1280                         assert(fd[i][counter] >= 0);
1281                         fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1282
1283                         /*
1284                          * First counter acts as the group leader:
1285                          */
1286                         if (group && group_fd == -1)
1287                                 group_fd = fd[i][counter];
1288
1289                         event_array[nr_poll].fd = fd[i][counter];
1290                         event_array[nr_poll].events = POLLIN;
1291                         nr_poll++;
1292
1293                         mmap_array[i][counter].counter = counter;
1294                         mmap_array[i][counter].prev = 0;
1295                         mmap_array[i][counter].mask = mmap_pages*page_size - 1;
1296                         mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
1297                                         PROT_READ, MAP_SHARED, fd[i][counter], 0);
1298                         if (mmap_array[i][counter].base == MAP_FAILED) {
1299                                 printf("kerneltop error: failed to mmap with %d (%s)\n",
1300                                                 errno, strerror(errno));
1301                                 exit(-1);
1302                         }
1303                 }
1304         }
1305
1306         printf("KernelTop refresh period: %d seconds\n", delay_secs);
1307         last_refresh = time(NULL);
1308
1309         while (1) {
1310                 int hits = events;
1311
1312                 for (i = 0; i < nr_cpus; i++) {
1313                         for (counter = 0; counter < nr_counters; counter++)
1314                                 mmap_read(&mmap_array[i][counter]);
1315                 }
1316
1317                 if (time(NULL) >= last_refresh + delay_secs) {
1318                         print_sym_table();
1319                         events = userspace_events = 0;
1320                 }
1321
1322                 if (hits == events)
1323                         ret = poll(event_array, nr_poll, 1000);
1324                 hits = events;
1325         }
1326
1327         return 0;
1328 }