perf_counter tools: remove glib dependency and fix bugs in kerneltop.c
[linux-2.6] / Documentation / perf_counter / kerneltop.c
1 /*
2  * kerneltop.c: show top kernel functions - performance counters showcase
3
4    Build with:
5
6      cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8    Sample output:
9
10 ------------------------------------------------------------------------------
11  KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
12 ------------------------------------------------------------------------------
13
14              weight         RIP          kernel function
15              ______   ________________   _______________
16
17               35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18               33.00 - ffffffff804cb740 : sock_alloc_send_skb
19               31.26 - ffffffff804ce808 : skb_push
20               22.43 - ffffffff80510004 : tcp_established_options
21               19.00 - ffffffff8027d250 : find_get_page
22               15.76 - ffffffff804e4fc9 : eth_type_trans
23               15.20 - ffffffff804d8baa : dst_release
24               14.86 - ffffffff804cf5d8 : skb_release_head_state
25               14.00 - ffffffff802217d5 : read_hpet
26               12.00 - ffffffff804ffb7f : __ip_local_out
27               11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28                8.54 - ffffffff805001a3 : ip_queue_xmit
29  */
30
31 /*
32  * perfstat:  /usr/bin/time -alike performance counter statistics utility
33
34           It summarizes the counter events of all tasks (and child tasks),
35           covering all CPUs that the command (or workload) executes on.
36           It only counts the per-task events of the workload started,
37           independent of how many other tasks run on those CPUs.
38
39    Sample output:
40
41    $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43    Performance counter stats for 'ls':
44
45            163516953 instructions
46                 2295 cache-misses
47              2855182 branch-misses
48  */
49
50  /*
51   * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52   *
53   * Improvements and fixes by:
54   *
55   *   Arjan van de Ven <arjan@linux.intel.com>
56   *   Yanmin Zhang <yanmin.zhang@intel.com>
57   *   Wu Fengguang <fengguang.wu@intel.com>
58   *   Mike Galbraith <efault@gmx.de>
59   *   Paul Mackerras <paulus@samba.org>
60   *
61   * Released under the GPL v2. (and only v2, not any later version)
62   */
63
64 #define _GNU_SOURCE
65 #include <sys/types.h>
66 #include <sys/stat.h>
67 #include <sys/time.h>
68 #include <unistd.h>
69 #include <stdint.h>
70 #include <stdlib.h>
71 #include <string.h>
72 #include <limits.h>
73 #include <getopt.h>
74 #include <assert.h>
75 #include <fcntl.h>
76 #include <stdio.h>
77 #include <errno.h>
78 #include <ctype.h>
79 #include <time.h>
80
81 #include <sys/syscall.h>
82 #include <sys/ioctl.h>
83 #include <sys/poll.h>
84 #include <sys/prctl.h>
85 #include <sys/wait.h>
86 #include <sys/uio.h>
87 #include <sys/mman.h>
88
89 #include <linux/unistd.h>
90 #include <linux/types.h>
91
92 #include "../../include/linux/perf_counter.h"
93
94
95 /*
96  * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
97  * counters in the current task.
98  */
99 #define PR_TASK_PERF_COUNTERS_DISABLE   31
100 #define PR_TASK_PERF_COUNTERS_ENABLE    32
101
102 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
103
104 #define rdclock()                                       \
105 ({                                                      \
106         struct timespec ts;                             \
107                                                         \
108         clock_gettime(CLOCK_MONOTONIC, &ts);            \
109         ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
110 })
111
112 /*
113  * Pick up some kernel type conventions:
114  */
115 #define __user
116 #define asmlinkage
117
/*
 * Per-architecture glue.  Each supported architecture provides:
 *
 *   __NR_perf_counter_open - the system call number used by
 *                            sys_perf_counter_open()
 *   rmb()                  - an asm statement ("lfence" or "sync") with a
 *                            "memory" clobber
 *   cpu_relax()            - an asm statement ("rep; nop", or empty on
 *                            powerpc) with a "memory" clobber
 *
 * If none of the three architecture macros below is defined, none of these
 * names is provided by this section.
 *
 * NOTE(review): every cpu_relax() definition carries a trailing ';' inside
 * the macro body, so "cpu_relax();" expands to two statements.  That is
 * harmless as a plain statement but breaks an unbraced if/else.  Left
 * unchanged here because users of the macro are outside this section.
 */
#ifdef __x86_64__
#define __NR_perf_counter_open 295
#define rmb()           asm volatile("lfence" ::: "memory")
#define cpu_relax()     asm volatile("rep; nop" ::: "memory");
#endif

#ifdef __i386__
#define __NR_perf_counter_open 333
#define rmb()           asm volatile("lfence" ::: "memory")
#define cpu_relax()     asm volatile("rep; nop" ::: "memory");
#endif

#ifdef __powerpc__
#define __NR_perf_counter_open 319
#define rmb()           asm volatile ("sync" ::: "memory")
#define cpu_relax()     asm volatile ("" ::: "memory");
#endif
135
136 #define unlikely(x)     __builtin_expect(!!(x), 0)
137
138 asmlinkage int sys_perf_counter_open(
139         struct perf_counter_hw_event    *hw_event_uptr          __user,
140         pid_t                           pid,
141         int                             cpu,
142         int                             group_fd,
143         unsigned long                   flags)
144 {
145         return syscall(
146                 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
147 }
148
149 #define MAX_COUNTERS                    64
150 #define MAX_NR_CPUS                     256
151
152 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
153
154 static int                      run_perfstat                    =  0;
155 static int                      system_wide                     =  0;
156
157 static int                      nr_counters                     =  0;
158 static __u64                    event_id[MAX_COUNTERS]          = {
159         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
160         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
161         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
162         EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
163
164         EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
165         EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
166         EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
167         EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
168 };
169 static int                      default_interval = 100000;
170 static int                      event_count[MAX_COUNTERS];
171 static int                      fd[MAX_NR_CPUS][MAX_COUNTERS];
172
173 static __u64                    count_filter                   = 100;
174
175 static int                      tid                             = -1;
176 static int                      profile_cpu                     = -1;
177 static int                      nr_cpus                         =  0;
178 static int                      nmi                             =  1;
179 static int                      group                           =  0;
180 static unsigned int             page_size;
181
182 static char                     *vmlinux;
183
184 static char                     *sym_filter;
185 static unsigned long            filter_start;
186 static unsigned long            filter_end;
187
188 static int                      delay_secs                      =  2;
189 static int                      zero;
190 static int                      dump_symtab;
191
/*
 * One line of objdump output for the annotated symbol, kept in a singly
 * linked list headed by 'lines' (built in parse_vmlinux()).
 */
struct source_line {
	uint64_t		EIP;	/* address parsed from the line by parse_vmlinux(); stays 0 if none was parsed */
	unsigned long		count;	/* hits, bumped by record_precise_ip(), cleared by show_details() */
	char			*line;	/* text of the line as returned by getline(), newline stripped */
	struct source_line	*next;	/* next line in the list, NULL at the end */
};
198
199 static struct source_line       *lines;
200 static struct source_line       **lines_tail;
201
202 const unsigned int default_count[] = {
203         1000000,
204         1000000,
205           10000,
206           10000,
207         1000000,
208           10000,
209 };
210
/*
 * Human-readable names for hardware events.  event_name() indexes this
 * table directly with the event id extracted from the counter config.
 */
static char *hw_event_names[] = {
	"CPU cycles",
	"instructions",
	"cache references",
	"cache misses",
	"branches",
	"branch misses",
	"bus cycles",
};

/*
 * Human-readable names for software events, indexed by event id in
 * event_name() in the same way as hw_event_names[].
 */
static char *sw_event_names[] = {
	"cpu clock ticks",
	"task clock ticks",
	"pagefaults",
	"context switches",
	"CPU migrations",
	"minor faults",
	"major faults",
};
230
/*
 * Maps a symbolic event name accepted on the command line to its encoded
 * event value (see the EID() macro and match_event_symbols()).
 */
struct event_symbol {
	__u64 event;	/* EID(type, id) encoded event */
	char *symbol;	/* symbolic name compared against user input */
};
235
236 static struct event_symbol event_symbols[] = {
237         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),                "cpu-cycles",           },
238         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),                "cycles",               },
239         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),              "instructions",         },
240         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),          "cache-references",     },
241         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),              "cache-misses",         },
242         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),       "branch-instructions",  },
243         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),       "branches",             },
244         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),             "branch-misses",        },
245         {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),                "bus-cycles",           },
246
247         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),                 "cpu-clock",            },
248         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),                "task-clock",           },
249         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),               "page-faults",          },
250         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),               "faults",               },
251         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),           "minor-faults",         },
252         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),           "major-faults",         },
253         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),          "context-switches",     },
254         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),          "cs",                   },
255         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),            "cpu-migrations",       },
256         {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),            "migrations",           },
257 };
258
/*
 * Extract one bit-field from a counter config value: mask it with
 * PERF_COUNTER_<name>_MASK and shift it down by PERF_COUNTER_<name>_SHIFT.
 */
#define __PERF_COUNTER_FIELD(config, name) \
	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

/* Accessors for the RAW, CONFIG, TYPE and EVENT fields of a config value. */
#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
266
267 static void display_events_help(void)
268 {
269         unsigned int i;
270         __u64 e;
271
272         printf(
273         " -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
274
275         for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
276                 int type, id;
277
278                 e = event_symbols[i].event;
279                 type = PERF_COUNTER_TYPE(e);
280                 id = PERF_COUNTER_ID(e);
281
282                 printf("\n                             %d:%d: %-20s",
283                                 type, id, event_symbols[i].symbol);
284         }
285
286         printf("\n"
287         "                           rNNN: raw PMU events (eventsel+umask)\n\n");
288 }
289
290 static void display_perfstat_help(void)
291 {
292         printf(
293         "Usage: perfstat [<events...>] <cmd...>\n\n"
294         "PerfStat Options (up to %d event types can be specified):\n\n",
295                  MAX_COUNTERS);
296
297         display_events_help();
298
299         printf(
300         " -a                           # system-wide collection\n");
301         exit(0);
302 }
303
304 static void display_help(void)
305 {
306         if (run_perfstat)
307                 return display_perfstat_help();
308
309         printf(
310         "Usage: kerneltop [<options>]\n"
311         "   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
312         "KernelTop Options (up to %d event types can be specified at once):\n\n",
313                  MAX_COUNTERS);
314
315         display_events_help();
316
317         printf(
318         " -S        --stat             # perfstat COMMAND\n"
319         " -a                           # system-wide collection (for perfstat)\n\n"
320         " -c CNT    --count=CNT        # event period to sample\n\n"
321         " -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
322         " -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
323         " -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
324         " -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
325         " -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
326         " -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
327         " -z        --zero             # zero counts after display\n"
328         " -D        --dump_symtab      # dump symbol table to stderr on startup\n"
329         );
330
331         exit(0);
332 }
333
334 static char *event_name(int ctr)
335 {
336         __u64 config = event_id[ctr];
337         int type = PERF_COUNTER_TYPE(config);
338         int id = PERF_COUNTER_ID(config);
339         static char buf[32];
340
341         if (PERF_COUNTER_RAW(config)) {
342                 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
343                 return buf;
344         }
345
346         switch (type) {
347         case PERF_TYPE_HARDWARE:
348                 if (id < PERF_HW_EVENTS_MAX)
349                         return hw_event_names[id];
350                 return "unknown-hardware";
351
352         case PERF_TYPE_SOFTWARE:
353                 if (id < PERF_SW_EVENTS_MAX)
354                         return sw_event_names[id];
355                 return "unknown-software";
356
357         default:
358                 break;
359         }
360
361         return "unknown";
362 }
363
364 /*
365  * Each event can have multiple symbolic names.
366  * Symbolic names are (almost) exactly matched.
367  */
368 static __u64 match_event_symbols(char *str)
369 {
370         __u64 config, id;
371         int type;
372         unsigned int i;
373
374         if (sscanf(str, "r%llx", &config) == 1)
375                 return config | PERF_COUNTER_RAW_MASK;
376
377         if (sscanf(str, "%d:%llu", &type, &id) == 2)
378                 return EID(type, id);
379
380         for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
381                 if (!strncmp(str, event_symbols[i].symbol,
382                              strlen(event_symbols[i].symbol)))
383                         return event_symbols[i].event;
384         }
385
386         return ~0ULL;
387 }
388
389 static int parse_events(char *str)
390 {
391         __u64 config;
392
393 again:
394         if (nr_counters == MAX_COUNTERS)
395                 return -1;
396
397         config = match_event_symbols(str);
398         if (config == ~0ULL)
399                 return -1;
400
401         event_id[nr_counters] = config;
402         nr_counters++;
403
404         str = strstr(str, ",");
405         if (str) {
406                 str++;
407                 goto again;
408         }
409
410         return 0;
411 }
412
413
414 /*
415  * perfstat
416  */
417
418 char fault_here[1000000];
419
420 static void create_perfstat_counter(int counter)
421 {
422         struct perf_counter_hw_event hw_event;
423
424         memset(&hw_event, 0, sizeof(hw_event));
425         hw_event.config         = event_id[counter];
426         hw_event.record_type    = PERF_RECORD_SIMPLE;
427         hw_event.nmi            = 0;
428
429         if (system_wide) {
430                 int cpu;
431                 for (cpu = 0; cpu < nr_cpus; cpu ++) {
432                         fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
433                         if (fd[cpu][counter] < 0) {
434                                 printf("perfstat error: syscall returned with %d (%s)\n",
435                                                 fd[cpu][counter], strerror(errno));
436                                 exit(-1);
437                         }
438                 }
439         } else {
440                 hw_event.inherit        = 1;
441                 hw_event.disabled       = 1;
442
443                 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
444                 if (fd[0][counter] < 0) {
445                         printf("perfstat error: syscall returned with %d (%s)\n",
446                                         fd[0][counter], strerror(errno));
447                         exit(-1);
448                 }
449         }
450 }
451
452 int do_perfstat(int argc, char *argv[])
453 {
454         unsigned long long t0, t1;
455         int counter;
456         ssize_t res;
457         int status;
458         int pid;
459
460         if (!system_wide)
461                 nr_cpus = 1;
462
463         for (counter = 0; counter < nr_counters; counter++)
464                 create_perfstat_counter(counter);
465
466         argc -= optind;
467         argv += optind;
468
469         if (!argc)
470                 display_help();
471
472         /*
473          * Enable counters and exec the command:
474          */
475         t0 = rdclock();
476         prctl(PR_TASK_PERF_COUNTERS_ENABLE);
477
478         if ((pid = fork()) < 0)
479                 perror("failed to fork");
480         if (!pid) {
481                 if (execvp(argv[0], argv)) {
482                         perror(argv[0]);
483                         exit(-1);
484                 }
485         }
486         while (wait(&status) >= 0)
487                 ;
488         prctl(PR_TASK_PERF_COUNTERS_DISABLE);
489         t1 = rdclock();
490
491         fflush(stdout);
492
493         fprintf(stderr, "\n");
494         fprintf(stderr, " Performance counter stats for \'%s\':\n",
495                 argv[0]);
496         fprintf(stderr, "\n");
497
498         for (counter = 0; counter < nr_counters; counter++) {
499                 int cpu;
500                 __u64 count, single_count;
501
502                 count = 0;
503                 for (cpu = 0; cpu < nr_cpus; cpu ++) {
504                         res = read(fd[cpu][counter],
505                                         (char *) &single_count, sizeof(single_count));
506                         assert(res == sizeof(single_count));
507                         count += single_count;
508                 }
509
510                 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
511                     event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
512
513                         double msecs = (double)count / 1000000;
514
515                         fprintf(stderr, " %14.6f  %-20s (msecs)\n",
516                                 msecs, event_name(counter));
517                 } else {
518                         fprintf(stderr, " %14Ld  %-20s (events)\n",
519                                 count, event_name(counter));
520                 }
521         }
522         fprintf(stderr, "\n");
523         fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
524                         (double)(t1-t0)/1e6);
525         fprintf(stderr, "\n");
526
527         return 0;
528 }
529
530 /*
531  * Symbols
532  */
533
534 static uint64_t                 min_ip;
535 static uint64_t                 max_ip = -1ll;
536
/*
 * One kernel text symbol read from /proc/kallsyms, together with its
 * per-counter hit histogram.
 */
struct sym_entry {
	unsigned long long	addr;			/* symbol address as parsed by read_symbol() */
	char			*sym;			/* symbol name (allocated in read_symbol()) */
	unsigned long		count[MAX_COUNTERS];	/* hits per counter; decayed or zeroed in print_sym_table() */
	int			skip;			/* set to 1 by read_symbol() for the idle functions */
	struct source_line	*source;		/* first matching objdump line, set by lookup_sym_in_vmlinux() */
};
544
545 #define MAX_SYMS                100000
546
547 static int sym_table_count;
548
549 struct sym_entry                *sym_filter_entry;
550
551 static struct sym_entry         sym_table[MAX_SYMS];
552
553 static void show_details(struct sym_entry *sym);
554
555 /*
556  * Ordering weight: count-1 * count-2 * ... / count-n
557  */
558 static double sym_weight(const struct sym_entry *sym)
559 {
560         double weight;
561         int counter;
562
563         weight = sym->count[0];
564
565         for (counter = 1; counter < nr_counters-1; counter++)
566                 weight *= sym->count[counter];
567
568         weight /= (sym->count[counter] + 1);
569
570         return weight;
571 }
572
/*
 * qsort() comparator ordering sym_entry records by descending weight.
 *
 * Returns a negative value if sym1 is heavier than sym2 (sorts first),
 * a positive value if it is lighter, and 0 on equal weight.  qsort()
 * requires such a consistent three-way result; returning only 0 or 1
 * makes compare(a, b) and compare(b, a) disagree and leaves the
 * resulting order unspecified.
 */
static int compare(const void *__sym1, const void *__sym2)
{
	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
	double w1 = sym_weight(sym1);
	double w2 = sym_weight(sym2);

	return (w1 < w2) - (w1 > w2);
}
579
580 static time_t                   last_refresh;
581 static long                     events;
582 static long                     userspace_events;
583 static const char               CONSOLE_CLEAR[] = "\e[H\e[2J";
584
585 static struct sym_entry         tmp[MAX_SYMS];
586
587 static void print_sym_table(void)
588 {
589         int i, printed;
590         int counter;
591         float events_per_sec = events/delay_secs;
592         float kevents_per_sec = (events-userspace_events)/delay_secs;
593
594         memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
595         qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
596
597         write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
598
599         printf(
600 "------------------------------------------------------------------------------\n");
601         printf( " KernelTop:%8.0f irqs/sec  kernel:%3.1f%% [%s, ",
602                 events_per_sec,
603                 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
604                 nmi ? "NMI" : "IRQ");
605
606         if (nr_counters == 1)
607                 printf("%d ", event_count[0]);
608
609         for (counter = 0; counter < nr_counters; counter++) {
610                 if (counter)
611                         printf("/");
612
613                 printf("%s", event_name(counter));
614         }
615
616         printf( "], ");
617
618         if (tid != -1)
619                 printf(" (tid: %d", tid);
620         else
621                 printf(" (all");
622
623         if (profile_cpu != -1)
624                 printf(", cpu: %d)\n", profile_cpu);
625         else {
626                 if (tid != -1)
627                         printf(")\n");
628                 else
629                         printf(", %d CPUs)\n", nr_cpus);
630         }
631
632         printf("------------------------------------------------------------------------------\n\n");
633
634         if (nr_counters == 1)
635                 printf("             events");
636         else
637                 printf("  weight     events");
638
639         printf("         RIP          kernel function\n"
640                        "  ______     ______   ________________   _______________\n\n"
641         );
642
643         printed = 0;
644         for (i = 0; i < sym_table_count; i++) {
645                 int count;
646
647                 if (nr_counters == 1) {
648                         if (printed <= 18 &&
649                                         tmp[i].count[0] >= count_filter) {
650                                 printf("%19.2f - %016llx : %s\n",
651                                   sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
652                                 printed++;
653                         }
654                 } else {
655                         if (printed <= 18 &&
656                                         tmp[i].count[0] >= count_filter) {
657                                 printf("%8.1f %10ld - %016llx : %s\n",
658                                   sym_weight(tmp + i),
659                                   tmp[i].count[0],
660                                   tmp[i].addr, tmp[i].sym);
661                                 printed++;
662                         }
663                 }
664                 /*
665                  * Add decay to the counts:
666                  */
667                 for (count = 0; count < nr_counters; count++)
668                         sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
669         }
670
671         if (sym_filter_entry)
672                 show_details(sym_filter_entry);
673
674         last_refresh = time(NULL);
675
676         {
677                 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
678
679                 if (poll(&stdin_poll, 1, 0) == 1) {
680                         printf("key pressed - exiting.\n");
681                         exit(0);
682                 }
683         }
684 }
685
686 static int read_symbol(FILE *in, struct sym_entry *s)
687 {
688         static int filter_match = 0;
689         char *sym, stype;
690         char str[500];
691         int rc, pos;
692
693         rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
694         if (rc == EOF)
695                 return -1;
696
697         assert(rc == 3);
698
699         /* skip until end of line: */
700         pos = strlen(str);
701         do {
702                 rc = fgetc(in);
703                 if (rc == '\n' || rc == EOF || pos >= 499)
704                         break;
705                 str[pos] = rc;
706                 pos++;
707         } while (1);
708         str[pos] = 0;
709
710         sym = str;
711
712         /* Filter out known duplicates and non-text symbols. */
713         if (!strcmp(sym, "_text"))
714                 return 1;
715         if (!min_ip && !strcmp(sym, "_stext"))
716                 return 1;
717         if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
718                 return 1;
719         if (stype != 'T' && stype != 't')
720                 return 1;
721         if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
722                 return 1;
723         if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
724                 return 1;
725
726         s->sym = malloc(strlen(str));
727         assert(s->sym);
728
729         strcpy((char *)s->sym, str);
730         s->skip = 0;
731
732         /* Tag events to be skipped. */
733         if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
734                 s->skip = 1;
735         if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
736                 s->skip = 1;
737
738         if (filter_match == 1) {
739                 filter_end = s->addr;
740                 filter_match = -1;
741                 if (filter_end - filter_start > 10000) {
742                         printf("hm, too large filter symbol <%s> - skipping.\n",
743                                 sym_filter);
744                         printf("symbol filter start: %016lx\n", filter_start);
745                         printf("                end: %016lx\n", filter_end);
746                         filter_end = filter_start = 0;
747                         sym_filter = NULL;
748                         sleep(1);
749                 }
750         }
751         if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
752                 filter_match = 1;
753                 filter_start = s->addr;
754         }
755
756         return 0;
757 }
758
759 int compare_addr(const void *__sym1, const void *__sym2)
760 {
761         const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
762
763         return sym1->addr > sym2->addr;
764 }
765
766 static void sort_symbol_table(void)
767 {
768         int i, dups;
769
770         do {
771                 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
772                 for (i = 0, dups = 0; i < sym_table_count; i++) {
773                         if (sym_table[i].addr == sym_table[i+1].addr) {
774                                 sym_table[i+1].addr = -1ll;
775                                 dups++;
776                         }
777                 }
778                 sym_table_count -= dups;
779         } while(dups);
780 }
781
782 static void parse_symbols(void)
783 {
784         struct sym_entry *last;
785
786         FILE *kallsyms = fopen("/proc/kallsyms", "r");
787
788         if (!kallsyms) {
789                 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
790                 exit(-1);
791         }
792
793         while (!feof(kallsyms)) {
794                 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
795                         sym_table_count++;
796                         assert(sym_table_count <= MAX_SYMS);
797                 }
798         }
799
800         sort_symbol_table();
801         min_ip = sym_table[0].addr;
802         max_ip = sym_table[sym_table_count-1].addr;
803         last = sym_table + sym_table_count++;
804
805         last->addr = -1ll;
806         last->sym = "<end>";
807
808         if (filter_end) {
809                 int count;
810                 for (count=0; count < sym_table_count; count ++) {
811                         if (!strcmp(sym_table[count].sym, sym_filter)) {
812                                 sym_filter_entry = &sym_table[count];
813                                 break;
814                         }
815                 }
816         }
817         if (dump_symtab) {
818                 int i;
819
820                 for (i = 0; i < sym_table_count; i++)
821                         fprintf(stderr, "%llx %s\n",
822                                 sym_table[i].addr, sym_table[i].sym);
823         }
824 }
825
826 /*
827  * Source lines
828  */
829
830 static void parse_vmlinux(char *filename)
831 {
832         FILE *file;
833         char command[PATH_MAX*2];
834         if (!filename)
835                 return;
836
837         sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
838
839         file = popen(command, "r");
840         if (!file)
841                 return;
842
843         lines_tail = &lines;
844         while (!feof(file)) {
845                 struct source_line *src;
846                 size_t dummy = 0;
847                 char *c;
848
849                 src = malloc(sizeof(struct source_line));
850                 assert(src != NULL);
851                 memset(src, 0, sizeof(struct source_line));
852
853                 if (getline(&src->line, &dummy, file) < 0)
854                         break;
855                 if (!src->line)
856                         break;
857
858                 c = strchr(src->line, '\n');
859                 if (c)
860                         *c = 0;
861
862                 src->next = NULL;
863                 *lines_tail = src;
864                 lines_tail = &src->next;
865
866                 if (strlen(src->line)>8 && src->line[8] == ':')
867                         src->EIP = strtoull(src->line, NULL, 16);
868                 if (strlen(src->line)>8 && src->line[16] == ':')
869                         src->EIP = strtoull(src->line, NULL, 16);
870         }
871         pclose(file);
872 }
873
874 static void record_precise_ip(uint64_t ip)
875 {
876         struct source_line *line;
877
878         for (line = lines; line; line = line->next) {
879                 if (line->EIP == ip)
880                         line->count++;
881                 if (line->EIP > ip)
882                         break;
883         }
884 }
885
886 static void lookup_sym_in_vmlinux(struct sym_entry *sym)
887 {
888         struct source_line *line;
889         char pattern[PATH_MAX];
890         sprintf(pattern, "<%s>:", sym->sym);
891
892         for (line = lines; line; line = line->next) {
893                 if (strstr(line->line, pattern)) {
894                         sym->source = line;
895                         break;
896                 }
897         }
898 }
899
900 static void show_lines(struct source_line *line_queue, int line_queue_count)
901 {
902         int i;
903         struct source_line *line;
904
905         line = line_queue;
906         for (i = 0; i < line_queue_count; i++) {
907                 printf("%8li\t%s\n", line->count, line->line);
908                 line = line->next;
909         }
910 }
911
912 #define TRACE_COUNT     3
913
914 static void show_details(struct sym_entry *sym)
915 {
916         struct source_line *line;
917         struct source_line *line_queue = NULL;
918         int displayed = 0;
919         int line_queue_count = 0;
920
921         if (!sym->source)
922                 lookup_sym_in_vmlinux(sym);
923         if (!sym->source)
924                 return;
925
926         printf("Showing details for %s\n", sym->sym);
927
928         line = sym->source;
929         while (line) {
930                 if (displayed && strstr(line->line, ">:"))
931                         break;
932
933                 if (!line_queue_count)
934                         line_queue = line;
935                 line_queue_count ++;
936
937                 if (line->count >= count_filter) {
938                         show_lines(line_queue, line_queue_count);
939                         line_queue_count = 0;
940                         line_queue = NULL;
941                 } else if (line_queue_count > TRACE_COUNT) {
942                         line_queue = line_queue->next;
943                         line_queue_count --;
944                 }
945
946                 line->count = 0;
947                 displayed++;
948                 if (displayed > 300)
949                         break;
950                 line = line->next;
951         }
952 }
953
954 /*
955  * Binary search in the histogram table and record the hit:
956  */
957 static void record_ip(uint64_t ip, int counter)
958 {
959         int left_idx, middle_idx, right_idx, idx;
960         unsigned long left, middle, right;
961
962         record_precise_ip(ip);
963
964         left_idx = 0;
965         right_idx = sym_table_count-1;
966         assert(ip <= max_ip && ip >= min_ip);
967
968         while (left_idx + 1 < right_idx) {
969                 middle_idx = (left_idx + right_idx) / 2;
970
971                 left   = sym_table[  left_idx].addr;
972                 middle = sym_table[middle_idx].addr;
973                 right  = sym_table[ right_idx].addr;
974
975                 if (!(left <= middle && middle <= right)) {
976                         printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
977                         printf("%d %d %d\n", left_idx, middle_idx, right_idx);
978                 }
979                 assert(left <= middle && middle <= right);
980                 if (!(left <= ip && ip <= right)) {
981                         printf(" left: %016lx\n", left);
982                         printf("   ip: %016lx\n", (unsigned long)ip);
983                         printf("right: %016lx\n", right);
984                 }
985                 assert(left <= ip && ip <= right);
986                 /*
987                  * [ left .... target .... middle .... right ]
988                  *   => right := middle
989                  */
990                 if (ip < middle) {
991                         right_idx = middle_idx;
992                         continue;
993                 }
994                 /*
995                  * [ left .... middle ... target ... right ]
996                  *   => left := middle
997                  */
998                 left_idx = middle_idx;
999         }
1000
1001         idx = left_idx;
1002
1003         if (!sym_table[idx].skip)
1004                 sym_table[idx].count[counter]++;
1005         else events--;
1006 }
1007
1008 static void process_event(uint64_t ip, int counter)
1009 {
1010         events++;
1011
1012         if (ip < min_ip || ip > max_ip) {
1013                 userspace_events++;
1014                 return;
1015         }
1016
1017         record_ip(ip, counter);
1018 }
1019
1020 static void process_options(int argc, char *argv[])
1021 {
1022         int error = 0, counter;
1023
1024         if (strstr(argv[0], "perfstat"))
1025                 run_perfstat = 1;
1026
1027         for (;;) {
1028                 int option_index = 0;
1029                 /** Options for getopt */
1030                 static struct option long_options[] = {
1031                         {"count",       required_argument,      NULL, 'c'},
1032                         {"cpu",         required_argument,      NULL, 'C'},
1033                         {"delay",       required_argument,      NULL, 'd'},
1034                         {"dump_symtab", no_argument,            NULL, 'D'},
1035                         {"event",       required_argument,      NULL, 'e'},
1036                         {"filter",      required_argument,      NULL, 'f'},
1037                         {"group",       required_argument,      NULL, 'g'},
1038                         {"help",        no_argument,            NULL, 'h'},
1039                         {"nmi",         required_argument,      NULL, 'n'},
1040                         {"pid",         required_argument,      NULL, 'p'},
1041                         {"vmlinux",     required_argument,      NULL, 'x'},
1042                         {"symbol",      required_argument,      NULL, 's'},
1043                         {"stat",        no_argument,            NULL, 'S'},
1044                         {"zero",        no_argument,            NULL, 'z'},
1045                         {NULL,          0,                      NULL,  0 }
1046                 };
1047                 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
1048                                     long_options, &option_index);
1049                 if (c == -1)
1050                         break;
1051
1052                 switch (c) {
1053                 case 'a': system_wide                   =              1; break;
1054                 case 'c': default_interval              =   atoi(optarg); break;
1055                 case 'C':
1056                         /* CPU and PID are mutually exclusive */
1057                         if (tid != -1) {
1058                                 printf("WARNING: CPU switch overriding PID\n");
1059                                 sleep(1);
1060                                 tid = -1;
1061                         }
1062                         profile_cpu                     =   atoi(optarg); break;
1063                 case 'd': delay_secs                    =   atoi(optarg); break;
1064                 case 'D': dump_symtab                   =              1; break;
1065
1066                 case 'e': error                         = parse_events(optarg); break;
1067
1068                 case 'f': count_filter                  =   atoi(optarg); break;
1069                 case 'g': group                         =   atoi(optarg); break;
1070                 case 'h':                                 display_help(); break;
1071                 case 'n': nmi                           =   atoi(optarg); break;
1072                 case 'p':
1073                         /* CPU and PID are mutually exclusive */
1074                         if (profile_cpu != -1) {
1075                                 printf("WARNING: PID switch overriding CPU\n");
1076                                 sleep(1);
1077                                 profile_cpu = -1;
1078                         }
1079                         tid                             =   atoi(optarg); break;
1080                 case 's': sym_filter                    = strdup(optarg); break;
1081                 case 'S': run_perfstat                  =              1; break;
1082                 case 'x': vmlinux                       = strdup(optarg); break;
1083                 case 'z': zero                          =              1; break;
1084                 default: error = 1; break;
1085                 }
1086         }
1087         if (error)
1088                 display_help();
1089
1090         if (!nr_counters) {
1091                 if (run_perfstat)
1092                         nr_counters = 8;
1093                 else {
1094                         nr_counters = 1;
1095                         event_id[0] = 0;
1096                 }
1097         }
1098
1099         for (counter = 0; counter < nr_counters; counter++) {
1100                 if (event_count[counter])
1101                         continue;
1102
1103                 event_count[counter] = default_interval;
1104         }
1105 }
1106
/*
 * Read-side state of one mmap()ed counter buffer.
 */
struct mmap_data {
        int counter;            /* counter index handed to process_event() */
        void *base;             /* start of the mapping; data begins one page in */
        unsigned int mask;      /* wraps a read offset into the data area */
        unsigned int prev;      /* offset up to which data was already consumed */
};
1113
1114 static unsigned int mmap_read_head(struct mmap_data *md)
1115 {
1116         struct perf_counter_mmap_page *pc = md->base;
1117         unsigned int seq, head;
1118
1119 repeat:
1120         rmb();
1121         seq = pc->lock;
1122
1123         if (unlikely(seq & 1)) {
1124                 cpu_relax();
1125                 goto repeat;
1126         }
1127
1128         head = pc->data_head;
1129
1130         rmb();
1131         if (pc->lock != seq)
1132                 goto repeat;
1133
1134         return head;
1135 }
1136
1137 static void mmap_read(struct mmap_data *md)
1138 {
1139         unsigned int head = mmap_read_head(md);
1140         unsigned int old = md->prev;
1141         unsigned char *data = md->base + page_size;
1142
1143         if (head - old > md->mask) {
1144                 printf("ERROR: failed to keep up with mmap data\n");
1145                 exit(-1);
1146         }
1147
1148         for (; old != head;) {
1149                 __u64 *ptr = (__u64 *)&data[old & md->mask];
1150                 old += sizeof(__u64);
1151
1152                 process_event(*ptr, md->counter);
1153         }
1154
1155         md->prev = old;
1156 }
1157
1158 int main(int argc, char *argv[])
1159 {
1160         struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
1161         struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1162         struct perf_counter_hw_event hw_event;
1163         int i, counter, group_fd;
1164         unsigned int cpu;
1165         int ret;
1166
1167         page_size = sysconf(_SC_PAGE_SIZE);
1168
1169         process_options(argc, argv);
1170
1171         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1172         assert(nr_cpus <= MAX_NR_CPUS);
1173         assert(nr_cpus >= 0);
1174
1175         if (run_perfstat)
1176                 return do_perfstat(argc, argv);
1177
1178         if (tid != -1 || profile_cpu != -1)
1179                 nr_cpus = 1;
1180
1181         parse_symbols();
1182         if (vmlinux && sym_filter_entry)
1183                 parse_vmlinux(vmlinux);
1184
1185         for (i = 0; i < nr_cpus; i++) {
1186                 group_fd = -1;
1187                 for (counter = 0; counter < nr_counters; counter++) {
1188
1189                         cpu     = profile_cpu;
1190                         if (tid == -1 && profile_cpu == -1)
1191                                 cpu = i;
1192
1193                         memset(&hw_event, 0, sizeof(hw_event));
1194                         hw_event.config         = event_id[counter];
1195                         hw_event.irq_period     = event_count[counter];
1196                         hw_event.record_type    = PERF_RECORD_IRQ;
1197                         hw_event.nmi            = nmi;
1198
1199                         fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1200                         if (fd[i][counter] < 0) {
1201                                 int err = errno;
1202                                 printf("kerneltop error: syscall returned with %d (%s)\n",
1203                                         fd[i][counter], strerror(err));
1204                                 if (err == EPERM)
1205                                         printf("Are you root?\n");
1206                                 exit(-1);
1207                         }
1208                         assert(fd[i][counter] >= 0);
1209                         fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1210
1211                         /*
1212                          * First counter acts as the group leader:
1213                          */
1214                         if (group && group_fd == -1)
1215                                 group_fd = fd[i][counter];
1216
1217                         event_array[i][counter].fd = fd[i][counter];
1218                         event_array[i][counter].events = POLLIN;
1219
1220                         mmap_array[i][counter].counter = counter;
1221                         mmap_array[i][counter].prev = 0;
1222                         mmap_array[i][counter].mask = 2*page_size - 1;
1223                         mmap_array[i][counter].base = mmap(NULL, 3*page_size,
1224                                         PROT_READ, MAP_SHARED, fd[i][counter], 0);
1225                         if (mmap_array[i][counter].base == MAP_FAILED) {
1226                                 printf("kerneltop error: failed to mmap with %d (%s)\n",
1227                                                 errno, strerror(errno));
1228                                 exit(-1);
1229                         }
1230                 }
1231         }
1232
1233         printf("KernelTop refresh period: %d seconds\n", delay_secs);
1234         last_refresh = time(NULL);
1235
1236         while (1) {
1237                 int hits = events;
1238
1239                 for (i = 0; i < nr_cpus; i++) {
1240                         for (counter = 0; counter < nr_counters; counter++)
1241                                 mmap_read(&mmap_array[i][counter]);
1242                 }
1243
1244                 if (time(NULL) >= last_refresh + delay_secs) {
1245                         print_sym_table();
1246                         events = userspace_events = 0;
1247                 }
1248
1249                 if (hits == events)
1250                         ret = poll(event_array[0], nr_cpus, 1000);
1251                 hits = events;
1252         }
1253
1254         return 0;
1255 }