perf record/report: Fix PID/COMM handling
[linux-2.6] / Documentation / perf_counter / builtin-record.c
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #include "builtin.h"
9
10 #include "perf.h"
11
12 #include "util/util.h"
13 #include "util/parse-options.h"
14 #include "util/parse-events.h"
15 #include "util/string.h"
16
17 #include <unistd.h>
18 #include <sched.h>
19
/*
 * ALIGN(x, a): round x up to the next multiple of a.
 *
 * a must be a power of two, because the rounding is done by adding (a - 1)
 * and masking the low bits off. The result has the type of x.
 *
 * __typeof__ is used instead of typeof so the macro also builds when the
 * compiler runs in a strict ISO mode (e.g. -std=c11), where the bare
 * typeof keyword is not recognised.
 */
#define ALIGN(x, a)		__ALIGN_MASK(x, (__typeof__(x))(a)-1)
#define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
22
23 static long                     default_interval = 100000;
24 static long                     event_count[MAX_COUNTERS];
25
26 static int                      fd[MAX_NR_CPUS][MAX_COUNTERS];
27 static int                      nr_cpus                         = 0;
28 static unsigned int             page_size;
29 static unsigned int             mmap_pages                      = 128;
30 static int                      output;
31 static const char               *output_name                    = "perf.data";
32 static int                      group                           = 0;
33 static unsigned int             realtime_prio                   = 0;
34 static int                      system_wide                     = 0;
35 static pid_t                    target_pid                      = -1;
36 static int                      inherit                         = 1;
37 static int                      force                           = 0;
38 static int                      append_file                     = 0;
39
/*
 * Table of six default sample periods.
 *
 * NOTE(review): nothing in this file reads the table; it is presumably
 * indexed by event id from elsewhere - confirm before removing or resizing.
 */
const unsigned int default_count[] = {
	1000000, 1000000,
	  10000,   10000,
	1000000,
	  10000,
};
48
/* Reader-side state for one counter's mmap()ed ring buffer. */
struct mmap_data {
	int		counter;	/* index of the counter this buffer belongs to */
	void		*base;		/* start of the mapping; data begins one page in */
	unsigned int	mask;		/* data area size in bytes minus one */
	unsigned int	prev;		/* head position up to which data was consumed */
};
55
56 static unsigned int mmap_read_head(struct mmap_data *md)
57 {
58         struct perf_counter_mmap_page *pc = md->base;
59         int head;
60
61         head = pc->data_head;
62         rmb();
63
64         return head;
65 }
66
67 static long events;
68 static struct timeval last_read, this_read;
69
70 static __u64 bytes_written;
71
72 static void mmap_read(struct mmap_data *md)
73 {
74         unsigned int head = mmap_read_head(md);
75         unsigned int old = md->prev;
76         unsigned char *data = md->base + page_size;
77         unsigned long size;
78         void *buf;
79         int diff;
80
81         gettimeofday(&this_read, NULL);
82
83         /*
84          * If we're further behind than half the buffer, there's a chance
85          * the writer will bite our tail and screw up the events under us.
86          *
87          * If we somehow ended up ahead of the head, we got messed up.
88          *
89          * In either case, truncate and restart at head.
90          */
91         diff = head - old;
92         if (diff > md->mask / 2 || diff < 0) {
93                 struct timeval iv;
94                 unsigned long msecs;
95
96                 timersub(&this_read, &last_read, &iv);
97                 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
98
99                 fprintf(stderr, "WARNING: failed to keep up with mmap data."
100                                 "  Last read %lu msecs ago.\n", msecs);
101
102                 /*
103                  * head points to a known good entry, start there.
104                  */
105                 old = head;
106         }
107
108         last_read = this_read;
109
110         if (old != head)
111                 events++;
112
113         size = head - old;
114
115         if ((old & md->mask) + size != (head & md->mask)) {
116                 buf = &data[old & md->mask];
117                 size = md->mask + 1 - (old & md->mask);
118                 old += size;
119
120                 while (size) {
121                         int ret = write(output, buf, size);
122
123                         if (ret < 0)
124                                 die("failed to write");
125
126                         size -= ret;
127                         buf += ret;
128
129                         bytes_written += ret;
130                 }
131         }
132
133         buf = &data[old & md->mask];
134         size = head - old;
135         old += size;
136
137         while (size) {
138                 int ret = write(output, buf, size);
139
140                 if (ret < 0)
141                         die("failed to write");
142
143                 size -= ret;
144                 buf += ret;
145
146                 bytes_written += ret;
147         }
148
149         md->prev = old;
150 }
151
/*
 * Set to 1 by sig_handler() to make the main recording loop terminate.
 *
 * The flag is written from signal context, so it is declared
 * volatile sig_atomic_t: that is the object type the C standard
 * guarantees can be safely accessed from an asynchronous signal handler.
 */
static volatile sig_atomic_t done = 0;

/* Signal handler (SIGCHLD/SIGINT): only requests that recording stops. */
static void sig_handler(int sig)
{
	(void)sig;	/* the same action is taken for every signal */
	done = 1;
}
158
159 static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
160 static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
161
162 static int nr_poll;
163 static int nr_cpu;
164
165 struct mmap_event {
166         struct perf_event_header        header;
167         __u32                           pid;
168         __u32                           tid;
169         __u64                           start;
170         __u64                           len;
171         __u64                           pgoff;
172         char                            filename[PATH_MAX];
173 };
174
175 struct comm_event {
176         struct perf_event_header        header;
177         __u32                           pid;
178         __u32                           tid;
179         char                            comm[16];
180 };
181
182 static void pid_synthesize_comm_event(pid_t pid, int full)
183 {
184         struct comm_event comm_ev;
185         char filename[PATH_MAX];
186         char bf[BUFSIZ];
187         int fd, ret;
188         size_t size;
189         char *field, *sep;
190         DIR *tasks;
191         struct dirent dirent, *next;
192
193         snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
194
195         fd = open(filename, O_RDONLY);
196         if (fd < 0) {
197                 fprintf(stderr, "couldn't open %s\n", filename);
198                 exit(EXIT_FAILURE);
199         }
200         if (read(fd, bf, sizeof(bf)) < 0) {
201                 fprintf(stderr, "couldn't read %s\n", filename);
202                 exit(EXIT_FAILURE);
203         }
204         close(fd);
205
206         /* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
207         memset(&comm_ev, 0, sizeof(comm_ev));
208         field = strchr(bf, '(');
209         if (field == NULL)
210                 goto out_failure;
211         sep = strchr(++field, ')');
212         if (sep == NULL)
213                 goto out_failure;
214         size = sep - field;
215         memcpy(comm_ev.comm, field, size++);
216
217         comm_ev.pid = pid;
218         comm_ev.header.type = PERF_EVENT_COMM;
219         size = ALIGN(size, sizeof(uint64_t));
220         comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
221
222         if (!full) {
223                 comm_ev.tid = pid;
224
225                 ret = write(output, &comm_ev, comm_ev.header.size);
226                 if (ret < 0) {
227                         perror("failed to write");
228                         exit(-1);
229                 }
230                 return;
231         }
232
233         snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
234
235         tasks = opendir(filename);
236         while (!readdir_r(tasks, &dirent, &next) && next) {
237                 char *end;
238                 pid = strtol(dirent.d_name, &end, 10);
239                 if (*end)
240                         continue;
241
242                 comm_ev.tid = pid;
243
244                 ret = write(output, &comm_ev, comm_ev.header.size);
245                 if (ret < 0) {
246                         perror("failed to write");
247                         exit(-1);
248                 }
249         }
250         closedir(tasks);
251         return;
252
253 out_failure:
254         fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
255                 filename);
256         exit(EXIT_FAILURE);
257 }
258
259 static void pid_synthesize_mmap_events(pid_t pid)
260 {
261         char filename[PATH_MAX];
262         FILE *fp;
263
264         snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);
265
266         fp = fopen(filename, "r");
267         if (fp == NULL) {
268                 fprintf(stderr, "couldn't open %s\n", filename);
269                 exit(EXIT_FAILURE);
270         }
271         while (1) {
272                 char bf[BUFSIZ], *pbf = bf;
273                 struct mmap_event mmap_ev = {
274                         .header.type = PERF_EVENT_MMAP,
275                 };
276                 int n;
277                 size_t size;
278                 if (fgets(bf, sizeof(bf), fp) == NULL)
279                         break;
280
281                 /* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
282                 n = hex2u64(pbf, &mmap_ev.start);
283                 if (n < 0)
284                         continue;
285                 pbf += n + 1;
286                 n = hex2u64(pbf, &mmap_ev.len);
287                 if (n < 0)
288                         continue;
289                 pbf += n + 3;
290                 if (*pbf == 'x') { /* vm_exec */
291                         char *execname = strrchr(bf, ' ');
292
293                         if (execname == NULL || execname[1] != '/')
294                                 continue;
295
296                         execname += 1;
297                         size = strlen(execname);
298                         execname[size - 1] = '\0'; /* Remove \n */
299                         memcpy(mmap_ev.filename, execname, size);
300                         size = ALIGN(size, sizeof(uint64_t));
301                         mmap_ev.len -= mmap_ev.start;
302                         mmap_ev.header.size = (sizeof(mmap_ev) -
303                                                (sizeof(mmap_ev.filename) - size));
304                         mmap_ev.pid = pid;
305                         mmap_ev.tid = pid;
306
307                         if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
308                                 perror("failed to write");
309                                 exit(-1);
310                         }
311                 }
312         }
313
314         fclose(fp);
315 }
316
/*
 * Walk /proc and write synthetic COMM and MMAP records for every task that
 * is currently running (every all-numeric directory entry).
 *
 * Exits if /proc cannot be opened: the directory handle would otherwise be
 * NULL when handed to readdir_r().
 */
static void synthesize_events(void)
{
	DIR *proc;
	struct dirent dirent, *next;

	proc = opendir("/proc");
	if (proc == NULL) {
		fprintf(stderr, "couldn't open %s\n", "/proc");
		exit(EXIT_FAILURE);
	}

	while (!readdir_r(proc, &dirent, &next) && next) {
		char *end;
		pid_t pid;

		pid = strtol(dirent.d_name, &end, 10);
		if (*end) /* only interested in proper numerical dirents */
			continue;

		pid_synthesize_comm_event(pid, 1);
		pid_synthesize_mmap_events(pid);
	}

	closedir(proc);
}
338
339 static void open_counters(int cpu, pid_t pid)
340 {
341         struct perf_counter_attr attr;
342         int counter, group_fd;
343         int track = 1;
344
345         if (pid > 0) {
346                 pid_synthesize_comm_event(pid, 0);
347                 pid_synthesize_mmap_events(pid);
348         }
349
350         group_fd = -1;
351         for (counter = 0; counter < nr_counters; counter++) {
352
353                 memset(&attr, 0, sizeof(attr));
354                 attr.config             = event_id[counter];
355                 attr.sample_period      = event_count[counter];
356                 attr.sample_type        = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
357                 attr.mmap               = track;
358                 attr.comm               = track;
359                 attr.inherit    = (cpu < 0) && inherit;
360
361                 track = 0; // only the first counter needs these
362
363                 fd[nr_cpu][counter] =
364                         sys_perf_counter_open(&attr, pid, cpu, group_fd, 0);
365
366                 if (fd[nr_cpu][counter] < 0) {
367                         int err = errno;
368
369                         error("syscall returned with %d (%s)\n",
370                                         fd[nr_cpu][counter], strerror(err));
371                         if (err == EPERM)
372                                 printf("Are you root?\n");
373                         exit(-1);
374                 }
375                 assert(fd[nr_cpu][counter] >= 0);
376                 fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
377
378                 /*
379                  * First counter acts as the group leader:
380                  */
381                 if (group && group_fd == -1)
382                         group_fd = fd[nr_cpu][counter];
383
384                 event_array[nr_poll].fd = fd[nr_cpu][counter];
385                 event_array[nr_poll].events = POLLIN;
386                 nr_poll++;
387
388                 mmap_array[nr_cpu][counter].counter = counter;
389                 mmap_array[nr_cpu][counter].prev = 0;
390                 mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
391                 mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
392                                 PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
393                 if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
394                         error("failed to mmap with %d (%s)\n", errno, strerror(errno));
395                         exit(-1);
396                 }
397         }
398         nr_cpu++;
399 }
400
401 static int __cmd_record(int argc, const char **argv)
402 {
403         int i, counter;
404         struct stat st;
405         pid_t pid;
406         int flags;
407         int ret;
408
409         page_size = sysconf(_SC_PAGE_SIZE);
410         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
411         assert(nr_cpus <= MAX_NR_CPUS);
412         assert(nr_cpus >= 0);
413
414         if (!stat(output_name, &st) && !force && !append_file) {
415                 fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
416                                 output_name);
417                 exit(-1);
418         }
419
420         flags = O_CREAT|O_RDWR;
421         if (append_file)
422                 flags |= O_APPEND;
423         else
424                 flags |= O_TRUNC;
425
426         output = open(output_name, flags, S_IRUSR|S_IWUSR);
427         if (output < 0) {
428                 perror("failed to create output file");
429                 exit(-1);
430         }
431
432         if (!system_wide) {
433                 open_counters(-1, target_pid != -1 ? target_pid : getpid());
434         } else for (i = 0; i < nr_cpus; i++)
435                 open_counters(i, target_pid);
436
437         signal(SIGCHLD, sig_handler);
438         signal(SIGINT, sig_handler);
439
440         if (target_pid == -1 && argc) {
441                 pid = fork();
442                 if (pid < 0)
443                         perror("failed to fork");
444
445                 if (!pid) {
446                         if (execvp(argv[0], (char **)argv)) {
447                                 perror(argv[0]);
448                                 exit(-1);
449                         }
450                 }
451         }
452
453         if (realtime_prio) {
454                 struct sched_param param;
455
456                 param.sched_priority = realtime_prio;
457                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
458                         printf("Could not set realtime priority.\n");
459                         exit(-1);
460                 }
461         }
462
463         if (system_wide)
464                 synthesize_events();
465
466         while (!done) {
467                 int hits = events;
468
469                 for (i = 0; i < nr_cpu; i++) {
470                         for (counter = 0; counter < nr_counters; counter++)
471                                 mmap_read(&mmap_array[i][counter]);
472                 }
473
474                 if (hits == events)
475                         ret = poll(event_array, nr_poll, 100);
476         }
477
478         /*
479          * Approximate RIP event size: 24 bytes.
480          */
481         fprintf(stderr,
482                 "[ perf record: Captured and wrote %.3f MB %s (~%lld events) ]\n",
483                 (double)bytes_written / 1024.0 / 1024.0,
484                 output_name,
485                 bytes_written / 24);
486
487         return 0;
488 }
489
/* NULL-terminated list of usage lines passed to the option parser. */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL,
};
495
496 static char events_help_msg[EVENTS_HELP_MAX];
497
498 static const struct option options[] = {
499         OPT_CALLBACK('e', "event", NULL, "event",
500                      events_help_msg, parse_events),
501         OPT_INTEGER('p', "pid", &target_pid,
502                     "record events on existing pid"),
503         OPT_INTEGER('r', "realtime", &realtime_prio,
504                     "collect data with this RT SCHED_FIFO priority"),
505         OPT_BOOLEAN('a', "all-cpus", &system_wide,
506                             "system-wide collection from all CPUs"),
507         OPT_BOOLEAN('A', "append", &append_file,
508                             "append to the output file to do incremental profiling"),
509         OPT_BOOLEAN('f', "force", &force,
510                         "overwrite existing data file"),
511         OPT_LONG('c', "count", &default_interval,
512                     "event period to sample"),
513         OPT_STRING('o', "output", &output_name, "file",
514                     "output file name"),
515         OPT_BOOLEAN('i', "inherit", &inherit,
516                     "child tasks inherit counters"),
517         OPT_INTEGER('m', "mmap-pages", &mmap_pages,
518                     "number of mmap data pages"),
519         OPT_END()
520 };
521
522 int cmd_record(int argc, const char **argv, const char *prefix)
523 {
524         int counter;
525
526         create_events_help(events_help_msg);
527
528         argc = parse_options(argc, argv, options, record_usage, 0);
529         if (!argc && target_pid == -1 && !system_wide)
530                 usage_with_options(record_usage, options);
531
532         if (!nr_counters) {
533                 nr_counters = 1;
534                 event_id[0] = 0;
535         }
536
537         for (counter = 0; counter < nr_counters; counter++) {
538                 if (event_count[counter])
539                         continue;
540
541                 event_count[counter] = default_interval;
542         }
543
544         return __cmd_record(argc, argv);
545 }