/*
 * trace task wakeup timings
 *
 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
 *
 * Based on code from the latency_tracer, that is:
 *
 *  Copyright (C) 2004-2006 Ingo Molnar
 *  Copyright (C) 2004 William Lee Irwin III
 */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/marker.h>

#include "trace.h"

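/*
 * Global tracer state: wakeup_trace points at the trace_array used for
 * recording, tracer_enabled gates the probes, and wakeup_task/wakeup_cpu/
 * wakeup_prio identify the highest-priority RT task whose wakeup-to-schedule
 * latency is currently being measured.  Updates are serialized by
 * wakeup_lock.
 */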
static struct trace_array       *wakeup_trace;
static int __read_mostly        tracer_enabled;

static struct task_struct       *wakeup_task;
static int                      wakeup_cpu;
static unsigned                 wakeup_prio = -1;

static DEFINE_SPINLOCK(wakeup_lock);

static void __wakeup_reset(struct trace_array *tr);

#ifdef CONFIG_FTRACE
/*
 * wakeup uses its own tracer function to keep the overhead down:
 */
static void
wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
{
        struct trace_array *tr = wakeup_trace;
        struct trace_array_cpu *data;
        unsigned long flags;
        long disabled;
        int resched;
        int cpu;

        if (likely(!wakeup_task))
                return;

        resched = need_resched();
        preempt_disable_notrace();

        cpu = raw_smp_processor_id();
        data = tr->data[cpu];
        disabled = atomic_inc_return(&data->disabled);
        if (unlikely(disabled != 1))
                goto out;

        spin_lock_irqsave(&wakeup_lock, flags);

        if (unlikely(!wakeup_task))
                goto unlock;

        /*
         * The task can't disappear because it needs to
         * wake up first, and we have the wakeup_lock.
         */
        if (task_cpu(wakeup_task) != cpu)
                goto unlock;

        trace_function(tr, data, ip, parent_ip, flags);

 unlock:
        spin_unlock_irqrestore(&wakeup_lock, flags);

 out:
        atomic_dec(&data->disabled);

        /*
         * To prevent recursion from the scheduler, if the
         * resched flag was set before we entered, then
         * don't reschedule.
         */
        if (resched)
                preempt_enable_no_resched_notrace();
        else
                preempt_enable_notrace();
}

static struct ftrace_ops trace_ops __read_mostly =
{
        .func = wakeup_tracer_call,
};
#endif /* CONFIG_FTRACE */

/*
 * Should this new latency be reported/recorded?
 */
static int report_latency(cycle_t delta)
{
        if (tracing_thresh) {
                if (delta < tracing_thresh)
                        return 0;
        } else {
                if (delta <= tracing_max_latency)
                        return 0;
        }
        return 1;
}

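/*
 * Probe for the context-switch marker: once the task we are waiting on is
 * scheduled in (next == wakeup_task), compute the wakeup-to-run latency and,
 * if report_latency() accepts it, record it as the new maximum via
 * update_max_tr() before resetting the trace.
 */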
static void notrace
wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
        struct task_struct *next)
{
        unsigned long latency = 0, t0 = 0, t1 = 0;
        struct trace_array **ptr = private;
        struct trace_array *tr = *ptr;
        struct trace_array_cpu *data;
        cycle_t T0, T1, delta;
        unsigned long flags;
        long disabled;
        int cpu;

        if (unlikely(!tracer_enabled))
                return;

        /*
         * When we start a new trace, we set wakeup_task to NULL
         * and then set tracer_enabled = 1. We want to make sure
         * that another CPU does not see tracer_enabled = 1 together
         * with a stale wakeup_task from an older trace, which might
         * actually be the same task as next.
         */
        smp_rmb();

        if (next != wakeup_task)
                return;

        /* The task we are waiting for is waking up */
        data = tr->data[wakeup_cpu];

        /* disable local data, not wakeup_cpu data */
        cpu = raw_smp_processor_id();
        disabled = atomic_inc_return(&tr->data[cpu]->disabled);
        if (likely(disabled != 1))
                goto out;

        spin_lock_irqsave(&wakeup_lock, flags);

        /* We could race with grabbing wakeup_lock */
        if (unlikely(!tracer_enabled || next != wakeup_task))
                goto out_unlock;

        trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);

        /*
         * usecs conversion is slow so we try to delay the conversion
         * as long as possible:
         */
        T0 = data->preempt_timestamp;
        T1 = ftrace_now(cpu);
        delta = T1-T0;

        if (!report_latency(delta))
                goto out_unlock;

        latency = nsecs_to_usecs(delta);

        tracing_max_latency = delta;
        t0 = nsecs_to_usecs(T0);
        t1 = nsecs_to_usecs(T1);

        update_max_tr(tr, wakeup_task, wakeup_cpu);

out_unlock:
        __wakeup_reset(tr);
        spin_unlock_irqrestore(&wakeup_lock, flags);
out:
        atomic_dec(&tr->data[cpu]->disabled);
}

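/*
 * Marker callback for kernel_sched_schedule: unpack the vararg payload
 * described by the marker format string and hand the runqueue and the
 * prev/next tasks to wakeup_sched_switch().
 */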
static notrace void
sched_switch_callback(void *probe_data, void *call_data,
                      const char *format, va_list *args)
{
        struct task_struct *prev;
        struct task_struct *next;
        struct rq *__rq;

        /* skip prev_pid %d next_pid %d prev_state %ld */
        (void)va_arg(*args, int);
        (void)va_arg(*args, int);
        (void)va_arg(*args, long);
        __rq = va_arg(*args, typeof(__rq));
        prev = va_arg(*args, typeof(prev));
        next = va_arg(*args, typeof(next));

        tracing_record_cmdline(prev);

        /*
         * If tracer_switch_func only points to the local
         * switch func, it still needs the ptr passed to it.
         */
        wakeup_sched_switch(probe_data, __rq, prev, next);
}

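/*
 * __wakeup_reset() clears the per-cpu trace buffers and drops the reference
 * to any task we were tracking; callers must hold wakeup_lock.
 * wakeup_reset() is the wrapper that takes the lock itself.
 */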
static void __wakeup_reset(struct trace_array *tr)
{
        struct trace_array_cpu *data;
        int cpu;

        assert_spin_locked(&wakeup_lock);

        for_each_possible_cpu(cpu) {
                data = tr->data[cpu];
                tracing_reset(data);
        }

        wakeup_cpu = -1;
        wakeup_prio = -1;

        if (wakeup_task)
                put_task_struct(wakeup_task);

        wakeup_task = NULL;
}

static void wakeup_reset(struct trace_array *tr)
{
        unsigned long flags;

        spin_lock_irqsave(&wakeup_lock, flags);
        __wakeup_reset(tr);
        spin_unlock_irqrestore(&wakeup_lock, flags);
}

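/*
 * Called on every wakeup: start a new measurement if the woken task is an
 * RT task with higher priority than both the task we are already tracking
 * and the task currently running on its CPU.
 */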
static void
wakeup_check_start(struct trace_array *tr, struct task_struct *p,
                   struct task_struct *curr)
{
        int cpu = smp_processor_id();
        unsigned long flags;
        long disabled;

        if (likely(!rt_task(p)) ||
                        p->prio >= wakeup_prio ||
                        p->prio >= curr->prio)
                return;

        disabled = atomic_inc_return(&tr->data[cpu]->disabled);
        if (unlikely(disabled != 1))
                goto out;

        /* interrupts should be off from try_to_wake_up */
        spin_lock(&wakeup_lock);

        /* check for races. */
        if (!tracer_enabled || p->prio >= wakeup_prio)
                goto out_locked;

        /* reset the trace */
        __wakeup_reset(tr);

        wakeup_cpu = task_cpu(p);
        wakeup_prio = p->prio;

        wakeup_task = p;
        get_task_struct(wakeup_task);

        local_save_flags(flags);

        tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
        trace_function(tr, tr->data[wakeup_cpu],
                       CALLER_ADDR1, CALLER_ADDR2, flags);

out_locked:
        spin_unlock(&wakeup_lock);
out:
        atomic_dec(&tr->data[cpu]->disabled);
}

static notrace void
wake_up_callback(void *probe_data, void *call_data,
                 const char *format, va_list *args)
{
        struct trace_array **ptr = probe_data;
        struct trace_array *tr = *ptr;
        struct task_struct *curr;
        struct task_struct *task;
        struct rq *__rq;

        if (likely(!tracer_enabled))
                return;

        /* Skip pid %d state %ld */
        (void)va_arg(*args, int);
        (void)va_arg(*args, long);
        /* now get the meat: "rq %p task %p rq->curr %p" */
        __rq = va_arg(*args, typeof(__rq));
        task = va_arg(*args, typeof(task));
        curr = va_arg(*args, typeof(curr));

        tracing_record_cmdline(task);
        tracing_record_cmdline(curr);

        wakeup_check_start(tr, task, curr);
}

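/*
 * start_wakeup_tracer() hooks the three scheduler markers (wakeup,
 * wakeup_new and schedule), resets the trace, registers the ftrace
 * function hook and then flips tracer_enabled, unwinding any probes
 * already registered if a later registration fails.
 */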
static void start_wakeup_tracer(struct trace_array *tr)
{
        int ret;

        ret = marker_probe_register("kernel_sched_wakeup",
                        "pid %d state %ld ## rq %p task %p rq->curr %p",
                        wake_up_callback,
                        &wakeup_trace);
        if (ret) {
                pr_info("wakeup trace: Couldn't add marker"
                        " probe to kernel_sched_wakeup\n");
                return;
        }

        ret = marker_probe_register("kernel_sched_wakeup_new",
                        "pid %d state %ld ## rq %p task %p rq->curr %p",
                        wake_up_callback,
                        &wakeup_trace);
        if (ret) {
                pr_info("wakeup trace: Couldn't add marker"
                        " probe to kernel_sched_wakeup_new\n");
                goto fail_deprobe;
        }

        ret = marker_probe_register("kernel_sched_schedule",
                "prev_pid %d next_pid %d prev_state %ld "
                "## rq %p prev %p next %p",
                sched_switch_callback,
                &wakeup_trace);
        if (ret) {
                pr_info("sched trace: Couldn't add marker"
                        " probe to kernel_sched_schedule\n");
                goto fail_deprobe_wake_new;
        }

        wakeup_reset(tr);

        /*
         * Don't let the tracer_enabled = 1 show up before
         * the wakeup_task is reset. This may be overkill since
         * wakeup_reset does a spin_unlock after setting the
         * wakeup_task to NULL, but I want to be safe.
         * This is a slow path anyway.
         */
        smp_wmb();

        register_ftrace_function(&trace_ops);

        tracer_enabled = 1;

        return;
fail_deprobe_wake_new:
        marker_probe_unregister("kernel_sched_wakeup_new",
                                wake_up_callback,
                                &wakeup_trace);
fail_deprobe:
        marker_probe_unregister("kernel_sched_wakeup",
                                wake_up_callback,
                                &wakeup_trace);
}

static void stop_wakeup_tracer(struct trace_array *tr)
{
        tracer_enabled = 0;
        unregister_ftrace_function(&trace_ops);
        marker_probe_unregister("kernel_sched_schedule",
                                sched_switch_callback,
                                &wakeup_trace);
        marker_probe_unregister("kernel_sched_wakeup_new",
                                wake_up_callback,
                                &wakeup_trace);
        marker_probe_unregister("kernel_sched_wakeup",
                                wake_up_callback,
                                &wakeup_trace);
}

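/*
 * Tracer callbacks: init/reset and ctrl_update start or stop the tracer
 * according to tr->ctrl, while open/close pause and restart it so the
 * buffers stay stable while a trace is being read.
 */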
static void wakeup_tracer_init(struct trace_array *tr)
{
        wakeup_trace = tr;

        if (tr->ctrl)
                start_wakeup_tracer(tr);
}

static void wakeup_tracer_reset(struct trace_array *tr)
{
        if (tr->ctrl) {
                stop_wakeup_tracer(tr);
                /* make sure we put back any tasks we are tracing */
                wakeup_reset(tr);
        }
}

static void wakeup_tracer_ctrl_update(struct trace_array *tr)
{
        if (tr->ctrl)
                start_wakeup_tracer(tr);
        else
                stop_wakeup_tracer(tr);
}

static void wakeup_tracer_open(struct trace_iterator *iter)
{
        /* stop the trace while dumping */
        if (iter->tr->ctrl)
                stop_wakeup_tracer(iter->tr);
}

static void wakeup_tracer_close(struct trace_iterator *iter)
{
        /* forget about any processes we were recording */
        if (iter->tr->ctrl)
                start_wakeup_tracer(iter->tr);
}

static struct tracer wakeup_tracer __read_mostly =
{
        .name           = "wakeup",
        .init           = wakeup_tracer_init,
        .reset          = wakeup_tracer_reset,
        .open           = wakeup_tracer_open,
        .close          = wakeup_tracer_close,
        .ctrl_update    = wakeup_tracer_ctrl_update,
        .print_max      = 1,
#ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_wakeup,
#endif
};

__init static int init_wakeup_tracer(void)
{
        int ret;

        ret = register_tracer(&wakeup_tracer);
        if (ret)
                return ret;

        return 0;
}
device_initcall(init_wakeup_tracer);