/* arch/x86/kernel/process.c */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/ftrace.h>
#include <asm/system.h>
#include <asm/apic.h>

unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt);
unsigned long idle_nomwait;
EXPORT_SYMBOL(idle_nomwait);

struct kmem_cache *task_xstate_cachep;

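/*
 * Duplicate the architecture specific parts of a task_struct. If the
 * source task has an extended FPU state buffer, the copy gets its own
 * buffer so that parent and child do not share the same xstate.
 */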
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
        *dst = *src;
        if (src->thread.xstate) {
                dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
                                                      GFP_KERNEL);
                if (!dst->thread.xstate)
                        return -ENOMEM;
                WARN_ON((unsigned long)dst->thread.xstate & 15);
                memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
        }
        return 0;
}

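/* Release the per-task extended FPU state buffer, if one was allocated. */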
void free_thread_xstate(struct task_struct *tsk)
{
        if (tsk->thread.xstate) {
                kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
                tsk->thread.xstate = NULL;
        }
}

void free_thread_info(struct thread_info *ti)
{
        free_thread_xstate(ti->task);
        free_pages((unsigned long)ti, get_order(THREAD_SIZE));
}

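/*
 * Create the slab cache that backs the per-task extended FPU state
 * (thread.xstate), sized and aligned for union thread_xstate.
 */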
void arch_task_cache_init(void)
{
        task_xstate_cachep =
                kmem_cache_create("task_xstate", xstate_size,
                                  __alignof__(union thread_xstate),
                                  SLAB_PANIC, NULL);
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);

#ifdef CONFIG_X86_32
/*
 * This halt magic was a workaround for ancient floppy DMA
 * wreckage. It should be safe to remove.
 */
static int hlt_counter;
void disable_hlt(void)
{
        hlt_counter++;
}
EXPORT_SYMBOL(disable_hlt);

void enable_hlt(void)
{
        hlt_counter--;
}
EXPORT_SYMBOL(enable_hlt);

static inline int hlt_use_halt(void)
{
        return (!hlt_counter && boot_cpu_data.hlt_works_ok);
}
#else
static inline int hlt_use_halt(void)
{
        return 1;
}
#endif

/*
 * We use this if we don't have any better
 * idle routine..
 */
void default_idle(void)
{
        if (hlt_use_halt()) {
                struct power_trace it;

                trace_power_start(&it, POWER_CSTATE, 1);
                current_thread_info()->status &= ~TS_POLLING;
                /*
                 * TS_POLLING-cleared state must be visible before we
                 * test NEED_RESCHED:
                 */
                smp_mb();

                if (!need_resched())
                        safe_halt();    /* enables interrupts racelessly */
                else
                        local_irq_enable();
                current_thread_info()->status |= TS_POLLING;
                trace_power_end(&it);
        } else {
                local_irq_enable();
                /* loop is done by the caller */
                cpu_relax();
        }
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

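/*
 * Park the calling CPU: mark it offline, shut down its local APIC and
 * then halt it forever with interrupts disabled.
 */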
void stop_this_cpu(void *dummy)
{
        local_irq_disable();
        /*
         * Remove this CPU:
         */
        cpu_clear(smp_processor_id(), cpu_online_map);
        disable_local_APIC();

        for (;;) {
                if (hlt_works(smp_processor_id()))
                        halt();
        }
}

static void do_nothing(void *unused)
{
}

/*
 * cpu_idle_wait - Used to ensure that all CPUs discard the old value of
 * pm_idle and pick up the new one. Required while changing the pm_idle
 * handler on SMP systems.
 *
 * Caller must have changed pm_idle to the new value before the call. The old
 * pm_idle value will not be used by any CPU after this function returns.
 */
void cpu_idle_wait(void)
{
        smp_mb();
        /* kick all the CPUs so that they exit out of pm_idle */
        smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

/*
 * This uses the new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate the IPI otherwise needed to trigger a recheck of
 * need_resched. We execute MONITOR against need_resched and enter an
 * optimized wait state through MWAIT. Whenever someone changes
 * need_resched, we are woken up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
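/*
 * @ax carries the target C-state hint (the tracing below reports it as
 * (ax >> 4) + 1), @cx carries the MWAIT extension flags.
 */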
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
        struct power_trace it;

        trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __mwait(ax, cx);
        }
        trace_power_end(&it);
}

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
        struct power_trace it;
        if (!need_resched()) {
                trace_power_start(&it, POWER_CSTATE, 1);
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __sti_mwait(0, 0);
                else
                        local_irq_enable();
                trace_power_end(&it);
        } else
                local_irq_enable();
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->work.need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
        struct power_trace it;

        trace_power_start(&it, POWER_CSTATE, 0);
        local_irq_enable();
        while (!need_resched())
                cpu_relax();
        trace_power_end(&it);
}

/*
 * mwait selection logic:
 *
 * It depends on the CPU. For AMD CPUs that support MWAIT, using it is the
 * wrong choice. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Power
 * savings then depend on a clock divisor and the current P-state of the
 * core. If all cores of a processor are in halt state (C1) the processor
 * can enter the C1E (C1 enhanced) state. If mwait is used this will never
 * happen.
 *
 * idle=mwait overrides this decision and forces the usage of mwait.
 */
static int __cpuinitdata force_mwait;

#define MWAIT_INFO                      0x05
#define MWAIT_ECX_EXTENDED_INFO         0x01
#define MWAIT_EDX_C1                    0xf0

static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
{
        u32 eax, ebx, ecx, edx;

        if (force_mwait)
                return 1;

        if (c->cpuid_level < MWAIT_INFO)
                return 0;

        cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
        /* Check whether EDX has extended info about MWAIT */
        if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
                return 1;

        /*
         * EDX enumerates the MONITOR/MWAIT extensions. Check whether
         * C1 supports MWAIT.
         */
        return (edx & MWAIT_EDX_C1);
}

/*
 * Check for AMD CPUs, which potentially have C1E support
 */
static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
{
        if (c->x86_vendor != X86_VENDOR_AMD)
                return 0;

        if (c->x86 < 0x0F)
                return 0;

        /* Family 0x0f models < rev F do not have C1E */
        if (c->x86 == 0x0f && c->x86_model < 0x40)
                return 0;

        return 1;
}

static cpumask_t c1e_mask = CPU_MASK_NONE;
static int c1e_detected;

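/*
 * Forget that @cpu was switched to broadcast mode, so that c1e_idle()
 * redoes the switch the next time it runs on that CPU.
 */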
void c1e_remove_cpu(int cpu)
{
        cpu_clear(cpu, c1e_mask);
}

/*
 * C1E aware idle routine. We check for C1E active in the interrupt
 * pending message MSR. If we detect C1E, then we handle it the same
 * way as C3 power states (local apic timer and TSC stop)
 */
static void c1e_idle(void)
{
        if (need_resched())
                return;

        if (!c1e_detected) {
                u32 lo, hi;

                rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
                if (lo & K8_INTP_C1E_ACTIVE_MASK) {
                        c1e_detected = 1;
                        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                                mark_tsc_unstable("TSC halt in AMD C1E");
                        printk(KERN_INFO "System has AMD C1E enabled\n");
                        set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
                }
        }

        if (c1e_detected) {
                int cpu = smp_processor_id();

                if (!cpu_isset(cpu, c1e_mask)) {
                        cpu_set(cpu, c1e_mask);
                        /*
                         * Force broadcast so ACPI can not interfere. Needs
                         * to run with interrupts enabled as it uses
                         * smp_call_function.
                         */
                        local_irq_enable();
                        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
                                           &cpu);
                        printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
                               cpu);
                        local_irq_disable();
                }
                clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

                default_idle();

                /*
                 * The switch back from broadcast mode needs to be
                 * called with interrupts disabled.
                 */
                local_irq_disable();
                clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
                local_irq_enable();
        } else
                default_idle();
}

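/*
 * Pick the pm_idle routine for this CPU. Once pm_idle has been set, the
 * choice sticks, so the first CPU to get here decides for all of them.
 */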
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_SMP
        if (pm_idle == poll_idle && smp_num_siblings > 1) {
                printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
                        " performance may degrade.\n");
        }
#endif
        if (pm_idle)
                return;

        if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
                /*
                 * One CPU supports mwait => All CPUs support mwait
                 */
                printk(KERN_INFO "using mwait in idle threads.\n");
                pm_idle = mwait_idle;
        } else if (check_c1e_idle(c)) {
                printk(KERN_INFO "using C1E aware idle routine\n");
                pm_idle = c1e_idle;
        } else
                pm_idle = default_idle;
}

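/*
 * Parse the "idle=" boot parameter: "poll" busy-waits instead of halting,
 * "mwait" forces the use of MWAIT, "halt" forces default_idle()/HLT, and
 * "nomwait" disables MWAIT for the deeper C-states.
 */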
static int __init idle_setup(char *str)
{
        if (!str)
                return -EINVAL;

        if (!strcmp(str, "poll")) {
                printk("using polling idle threads.\n");
                pm_idle = poll_idle;
        } else if (!strcmp(str, "mwait"))
                force_mwait = 1;
        else if (!strcmp(str, "halt")) {
                /*
                 * When the boot option idle=halt is given, halt is
                 * forced to be used for CPU idle. In that case the CPU
                 * C2/C3 states won't be used again.
                 * To continue to load the CPU idle driver, don't touch
                 * boot_option_idle_override.
                 */
                pm_idle = default_idle;
                idle_halt = 1;
                return 0;
        } else if (!strcmp(str, "nomwait")) {
                /*
                 * If the boot option idle=nomwait is given, mwait is
                 * disabled for the CPU C2/C3 states. In that case
                 * boot_option_idle_override is left untouched.
                 */
                idle_nomwait = 1;
                return 0;
        } else
                return -1;

        boot_option_idle_override = 1;
        return 0;
}
early_param("idle", idle_setup);