cpuidle: fix HP nx6125 regression
[linux-2.6] drivers/acpi/processor_idle.c
1 /*
2  * processor_idle - idle state submodule to the ACPI processor driver
3  *
4  *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5  *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6  *  Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
7  *  Copyright (C) 2004  Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
8  *                      - Added processor hotplug support
9  *  Copyright (C) 2005  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
10  *                      - Added support for C3 on SMP
11  *
12  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13  *
14  *  This program is free software; you can redistribute it and/or modify
15  *  it under the terms of the GNU General Public License as published by
16  *  the Free Software Foundation; either version 2 of the License, or (at
17  *  your option) any later version.
18  *
19  *  This program is distributed in the hope that it will be useful, but
20  *  WITHOUT ANY WARRANTY; without even the implied warranty of
21  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22  *  General Public License for more details.
23  *
24  *  You should have received a copy of the GNU General Public License along
25  *  with this program; if not, write to the Free Software Foundation, Inc.,
26  *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27  *
28  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
29  */
30
31 #include <linux/kernel.h>
32 #include <linux/module.h>
33 #include <linux/init.h>
34 #include <linux/cpufreq.h>
35 #include <linux/proc_fs.h>
36 #include <linux/seq_file.h>
37 #include <linux/acpi.h>
38 #include <linux/dmi.h>
39 #include <linux/moduleparam.h>
40 #include <linux/sched.h>        /* need_resched() */
41 #include <linux/latency.h>
42 #include <linux/clockchips.h>
43 #include <linux/cpuidle.h>
44
45 /*
46  * Include the apic definitions for x86 to have the APIC timer related defines
47  * available also for UP (on SMP it gets magically included via linux/smp.h).
48  * asm/acpi.h is not an option, as it would require more include magic. Also
49  * creating an empty asm-ia64/apic.h would just trade pest vs. cholera.
50  */
51 #ifdef CONFIG_X86
52 #include <asm/apic.h>
53 #endif
54
55 #include <asm/io.h>
56 #include <asm/uaccess.h>
57
58 #include <acpi/acpi_bus.h>
59 #include <acpi/processor.h>
60
61 #define ACPI_PROCESSOR_COMPONENT        0x01000000
62 #define ACPI_PROCESSOR_CLASS            "processor"
63 #define _COMPONENT              ACPI_PROCESSOR_COMPONENT
64 ACPI_MODULE_NAME("processor_idle");
65 #define ACPI_PROCESSOR_FILE_POWER       "power"
66 #define US_TO_PM_TIMER_TICKS(t)         ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
67 #define PM_TIMER_TICK_NS                (1000000000ULL/PM_TIMER_FREQUENCY)
68 #ifndef CONFIG_CPU_IDLE
69 #define C2_OVERHEAD                     4       /* 1us (3.579 ticks per us) */
70 #define C3_OVERHEAD                     4       /* 1us (3.579 ticks per us) */
71 static void (*pm_idle_save) (void) __read_mostly;
72 #else
73 #define C2_OVERHEAD                     1       /* 1us */
74 #define C3_OVERHEAD                     1       /* 1us */
75 #endif
76 #define PM_TIMER_TICKS_TO_US(p)         (((p) * 1000)/(PM_TIMER_FREQUENCY/1000))
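/*
 * The ACPI PM timer runs at 3.579545 MHz (PM_TIMER_FREQUENCY), so one
 * tick is roughly 279 ns and one microsecond is about 3.58 ticks;
 * e.g. US_TO_PM_TIMER_TICKS(100) evaluates to 357 with integer arithmetic.
 */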
77
78 static unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER;
79 module_param(max_cstate, uint, 0000);
80 static unsigned int nocst __read_mostly;
81 module_param(nocst, uint, 0000);
82
83 #ifndef CONFIG_CPU_IDLE
84 /*
85  * bm_history -- bit-mask with a bit per jiffy of bus-master activity
86  * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms
87  * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms
88  * 100 HZ: 0x0000000F: 4 jiffies = 40ms
89  * reduce history for more aggressive entry into C3
90  */
91 static unsigned int bm_history __read_mostly =
92     (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1));
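/*
 * Example: with HZ == 250 the default mask is (1U << 10) - 1 == 0x3FF,
 * i.e. 10 jiffies == 40ms of bus-master history.
 */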
93 module_param(bm_history, uint, 0644);
94
95 static int acpi_processor_set_power_policy(struct acpi_processor *pr);
96
97 #endif
98
99 /*
100  * IBM ThinkPad R40e crashes mysteriously when going into C2 or C3.
101  * For now disable this. Probably a bug somewhere else.
102  *
103  * To skip this limit, boot/load with a large max_cstate limit.
104  */
105 static int set_max_cstate(struct dmi_system_id *id)
106 {
107         if (max_cstate > ACPI_PROCESSOR_MAX_POWER)
108                 return 0;
109
110         printk(KERN_NOTICE PREFIX "%s detected - limiting to C%ld max_cstate."
111                " Override with \"processor.max_cstate=%d\"\n", id->ident,
112                (long)id->driver_data, ACPI_PROCESSOR_MAX_POWER + 1);
113
114         max_cstate = (long)id->driver_data;
115
116         return 0;
117 }
118
119 /* Actually this shouldn't be __cpuinitdata, would be better to fix the
120    callers to only run once -AK */
121 static struct dmi_system_id __cpuinitdata processor_power_dmi_table[] = {
122         { set_max_cstate, "IBM ThinkPad R40e", {
123           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
124           DMI_MATCH(DMI_BIOS_VERSION,"1SET70WW")}, (void *)1},
125         { set_max_cstate, "IBM ThinkPad R40e", {
126           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
127           DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW")}, (void *)1},
128         { set_max_cstate, "IBM ThinkPad R40e", {
129           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
130           DMI_MATCH(DMI_BIOS_VERSION,"1SET43WW") }, (void*)1},
131         { set_max_cstate, "IBM ThinkPad R40e", {
132           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
133           DMI_MATCH(DMI_BIOS_VERSION,"1SET45WW") }, (void*)1},
134         { set_max_cstate, "IBM ThinkPad R40e", {
135           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
136           DMI_MATCH(DMI_BIOS_VERSION,"1SET47WW") }, (void*)1},
137         { set_max_cstate, "IBM ThinkPad R40e", {
138           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
139           DMI_MATCH(DMI_BIOS_VERSION,"1SET50WW") }, (void*)1},
140         { set_max_cstate, "IBM ThinkPad R40e", {
141           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
142           DMI_MATCH(DMI_BIOS_VERSION,"1SET52WW") }, (void*)1},
143         { set_max_cstate, "IBM ThinkPad R40e", {
144           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
145           DMI_MATCH(DMI_BIOS_VERSION,"1SET55WW") }, (void*)1},
146         { set_max_cstate, "IBM ThinkPad R40e", {
147           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
148           DMI_MATCH(DMI_BIOS_VERSION,"1SET56WW") }, (void*)1},
149         { set_max_cstate, "IBM ThinkPad R40e", {
150           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
151           DMI_MATCH(DMI_BIOS_VERSION,"1SET59WW") }, (void*)1},
152         { set_max_cstate, "IBM ThinkPad R40e", {
153           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
154           DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW") }, (void*)1},
155         { set_max_cstate, "IBM ThinkPad R40e", {
156           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
157           DMI_MATCH(DMI_BIOS_VERSION,"1SET61WW") }, (void*)1},
158         { set_max_cstate, "IBM ThinkPad R40e", {
159           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
160           DMI_MATCH(DMI_BIOS_VERSION,"1SET62WW") }, (void*)1},
161         { set_max_cstate, "IBM ThinkPad R40e", {
162           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
163           DMI_MATCH(DMI_BIOS_VERSION,"1SET64WW") }, (void*)1},
164         { set_max_cstate, "IBM ThinkPad R40e", {
165           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
166           DMI_MATCH(DMI_BIOS_VERSION,"1SET65WW") }, (void*)1},
167         { set_max_cstate, "IBM ThinkPad R40e", {
168           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
169           DMI_MATCH(DMI_BIOS_VERSION,"1SET68WW") }, (void*)1},
170         { set_max_cstate, "Medion 41700", {
171           DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
172           DMI_MATCH(DMI_BIOS_VERSION,"R01-A1J")}, (void *)1},
173         { set_max_cstate, "Clevo 5600D", {
174           DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
175           DMI_MATCH(DMI_BIOS_VERSION,"SHE845M0.86C.0013.D.0302131307")},
176          (void *)2},
177         {},
178 };
179
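/*
 * PM-timer delta helpers: unless the FADT advertises a 32-bit timer
 * (ACPI_FADT_32BIT_TIMER), the ACPI PM timer is only 24 bits wide, so
 * on rollover the difference is folded back into the 24-bit range.
 */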
180 static inline u32 ticks_elapsed(u32 t1, u32 t2)
181 {
182         if (t2 >= t1)
183                 return (t2 - t1);
184         else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
185                 return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
186         else
187                 return ((0xFFFFFFFF - t1) + t2);
188 }
189
190 static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2)
191 {
192         if (t2 >= t1)
193                 return PM_TIMER_TICKS_TO_US(t2 - t1);
194         else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
195                 return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
196         else
197                 return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2);
198 }
199
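/*
 * Clear TS_POLLING before halting so the scheduler wakes us with an
 * interrupt rather than relying on idle-loop polling, and only halt if
 * no reschedule is already pending.
 */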
200 static void acpi_safe_halt(void)
201 {
202         current_thread_info()->status &= ~TS_POLLING;
203         /*
204          * TS_POLLING-cleared state must be visible before we
205          * test NEED_RESCHED:
206          */
207         smp_mb();
208         if (!need_resched())
209                 safe_halt();
210         current_thread_info()->status |= TS_POLLING;
211 }
212
213 #ifndef CONFIG_CPU_IDLE
214
215 static void
216 acpi_processor_power_activate(struct acpi_processor *pr,
217                               struct acpi_processor_cx *new)
218 {
219         struct acpi_processor_cx *old;
220
221         if (!pr || !new)
222                 return;
223
224         old = pr->power.state;
225
226         if (old)
227                 old->promotion.count = 0;
228         new->demotion.count = 0;
229
230         /* Cleanup from old state. */
231         if (old) {
232                 switch (old->type) {
233                 case ACPI_STATE_C3:
234                         /* Disable bus master reload */
235                         if (new->type != ACPI_STATE_C3 && pr->flags.bm_check)
236                                 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
237                         break;
238                 }
239         }
240
241         /* Prepare to use new state. */
242         switch (new->type) {
243         case ACPI_STATE_C3:
244                 /* Enable bus master reload */
245                 if (old->type != ACPI_STATE_C3 && pr->flags.bm_check)
246                         acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
247                 break;
248         }
249
250         pr->power.state = new;
251
252         return;
253 }
254
255 static atomic_t c3_cpu_count;
256
257 /* Common C-state entry for C2, C3, .. */
258 static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
259 {
260         if (cstate->space_id == ACPI_CSTATE_FFH) {
261                 /* Call into architectural FFH based C-state */
262                 acpi_processor_ffh_cstate_enter(cstate);
263         } else {
264                 int unused;
265                 /* IO port based C-state */
266                 inb(cstate->address);
267                 /* Dummy wait op - must do something useless after P_LVL2 read
268                    because chipsets cannot guarantee that STPCLK# signal
269                    gets asserted in time to freeze execution properly. */
270                 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
271         }
272 }
273 #endif /* !CONFIG_CPU_IDLE */
274
275 #ifdef ARCH_APICTIMER_STOPS_ON_C3
276
277 /*
278  * Some BIOS implementations switch to C3 in the published C2 state.
279  * This seems to be a common problem on AMD boxen, but other vendors
280  * are affected too. We pick the most conservative approach: we assume
281  * that the local APIC stops in both C2 and C3.
282  */
283 static void acpi_timer_check_state(int state, struct acpi_processor *pr,
284                                    struct acpi_processor_cx *cx)
285 {
286         struct acpi_processor_power *pwr = &pr->power;
287         u8 type = local_apic_timer_c2_ok ? ACPI_STATE_C3 : ACPI_STATE_C2;
288
289         /*
290          * Check, if one of the previous states already marked the lapic
291          * unstable
292          */
293         if (pwr->timer_broadcast_on_state < state)
294                 return;
295
296         if (cx->type >= type)
297                 pr->power.timer_broadcast_on_state = state;
298 }
299
300 static void acpi_propagate_timer_broadcast(struct acpi_processor *pr)
301 {
302 #ifdef CONFIG_GENERIC_CLOCKEVENTS
303         unsigned long reason;
304
305         reason = pr->power.timer_broadcast_on_state < INT_MAX ?
306                 CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF;
307
308         clockevents_notify(reason, &pr->id);
309 #else
310         cpumask_t mask = cpumask_of_cpu(pr->id);
311
312         if (pr->power.timer_broadcast_on_state < INT_MAX)
313                 on_each_cpu(switch_APIC_timer_to_ipi, &mask, 1, 1);
314         else
315                 on_each_cpu(switch_ipi_to_APIC_timer, &mask, 1, 1);
316 #endif
317 }
318
319 /* Power(C) State timer broadcast control */
320 static void acpi_state_timer_broadcast(struct acpi_processor *pr,
321                                        struct acpi_processor_cx *cx,
322                                        int broadcast)
323 {
324 #ifdef CONFIG_GENERIC_CLOCKEVENTS
325
326         int state = cx - pr->power.states;
327
328         if (state >= pr->power.timer_broadcast_on_state) {
329                 unsigned long reason;
330
331                 reason = broadcast ?  CLOCK_EVT_NOTIFY_BROADCAST_ENTER :
332                         CLOCK_EVT_NOTIFY_BROADCAST_EXIT;
333                 clockevents_notify(reason, &pr->id);
334         }
335 #endif
336 }
337
338 #else
339
340 static void acpi_timer_check_state(int state, struct acpi_processor *pr,
341                                    struct acpi_processor_cx *cstate) { }
342 static void acpi_propagate_timer_broadcast(struct acpi_processor *pr) { }
343 static void acpi_state_timer_broadcast(struct acpi_processor *pr,
344                                        struct acpi_processor_cx *cx,
345                                        int broadcast)
346 {
347 }
348
349 #endif
350
351 /*
352  * Suspend / resume control
353  */
354 static int acpi_idle_suspend;
355
356 int acpi_processor_suspend(struct acpi_device * device, pm_message_t state)
357 {
358         acpi_idle_suspend = 1;
359         return 0;
360 }
361
362 int acpi_processor_resume(struct acpi_device * device)
363 {
364         acpi_idle_suspend = 0;
365         return 0;
366 }
367
368 #ifndef CONFIG_CPU_IDLE
369 static void acpi_processor_idle(void)
370 {
371         struct acpi_processor *pr = NULL;
372         struct acpi_processor_cx *cx = NULL;
373         struct acpi_processor_cx *next_state = NULL;
374         int sleep_ticks = 0;
375         u32 t1, t2 = 0;
376
377         /*
378          * Interrupts must be disabled during bus mastering calculations and
379          * for C2/C3 transitions.
380          */
381         local_irq_disable();
382
383         pr = processors[smp_processor_id()];
384         if (!pr) {
385                 local_irq_enable();
386                 return;
387         }
388
389         /*
390          * Check whether we truly need to go idle, or should
391          * reschedule:
392          */
393         if (unlikely(need_resched())) {
394                 local_irq_enable();
395                 return;
396         }
397
398         cx = pr->power.state;
399         if (!cx || acpi_idle_suspend) {
400                 if (pm_idle_save)
401                         pm_idle_save();
402                 else
403                         acpi_safe_halt();
404                 return;
405         }
406
407         /*
408          * Check BM Activity
409          * -----------------
410          * Check for bus mastering activity (if required), record, and check
411          * for demotion.
412          */
413         if (pr->flags.bm_check) {
414                 u32 bm_status = 0;
415                 unsigned long diff = jiffies - pr->power.bm_check_timestamp;
416
417                 if (diff > 31)
418                         diff = 31;
419
420                 pr->power.bm_activity <<= diff;
421
422                 acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
423                 if (bm_status) {
424                         pr->power.bm_activity |= 0x1;
425                         acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
426                 }
427                 /*
428                  * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
429                  * the true state of bus mastering activity, forcing us to
430                  * manually check the BMIDEA bit of each IDE channel.
431                  */
432                 else if (errata.piix4.bmisx) {
433                         if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01)
434                             || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01))
435                                 pr->power.bm_activity |= 0x1;
436                 }
437
438                 pr->power.bm_check_timestamp = jiffies;
439
440                 /*
441                  * If bus mastering is or was active this jiffy, demote
442                  * to avoid a faulty transition.  Note that the processor
443                  * won't enter a low-power state during this call (to this
444                  * function) but should upon the next.
445                  *
446                  * TBD: A better policy might be to fall back to the demotion
447                  *      state (use it for this quantum only) instead of
448                  *      demoting -- and rely on duration as our sole demotion
449                  *      qualification.  This may, however, introduce DMA
450                  *      issues (e.g. floppy DMA transfer overrun/underrun).
451                  */
452                 if ((pr->power.bm_activity & 0x1) &&
453                     cx->demotion.threshold.bm) {
454                         local_irq_enable();
455                         next_state = cx->demotion.state;
456                         goto end;
457                 }
458         }
459
460 #ifdef CONFIG_HOTPLUG_CPU
461         /*
462          * Check for P_LVL2_UP flag before entering C2 and above on
463          * an SMP system. We do it here instead of doing it at _CST/P_LVL
464          * detection phase, to work cleanly with logical CPU hotplug.
465          */
466         if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) &&
467             !pr->flags.has_cst && !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
468                 cx = &pr->power.states[ACPI_STATE_C1];
469 #endif
470
471         /*
472          * Sleep:
473          * ------
474          * Invoke the current Cx state to put the processor to sleep.
475          */
476         if (cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3) {
477                 current_thread_info()->status &= ~TS_POLLING;
478                 /*
479                  * TS_POLLING-cleared state must be visible before we
480                  * test NEED_RESCHED:
481                  */
482                 smp_mb();
483                 if (need_resched()) {
484                         current_thread_info()->status |= TS_POLLING;
485                         local_irq_enable();
486                         return;
487                 }
488         }
489
490         switch (cx->type) {
491
492         case ACPI_STATE_C1:
493                 /*
494                  * Invoke C1.
495                  * Use the appropriate idle routine, the one that would
496                  * be used without acpi C-states.
497                  */
498                 if (pm_idle_save)
499                         pm_idle_save();
500                 else
501                         acpi_safe_halt();
502
503                 /*
504                  * TBD: Can't get time duration while in C1, as resumes
505                  *      go to an ISR rather than here.  Need to instrument
506                  *      base interrupt handler.
507                  *
508                  * Note: the TSC better not stop in C1, sched_clock() will
509                  *       skew otherwise.
510                  */
511                 sleep_ticks = 0xFFFFFFFF;
512                 break;
513
514         case ACPI_STATE_C2:
515                 /* Get start time (ticks) */
516                 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
517                 /* Tell the scheduler that we are going deep-idle: */
518                 sched_clock_idle_sleep_event();
519                 /* Invoke C2 */
520                 acpi_state_timer_broadcast(pr, cx, 1);
521                 acpi_cstate_enter(cx);
522                 /* Get end time (ticks) */
523                 t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
524
525 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
526                 /* TSC halts in C2, so notify users */
527                 mark_tsc_unstable("possible TSC halt in C2");
528 #endif
529                 /* Compute time (ticks) that we were actually asleep */
530                 sleep_ticks = ticks_elapsed(t1, t2);
531
532                 /* Tell the scheduler how much we idled: */
533                 sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
534
535                 /* Re-enable interrupts */
536                 local_irq_enable();
537                 /* Do not account our idle-switching overhead: */
538                 sleep_ticks -= cx->latency_ticks + C2_OVERHEAD;
539
540                 current_thread_info()->status |= TS_POLLING;
541                 acpi_state_timer_broadcast(pr, cx, 0);
542                 break;
543
544         case ACPI_STATE_C3:
545                 /*
546                  * disable bus master
547                  * bm_check implies we need ARB_DIS
548                  * !bm_check implies we need cache flush
549                  * bm_control implies whether we can do ARB_DIS
550                  *
551                  * That leaves a case where bm_check is set and bm_control is
552                  * not set. In that case we cannot do much, we enter C3
553                  * without doing anything.
554                  */
555                 if (pr->flags.bm_check && pr->flags.bm_control) {
556                         if (atomic_inc_return(&c3_cpu_count) ==
557                             num_online_cpus()) {
558                                 /*
559                                  * All CPUs are trying to go to C3
560                                  * Disable bus master arbitration
561                                  */
562                                 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
563                         }
564                 } else if (!pr->flags.bm_check) {
565                         /* SMP with no shared cache... Invalidate cache  */
566                         ACPI_FLUSH_CPU_CACHE();
567                 }
568
569                 /* Get start time (ticks) */
570                 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
571                 /* Invoke C3 */
572                 acpi_state_timer_broadcast(pr, cx, 1);
573                 /* Tell the scheduler that we are going deep-idle: */
574                 sched_clock_idle_sleep_event();
575                 acpi_cstate_enter(cx);
576                 /* Get end time (ticks) */
577                 t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
578                 if (pr->flags.bm_check && pr->flags.bm_control) {
579                         /* Enable bus master arbitration */
580                         atomic_dec(&c3_cpu_count);
581                         acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
582                 }
583
584 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
585                 /* TSC halts in C3, so notify users */
586                 mark_tsc_unstable("TSC halts in C3");
587 #endif
588                 /* Compute time (ticks) that we were actually asleep */
589                 sleep_ticks = ticks_elapsed(t1, t2);
590                 /* Tell the scheduler how much we idled: */
591                 sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
592
593                 /* Re-enable interrupts */
594                 local_irq_enable();
595                 /* Do not account our idle-switching overhead: */
596                 sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
597
598                 current_thread_info()->status |= TS_POLLING;
599                 acpi_state_timer_broadcast(pr, cx, 0);
600                 break;
601
602         default:
603                 local_irq_enable();
604                 return;
605         }
606         cx->usage++;
607         if ((cx->type != ACPI_STATE_C1) && (sleep_ticks > 0))
608                 cx->time += sleep_ticks;
609
610         next_state = pr->power.state;
611
612 #ifdef CONFIG_HOTPLUG_CPU
613         /* Don't do promotion/demotion */
614         if ((cx->type == ACPI_STATE_C1) && (num_online_cpus() > 1) &&
615             !pr->flags.has_cst && !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED)) {
616                 next_state = cx;
617                 goto end;
618         }
619 #endif
620
621         /*
622          * Promotion?
623          * ----------
624          * Track the number of longs (time asleep is greater than threshold)
625          * and promote when the count threshold is reached.  Note that bus
626          * mastering activity may prevent promotions.
627          * Do not promote above max_cstate.
628          */
629         if (cx->promotion.state &&
630             ((cx->promotion.state - pr->power.states) <= max_cstate)) {
631                 if (sleep_ticks > cx->promotion.threshold.ticks &&
632                   cx->promotion.state->latency <= system_latency_constraint()) {
633                         cx->promotion.count++;
634                         cx->demotion.count = 0;
635                         if (cx->promotion.count >=
636                             cx->promotion.threshold.count) {
637                                 if (pr->flags.bm_check) {
638                                         if (!
639                                             (pr->power.bm_activity & cx->
640                                              promotion.threshold.bm)) {
641                                                 next_state =
642                                                     cx->promotion.state;
643                                                 goto end;
644                                         }
645                                 } else {
646                                         next_state = cx->promotion.state;
647                                         goto end;
648                                 }
649                         }
650                 }
651         }
652
653         /*
654          * Demotion?
655          * ---------
656          * Track the number of shorts (time asleep is less than time threshold)
657          * and demote when the usage threshold is reached.
658          */
659         if (cx->demotion.state) {
660                 if (sleep_ticks < cx->demotion.threshold.ticks) {
661                         cx->demotion.count++;
662                         cx->promotion.count = 0;
663                         if (cx->demotion.count >= cx->demotion.threshold.count) {
664                                 next_state = cx->demotion.state;
665                                 goto end;
666                         }
667                 }
668         }
669
670       end:
671         /*
672          * Demote if current state exceeds max_cstate
673          * or if the latency of the current state is unacceptable
674          */
675         if ((pr->power.state - pr->power.states) > max_cstate ||
676                 pr->power.state->latency > system_latency_constraint()) {
677                 if (cx->demotion.state)
678                         next_state = cx->demotion.state;
679         }
680
681         /*
682          * New Cx State?
683          * -------------
684          * If we're going to start using a new Cx state we must clean up
685          * from the previous and prepare to use the new.
686          */
687         if (next_state != pr->power.state)
688                 acpi_processor_power_activate(pr, next_state);
689 }
690
691 static int acpi_processor_set_power_policy(struct acpi_processor *pr)
692 {
693         unsigned int i;
694         unsigned int state_is_set = 0;
695         struct acpi_processor_cx *lower = NULL;
696         struct acpi_processor_cx *higher = NULL;
697         struct acpi_processor_cx *cx;
698
699
700         if (!pr)
701                 return -EINVAL;
702
703         /*
704          * This function sets the default Cx state policy (OS idle handler).
705          * Our scheme is to promote quickly to C2 but more conservatively
706          * to C3.  We're favoring C2  for its characteristics of low latency
707          * (quick response), good power savings, and ability to allow bus
708          * mastering activity.  Note that the Cx state policy is completely
709          * customizable and can be altered dynamically.
710          */
711
712         /* startup state */
713         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
714                 cx = &pr->power.states[i];
715                 if (!cx->valid)
716                         continue;
717
718                 if (!state_is_set)
719                         pr->power.state = cx;
720                 state_is_set++;
721                 break;
722         }
723
724         if (!state_is_set)
725                 return -ENODEV;
726
727         /* demotion */
728         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
729                 cx = &pr->power.states[i];
730                 if (!cx->valid)
731                         continue;
732
733                 if (lower) {
734                         cx->demotion.state = lower;
735                         cx->demotion.threshold.ticks = cx->latency_ticks;
736                         cx->demotion.threshold.count = 1;
737                         if (cx->type == ACPI_STATE_C3)
738                                 cx->demotion.threshold.bm = bm_history;
739                 }
740
741                 lower = cx;
742         }
743
744         /* promotion */
745         for (i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i--) {
746                 cx = &pr->power.states[i];
747                 if (!cx->valid)
748                         continue;
749
750                 if (higher) {
751                         cx->promotion.state = higher;
752                         cx->promotion.threshold.ticks = cx->latency_ticks;
753                         if (cx->type >= ACPI_STATE_C2)
754                                 cx->promotion.threshold.count = 4;
755                         else
756                                 cx->promotion.threshold.count = 10;
757                         if (higher->type == ACPI_STATE_C3)
758                                 cx->promotion.threshold.bm = bm_history;
759                 }
760
761                 higher = cx;
762         }
763
764         return 0;
765 }
766 #endif /* !CONFIG_CPU_IDLE */
767
768 static int acpi_processor_get_power_info_fadt(struct acpi_processor *pr)
769 {
770
771         if (!pr)
772                 return -EINVAL;
773
774         if (!pr->pblk)
775                 return -ENODEV;
776
777         /* if info is obtained from pblk/fadt, type equals state */
778         pr->power.states[ACPI_STATE_C2].type = ACPI_STATE_C2;
779         pr->power.states[ACPI_STATE_C3].type = ACPI_STATE_C3;
780
781 #ifndef CONFIG_HOTPLUG_CPU
782         /*
783          * Check for P_LVL2_UP flag before entering C2 and above on
784          * an SMP system.
785          */
786         if ((num_online_cpus() > 1) &&
787             !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
788                 return -ENODEV;
789 #endif
790
791         /* determine C2 and C3 address from pblk */
792         pr->power.states[ACPI_STATE_C2].address = pr->pblk + 4;
793         pr->power.states[ACPI_STATE_C3].address = pr->pblk + 5;
794
795         /* determine latencies from FADT */
796         pr->power.states[ACPI_STATE_C2].latency = acpi_gbl_FADT.C2latency;
797         pr->power.states[ACPI_STATE_C3].latency = acpi_gbl_FADT.C3latency;
798
799         ACPI_DEBUG_PRINT((ACPI_DB_INFO,
800                           "lvl2[0x%08x] lvl3[0x%08x]\n",
801                           pr->power.states[ACPI_STATE_C2].address,
802                           pr->power.states[ACPI_STATE_C3].address));
803
804         return 0;
805 }
806
807 static int acpi_processor_get_power_info_default(struct acpi_processor *pr)
808 {
809         if (!pr->power.states[ACPI_STATE_C1].valid) {
810                 /* set the first C-State to C1 */
811                 /* all processors need to support C1 */
812                 pr->power.states[ACPI_STATE_C1].type = ACPI_STATE_C1;
813                 pr->power.states[ACPI_STATE_C1].valid = 1;
814         }
815         /* the C0 state only exists as a filler in our array */
816         pr->power.states[ACPI_STATE_C0].valid = 1;
817         return 0;
818 }
819
820 static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
821 {
822         acpi_status status = 0;
823         acpi_integer count;
824         int current_count;
825         int i;
826         struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
827         union acpi_object *cst;
828
829
830         if (nocst)
831                 return -ENODEV;
832
833         current_count = 0;
834
835         status = acpi_evaluate_object(pr->handle, "_CST", NULL, &buffer);
836         if (ACPI_FAILURE(status)) {
837                 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No _CST, giving up\n"));
838                 return -ENODEV;
839         }
840
841         cst = buffer.pointer;
842
843         /* There must be at least 2 elements */
844         if (!cst || (cst->type != ACPI_TYPE_PACKAGE) || cst->package.count < 2) {
845                 printk(KERN_ERR PREFIX "not enough elements in _CST\n");
846                 status = -EFAULT;
847                 goto end;
848         }
849
850         count = cst->package.elements[0].integer.value;
851
852         /* Validate number of power states. */
853         if (count < 1 || count != cst->package.count - 1) {
854                 printk(KERN_ERR PREFIX "count given by _CST is not valid\n");
855                 status = -EFAULT;
856                 goto end;
857         }
858
859         /* Tell driver that at least _CST is supported. */
860         pr->flags.has_cst = 1;
861
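                /*
                 * Per the ACPI spec, each _CST entry is a 4-element package:
                 * { Register (Buffer), Type (Integer), worst-case Latency in
                 * us (Integer), average Power in mW (Integer) }.
                 */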
862         for (i = 1; i <= count; i++) {
863                 union acpi_object *element;
864                 union acpi_object *obj;
865                 struct acpi_power_register *reg;
866                 struct acpi_processor_cx cx;
867
868                 memset(&cx, 0, sizeof(cx));
869
870                 element = &(cst->package.elements[i]);
871                 if (element->type != ACPI_TYPE_PACKAGE)
872                         continue;
873
874                 if (element->package.count != 4)
875                         continue;
876
877                 obj = &(element->package.elements[0]);
878
879                 if (obj->type != ACPI_TYPE_BUFFER)
880                         continue;
881
882                 reg = (struct acpi_power_register *)obj->buffer.pointer;
883
884                 if (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO &&
885                     (reg->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE))
886                         continue;
887
888                 /* There should be an easy way to extract an integer... */
889                 obj = &(element->package.elements[1]);
890                 if (obj->type != ACPI_TYPE_INTEGER)
891                         continue;
892
893                 cx.type = obj->integer.value;
894                 /*
895                  * Some buggy BIOSes won't list C1 in _CST -
896                  * Let acpi_processor_get_power_info_default() handle them later
897                  */
898                 if (i == 1 && cx.type != ACPI_STATE_C1)
899                         current_count++;
900
901                 cx.address = reg->address;
902                 cx.index = current_count + 1;
903
904                 cx.space_id = ACPI_CSTATE_SYSTEMIO;
905                 if (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
906                         if (acpi_processor_ffh_cstate_probe
907                                         (pr->id, &cx, reg) == 0) {
908                                 cx.space_id = ACPI_CSTATE_FFH;
909                         } else if (cx.type != ACPI_STATE_C1) {
910                                 /*
911                                  * C1 is a special case where FIXED_HARDWARE
912                                  * can be handled in non-MWAIT way as well.
913                                  * In that case, save this _CST entry info.
914                                  * That is, we retain space_id of SYSTEM_IO for
915                                  * halt based C1.
916                                  * Otherwise, ignore this info and continue.
917                                  */
918                                 continue;
919                         }
920                 }
921
922                 obj = &(element->package.elements[2]);
923                 if (obj->type != ACPI_TYPE_INTEGER)
924                         continue;
925
926                 cx.latency = obj->integer.value;
927
928                 obj = &(element->package.elements[3]);
929                 if (obj->type != ACPI_TYPE_INTEGER)
930                         continue;
931
932                 cx.power = obj->integer.value;
933
934                 current_count++;
935                 memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx));
936
937                 /*
938                  * We support a total of ACPI_PROCESSOR_MAX_POWER - 1 states
939                  * (indices 1 through ACPI_PROCESSOR_MAX_POWER - 1)
940                  */
941                 if (current_count >= (ACPI_PROCESSOR_MAX_POWER - 1)) {
942                         printk(KERN_WARNING
943                                "Limiting number of power states to max (%d)\n",
944                                ACPI_PROCESSOR_MAX_POWER);
945                         printk(KERN_WARNING
946                                "Please increase ACPI_PROCESSOR_MAX_POWER if needed.\n");
947                         break;
948                 }
949         }
950
951         ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d power states\n",
952                           current_count));
953
954         /* Validate number of power states discovered */
955         if (current_count < 2)
956                 status = -EFAULT;
957
958       end:
959         kfree(buffer.pointer);
960
961         return status;
962 }
963
964 static void acpi_processor_power_verify_c2(struct acpi_processor_cx *cx)
965 {
966
967         if (!cx->address)
968                 return;
969
970         /*
971          * C2 latency must be less than or equal to 100
972          * microseconds.
973          */
974         else if (cx->latency > ACPI_PROCESSOR_MAX_C2_LATENCY) {
975                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
976                                   "latency too large [%d]\n", cx->latency));
977                 return;
978         }
979
980         /*
981          * Otherwise we've met all of our C2 requirements.
982          * Normalize the C2 latency to expedite policy
983          */
984         cx->valid = 1;
985
986 #ifndef CONFIG_CPU_IDLE
987         cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
988 #else
989         cx->latency_ticks = cx->latency;
990 #endif
991
992         return;
993 }
994
995 static void acpi_processor_power_verify_c3(struct acpi_processor *pr,
996                                            struct acpi_processor_cx *cx)
997 {
998         static int bm_check_flag;
999
1000
1001         if (!cx->address)
1002                 return;
1003
1004         /*
1005          * C3 latency must be less than or equal to 1000
1006          * microseconds.
1007          */
1008         else if (cx->latency > ACPI_PROCESSOR_MAX_C3_LATENCY) {
1009                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1010                                   "latency too large [%d]\n", cx->latency));
1011                 return;
1012         }
1013
1014         /*
1015          * PIIX4 Erratum #18: We don't support C3 when Type-F (fast)
1016          * DMA transfers are used by any ISA device to avoid livelock.
1017          * Note that we could disable Type-F DMA (as recommended by
1018          * the erratum), but this is known to disrupt certain ISA
1019          * devices thus we take the conservative approach.
1020          */
1021         else if (errata.piix4.fdma) {
1022                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1023                                   "C3 not supported on PIIX4 with Type-F DMA\n"));
1024                 return;
1025         }
1026
1027         /* All the logic here assumes flags.bm_check is the same across all CPUs */
1028         if (!bm_check_flag) {
1029                 /* Determine whether bm_check is needed based on CPU  */
1030                 acpi_processor_power_init_bm_check(&(pr->flags), pr->id);
1031                 bm_check_flag = pr->flags.bm_check;
1032         } else {
1033                 pr->flags.bm_check = bm_check_flag;
1034         }
1035
1036         if (pr->flags.bm_check) {
1037                 if (!pr->flags.bm_control) {
1038                         if (pr->flags.has_cst != 1) {
1039                                 /* bus mastering control is necessary */
1040                                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1041                                         "C3 support requires BM control\n"));
1042                                 return;
1043                         } else {
1044                                 /* Here we enter C3 without bus mastering */
1045                                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1046                                         "C3 support without BM control\n"));
1047                         }
1048                 }
1049         } else {
1050                 /*
1051                  * WBINVD must be set in the FADT for the C3 state to be
1052                  * supported when bm_check is not required.
1053                  */
1054                 if (!(acpi_gbl_FADT.flags & ACPI_FADT_WBINVD)) {
1055                         ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1056                                           "Cache invalidation should work properly"
1057                                           " for C3 to be enabled on SMP systems\n"));
1058                         return;
1059                 }
1060                 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
1061         }
1062
1063         /*
1064          * Otherwise we've met all of our C3 requirements.
1065          * Normalize the C3 latency to expedite policy.  Enable
1066          * checking of bus mastering status (bm_check) so we can
1067          * use this in our C3 policy
1068          */
1069         cx->valid = 1;
1070
1071 #ifndef CONFIG_CPU_IDLE
1072         cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
1073 #else
1074         cx->latency_ticks = cx->latency;
1075 #endif
1076
1077         return;
1078 }
1079
1080 static int acpi_processor_power_verify(struct acpi_processor *pr)
1081 {
1082         unsigned int i;
1083         unsigned int working = 0;
1084
1085         pr->power.timer_broadcast_on_state = INT_MAX;
1086
1087         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
1088                 struct acpi_processor_cx *cx = &pr->power.states[i];
1089
1090                 switch (cx->type) {
1091                 case ACPI_STATE_C1:
1092                         cx->valid = 1;
1093                         break;
1094
1095                 case ACPI_STATE_C2:
1096                         acpi_processor_power_verify_c2(cx);
1097                         if (cx->valid)
1098                                 acpi_timer_check_state(i, pr, cx);
1099                         break;
1100
1101                 case ACPI_STATE_C3:
1102                         acpi_processor_power_verify_c3(pr, cx);
1103                         if (cx->valid)
1104                                 acpi_timer_check_state(i, pr, cx);
1105                         break;
1106                 }
1107
1108                 if (cx->valid)
1109                         working++;
1110         }
1111
1112         acpi_propagate_timer_broadcast(pr);
1113
1114         return (working);
1115 }
1116
1117 static int acpi_processor_get_power_info(struct acpi_processor *pr)
1118 {
1119         unsigned int i;
1120         int result;
1121
1122
1123         /* NOTE: the idle thread may not be running while calling
1124          * this function */
1125
1126         /* Zero initialize all the C-states info. */
1127         memset(pr->power.states, 0, sizeof(pr->power.states));
1128
1129         result = acpi_processor_get_power_info_cst(pr);
1130         if (result == -ENODEV)
1131                 result = acpi_processor_get_power_info_fadt(pr);
1132
1133         if (result)
1134                 return result;
1135
1136         acpi_processor_get_power_info_default(pr);
1137
1138         pr->power.count = acpi_processor_power_verify(pr);
1139
1140 #ifndef CONFIG_CPU_IDLE
1141         /*
1142          * Set Default Policy
1143          * ------------------
1144          * Now that we know which states are supported, set the default
1145          * policy.  Note that this policy can be changed dynamically
1146          * (e.g. encourage deeper sleeps to conserve battery life when
1147          * not on AC).
1148          */
1149         result = acpi_processor_set_power_policy(pr);
1150         if (result)
1151                 return result;
1152 #endif
1153
1154         /*
1155          * if one state of type C2 or C3 is available, mark this
1156          * CPU as being "idle manageable"
1157          */
1158         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
1159                 if (pr->power.states[i].valid) {
1160                         pr->power.count = i;
1161                         if (pr->power.states[i].type >= ACPI_STATE_C2)
1162                                 pr->flags.power = 1;
1163                 }
1164         }
1165
1166         return 0;
1167 }
1168
1169 static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset)
1170 {
1171         struct acpi_processor *pr = seq->private;
1172         unsigned int i;
1173
1174
1175         if (!pr)
1176                 goto end;
1177
1178         seq_printf(seq, "active state:            C%zd\n"
1179                    "max_cstate:              C%d\n"
1180                    "bus master activity:     %08x\n"
1181                    "maximum allowed latency: %d usec\n",
1182                    pr->power.state ? pr->power.state - pr->power.states : 0,
1183                    max_cstate, (unsigned)pr->power.bm_activity,
1184                    system_latency_constraint());
1185
1186         seq_puts(seq, "states:\n");
1187
1188         for (i = 1; i <= pr->power.count; i++) {
1189                 seq_printf(seq, "   %cC%d:                  ",
1190                            (&pr->power.states[i] ==
1191                             pr->power.state ? '*' : ' '), i);
1192
1193                 if (!pr->power.states[i].valid) {
1194                         seq_puts(seq, "<not supported>\n");
1195                         continue;
1196                 }
1197
1198                 switch (pr->power.states[i].type) {
1199                 case ACPI_STATE_C1:
1200                         seq_printf(seq, "type[C1] ");
1201                         break;
1202                 case ACPI_STATE_C2:
1203                         seq_printf(seq, "type[C2] ");
1204                         break;
1205                 case ACPI_STATE_C3:
1206                         seq_printf(seq, "type[C3] ");
1207                         break;
1208                 default:
1209                         seq_printf(seq, "type[--] ");
1210                         break;
1211                 }
1212
1213                 if (pr->power.states[i].promotion.state)
1214                         seq_printf(seq, "promotion[C%zd] ",
1215                                    (pr->power.states[i].promotion.state -
1216                                     pr->power.states));
1217                 else
1218                         seq_puts(seq, "promotion[--] ");
1219
1220                 if (pr->power.states[i].demotion.state)
1221                         seq_printf(seq, "demotion[C%zd] ",
1222                                    (pr->power.states[i].demotion.state -
1223                                     pr->power.states));
1224                 else
1225                         seq_puts(seq, "demotion[--] ");
1226
1227                 seq_printf(seq, "latency[%03d] usage[%08d] duration[%020llu]\n",
1228                            pr->power.states[i].latency,
1229                            pr->power.states[i].usage,
1230                            (unsigned long long)pr->power.states[i].time);
1231         }
1232
1233       end:
1234         return 0;
1235 }
1236
1237 static int acpi_processor_power_open_fs(struct inode *inode, struct file *file)
1238 {
1239         return single_open(file, acpi_processor_power_seq_show,
1240                            PDE(inode)->data);
1241 }
1242
1243 static const struct file_operations acpi_processor_power_fops = {
1244         .open = acpi_processor_power_open_fs,
1245         .read = seq_read,
1246         .llseek = seq_lseek,
1247         .release = single_release,
1248 };
1249
1250 #ifndef CONFIG_CPU_IDLE
1251
1252 int acpi_processor_cst_has_changed(struct acpi_processor *pr)
1253 {
1254         int result = 0;
1255
1256
1257         if (!pr)
1258                 return -EINVAL;
1259
1260         if (nocst) {
1261                 return -ENODEV;
1262         }
1263
1264         if (!pr->flags.power_setup_done)
1265                 return -ENODEV;
1266
1267         /* Fall back to the default idle loop */
1268         pm_idle = pm_idle_save;
1269         synchronize_sched();    /* Relies on interrupts forcing exit from idle. */
1270
1271         pr->flags.power = 0;
1272         result = acpi_processor_get_power_info(pr);
1273         if ((pr->flags.power == 1) && (pr->flags.power_setup_done))
1274                 pm_idle = acpi_processor_idle;
1275
1276         return result;
1277 }
1278
1279 #ifdef CONFIG_SMP
1280 static void smp_callback(void *v)
1281 {
1282         /* we already woke the CPU up, nothing more to do */
1283 }
1284
1285 /*
1286  * This function gets called when a part of the kernel has a new latency
1287  * requirement.  This means we need to get all processors out of their C-state,
1288  * and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that
1289  * wakes them all right up.
1290  */
1291 static int acpi_processor_latency_notify(struct notifier_block *b,
1292                 unsigned long l, void *v)
1293 {
1294         smp_call_function(smp_callback, NULL, 0, 1);
1295         return NOTIFY_OK;
1296 }
1297
1298 static struct notifier_block acpi_processor_latency_notifier = {
1299         .notifier_call = acpi_processor_latency_notify,
1300 };
1301
1302 #endif
1303
1304 #else /* CONFIG_CPU_IDLE */
1305
1306 /**
1307  * acpi_idle_bm_check - checks if bus master activity was detected
1308  */
1309 static int acpi_idle_bm_check(void)
1310 {
1311         u32 bm_status = 0;
1312
1313         acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
1314         if (bm_status)
1315                 acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
1316         /*
1317          * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
1318          * the true state of bus mastering activity, forcing us to
1319          * manually check the BMIDEA bit of each IDE channel.
1320          */
1321         else if (errata.piix4.bmisx) {
1322                 if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01)
1323                     || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01))
1324                         bm_status = 1;
1325         }
1326         return bm_status;
1327 }
1328
1329 /**
1330  * acpi_idle_update_bm_rld - updates the BM_RLD bit depending on target state
1331  * @pr: the processor
1332  * @target: the new target state
1333  */
1334 static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr,
1335                                            struct acpi_processor_cx *target)
1336 {
1337         if (pr->flags.bm_rld_set && target->type != ACPI_STATE_C3) {
1338                 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
1339                 pr->flags.bm_rld_set = 0;
1340         }
1341
1342         if (!pr->flags.bm_rld_set && target->type == ACPI_STATE_C3) {
1343                 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
1344                 pr->flags.bm_rld_set = 1;
1345         }
1346 }
1347
1348 /**
1349  * acpi_idle_do_entry - a helper function that does C2 and C3 type entry
1350  * @cx: cstate data
1351  */
1352 static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
1353 {
1354         if (cx->space_id == ACPI_CSTATE_FFH) {
1355                 /* Call into architectural FFH based C-state */
1356                 acpi_processor_ffh_cstate_enter(cx);
1357         } else {
1358                 int unused;
1359                 /* IO port based C-state */
1360                 inb(cx->address);
1361                 /* Dummy wait op - must do something useless after P_LVL2 read
1362                    because chipsets cannot guarantee that STPCLK# signal
1363                    gets asserted in time to freeze execution properly. */
1364                 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
1365         }
1366 }
1367
1368 /**
1369  * acpi_idle_enter_c1 - enters an ACPI C1 state-type
1370  * @dev: the target CPU
1371  * @state: the state data
1372  *
1373  * This is equivalent to the HALT instruction.
1374  */
1375 static int acpi_idle_enter_c1(struct cpuidle_device *dev,
1376                               struct cpuidle_state *state)
1377 {
1378         struct acpi_processor *pr;
1379         struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
1380         pr = processors[smp_processor_id()];
1381
1382         if (unlikely(!pr))
1383                 return 0;
1384
1385         if (pr->flags.bm_check)
1386                 acpi_idle_update_bm_rld(pr, cx);
1387
1388         acpi_safe_halt();
1389
1390         cx->usage++;
1391
1392         return 0;
1393 }
1394
1395 /**
1396  * acpi_idle_enter_simple - enters an ACPI state without BM handling
1397  * @dev: the target CPU
1398  * @state: the state data
1399  */
1400 static int acpi_idle_enter_simple(struct cpuidle_device *dev,
1401                                   struct cpuidle_state *state)
1402 {
1403         struct acpi_processor *pr;
1404         struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
1405         u32 t1, t2;
1406         int sleep_ticks = 0;
1407
1408         pr = processors[smp_processor_id()];
1409
1410         if (unlikely(!pr))
1411                 return 0;
1412
1413         if (acpi_idle_suspend)
1414                 return(acpi_idle_enter_c1(dev, state));
1415
1416         if (pr->flags.bm_check)
1417                 acpi_idle_update_bm_rld(pr, cx);
1418
1419         local_irq_disable();
1420         current_thread_info()->status &= ~TS_POLLING;
1421         /*
1422          * TS_POLLING-cleared state must be visible before we test
1423          * NEED_RESCHED:
1424          */
1425         smp_mb();
1426
1427         if (unlikely(need_resched())) {
1428                 current_thread_info()->status |= TS_POLLING;
1429                 local_irq_enable();
1430                 return 0;
1431         }
1432
1433         if (cx->type == ACPI_STATE_C3)
1434                 ACPI_FLUSH_CPU_CACHE();
1435
1436         t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1437         /* Tell the scheduler that we are going deep-idle: */
1438         sched_clock_idle_sleep_event();
1439         acpi_state_timer_broadcast(pr, cx, 1);
1440         acpi_idle_do_entry(cx);
1441         t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1442
1443 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
1444         /* TSC could halt in idle, so notify users */
1445         mark_tsc_unstable("TSC halts in idle");
1446 #endif
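     /*
      * ticks_elapsed() (defined earlier in this file) returns the delta
      * between the two PM timer reads, taking timer wraparound into
      * account; the result is in PM timer ticks (~3.58 MHz) and is
      * converted to nanoseconds below via PM_TIMER_TICK_NS.
      */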
1447         sleep_ticks = ticks_elapsed(t1, t2);
1448
1449         /* Tell the scheduler how much we idled: */
1450         sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
1451
1452         local_irq_enable();
1453         current_thread_info()->status |= TS_POLLING;
1454
1455         cx->usage++;
1456
1457         acpi_state_timer_broadcast(pr, cx, 0);
1458         cx->time += sleep_ticks;
1459         return ticks_elapsed_in_us(t1, t2);
1460 }
1461
1462 static int c3_cpu_count;
1463 static DEFINE_SPINLOCK(c3_lock);
1464
1465 /**
1466  * acpi_idle_enter_bm - enters C3 with proper BM handling
1467  * @dev: the target CPU
1468  * @state: the state data
1469  *
1470  * If BM is detected, the deepest non-C3 idle state is entered instead.
1471  */
1472 static int acpi_idle_enter_bm(struct cpuidle_device *dev,
1473                               struct cpuidle_state *state)
1474 {
1475         struct acpi_processor *pr;
1476         struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
1477         u32 t1, t2;
1478         int sleep_ticks = 0;
1479
1480         pr = processors[smp_processor_id()];
1481
1482         if (unlikely(!pr))
1483                 return 0;
1484
1485         if (acpi_idle_suspend)
1486                 return acpi_idle_enter_c1(dev, state);
1487
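     /*
      * Bus master activity makes C3 unattractive (and unsafe on some
      * chipsets), so fall back to the shallower state recorded in
      * dev->safe_state by acpi_processor_setup_cpuidle(), or to a plain
      * halt if no such state exists.
      */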
1488         if (acpi_idle_bm_check()) {
1489                 if (dev->safe_state) {
1490                         return dev->safe_state->enter(dev, dev->safe_state);
1491                 } else {
1492                         acpi_safe_halt();
1493                         return 0;
1494                 }
1495         }
1496
1497         local_irq_disable();
1498         current_thread_info()->status &= ~TS_POLLING;
1499         /*
1500          * TS_POLLING-cleared state must be visible before we test
1501          * NEED_RESCHED:
1502          */
1503         smp_mb();
1504
1505         if (unlikely(need_resched())) {
1506                 current_thread_info()->status |= TS_POLLING;
1507                 local_irq_enable();
1508                 return 0;
1509         }
1510
1511         /* Tell the scheduler that we are going deep-idle: */
1512         sched_clock_idle_sleep_event();
1513         /*
1514          * Must be done before the bus master disable, as we might need
1515          * to access the HPET!
1516          */
1517         acpi_state_timer_broadcast(pr, cx, 1);
1518
1519         acpi_idle_update_bm_rld(pr, cx);
1520
1521         /*
1522          * Disable bus master arbitration:
1523          * bm_check set means we need ARB_DIS,
1524          * bm_check clear means we need a cache flush instead,
1525          * bm_control indicates whether we are allowed to do ARB_DIS.
1526          *
1527          * That leaves the case where bm_check is set but bm_control is
1528          * not: there we cannot do much, so we enter C3 without doing
1529          * anything extra.
1530          */
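     /*
      * Illustrative summary of the cases handled below:
      *
      *   bm_check  bm_control   action before entering C3
      *      0          -        flush CPU caches
      *      1          1        count CPUs; ARB_DIS once all are in C3
      *      1          0        nothing extra
      */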
1531         if (pr->flags.bm_check && pr->flags.bm_control) {
1532                 spin_lock(&c3_lock);
1533                 c3_cpu_count++;
1534                 /* Disable bus master arbitration when all CPUs are in C3 */
1535                 if (c3_cpu_count == num_online_cpus())
1536                         acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
1537                 spin_unlock(&c3_lock);
1538         } else if (!pr->flags.bm_check) {
1539                 ACPI_FLUSH_CPU_CACHE();
1540         }
1541
1542         t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1543         acpi_idle_do_entry(cx);
1544         t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1545
1546         /* Re-enable bus master arbitration */
1547         if (pr->flags.bm_check && pr->flags.bm_control) {
1548                 spin_lock(&c3_lock);
1549                 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
1550                 c3_cpu_count--;
1551                 spin_unlock(&c3_lock);
1552         }
1553
1554 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
1555         /* TSC could halt in idle, so notify users */
1556         mark_tsc_unstable("TSC halts in idle");
1557 #endif
1558         sleep_ticks = ticks_elapsed(t1, t2);
1559         /* Tell the scheduler how much we idled: */
1560         sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
1561
1562         local_irq_enable();
1563         current_thread_info()->status |= TS_POLLING;
1564
1565         cx->usage++;
1566
1567         acpi_state_timer_broadcast(pr, cx, 0);
1568         cx->time += sleep_ticks;
1569         return ticks_elapsed_in_us(t1, t2);
1570 }
1571
1572 struct cpuidle_driver acpi_idle_driver = {
1573         .name =         "acpi_idle",
1574         .owner =        THIS_MODULE,
1575 };
1576
1577 /**
1578  * acpi_processor_setup_cpuidle - prepares and configures CPUIDLE
1579  * @pr: the ACPI processor
1580  */
1581 static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
1582 {
1583         int i, count = 0;
1584         struct acpi_processor_cx *cx;
1585         struct cpuidle_state *state;
1586         struct cpuidle_device *dev = &pr->power.dev;
1587
1588         if (!pr->flags.power_setup_done)
1589                 return -EINVAL;
1590
1591         if (pr->flags.power == 0)
1592                 return -EINVAL;
1594
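     /*
      * states[0] is C0, which is not an idle state; start at C1 and
      * honour the max_cstate module parameter.
      */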
1595         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) {
1596                 cx = &pr->power.states[i];
1597                 state = &dev->states[count];
1598
1599                 if (!cx->valid)
1600                         continue;
1601
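             /*
              * With CPU hotplug enabled, trust states deeper than C1 on SMP
              * only if they came from _CST or if the FADT marks C2 as safe
              * for multiprocessor configurations.
              */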
1602 #ifdef CONFIG_HOTPLUG_CPU
1603                 if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) &&
1604                     !pr->flags.has_cst &&
1605                     !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
1606                         continue;
1607 #endif
1608                 cpuidle_set_statedata(state, cx);
1609
1610                 snprintf(state->name, CPUIDLE_NAME_LEN, "C%d", i);
1611                 state->exit_latency = cx->latency;
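             /*
              * Heuristic, not taken from the ACPI tables: assume the state
              * is only worth entering if we expect to stay in it for about
              * six times its exit latency.
              */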
1612                 state->target_residency = cx->latency * 6;
1613                 state->power_usage = cx->power;
1614
1615                 state->flags = 0;
1616                 switch (cx->type) {
1617                 case ACPI_STATE_C1:
1618                         state->flags |= CPUIDLE_FLAG_SHALLOW;
1619                         state->enter = acpi_idle_enter_c1;
1620                         dev->safe_state = state;
1621                         break;
1622
1623                 case ACPI_STATE_C2:
1624                         state->flags |= CPUIDLE_FLAG_BALANCED;
1625                         state->flags |= CPUIDLE_FLAG_TIME_VALID;
1626                         state->enter = acpi_idle_enter_simple;
1627                         dev->safe_state = state;
1628                         break;
1629
1630                 case ACPI_STATE_C3:
1631                         state->flags |= CPUIDLE_FLAG_DEEP;
1632                         state->flags |= CPUIDLE_FLAG_TIME_VALID;
1633                         state->flags |= CPUIDLE_FLAG_CHECK_BM;
1634                         state->enter = pr->flags.bm_check ?
1635                                        acpi_idle_enter_bm :
1636                                        acpi_idle_enter_simple;
1637                         break;
1638                 }
1639
1640                 count++;
1641         }
1642
1643         dev->state_count = count;
1644
1645         if (!count)
1646                 return -EINVAL;
1647
1648         return 0;
1649 }
1650
1651 int acpi_processor_cst_has_changed(struct acpi_processor *pr)
1652 {
1653         int ret;
1654
1655         if (!pr)
1656                 return -EINVAL;
1657
1658         if (nocst)
1659                 return -ENODEV;
1661
1662         if (!pr->flags.power_setup_done)
1663                 return -ENODEV;
1664
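     /*
      * _CST can change at run time (the platform sends a notification,
      * e.g. on AC/battery transitions), so re-read the power info and
      * rebuild the cpuidle state table while cpuidle is paused.
      */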
1665         cpuidle_pause_and_lock();
1666         cpuidle_disable_device(&pr->power.dev);
1667         acpi_processor_get_power_info(pr);
1668         acpi_processor_setup_cpuidle(pr);
1669         ret = cpuidle_enable_device(&pr->power.dev);
1670         cpuidle_resume_and_unlock();
1671
1672         return ret;
1673 }
1674
1675 #endif /* CONFIG_CPU_IDLE */
1676
1677 int __cpuinit acpi_processor_power_init(struct acpi_processor *pr,
1678                               struct acpi_device *device)
1679 {
1680         acpi_status status = 0;
1681         static int first_run;
1682         struct proc_dir_entry *entry = NULL;
1683         unsigned int i;
1684
1686         if (!first_run) {
1687                 dmi_check_system(processor_power_dmi_table);
1688                 if (max_cstate < ACPI_C_STATES_MAX)
1689                         printk(KERN_NOTICE
1690                                "ACPI: processor limited to max C-state %d\n",
1691                                max_cstate);
1692                 first_run++;
1693 #if !defined (CONFIG_CPU_IDLE) && defined (CONFIG_SMP)
1694                 register_latency_notifier(&acpi_processor_latency_notifier);
1695 #endif
1696         }
1697
1698         if (!pr)
1699                 return -EINVAL;
1700
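     /*
      * If the FADT supplies a _CST enable value, write it to the SMI
      * command port to tell the firmware that the OS supports _CST;
      * firmware typically exposes the full set of C-states only after
      * this handshake.
      */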
1701         if (acpi_gbl_FADT.cst_control && !nocst) {
1702                 status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
1703                                             acpi_gbl_FADT.cst_control, 8);
1704                 if (ACPI_FAILURE(status)) {
1705                         ACPI_EXCEPTION((AE_INFO, status,
1706                                         "Notifying BIOS of _CST ability failed"));
1707                 }
1708         }
1709
1710         acpi_processor_get_power_info(pr);
1711         pr->flags.power_setup_done = 1;
1712
1713         /*
1714          * Install the idle handler if processor power management is supported.
1715          * Note that the previously set idle handler will still be used
1716          * on platforms that only support C1.
1717          */
1718         if ((pr->flags.power) && (!boot_option_idle_override)) {
1719 #ifdef CONFIG_CPU_IDLE
1720                 acpi_processor_setup_cpuidle(pr);
1721                 pr->power.dev.cpu = pr->id;
1722                 if (cpuidle_register_device(&pr->power.dev))
1723                         return -EIO;
1724 #endif
1725
1726                 printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id);
1727                 for (i = 1; i <= pr->power.count; i++)
1728                         if (pr->power.states[i].valid)
1729                                 printk(" C%d[C%d]", i,
1730                                        pr->power.states[i].type);
1731                 printk(")\n");
1732
1733 #ifndef CONFIG_CPU_IDLE
1734                 if (pr->id == 0) {
1735                         pm_idle_save = pm_idle;
1736                         pm_idle = acpi_processor_idle;
1737                 }
1738 #endif
1739         }
1740
1741         /* 'power' [R] */
1742         entry = create_proc_entry(ACPI_PROCESSOR_FILE_POWER,
1743                                   S_IRUGO, acpi_device_dir(device));
1744         if (!entry)
1745                 return -EIO;
1746
1747         entry->proc_fops = &acpi_processor_power_fops;
1748         entry->data = acpi_driver_data(device);
1749         entry->owner = THIS_MODULE;
1751
1752         return 0;
1753 }
1754
1755 int acpi_processor_power_exit(struct acpi_processor *pr,
1756                               struct acpi_device *device)
1757 {
1758 #ifdef CONFIG_CPU_IDLE
1759         if ((pr->flags.power) && (!boot_option_idle_override))
1760                 cpuidle_unregister_device(&pr->power.dev);
1761 #endif
1762         pr->flags.power_setup_done = 0;
1763
1764         if (acpi_device_dir(device))
1765                 remove_proc_entry(ACPI_PROCESSOR_FILE_POWER,
1766                                   acpi_device_dir(device));
1767
1768 #ifndef CONFIG_CPU_IDLE
1769
1770         /* Unregister the idle handler when processor #0 is removed. */
1771         if (pr->id == 0) {
1772                 pm_idle = pm_idle_save;
1773
1774                 /*
1775                  * We are about to unload the current idle thread pm callback
1776          * (pm_idle); wait for all processors to update cached/local
1777                  * copies of pm_idle before proceeding.
1778                  */
1779                 cpu_idle_wait();
1780 #ifdef CONFIG_SMP
1781                 unregister_latency_notifier(&acpi_processor_latency_notifier);
1782 #endif
1783         }
1784 #endif
1785
1786         return 0;
1787 }