igb: update version number and copyright dates
[linux-2.6] / drivers / acpi / processor_idle.c
1 /*
2  * processor_idle - idle state submodule to the ACPI processor driver
3  *
4  *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5  *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6  *  Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
7  *  Copyright (C) 2004  Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
8  *                      - Added processor hotplug support
9  *  Copyright (C) 2005  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
10  *                      - Added support for C3 on SMP
11  *
12  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13  *
14  *  This program is free software; you can redistribute it and/or modify
15  *  it under the terms of the GNU General Public License as published by
16  *  the Free Software Foundation; either version 2 of the License, or (at
17  *  your option) any later version.
18  *
19  *  This program is distributed in the hope that it will be useful, but
20  *  WITHOUT ANY WARRANTY; without even the implied warranty of
21  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22  *  General Public License for more details.
23  *
24  *  You should have received a copy of the GNU General Public License along
25  *  with this program; if not, write to the Free Software Foundation, Inc.,
26  *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27  *
28  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
29  */
30
31 #include <linux/kernel.h>
32 #include <linux/module.h>
33 #include <linux/init.h>
34 #include <linux/cpufreq.h>
35 #include <linux/proc_fs.h>
36 #include <linux/seq_file.h>
37 #include <linux/acpi.h>
38 #include <linux/dmi.h>
39 #include <linux/moduleparam.h>
40 #include <linux/sched.h>        /* need_resched() */
41 #include <linux/pm_qos_params.h>
42 #include <linux/clockchips.h>
43 #include <linux/cpuidle.h>
44 #include <linux/irqflags.h>
45
46 /*
47  * Include the apic definitions for x86 to have the APIC timer related defines
48  * available also for UP (on SMP it gets magically included via linux/smp.h).
49  * asm/acpi.h is not an option, as it would require more include magic. Also
50  * creating an empty asm-ia64/apic.h would just trade pest vs. cholera.
51  */
52 #ifdef CONFIG_X86
53 #include <asm/apic.h>
54 #endif
55
56 #include <asm/io.h>
57 #include <asm/uaccess.h>
58
59 #include <acpi/acpi_bus.h>
60 #include <acpi/processor.h>
61 #include <asm/processor.h>
62
63 #define ACPI_PROCESSOR_CLASS            "processor"
64 #define _COMPONENT              ACPI_PROCESSOR_COMPONENT
65 ACPI_MODULE_NAME("processor_idle");
66 #define ACPI_PROCESSOR_FILE_POWER       "power"
/*
 * Convert microseconds to PM timer ticks.  The argument is parenthesized so
 * that compound expressions such as US_TO_PM_TIMER_TICKS(a + b) convert the
 * whole expression rather than only its last operand.
 */
#define US_TO_PM_TIMER_TICKS(t)         (((t) * (PM_TIMER_FREQUENCY/1000)) / 1000)
68 #define PM_TIMER_TICK_NS                (1000000000ULL/PM_TIMER_FREQUENCY)
69 #ifndef CONFIG_CPU_IDLE
70 #define C2_OVERHEAD                     4       /* 1us (3.579 ticks per us) */
71 #define C3_OVERHEAD                     4       /* 1us (3.579 ticks per us) */
72 static void (*pm_idle_save) (void) __read_mostly;
73 #else
74 #define C2_OVERHEAD                     1       /* 1us */
75 #define C3_OVERHEAD                     1       /* 1us */
76 #endif
77 #define PM_TIMER_TICKS_TO_US(p)         (((p) * 1000)/(PM_TIMER_FREQUENCY/1000))
78
79 static unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER;
80 #ifdef CONFIG_CPU_IDLE
81 module_param(max_cstate, uint, 0000);
82 #else
83 module_param(max_cstate, uint, 0644);
84 #endif
85 static unsigned int nocst __read_mostly;
86 module_param(nocst, uint, 0000);
87
88 #ifndef CONFIG_CPU_IDLE
89 /*
90  * bm_history -- bit-mask with a bit per jiffy of bus-master activity
91  * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms
92  * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms
93  * 100 HZ: 0x0000000F: 4 jiffies = 40ms
94  * reduce history for more aggressive entry into C3
95  */
96 static unsigned int bm_history __read_mostly =
97     (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1));
98 module_param(bm_history, uint, 0644);
99
100 static int acpi_processor_set_power_policy(struct acpi_processor *pr);
101
102 #else   /* CONFIG_CPU_IDLE */
103 static unsigned int latency_factor __read_mostly = 2;
104 module_param(latency_factor, uint, 0644);
105 #endif
106
107 /*
108  * IBM ThinkPad R40e crashes mysteriously when going into C2 or C3.
109  * For now disable this. Probably a bug somewhere else.
110  *
111  * To skip this limit, boot/load with a large max_cstate limit.
112  */
113 static int set_max_cstate(const struct dmi_system_id *id)
114 {
115         if (max_cstate > ACPI_PROCESSOR_MAX_POWER)
116                 return 0;
117
118         printk(KERN_NOTICE PREFIX "%s detected - limiting to C%ld max_cstate."
119                " Override with \"processor.max_cstate=%d\"\n", id->ident,
120                (long)id->driver_data, ACPI_PROCESSOR_MAX_POWER + 1);
121
122         max_cstate = (long)id->driver_data;
123
124         return 0;
125 }
126
127 /* Actually this shouldn't be __cpuinitdata, would be better to fix the
128    callers to only run once -AK */
129 static struct dmi_system_id __cpuinitdata processor_power_dmi_table[] = {
130         { set_max_cstate, "IBM ThinkPad R40e", {
131           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
132           DMI_MATCH(DMI_BIOS_VERSION,"1SET70WW")}, (void *)1},
133         { set_max_cstate, "IBM ThinkPad R40e", {
134           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
135           DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW")}, (void *)1},
136         { set_max_cstate, "IBM ThinkPad R40e", {
137           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
138           DMI_MATCH(DMI_BIOS_VERSION,"1SET43WW") }, (void*)1},
139         { set_max_cstate, "IBM ThinkPad R40e", {
140           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
141           DMI_MATCH(DMI_BIOS_VERSION,"1SET45WW") }, (void*)1},
142         { set_max_cstate, "IBM ThinkPad R40e", {
143           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
144           DMI_MATCH(DMI_BIOS_VERSION,"1SET47WW") }, (void*)1},
145         { set_max_cstate, "IBM ThinkPad R40e", {
146           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
147           DMI_MATCH(DMI_BIOS_VERSION,"1SET50WW") }, (void*)1},
148         { set_max_cstate, "IBM ThinkPad R40e", {
149           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
150           DMI_MATCH(DMI_BIOS_VERSION,"1SET52WW") }, (void*)1},
151         { set_max_cstate, "IBM ThinkPad R40e", {
152           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
153           DMI_MATCH(DMI_BIOS_VERSION,"1SET55WW") }, (void*)1},
154         { set_max_cstate, "IBM ThinkPad R40e", {
155           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
156           DMI_MATCH(DMI_BIOS_VERSION,"1SET56WW") }, (void*)1},
157         { set_max_cstate, "IBM ThinkPad R40e", {
158           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
159           DMI_MATCH(DMI_BIOS_VERSION,"1SET59WW") }, (void*)1},
160         { set_max_cstate, "IBM ThinkPad R40e", {
161           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
162           DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW") }, (void*)1},
163         { set_max_cstate, "IBM ThinkPad R40e", {
164           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
165           DMI_MATCH(DMI_BIOS_VERSION,"1SET61WW") }, (void*)1},
166         { set_max_cstate, "IBM ThinkPad R40e", {
167           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
168           DMI_MATCH(DMI_BIOS_VERSION,"1SET62WW") }, (void*)1},
169         { set_max_cstate, "IBM ThinkPad R40e", {
170           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
171           DMI_MATCH(DMI_BIOS_VERSION,"1SET64WW") }, (void*)1},
172         { set_max_cstate, "IBM ThinkPad R40e", {
173           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
174           DMI_MATCH(DMI_BIOS_VERSION,"1SET65WW") }, (void*)1},
175         { set_max_cstate, "IBM ThinkPad R40e", {
176           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
177           DMI_MATCH(DMI_BIOS_VERSION,"1SET68WW") }, (void*)1},
178         { set_max_cstate, "Medion 41700", {
179           DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
180           DMI_MATCH(DMI_BIOS_VERSION,"R01-A1J")}, (void *)1},
181         { set_max_cstate, "Clevo 5600D", {
182           DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
183           DMI_MATCH(DMI_BIOS_VERSION,"SHE845M0.86C.0013.D.0302131307")},
184          (void *)2},
185         {},
186 };
187
188 static inline u32 ticks_elapsed(u32 t1, u32 t2)
189 {
190         if (t2 >= t1)
191                 return (t2 - t1);
192         else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
193                 return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
194         else
195                 return ((0xFFFFFFFF - t1) + t2);
196 }
197
198 static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2)
199 {
200         if (t2 >= t1)
201                 return PM_TIMER_TICKS_TO_US(t2 - t1);
202         else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
203                 return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
204         else
205                 return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2);
206 }
207
208 /*
209  * Callers should disable interrupts before the call and enable
210  * interrupts after return.
211  */
212 static void acpi_safe_halt(void)
213 {
214         current_thread_info()->status &= ~TS_POLLING;
215         /*
216          * TS_POLLING-cleared state must be visible before we
217          * test NEED_RESCHED:
218          */
219         smp_mb();
220         if (!need_resched()) {
221                 safe_halt();
222                 local_irq_disable();
223         }
224         current_thread_info()->status |= TS_POLLING;
225 }
226
227 #ifndef CONFIG_CPU_IDLE
228
229 static void
230 acpi_processor_power_activate(struct acpi_processor *pr,
231                               struct acpi_processor_cx *new)
232 {
233         struct acpi_processor_cx *old;
234
235         if (!pr || !new)
236                 return;
237
238         old = pr->power.state;
239
240         if (old)
241                 old->promotion.count = 0;
242         new->demotion.count = 0;
243
244         /* Cleanup from old state. */
245         if (old) {
246                 switch (old->type) {
247                 case ACPI_STATE_C3:
248                         /* Disable bus master reload */
249                         if (new->type != ACPI_STATE_C3 && pr->flags.bm_check)
250                                 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
251                         break;
252                 }
253         }
254
255         /* Prepare to use new state. */
256         switch (new->type) {
257         case ACPI_STATE_C3:
258                 /* Enable bus master reload */
259                 if (old->type != ACPI_STATE_C3 && pr->flags.bm_check)
260                         acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
261                 break;
262         }
263
264         pr->power.state = new;
265
266         return;
267 }
268
269 static atomic_t c3_cpu_count;
270
271 /* Common C-state entry for C2, C3, .. */
272 static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
273 {
274         /* Don't trace irqs off for idle */
275         stop_critical_timings();
276         if (cstate->entry_method == ACPI_CSTATE_FFH) {
277                 /* Call into architectural FFH based C-state */
278                 acpi_processor_ffh_cstate_enter(cstate);
279         } else {
280                 int unused;
281                 /* IO port based C-state */
282                 inb(cstate->address);
283                 /* Dummy wait op - must do something useless after P_LVL2 read
284                    because chipsets cannot guarantee that STPCLK# signal
285                    gets asserted in time to freeze execution properly. */
286                 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
287         }
288         start_critical_timings();
289 }
290 #endif /* !CONFIG_CPU_IDLE */
291
292 #ifdef ARCH_APICTIMER_STOPS_ON_C3
293
294 /*
295  * Some BIOS implementations switch to C3 in the published C2 state.
296  * This seems to be a common problem on AMD boxen, but other vendors
297  * are affected too. We pick the most conservative approach: we assume
298  * that the local APIC stops in both C2 and C3.
299  */
300 static void acpi_timer_check_state(int state, struct acpi_processor *pr,
301                                    struct acpi_processor_cx *cx)
302 {
303         struct acpi_processor_power *pwr = &pr->power;
304         u8 type = local_apic_timer_c2_ok ? ACPI_STATE_C3 : ACPI_STATE_C2;
305
306         /*
307          * Check, if one of the previous states already marked the lapic
308          * unstable
309          */
310         if (pwr->timer_broadcast_on_state < state)
311                 return;
312
313         if (cx->type >= type)
314                 pr->power.timer_broadcast_on_state = state;
315 }
316
317 static void acpi_propagate_timer_broadcast(struct acpi_processor *pr)
318 {
319         unsigned long reason;
320
321         reason = pr->power.timer_broadcast_on_state < INT_MAX ?
322                 CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF;
323
324         clockevents_notify(reason, &pr->id);
325 }
326
327 /* Power(C) State timer broadcast control */
328 static void acpi_state_timer_broadcast(struct acpi_processor *pr,
329                                        struct acpi_processor_cx *cx,
330                                        int broadcast)
331 {
332         int state = cx - pr->power.states;
333
334         if (state >= pr->power.timer_broadcast_on_state) {
335                 unsigned long reason;
336
337                 reason = broadcast ?  CLOCK_EVT_NOTIFY_BROADCAST_ENTER :
338                         CLOCK_EVT_NOTIFY_BROADCAST_EXIT;
339                 clockevents_notify(reason, &pr->id);
340         }
341 }
342
343 #else
344
/* No-op variants used when ARCH_APICTIMER_STOPS_ON_C3 is not defined. */
static void acpi_timer_check_state(int state, struct acpi_processor *pr,
				   struct acpi_processor_cx *cstate)
{
}

static void acpi_propagate_timer_broadcast(struct acpi_processor *pr)
{
}

static void acpi_state_timer_broadcast(struct acpi_processor *pr,
				       struct acpi_processor_cx *cx,
				       int broadcast)
{
}
353
354 #endif
355
356 /*
357  * Suspend / resume control
358  */
359 static int acpi_idle_suspend;
360
361 int acpi_processor_suspend(struct acpi_device * device, pm_message_t state)
362 {
363         acpi_idle_suspend = 1;
364         return 0;
365 }
366
367 int acpi_processor_resume(struct acpi_device * device)
368 {
369         acpi_idle_suspend = 0;
370         return 0;
371 }
372
373 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
374 static int tsc_halts_in_c(int state)
375 {
376         switch (boot_cpu_data.x86_vendor) {
377         case X86_VENDOR_AMD:
378         case X86_VENDOR_INTEL:
379                 /*
380                  * AMD Fam10h TSC will tick in all
381                  * C/P/S0/S1 states when this bit is set.
382                  */
383                 if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
384                         return 0;
385
386                 /*FALL THROUGH*/
387         default:
388                 return state > ACPI_STATE_C1;
389         }
390 }
391 #endif
392
393 #ifndef CONFIG_CPU_IDLE
394 static void acpi_processor_idle(void)
395 {
396         struct acpi_processor *pr = NULL;
397         struct acpi_processor_cx *cx = NULL;
398         struct acpi_processor_cx *next_state = NULL;
399         int sleep_ticks = 0;
400         u32 t1, t2 = 0;
401
402         /*
403          * Interrupts must be disabled during bus mastering calculations and
404          * for C2/C3 transitions.
405          */
406         local_irq_disable();
407
408         pr = __get_cpu_var(processors);
409         if (!pr) {
410                 local_irq_enable();
411                 return;
412         }
413
414         /*
415          * Check whether we truly need to go idle, or should
416          * reschedule:
417          */
418         if (unlikely(need_resched())) {
419                 local_irq_enable();
420                 return;
421         }
422
423         cx = pr->power.state;
424         if (!cx || acpi_idle_suspend) {
425                 if (pm_idle_save) {
426                         pm_idle_save(); /* enables IRQs */
427                 } else {
428                         acpi_safe_halt();
429                         local_irq_enable();
430                 }
431
432                 return;
433         }
434
435         /*
436          * Check BM Activity
437          * -----------------
438          * Check for bus mastering activity (if required), record, and check
439          * for demotion.
440          */
441         if (pr->flags.bm_check) {
442                 u32 bm_status = 0;
443                 unsigned long diff = jiffies - pr->power.bm_check_timestamp;
444
445                 if (diff > 31)
446                         diff = 31;
447
448                 pr->power.bm_activity <<= diff;
449
450                 acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
451                 if (bm_status) {
452                         pr->power.bm_activity |= 0x1;
453                         acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
454                 }
455                 /*
456                  * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
457                  * the true state of bus mastering activity; forcing us to
458                  * manually check the BMIDEA bit of each IDE channel.
459                  */
460                 else if (errata.piix4.bmisx) {
461                         if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01)
462                             || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01))
463                                 pr->power.bm_activity |= 0x1;
464                 }
465
466                 pr->power.bm_check_timestamp = jiffies;
467
468                 /*
469                  * If bus mastering is or was active this jiffy, demote
470                  * to avoid a faulty transition.  Note that the processor
471                  * won't enter a low-power state during this call (to this
472                  * function) but should upon the next.
473                  *
474                  * TBD: A better policy might be to fallback to the demotion
475                  *      state (use it for this quantum only) istead of
476                  *      demoting -- and rely on duration as our sole demotion
477                  *      qualification.  This may, however, introduce DMA
478                  *      issues (e.g. floppy DMA transfer overrun/underrun).
479                  */
480                 if ((pr->power.bm_activity & 0x1) &&
481                     cx->demotion.threshold.bm) {
482                         local_irq_enable();
483                         next_state = cx->demotion.state;
484                         goto end;
485                 }
486         }
487
488 #ifdef CONFIG_HOTPLUG_CPU
489         /*
490          * Check for P_LVL2_UP flag before entering C2 and above on
491          * an SMP system. We do it here instead of doing it at _CST/P_LVL
492          * detection phase, to work cleanly with logical CPU hotplug.
493          */
494         if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) &&
495             !pr->flags.has_cst && !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
496                 cx = &pr->power.states[ACPI_STATE_C1];
497 #endif
498
499         /*
500          * Sleep:
501          * ------
502          * Invoke the current Cx state to put the processor to sleep.
503          */
504         if (cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3) {
505                 current_thread_info()->status &= ~TS_POLLING;
506                 /*
507                  * TS_POLLING-cleared state must be visible before we
508                  * test NEED_RESCHED:
509                  */
510                 smp_mb();
511                 if (need_resched()) {
512                         current_thread_info()->status |= TS_POLLING;
513                         local_irq_enable();
514                         return;
515                 }
516         }
517
518         switch (cx->type) {
519
520         case ACPI_STATE_C1:
521                 /*
522                  * Invoke C1.
523                  * Use the appropriate idle routine, the one that would
524                  * be used without acpi C-states.
525                  */
526                 if (pm_idle_save) {
527                         pm_idle_save(); /* enables IRQs */
528                 } else {
529                         acpi_safe_halt();
530                         local_irq_enable();
531                 }
532
533                 /*
534                  * TBD: Can't get time duration while in C1, as resumes
535                  *      go to an ISR rather than here.  Need to instrument
536                  *      base interrupt handler.
537                  *
538                  * Note: the TSC better not stop in C1, sched_clock() will
539                  *       skew otherwise.
540                  */
541                 sleep_ticks = 0xFFFFFFFF;
542
543                 break;
544
545         case ACPI_STATE_C2:
546                 /* Get start time (ticks) */
547                 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
548                 /* Tell the scheduler that we are going deep-idle: */
549                 sched_clock_idle_sleep_event();
550                 /* Invoke C2 */
551                 acpi_state_timer_broadcast(pr, cx, 1);
552                 acpi_cstate_enter(cx);
553                 /* Get end time (ticks) */
554                 t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
555
556 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
557                 /* TSC halts in C2, so notify users */
558                 if (tsc_halts_in_c(ACPI_STATE_C2))
559                         mark_tsc_unstable("possible TSC halt in C2");
560 #endif
561                 /* Compute time (ticks) that we were actually asleep */
562                 sleep_ticks = ticks_elapsed(t1, t2);
563
564                 /* Tell the scheduler how much we idled: */
565                 sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
566
567                 /* Re-enable interrupts */
568                 local_irq_enable();
569                 /* Do not account our idle-switching overhead: */
570                 sleep_ticks -= cx->latency_ticks + C2_OVERHEAD;
571
572                 current_thread_info()->status |= TS_POLLING;
573                 acpi_state_timer_broadcast(pr, cx, 0);
574                 break;
575
576         case ACPI_STATE_C3:
577                 acpi_unlazy_tlb(smp_processor_id());
578                 /*
579                  * Must be done before busmaster disable as we might
580                  * need to access HPET !
581                  */
582                 acpi_state_timer_broadcast(pr, cx, 1);
583                 /*
584                  * disable bus master
585                  * bm_check implies we need ARB_DIS
586                  * !bm_check implies we need cache flush
587                  * bm_control implies whether we can do ARB_DIS
588                  *
589                  * That leaves a case where bm_check is set and bm_control is
590                  * not set. In that case we cannot do much, we enter C3
591                  * without doing anything.
592                  */
593                 if (pr->flags.bm_check && pr->flags.bm_control) {
594                         if (atomic_inc_return(&c3_cpu_count) ==
595                             num_online_cpus()) {
596                                 /*
597                                  * All CPUs are trying to go to C3
598                                  * Disable bus master arbitration
599                                  */
600                                 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
601                         }
602                 } else if (!pr->flags.bm_check) {
603                         /* SMP with no shared cache... Invalidate cache  */
604                         ACPI_FLUSH_CPU_CACHE();
605                 }
606
607                 /* Get start time (ticks) */
608                 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
609                 /* Invoke C3 */
610                 /* Tell the scheduler that we are going deep-idle: */
611                 sched_clock_idle_sleep_event();
612                 acpi_cstate_enter(cx);
613                 /* Get end time (ticks) */
614                 t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
615                 if (pr->flags.bm_check && pr->flags.bm_control) {
616                         /* Enable bus master arbitration */
617                         atomic_dec(&c3_cpu_count);
618                         acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
619                 }
620
621 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
622                 /* TSC halts in C3, so notify users */
623                 if (tsc_halts_in_c(ACPI_STATE_C3))
624                         mark_tsc_unstable("TSC halts in C3");
625 #endif
626                 /* Compute time (ticks) that we were actually asleep */
627                 sleep_ticks = ticks_elapsed(t1, t2);
628                 /* Tell the scheduler how much we idled: */
629                 sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
630
631                 /* Re-enable interrupts */
632                 local_irq_enable();
633                 /* Do not account our idle-switching overhead: */
634                 sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
635
636                 current_thread_info()->status |= TS_POLLING;
637                 acpi_state_timer_broadcast(pr, cx, 0);
638                 break;
639
640         default:
641                 local_irq_enable();
642                 return;
643         }
644         cx->usage++;
645         if ((cx->type != ACPI_STATE_C1) && (sleep_ticks > 0))
646                 cx->time += sleep_ticks;
647
648         next_state = pr->power.state;
649
650 #ifdef CONFIG_HOTPLUG_CPU
651         /* Don't do promotion/demotion */
652         if ((cx->type == ACPI_STATE_C1) && (num_online_cpus() > 1) &&
653             !pr->flags.has_cst && !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED)) {
654                 next_state = cx;
655                 goto end;
656         }
657 #endif
658
659         /*
660          * Promotion?
661          * ----------
662          * Track the number of longs (time asleep is greater than threshold)
663          * and promote when the count threshold is reached.  Note that bus
664          * mastering activity may prevent promotions.
665          * Do not promote above max_cstate.
666          */
667         if (cx->promotion.state &&
668             ((cx->promotion.state - pr->power.states) <= max_cstate)) {
669                 if (sleep_ticks > cx->promotion.threshold.ticks &&
670                   cx->promotion.state->latency <=
671                                 pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)) {
672                         cx->promotion.count++;
673                         cx->demotion.count = 0;
674                         if (cx->promotion.count >=
675                             cx->promotion.threshold.count) {
676                                 if (pr->flags.bm_check) {
677                                         if (!
678                                             (pr->power.bm_activity & cx->
679                                              promotion.threshold.bm)) {
680                                                 next_state =
681                                                     cx->promotion.state;
682                                                 goto end;
683                                         }
684                                 } else {
685                                         next_state = cx->promotion.state;
686                                         goto end;
687                                 }
688                         }
689                 }
690         }
691
692         /*
693          * Demotion?
694          * ---------
695          * Track the number of shorts (time asleep is less than time threshold)
696          * and demote when the usage threshold is reached.
697          */
698         if (cx->demotion.state) {
699                 if (sleep_ticks < cx->demotion.threshold.ticks) {
700                         cx->demotion.count++;
701                         cx->promotion.count = 0;
702                         if (cx->demotion.count >= cx->demotion.threshold.count) {
703                                 next_state = cx->demotion.state;
704                                 goto end;
705                         }
706                 }
707         }
708
709       end:
710         /*
711          * Demote if current state exceeds max_cstate
712          * or if the latency of the current state is unacceptable
713          */
714         if ((pr->power.state - pr->power.states) > max_cstate ||
715                 pr->power.state->latency >
716                                 pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)) {
717                 if (cx->demotion.state)
718                         next_state = cx->demotion.state;
719         }
720
721         /*
722          * New Cx State?
723          * -------------
724          * If we're going to start using a new Cx state we must clean up
725          * from the previous and prepare to use the new.
726          */
727         if (next_state != pr->power.state)
728                 acpi_processor_power_activate(pr, next_state);
729 }
730
731 static int acpi_processor_set_power_policy(struct acpi_processor *pr)
732 {
733         unsigned int i;
734         unsigned int state_is_set = 0;
735         struct acpi_processor_cx *lower = NULL;
736         struct acpi_processor_cx *higher = NULL;
737         struct acpi_processor_cx *cx;
738
739
740         if (!pr)
741                 return -EINVAL;
742
743         /*
744          * This function sets the default Cx state policy (OS idle handler).
745          * Our scheme is to promote quickly to C2 but more conservatively
746          * to C3.  We're favoring C2  for its characteristics of low latency
747          * (quick response), good power savings, and ability to allow bus
748          * mastering activity.  Note that the Cx state policy is completely
749          * customizable and can be altered dynamically.
750          */
751
752         /* startup state */
753         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
754                 cx = &pr->power.states[i];
755                 if (!cx->valid)
756                         continue;
757
758                 if (!state_is_set)
759                         pr->power.state = cx;
760                 state_is_set++;
761                 break;
762         }
763
764         if (!state_is_set)
765                 return -ENODEV;
766
767         /* demotion */
768         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
769                 cx = &pr->power.states[i];
770                 if (!cx->valid)
771                         continue;
772
773                 if (lower) {
774                         cx->demotion.state = lower;
775                         cx->demotion.threshold.ticks = cx->latency_ticks;
776                         cx->demotion.threshold.count = 1;
777                         if (cx->type == ACPI_STATE_C3)
778                                 cx->demotion.threshold.bm = bm_history;
779                 }
780
781                 lower = cx;
782         }
783
784         /* promotion */
785         for (i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i--) {
786                 cx = &pr->power.states[i];
787                 if (!cx->valid)
788                         continue;
789
790                 if (higher) {
791                         cx->promotion.state = higher;
792                         cx->promotion.threshold.ticks = cx->latency_ticks;
793                         if (cx->type >= ACPI_STATE_C2)
794                                 cx->promotion.threshold.count = 4;
795                         else
796                                 cx->promotion.threshold.count = 10;
797                         if (higher->type == ACPI_STATE_C3)
798                                 cx->promotion.threshold.bm = bm_history;
799                 }
800
801                 higher = cx;
802         }
803
804         return 0;
805 }
806 #endif /* !CONFIG_CPU_IDLE */
807
808 static int acpi_processor_get_power_info_fadt(struct acpi_processor *pr)
809 {
810
811         if (!pr)
812                 return -EINVAL;
813
814         if (!pr->pblk)
815                 return -ENODEV;
816
817         /* if info is obtained from pblk/fadt, type equals state */
818         pr->power.states[ACPI_STATE_C2].type = ACPI_STATE_C2;
819         pr->power.states[ACPI_STATE_C3].type = ACPI_STATE_C3;
820
821 #ifndef CONFIG_HOTPLUG_CPU
822         /*
823          * Check for P_LVL2_UP flag before entering C2 and above on
824          * an SMP system.
825          */
826         if ((num_online_cpus() > 1) &&
827             !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
828                 return -ENODEV;
829 #endif
830
831         /* determine C2 and C3 address from pblk */
832         pr->power.states[ACPI_STATE_C2].address = pr->pblk + 4;
833         pr->power.states[ACPI_STATE_C3].address = pr->pblk + 5;
834
835         /* determine latencies from FADT */
836         pr->power.states[ACPI_STATE_C2].latency = acpi_gbl_FADT.C2latency;
837         pr->power.states[ACPI_STATE_C3].latency = acpi_gbl_FADT.C3latency;
838
839         ACPI_DEBUG_PRINT((ACPI_DB_INFO,
840                           "lvl2[0x%08x] lvl3[0x%08x]\n",
841                           pr->power.states[ACPI_STATE_C2].address,
842                           pr->power.states[ACPI_STATE_C3].address));
843
844         return 0;
845 }
846
847 static int acpi_processor_get_power_info_default(struct acpi_processor *pr)
848 {
849         if (!pr->power.states[ACPI_STATE_C1].valid) {
850                 /* set the first C-State to C1 */
851                 /* all processors need to support C1 */
852                 pr->power.states[ACPI_STATE_C1].type = ACPI_STATE_C1;
853                 pr->power.states[ACPI_STATE_C1].valid = 1;
854                 pr->power.states[ACPI_STATE_C1].entry_method = ACPI_CSTATE_HALT;
855         }
856         /* the C0 state only exists as a filler in our array */
857         pr->power.states[ACPI_STATE_C0].valid = 1;
858         return 0;
859 }
860
861 static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
862 {
863         acpi_status status = 0;
864         acpi_integer count;
865         int current_count;
866         int i;
867         struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
868         union acpi_object *cst;
869
870
871         if (nocst)
872                 return -ENODEV;
873
874         current_count = 0;
875
876         status = acpi_evaluate_object(pr->handle, "_CST", NULL, &buffer);
877         if (ACPI_FAILURE(status)) {
878                 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No _CST, giving up\n"));
879                 return -ENODEV;
880         }
881
882         cst = buffer.pointer;
883
884         /* There must be at least 2 elements */
885         if (!cst || (cst->type != ACPI_TYPE_PACKAGE) || cst->package.count < 2) {
886                 printk(KERN_ERR PREFIX "not enough elements in _CST\n");
887                 status = -EFAULT;
888                 goto end;
889         }
890
891         count = cst->package.elements[0].integer.value;
892
893         /* Validate number of power states. */
894         if (count < 1 || count != cst->package.count - 1) {
895                 printk(KERN_ERR PREFIX "count given by _CST is not valid\n");
896                 status = -EFAULT;
897                 goto end;
898         }
899
900         /* Tell driver that at least _CST is supported. */
901         pr->flags.has_cst = 1;
902
903         for (i = 1; i <= count; i++) {
904                 union acpi_object *element;
905                 union acpi_object *obj;
906                 struct acpi_power_register *reg;
907                 struct acpi_processor_cx cx;
908
909                 memset(&cx, 0, sizeof(cx));
910
911                 element = &(cst->package.elements[i]);
912                 if (element->type != ACPI_TYPE_PACKAGE)
913                         continue;
914
915                 if (element->package.count != 4)
916                         continue;
917
918                 obj = &(element->package.elements[0]);
919
920                 if (obj->type != ACPI_TYPE_BUFFER)
921                         continue;
922
923                 reg = (struct acpi_power_register *)obj->buffer.pointer;
924
925                 if (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO &&
926                     (reg->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE))
927                         continue;
928
929                 /* There should be an easy way to extract an integer... */
930                 obj = &(element->package.elements[1]);
931                 if (obj->type != ACPI_TYPE_INTEGER)
932                         continue;
933
934                 cx.type = obj->integer.value;
935                 /*
936                  * Some buggy BIOSes won't list C1 in _CST -
937                  * Let acpi_processor_get_power_info_default() handle them later
938                  */
939                 if (i == 1 && cx.type != ACPI_STATE_C1)
940                         current_count++;
941
942                 cx.address = reg->address;
943                 cx.index = current_count + 1;
944
945                 cx.entry_method = ACPI_CSTATE_SYSTEMIO;
946                 if (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
947                         if (acpi_processor_ffh_cstate_probe
948                                         (pr->id, &cx, reg) == 0) {
949                                 cx.entry_method = ACPI_CSTATE_FFH;
950                         } else if (cx.type == ACPI_STATE_C1) {
951                                 /*
952                                  * C1 is a special case where FIXED_HARDWARE
953                                  * can be handled in non-MWAIT way as well.
954                                  * In that case, save this _CST entry info.
955                                  * Otherwise, ignore this info and continue.
956                                  */
957                                 cx.entry_method = ACPI_CSTATE_HALT;
958                                 snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT");
959                         } else {
960                                 continue;
961                         }
962                         if (cx.type == ACPI_STATE_C1 &&
963                                         (idle_halt || idle_nomwait)) {
964                                 /*
965                                  * In most cases the C1 space_id obtained from
966                                  * _CST object is FIXED_HARDWARE access mode.
967                                  * But when the option of idle=halt is added,
968                                  * the entry_method type should be changed from
969                                  * CSTATE_FFH to CSTATE_HALT.
970                                  * When the option of idle=nomwait is added,
971                                  * the C1 entry_method type should be
972                                  * CSTATE_HALT.
973                                  */
974                                 cx.entry_method = ACPI_CSTATE_HALT;
975                                 snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT");
976                         }
977                 } else {
978                         snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI IOPORT 0x%x",
979                                  cx.address);
980                 }
981
982                 if (cx.type == ACPI_STATE_C1) {
983                         cx.valid = 1;
984                 }
985
986                 obj = &(element->package.elements[2]);
987                 if (obj->type != ACPI_TYPE_INTEGER)
988                         continue;
989
990                 cx.latency = obj->integer.value;
991
992                 obj = &(element->package.elements[3]);
993                 if (obj->type != ACPI_TYPE_INTEGER)
994                         continue;
995
996                 cx.power = obj->integer.value;
997
998                 current_count++;
999                 memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx));
1000
1001                 /*
1002                  * We support total ACPI_PROCESSOR_MAX_POWER - 1
1003                  * (From 1 through ACPI_PROCESSOR_MAX_POWER - 1)
1004                  */
1005                 if (current_count >= (ACPI_PROCESSOR_MAX_POWER - 1)) {
1006                         printk(KERN_WARNING
1007                                "Limiting number of power states to max (%d)\n",
1008                                ACPI_PROCESSOR_MAX_POWER);
1009                         printk(KERN_WARNING
1010                                "Please increase ACPI_PROCESSOR_MAX_POWER if needed.\n");
1011                         break;
1012                 }
1013         }
1014
1015         ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d power states\n",
1016                           current_count));
1017
1018         /* Validate number of power states discovered */
1019         if (current_count < 2)
1020                 status = -EFAULT;
1021
1022       end:
1023         kfree(buffer.pointer);
1024
1025         return status;
1026 }
1027
1028 static void acpi_processor_power_verify_c2(struct acpi_processor_cx *cx)
1029 {
1030
1031         if (!cx->address)
1032                 return;
1033
1034         /*
1035          * C2 latency must be less than or equal to 100
1036          * microseconds.
1037          */
1038         else if (cx->latency > ACPI_PROCESSOR_MAX_C2_LATENCY) {
1039                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1040                                   "latency too large [%d]\n", cx->latency));
1041                 return;
1042         }
1043
1044         /*
1045          * Otherwise we've met all of our C2 requirements.
1046          * Normalize the C2 latency to expidite policy
1047          */
1048         cx->valid = 1;
1049
1050 #ifndef CONFIG_CPU_IDLE
1051         cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
1052 #else
1053         cx->latency_ticks = cx->latency;
1054 #endif
1055
1056         return;
1057 }
1058
1059 static void acpi_processor_power_verify_c3(struct acpi_processor *pr,
1060                                            struct acpi_processor_cx *cx)
1061 {
1062         static int bm_check_flag;
1063
1064
1065         if (!cx->address)
1066                 return;
1067
1068         /*
1069          * C3 latency must be less than or equal to 1000
1070          * microseconds.
1071          */
1072         else if (cx->latency > ACPI_PROCESSOR_MAX_C3_LATENCY) {
1073                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1074                                   "latency too large [%d]\n", cx->latency));
1075                 return;
1076         }
1077
1078         /*
1079          * PIIX4 Erratum #18: We don't support C3 when Type-F (fast)
1080          * DMA transfers are used by any ISA device to avoid livelock.
1081          * Note that we could disable Type-F DMA (as recommended by
1082          * the erratum), but this is known to disrupt certain ISA
1083          * devices thus we take the conservative approach.
1084          */
1085         else if (errata.piix4.fdma) {
1086                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1087                                   "C3 not supported on PIIX4 with Type-F DMA\n"));
1088                 return;
1089         }
1090
1091         /* All the logic here assumes flags.bm_check is same across all CPUs */
1092         if (!bm_check_flag) {
1093                 /* Determine whether bm_check is needed based on CPU  */
1094                 acpi_processor_power_init_bm_check(&(pr->flags), pr->id);
1095                 bm_check_flag = pr->flags.bm_check;
1096         } else {
1097                 pr->flags.bm_check = bm_check_flag;
1098         }
1099
1100         if (pr->flags.bm_check) {
1101                 if (!pr->flags.bm_control) {
1102                         if (pr->flags.has_cst != 1) {
1103                                 /* bus mastering control is necessary */
1104                                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1105                                         "C3 support requires BM control\n"));
1106                                 return;
1107                         } else {
1108                                 /* Here we enter C3 without bus mastering */
1109                                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1110                                         "C3 support without BM control\n"));
1111                         }
1112                 }
1113         } else {
1114                 /*
1115                  * WBINVD should be set in fadt, for C3 state to be
1116                  * supported on when bm_check is not required.
1117                  */
1118                 if (!(acpi_gbl_FADT.flags & ACPI_FADT_WBINVD)) {
1119                         ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1120                                           "Cache invalidation should work properly"
1121                                           " for C3 to be enabled on SMP systems\n"));
1122                         return;
1123                 }
1124                 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
1125         }
1126
1127         /*
1128          * Otherwise we've met all of our C3 requirements.
1129          * Normalize the C3 latency to expidite policy.  Enable
1130          * checking of bus mastering status (bm_check) so we can
1131          * use this in our C3 policy
1132          */
1133         cx->valid = 1;
1134
1135 #ifndef CONFIG_CPU_IDLE
1136         cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
1137 #else
1138         cx->latency_ticks = cx->latency;
1139 #endif
1140
1141         return;
1142 }
1143
1144 static int acpi_processor_power_verify(struct acpi_processor *pr)
1145 {
1146         unsigned int i;
1147         unsigned int working = 0;
1148
1149         pr->power.timer_broadcast_on_state = INT_MAX;
1150
1151         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
1152                 struct acpi_processor_cx *cx = &pr->power.states[i];
1153
1154                 switch (cx->type) {
1155                 case ACPI_STATE_C1:
1156                         cx->valid = 1;
1157                         break;
1158
1159                 case ACPI_STATE_C2:
1160                         acpi_processor_power_verify_c2(cx);
1161                         if (cx->valid)
1162                                 acpi_timer_check_state(i, pr, cx);
1163                         break;
1164
1165                 case ACPI_STATE_C3:
1166                         acpi_processor_power_verify_c3(pr, cx);
1167                         if (cx->valid)
1168                                 acpi_timer_check_state(i, pr, cx);
1169                         break;
1170                 }
1171
1172                 if (cx->valid)
1173                         working++;
1174         }
1175
1176         acpi_propagate_timer_broadcast(pr);
1177
1178         return (working);
1179 }
1180
1181 static int acpi_processor_get_power_info(struct acpi_processor *pr)
1182 {
1183         unsigned int i;
1184         int result;
1185
1186
1187         /* NOTE: the idle thread may not be running while calling
1188          * this function */
1189
1190         /* Zero initialize all the C-states info. */
1191         memset(pr->power.states, 0, sizeof(pr->power.states));
1192
1193         result = acpi_processor_get_power_info_cst(pr);
1194         if (result == -ENODEV)
1195                 result = acpi_processor_get_power_info_fadt(pr);
1196
1197         if (result)
1198                 return result;
1199
1200         acpi_processor_get_power_info_default(pr);
1201
1202         pr->power.count = acpi_processor_power_verify(pr);
1203
1204 #ifndef CONFIG_CPU_IDLE
1205         /*
1206          * Set Default Policy
1207          * ------------------
1208          * Now that we know which states are supported, set the default
1209          * policy.  Note that this policy can be changed dynamically
1210          * (e.g. encourage deeper sleeps to conserve battery life when
1211          * not on AC).
1212          */
1213         result = acpi_processor_set_power_policy(pr);
1214         if (result)
1215                 return result;
1216 #endif
1217
1218         /*
1219          * if one state of type C2 or C3 is available, mark this
1220          * CPU as being "idle manageable"
1221          */
1222         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
1223                 if (pr->power.states[i].valid) {
1224                         pr->power.count = i;
1225                         if (pr->power.states[i].type >= ACPI_STATE_C2)
1226                                 pr->flags.power = 1;
1227                 }
1228         }
1229
1230         return 0;
1231 }
1232
1233 static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset)
1234 {
1235         struct acpi_processor *pr = seq->private;
1236         unsigned int i;
1237
1238
1239         if (!pr)
1240                 goto end;
1241
1242         seq_printf(seq, "active state:            C%zd\n"
1243                    "max_cstate:              C%d\n"
1244                    "bus master activity:     %08x\n"
1245                    "maximum allowed latency: %d usec\n",
1246                    pr->power.state ? pr->power.state - pr->power.states : 0,
1247                    max_cstate, (unsigned)pr->power.bm_activity,
1248                    pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY));
1249
1250         seq_puts(seq, "states:\n");
1251
1252         for (i = 1; i <= pr->power.count; i++) {
1253                 seq_printf(seq, "   %cC%d:                  ",
1254                            (&pr->power.states[i] ==
1255                             pr->power.state ? '*' : ' '), i);
1256
1257                 if (!pr->power.states[i].valid) {
1258                         seq_puts(seq, "<not supported>\n");
1259                         continue;
1260                 }
1261
1262                 switch (pr->power.states[i].type) {
1263                 case ACPI_STATE_C1:
1264                         seq_printf(seq, "type[C1] ");
1265                         break;
1266                 case ACPI_STATE_C2:
1267                         seq_printf(seq, "type[C2] ");
1268                         break;
1269                 case ACPI_STATE_C3:
1270                         seq_printf(seq, "type[C3] ");
1271                         break;
1272                 default:
1273                         seq_printf(seq, "type[--] ");
1274                         break;
1275                 }
1276
1277                 if (pr->power.states[i].promotion.state)
1278                         seq_printf(seq, "promotion[C%zd] ",
1279                                    (pr->power.states[i].promotion.state -
1280                                     pr->power.states));
1281                 else
1282                         seq_puts(seq, "promotion[--] ");
1283
1284                 if (pr->power.states[i].demotion.state)
1285                         seq_printf(seq, "demotion[C%zd] ",
1286                                    (pr->power.states[i].demotion.state -
1287                                     pr->power.states));
1288                 else
1289                         seq_puts(seq, "demotion[--] ");
1290
1291                 seq_printf(seq, "latency[%03d] usage[%08d] duration[%020llu]\n",
1292                            pr->power.states[i].latency,
1293                            pr->power.states[i].usage,
1294                            (unsigned long long)pr->power.states[i].time);
1295         }
1296
1297       end:
1298         return 0;
1299 }
1300
1301 static int acpi_processor_power_open_fs(struct inode *inode, struct file *file)
1302 {
1303         return single_open(file, acpi_processor_power_seq_show,
1304                            PDE(inode)->data);
1305 }
1306
1307 static const struct file_operations acpi_processor_power_fops = {
1308         .owner = THIS_MODULE,
1309         .open = acpi_processor_power_open_fs,
1310         .read = seq_read,
1311         .llseek = seq_lseek,
1312         .release = single_release,
1313 };
1314
1315 #ifndef CONFIG_CPU_IDLE
1316
1317 int acpi_processor_cst_has_changed(struct acpi_processor *pr)
1318 {
1319         int result = 0;
1320
1321         if (boot_option_idle_override)
1322                 return 0;
1323
1324         if (!pr)
1325                 return -EINVAL;
1326
1327         if (nocst) {
1328                 return -ENODEV;
1329         }
1330
1331         if (!pr->flags.power_setup_done)
1332                 return -ENODEV;
1333
1334         /*
1335          * Fall back to the default idle loop, when pm_idle_save had
1336          * been initialized.
1337          */
1338         if (pm_idle_save) {
1339                 pm_idle = pm_idle_save;
1340                 /* Relies on interrupts forcing exit from idle. */
1341                 synchronize_sched();
1342         }
1343
1344         pr->flags.power = 0;
1345         result = acpi_processor_get_power_info(pr);
1346         if ((pr->flags.power == 1) && (pr->flags.power_setup_done))
1347                 pm_idle = acpi_processor_idle;
1348
1349         return result;
1350 }
1351
1352 #ifdef CONFIG_SMP
/*
 * Intentionally empty cross-CPU callback: being called at all means the
 * target CPU has already been woken up, so there is nothing left to do.
 */
static void smp_callback(void *v)
{
}
1357
1358 /*
1359  * This function gets called when a part of the kernel has a new latency
1360  * requirement.  This means we need to get all processors out of their C-state,
1361  * and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that
1362  * wakes them all right up.
1363  */
1364 static int acpi_processor_latency_notify(struct notifier_block *b,
1365                 unsigned long l, void *v)
1366 {
1367         smp_call_function(smp_callback, NULL, 1);
1368         return NOTIFY_OK;
1369 }
1370
1371 static struct notifier_block acpi_processor_latency_notifier = {
1372         .notifier_call = acpi_processor_latency_notify,
1373 };
1374
1375 #endif
1376
1377 #else /* CONFIG_CPU_IDLE */
1378
1379 /**
1380  * acpi_idle_bm_check - checks if bus master activity was detected
1381  */
1382 static int acpi_idle_bm_check(void)
1383 {
1384         u32 bm_status = 0;
1385
1386         acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
1387         if (bm_status)
1388                 acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
1389         /*
1390          * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
1391          * the true state of bus mastering activity; forcing us to
1392          * manually check the BMIDEA bit of each IDE channel.
1393          */
1394         else if (errata.piix4.bmisx) {
1395                 if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01)
1396                     || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01))
1397                         bm_status = 1;
1398         }
1399         return bm_status;
1400 }
1401
1402 /**
1403  * acpi_idle_update_bm_rld - updates the BM_RLD bit depending on target state
1404  * @pr: the processor
1405  * @target: the new target state
1406  */
1407 static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr,
1408                                            struct acpi_processor_cx *target)
1409 {
1410         if (pr->flags.bm_rld_set && target->type != ACPI_STATE_C3) {
1411                 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
1412                 pr->flags.bm_rld_set = 0;
1413         }
1414
1415         if (!pr->flags.bm_rld_set && target->type == ACPI_STATE_C3) {
1416                 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
1417                 pr->flags.bm_rld_set = 1;
1418         }
1419 }
1420
1421 /**
1422  * acpi_idle_do_entry - a helper function that does C2 and C3 type entry
1423  * @cx: cstate data
1424  *
1425  * Caller disables interrupt before call and enables interrupt after return.
1426  */
1427 static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
1428 {
1429         /* Don't trace irqs off for idle */
1430         stop_critical_timings();
1431         if (cx->entry_method == ACPI_CSTATE_FFH) {
1432                 /* Call into architectural FFH based C-state */
1433                 acpi_processor_ffh_cstate_enter(cx);
1434         } else if (cx->entry_method == ACPI_CSTATE_HALT) {
1435                 acpi_safe_halt();
1436         } else {
1437                 int unused;
1438                 /* IO port based C-state */
1439                 inb(cx->address);
1440                 /* Dummy wait op - must do something useless after P_LVL2 read
1441                    because chipsets cannot guarantee that STPCLK# signal
1442                    gets asserted in time to freeze execution properly. */
1443                 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
1444         }
1445         start_critical_timings();
1446 }
1447
1448 /**
1449  * acpi_idle_enter_c1 - enters an ACPI C1 state-type
1450  * @dev: the target CPU
1451  * @state: the state data
1452  *
1453  * This is equivalent to the HALT instruction.
1454  */
1455 static int acpi_idle_enter_c1(struct cpuidle_device *dev,
1456                               struct cpuidle_state *state)
1457 {
1458         u32 t1, t2;
1459         struct acpi_processor *pr;
1460         struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
1461
1462         pr = __get_cpu_var(processors);
1463
1464         if (unlikely(!pr))
1465                 return 0;
1466
1467         local_irq_disable();
1468
1469         /* Do not access any ACPI IO ports in suspend path */
1470         if (acpi_idle_suspend) {
1471                 acpi_safe_halt();
1472                 local_irq_enable();
1473                 return 0;
1474         }
1475
1476         if (pr->flags.bm_check)
1477                 acpi_idle_update_bm_rld(pr, cx);
1478
1479         t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1480         acpi_idle_do_entry(cx);
1481         t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1482
1483         local_irq_enable();
1484         cx->usage++;
1485
1486         return ticks_elapsed_in_us(t1, t2);
1487 }
1488
1489 /**
1490  * acpi_idle_enter_simple - enters an ACPI state without BM handling
1491  * @dev: the target CPU
1492  * @state: the state data
1493  */
1494 static int acpi_idle_enter_simple(struct cpuidle_device *dev,
1495                                   struct cpuidle_state *state)
1496 {
1497         struct acpi_processor *pr;
1498         struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
1499         u32 t1, t2;
1500         int sleep_ticks = 0;
1501
1502         pr = __get_cpu_var(processors);
1503
1504         if (unlikely(!pr))
1505                 return 0;
1506
1507         if (acpi_idle_suspend)
1508                 return(acpi_idle_enter_c1(dev, state));
1509
1510         local_irq_disable();
1511         current_thread_info()->status &= ~TS_POLLING;
1512         /*
1513          * TS_POLLING-cleared state must be visible before we test
1514          * NEED_RESCHED:
1515          */
1516         smp_mb();
1517
1518         if (unlikely(need_resched())) {
1519                 current_thread_info()->status |= TS_POLLING;
1520                 local_irq_enable();
1521                 return 0;
1522         }
1523
1524         /*
1525          * Must be done before busmaster disable as we might need to
1526          * access HPET !
1527          */
1528         acpi_state_timer_broadcast(pr, cx, 1);
1529
1530         if (pr->flags.bm_check)
1531                 acpi_idle_update_bm_rld(pr, cx);
1532
1533         if (cx->type == ACPI_STATE_C3)
1534                 ACPI_FLUSH_CPU_CACHE();
1535
1536         t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1537         /* Tell the scheduler that we are going deep-idle: */
1538         sched_clock_idle_sleep_event();
1539         acpi_idle_do_entry(cx);
1540         t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1541
1542 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
1543         /* TSC could halt in idle, so notify users */
1544         if (tsc_halts_in_c(cx->type))
1545                 mark_tsc_unstable("TSC halts in idle");;
1546 #endif
1547         sleep_ticks = ticks_elapsed(t1, t2);
1548
1549         /* Tell the scheduler how much we idled: */
1550         sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
1551
1552         local_irq_enable();
1553         current_thread_info()->status |= TS_POLLING;
1554
1555         cx->usage++;
1556
1557         acpi_state_timer_broadcast(pr, cx, 0);
1558         cx->time += sleep_ticks;
1559         return ticks_elapsed_in_us(t1, t2);
1560 }
1561
1562 static int c3_cpu_count;
1563 static DEFINE_SPINLOCK(c3_lock);
1564
1565 /**
1566  * acpi_idle_enter_bm - enters C3 with proper BM handling
1567  * @dev: the target CPU
1568  * @state: the state data
1569  *
1570  * If BM is detected, the deepest non-C3 idle state is entered instead.
1571  */
1572 static int acpi_idle_enter_bm(struct cpuidle_device *dev,
1573                               struct cpuidle_state *state)
1574 {
1575         struct acpi_processor *pr;
1576         struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
1577         u32 t1, t2;
1578         int sleep_ticks = 0;
1579
1580         pr = __get_cpu_var(processors);
1581
1582         if (unlikely(!pr))
1583                 return 0;
1584
1585         if (acpi_idle_suspend)
1586                 return(acpi_idle_enter_c1(dev, state));
1587
1588         if (acpi_idle_bm_check()) {
1589                 if (dev->safe_state) {
1590                         dev->last_state = dev->safe_state;
1591                         return dev->safe_state->enter(dev, dev->safe_state);
1592                 } else {
1593                         local_irq_disable();
1594                         acpi_safe_halt();
1595                         local_irq_enable();
1596                         return 0;
1597                 }
1598         }
1599
1600         local_irq_disable();
1601         current_thread_info()->status &= ~TS_POLLING;
1602         /*
1603          * TS_POLLING-cleared state must be visible before we test
1604          * NEED_RESCHED:
1605          */
1606         smp_mb();
1607
1608         if (unlikely(need_resched())) {
1609                 current_thread_info()->status |= TS_POLLING;
1610                 local_irq_enable();
1611                 return 0;
1612         }
1613
1614         acpi_unlazy_tlb(smp_processor_id());
1615
1616         /* Tell the scheduler that we are going deep-idle: */
1617         sched_clock_idle_sleep_event();
1618         /*
1619          * Must be done before busmaster disable as we might need to
1620          * access HPET !
1621          */
1622         acpi_state_timer_broadcast(pr, cx, 1);
1623
1624         acpi_idle_update_bm_rld(pr, cx);
1625
1626         /*
1627          * disable bus master
1628          * bm_check implies we need ARB_DIS
1629          * !bm_check implies we need cache flush
1630          * bm_control implies whether we can do ARB_DIS
1631          *
1632          * That leaves a case where bm_check is set and bm_control is
1633          * not set. In that case we cannot do much, we enter C3
1634          * without doing anything.
1635          */
1636         if (pr->flags.bm_check && pr->flags.bm_control) {
1637                 spin_lock(&c3_lock);
1638                 c3_cpu_count++;
1639                 /* Disable bus master arbitration when all CPUs are in C3 */
1640                 if (c3_cpu_count == num_online_cpus())
1641                         acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
1642                 spin_unlock(&c3_lock);
1643         } else if (!pr->flags.bm_check) {
1644                 ACPI_FLUSH_CPU_CACHE();
1645         }
1646
1647         t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1648         acpi_idle_do_entry(cx);
1649         t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1650
1651         /* Re-enable bus master arbitration */
1652         if (pr->flags.bm_check && pr->flags.bm_control) {
1653                 spin_lock(&c3_lock);
1654                 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
1655                 c3_cpu_count--;
1656                 spin_unlock(&c3_lock);
1657         }
1658
1659 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
1660         /* TSC could halt in idle, so notify users */
1661         if (tsc_halts_in_c(ACPI_STATE_C3))
1662                 mark_tsc_unstable("TSC halts in idle");
1663 #endif
1664         sleep_ticks = ticks_elapsed(t1, t2);
1665         /* Tell the scheduler how much we idled: */
1666         sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
1667
1668         local_irq_enable();
1669         current_thread_info()->status |= TS_POLLING;
1670
1671         cx->usage++;
1672
1673         acpi_state_timer_broadcast(pr, cx, 0);
1674         cx->time += sleep_ticks;
1675         return ticks_elapsed_in_us(t1, t2);
1676 }
1677
1678 struct cpuidle_driver acpi_idle_driver = {
1679         .name =         "acpi_idle",
1680         .owner =        THIS_MODULE,
1681 };
1682
1683 /**
1684  * acpi_processor_setup_cpuidle - prepares and configures CPUIDLE
1685  * @pr: the ACPI processor
1686  */
1687 static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
1688 {
1689         int i, count = CPUIDLE_DRIVER_STATE_START;
1690         struct acpi_processor_cx *cx;
1691         struct cpuidle_state *state;
1692         struct cpuidle_device *dev = &pr->power.dev;
1693
1694         if (!pr->flags.power_setup_done)
1695                 return -EINVAL;
1696
1697         if (pr->flags.power == 0) {
1698                 return -EINVAL;
1699         }
1700
1701         dev->cpu = pr->id;
1702         for (i = 0; i < CPUIDLE_STATE_MAX; i++) {
1703                 dev->states[i].name[0] = '\0';
1704                 dev->states[i].desc[0] = '\0';
1705         }
1706
1707         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) {
1708                 cx = &pr->power.states[i];
1709                 state = &dev->states[count];
1710
1711                 if (!cx->valid)
1712                         continue;
1713
1714 #ifdef CONFIG_HOTPLUG_CPU
1715                 if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) &&
1716                     !pr->flags.has_cst &&
1717                     !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
1718                         continue;
1719 #endif
1720                 cpuidle_set_statedata(state, cx);
1721
1722                 snprintf(state->name, CPUIDLE_NAME_LEN, "C%d", i);
1723                 strncpy(state->desc, cx->desc, CPUIDLE_DESC_LEN);
1724                 state->exit_latency = cx->latency;
1725                 state->target_residency = cx->latency * latency_factor;
1726                 state->power_usage = cx->power;
1727
1728                 state->flags = 0;
1729                 switch (cx->type) {
1730                         case ACPI_STATE_C1:
1731                         state->flags |= CPUIDLE_FLAG_SHALLOW;
1732                         if (cx->entry_method == ACPI_CSTATE_FFH)
1733                                 state->flags |= CPUIDLE_FLAG_TIME_VALID;
1734
1735                         state->enter = acpi_idle_enter_c1;
1736                         dev->safe_state = state;
1737                         break;
1738
1739                         case ACPI_STATE_C2:
1740                         state->flags |= CPUIDLE_FLAG_BALANCED;
1741                         state->flags |= CPUIDLE_FLAG_TIME_VALID;
1742                         state->enter = acpi_idle_enter_simple;
1743                         dev->safe_state = state;
1744                         break;
1745
1746                         case ACPI_STATE_C3:
1747                         state->flags |= CPUIDLE_FLAG_DEEP;
1748                         state->flags |= CPUIDLE_FLAG_TIME_VALID;
1749                         state->flags |= CPUIDLE_FLAG_CHECK_BM;
1750                         state->enter = pr->flags.bm_check ?
1751                                         acpi_idle_enter_bm :
1752                                         acpi_idle_enter_simple;
1753                         break;
1754                 }
1755
1756                 count++;
1757                 if (count == CPUIDLE_STATE_MAX)
1758                         break;
1759         }
1760
1761         dev->state_count = count;
1762
1763         if (!count)
1764                 return -EINVAL;
1765
1766         return 0;
1767 }
1768
1769 int acpi_processor_cst_has_changed(struct acpi_processor *pr)
1770 {
1771         int ret = 0;
1772
1773         if (boot_option_idle_override)
1774                 return 0;
1775
1776         if (!pr)
1777                 return -EINVAL;
1778
1779         if (nocst) {
1780                 return -ENODEV;
1781         }
1782
1783         if (!pr->flags.power_setup_done)
1784                 return -ENODEV;
1785
1786         cpuidle_pause_and_lock();
1787         cpuidle_disable_device(&pr->power.dev);
1788         acpi_processor_get_power_info(pr);
1789         if (pr->flags.power) {
1790                 acpi_processor_setup_cpuidle(pr);
1791                 ret = cpuidle_enable_device(&pr->power.dev);
1792         }
1793         cpuidle_resume_and_unlock();
1794
1795         return ret;
1796 }
1797
1798 #endif /* CONFIG_CPU_IDLE */
1799
1800 int __cpuinit acpi_processor_power_init(struct acpi_processor *pr,
1801                               struct acpi_device *device)
1802 {
1803         acpi_status status = 0;
1804         static int first_run;
1805         struct proc_dir_entry *entry = NULL;
1806         unsigned int i;
1807
1808         if (boot_option_idle_override)
1809                 return 0;
1810
1811         if (!first_run) {
1812                 if (idle_halt) {
1813                         /*
1814                          * When the boot option of "idle=halt" is added, halt
1815                          * is used for CPU IDLE.
1816                          * In such case C2/C3 is meaningless. So the max_cstate
1817                          * is set to one.
1818                          */
1819                         max_cstate = 1;
1820                 }
1821                 dmi_check_system(processor_power_dmi_table);
1822                 max_cstate = acpi_processor_cstate_check(max_cstate);
1823                 if (max_cstate < ACPI_C_STATES_MAX)
1824                         printk(KERN_NOTICE
1825                                "ACPI: processor limited to max C-state %d\n",
1826                                max_cstate);
1827                 first_run++;
1828 #if !defined(CONFIG_CPU_IDLE) && defined(CONFIG_SMP)
1829                 pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY,
1830                                 &acpi_processor_latency_notifier);
1831 #endif
1832         }
1833
1834         if (!pr)
1835                 return -EINVAL;
1836
1837         if (acpi_gbl_FADT.cst_control && !nocst) {
1838                 status =
1839                     acpi_os_write_port(acpi_gbl_FADT.smi_command, acpi_gbl_FADT.cst_control, 8);
1840                 if (ACPI_FAILURE(status)) {
1841                         ACPI_EXCEPTION((AE_INFO, status,
1842                                         "Notifying BIOS of _CST ability failed"));
1843                 }
1844         }
1845
1846         acpi_processor_get_power_info(pr);
1847         pr->flags.power_setup_done = 1;
1848
1849         /*
1850          * Install the idle handler if processor power management is supported.
1851          * Note that we use previously set idle handler will be used on
1852          * platforms that only support C1.
1853          */
1854         if (pr->flags.power) {
1855 #ifdef CONFIG_CPU_IDLE
1856                 acpi_processor_setup_cpuidle(pr);
1857                 if (cpuidle_register_device(&pr->power.dev))
1858                         return -EIO;
1859 #endif
1860
1861                 printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id);
1862                 for (i = 1; i <= pr->power.count; i++)
1863                         if (pr->power.states[i].valid)
1864                                 printk(" C%d[C%d]", i,
1865                                        pr->power.states[i].type);
1866                 printk(")\n");
1867
1868 #ifndef CONFIG_CPU_IDLE
1869                 if (pr->id == 0) {
1870                         pm_idle_save = pm_idle;
1871                         pm_idle = acpi_processor_idle;
1872                 }
1873 #endif
1874         }
1875
1876         /* 'power' [R] */
1877         entry = proc_create_data(ACPI_PROCESSOR_FILE_POWER,
1878                                  S_IRUGO, acpi_device_dir(device),
1879                                  &acpi_processor_power_fops,
1880                                  acpi_driver_data(device));
1881         if (!entry)
1882                 return -EIO;
1883         return 0;
1884 }
1885
1886 int acpi_processor_power_exit(struct acpi_processor *pr,
1887                               struct acpi_device *device)
1888 {
1889         if (boot_option_idle_override)
1890                 return 0;
1891
1892 #ifdef CONFIG_CPU_IDLE
1893         cpuidle_unregister_device(&pr->power.dev);
1894 #endif
1895         pr->flags.power_setup_done = 0;
1896
1897         if (acpi_device_dir(device))
1898                 remove_proc_entry(ACPI_PROCESSOR_FILE_POWER,
1899                                   acpi_device_dir(device));
1900
1901 #ifndef CONFIG_CPU_IDLE
1902
1903         /* Unregister the idle handler when processor #0 is removed. */
1904         if (pr->id == 0) {
1905                 if (pm_idle_save)
1906                         pm_idle = pm_idle_save;
1907
1908                 /*
1909                  * We are about to unload the current idle thread pm callback
1910                  * (pm_idle), Wait for all processors to update cached/local
1911                  * copies of pm_idle before proceeding.
1912                  */
1913                 cpu_idle_wait();
1914 #ifdef CONFIG_SMP
1915                 pm_qos_remove_notifier(PM_QOS_CPU_DMA_LATENCY,
1916                                 &acpi_processor_latency_notifier);
1917 #endif
1918         }
1919 #endif
1920
1921         return 0;
1922 }