/*
 *  drivers/cpufreq/cpufreq_ondemand.c
 *
 *  Copyright (C)  2001 Russell King
 *            (C)  2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
 *                      Jun Nakajima <jun.nakajima@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
#include <linux/cpu.h>
#include <linux/jiffies.h>
#include <linux/kernel_stat.h>
#include <linux/mutex.h>

/*
 * dbs is used in this file as a shorthand for demand-based switching.
 * It helps to keep variable names smaller and simpler.
 */

#define DEF_FREQUENCY_UP_THRESHOLD              (80)
#define MIN_FREQUENCY_UP_THRESHOLD              (11)
#define MAX_FREQUENCY_UP_THRESHOLD              (100)

/*
 * The polling frequency of this governor depends on the capability of
 * the processor. The default polling interval is 1000 times the transition
 * latency of the processor. The governor will work on any processor with
 * transition latency <= 10 ms, using an appropriate sampling rate.
 * For CPUs with transition latency > 10 ms (mostly drivers with CPUFREQ_ETERNAL)
 * this governor will not work.
 * All times here are in microseconds.
 */
static unsigned int def_sampling_rate;
#define MIN_SAMPLING_RATE_RATIO                 (2)
/* for correct statistics, we need at least 10 ticks between each measure */
#define MIN_STAT_SAMPLING_RATE                  \
                        (MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10))
#define MIN_SAMPLING_RATE                       \
                        (def_sampling_rate / MIN_SAMPLING_RATE_RATIO)
#define MAX_SAMPLING_RATE                       (500 * def_sampling_rate)
#define DEF_SAMPLING_RATE_LATENCY_MULTIPLIER    (1000)
#define TRANSITION_LATENCY_LIMIT                (10 * 1000)

static void do_dbs_timer(struct work_struct *work);

/* Sampling types */
enum dbs_sample {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};

struct cpu_dbs_info_s {
        cputime64_t prev_cpu_idle;
        cputime64_t prev_cpu_wall;
        struct cpufreq_policy *cur_policy;
        struct delayed_work work;
        enum dbs_sample sample_type;
        unsigned int enable;
        struct cpufreq_frequency_table *freq_table;
        unsigned int freq_lo;
        unsigned int freq_lo_jiffies;
        unsigned int freq_hi_jiffies;
};
static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);

static unsigned int dbs_enable; /* number of CPUs using this policy */

/*
 * DEADLOCK ALERT! There is an ordering requirement between the cpu_hotplug
 * lock and dbs_mutex. cpu_hotplug lock should always be held before
 * dbs_mutex. If any function that can potentially take cpu_hotplug lock
 * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
 * cpu_hotplug lock should be taken before that. Note that cpu_hotplug lock
 * is recursive for the same process. -Venki
 */
static DEFINE_MUTEX(dbs_mutex);

static struct workqueue_struct *kondemand_wq;

static struct dbs_tuners {
        unsigned int sampling_rate;
        unsigned int up_threshold;
        unsigned int ignore_nice;
        unsigned int powersave_bias;
} dbs_tuners_ins = {
        .up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
        .ignore_nice = 0,
        .powersave_bias = 0,
};

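/*
 * "Idle" time, as seen by this governor, is the sum of the idle and iowait
 * ticks from kernel_stat; when ignore_nice is set, time spent in nice tasks
 * is treated as idle as well.
 */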
static inline cputime64_t get_cpu_idle_time(unsigned int cpu)
{
        cputime64_t retval;

        retval = cputime64_add(kstat_cpu(cpu).cpustat.idle,
                        kstat_cpu(cpu).cpustat.iowait);

        if (dbs_tuners_ins.ignore_nice)
                retval = cputime64_add(retval, kstat_cpu(cpu).cpustat.nice);

        return retval;
}

/*
 * Find the right frequency to set now, with powersave_bias enabled.
 * Returns the freq_hi to be used right now and sets freq_hi_jiffies,
 * freq_lo and freq_lo_jiffies in the per-cpu area, so that time can be
 * split between the two frequencies and average out to the biased target.
 */
static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
                                          unsigned int freq_next,
                                          unsigned int relation)
{
        unsigned int freq_req, freq_reduc, freq_avg;
        unsigned int freq_hi, freq_lo;
        unsigned int index = 0;
        unsigned int jiffies_total, jiffies_hi, jiffies_lo;
        struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu);

        if (!dbs_info->freq_table) {
                dbs_info->freq_lo = 0;
                dbs_info->freq_lo_jiffies = 0;
                return freq_next;
        }

        cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_next,
                        relation, &index);
        freq_req = dbs_info->freq_table[index].frequency;
        /* powersave_bias is in units of 0.1%: reduce the request by bias/1000 */
        freq_reduc = freq_req * dbs_tuners_ins.powersave_bias / 1000;
        freq_avg = freq_req - freq_reduc;

        /* Find freq bounds for freq_avg in freq_table */
        index = 0;
        cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
                        CPUFREQ_RELATION_H, &index);
        freq_lo = dbs_info->freq_table[index].frequency;
        index = 0;
        cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
                        CPUFREQ_RELATION_L, &index);
        freq_hi = dbs_info->freq_table[index].frequency;

        /* Find out how long we have to be in hi and lo freqs */
        if (freq_hi == freq_lo) {
                dbs_info->freq_lo = 0;
                dbs_info->freq_lo_jiffies = 0;
                return freq_lo;
        }
        /* Split the sampling window in proportion, rounding to the nearest jiffy */
        jiffies_total = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
        jiffies_hi = (freq_avg - freq_lo) * jiffies_total;
        jiffies_hi += ((freq_hi - freq_lo) / 2);
        jiffies_hi /= (freq_hi - freq_lo);
        jiffies_lo = jiffies_total - jiffies_hi;
        dbs_info->freq_lo = freq_lo;
        dbs_info->freq_lo_jiffies = jiffies_lo;
        dbs_info->freq_hi_jiffies = jiffies_hi;
        return freq_hi;
}

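/*
 * Refresh each online CPU's frequency table pointer and clear any pending
 * low-frequency phase, so the next sample starts from a normal (unsplit)
 * state whenever powersave_bias is (re)configured.
 */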
static void ondemand_powersave_bias_init(void)
{
        int i;
        for_each_online_cpu(i) {
                struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, i);
                dbs_info->freq_table = cpufreq_frequency_get_table(i);
                dbs_info->freq_lo = 0;
        }
}

/************************** sysfs interface ************************/
static ssize_t show_sampling_rate_max(struct cpufreq_policy *policy, char *buf)
{
        return sprintf(buf, "%u\n", MAX_SAMPLING_RATE);
}

static ssize_t show_sampling_rate_min(struct cpufreq_policy *policy, char *buf)
{
        return sprintf(buf, "%u\n", MIN_SAMPLING_RATE);
}

#define define_one_ro(_name)            \
static struct freq_attr _name =         \
__ATTR(_name, 0444, show_##_name, NULL)

define_one_ro(sampling_rate_max);
define_one_ro(sampling_rate_min);

/* cpufreq_ondemand Governor Tunables */
#define show_one(file_name, object)                                     \
static ssize_t show_##file_name                                         \
(struct cpufreq_policy *unused, char *buf)                              \
{                                                                       \
        return sprintf(buf, "%u\n", dbs_tuners_ins.object);             \
}
show_one(sampling_rate, sampling_rate);
show_one(up_threshold, up_threshold);
show_one(ignore_nice_load, ignore_nice);
show_one(powersave_bias, powersave_bias);

static ssize_t store_sampling_rate(struct cpufreq_policy *unused,
                const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);

        mutex_lock(&dbs_mutex);
        if (ret != 1 || input > MAX_SAMPLING_RATE
                     || input < MIN_SAMPLING_RATE) {
                mutex_unlock(&dbs_mutex);
                return -EINVAL;
        }

        dbs_tuners_ins.sampling_rate = input;
        mutex_unlock(&dbs_mutex);

        return count;
}

static ssize_t store_up_threshold(struct cpufreq_policy *unused,
                const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);

        mutex_lock(&dbs_mutex);
        if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD ||
                        input < MIN_FREQUENCY_UP_THRESHOLD) {
                mutex_unlock(&dbs_mutex);
                return -EINVAL;
        }

        dbs_tuners_ins.up_threshold = input;
        mutex_unlock(&dbs_mutex);

        return count;
}

static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy,
                const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        unsigned int j;

        ret = sscanf(buf, "%u", &input);
        if (ret != 1)
                return -EINVAL;

        if (input > 1)
                input = 1;

        mutex_lock(&dbs_mutex);
        if (input == dbs_tuners_ins.ignore_nice) { /* nothing to do */
                mutex_unlock(&dbs_mutex);
                return count;
        }
        dbs_tuners_ins.ignore_nice = input;

        /* we need to re-evaluate prev_cpu_idle */
        for_each_online_cpu(j) {
                struct cpu_dbs_info_s *dbs_info;
                dbs_info = &per_cpu(cpu_dbs_info, j);
                dbs_info->prev_cpu_idle = get_cpu_idle_time(j);
                dbs_info->prev_cpu_wall = get_jiffies_64();
        }
        mutex_unlock(&dbs_mutex);

        return count;
}

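/*
 * powersave_bias ranges from 0 to 1000 (0.1% steps). A non-zero value biases
 * frequency targets below what plain ondemand would pick; the governor then
 * alternates between the table frequencies bracketing that biased target.
 */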
static ssize_t store_powersave_bias(struct cpufreq_policy *unused,
                const char *buf, size_t count)
{
        unsigned int input;
        int ret;
        ret = sscanf(buf, "%u", &input);

        if (ret != 1)
                return -EINVAL;

        if (input > 1000)
                input = 1000;

        mutex_lock(&dbs_mutex);
        dbs_tuners_ins.powersave_bias = input;
        ondemand_powersave_bias_init();
        mutex_unlock(&dbs_mutex);

        return count;
}

#define define_one_rw(_name) \
static struct freq_attr _name = \
__ATTR(_name, 0644, show_##_name, store_##_name)

define_one_rw(sampling_rate);
define_one_rw(up_threshold);
define_one_rw(ignore_nice_load);
define_one_rw(powersave_bias);

static struct attribute *dbs_attributes[] = {
        &sampling_rate_max.attr,
        &sampling_rate_min.attr,
        &sampling_rate.attr,
        &up_threshold.attr,
        &ignore_nice_load.attr,
        &powersave_bias.attr,
        NULL
};

static struct attribute_group dbs_attr_group = {
        .attrs = dbs_attributes,
        .name = "ondemand",
};

/************************** sysfs end ************************/

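/*
 * Core load evaluation: measure how busy the busiest CPU of this policy was
 * over the last sampling window. If the load exceeds up_threshold, jump to
 * the maximum frequency (or its powersave_bias target); otherwise scale down
 * to a frequency at which the measured load would sit about 10 points below
 * up_threshold.
 */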
static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
{
        unsigned int idle_ticks, total_ticks;
        unsigned int load;
        cputime64_t cur_jiffies;

        struct cpufreq_policy *policy;
        unsigned int j;

        if (!this_dbs_info->enable)
                return;

        this_dbs_info->freq_lo = 0;
        policy = this_dbs_info->cur_policy;
        cur_jiffies = jiffies64_to_cputime64(get_jiffies_64());
        total_ticks = (unsigned int) cputime64_sub(cur_jiffies,
                        this_dbs_info->prev_cpu_wall);
        this_dbs_info->prev_cpu_wall = cur_jiffies;
        if (!total_ticks)
                return;
        /*
         * Every sampling_rate, we check if the current idle time is less
         * than 20% (default); if so, we try to increase the frequency.
         * Every sampling_rate, we also look for the lowest
         * frequency which can sustain the load while keeping idle time over
         * 30%. If such a frequency exists, we try to decrease to this frequency.
         *
         * Any frequency increase takes it to the maximum frequency.
         * Frequency reduction happens at minimum steps of
         * 5% (default) of current frequency
         */

        /* Get Idle Time */
        idle_ticks = UINT_MAX;
        for_each_cpu_mask(j, policy->cpus) {
                cputime64_t total_idle_ticks;
                unsigned int tmp_idle_ticks;
                struct cpu_dbs_info_s *j_dbs_info;

                j_dbs_info = &per_cpu(cpu_dbs_info, j);
                total_idle_ticks = get_cpu_idle_time(j);
                tmp_idle_ticks = (unsigned int) cputime64_sub(total_idle_ticks,
                                j_dbs_info->prev_cpu_idle);
                j_dbs_info->prev_cpu_idle = total_idle_ticks;

                if (tmp_idle_ticks < idle_ticks)
                        idle_ticks = tmp_idle_ticks;
        }
        load = (100 * (total_ticks - idle_ticks)) / total_ticks;

        /* Check for frequency increase */
        if (load > dbs_tuners_ins.up_threshold) {
                /* if we are already at full speed then break out early */
                if (!dbs_tuners_ins.powersave_bias) {
                        if (policy->cur == policy->max)
                                return;

                        __cpufreq_driver_target(policy, policy->max,
                                CPUFREQ_RELATION_H);
                } else {
                        int freq = powersave_bias_target(policy, policy->max,
                                        CPUFREQ_RELATION_H);
                        __cpufreq_driver_target(policy, freq,
                                CPUFREQ_RELATION_L);
                }
                return;
        }

        /* Check for frequency decrease */
        /* if we cannot reduce the frequency anymore, break out early */
        if (policy->cur == policy->min)
                return;

        /*
         * The optimal frequency is the lowest frequency that can support
         * the current CPU usage without triggering the up policy. To be
         * safe, we aim 10 points under the threshold.
         */
        if (load < (dbs_tuners_ins.up_threshold - 10)) {
                unsigned int freq_next, freq_cur;

                freq_cur = cpufreq_driver_getavg(policy);
                if (!freq_cur)
                        freq_cur = policy->cur;

                freq_next = (freq_cur * load) /
                        (dbs_tuners_ins.up_threshold - 10);

                if (!dbs_tuners_ins.powersave_bias) {
                        __cpufreq_driver_target(policy, freq_next,
                                        CPUFREQ_RELATION_L);
                } else {
                        int freq = powersave_bias_target(policy, freq_next,
                                        CPUFREQ_RELATION_L);
                        __cpufreq_driver_target(policy, freq,
                                CPUFREQ_RELATION_L);
                }
        }
}

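/*
 * Periodic work: a NORMAL sample re-evaluates the load and, when
 * powersave_bias has split the window into a high and a low phase,
 * schedules a SUB sample after freq_hi_jiffies. The SUB sample simply
 * drops to freq_lo until the next normal sample.
 */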
static void do_dbs_timer(struct work_struct *work)
{
        unsigned int cpu = smp_processor_id();
        struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu);
        enum dbs_sample sample_type = dbs_info->sample_type;
        /* We want all CPUs to do sampling nearly on same jiffy */
        int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);

        /* Permit rescheduling of this work item */
        work_release(work);

        delay -= jiffies % delay;

        if (!dbs_info->enable)
                return;
        /* Common NORMAL_SAMPLE setup */
        dbs_info->sample_type = DBS_NORMAL_SAMPLE;
        if (!dbs_tuners_ins.powersave_bias ||
            sample_type == DBS_NORMAL_SAMPLE) {
                lock_cpu_hotplug();
                dbs_check_cpu(dbs_info);
                unlock_cpu_hotplug();
                if (dbs_info->freq_lo) {
                        /* Setup timer for SUB_SAMPLE */
                        dbs_info->sample_type = DBS_SUB_SAMPLE;
                        delay = dbs_info->freq_hi_jiffies;
                }
        } else {
                __cpufreq_driver_target(dbs_info->cur_policy,
                                        dbs_info->freq_lo,
                                        CPUFREQ_RELATION_H);
        }
        queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay);
}

static inline void dbs_timer_init(unsigned int cpu)
{
        struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu);
        /* We want all CPUs to do sampling nearly on same jiffy */
        int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
        delay -= jiffies % delay;

        ondemand_powersave_bias_init();
        INIT_DELAYED_WORK_NAR(&dbs_info->work, do_dbs_timer);
        dbs_info->sample_type = DBS_NORMAL_SAMPLE;
        queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay);
}

static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
{
        dbs_info->enable = 0;
        cancel_delayed_work(&dbs_info->work);
        flush_workqueue(kondemand_wq);
}

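/*
 * Governor callback: CPUFREQ_GOV_START sets up per-CPU state, the sysfs
 * group and, for the first user, the kondemand workqueue and the default
 * sampling rate; CPUFREQ_GOV_STOP tears them down again; CPUFREQ_GOV_LIMITS
 * clamps the current frequency into the new policy limits.
 */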
static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
                                   unsigned int event)
{
        unsigned int cpu = policy->cpu;
        struct cpu_dbs_info_s *this_dbs_info;
        unsigned int j;
        int rc;

        this_dbs_info = &per_cpu(cpu_dbs_info, cpu);

        switch (event) {
        case CPUFREQ_GOV_START:
                if ((!cpu_online(cpu)) || (!policy->cur))
                        return -EINVAL;

                if (policy->cpuinfo.transition_latency >
                                (TRANSITION_LATENCY_LIMIT * 1000)) {
                        printk(KERN_WARNING "ondemand governor failed to load "
                               "due to too long transition latency\n");
                        return -EINVAL;
                }
                if (this_dbs_info->enable) /* Already enabled */
                        break;

                mutex_lock(&dbs_mutex);
                dbs_enable++;
                if (dbs_enable == 1) {
                        kondemand_wq = create_workqueue("kondemand");
                        if (!kondemand_wq) {
                                printk(KERN_ERR
                                         "Creation of kondemand failed\n");
                                dbs_enable--;
                                mutex_unlock(&dbs_mutex);
                                return -ENOSPC;
                        }
                }

                rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
                if (rc) {
                        if (dbs_enable == 1)
                                destroy_workqueue(kondemand_wq);
                        dbs_enable--;
                        mutex_unlock(&dbs_mutex);
                        return rc;
                }

                for_each_cpu_mask(j, policy->cpus) {
                        struct cpu_dbs_info_s *j_dbs_info;
                        j_dbs_info = &per_cpu(cpu_dbs_info, j);
                        j_dbs_info->cur_policy = policy;

                        j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j);
                        j_dbs_info->prev_cpu_wall = get_jiffies_64();
                }
                this_dbs_info->enable = 1;
                /*
                 * Start the timer/schedule the work when this governor
                 * is used for the first time
                 */
                if (dbs_enable == 1) {
                        unsigned int latency;
                        /* policy latency is in ns. Convert it to us first */
                        latency = policy->cpuinfo.transition_latency / 1000;
                        if (latency == 0)
                                latency = 1;

                        def_sampling_rate = latency *
                                        DEF_SAMPLING_RATE_LATENCY_MULTIPLIER;

                        if (def_sampling_rate < MIN_STAT_SAMPLING_RATE)
                                def_sampling_rate = MIN_STAT_SAMPLING_RATE;

                        dbs_tuners_ins.sampling_rate = def_sampling_rate;
                }
                dbs_timer_init(policy->cpu);

                mutex_unlock(&dbs_mutex);
                break;

        case CPUFREQ_GOV_STOP:
                mutex_lock(&dbs_mutex);
                dbs_timer_exit(this_dbs_info);
                sysfs_remove_group(&policy->kobj, &dbs_attr_group);
                dbs_enable--;
                if (dbs_enable == 0)
                        destroy_workqueue(kondemand_wq);

                mutex_unlock(&dbs_mutex);

                break;

        case CPUFREQ_GOV_LIMITS:
                mutex_lock(&dbs_mutex);
                if (policy->max < this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(this_dbs_info->cur_policy,
                                                policy->max,
                                                CPUFREQ_RELATION_H);
                else if (policy->min > this_dbs_info->cur_policy->cur)
                        __cpufreq_driver_target(this_dbs_info->cur_policy,
                                                policy->min,
                                                CPUFREQ_RELATION_L);
                mutex_unlock(&dbs_mutex);
                break;
        }
        return 0;
}

static struct cpufreq_governor cpufreq_gov_dbs = {
        .name = "ondemand",
        .governor = cpufreq_governor_dbs,
        .owner = THIS_MODULE,
};

static int __init cpufreq_gov_dbs_init(void)
{
        return cpufreq_register_governor(&cpufreq_gov_dbs);
}

static void __exit cpufreq_gov_dbs_exit(void)
{
        cpufreq_unregister_governor(&cpufreq_gov_dbs);
}


MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>");
MODULE_AUTHOR("Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>");
MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for "
                   "Low Latency Frequency Transition capable processors");
MODULE_LICENSE("GPL");

module_init(cpufreq_gov_dbs_init);
module_exit(cpufreq_gov_dbs_exit);