/*
 * arch/ia64/kernel/domain.c
 * Architecture specific sched-domains builder.
 *
 * Copyright (C) 2004 Jesse Barnes
 * Copyright (C) 2004 Silicon Graphics, Inc.
 */

#include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/nodemask.h>

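/*
 * Upper bound on the number of nodes covered by any one node-level
 * sched_domain span built below.
 */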
#define SD_NODES_PER_DOMAIN 16

#ifdef CONFIG_NUMA
/**
 * find_next_best_node - find the next node to include in a sched_domain
 * @node: node whose sched_domain we're building
 * @used_nodes: nodes already in the sched_domain
 *
 * Find the next node to include in a given scheduling domain.  Simply
 * finds the closest node not already in the @used_nodes map.
 *
 * Should use nodemask_t.
 */
static int find_next_best_node(int node, unsigned long *used_nodes)
{
        int i, n, val, min_val, best_node = 0;

        min_val = INT_MAX;

        for (i = 0; i < MAX_NUMNODES; i++) {
                /* Start at @node */
                n = (node + i) % MAX_NUMNODES;

                if (!nr_cpus_node(n))
                        continue;

                /* Skip already used nodes */
                if (test_bit(n, used_nodes))
                        continue;

                /* Simple min distance search */
                val = node_distance(node, n);

                if (val < min_val) {
                        min_val = val;
                        best_node = n;
                }
        }

        set_bit(best_node, used_nodes);
        return best_node;
}

/**
 * sched_domain_node_span - get a cpumask for a node's sched_domain
 * @node: node whose cpumask we're constructing
 *
 * Given a node, construct a good cpumask for its sched_domain to span.  It
 * should be one that prevents unnecessary balancing, but also spreads tasks
 * out optimally.
 */
static cpumask_t sched_domain_node_span(int node)
{
        int i;
        cpumask_t span, nodemask;
        DECLARE_BITMAP(used_nodes, MAX_NUMNODES);

        cpus_clear(span);
        bitmap_zero(used_nodes, MAX_NUMNODES);

        nodemask = node_to_cpumask(node);
        cpus_or(span, span, nodemask);
        set_bit(node, used_nodes);

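        /*
         * Greedily add the closest not-yet-used nodes until the span
         * covers SD_NODES_PER_DOMAIN nodes.
         */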
        for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
                int next_node = find_next_best_node(node, used_nodes);
                nodemask = node_to_cpumask(next_node);
                cpus_or(span, span, nodemask);
        }

        return span;
}
#endif

/*
 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
 * can switch it on easily if needed.
 */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
static int cpu_to_cpu_group(int cpu)
{
        return cpu;
}
#endif

static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
static int cpu_to_phys_group(int cpu)
{
#ifdef CONFIG_SCHED_SMT
        return first_cpu(cpu_sibling_map[cpu]);
#else
        return cpu;
#endif
}

#ifdef CONFIG_NUMA
/*
 * The init_sched_build_groups can't handle what we want to do with node
 * groups, so roll our own. Now each node has its own list of groups which
 * gets dynamically allocated.
 */
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group *sched_group_nodes[MAX_NUMNODES];

static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
static struct sched_group sched_group_allnodes[MAX_NUMNODES];

static int cpu_to_allnodes_group(int cpu)
{
        return cpu_to_node(cpu);
}
#endif

/*
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus
 */
void build_sched_domains(const cpumask_t *cpu_map)
{
        int i;

        /*
         * Set up domains for cpus specified by the cpu_map.
         */
        for_each_cpu_mask(i, *cpu_map) {
                int group;
                struct sched_domain *sd = NULL, *p;
                cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));

                cpus_and(nodemask, nodemask, *cpu_map);

#ifdef CONFIG_NUMA
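                /*
                 * If the machine has more online cpus than a single
                 * node-level domain could span, add a top-level "allnodes"
                 * domain covering the whole cpu_map above the node domain.
                 */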
                if (num_online_cpus()
                                > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
                        sd = &per_cpu(allnodes_domains, i);
                        *sd = SD_ALLNODES_INIT;
                        sd->span = *cpu_map;
                        group = cpu_to_allnodes_group(i);
                        sd->groups = &sched_group_allnodes[group];
                        p = sd;
                } else
                        p = NULL;

                sd = &per_cpu(node_domains, i);
                *sd = SD_NODE_INIT;
                sd->span = sched_domain_node_span(cpu_to_node(i));
                sd->parent = p;
                cpus_and(sd->span, sd->span, *cpu_map);
#endif

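                /* Physical domain: spans the cpus of this cpu's node */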
                p = sd;
                sd = &per_cpu(phys_domains, i);
                group = cpu_to_phys_group(i);
                *sd = SD_CPU_INIT;
                sd->span = nodemask;
                sd->parent = p;
                sd->groups = &sched_group_phys[group];

#ifdef CONFIG_SCHED_SMT
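                /* SMT domain: spans the hardware siblings of this cpu */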
                p = sd;
                sd = &per_cpu(cpu_domains, i);
                group = cpu_to_cpu_group(i);
                *sd = SD_SIBLING_INIT;
                sd->span = cpu_sibling_map[i];
                cpus_and(sd->span, sd->span, *cpu_map);
                sd->parent = p;
                sd->groups = &sched_group_cpus[group];
#endif
        }

#ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
        for_each_cpu_mask(i, *cpu_map) {
                cpumask_t this_sibling_map = cpu_sibling_map[i];
                cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
                if (i != first_cpu(this_sibling_map))
                        continue;

                init_sched_build_groups(sched_group_cpus, this_sibling_map,
                                                &cpu_to_cpu_group);
        }
#endif

        /* Set up physical groups */
        for (i = 0; i < MAX_NUMNODES; i++) {
                cpumask_t nodemask = node_to_cpumask(i);

                cpus_and(nodemask, nodemask, *cpu_map);
                if (cpus_empty(nodemask))
                        continue;

                init_sched_build_groups(sched_group_phys, nodemask,
                                                &cpu_to_phys_group);
        }

#ifdef CONFIG_NUMA
        init_sched_build_groups(sched_group_allnodes, *cpu_map,
                                &cpu_to_allnodes_group);

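        /*
         * Each node gets a circular list of sched_groups: the first group
         * covers the node's own cpus, the remaining groups cover the other
         * nodes in that node's domain span.
         */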
        for (i = 0; i < MAX_NUMNODES; i++) {
                /* Set up node groups */
                struct sched_group *sg, *prev;
                cpumask_t nodemask = node_to_cpumask(i);
                cpumask_t domainspan;
                cpumask_t covered = CPU_MASK_NONE;
                int j;

                cpus_and(nodemask, nodemask, *cpu_map);
                if (cpus_empty(nodemask))
                        continue;

                domainspan = sched_domain_node_span(i);
                cpus_and(domainspan, domainspan, *cpu_map);

                sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
                sched_group_nodes[i] = sg;
                for_each_cpu_mask(j, nodemask) {
                        struct sched_domain *sd;
                        sd = &per_cpu(node_domains, j);
                        sd->groups = sg;
                        if (!sg) {
                                /*
                                 * Allocation failed, so this domain has
                                 * no groups: turn off balancing for it.
                                 */
                                sd->flags = 0;
                        }
                }
                if (!sg) {
                        printk(KERN_WARNING
                        "Can not alloc domain group for node %d\n", i);
                        continue;
                }
                sg->cpu_power = 0;
                sg->cpumask = nodemask;
                cpus_or(covered, covered, nodemask);
                prev = sg;

                for (j = 0; j < MAX_NUMNODES; j++) {
                        cpumask_t tmp, notcovered;
                        int n = (i + j) % MAX_NUMNODES;

                        cpus_complement(notcovered, covered);
                        cpus_and(tmp, notcovered, *cpu_map);
                        cpus_and(tmp, tmp, domainspan);
                        if (cpus_empty(tmp))
                                break;

                        nodemask = node_to_cpumask(n);
                        cpus_and(tmp, tmp, nodemask);
                        if (cpus_empty(tmp))
                                continue;

                        sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
                        if (!sg) {
                                printk(KERN_WARNING
                                "Can not alloc domain group for node %d\n", j);
                                break;
                        }
                        sg->cpu_power = 0;
                        sg->cpumask = tmp;
                        cpus_or(covered, covered, tmp);
                        prev->next = sg;
                        prev = sg;
                }
                prev->next = sched_group_nodes[i];
        }
#endif

        /* Calculate CPU power for physical packages and nodes */
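        /*
         * A group's power is SCHED_LOAD_SCALE for its first cpu plus
         * roughly 10% of SCHED_LOAD_SCALE for each additional cpu in the
         * group's cpumask.
         */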
        for_each_cpu_mask(i, *cpu_map) {
                int power;
                struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
                power = SCHED_LOAD_SCALE;
                sd->groups->cpu_power = power;
#endif

                sd = &per_cpu(phys_domains, i);
                power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;
                sd->groups->cpu_power = power;

#ifdef CONFIG_NUMA
                sd = &per_cpu(allnodes_domains, i);
                if (sd->groups) {
                        power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;
                        sd->groups->cpu_power = power;
                }
#endif
        }

#ifdef CONFIG_NUMA
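        /*
         * Accumulate each node group's cpu_power from the physical groups
         * it contains, counting each physical package only once.
         */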
        for (i = 0; i < MAX_NUMNODES; i++) {
                struct sched_group *sg = sched_group_nodes[i];
                int j;

                if (sg == NULL)
                        continue;
next_sg:
                for_each_cpu_mask(j, sg->cpumask) {
                        struct sched_domain *sd;
                        int power;

                        sd = &per_cpu(phys_domains, j);
                        if (j != first_cpu(sd->groups->cpumask)) {
                                /*
                                 * Only add "power" once for each
                                 * physical package.
                                 */
                                continue;
                        }
                        power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
                                (cpus_weight(sd->groups->cpumask)-1) / 10;

                        sg->cpu_power += power;
                }
                sg = sg->next;
                if (sg != sched_group_nodes[i])
                        goto next_sg;
        }
#endif

        /* Attach the domains */
        for_each_cpu_mask(i, *cpu_map) {
                struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
#else
                sd = &per_cpu(phys_domains, i);
#endif
                cpu_attach_domain(sd, i);
        }
}

/*
 * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
 */
void arch_init_sched_domains(const cpumask_t *cpu_map)
{
        cpumask_t cpu_default_map;

        /*
         * Set up a mask of cpus without special case scheduling
         * requirements.  For now this just excludes isolated cpus, but it
         * could be used to exclude other special cases in the future.
         */
        cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);

        build_sched_domains(&cpu_default_map);
}

void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
#ifdef CONFIG_NUMA
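        /* Free the per-node group lists allocated in build_sched_domains() */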
        int i;

        for (i = 0; i < MAX_NUMNODES; i++) {
                cpumask_t nodemask = node_to_cpumask(i);
                struct sched_group *oldsg, *sg = sched_group_nodes[i];

                cpus_and(nodemask, nodemask, *cpu_map);
                if (cpus_empty(nodemask))
                        continue;

                if (sg == NULL)
                        continue;
                sg = sg->next;
next_sg:
                oldsg = sg;
                sg = sg->next;
                kfree(oldsg);
                if (oldsg != sched_group_nodes[i])
                        goto next_sg;
                sched_group_nodes[i] = NULL;
        }
#endif
}