/*
 * IPVS:        Locality-Based Least-Connection scheduling module
 *
 * Version:     $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $
 *
 * Authors:     Wensong Zhang <wensong@gnuchina.org>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:
 *     Martin Hamilton         :    fixed the terrible locking bugs
 *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
 *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
 *     Wensong Zhang           :    added a full expiration check to
 *                                   collect stale entries of 24+ hours when
 *                                   no partial expire check ran in a
 *                                   half hour
 *     Julian Anastasov        :    replaced the del_timer call with
 *                                   del_timer_sync to avoid a possible race
 *                                   between the timer handler and the
 *                                   del_timer thread on SMP
 *
 */

/*
 * The lblc algorithm is as follows (pseudo code):
 *
 *       if cachenode[dest_ip] is null then
 *               n, cachenode[dest_ip] <- {weighted least-conn node};
 *       else
 *               n <- cachenode[dest_ip];
 *               if (n is dead) OR
 *                  (n.conns>n.weight AND
 *                   there is a node m with m.conns<m.weight/2) then
 *                 n, cachenode[dest_ip] <- {weighted least-conn node};
 *
 *       return n;
 *
 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
 * me to write this module.
 */
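
/*
 * A worked example of the pseudo code above, with made-up numbers:
 * suppose cachenode[203.0.113.7] points at server n with n.conns=12
 * and n.weight=10, while some other server m has m.conns=2 and
 * m.weight=8.  Then n.conns > n.weight and m.conns < m.weight/2, so
 * the cached assignment is dropped and a fresh weighted least-conn
 * pick is made; otherwise the cached server n is reused directly.
 * In this implementation, cachenode[] is the per-service hash table
 * ip_vs_lblc_table defined below.
 */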

#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>

/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>

#include <net/ip_vs.h>


/*
 *    Parameters for garbage collection of stale IPVS lblc entries,
 *    used when the table is full.
 */
#define CHECK_EXPIRE_INTERVAL   (60*HZ)
#define ENTRY_TIMEOUT           (6*60*HZ)

/*
 *    Parameters for the full expiration check.
 *    When there has been no partial expiration check (garbage
 *    collection) in a half hour, do a full expiration check to
 *    collect stale entries that haven't been touched for a day.
 */
#define COUNT_FOR_FULL_EXPIRATION   30
static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;


/*
 *     for the IPVS lblc entry hash table
 */
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS      10
#endif
#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
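
/*
 * With the default 10 bits this is a 1024-bucket table, and masking
 * with IP_VS_LBLC_TAB_MASK keeps the low 10 bits of the hash, i.e.
 * (hash & IP_VS_LBLC_TAB_MASK) == (hash % IP_VS_LBLC_TAB_SIZE).
 */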


/*
 *      IPVS lblc entry represents an association between destination
 *      IP address and its destination server
 */
struct ip_vs_lblc_entry {
        struct list_head        list;
        __u32                   addr;           /* destination IP address */
        struct ip_vs_dest       *dest;          /* real server (cache) */
        unsigned long           lastuse;        /* last used time */
};


/*
 *      IPVS lblc hash table
 */
struct ip_vs_lblc_table {
        rwlock_t                lock;           /* lock for this table */
        struct list_head        bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
        atomic_t                entries;        /* number of entries */
        int                     max_size;       /* maximum size of entries */
        struct timer_list       periodic_timer; /* collect stale entries */
        int                     rover;          /* rover for expire check */
        int                     counter;        /* counter for no expire */
};


/*
 *      IPVS LBLC sysctl table
 */

static ctl_table vs_vars_table[] = {
        {
                .ctl_name       = NET_IPV4_VS_LBLC_EXPIRE,
                .procname       = "lblc_expiration",
                .data           = &sysctl_ip_vs_lblc_expiration,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
        },
        { .ctl_name = 0 }
};

static ctl_table vs_table[] = {
        {
                .ctl_name       = NET_IPV4_VS,
                .procname       = "vs",
                .mode           = 0555,
                .child          = vs_vars_table
        },
        { .ctl_name = 0 }
};

static ctl_table ipvs_ipv4_table[] = {
        {
                .ctl_name       = NET_IPV4,
                .procname       = "ipv4",
                .mode           = 0555,
                .child          = vs_table
        },
        { .ctl_name = 0 }
};

static ctl_table lblc_root_table[] = {
        {
                .ctl_name       = CTL_NET,
                .procname       = "net",
                .mode           = 0555,
                .child          = ipvs_ipv4_table
        },
        { .ctl_name = 0 }
};
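
/*
 * The table chain above exposes a single writable knob,
 * /proc/sys/net/ipv4/vs/lblc_expiration (mode 0644).  Since the
 * handler is proc_dointvec_jiffies, userspace reads and writes the
 * value in seconds while the kernel stores it in jiffies.
 */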

static struct ctl_table_header * sysctl_header;

/*
 *      new/free an ip_vs_lblc_entry, which is a mapping of a destination
 *      IP address to a server.
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
{
        struct ip_vs_lblc_entry *en;

        en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
        if (en == NULL) {
                IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
                return NULL;
        }

        INIT_LIST_HEAD(&en->list);
        en->addr = daddr;

        atomic_inc(&dest->refcnt);
        en->dest = dest;

        return en;
}


static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
{
        list_del(&en->list);
        /*
         * We don't kfree dest because it is referred to either by its
         * service or the trash dest list.
         */
        atomic_dec(&en->dest->refcnt);
        kfree(en);
}


/*
 *      Returns hash value for IPVS LBLC entry
 */
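/*
 * 2654435761 is the classic multiplicative hash constant (a prime
 * close to 2^32 divided by the golden ratio); multiplying by it
 * spreads clustered addresses fairly evenly across the low bits
 * that the table mask keeps.
 */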
static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
{
        return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
}


/*
 *      Hash an entry in the ip_vs_lblc_table.
 *      Returns bool success.
 */
static int
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
        unsigned hash;

        if (!list_empty(&en->list)) {
                IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
                          "called from %p\n", __builtin_return_address(0));
                return 0;
        }

        /*
         *      Hash by destination IP address
         */
        hash = ip_vs_lblc_hashkey(en->addr);

        write_lock(&tbl->lock);
        list_add(&en->list, &tbl->bucket[hash]);
        atomic_inc(&tbl->entries);
        write_unlock(&tbl->lock);

        return 1;
}


/*
 *  Get the ip_vs_lblc_entry associated with the supplied parameters.
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr)
{
        unsigned hash;
        struct ip_vs_lblc_entry *en;

        hash = ip_vs_lblc_hashkey(addr);

        read_lock(&tbl->lock);

        list_for_each_entry(en, &tbl->bucket[hash], list) {
                if (en->addr == addr) {
                        /* HIT */
                        read_unlock(&tbl->lock);
                        return en;
                }
        }

        read_unlock(&tbl->lock);

        return NULL;
}


/*
 *      Flush all the entries of the specified table.
 */
static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
{
        int i;
        struct ip_vs_lblc_entry *en, *nxt;

        for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
                write_lock(&tbl->lock);
                list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
                        ip_vs_lblc_free(en);
                        atomic_dec(&tbl->entries);
                }
                write_unlock(&tbl->lock);
        }
}


static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
{
        unsigned long now = jiffies;
        int i, j;
        struct ip_vs_lblc_entry *en, *nxt;

        for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
                j = (j + 1) & IP_VS_LBLC_TAB_MASK;

                write_lock(&tbl->lock);
                list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
                        if (time_before(now,
                                        en->lastuse + sysctl_ip_vs_lblc_expiration))
                                continue;

                        ip_vs_lblc_free(en);
                        atomic_dec(&tbl->entries);
                }
                write_unlock(&tbl->lock);
        }
        tbl->rover = j;
}


/*
 *      Periodic timer handler for the IPVS lblc table.
 *      It is used to collect stale entries when the number of entries
 *      exceeds the maximum size of the table.
 *
 *      Fixme: we probably need a more complicated algorithm to collect
 *             entries that have not been used for a long time even
 *             if the number of entries doesn't exceed the maximum size
 *             of the table.
 *      The full expiration check serves this purpose for now.
 */
static void ip_vs_lblc_check_expire(unsigned long data)
{
        struct ip_vs_lblc_table *tbl;
        unsigned long now = jiffies;
        int goal;
        int i, j;
        struct ip_vs_lblc_entry *en, *nxt;

        tbl = (struct ip_vs_lblc_table *)data;

        if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
                /* do a full expiration check */
                ip_vs_lblc_full_check(tbl);
                tbl->counter = 1;
                goto out;
        }

        if (atomic_read(&tbl->entries) <= tbl->max_size) {
                tbl->counter++;
                goto out;
        }

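        /*
         * Aim to free a third more entries than the excess over
         * max_size, but cap the goal at half the table's soft limit
         * so a single sweep cannot empty a busy table.
         */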
        goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
        if (goal > tbl->max_size/2)
                goal = tbl->max_size/2;

        for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
                j = (j + 1) & IP_VS_LBLC_TAB_MASK;

                write_lock(&tbl->lock);
                list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
                        if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
                                continue;

                        ip_vs_lblc_free(en);
                        atomic_dec(&tbl->entries);
                        goal--;
                }
                write_unlock(&tbl->lock);
                if (goal <= 0)
                        break;
        }
        tbl->rover = j;

  out:
        mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
}


static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
        int i;
        struct ip_vs_lblc_table *tbl;

        /*
         *    Allocate the ip_vs_lblc_table for this service
         */
        tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
        if (tbl == NULL) {
                IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
                return -ENOMEM;
        }
        svc->sched_data = tbl;
        IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
                  "current service\n",
                  sizeof(struct ip_vs_lblc_table));

        /*
         *    Initialize the hash buckets
         */
        for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
                INIT_LIST_HEAD(&tbl->bucket[i]);
        }
        rwlock_init(&tbl->lock);
        tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
        tbl->rover = 0;
        tbl->counter = 1;

        /*
         *    Hook the periodic timer for garbage collection
         */
        init_timer(&tbl->periodic_timer);
        tbl->periodic_timer.data = (unsigned long)tbl;
        tbl->periodic_timer.function = ip_vs_lblc_check_expire;
        tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
        add_timer(&tbl->periodic_timer);

        return 0;
}


static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
        struct ip_vs_lblc_table *tbl = svc->sched_data;

        /* remove the periodic timer */
        del_timer_sync(&tbl->periodic_timer);

        /* got to clean up table entries here */
        ip_vs_lblc_flush(tbl);

        /* release the table itself */
        kfree(svc->sched_data);
        IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
                  sizeof(struct ip_vs_lblc_table));

        return 0;
}


static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
{
        return 0;
}


static inline struct ip_vs_dest *
__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
        struct ip_vs_dest *dest, *least;
        int loh, doh;

        /*
         * We think the overhead of processing active connections is
         * fifty times higher than that of inactive connections on
         * average.  (This factor of fifty may not be accurate; it may
         * be changed later.)  We use the following formula to estimate
         * the overhead:
         *                dest->activeconns*50 + dest->inactconns
         * and the load:
         *                (dest overhead) / dest->weight
         *
         * Remember -- no floats in kernel mode!!!
         * The comparison h1*w2 > h2*w1 is equivalent to
         *                h1/w1 > h2/w2
         * if every weight is larger than zero.
         *
         * A server with weight=0 is quiesced and will not receive any
         * new connections.
         */
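        /*
         * Worked example with made-up numbers: comparing loh=100 at
         * weight 3 against doh=120 at weight 4 computes 100*4=400 and
         * 120*3=360; 400 > 360, so the weight-4 server is preferred,
         * matching 100/3 > 120/4 without any division.
         */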
        list_for_each_entry(dest, &svc->destinations, n_list) {
                if (dest->flags & IP_VS_DEST_F_OVERLOAD)
                        continue;
                if (atomic_read(&dest->weight) > 0) {
                        least = dest;
                        loh = atomic_read(&least->activeconns) * 50
                                + atomic_read(&least->inactconns);
                        goto nextstage;
                }
        }
        return NULL;

        /*
         *    Find the destination with the least load.
         */
  nextstage:
        list_for_each_entry_continue(dest, &svc->destinations, n_list) {
                if (dest->flags & IP_VS_DEST_F_OVERLOAD)
                        continue;

                doh = atomic_read(&dest->activeconns) * 50
                        + atomic_read(&dest->inactconns);
                if (loh * atomic_read(&dest->weight) >
                    doh * atomic_read(&least->weight)) {
                        least = dest;
                        loh = doh;
                }
        }

        IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
                  "activeconns %d refcnt %d weight %d overhead %d\n",
                  NIPQUAD(least->addr), ntohs(least->port),
                  atomic_read(&least->activeconns),
                  atomic_read(&least->refcnt),
                  atomic_read(&least->weight), loh);

        return least;
}


/*
 *   If this destination server is overloaded and there is a less loaded
 *   server, then return true.
 */
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
        if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
                struct ip_vs_dest *d;

                list_for_each_entry(d, &svc->destinations, n_list) {
                        if (atomic_read(&d->activeconns)*2
                            < atomic_read(&d->weight)) {
                                return 1;
                        }
                }
        }
        return 0;
}


/*
 *    Locality-Based (weighted) Least-Connection scheduling
 */
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
        struct ip_vs_dest *dest;
        struct ip_vs_lblc_table *tbl;
        struct ip_vs_lblc_entry *en;
        struct iphdr *iph = skb->nh.iph;

        IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");

        tbl = (struct ip_vs_lblc_table *)svc->sched_data;
        en = ip_vs_lblc_get(tbl, iph->daddr);
        if (en == NULL) {
                dest = __ip_vs_wlc_schedule(svc, iph);
                if (dest == NULL) {
                        IP_VS_DBG(1, "no destination available\n");
                        return NULL;
                }
                en = ip_vs_lblc_new(iph->daddr, dest);
                if (en == NULL) {
                        return NULL;
                }
                ip_vs_lblc_hash(tbl, en);
        } else {
                dest = en->dest;
                if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
                    || atomic_read(&dest->weight) <= 0
                    || is_overloaded(dest, svc)) {
                        dest = __ip_vs_wlc_schedule(svc, iph);
                        if (dest == NULL) {
                                IP_VS_DBG(1, "no destination available\n");
                                return NULL;
                        }
                        atomic_dec(&en->dest->refcnt);
                        atomic_inc(&dest->refcnt);
                        en->dest = dest;
                }
        }
        en->lastuse = jiffies;

        IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
                  "--> server %u.%u.%u.%u:%d\n",
                  NIPQUAD(en->addr),
                  NIPQUAD(dest->addr),
                  ntohs(dest->port));

        return dest;
}


/*
 *      IPVS LBLC Scheduler structure
 */
static struct ip_vs_scheduler ip_vs_lblc_scheduler =
{
        .name =                 "lblc",
        .refcnt =               ATOMIC_INIT(0),
        .module =               THIS_MODULE,
        .init_service =         ip_vs_lblc_init_svc,
        .done_service =         ip_vs_lblc_done_svc,
        .update_service =       ip_vs_lblc_update_svc,
        .schedule =             ip_vs_lblc_schedule,
};


static int __init ip_vs_lblc_init(void)
{
        INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
        sysctl_header = register_sysctl_table(lblc_root_table, 0);
        return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
}


static void __exit ip_vs_lblc_cleanup(void)
{
        unregister_sysctl_table(sysctl_header);
        unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
}


module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");