Pull platform-drivers into test branch
[linux-2.6] / net / ipv4 / ipvs / ip_vs_lblcr.c
1 /*
2  * IPVS:        Locality-Based Least-Connection with Replication scheduler
3  *
4  * Version:     $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $
5  *
6  * Authors:     Wensong Zhang <wensong@gnuchina.org>
7  *
8  *              This program is free software; you can redistribute it and/or
9  *              modify it under the terms of the GNU General Public License
10  *              as published by the Free Software Foundation; either version
11  *              2 of the License, or (at your option) any later version.
12  *
13  * Changes:
14  *     Julian Anastasov        :    Added the missing (dest->weight>0)
15  *                                  condition in the ip_vs_dest_set_max.
16  *
17  */
18
19 /*
20  * The lblc/r algorithm is as follows (pseudo code):
21  *
22  *       if serverSet[dest_ip] is null then
23  *               n, serverSet[dest_ip] <- {weighted least-conn node};
24  *       else
25  *               n <- {least-conn (alive) node in serverSet[dest_ip]};
26  *               if (n is null) OR
27  *                  (n.conns>n.weight AND
28  *                   there is a node m with m.conns<m.weight/2) then
29  *                   n <- {weighted least-conn node};
30  *                   add n to serverSet[dest_ip];
31  *               if |serverSet[dest_ip]| > 1 AND
32  *                   now - serverSet[dest_ip].lastMod > T then
33  *                   m <- {most conn node in serverSet[dest_ip]};
34  *                   remove m from serverSet[dest_ip];
35  *       if serverSet[dest_ip] changed then
36  *               serverSet[dest_ip].lastMod <- now;
37  *
38  *       return n;
39  *
40  */
41
42 #include <linux/ip.h>
43 #include <linux/module.h>
44 #include <linux/kernel.h>
45 #include <linux/skbuff.h>
46 #include <linux/jiffies.h>
47
48 /* for sysctl */
49 #include <linux/fs.h>
50 #include <linux/sysctl.h>
51 /* for proc_net_create/proc_net_remove */
52 #include <linux/proc_fs.h>
53
54 #include <net/ip_vs.h>
55
56
/*
 *	Garbage collection of stale IPVS lblcr entries when the table is
 *	over-full: the periodic timer is re-armed every CHECK_EXPIRE_INTERVAL,
 *	and the partial check drops entries whose last use is at least
 *	ENTRY_TIMEOUT in the past.
 */
#define CHECK_EXPIRE_INTERVAL	(60 * HZ)
#define ENTRY_TIMEOUT		(6 * 60 * HZ)
63
64 /*
65  *    It is for full expiration check.
66  *    When there is no partial expiration check (garbage collection)
67  *    in a half hour, do a full expiration check to collect stale
68  *    entries that haven't been touched for a day.
69  */
70 #define COUNT_FOR_FULL_EXPIRATION   30
71 static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
72
73
/*
 *	Geometry of the IPVS lblcr entry hash table.
 *	The number of buckets is a power of two so that a hash value can be
 *	reduced to a bucket index with a simple mask.
 */
#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
#define CONFIG_IP_VS_LBLCR_TAB_BITS	10
#endif
#define IP_VS_LBLCR_TAB_BITS	CONFIG_IP_VS_LBLCR_TAB_BITS
#define IP_VS_LBLCR_TAB_SIZE	(1 << IP_VS_LBLCR_TAB_BITS)
#define IP_VS_LBLCR_TAB_MASK	(IP_VS_LBLCR_TAB_SIZE - 1)
83
84
/*
 *	IPVS destination set: one node of the singly linked list that
 *	makes up a server set.
 */
struct ip_vs_dest_list {
	struct ip_vs_dest_list	*next;	/* following node, NULL at the tail */
	struct ip_vs_dest	*dest;	/* destination server held by this node */
};
92
93 struct ip_vs_dest_set {
94         atomic_t                size;           /* set size */
95         unsigned long           lastmod;        /* last modified time */
96         struct ip_vs_dest_list  *list;          /* destination list */
97         rwlock_t                lock;           /* lock for this list */
98 };
99
100
101 static struct ip_vs_dest_list *
102 ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
103 {
104         struct ip_vs_dest_list *e;
105
106         for (e=set->list; e!=NULL; e=e->next) {
107                 if (e->dest == dest)
108                         /* already existed */
109                         return NULL;
110         }
111
112         e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
113         if (e == NULL) {
114                 IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
115                 return NULL;
116         }
117
118         atomic_inc(&dest->refcnt);
119         e->dest = dest;
120
121         /* link it to the list */
122         write_lock(&set->lock);
123         e->next = set->list;
124         set->list = e;
125         atomic_inc(&set->size);
126         write_unlock(&set->lock);
127
128         set->lastmod = jiffies;
129         return e;
130 }
131
132 static void
133 ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
134 {
135         struct ip_vs_dest_list *e, **ep;
136
137         write_lock(&set->lock);
138         for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
139                 if (e->dest == dest) {
140                         /* HIT */
141                         *ep = e->next;
142                         atomic_dec(&set->size);
143                         set->lastmod = jiffies;
144                         atomic_dec(&e->dest->refcnt);
145                         kfree(e);
146                         break;
147                 }
148                 ep = &e->next;
149         }
150         write_unlock(&set->lock);
151 }
152
153 static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
154 {
155         struct ip_vs_dest_list *e, **ep;
156
157         write_lock(&set->lock);
158         for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
159                 *ep = e->next;
160                 /*
161                  * We don't kfree dest because it is refered either
162                  * by its service or by the trash dest list.
163                  */
164                 atomic_dec(&e->dest->refcnt);
165                 kfree(e);
166         }
167         write_unlock(&set->lock);
168 }
169
170 /* get weighted least-connection node in the destination set */
171 static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
172 {
173         register struct ip_vs_dest_list *e;
174         struct ip_vs_dest *dest, *least;
175         int loh, doh;
176
177         if (set == NULL)
178                 return NULL;
179
180         read_lock(&set->lock);
181         /* select the first destination server, whose weight > 0 */
182         for (e=set->list; e!=NULL; e=e->next) {
183                 least = e->dest;
184                 if (least->flags & IP_VS_DEST_F_OVERLOAD)
185                         continue;
186
187                 if ((atomic_read(&least->weight) > 0)
188                     && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
189                         loh = atomic_read(&least->activeconns) * 50
190                                 + atomic_read(&least->inactconns);
191                         goto nextstage;
192                 }
193         }
194         read_unlock(&set->lock);
195         return NULL;
196
197         /* find the destination with the weighted least load */
198   nextstage:
199         for (e=e->next; e!=NULL; e=e->next) {
200                 dest = e->dest;
201                 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
202                         continue;
203
204                 doh = atomic_read(&dest->activeconns) * 50
205                         + atomic_read(&dest->inactconns);
206                 if ((loh * atomic_read(&dest->weight) >
207                      doh * atomic_read(&least->weight))
208                     && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
209                         least = dest;
210                         loh = doh;
211                 }
212         }
213         read_unlock(&set->lock);
214
215         IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
216                   "activeconns %d refcnt %d weight %d overhead %d\n",
217                   NIPQUAD(least->addr), ntohs(least->port),
218                   atomic_read(&least->activeconns),
219                   atomic_read(&least->refcnt),
220                   atomic_read(&least->weight), loh);
221         return least;
222 }
223
224
225 /* get weighted most-connection node in the destination set */
226 static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
227 {
228         register struct ip_vs_dest_list *e;
229         struct ip_vs_dest *dest, *most;
230         int moh, doh;
231
232         if (set == NULL)
233                 return NULL;
234
235         read_lock(&set->lock);
236         /* select the first destination server, whose weight > 0 */
237         for (e=set->list; e!=NULL; e=e->next) {
238                 most = e->dest;
239                 if (atomic_read(&most->weight) > 0) {
240                         moh = atomic_read(&most->activeconns) * 50
241                                 + atomic_read(&most->inactconns);
242                         goto nextstage;
243                 }
244         }
245         read_unlock(&set->lock);
246         return NULL;
247
248         /* find the destination with the weighted most load */
249   nextstage:
250         for (e=e->next; e!=NULL; e=e->next) {
251                 dest = e->dest;
252                 doh = atomic_read(&dest->activeconns) * 50
253                         + atomic_read(&dest->inactconns);
254                 /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
255                 if ((moh * atomic_read(&dest->weight) <
256                      doh * atomic_read(&most->weight))
257                     && (atomic_read(&dest->weight) > 0)) {
258                         most = dest;
259                         moh = doh;
260                 }
261         }
262         read_unlock(&set->lock);
263
264         IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
265                   "activeconns %d refcnt %d weight %d overhead %d\n",
266                   NIPQUAD(most->addr), ntohs(most->port),
267                   atomic_read(&most->activeconns),
268                   atomic_read(&most->refcnt),
269                   atomic_read(&most->weight), moh);
270         return most;
271 }
272
273
274 /*
275  *      IPVS lblcr entry represents an association between destination
276  *      IP address and its destination server set
277  */
278 struct ip_vs_lblcr_entry {
279         struct list_head        list;
280         __be32                   addr;           /* destination IP address */
281         struct ip_vs_dest_set   set;            /* destination server set */
282         unsigned long           lastuse;        /* last used time */
283 };
284
285
286 /*
287  *      IPVS lblcr hash table
288  */
289 struct ip_vs_lblcr_table {
290         rwlock_t                lock;           /* lock for this table */
291         struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
292         atomic_t                entries;        /* number of entries */
293         int                     max_size;       /* maximum size of entries */
294         struct timer_list       periodic_timer; /* collect stale entries */
295         int                     rover;          /* rover for expire check */
296         int                     counter;        /* counter for no expire */
297 };
298
299
300 /*
301  *      IPVS LBLCR sysctl table
302  */
303
304 static ctl_table vs_vars_table[] = {
305         {
306                 .ctl_name       = NET_IPV4_VS_LBLCR_EXPIRE,
307                 .procname       = "lblcr_expiration",
308                 .data           = &sysctl_ip_vs_lblcr_expiration,
309                 .maxlen         = sizeof(int),
310                 .mode           = 0644, 
311                 .proc_handler   = &proc_dointvec_jiffies,
312         },
313         { .ctl_name = 0 }
314 };
315
316 static ctl_table vs_table[] = {
317         {
318                 .ctl_name       = NET_IPV4_VS,
319                 .procname       = "vs",
320                 .mode           = 0555,
321                 .child          = vs_vars_table
322         },
323         { .ctl_name = 0 }
324 };
325
326 static ctl_table ipvs_ipv4_table[] = {
327         {
328                 .ctl_name       = NET_IPV4,
329                 .procname       = "ipv4", 
330                 .mode           = 0555,
331                 .child          = vs_table
332         },
333         { .ctl_name = 0 }
334 };
335
336 static ctl_table lblcr_root_table[] = {
337         {
338                 .ctl_name       = CTL_NET,
339                 .procname       = "net", 
340                 .mode           = 0555, 
341                 .child          = ipvs_ipv4_table
342         },
343         { .ctl_name = 0 }
344 };
345
346 static struct ctl_table_header * sysctl_header;
347
348 /*
349  *      new/free a ip_vs_lblcr_entry, which is a mapping of a destination
350  *      IP address to a server.
351  */
352 static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr)
353 {
354         struct ip_vs_lblcr_entry *en;
355
356         en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
357         if (en == NULL) {
358                 IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
359                 return NULL;
360         }
361
362         INIT_LIST_HEAD(&en->list);
363         en->addr = daddr;
364
365         /* initilize its dest set */
366         atomic_set(&(en->set.size), 0);
367         en->set.list = NULL;
368         rwlock_init(&en->set.lock);
369
370         return en;
371 }
372
373
374 static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
375 {
376         list_del(&en->list);
377         ip_vs_dest_set_eraseall(&en->set);
378         kfree(en);
379 }
380
381
382 /*
383  *      Returns hash value for IPVS LBLCR entry
384  */
385 static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
386 {
387         return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
388 }
389
390
391 /*
392  *      Hash an entry in the ip_vs_lblcr_table.
393  *      returns bool success.
394  */
395 static int
396 ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
397 {
398         unsigned hash;
399
400         if (!list_empty(&en->list)) {
401                 IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
402                           "called from %p\n", __builtin_return_address(0));
403                 return 0;
404         }
405
406         /*
407          *      Hash by destination IP address
408          */
409         hash = ip_vs_lblcr_hashkey(en->addr);
410
411         write_lock(&tbl->lock);
412         list_add(&en->list, &tbl->bucket[hash]);
413         atomic_inc(&tbl->entries);
414         write_unlock(&tbl->lock);
415
416         return 1;
417 }
418
419
420 /*
421  *  Get ip_vs_lblcr_entry associated with supplied parameters.
422  */
423 static inline struct ip_vs_lblcr_entry *
424 ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
425 {
426         unsigned hash;
427         struct ip_vs_lblcr_entry *en;
428
429         hash = ip_vs_lblcr_hashkey(addr);
430
431         read_lock(&tbl->lock);
432
433         list_for_each_entry(en, &tbl->bucket[hash], list) {
434                 if (en->addr == addr) {
435                         /* HIT */
436                         read_unlock(&tbl->lock);
437                         return en;
438                 }
439         }
440
441         read_unlock(&tbl->lock);
442
443         return NULL;
444 }
445
446
447 /*
448  *      Flush all the entries of the specified table.
449  */
450 static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
451 {
452         int i;
453         struct ip_vs_lblcr_entry *en, *nxt;
454
455         for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
456                 write_lock(&tbl->lock);
457                 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
458                         ip_vs_lblcr_free(en);
459                         atomic_dec(&tbl->entries);
460                 }
461                 write_unlock(&tbl->lock);
462         }
463 }
464
465
466 static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
467 {
468         unsigned long now = jiffies;
469         int i, j;
470         struct ip_vs_lblcr_entry *en, *nxt;
471
472         for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
473                 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
474
475                 write_lock(&tbl->lock);
476                 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
477                         if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
478                                        now))
479                                 continue;
480
481                         ip_vs_lblcr_free(en);
482                         atomic_dec(&tbl->entries);
483                 }
484                 write_unlock(&tbl->lock);
485         }
486         tbl->rover = j;
487 }
488
489
490 /*
491  *      Periodical timer handler for IPVS lblcr table
492  *      It is used to collect stale entries when the number of entries
493  *      exceeds the maximum size of the table.
494  *
495  *      Fixme: we probably need more complicated algorithm to collect
496  *             entries that have not been used for a long time even
497  *             if the number of entries doesn't exceed the maximum size
498  *             of the table.
499  *      The full expiration check is for this purpose now.
500  */
501 static void ip_vs_lblcr_check_expire(unsigned long data)
502 {
503         struct ip_vs_lblcr_table *tbl;
504         unsigned long now = jiffies;
505         int goal;
506         int i, j;
507         struct ip_vs_lblcr_entry *en, *nxt;
508
509         tbl = (struct ip_vs_lblcr_table *)data;
510
511         if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
512                 /* do full expiration check */
513                 ip_vs_lblcr_full_check(tbl);
514                 tbl->counter = 1;
515                 goto out;
516         }
517
518         if (atomic_read(&tbl->entries) <= tbl->max_size) {
519                 tbl->counter++;
520                 goto out;
521         }
522
523         goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
524         if (goal > tbl->max_size/2)
525                 goal = tbl->max_size/2;
526
527         for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
528                 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
529
530                 write_lock(&tbl->lock);
531                 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
532                         if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
533                                 continue;
534
535                         ip_vs_lblcr_free(en);
536                         atomic_dec(&tbl->entries);
537                         goal--;
538                 }
539                 write_unlock(&tbl->lock);
540                 if (goal <= 0)
541                         break;
542         }
543         tbl->rover = j;
544
545   out:
546         mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
547 }
548
549
#ifdef CONFIG_IP_VS_LBLCR_DEBUG
/* Table shown by the debug proc file; set by ip_vs_lblcr_init_svc(). */
static struct ip_vs_lblcr_table *lblcr_table_list;

/*
 *	/proc/net/ip_vs_lblcr to display the mappings of
 *		    destination IP address <==> its serverSet
 *
 *	Writes a header line followed by one line per entry (idle time in
 *	jiffies, destination address, server addresses) into @buffer and
 *	returns the number of bytes available at *@start, never more than
 *	@length and never negative.
 *
 *	When no table has been recorded yet only the header is produced
 *	instead of dereferencing a NULL table pointer.
 */
static int
ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length)
{
	off_t pos=0, begin;
	int len=0, size;
	struct ip_vs_lblcr_table *tbl;
	unsigned long now = jiffies;
	int i;
	struct ip_vs_lblcr_entry *en;

	tbl = lblcr_table_list;

	size = sprintf(buffer, "LastTime Dest IP address  Server set\n");
	pos += size;
	len += size;

	/* no service has set up a table yet: header only */
	if (tbl == NULL)
		goto done;

	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
		read_lock_bh(&tbl->lock);
		list_for_each_entry(en, &tbl->bucket[i], list) {
			char tbuf[16];
			struct ip_vs_dest_list *d;

			sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr));
			size = sprintf(buffer+len, "%8lu %-16s ",
				       now-en->lastuse, tbuf);

			read_lock(&en->set.lock);
			for (d=en->set.list; d!=NULL; d=d->next) {
				size += sprintf(buffer+len+size,
						"%u.%u.%u.%u ",
						NIPQUAD(d->dest->addr));
			}
			read_unlock(&en->set.lock);
			size += sprintf(buffer+len+size, "\n");
			len += size;
			pos += size;
			/* everything so far lies before the requested offset */
			if (pos <= offset)
				len=0;
			if (pos >= offset+length) {
				read_unlock_bh(&tbl->lock);
				goto done;
			}
		}
		read_unlock_bh(&tbl->lock);
	}

  done:
	begin = len - (pos - offset);
	*start = buffer + begin;
	len -= begin;
	if(len>length)
		len = length;
	/* offset beyond the generated text: report end of data */
	if (len < 0)
		len = 0;
	return len;
}
#endif
612
613
614 static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
615 {
616         int i;
617         struct ip_vs_lblcr_table *tbl;
618
619         /*
620          *    Allocate the ip_vs_lblcr_table for this service
621          */
622         tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
623         if (tbl == NULL) {
624                 IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
625                 return -ENOMEM;
626         }
627         svc->sched_data = tbl;
628         IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
629                   "current service\n",
630                   sizeof(struct ip_vs_lblcr_table));
631
632         /*
633          *    Initialize the hash buckets
634          */
635         for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
636                 INIT_LIST_HEAD(&tbl->bucket[i]);
637         }
638         rwlock_init(&tbl->lock);
639         tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
640         tbl->rover = 0;
641         tbl->counter = 1;
642
643         /*
644          *    Hook periodic timer for garbage collection
645          */
646         init_timer(&tbl->periodic_timer);
647         tbl->periodic_timer.data = (unsigned long)tbl;
648         tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
649         tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
650         add_timer(&tbl->periodic_timer);
651
652 #ifdef CONFIG_IP_VS_LBLCR_DEBUG
653         lblcr_table_list = tbl;
654 #endif
655         return 0;
656 }
657
658
659 static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
660 {
661         struct ip_vs_lblcr_table *tbl = svc->sched_data;
662
663         /* remove periodic timer */
664         del_timer_sync(&tbl->periodic_timer);
665
666         /* got to clean up table entries here */
667         ip_vs_lblcr_flush(tbl);
668
669         /* release the table itself */
670         kfree(svc->sched_data);
671         IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
672                   sizeof(struct ip_vs_lblcr_table));
673
674         return 0;
675 }
676
677
/*
 *	Scheduler update_service hook: does nothing and reports success.
 */
static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
{
	return 0;
}
682
683
684 static inline struct ip_vs_dest *
685 __ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
686 {
687         struct ip_vs_dest *dest, *least;
688         int loh, doh;
689
690         /*
691          * We think the overhead of processing active connections is fifty
692          * times higher than that of inactive connections in average. (This
693          * fifty times might not be accurate, we will change it later.) We
694          * use the following formula to estimate the overhead:
695          *                dest->activeconns*50 + dest->inactconns
696          * and the load:
697          *                (dest overhead) / dest->weight
698          *
699          * Remember -- no floats in kernel mode!!!
700          * The comparison of h1*w2 > h2*w1 is equivalent to that of
701          *                h1/w1 > h2/w2
702          * if every weight is larger than zero.
703          *
704          * The server with weight=0 is quiesced and will not receive any
705          * new connection.
706          */
707         list_for_each_entry(dest, &svc->destinations, n_list) {
708                 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
709                         continue;
710
711                 if (atomic_read(&dest->weight) > 0) {
712                         least = dest;
713                         loh = atomic_read(&least->activeconns) * 50
714                                 + atomic_read(&least->inactconns);
715                         goto nextstage;
716                 }
717         }
718         return NULL;
719
720         /*
721          *    Find the destination with the least load.
722          */
723   nextstage:
724         list_for_each_entry_continue(dest, &svc->destinations, n_list) {
725                 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
726                         continue;
727
728                 doh = atomic_read(&dest->activeconns) * 50
729                         + atomic_read(&dest->inactconns);
730                 if (loh * atomic_read(&dest->weight) >
731                     doh * atomic_read(&least->weight)) {
732                         least = dest;
733                         loh = doh;
734                 }
735         }
736
737         IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
738                   "activeconns %d refcnt %d weight %d overhead %d\n",
739                   NIPQUAD(least->addr), ntohs(least->port),
740                   atomic_read(&least->activeconns),
741                   atomic_read(&least->refcnt),
742                   atomic_read(&least->weight), loh);
743
744         return least;
745 }
746
747
748 /*
749  *   If this destination server is overloaded and there is a less loaded
750  *   server, then return true.
751  */
752 static inline int
753 is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
754 {
755         if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
756                 struct ip_vs_dest *d;
757
758                 list_for_each_entry(d, &svc->destinations, n_list) {
759                         if (atomic_read(&d->activeconns)*2
760                             < atomic_read(&d->weight)) {
761                                 return 1;
762                         }
763                 }
764         }
765         return 0;
766 }
767
768
769 /*
770  *    Locality-Based (weighted) Least-Connection scheduling
771  */
772 static struct ip_vs_dest *
773 ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
774 {
775         struct ip_vs_dest *dest;
776         struct ip_vs_lblcr_table *tbl;
777         struct ip_vs_lblcr_entry *en;
778         struct iphdr *iph = skb->nh.iph;
779
780         IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
781
782         tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
783         en = ip_vs_lblcr_get(tbl, iph->daddr);
784         if (en == NULL) {
785                 dest = __ip_vs_wlc_schedule(svc, iph);
786                 if (dest == NULL) {
787                         IP_VS_DBG(1, "no destination available\n");
788                         return NULL;
789                 }
790                 en = ip_vs_lblcr_new(iph->daddr);
791                 if (en == NULL) {
792                         return NULL;
793                 }
794                 ip_vs_dest_set_insert(&en->set, dest);
795                 ip_vs_lblcr_hash(tbl, en);
796         } else {
797                 dest = ip_vs_dest_set_min(&en->set);
798                 if (!dest || is_overloaded(dest, svc)) {
799                         dest = __ip_vs_wlc_schedule(svc, iph);
800                         if (dest == NULL) {
801                                 IP_VS_DBG(1, "no destination available\n");
802                                 return NULL;
803                         }
804                         ip_vs_dest_set_insert(&en->set, dest);
805                 }
806                 if (atomic_read(&en->set.size) > 1 &&
807                     jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
808                         struct ip_vs_dest *m;
809                         m = ip_vs_dest_set_max(&en->set);
810                         if (m)
811                                 ip_vs_dest_set_erase(&en->set, m);
812                 }
813         }
814         en->lastuse = jiffies;
815
816         IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
817                   "--> server %u.%u.%u.%u:%d\n",
818                   NIPQUAD(en->addr),
819                   NIPQUAD(dest->addr),
820                   ntohs(dest->port));
821
822         return dest;
823 }
824
825
826 /*
827  *      IPVS LBLCR Scheduler structure
828  */
829 static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
830 {
831         .name =                 "lblcr",
832         .refcnt =               ATOMIC_INIT(0),
833         .module =               THIS_MODULE,
834         .init_service =         ip_vs_lblcr_init_svc,
835         .done_service =         ip_vs_lblcr_done_svc,
836         .update_service =       ip_vs_lblcr_update_svc,
837         .schedule =             ip_vs_lblcr_schedule,
838 };
839
840
841 static int __init ip_vs_lblcr_init(void)
842 {
843         INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
844         sysctl_header = register_sysctl_table(lblcr_root_table, 0);
845 #ifdef CONFIG_IP_VS_LBLCR_DEBUG
846         proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo);
847 #endif
848         return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
849 }
850
851
852 static void __exit ip_vs_lblcr_cleanup(void)
853 {
854 #ifdef CONFIG_IP_VS_LBLCR_DEBUG
855         proc_net_remove("ip_vs_lblcr");
856 #endif
857         unregister_sysctl_table(sysctl_header);
858         unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
859 }
860
861
862 module_init(ip_vs_lblcr_init);
863 module_exit(ip_vs_lblcr_cleanup);
864 MODULE_LICENSE("GPL");