Merge git://git.kernel.org/pub/scm/linux/kernel/git/sam/kbuild
[linux-2.6] / net / ipv4 / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9  *
10  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11  *              Peter Kese <peter.kese@ijs.si>
12  *              Julian Anastasov <ja@ssi.bg>
13  *
14  *              This program is free software; you can redistribute it and/or
15  *              modify it under the terms of the GNU General Public License
16  *              as published by the Free Software Foundation; either version
17  *              2 of the License, or (at your option) any later version.
18  *
19  * Changes:
20  *
21  */
22
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
26 #include <linux/fs.h>
27 #include <linux/sysctl.h>
28 #include <linux/proc_fs.h>
29 #include <linux/workqueue.h>
30 #include <linux/swap.h>
31 #include <linux/proc_fs.h>
32 #include <linux/seq_file.h>
33
34 #include <linux/netfilter.h>
35 #include <linux/netfilter_ipv4.h>
36
37 #include <net/ip.h>
38 #include <net/route.h>
39 #include <net/sock.h>
40
41 #include <asm/uaccess.h>
42
43 #include <net/ip_vs.h>
44
45 /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
46 static DECLARE_MUTEX(__ip_vs_mutex);
47
48 /* lock for service table */
49 static DEFINE_RWLOCK(__ip_vs_svc_lock);
50
51 /* lock for table with the real services */
52 static DEFINE_RWLOCK(__ip_vs_rs_lock);
53
54 /* lock for state and timeout tables */
55 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
56
57 /* lock for drop entry handling */
58 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
59
60 /* lock for drop packet handling */
61 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
62
63 /* 1/rate drop and drop-entry variables */
64 int ip_vs_drop_rate = 0;
65 int ip_vs_drop_counter = 0;
66 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
67
68 /* number of virtual services */
69 static int ip_vs_num_services = 0;
70
71 /* sysctl variables */
72 static int sysctl_ip_vs_drop_entry = 0;
73 static int sysctl_ip_vs_drop_packet = 0;
74 static int sysctl_ip_vs_secure_tcp = 0;
75 static int sysctl_ip_vs_amemthresh = 1024;
76 static int sysctl_ip_vs_am_droprate = 10;
77 int sysctl_ip_vs_cache_bypass = 0;
78 int sysctl_ip_vs_expire_nodest_conn = 0;
79 int sysctl_ip_vs_expire_quiescent_template = 0;
80 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
81 int sysctl_ip_vs_nat_icmp_send = 0;
82
83
#ifdef CONFIG_IP_VS_DEBUG
/* Debug level; only present when CONFIG_IP_VS_DEBUG is set. */
static int sysctl_ip_vs_debug_level;

/*
 *	Return the current value of sysctl_ip_vs_debug_level.
 */
int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
#endif /* CONFIG_IP_VS_DEBUG */
92
93 /*
94  *      update_defense_level is called from keventd and from sysctl,
95  *      so it needs to protect itself from softirqs
96  */
97 static void update_defense_level(void)
98 {
99         struct sysinfo i;
100         static int old_secure_tcp = 0;
101         int availmem;
102         int nomem;
103         int to_change = -1;
104
105         /* we only count free and buffered memory (in pages) */
106         si_meminfo(&i);
107         availmem = i.freeram + i.bufferram;
108         /* however in linux 2.5 the i.bufferram is total page cache size,
109            we need adjust it */
110         /* si_swapinfo(&i); */
111         /* availmem = availmem - (i.totalswap - i.freeswap); */
112
113         nomem = (availmem < sysctl_ip_vs_amemthresh);
114
115         local_bh_disable();
116
117         /* drop_entry */
118         spin_lock(&__ip_vs_dropentry_lock);
119         switch (sysctl_ip_vs_drop_entry) {
120         case 0:
121                 atomic_set(&ip_vs_dropentry, 0);
122                 break;
123         case 1:
124                 if (nomem) {
125                         atomic_set(&ip_vs_dropentry, 1);
126                         sysctl_ip_vs_drop_entry = 2;
127                 } else {
128                         atomic_set(&ip_vs_dropentry, 0);
129                 }
130                 break;
131         case 2:
132                 if (nomem) {
133                         atomic_set(&ip_vs_dropentry, 1);
134                 } else {
135                         atomic_set(&ip_vs_dropentry, 0);
136                         sysctl_ip_vs_drop_entry = 1;
137                 };
138                 break;
139         case 3:
140                 atomic_set(&ip_vs_dropentry, 1);
141                 break;
142         }
143         spin_unlock(&__ip_vs_dropentry_lock);
144
145         /* drop_packet */
146         spin_lock(&__ip_vs_droppacket_lock);
147         switch (sysctl_ip_vs_drop_packet) {
148         case 0:
149                 ip_vs_drop_rate = 0;
150                 break;
151         case 1:
152                 if (nomem) {
153                         ip_vs_drop_rate = ip_vs_drop_counter
154                                 = sysctl_ip_vs_amemthresh /
155                                 (sysctl_ip_vs_amemthresh-availmem);
156                         sysctl_ip_vs_drop_packet = 2;
157                 } else {
158                         ip_vs_drop_rate = 0;
159                 }
160                 break;
161         case 2:
162                 if (nomem) {
163                         ip_vs_drop_rate = ip_vs_drop_counter
164                                 = sysctl_ip_vs_amemthresh /
165                                 (sysctl_ip_vs_amemthresh-availmem);
166                 } else {
167                         ip_vs_drop_rate = 0;
168                         sysctl_ip_vs_drop_packet = 1;
169                 }
170                 break;
171         case 3:
172                 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
173                 break;
174         }
175         spin_unlock(&__ip_vs_droppacket_lock);
176
177         /* secure_tcp */
178         write_lock(&__ip_vs_securetcp_lock);
179         switch (sysctl_ip_vs_secure_tcp) {
180         case 0:
181                 if (old_secure_tcp >= 2)
182                         to_change = 0;
183                 break;
184         case 1:
185                 if (nomem) {
186                         if (old_secure_tcp < 2)
187                                 to_change = 1;
188                         sysctl_ip_vs_secure_tcp = 2;
189                 } else {
190                         if (old_secure_tcp >= 2)
191                                 to_change = 0;
192                 }
193                 break;
194         case 2:
195                 if (nomem) {
196                         if (old_secure_tcp < 2)
197                                 to_change = 1;
198                 } else {
199                         if (old_secure_tcp >= 2)
200                                 to_change = 0;
201                         sysctl_ip_vs_secure_tcp = 1;
202                 }
203                 break;
204         case 3:
205                 if (old_secure_tcp < 2)
206                         to_change = 1;
207                 break;
208         }
209         old_secure_tcp = sysctl_ip_vs_secure_tcp;
210         if (to_change >= 0)
211                 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
212         write_unlock(&__ip_vs_securetcp_lock);
213
214         local_bh_enable();
215 }
216
217
/*
 *	Timer for checking the defense
 *
 *	DEFENSE_TIMER_PERIOD is the delay, in jiffies, between two runs of
 *	defense_work_handler().  The expansion is parenthesized so that the
 *	macro keeps its value when used inside a larger expression.
 */
#define DEFENSE_TIMER_PERIOD	(1 * HZ)
static void defense_work_handler(void *data);
static DECLARE_WORK(defense_work, defense_work_handler, NULL);
224
225 static void defense_work_handler(void *data)
226 {
227         update_defense_level();
228         if (atomic_read(&ip_vs_dropentry))
229                 ip_vs_random_dropentry();
230
231         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
232 }
233
234 int
235 ip_vs_use_count_inc(void)
236 {
237         return try_module_get(THIS_MODULE);
238 }
239
240 void
241 ip_vs_use_count_dec(void)
242 {
243         module_put(THIS_MODULE);
244 }
245
246
247 /*
248  *      Hash table: for virtual service lookups
249  */
250 #define IP_VS_SVC_TAB_BITS 8
251 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
252 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
253
254 /* the service table hashed by <protocol, addr, port> */
255 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
256 /* the service table hashed by fwmark */
257 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
258
259 /*
260  *      Hash table: for real service lookups
261  */
262 #define IP_VS_RTAB_BITS 4
263 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
264 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
265
266 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
267
268 /*
269  *      Trash for destinations
270  */
271 static LIST_HEAD(ip_vs_dest_trash);
272
273 /*
274  *      FTP & NULL virtual service counters
275  */
276 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
277 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
278
279
280 /*
281  *      Returns hash value for virtual service
282  */
283 static __inline__ unsigned
284 ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
285 {
286         register unsigned porth = ntohs(port);
287
288         return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
289                 & IP_VS_SVC_TAB_MASK;
290 }
291
292 /*
293  *      Returns hash value of fwmark for virtual service lookup
294  */
295 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
296 {
297         return fwmark & IP_VS_SVC_TAB_MASK;
298 }
299
300 /*
301  *      Hashes a service in the ip_vs_svc_table by <proto,addr,port>
302  *      or in the ip_vs_svc_fwm_table by fwmark.
303  *      Should be called with locked tables.
304  */
305 static int ip_vs_svc_hash(struct ip_vs_service *svc)
306 {
307         unsigned hash;
308
309         if (svc->flags & IP_VS_SVC_F_HASHED) {
310                 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
311                           "called from %p\n", __builtin_return_address(0));
312                 return 0;
313         }
314
315         if (svc->fwmark == 0) {
316                 /*
317                  *  Hash it by <protocol,addr,port> in ip_vs_svc_table
318                  */
319                 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
320                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
321         } else {
322                 /*
323                  *  Hash it by fwmark in ip_vs_svc_fwm_table
324                  */
325                 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
326                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
327         }
328
329         svc->flags |= IP_VS_SVC_F_HASHED;
330         /* increase its refcnt because it is referenced by the svc table */
331         atomic_inc(&svc->refcnt);
332         return 1;
333 }
334
335
336 /*
337  *      Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
338  *      Should be called with locked tables.
339  */
340 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
341 {
342         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
343                 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
344                           "called from %p\n", __builtin_return_address(0));
345                 return 0;
346         }
347
348         if (svc->fwmark == 0) {
349                 /* Remove it from the ip_vs_svc_table table */
350                 list_del(&svc->s_list);
351         } else {
352                 /* Remove it from the ip_vs_svc_fwm_table table */
353                 list_del(&svc->f_list);
354         }
355
356         svc->flags &= ~IP_VS_SVC_F_HASHED;
357         atomic_dec(&svc->refcnt);
358         return 1;
359 }
360
361
362 /*
363  *      Get service by {proto,addr,port} in the service table.
364  */
365 static __inline__ struct ip_vs_service *
366 __ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
367 {
368         unsigned hash;
369         struct ip_vs_service *svc;
370
371         /* Check for "full" addressed entries */
372         hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
373
374         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
375                 if ((svc->addr == vaddr)
376                     && (svc->port == vport)
377                     && (svc->protocol == protocol)) {
378                         /* HIT */
379                         atomic_inc(&svc->usecnt);
380                         return svc;
381                 }
382         }
383
384         return NULL;
385 }
386
387
388 /*
389  *      Get service by {fwmark} in the service table.
390  */
391 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
392 {
393         unsigned hash;
394         struct ip_vs_service *svc;
395
396         /* Check for fwmark addressed entries */
397         hash = ip_vs_svc_fwm_hashkey(fwmark);
398
399         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
400                 if (svc->fwmark == fwmark) {
401                         /* HIT */
402                         atomic_inc(&svc->usecnt);
403                         return svc;
404                 }
405         }
406
407         return NULL;
408 }
409
410 struct ip_vs_service *
411 ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
412 {
413         struct ip_vs_service *svc;
414
415         read_lock(&__ip_vs_svc_lock);
416
417         /*
418          *      Check the table hashed by fwmark first
419          */
420         if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
421                 goto out;
422
423         /*
424          *      Check the table hashed by <protocol,addr,port>
425          *      for "full" addressed entries
426          */
427         svc = __ip_vs_service_get(protocol, vaddr, vport);
428
429         if (svc == NULL
430             && protocol == IPPROTO_TCP
431             && atomic_read(&ip_vs_ftpsvc_counter)
432             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
433                 /*
434                  * Check if ftp service entry exists, the packet
435                  * might belong to FTP data connections.
436                  */
437                 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
438         }
439
440         if (svc == NULL
441             && atomic_read(&ip_vs_nullsvc_counter)) {
442                 /*
443                  * Check if the catch-all port (port zero) exists
444                  */
445                 svc = __ip_vs_service_get(protocol, vaddr, 0);
446         }
447
448   out:
449         read_unlock(&__ip_vs_svc_lock);
450
451         IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
452                   fwmark, ip_vs_proto_name(protocol),
453                   NIPQUAD(vaddr), ntohs(vport),
454                   svc?"hit":"not hit");
455
456         return svc;
457 }
458
459
460 static inline void
461 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
462 {
463         atomic_inc(&svc->refcnt);
464         dest->svc = svc;
465 }
466
467 static inline void
468 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
469 {
470         struct ip_vs_service *svc = dest->svc;
471
472         dest->svc = NULL;
473         if (atomic_dec_and_test(&svc->refcnt))
474                 kfree(svc);
475 }
476
477
478 /*
479  *      Returns hash value for real service
480  */
481 static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
482 {
483         register unsigned porth = ntohs(port);
484
485         return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
486                 & IP_VS_RTAB_MASK;
487 }
488
489 /*
490  *      Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
491  *      should be called with locked tables.
492  */
493 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
494 {
495         unsigned hash;
496
497         if (!list_empty(&dest->d_list)) {
498                 return 0;
499         }
500
501         /*
502          *      Hash by proto,addr,port,
503          *      which are the parameters of the real service.
504          */
505         hash = ip_vs_rs_hashkey(dest->addr, dest->port);
506         list_add(&dest->d_list, &ip_vs_rtable[hash]);
507
508         return 1;
509 }
510
511 /*
512  *      UNhashes ip_vs_dest from ip_vs_rtable.
513  *      should be called with locked tables.
514  */
515 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
516 {
517         /*
518          * Remove it from the ip_vs_rtable table.
519          */
520         if (!list_empty(&dest->d_list)) {
521                 list_del(&dest->d_list);
522                 INIT_LIST_HEAD(&dest->d_list);
523         }
524
525         return 1;
526 }
527
528 /*
529  *      Lookup real service by <proto,addr,port> in the real service table.
530  */
531 struct ip_vs_dest *
532 ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
533 {
534         unsigned hash;
535         struct ip_vs_dest *dest;
536
537         /*
538          *      Check for "full" addressed entries
539          *      Return the first found entry
540          */
541         hash = ip_vs_rs_hashkey(daddr, dport);
542
543         read_lock(&__ip_vs_rs_lock);
544         list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
545                 if ((dest->addr == daddr)
546                     && (dest->port == dport)
547                     && ((dest->protocol == protocol) ||
548                         dest->vfwmark)) {
549                         /* HIT */
550                         read_unlock(&__ip_vs_rs_lock);
551                         return dest;
552                 }
553         }
554         read_unlock(&__ip_vs_rs_lock);
555
556         return NULL;
557 }
558
559 /*
560  *      Lookup destination by {addr,port} in the given service
561  */
562 static struct ip_vs_dest *
563 ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
564 {
565         struct ip_vs_dest *dest;
566
567         /*
568          * Find the destination for the given service
569          */
570         list_for_each_entry(dest, &svc->destinations, n_list) {
571                 if ((dest->addr == daddr) && (dest->port == dport)) {
572                         /* HIT */
573                         return dest;
574                 }
575         }
576
577         return NULL;
578 }
579
580
581 /*
582  *  Lookup dest by {svc,addr,port} in the destination trash.
583  *  The destination trash is used to hold the destinations that are removed
584  *  from the service table but are still referenced by some conn entries.
585  *  The reason to add the destination trash is when the dest is temporary
586  *  down (either by administrator or by monitor program), the dest can be
587  *  picked back from the trash, the remaining connections to the dest can
588  *  continue, and the counting information of the dest is also useful for
589  *  scheduling.
590  */
591 static struct ip_vs_dest *
592 ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
593 {
594         struct ip_vs_dest *dest, *nxt;
595
596         /*
597          * Find the destination in trash
598          */
599         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
600                 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
601                           "dest->refcnt=%d\n",
602                           dest->vfwmark,
603                           NIPQUAD(dest->addr), ntohs(dest->port),
604                           atomic_read(&dest->refcnt));
605                 if (dest->addr == daddr &&
606                     dest->port == dport &&
607                     dest->vfwmark == svc->fwmark &&
608                     dest->protocol == svc->protocol &&
609                     (svc->fwmark ||
610                      (dest->vaddr == svc->addr &&
611                       dest->vport == svc->port))) {
612                         /* HIT */
613                         return dest;
614                 }
615
616                 /*
617                  * Try to purge the destination from trash if not referenced
618                  */
619                 if (atomic_read(&dest->refcnt) == 1) {
620                         IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
621                                   "from trash\n",
622                                   dest->vfwmark,
623                                   NIPQUAD(dest->addr), ntohs(dest->port));
624                         list_del(&dest->n_list);
625                         ip_vs_dst_reset(dest);
626                         __ip_vs_unbind_svc(dest);
627                         kfree(dest);
628                 }
629         }
630
631         return NULL;
632 }
633
634
635 /*
636  *  Clean up all the destinations in the trash
637  *  Called by the ip_vs_control_cleanup()
638  *
639  *  When the ip_vs_control_clearup is activated by ipvs module exit,
640  *  the service tables must have been flushed and all the connections
641  *  are expired, and the refcnt of each destination in the trash must
642  *  be 1, so we simply release them here.
643  */
644 static void ip_vs_trash_cleanup(void)
645 {
646         struct ip_vs_dest *dest, *nxt;
647
648         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
649                 list_del(&dest->n_list);
650                 ip_vs_dst_reset(dest);
651                 __ip_vs_unbind_svc(dest);
652                 kfree(dest);
653         }
654 }
655
656
657 static void
658 ip_vs_zero_stats(struct ip_vs_stats *stats)
659 {
660         spin_lock_bh(&stats->lock);
661         memset(stats, 0, (char *)&stats->lock - (char *)stats);
662         spin_unlock_bh(&stats->lock);
663         ip_vs_zero_estimator(stats);
664 }
665
666 /*
667  *      Update a destination in the given service
668  */
669 static void
670 __ip_vs_update_dest(struct ip_vs_service *svc,
671                     struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
672 {
673         int conn_flags;
674
675         /* set the weight and the flags */
676         atomic_set(&dest->weight, udest->weight);
677         conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
678
679         /* check if local node and update the flags */
680         if (inet_addr_type(udest->addr) == RTN_LOCAL) {
681                 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
682                         | IP_VS_CONN_F_LOCALNODE;
683         }
684
685         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
686         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
687                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
688         } else {
689                 /*
690                  *    Put the real service in ip_vs_rtable if not present.
691                  *    For now only for NAT!
692                  */
693                 write_lock_bh(&__ip_vs_rs_lock);
694                 ip_vs_rs_hash(dest);
695                 write_unlock_bh(&__ip_vs_rs_lock);
696         }
697         atomic_set(&dest->conn_flags, conn_flags);
698
699         /* bind the service */
700         if (!dest->svc) {
701                 __ip_vs_bind_svc(dest, svc);
702         } else {
703                 if (dest->svc != svc) {
704                         __ip_vs_unbind_svc(dest);
705                         ip_vs_zero_stats(&dest->stats);
706                         __ip_vs_bind_svc(dest, svc);
707                 }
708         }
709
710         /* set the dest status flags */
711         dest->flags |= IP_VS_DEST_F_AVAILABLE;
712
713         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
714                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
715         dest->u_threshold = udest->u_threshold;
716         dest->l_threshold = udest->l_threshold;
717 }
718
719
720 /*
721  *      Create a destination for the given service
722  */
723 static int
724 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
725                struct ip_vs_dest **dest_p)
726 {
727         struct ip_vs_dest *dest;
728         unsigned atype;
729
730         EnterFunction(2);
731
732         atype = inet_addr_type(udest->addr);
733         if (atype != RTN_LOCAL && atype != RTN_UNICAST)
734                 return -EINVAL;
735
736         dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
737         if (dest == NULL) {
738                 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
739                 return -ENOMEM;
740         }
741         memset(dest, 0, sizeof(struct ip_vs_dest));
742
743         dest->protocol = svc->protocol;
744         dest->vaddr = svc->addr;
745         dest->vport = svc->port;
746         dest->vfwmark = svc->fwmark;
747         dest->addr = udest->addr;
748         dest->port = udest->port;
749
750         atomic_set(&dest->activeconns, 0);
751         atomic_set(&dest->inactconns, 0);
752         atomic_set(&dest->persistconns, 0);
753         atomic_set(&dest->refcnt, 0);
754
755         INIT_LIST_HEAD(&dest->d_list);
756         spin_lock_init(&dest->dst_lock);
757         spin_lock_init(&dest->stats.lock);
758         __ip_vs_update_dest(svc, dest, udest);
759         ip_vs_new_estimator(&dest->stats);
760
761         *dest_p = dest;
762
763         LeaveFunction(2);
764         return 0;
765 }
766
767
768 /*
769  *      Add a destination into an existing service
770  */
771 static int
772 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
773 {
774         struct ip_vs_dest *dest;
775         __u32 daddr = udest->addr;
776         __u16 dport = udest->port;
777         int ret;
778
779         EnterFunction(2);
780
781         if (udest->weight < 0) {
782                 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
783                 return -ERANGE;
784         }
785
786         if (udest->l_threshold > udest->u_threshold) {
787                 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
788                           "upper threshold\n");
789                 return -ERANGE;
790         }
791
792         /*
793          * Check if the dest already exists in the list
794          */
795         dest = ip_vs_lookup_dest(svc, daddr, dport);
796         if (dest != NULL) {
797                 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
798                 return -EEXIST;
799         }
800
801         /*
802          * Check if the dest already exists in the trash and
803          * is from the same service
804          */
805         dest = ip_vs_trash_get_dest(svc, daddr, dport);
806         if (dest != NULL) {
807                 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
808                           "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
809                           NIPQUAD(daddr), ntohs(dport),
810                           atomic_read(&dest->refcnt),
811                           dest->vfwmark,
812                           NIPQUAD(dest->vaddr),
813                           ntohs(dest->vport));
814                 __ip_vs_update_dest(svc, dest, udest);
815
816                 /*
817                  * Get the destination from the trash
818                  */
819                 list_del(&dest->n_list);
820
821                 ip_vs_new_estimator(&dest->stats);
822
823                 write_lock_bh(&__ip_vs_svc_lock);
824
825                 /*
826                  * Wait until all other svc users go away.
827                  */
828                 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
829
830                 list_add(&dest->n_list, &svc->destinations);
831                 svc->num_dests++;
832
833                 /* call the update_service function of its scheduler */
834                 svc->scheduler->update_service(svc);
835
836                 write_unlock_bh(&__ip_vs_svc_lock);
837                 return 0;
838         }
839
840         /*
841          * Allocate and initialize the dest structure
842          */
843         ret = ip_vs_new_dest(svc, udest, &dest);
844         if (ret) {
845                 return ret;
846         }
847
848         /*
849          * Add the dest entry into the list
850          */
851         atomic_inc(&dest->refcnt);
852
853         write_lock_bh(&__ip_vs_svc_lock);
854
855         /*
856          * Wait until all other svc users go away.
857          */
858         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
859
860         list_add(&dest->n_list, &svc->destinations);
861         svc->num_dests++;
862
863         /* call the update_service function of its scheduler */
864         svc->scheduler->update_service(svc);
865
866         write_unlock_bh(&__ip_vs_svc_lock);
867
868         LeaveFunction(2);
869
870         return 0;
871 }
872
873
874 /*
875  *      Edit a destination in the given service
876  */
877 static int
878 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
879 {
880         struct ip_vs_dest *dest;
881         __u32 daddr = udest->addr;
882         __u16 dport = udest->port;
883
884         EnterFunction(2);
885
886         if (udest->weight < 0) {
887                 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
888                 return -ERANGE;
889         }
890
891         if (udest->l_threshold > udest->u_threshold) {
892                 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
893                           "upper threshold\n");
894                 return -ERANGE;
895         }
896
897         /*
898          *  Lookup the destination list
899          */
900         dest = ip_vs_lookup_dest(svc, daddr, dport);
901         if (dest == NULL) {
902                 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
903                 return -ENOENT;
904         }
905
906         __ip_vs_update_dest(svc, dest, udest);
907
908         write_lock_bh(&__ip_vs_svc_lock);
909
910         /* Wait until all other svc users go away */
911         while (atomic_read(&svc->usecnt) > 1) {};
912
913         /* call the update_service, because server weight may be changed */
914         svc->scheduler->update_service(svc);
915
916         write_unlock_bh(&__ip_vs_svc_lock);
917
918         LeaveFunction(2);
919
920         return 0;
921 }
922
923
924 /*
925  *      Delete a destination (must be already unlinked from the service)
926  */
927 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
928 {
929         ip_vs_kill_estimator(&dest->stats);
930
931         /*
932          *  Remove it from the d-linked list with the real services.
933          */
934         write_lock_bh(&__ip_vs_rs_lock);
935         ip_vs_rs_unhash(dest);
936         write_unlock_bh(&__ip_vs_rs_lock);
937
938         /*
939          *  Decrease the refcnt of the dest, and free the dest
940          *  if nobody refers to it (refcnt=0). Otherwise, throw
941          *  the destination into the trash.
942          */
943         if (atomic_dec_and_test(&dest->refcnt)) {
944                 ip_vs_dst_reset(dest);
945                 /* simply decrease svc->refcnt here, let the caller check
946                    and release the service if nobody refers to it.
947                    Only user context can release destination and service,
948                    and only one user context can update virtual service at a
949                    time, so the operation here is OK */
950                 atomic_dec(&dest->svc->refcnt);
951                 kfree(dest);
952         } else {
953                 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
954                           "dest->refcnt=%d\n",
955                           NIPQUAD(dest->addr), ntohs(dest->port),
956                           atomic_read(&dest->refcnt));
957                 list_add(&dest->n_list, &ip_vs_dest_trash);
958                 atomic_inc(&dest->refcnt);
959         }
960 }
961
962
963 /*
964  *      Unlink a destination from the given service
965  */
966 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
967                                 struct ip_vs_dest *dest,
968                                 int svcupd)
969 {
970         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
971
972         /*
973          *  Remove it from the d-linked destination list.
974          */
975         list_del(&dest->n_list);
976         svc->num_dests--;
977         if (svcupd) {
978                 /*
979                  *  Call the update_service function of its scheduler
980                  */
981                 svc->scheduler->update_service(svc);
982         }
983 }
984
985
986 /*
987  *      Delete a destination server in the given service
988  */
989 static int
990 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
991 {
992         struct ip_vs_dest *dest;
993         __u32 daddr = udest->addr;
994         __u16 dport = udest->port;
995
996         EnterFunction(2);
997
998         dest = ip_vs_lookup_dest(svc, daddr, dport);
999         if (dest == NULL) {
1000                 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1001                 return -ENOENT;
1002         }
1003
1004         write_lock_bh(&__ip_vs_svc_lock);
1005
1006         /*
1007          *      Wait until all other svc users go away.
1008          */
1009         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1010
1011         /*
1012          *      Unlink dest from the service
1013          */
1014         __ip_vs_unlink_dest(svc, dest, 1);
1015
1016         write_unlock_bh(&__ip_vs_svc_lock);
1017
1018         /*
1019          *      Delete the destination
1020          */
1021         __ip_vs_del_dest(dest);
1022
1023         LeaveFunction(2);
1024
1025         return 0;
1026 }
1027
1028
1029 /*
1030  *      Add a service into the service hash table
1031  */
1032 static int
1033 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1034 {
1035         int ret = 0;
1036         struct ip_vs_scheduler *sched = NULL;
1037         struct ip_vs_service *svc = NULL;
1038
1039         /* increase the module use count */
1040         ip_vs_use_count_inc();
1041
1042         /* Lookup the scheduler by 'u->sched_name' */
1043         sched = ip_vs_scheduler_get(u->sched_name);
1044         if (sched == NULL) {
1045                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1046                            u->sched_name);
1047                 ret = -ENOENT;
1048                 goto out_mod_dec;
1049         }
1050
1051         svc = (struct ip_vs_service *)
1052                 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1053         if (svc == NULL) {
1054                 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1055                 ret = -ENOMEM;
1056                 goto out_err;
1057         }
1058         memset(svc, 0, sizeof(struct ip_vs_service));
1059
1060         /* I'm the first user of the service */
1061         atomic_set(&svc->usecnt, 1);
1062         atomic_set(&svc->refcnt, 0);
1063
1064         svc->protocol = u->protocol;
1065         svc->addr = u->addr;
1066         svc->port = u->port;
1067         svc->fwmark = u->fwmark;
1068         svc->flags = u->flags;
1069         svc->timeout = u->timeout * HZ;
1070         svc->netmask = u->netmask;
1071
1072         INIT_LIST_HEAD(&svc->destinations);
1073         rwlock_init(&svc->sched_lock);
1074         spin_lock_init(&svc->stats.lock);
1075
1076         /* Bind the scheduler */
1077         ret = ip_vs_bind_scheduler(svc, sched);
1078         if (ret)
1079                 goto out_err;
1080         sched = NULL;
1081
1082         /* Update the virtual service counters */
1083         if (svc->port == FTPPORT)
1084                 atomic_inc(&ip_vs_ftpsvc_counter);
1085         else if (svc->port == 0)
1086                 atomic_inc(&ip_vs_nullsvc_counter);
1087
1088         ip_vs_new_estimator(&svc->stats);
1089         ip_vs_num_services++;
1090
1091         /* Hash the service into the service table */
1092         write_lock_bh(&__ip_vs_svc_lock);
1093         ip_vs_svc_hash(svc);
1094         write_unlock_bh(&__ip_vs_svc_lock);
1095
1096         *svc_p = svc;
1097         return 0;
1098
1099   out_err:
1100         if (svc != NULL) {
1101                 if (svc->scheduler)
1102                         ip_vs_unbind_scheduler(svc);
1103                 if (svc->inc) {
1104                         local_bh_disable();
1105                         ip_vs_app_inc_put(svc->inc);
1106                         local_bh_enable();
1107                 }
1108                 kfree(svc);
1109         }
1110         ip_vs_scheduler_put(sched);
1111
1112   out_mod_dec:
1113         /* decrease the module use count */
1114         ip_vs_use_count_dec();
1115
1116         return ret;
1117 }
1118
1119
1120 /*
1121  *      Edit a service and bind it with a new scheduler
1122  */
1123 static int
1124 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1125 {
1126         struct ip_vs_scheduler *sched, *old_sched;
1127         int ret = 0;
1128
1129         /*
1130          * Lookup the scheduler, by 'u->sched_name'
1131          */
1132         sched = ip_vs_scheduler_get(u->sched_name);
1133         if (sched == NULL) {
1134                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1135                            u->sched_name);
1136                 return -ENOENT;
1137         }
1138         old_sched = sched;
1139
1140         write_lock_bh(&__ip_vs_svc_lock);
1141
1142         /*
1143          * Wait until all other svc users go away.
1144          */
1145         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1146
1147         /*
1148          * Set the flags and timeout value
1149          */
1150         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1151         svc->timeout = u->timeout * HZ;
1152         svc->netmask = u->netmask;
1153
1154         old_sched = svc->scheduler;
1155         if (sched != old_sched) {
1156                 /*
1157                  * Unbind the old scheduler
1158                  */
1159                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1160                         old_sched = sched;
1161                         goto out;
1162                 }
1163
1164                 /*
1165                  * Bind the new scheduler
1166                  */
1167                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1168                         /*
1169                          * If ip_vs_bind_scheduler fails, restore the old
1170                          * scheduler.
1171                          * The main reason of failure is out of memory.
1172                          *
1173                          * The question is if the old scheduler can be
1174                          * restored all the time. TODO: if it cannot be
1175                          * restored some time, we must delete the service,
1176                          * otherwise the system may crash.
1177                          */
1178                         ip_vs_bind_scheduler(svc, old_sched);
1179                         old_sched = sched;
1180                         goto out;
1181                 }
1182         }
1183
1184   out:
1185         write_unlock_bh(&__ip_vs_svc_lock);
1186
1187         if (old_sched)
1188                 ip_vs_scheduler_put(old_sched);
1189
1190         return ret;
1191 }
1192
1193
1194 /*
1195  *      Delete a service from the service list
1196  *      - The service must be unlinked, unlocked and not referenced!
1197  *      - We are called under _bh lock
1198  */
1199 static void __ip_vs_del_service(struct ip_vs_service *svc)
1200 {
1201         struct ip_vs_dest *dest, *nxt;
1202         struct ip_vs_scheduler *old_sched;
1203
1204         ip_vs_num_services--;
1205         ip_vs_kill_estimator(&svc->stats);
1206
1207         /* Unbind scheduler */
1208         old_sched = svc->scheduler;
1209         ip_vs_unbind_scheduler(svc);
1210         if (old_sched)
1211                 ip_vs_scheduler_put(old_sched);
1212
1213         /* Unbind app inc */
1214         if (svc->inc) {
1215                 ip_vs_app_inc_put(svc->inc);
1216                 svc->inc = NULL;
1217         }
1218
1219         /*
1220          *    Unlink the whole destination list
1221          */
1222         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1223                 __ip_vs_unlink_dest(svc, dest, 0);
1224                 __ip_vs_del_dest(dest);
1225         }
1226
1227         /*
1228          *    Update the virtual service counters
1229          */
1230         if (svc->port == FTPPORT)
1231                 atomic_dec(&ip_vs_ftpsvc_counter);
1232         else if (svc->port == 0)
1233                 atomic_dec(&ip_vs_nullsvc_counter);
1234
1235         /*
1236          *    Free the service if nobody refers to it
1237          */
1238         if (atomic_read(&svc->refcnt) == 0)
1239                 kfree(svc);
1240
1241         /* decrease the module use count */
1242         ip_vs_use_count_dec();
1243 }
1244
1245 /*
1246  *      Delete a service from the service list
1247  */
1248 static int ip_vs_del_service(struct ip_vs_service *svc)
1249 {
1250         if (svc == NULL)
1251                 return -EEXIST;
1252
1253         /*
1254          * Unhash it from the service table
1255          */
1256         write_lock_bh(&__ip_vs_svc_lock);
1257
1258         ip_vs_svc_unhash(svc);
1259
1260         /*
1261          * Wait until all the svc users go away.
1262          */
1263         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1264
1265         __ip_vs_del_service(svc);
1266
1267         write_unlock_bh(&__ip_vs_svc_lock);
1268
1269         return 0;
1270 }
1271
1272
1273 /*
1274  *      Flush all the virtual services
1275  */
1276 static int ip_vs_flush(void)
1277 {
1278         int idx;
1279         struct ip_vs_service *svc, *nxt;
1280
1281         /*
1282          * Flush the service table hashed by <protocol,addr,port>
1283          */
1284         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1285                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1286                         write_lock_bh(&__ip_vs_svc_lock);
1287                         ip_vs_svc_unhash(svc);
1288                         /*
1289                          * Wait until all the svc users go away.
1290                          */
1291                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1292                         __ip_vs_del_service(svc);
1293                         write_unlock_bh(&__ip_vs_svc_lock);
1294                 }
1295         }
1296
1297         /*
1298          * Flush the service table hashed by fwmark
1299          */
1300         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1301                 list_for_each_entry_safe(svc, nxt,
1302                                          &ip_vs_svc_fwm_table[idx], f_list) {
1303                         write_lock_bh(&__ip_vs_svc_lock);
1304                         ip_vs_svc_unhash(svc);
1305                         /*
1306                          * Wait until all the svc users go away.
1307                          */
1308                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1309                         __ip_vs_del_service(svc);
1310                         write_unlock_bh(&__ip_vs_svc_lock);
1311                 }
1312         }
1313
1314         return 0;
1315 }
1316
1317
1318 /*
1319  *      Zero counters in a service or all services
1320  */
1321 static int ip_vs_zero_service(struct ip_vs_service *svc)
1322 {
1323         struct ip_vs_dest *dest;
1324
1325         write_lock_bh(&__ip_vs_svc_lock);
1326         list_for_each_entry(dest, &svc->destinations, n_list) {
1327                 ip_vs_zero_stats(&dest->stats);
1328         }
1329         ip_vs_zero_stats(&svc->stats);
1330         write_unlock_bh(&__ip_vs_svc_lock);
1331         return 0;
1332 }
1333
1334 static int ip_vs_zero_all(void)
1335 {
1336         int idx;
1337         struct ip_vs_service *svc;
1338
1339         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1340                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1341                         ip_vs_zero_service(svc);
1342                 }
1343         }
1344
1345         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1346                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1347                         ip_vs_zero_service(svc);
1348                 }
1349         }
1350
1351         ip_vs_zero_stats(&ip_vs_stats);
1352         return 0;
1353 }
1354
1355
1356 static int
1357 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1358                      void __user *buffer, size_t *lenp, loff_t *ppos)
1359 {
1360         int *valp = table->data;
1361         int val = *valp;
1362         int rc;
1363
1364         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1365         if (write && (*valp != val)) {
1366                 if ((*valp < 0) || (*valp > 3)) {
1367                         /* Restore the correct value */
1368                         *valp = val;
1369                 } else {
1370                         update_defense_level();
1371                 }
1372         }
1373         return rc;
1374 }
1375
1376
1377 static int
1378 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1379                        void __user *buffer, size_t *lenp, loff_t *ppos)
1380 {
1381         int *valp = table->data;
1382         int val[2];
1383         int rc;
1384
1385         /* backup the value first */
1386         memcpy(val, valp, sizeof(val));
1387
1388         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1389         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1390                 /* Restore the correct value */
1391                 memcpy(valp, val, sizeof(val));
1392         }
1393         return rc;
1394 }
1395
1396
1397 /*
1398  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1399  */
1400
1401 static struct ctl_table vs_vars[] = {
1402         {
1403                 .ctl_name       = NET_IPV4_VS_AMEMTHRESH,
1404                 .procname       = "amemthresh",
1405                 .data           = &sysctl_ip_vs_amemthresh,
1406                 .maxlen         = sizeof(int),
1407                 .mode           = 0644,
1408                 .proc_handler   = &proc_dointvec,
1409         },
1410 #ifdef CONFIG_IP_VS_DEBUG
1411         {
1412                 .ctl_name       = NET_IPV4_VS_DEBUG_LEVEL,
1413                 .procname       = "debug_level",
1414                 .data           = &sysctl_ip_vs_debug_level,
1415                 .maxlen         = sizeof(int),
1416                 .mode           = 0644,
1417                 .proc_handler   = &proc_dointvec,
1418         },
1419 #endif
1420         {
1421                 .ctl_name       = NET_IPV4_VS_AMDROPRATE,
1422                 .procname       = "am_droprate",
1423                 .data           = &sysctl_ip_vs_am_droprate,
1424                 .maxlen         = sizeof(int),
1425                 .mode           = 0644,
1426                 .proc_handler   = &proc_dointvec,
1427         },
1428         {
1429                 .ctl_name       = NET_IPV4_VS_DROP_ENTRY,
1430                 .procname       = "drop_entry",
1431                 .data           = &sysctl_ip_vs_drop_entry,
1432                 .maxlen         = sizeof(int),
1433                 .mode           = 0644,
1434                 .proc_handler   = &proc_do_defense_mode,
1435         },
1436         {
1437                 .ctl_name       = NET_IPV4_VS_DROP_PACKET,
1438                 .procname       = "drop_packet",
1439                 .data           = &sysctl_ip_vs_drop_packet,
1440                 .maxlen         = sizeof(int),
1441                 .mode           = 0644,
1442                 .proc_handler   = &proc_do_defense_mode,
1443         },
1444         {
1445                 .ctl_name       = NET_IPV4_VS_SECURE_TCP,
1446                 .procname       = "secure_tcp",
1447                 .data           = &sysctl_ip_vs_secure_tcp,
1448                 .maxlen         = sizeof(int),
1449                 .mode           = 0644,
1450                 .proc_handler   = &proc_do_defense_mode,
1451         },
1452 #if 0
1453         {
1454                 .ctl_name       = NET_IPV4_VS_TO_ES,
1455                 .procname       = "timeout_established",
1456                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1457                 .maxlen         = sizeof(int),
1458                 .mode           = 0644,
1459                 .proc_handler   = &proc_dointvec_jiffies,
1460         },
1461         {
1462                 .ctl_name       = NET_IPV4_VS_TO_SS,
1463                 .procname       = "timeout_synsent",
1464                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1465                 .maxlen         = sizeof(int),
1466                 .mode           = 0644,
1467                 .proc_handler   = &proc_dointvec_jiffies,
1468         },
1469         {
1470                 .ctl_name       = NET_IPV4_VS_TO_SR,
1471                 .procname       = "timeout_synrecv",
1472                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1473                 .maxlen         = sizeof(int),
1474                 .mode           = 0644,
1475                 .proc_handler   = &proc_dointvec_jiffies,
1476         },
1477         {
1478                 .ctl_name       = NET_IPV4_VS_TO_FW,
1479                 .procname       = "timeout_finwait",
1480                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1481                 .maxlen         = sizeof(int),
1482                 .mode           = 0644,
1483                 .proc_handler   = &proc_dointvec_jiffies,
1484         },
1485         {
1486                 .ctl_name       = NET_IPV4_VS_TO_TW,
1487                 .procname       = "timeout_timewait",
1488                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1489                 .maxlen         = sizeof(int),
1490                 .mode           = 0644,
1491                 .proc_handler   = &proc_dointvec_jiffies,
1492         },
1493         {
1494                 .ctl_name       = NET_IPV4_VS_TO_CL,
1495                 .procname       = "timeout_close",
1496                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1497                 .maxlen         = sizeof(int),
1498                 .mode           = 0644,
1499                 .proc_handler   = &proc_dointvec_jiffies,
1500         },
1501         {
1502                 .ctl_name       = NET_IPV4_VS_TO_CW,
1503                 .procname       = "timeout_closewait",
1504                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1505                 .maxlen         = sizeof(int),
1506                 .mode           = 0644,
1507                 .proc_handler   = &proc_dointvec_jiffies,
1508         },
1509         {
1510                 .ctl_name       = NET_IPV4_VS_TO_LA,
1511                 .procname       = "timeout_lastack",
1512                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1513                 .maxlen         = sizeof(int),
1514                 .mode           = 0644,
1515                 .proc_handler   = &proc_dointvec_jiffies,
1516         },
1517         {
1518                 .ctl_name       = NET_IPV4_VS_TO_LI,
1519                 .procname       = "timeout_listen",
1520                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1521                 .maxlen         = sizeof(int),
1522                 .mode           = 0644,
1523                 .proc_handler   = &proc_dointvec_jiffies,
1524         },
1525         {
1526                 .ctl_name       = NET_IPV4_VS_TO_SA,
1527                 .procname       = "timeout_synack",
1528                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1529                 .maxlen         = sizeof(int),
1530                 .mode           = 0644,
1531                 .proc_handler   = &proc_dointvec_jiffies,
1532         },
1533         {
1534                 .ctl_name       = NET_IPV4_VS_TO_UDP,
1535                 .procname       = "timeout_udp",
1536                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1537                 .maxlen         = sizeof(int),
1538                 .mode           = 0644,
1539                 .proc_handler   = &proc_dointvec_jiffies,
1540         },
1541         {
1542                 .ctl_name       = NET_IPV4_VS_TO_ICMP,
1543                 .procname       = "timeout_icmp",
1544                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1545                 .maxlen         = sizeof(int),
1546                 .mode           = 0644,
1547                 .proc_handler   = &proc_dointvec_jiffies,
1548         },
1549 #endif
1550         {
1551                 .ctl_name       = NET_IPV4_VS_CACHE_BYPASS,
1552                 .procname       = "cache_bypass",
1553                 .data           = &sysctl_ip_vs_cache_bypass,
1554                 .maxlen         = sizeof(int),
1555                 .mode           = 0644,
1556                 .proc_handler   = &proc_dointvec,
1557         },
1558         {
1559                 .ctl_name       = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1560                 .procname       = "expire_nodest_conn",
1561                 .data           = &sysctl_ip_vs_expire_nodest_conn,
1562                 .maxlen         = sizeof(int),
1563                 .mode           = 0644,
1564                 .proc_handler   = &proc_dointvec,
1565         },
1566         {
1567                 .ctl_name       = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1568                 .procname       = "expire_quiescent_template",
1569                 .data           = &sysctl_ip_vs_expire_quiescent_template,
1570                 .maxlen         = sizeof(int),
1571                 .mode           = 0644,
1572                 .proc_handler   = &proc_dointvec,
1573         },
1574         {
1575                 .ctl_name       = NET_IPV4_VS_SYNC_THRESHOLD,
1576                 .procname       = "sync_threshold",
1577                 .data           = &sysctl_ip_vs_sync_threshold,
1578                 .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
1579                 .mode           = 0644,
1580                 .proc_handler   = &proc_do_sync_threshold,
1581         },
1582         {
1583                 .ctl_name       = NET_IPV4_VS_NAT_ICMP_SEND,
1584                 .procname       = "nat_icmp_send",
1585                 .data           = &sysctl_ip_vs_nat_icmp_send,
1586                 .maxlen         = sizeof(int),
1587                 .mode           = 0644,
1588                 .proc_handler   = &proc_dointvec,
1589         },
1590         { .ctl_name = 0 }
1591 };
1592
1593 static ctl_table vs_table[] = {
1594         {
1595                 .ctl_name       = NET_IPV4_VS,
1596                 .procname       = "vs",
1597                 .mode           = 0555,
1598                 .child          = vs_vars
1599         },
1600         { .ctl_name = 0 }
1601 };
1602
1603 static ctl_table ipvs_ipv4_table[] = {
1604         {
1605                 .ctl_name       = NET_IPV4,
1606                 .procname       = "ipv4",
1607                 .mode           = 0555,
1608                 .child          = vs_table,
1609         },
1610         { .ctl_name = 0 }
1611 };
1612
1613 static ctl_table vs_root_table[] = {
1614         {
1615                 .ctl_name       = CTL_NET,
1616                 .procname       = "net",
1617                 .mode           = 0555,
1618                 .child          = ipvs_ipv4_table,
1619         },
1620         { .ctl_name = 0 }
1621 };
1622
/* sysctl table header for IPVS (NOTE(review): presumably set when vs_root_table is registered -- confirm) */
static struct ctl_table_header *sysctl_header;
1624
1625 #ifdef CONFIG_PROC_FS
1626
/* Iteration state kept in seq->private while walking the service tables. */
struct ip_vs_iter {
	struct list_head *table;	/* ip_vs_svc_table or ip_vs_svc_fwm_table */
	int bucket;			/* index of the bucket being walked */
};
1631
1632 /*
1633  *      Write the contents of the VS rule table to a PROCfs file.
1634  *      (It is kept just for backward compatibility)
1635  */
1636 static inline const char *ip_vs_fwd_name(unsigned flags)
1637 {
1638         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1639         case IP_VS_CONN_F_LOCALNODE:
1640                 return "Local";
1641         case IP_VS_CONN_F_TUNNEL:
1642                 return "Tunnel";
1643         case IP_VS_CONN_F_DROUTE:
1644                 return "Route";
1645         default:
1646                 return "Masq";
1647         }
1648 }
1649
1650
1651 /* Get the Nth entry in the two lists */
1652 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1653 {
1654         struct ip_vs_iter *iter = seq->private;
1655         int idx;
1656         struct ip_vs_service *svc;
1657
1658         /* look in hash by protocol */
1659         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1660                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1661                         if (pos-- == 0){
1662                                 iter->table = ip_vs_svc_table;
1663                                 iter->bucket = idx;
1664                                 return svc;
1665                         }
1666                 }
1667         }
1668
1669         /* keep looking in fwmark */
1670         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1671                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1672                         if (pos-- == 0) {
1673                                 iter->table = ip_vs_svc_fwm_table;
1674                                 iter->bucket = idx;
1675                                 return svc;
1676                         }
1677                 }
1678         }
1679
1680         return NULL;
1681 }
1682
1683 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1684 {
1685
1686         read_lock_bh(&__ip_vs_svc_lock);
1687         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1688 }
1689
1690
1691 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1692 {
1693         struct list_head *e;
1694         struct ip_vs_iter *iter;
1695         struct ip_vs_service *svc;
1696
1697         ++*pos;
1698         if (v == SEQ_START_TOKEN)
1699                 return ip_vs_info_array(seq,0);
1700
1701         svc = v;
1702         iter = seq->private;
1703
1704         if (iter->table == ip_vs_svc_table) {
1705                 /* next service in table hashed by protocol */
1706                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1707                         return list_entry(e, struct ip_vs_service, s_list);
1708
1709
1710                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1711                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1712                                             s_list) {
1713                                 return svc;
1714                         }
1715                 }
1716
1717                 iter->table = ip_vs_svc_fwm_table;
1718                 iter->bucket = -1;
1719                 goto scan_fwmark;
1720         }
1721
1722         /* next service in hashed by fwmark */
1723         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1724                 return list_entry(e, struct ip_vs_service, f_list);
1725
1726  scan_fwmark:
1727         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1728                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1729                                     f_list)
1730                         return svc;
1731         }
1732
1733         return NULL;
1734 }
1735
1736 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1737 {
1738         read_unlock_bh(&__ip_vs_svc_lock);
1739 }
1740
1741
1742 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1743 {
1744         if (v == SEQ_START_TOKEN) {
1745                 seq_printf(seq,
1746                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1747                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1748                 seq_puts(seq,
1749                          "Prot LocalAddress:Port Scheduler Flags\n");
1750                 seq_puts(seq,
1751                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1752         } else {
1753                 const struct ip_vs_service *svc = v;
1754                 const struct ip_vs_iter *iter = seq->private;
1755                 const struct ip_vs_dest *dest;
1756
1757                 if (iter->table == ip_vs_svc_table)
1758                         seq_printf(seq, "%s  %08X:%04X %s ",
1759                                    ip_vs_proto_name(svc->protocol),
1760                                    ntohl(svc->addr),
1761                                    ntohs(svc->port),
1762                                    svc->scheduler->name);
1763                 else
1764                         seq_printf(seq, "FWM  %08X %s ",
1765                                    svc->fwmark, svc->scheduler->name);
1766
1767                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1768                         seq_printf(seq, "persistent %d %08X\n",
1769                                 svc->timeout,
1770                                 ntohl(svc->netmask));
1771                 else
1772                         seq_putc(seq, '\n');
1773
1774                 list_for_each_entry(dest, &svc->destinations, n_list) {
1775                         seq_printf(seq,
1776                                    "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1777                                    ntohl(dest->addr), ntohs(dest->port),
1778                                    ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1779                                    atomic_read(&dest->weight),
1780                                    atomic_read(&dest->activeconns),
1781                                    atomic_read(&dest->inactconns));
1782                 }
1783         }
1784         return 0;
1785 }
1786
1787 static struct seq_operations ip_vs_info_seq_ops = {
1788         .start = ip_vs_info_seq_start,
1789         .next  = ip_vs_info_seq_next,
1790         .stop  = ip_vs_info_seq_stop,
1791         .show  = ip_vs_info_seq_show,
1792 };
1793
1794 static int ip_vs_info_open(struct inode *inode, struct file *file)
1795 {
1796         struct seq_file *seq;
1797         int rc = -ENOMEM;
1798         struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1799
1800         if (!s)
1801                 goto out;
1802
1803         rc = seq_open(file, &ip_vs_info_seq_ops);
1804         if (rc)
1805                 goto out_kfree;
1806
1807         seq          = file->private_data;
1808         seq->private = s;
1809         memset(s, 0, sizeof(*s));
1810 out:
1811         return rc;
1812 out_kfree:
1813         kfree(s);
1814         goto out;
1815 }
1816
1817 static struct file_operations ip_vs_info_fops = {
1818         .owner   = THIS_MODULE,
1819         .open    = ip_vs_info_open,
1820         .read    = seq_read,
1821         .llseek  = seq_lseek,
1822         .release = seq_release_private,
1823 };
1824
1825 #endif
1826
1827 struct ip_vs_stats ip_vs_stats;
1828
1829 #ifdef CONFIG_PROC_FS
1830 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1831 {
1832
1833 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1834         seq_puts(seq,
1835                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1836         seq_printf(seq,
1837                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1838
1839         spin_lock_bh(&ip_vs_stats.lock);
1840         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1841                    ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1842                    (unsigned long long) ip_vs_stats.inbytes,
1843                    (unsigned long long) ip_vs_stats.outbytes);
1844
1845 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1846         seq_puts(seq,
1847                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1848         seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1849                         ip_vs_stats.cps,
1850                         ip_vs_stats.inpps,
1851                         ip_vs_stats.outpps,
1852                         ip_vs_stats.inbps,
1853                         ip_vs_stats.outbps);
1854         spin_unlock_bh(&ip_vs_stats.lock);
1855
1856         return 0;
1857 }
1858
1859 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1860 {
1861         return single_open(file, ip_vs_stats_show, NULL);
1862 }
1863
1864 static struct file_operations ip_vs_stats_fops = {
1865         .owner = THIS_MODULE,
1866         .open = ip_vs_stats_seq_open,
1867         .read = seq_read,
1868         .llseek = seq_lseek,
1869         .release = single_release,
1870 };
1871
1872 #endif
1873
1874 /*
1875  *      Set timeout values for tcp tcpfin udp in the timeout_table.
1876  */
1877 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1878 {
1879         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1880                   u->tcp_timeout,
1881                   u->tcp_fin_timeout,
1882                   u->udp_timeout);
1883
1884 #ifdef CONFIG_IP_VS_PROTO_TCP
1885         if (u->tcp_timeout) {
1886                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1887                         = u->tcp_timeout * HZ;
1888         }
1889
1890         if (u->tcp_fin_timeout) {
1891                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1892                         = u->tcp_fin_timeout * HZ;
1893         }
1894 #endif
1895
1896 #ifdef CONFIG_IP_VS_PROTO_UDP
1897         if (u->udp_timeout) {
1898                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1899                         = u->udp_timeout * HZ;
1900         }
1901 #endif
1902         return 0;
1903 }
1904
1905
1906 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
1907 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
1908 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
1909                                  sizeof(struct ip_vs_dest_user))
1910 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
1911 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
1912 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
1913
1914 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1915         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
1916         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
1917         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
1918         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
1919         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
1920         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
1921         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
1922         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
1923         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
1924         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
1925         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
1926 };
1927
1928 static int
1929 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1930 {
1931         int ret;
1932         unsigned char arg[MAX_ARG_LEN];
1933         struct ip_vs_service_user *usvc;
1934         struct ip_vs_service *svc;
1935         struct ip_vs_dest_user *udest;
1936
1937         if (!capable(CAP_NET_ADMIN))
1938                 return -EPERM;
1939
1940         if (len != set_arglen[SET_CMDID(cmd)]) {
1941                 IP_VS_ERR("set_ctl: len %u != %u\n",
1942                           len, set_arglen[SET_CMDID(cmd)]);
1943                 return -EINVAL;
1944         }
1945
1946         if (copy_from_user(arg, user, len) != 0)
1947                 return -EFAULT;
1948
1949         /* increase the module use count */
1950         ip_vs_use_count_inc();
1951
1952         if (down_interruptible(&__ip_vs_mutex)) {
1953                 ret = -ERESTARTSYS;
1954                 goto out_dec;
1955         }
1956
1957         if (cmd == IP_VS_SO_SET_FLUSH) {
1958                 /* Flush the virtual service */
1959                 ret = ip_vs_flush();
1960                 goto out_unlock;
1961         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1962                 /* Set timeout values for (tcp tcpfin udp) */
1963                 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1964                 goto out_unlock;
1965         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1966                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1967                 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1968                 goto out_unlock;
1969         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1970                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1971                 ret = stop_sync_thread(dm->state);
1972                 goto out_unlock;
1973         }
1974
1975         usvc = (struct ip_vs_service_user *)arg;
1976         udest = (struct ip_vs_dest_user *)(usvc + 1);
1977
1978         if (cmd == IP_VS_SO_SET_ZERO) {
1979                 /* if no service address is set, zero counters in all */
1980                 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1981                         ret = ip_vs_zero_all();
1982                         goto out_unlock;
1983                 }
1984         }
1985
1986         /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1987         if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1988                 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1989                           usvc->protocol, NIPQUAD(usvc->addr),
1990                           ntohs(usvc->port), usvc->sched_name);
1991                 ret = -EFAULT;
1992                 goto out_unlock;
1993         }
1994
1995         /* Lookup the exact service by <protocol, addr, port> or fwmark */
1996         if (usvc->fwmark == 0)
1997                 svc = __ip_vs_service_get(usvc->protocol,
1998                                           usvc->addr, usvc->port);
1999         else
2000                 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
2001
2002         if (cmd != IP_VS_SO_SET_ADD
2003             && (svc == NULL || svc->protocol != usvc->protocol)) {
2004                 ret = -ESRCH;
2005                 goto out_unlock;
2006         }
2007
2008         switch (cmd) {
2009         case IP_VS_SO_SET_ADD:
2010                 if (svc != NULL)
2011                         ret = -EEXIST;
2012                 else
2013                         ret = ip_vs_add_service(usvc, &svc);
2014                 break;
2015         case IP_VS_SO_SET_EDIT:
2016                 ret = ip_vs_edit_service(svc, usvc);
2017                 break;
2018         case IP_VS_SO_SET_DEL:
2019                 ret = ip_vs_del_service(svc);
2020                 if (!ret)
2021                         goto out_unlock;
2022                 break;
2023         case IP_VS_SO_SET_ZERO:
2024                 ret = ip_vs_zero_service(svc);
2025                 break;
2026         case IP_VS_SO_SET_ADDDEST:
2027                 ret = ip_vs_add_dest(svc, udest);
2028                 break;
2029         case IP_VS_SO_SET_EDITDEST:
2030                 ret = ip_vs_edit_dest(svc, udest);
2031                 break;
2032         case IP_VS_SO_SET_DELDEST:
2033                 ret = ip_vs_del_dest(svc, udest);
2034                 break;
2035         default:
2036                 ret = -EINVAL;
2037         }
2038
2039         if (svc)
2040                 ip_vs_service_put(svc);
2041
2042   out_unlock:
2043         up(&__ip_vs_mutex);
2044   out_dec:
2045         /* decrease the module use count */
2046         ip_vs_use_count_dec();
2047
2048         return ret;
2049 }
2050
2051
2052 static void
2053 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2054 {
2055         spin_lock_bh(&src->lock);
2056         memcpy(dst, src, (char*)&src->lock - (char*)src);
2057         spin_unlock_bh(&src->lock);
2058 }
2059
2060 static void
2061 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2062 {
2063         dst->protocol = src->protocol;
2064         dst->addr = src->addr;
2065         dst->port = src->port;
2066         dst->fwmark = src->fwmark;
2067         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2068         dst->flags = src->flags;
2069         dst->timeout = src->timeout / HZ;
2070         dst->netmask = src->netmask;
2071         dst->num_dests = src->num_dests;
2072         ip_vs_copy_stats(&dst->stats, &src->stats);
2073 }
2074
2075 static inline int
2076 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2077                             struct ip_vs_get_services __user *uptr)
2078 {
2079         int idx, count=0;
2080         struct ip_vs_service *svc;
2081         struct ip_vs_service_entry entry;
2082         int ret = 0;
2083
2084         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2085                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2086                         if (count >= get->num_services)
2087                                 goto out;
2088                         memset(&entry, 0, sizeof(entry));
2089                         ip_vs_copy_service(&entry, svc);
2090                         if (copy_to_user(&uptr->entrytable[count],
2091                                          &entry, sizeof(entry))) {
2092                                 ret = -EFAULT;
2093                                 goto out;
2094                         }
2095                         count++;
2096                 }
2097         }
2098
2099         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2100                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2101                         if (count >= get->num_services)
2102                                 goto out;
2103                         memset(&entry, 0, sizeof(entry));
2104                         ip_vs_copy_service(&entry, svc);
2105                         if (copy_to_user(&uptr->entrytable[count],
2106                                          &entry, sizeof(entry))) {
2107                                 ret = -EFAULT;
2108                                 goto out;
2109                         }
2110                         count++;
2111                 }
2112         }
2113   out:
2114         return ret;
2115 }
2116
2117 static inline int
2118 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2119                          struct ip_vs_get_dests __user *uptr)
2120 {
2121         struct ip_vs_service *svc;
2122         int ret = 0;
2123
2124         if (get->fwmark)
2125                 svc = __ip_vs_svc_fwm_get(get->fwmark);
2126         else
2127                 svc = __ip_vs_service_get(get->protocol,
2128                                           get->addr, get->port);
2129         if (svc) {
2130                 int count = 0;
2131                 struct ip_vs_dest *dest;
2132                 struct ip_vs_dest_entry entry;
2133
2134                 list_for_each_entry(dest, &svc->destinations, n_list) {
2135                         if (count >= get->num_dests)
2136                                 break;
2137
2138                         entry.addr = dest->addr;
2139                         entry.port = dest->port;
2140                         entry.conn_flags = atomic_read(&dest->conn_flags);
2141                         entry.weight = atomic_read(&dest->weight);
2142                         entry.u_threshold = dest->u_threshold;
2143                         entry.l_threshold = dest->l_threshold;
2144                         entry.activeconns = atomic_read(&dest->activeconns);
2145                         entry.inactconns = atomic_read(&dest->inactconns);
2146                         entry.persistconns = atomic_read(&dest->persistconns);
2147                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2148                         if (copy_to_user(&uptr->entrytable[count],
2149                                          &entry, sizeof(entry))) {
2150                                 ret = -EFAULT;
2151                                 break;
2152                         }
2153                         count++;
2154                 }
2155                 ip_vs_service_put(svc);
2156         } else
2157                 ret = -ESRCH;
2158         return ret;
2159 }
2160
/*
 *      Report the current tcp, tcpfin and udp timeouts in seconds.
 *
 *      A field of @u is written only when the matching protocol is
 *      compiled in; otherwise it keeps whatever value the caller put
 *      there.
 */
static inline void
__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
{
#ifdef CONFIG_IP_VS_PROTO_TCP
        u->tcp_timeout = ip_vs_protocol_tcp.
                timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
        u->tcp_fin_timeout = ip_vs_protocol_tcp.
                timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
        u->udp_timeout = ip_vs_protocol_udp.
                timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
#endif
}
2175
2176
2177 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2178 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2179 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2180 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2181 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2182 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2183 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2184
2185 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2186         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2187         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2188         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2189         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2190         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2191         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2192         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2193 };
2194
2195 static int
2196 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2197 {
2198         unsigned char arg[128];
2199         int ret = 0;
2200
2201         if (!capable(CAP_NET_ADMIN))
2202                 return -EPERM;
2203
2204         if (*len < get_arglen[GET_CMDID(cmd)]) {
2205                 IP_VS_ERR("get_ctl: len %u < %u\n",
2206                           *len, get_arglen[GET_CMDID(cmd)]);
2207                 return -EINVAL;
2208         }
2209
2210         if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2211                 return -EFAULT;
2212
2213         if (down_interruptible(&__ip_vs_mutex))
2214                 return -ERESTARTSYS;
2215
2216         switch (cmd) {
2217         case IP_VS_SO_GET_VERSION:
2218         {
2219                 char buf[64];
2220
2221                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2222                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2223                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2224                         ret = -EFAULT;
2225                         goto out;
2226                 }
2227                 *len = strlen(buf)+1;
2228         }
2229         break;
2230
2231         case IP_VS_SO_GET_INFO:
2232         {
2233                 struct ip_vs_getinfo info;
2234                 info.version = IP_VS_VERSION_CODE;
2235                 info.size = IP_VS_CONN_TAB_SIZE;
2236                 info.num_services = ip_vs_num_services;
2237                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2238                         ret = -EFAULT;
2239         }
2240         break;
2241
2242         case IP_VS_SO_GET_SERVICES:
2243         {
2244                 struct ip_vs_get_services *get;
2245                 int size;
2246
2247                 get = (struct ip_vs_get_services *)arg;
2248                 size = sizeof(*get) +
2249                         sizeof(struct ip_vs_service_entry) * get->num_services;
2250                 if (*len != size) {
2251                         IP_VS_ERR("length: %u != %u\n", *len, size);
2252                         ret = -EINVAL;
2253                         goto out;
2254                 }
2255                 ret = __ip_vs_get_service_entries(get, user);
2256         }
2257         break;
2258
2259         case IP_VS_SO_GET_SERVICE:
2260         {
2261                 struct ip_vs_service_entry *entry;
2262                 struct ip_vs_service *svc;
2263
2264                 entry = (struct ip_vs_service_entry *)arg;
2265                 if (entry->fwmark)
2266                         svc = __ip_vs_svc_fwm_get(entry->fwmark);
2267                 else
2268                         svc = __ip_vs_service_get(entry->protocol,
2269                                                   entry->addr, entry->port);
2270                 if (svc) {
2271                         ip_vs_copy_service(entry, svc);
2272                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2273                                 ret = -EFAULT;
2274                         ip_vs_service_put(svc);
2275                 } else
2276                         ret = -ESRCH;
2277         }
2278         break;
2279
2280         case IP_VS_SO_GET_DESTS:
2281         {
2282                 struct ip_vs_get_dests *get;
2283                 int size;
2284
2285                 get = (struct ip_vs_get_dests *)arg;
2286                 size = sizeof(*get) +
2287                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2288                 if (*len != size) {
2289                         IP_VS_ERR("length: %u != %u\n", *len, size);
2290                         ret = -EINVAL;
2291                         goto out;
2292                 }
2293                 ret = __ip_vs_get_dest_entries(get, user);
2294         }
2295         break;
2296
2297         case IP_VS_SO_GET_TIMEOUT:
2298         {
2299                 struct ip_vs_timeout_user t;
2300
2301                 __ip_vs_get_timeouts(&t);
2302                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2303                         ret = -EFAULT;
2304         }
2305         break;
2306
2307         case IP_VS_SO_GET_DAEMON:
2308         {
2309                 struct ip_vs_daemon_user d[2];
2310
2311                 memset(&d, 0, sizeof(d));
2312                 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2313                         d[0].state = IP_VS_STATE_MASTER;
2314                         strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2315                         d[0].syncid = ip_vs_master_syncid;
2316                 }
2317                 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2318                         d[1].state = IP_VS_STATE_BACKUP;
2319                         strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2320                         d[1].syncid = ip_vs_backup_syncid;
2321                 }
2322                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2323                         ret = -EFAULT;
2324         }
2325         break;
2326
2327         default:
2328                 ret = -EINVAL;
2329         }
2330
2331   out:
2332         up(&__ip_vs_mutex);
2333         return ret;
2334 }
2335
2336
2337 static struct nf_sockopt_ops ip_vs_sockopts = {
2338         .pf             = PF_INET,
2339         .set_optmin     = IP_VS_BASE_CTL,
2340         .set_optmax     = IP_VS_SO_SET_MAX+1,
2341         .set            = do_ip_vs_set_ctl,
2342         .get_optmin     = IP_VS_BASE_CTL,
2343         .get_optmax     = IP_VS_SO_GET_MAX+1,
2344         .get            = do_ip_vs_get_ctl,
2345 };
2346
2347
2348 int ip_vs_control_init(void)
2349 {
2350         int ret;
2351         int idx;
2352
2353         EnterFunction(2);
2354
2355         ret = nf_register_sockopt(&ip_vs_sockopts);
2356         if (ret) {
2357                 IP_VS_ERR("cannot register sockopt.\n");
2358                 return ret;
2359         }
2360
2361         proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2362         proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2363
2364         sysctl_header = register_sysctl_table(vs_root_table, 0);
2365
2366         /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2367         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
2368                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2369                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2370         }
2371         for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
2372                 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2373         }
2374
2375         memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2376         spin_lock_init(&ip_vs_stats.lock);
2377         ip_vs_new_estimator(&ip_vs_stats);
2378
2379         /* Hook the defense timer */
2380         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2381
2382         LeaveFunction(2);
2383         return 0;
2384 }
2385
2386
2387 void ip_vs_control_cleanup(void)
2388 {
2389         EnterFunction(2);
2390         ip_vs_trash_cleanup();
2391         cancel_rearming_delayed_work(&defense_work);
2392         ip_vs_kill_estimator(&ip_vs_stats);
2393         unregister_sysctl_table(sysctl_header);
2394         proc_net_remove("ip_vs_stats");
2395         proc_net_remove("ip_vs");
2396         nf_unregister_sockopt(&ip_vs_sockopts);
2397         LeaveFunction(2);
2398 }