net/ipv4/ipvs/ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9  *
10  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11  *              Peter Kese <peter.kese@ijs.si>
12  *              Julian Anastasov <ja@ssi.bg>
13  *
14  *              This program is free software; you can redistribute it and/or
15  *              modify it under the terms of the GNU General Public License
16  *              as published by the Free Software Foundation; either version
17  *              2 of the License, or (at your option) any later version.
18  *
19  * Changes:
20  *
21  */
22
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
26 #include <linux/capability.h>
27 #include <linux/fs.h>
28 #include <linux/sysctl.h>
29 #include <linux/proc_fs.h>
30 #include <linux/workqueue.h>
31 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34
35 #include <linux/netfilter.h>
36 #include <linux/netfilter_ipv4.h>
37 #include <linux/mutex.h>
38
39 #include <net/ip.h>
40 #include <net/route.h>
41 #include <net/sock.h>
42
43 #include <asm/uaccess.h>
44
45 #include <net/ip_vs.h>
46
47 /* mutex for IPVS sockopts: [gs]etsockopt may sleep, so a sleeping lock is needed */
48 static DEFINE_MUTEX(__ip_vs_mutex);
49
50 /* lock for service table */
51 static DEFINE_RWLOCK(__ip_vs_svc_lock);
52
53 /* lock for table with the real services */
54 static DEFINE_RWLOCK(__ip_vs_rs_lock);
55
56 /* lock for state and timeout tables */
57 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
58
59 /* lock for drop entry handling */
60 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
61
62 /* lock for drop packet handling */
63 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
64
65 /* 1/rate drop and drop-entry variables */
66 int ip_vs_drop_rate = 0;
67 int ip_vs_drop_counter = 0;
68 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
69
70 /* number of virtual services */
71 static int ip_vs_num_services = 0;
72
73 /* sysctl variables */
74 static int sysctl_ip_vs_drop_entry = 0;
75 static int sysctl_ip_vs_drop_packet = 0;
76 static int sysctl_ip_vs_secure_tcp = 0;
77 static int sysctl_ip_vs_amemthresh = 1024;
78 static int sysctl_ip_vs_am_droprate = 10;
79 int sysctl_ip_vs_cache_bypass = 0;
80 int sysctl_ip_vs_expire_nodest_conn = 0;
81 int sysctl_ip_vs_expire_quiescent_template = 0;
82 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
83 int sysctl_ip_vs_nat_icmp_send = 0;
84
85
86 #ifdef CONFIG_IP_VS_DEBUG
87 static int sysctl_ip_vs_debug_level = 0;
88
89 int ip_vs_get_debug_level(void)
90 {
91         return sysctl_ip_vs_debug_level;
92 }
93 #endif
94
95 /*
96  *      update_defense_level is called from keventd and from sysctl,
97  *      so it needs to protect itself from softirqs
98  */
99 static void update_defense_level(void)
100 {
101         struct sysinfo i;
102         static int old_secure_tcp = 0;
103         int availmem;
104         int nomem;
105         int to_change = -1;
106
107         /* we only count free and buffered memory (in pages) */
108         si_meminfo(&i);
109         availmem = i.freeram + i.bufferram;
110         /* note: since Linux 2.5, i.bufferram is the total page cache size,
111            so it would need to be adjusted */
112         /* si_swapinfo(&i); */
113         /* availmem = availmem - (i.totalswap - i.freeswap); */
114
115         nomem = (availmem < sysctl_ip_vs_amemthresh);
116
117         local_bh_disable();
118
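            /*
             * Each defense sysctl takes a value 0..3:
             *   0 - defense never active
             *   1 - activate automatically on memory shortage (moves to state 2)
             *   2 - active because of memory shortage (drops back to 1 when
             *       memory recovers)
             *   3 - always active
             */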
119         /* drop_entry */
120         spin_lock(&__ip_vs_dropentry_lock);
121         switch (sysctl_ip_vs_drop_entry) {
122         case 0:
123                 atomic_set(&ip_vs_dropentry, 0);
124                 break;
125         case 1:
126                 if (nomem) {
127                         atomic_set(&ip_vs_dropentry, 1);
128                         sysctl_ip_vs_drop_entry = 2;
129                 } else {
130                         atomic_set(&ip_vs_dropentry, 0);
131                 }
132                 break;
133         case 2:
134                 if (nomem) {
135                         atomic_set(&ip_vs_dropentry, 1);
136                 } else {
137                         atomic_set(&ip_vs_dropentry, 0);
138                         sysctl_ip_vs_drop_entry = 1;
139                 }
140                 break;
141         case 3:
142                 atomic_set(&ip_vs_dropentry, 1);
143                 break;
144         }
145         spin_unlock(&__ip_vs_dropentry_lock);
146
147         /* drop_packet */
148         spin_lock(&__ip_vs_droppacket_lock);
149         switch (sysctl_ip_vs_drop_packet) {
150         case 0:
151                 ip_vs_drop_rate = 0;
152                 break;
153         case 1:
154                 if (nomem) {
155                         ip_vs_drop_rate = ip_vs_drop_counter
156                                 = sysctl_ip_vs_amemthresh /
157                                 (sysctl_ip_vs_amemthresh-availmem);
158                         sysctl_ip_vs_drop_packet = 2;
159                 } else {
160                         ip_vs_drop_rate = 0;
161                 }
162                 break;
163         case 2:
164                 if (nomem) {
165                         ip_vs_drop_rate = ip_vs_drop_counter
166                                 = sysctl_ip_vs_amemthresh /
167                                 (sysctl_ip_vs_amemthresh-availmem);
168                 } else {
169                         ip_vs_drop_rate = 0;
170                         sysctl_ip_vs_drop_packet = 1;
171                 }
172                 break;
173         case 3:
174                 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
175                 break;
176         }
177         spin_unlock(&__ip_vs_droppacket_lock);
178
179         /* secure_tcp */
180         write_lock(&__ip_vs_securetcp_lock);
181         switch (sysctl_ip_vs_secure_tcp) {
182         case 0:
183                 if (old_secure_tcp >= 2)
184                         to_change = 0;
185                 break;
186         case 1:
187                 if (nomem) {
188                         if (old_secure_tcp < 2)
189                                 to_change = 1;
190                         sysctl_ip_vs_secure_tcp = 2;
191                 } else {
192                         if (old_secure_tcp >= 2)
193                                 to_change = 0;
194                 }
195                 break;
196         case 2:
197                 if (nomem) {
198                         if (old_secure_tcp < 2)
199                                 to_change = 1;
200                 } else {
201                         if (old_secure_tcp >= 2)
202                                 to_change = 0;
203                         sysctl_ip_vs_secure_tcp = 1;
204                 }
205                 break;
206         case 3:
207                 if (old_secure_tcp < 2)
208                         to_change = 1;
209                 break;
210         }
211         old_secure_tcp = sysctl_ip_vs_secure_tcp;
212         if (to_change >= 0)
213                 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
214         write_unlock(&__ip_vs_securetcp_lock);
215
216         local_bh_enable();
217 }
218
219
220 /*
221  *      Timer for checking the defense
222  */
223 #define DEFENSE_TIMER_PERIOD    1*HZ
224 static void defense_work_handler(void *data);
225 static DECLARE_WORK(defense_work, defense_work_handler, NULL);
226
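/*
 * Periodic defense work: re-evaluate the defense level and, when the
 * drop-entry defense is active, randomly drop connection entries.
 * The handler reschedules itself every DEFENSE_TIMER_PERIOD.
 */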
227 static void defense_work_handler(void *data)
228 {
229         update_defense_level();
230         if (atomic_read(&ip_vs_dropentry))
231                 ip_vs_random_dropentry();
232
233         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
234 }
235
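/*
 * Module use counting: take and release a reference on the ip_vs module
 * (try_module_get/module_put on THIS_MODULE) so it cannot be unloaded
 * while still in use.
 */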
236 int
237 ip_vs_use_count_inc(void)
238 {
239         return try_module_get(THIS_MODULE);
240 }
241
242 void
243 ip_vs_use_count_dec(void)
244 {
245         module_put(THIS_MODULE);
246 }
247
248
249 /*
250  *      Hash table: for virtual service lookups
251  */
252 #define IP_VS_SVC_TAB_BITS 8
253 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
254 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
255
256 /* the service table hashed by <protocol, addr, port> */
257 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
258 /* the service table hashed by fwmark */
259 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
260
261 /*
262  *      Hash table: for real service lookups
263  */
264 #define IP_VS_RTAB_BITS 4
265 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
266 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
267
268 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
269
270 /*
271  *      Trash for destinations
272  */
273 static LIST_HEAD(ip_vs_dest_trash);
274
275 /*
276  *      FTP & NULL virtual service counters
277  */
278 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
279 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
280
281
282 /*
283  *      Returns hash value for virtual service
284  */
285 static __inline__ unsigned
286 ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
287 {
288         register unsigned porth = ntohs(port);
289
290         return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
291                 & IP_VS_SVC_TAB_MASK;
292 }
293
294 /*
295  *      Returns hash value of fwmark for virtual service lookup
296  */
297 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
298 {
299         return fwmark & IP_VS_SVC_TAB_MASK;
300 }
301
302 /*
303  *      Hashes a service in the ip_vs_svc_table by <proto,addr,port>
304  *      or in the ip_vs_svc_fwm_table by fwmark.
305  *      Should be called with locked tables.
306  */
307 static int ip_vs_svc_hash(struct ip_vs_service *svc)
308 {
309         unsigned hash;
310
311         if (svc->flags & IP_VS_SVC_F_HASHED) {
312                 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
313                           "called from %p\n", __builtin_return_address(0));
314                 return 0;
315         }
316
317         if (svc->fwmark == 0) {
318                 /*
319                  *  Hash it by <protocol,addr,port> in ip_vs_svc_table
320                  */
321                 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
322                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
323         } else {
324                 /*
325                  *  Hash it by fwmark in ip_vs_svc_fwm_table
326                  */
327                 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
328                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
329         }
330
331         svc->flags |= IP_VS_SVC_F_HASHED;
332         /* increase its refcnt because it is referenced by the svc table */
333         atomic_inc(&svc->refcnt);
334         return 1;
335 }
336
337
338 /*
339  *      Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
340  *      Should be called with locked tables.
341  */
342 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
343 {
344         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
345                 IP_VS_ERR("ip_vs_svc_unhash(): request to unhash an unhashed service, "
346                           "called from %p\n", __builtin_return_address(0));
347                 return 0;
348         }
349
350         if (svc->fwmark == 0) {
351                 /* Remove it from the ip_vs_svc_table table */
352                 list_del(&svc->s_list);
353         } else {
354                 /* Remove it from the ip_vs_svc_fwm_table table */
355                 list_del(&svc->f_list);
356         }
357
358         svc->flags &= ~IP_VS_SVC_F_HASHED;
359         atomic_dec(&svc->refcnt);
360         return 1;
361 }
362
363
364 /*
365  *      Get service by {proto,addr,port} in the service table.
366  */
367 static __inline__ struct ip_vs_service *
368 __ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
369 {
370         unsigned hash;
371         struct ip_vs_service *svc;
372
373         /* Check for "full" addressed entries */
374         hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
375
376         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
377                 if ((svc->addr == vaddr)
378                     && (svc->port == vport)
379                     && (svc->protocol == protocol)) {
380                         /* HIT */
381                         atomic_inc(&svc->usecnt);
382                         return svc;
383                 }
384         }
385
386         return NULL;
387 }
388
389
390 /*
391  *      Get service by {fwmark} in the service table.
392  */
393 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
394 {
395         unsigned hash;
396         struct ip_vs_service *svc;
397
398         /* Check for fwmark addressed entries */
399         hash = ip_vs_svc_fwm_hashkey(fwmark);
400
401         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
402                 if (svc->fwmark == fwmark) {
403                         /* HIT */
404                         atomic_inc(&svc->usecnt);
405                         return svc;
406                 }
407         }
408
409         return NULL;
410 }
411
412 struct ip_vs_service *
413 ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
414 {
415         struct ip_vs_service *svc;
416
417         read_lock(&__ip_vs_svc_lock);
418
419         /*
420          *      Check the table hashed by fwmark first
421          */
422         if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
423                 goto out;
424
425         /*
426          *      Check the table hashed by <protocol,addr,port>
427          *      for "full" addressed entries
428          */
429         svc = __ip_vs_service_get(protocol, vaddr, vport);
430
431         if (svc == NULL
432             && protocol == IPPROTO_TCP
433             && atomic_read(&ip_vs_ftpsvc_counter)
434             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
435                 /*
436                  * Check if ftp service entry exists, the packet
437                  * might belong to FTP data connections.
438                  */
439                 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
440         }
441
442         if (svc == NULL
443             && atomic_read(&ip_vs_nullsvc_counter)) {
444                 /*
445                  * Check if the catch-all port (port zero) exists
446                  */
447                 svc = __ip_vs_service_get(protocol, vaddr, 0);
448         }
449
450   out:
451         read_unlock(&__ip_vs_svc_lock);
452
453         IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
454                   fwmark, ip_vs_proto_name(protocol),
455                   NIPQUAD(vaddr), ntohs(vport),
456                   svc?"hit":"not hit");
457
458         return svc;
459 }
460
461
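/*
 * Bind/unbind a destination to/from its virtual service, adjusting
 * svc->refcnt; the unbind path frees the service once its last
 * reference is dropped.
 */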
462 static inline void
463 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
464 {
465         atomic_inc(&svc->refcnt);
466         dest->svc = svc;
467 }
468
469 static inline void
470 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
471 {
472         struct ip_vs_service *svc = dest->svc;
473
474         dest->svc = NULL;
475         if (atomic_dec_and_test(&svc->refcnt))
476                 kfree(svc);
477 }
478
479
480 /*
481  *      Returns hash value for real service
482  */
483 static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
484 {
485         register unsigned porth = ntohs(port);
486
487         return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
488                 & IP_VS_RTAB_MASK;
489 }
490
491 /*
492  *      Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
493  *      should be called with locked tables.
494  */
495 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
496 {
497         unsigned hash;
498
499         if (!list_empty(&dest->d_list)) {
500                 return 0;
501         }
502
503         /*
504          *      Hash by proto,addr,port,
505          *      which are the parameters of the real service.
506          */
507         hash = ip_vs_rs_hashkey(dest->addr, dest->port);
508         list_add(&dest->d_list, &ip_vs_rtable[hash]);
509
510         return 1;
511 }
512
513 /*
514  *      Unhashes ip_vs_dest from ip_vs_rtable.
515  *      should be called with locked tables.
516  */
517 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
518 {
519         /*
520          * Remove it from the ip_vs_rtable table.
521          */
522         if (!list_empty(&dest->d_list)) {
523                 list_del(&dest->d_list);
524                 INIT_LIST_HEAD(&dest->d_list);
525         }
526
527         return 1;
528 }
529
530 /*
531  *      Lookup real service by <proto,addr,port> in the real service table.
532  */
533 struct ip_vs_dest *
534 ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
535 {
536         unsigned hash;
537         struct ip_vs_dest *dest;
538
539         /*
540          *      Check for "full" addressed entries
541          *      Return the first found entry
542          */
543         hash = ip_vs_rs_hashkey(daddr, dport);
544
545         read_lock(&__ip_vs_rs_lock);
546         list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
547                 if ((dest->addr == daddr)
548                     && (dest->port == dport)
549                     && ((dest->protocol == protocol) ||
550                         dest->vfwmark)) {
551                         /* HIT */
552                         read_unlock(&__ip_vs_rs_lock);
553                         return dest;
554                 }
555         }
556         read_unlock(&__ip_vs_rs_lock);
557
558         return NULL;
559 }
560
561 /*
562  *      Lookup destination by {addr,port} in the given service
563  */
564 static struct ip_vs_dest *
565 ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
566 {
567         struct ip_vs_dest *dest;
568
569         /*
570          * Find the destination for the given service
571          */
572         list_for_each_entry(dest, &svc->destinations, n_list) {
573                 if ((dest->addr == daddr) && (dest->port == dport)) {
574                         /* HIT */
575                         return dest;
576                 }
577         }
578
579         return NULL;
580 }
581
582
583 /*
584  *  Lookup dest by {svc,addr,port} in the destination trash.
585  *  The destination trash is used to hold the destinations that are removed
586  *  from the service table but are still referenced by some conn entries.
587  *  The trash exists because a dest may be taken down only temporarily
588  *  (either by the administrator or by a monitor program); the dest can
589  *  then be picked back from the trash, its remaining connections can
590  *  continue, and its counting information remains useful for
591  *  scheduling.
592  */
593 static struct ip_vs_dest *
594 ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
595 {
596         struct ip_vs_dest *dest, *nxt;
597
598         /*
599          * Find the destination in trash
600          */
601         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
602                 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
603                           "dest->refcnt=%d\n",
604                           dest->vfwmark,
605                           NIPQUAD(dest->addr), ntohs(dest->port),
606                           atomic_read(&dest->refcnt));
607                 if (dest->addr == daddr &&
608                     dest->port == dport &&
609                     dest->vfwmark == svc->fwmark &&
610                     dest->protocol == svc->protocol &&
611                     (svc->fwmark ||
612                      (dest->vaddr == svc->addr &&
613                       dest->vport == svc->port))) {
614                         /* HIT */
615                         return dest;
616                 }
617
618                 /*
619                  * Try to purge the destination from trash if not referenced
620                  */
621                 if (atomic_read(&dest->refcnt) == 1) {
622                         IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
623                                   "from trash\n",
624                                   dest->vfwmark,
625                                   NIPQUAD(dest->addr), ntohs(dest->port));
626                         list_del(&dest->n_list);
627                         ip_vs_dst_reset(dest);
628                         __ip_vs_unbind_svc(dest);
629                         kfree(dest);
630                 }
631         }
632
633         return NULL;
634 }
635
636
637 /*
638  *  Clean up all the destinations in the trash
639  *  Called by the ip_vs_control_cleanup()
640  *
641  *  When ip_vs_control_cleanup() is invoked on ipvs module exit,
642  *  the service tables have already been flushed and all connections
643  *  have expired, so the refcnt of each destination in the trash must
644  *  be 1; we simply release them here.
645  */
646 static void ip_vs_trash_cleanup(void)
647 {
648         struct ip_vs_dest *dest, *nxt;
649
650         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
651                 list_del(&dest->n_list);
652                 ip_vs_dst_reset(dest);
653                 __ip_vs_unbind_svc(dest);
654                 kfree(dest);
655         }
656 }
657
658
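/*
 * Zero the counters of a stats block under its lock.  Only the members
 * located before stats->lock are cleared; the estimator state is reset
 * separately via ip_vs_zero_estimator().
 */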
659 static void
660 ip_vs_zero_stats(struct ip_vs_stats *stats)
661 {
662         spin_lock_bh(&stats->lock);
663         memset(stats, 0, (char *)&stats->lock - (char *)stats);
664         spin_unlock_bh(&stats->lock);
665         ip_vs_zero_estimator(stats);
666 }
667
668 /*
669  *      Update a destination in the given service
670  */
671 static void
672 __ip_vs_update_dest(struct ip_vs_service *svc,
673                     struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
674 {
675         int conn_flags;
676
677         /* set the weight and the flags */
678         atomic_set(&dest->weight, udest->weight);
679         conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
680
681         /* check if local node and update the flags */
682         if (inet_addr_type(udest->addr) == RTN_LOCAL) {
683                 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
684                         | IP_VS_CONN_F_LOCALNODE;
685         }
686
687         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
688         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
689                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
690         } else {
691                 /*
692                  *    Put the real service in ip_vs_rtable if not present.
693                  *    For now only for NAT!
694                  */
695                 write_lock_bh(&__ip_vs_rs_lock);
696                 ip_vs_rs_hash(dest);
697                 write_unlock_bh(&__ip_vs_rs_lock);
698         }
699         atomic_set(&dest->conn_flags, conn_flags);
700
701         /* bind the service */
702         if (!dest->svc) {
703                 __ip_vs_bind_svc(dest, svc);
704         } else {
705                 if (dest->svc != svc) {
706                         __ip_vs_unbind_svc(dest);
707                         ip_vs_zero_stats(&dest->stats);
708                         __ip_vs_bind_svc(dest, svc);
709                 }
710         }
711
712         /* set the dest status flags */
713         dest->flags |= IP_VS_DEST_F_AVAILABLE;
714
715         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
716                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
717         dest->u_threshold = udest->u_threshold;
718         dest->l_threshold = udest->l_threshold;
719 }
720
721
722 /*
723  *      Create a destination for the given service
724  */
725 static int
726 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
727                struct ip_vs_dest **dest_p)
728 {
729         struct ip_vs_dest *dest;
730         unsigned atype;
731
732         EnterFunction(2);
733
734         atype = inet_addr_type(udest->addr);
735         if (atype != RTN_LOCAL && atype != RTN_UNICAST)
736                 return -EINVAL;
737
738         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
739         if (dest == NULL) {
740                 IP_VS_ERR("ip_vs_new_dest: kzalloc failed.\n");
741                 return -ENOMEM;
742         }
743
744         dest->protocol = svc->protocol;
745         dest->vaddr = svc->addr;
746         dest->vport = svc->port;
747         dest->vfwmark = svc->fwmark;
748         dest->addr = udest->addr;
749         dest->port = udest->port;
750
751         atomic_set(&dest->activeconns, 0);
752         atomic_set(&dest->inactconns, 0);
753         atomic_set(&dest->persistconns, 0);
754         atomic_set(&dest->refcnt, 0);
755
756         INIT_LIST_HEAD(&dest->d_list);
757         spin_lock_init(&dest->dst_lock);
758         spin_lock_init(&dest->stats.lock);
759         __ip_vs_update_dest(svc, dest, udest);
760         ip_vs_new_estimator(&dest->stats);
761
762         *dest_p = dest;
763
764         LeaveFunction(2);
765         return 0;
766 }
767
768
769 /*
770  *      Add a destination into an existing service
771  */
772 static int
773 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
774 {
775         struct ip_vs_dest *dest;
776         __u32 daddr = udest->addr;
777         __u16 dport = udest->port;
778         int ret;
779
780         EnterFunction(2);
781
782         if (udest->weight < 0) {
783                 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
784                 return -ERANGE;
785         }
786
787         if (udest->l_threshold > udest->u_threshold) {
788                 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
789                           "upper threshold\n");
790                 return -ERANGE;
791         }
792
793         /*
794          * Check if the dest already exists in the list
795          */
796         dest = ip_vs_lookup_dest(svc, daddr, dport);
797         if (dest != NULL) {
798                 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
799                 return -EEXIST;
800         }
801
802         /*
803          * Check if the dest already exists in the trash and
804          * is from the same service
805          */
806         dest = ip_vs_trash_get_dest(svc, daddr, dport);
807         if (dest != NULL) {
808                 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
809                           "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
810                           NIPQUAD(daddr), ntohs(dport),
811                           atomic_read(&dest->refcnt),
812                           dest->vfwmark,
813                           NIPQUAD(dest->vaddr),
814                           ntohs(dest->vport));
815                 __ip_vs_update_dest(svc, dest, udest);
816
817                 /*
818                  * Get the destination from the trash
819                  */
820                 list_del(&dest->n_list);
821
822                 ip_vs_new_estimator(&dest->stats);
823
824                 write_lock_bh(&__ip_vs_svc_lock);
825
826                 /*
827                  * Wait until all other svc users go away.
828                  */
829                 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
830
831                 list_add(&dest->n_list, &svc->destinations);
832                 svc->num_dests++;
833
834                 /* call the update_service function of its scheduler */
835                 svc->scheduler->update_service(svc);
836
837                 write_unlock_bh(&__ip_vs_svc_lock);
838                 return 0;
839         }
840
841         /*
842          * Allocate and initialize the dest structure
843          */
844         ret = ip_vs_new_dest(svc, udest, &dest);
845         if (ret) {
846                 return ret;
847         }
848
849         /*
850          * Add the dest entry into the list
851          */
852         atomic_inc(&dest->refcnt);
853
854         write_lock_bh(&__ip_vs_svc_lock);
855
856         /*
857          * Wait until all other svc users go away.
858          */
859         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
860
861         list_add(&dest->n_list, &svc->destinations);
862         svc->num_dests++;
863
864         /* call the update_service function of its scheduler */
865         svc->scheduler->update_service(svc);
866
867         write_unlock_bh(&__ip_vs_svc_lock);
868
869         LeaveFunction(2);
870
871         return 0;
872 }
873
874
875 /*
876  *      Edit a destination in the given service
877  */
878 static int
879 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
880 {
881         struct ip_vs_dest *dest;
882         __u32 daddr = udest->addr;
883         __u16 dport = udest->port;
884
885         EnterFunction(2);
886
887         if (udest->weight < 0) {
888                 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
889                 return -ERANGE;
890         }
891
892         if (udest->l_threshold > udest->u_threshold) {
893                 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
894                           "upper threshold\n");
895                 return -ERANGE;
896         }
897
898         /*
899          *  Lookup the destination list
900          */
901         dest = ip_vs_lookup_dest(svc, daddr, dport);
902         if (dest == NULL) {
903                 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
904                 return -ENOENT;
905         }
906
907         __ip_vs_update_dest(svc, dest, udest);
908
909         write_lock_bh(&__ip_vs_svc_lock);
910
911         /* Wait until all other svc users go away */
912         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
913
914         /* call the update_service, because server weight may be changed */
915         svc->scheduler->update_service(svc);
916
917         write_unlock_bh(&__ip_vs_svc_lock);
918
919         LeaveFunction(2);
920
921         return 0;
922 }
923
924
925 /*
926  *      Delete a destination (must be already unlinked from the service)
927  */
928 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
929 {
930         ip_vs_kill_estimator(&dest->stats);
931
932         /*
933          *  Remove it from the doubly-linked list of real services.
934          */
935         write_lock_bh(&__ip_vs_rs_lock);
936         ip_vs_rs_unhash(dest);
937         write_unlock_bh(&__ip_vs_rs_lock);
938
939         /*
940          *  Decrease the refcnt of the dest, and free the dest
941          *  if nobody refers to it (refcnt=0). Otherwise, throw
942          *  the destination into the trash.
943          */
944         if (atomic_dec_and_test(&dest->refcnt)) {
945                 ip_vs_dst_reset(dest);
946                 /* simply decrease svc->refcnt here, let the caller check
947                    and release the service if nobody refers to it.
948                    Only user context can release destination and service,
949                    and only one user context can update virtual service at a
950                    time, so the operation here is OK */
951                 atomic_dec(&dest->svc->refcnt);
952                 kfree(dest);
953         } else {
954                 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
955                           "dest->refcnt=%d\n",
956                           NIPQUAD(dest->addr), ntohs(dest->port),
957                           atomic_read(&dest->refcnt));
958                 list_add(&dest->n_list, &ip_vs_dest_trash);
959                 atomic_inc(&dest->refcnt);
960         }
961 }
962
963
964 /*
965  *      Unlink a destination from the given service
966  */
967 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
968                                 struct ip_vs_dest *dest,
969                                 int svcupd)
970 {
971         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
972
973         /*
974          *  Remove it from the doubly-linked destination list.
975          */
976         list_del(&dest->n_list);
977         svc->num_dests--;
978         if (svcupd) {
979                 /*
980                  *  Call the update_service function of its scheduler
981                  */
982                 svc->scheduler->update_service(svc);
983         }
984 }
985
986
987 /*
988  *      Delete a destination server in the given service
989  */
990 static int
991 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
992 {
993         struct ip_vs_dest *dest;
994         __u32 daddr = udest->addr;
995         __u16 dport = udest->port;
996
997         EnterFunction(2);
998
999         dest = ip_vs_lookup_dest(svc, daddr, dport);
1000         if (dest == NULL) {
1001                 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1002                 return -ENOENT;
1003         }
1004
1005         write_lock_bh(&__ip_vs_svc_lock);
1006
1007         /*
1008          *      Wait until all other svc users go away.
1009          */
1010         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1011
1012         /*
1013          *      Unlink dest from the service
1014          */
1015         __ip_vs_unlink_dest(svc, dest, 1);
1016
1017         write_unlock_bh(&__ip_vs_svc_lock);
1018
1019         /*
1020          *      Delete the destination
1021          */
1022         __ip_vs_del_dest(dest);
1023
1024         LeaveFunction(2);
1025
1026         return 0;
1027 }
1028
1029
1030 /*
1031  *      Add a service into the service hash table
1032  */
1033 static int
1034 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1035 {
1036         int ret = 0;
1037         struct ip_vs_scheduler *sched = NULL;
1038         struct ip_vs_service *svc = NULL;
1039
1040         /* increase the module use count */
1041         ip_vs_use_count_inc();
1042
1043         /* Lookup the scheduler by 'u->sched_name' */
1044         sched = ip_vs_scheduler_get(u->sched_name);
1045         if (sched == NULL) {
1046                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1047                            u->sched_name);
1048                 ret = -ENOENT;
1049                 goto out_mod_dec;
1050         }
1051
1052         svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1053         if (svc == NULL) {
1054                 IP_VS_DBG(1, "ip_vs_add_service: kzalloc failed.\n");
1055                 ret = -ENOMEM;
1056                 goto out_err;
1057         }
1058
1059         /* I'm the first user of the service */
1060         atomic_set(&svc->usecnt, 1);
1061         atomic_set(&svc->refcnt, 0);
1062
1063         svc->protocol = u->protocol;
1064         svc->addr = u->addr;
1065         svc->port = u->port;
1066         svc->fwmark = u->fwmark;
1067         svc->flags = u->flags;
1068         svc->timeout = u->timeout * HZ;
1069         svc->netmask = u->netmask;
1070
1071         INIT_LIST_HEAD(&svc->destinations);
1072         rwlock_init(&svc->sched_lock);
1073         spin_lock_init(&svc->stats.lock);
1074
1075         /* Bind the scheduler */
1076         ret = ip_vs_bind_scheduler(svc, sched);
1077         if (ret)
1078                 goto out_err;
1079         sched = NULL;
1080
1081         /* Update the virtual service counters */
1082         if (svc->port == FTPPORT)
1083                 atomic_inc(&ip_vs_ftpsvc_counter);
1084         else if (svc->port == 0)
1085                 atomic_inc(&ip_vs_nullsvc_counter);
1086
1087         ip_vs_new_estimator(&svc->stats);
1088         ip_vs_num_services++;
1089
1090         /* Hash the service into the service table */
1091         write_lock_bh(&__ip_vs_svc_lock);
1092         ip_vs_svc_hash(svc);
1093         write_unlock_bh(&__ip_vs_svc_lock);
1094
1095         *svc_p = svc;
1096         return 0;
1097
1098   out_err:
1099         if (svc != NULL) {
1100                 if (svc->scheduler)
1101                         ip_vs_unbind_scheduler(svc);
1102                 if (svc->inc) {
1103                         local_bh_disable();
1104                         ip_vs_app_inc_put(svc->inc);
1105                         local_bh_enable();
1106                 }
1107                 kfree(svc);
1108         }
1109         ip_vs_scheduler_put(sched);
1110
1111   out_mod_dec:
1112         /* decrease the module use count */
1113         ip_vs_use_count_dec();
1114
1115         return ret;
1116 }
1117
1118
1119 /*
1120  *      Edit a service and bind it with a new scheduler
1121  */
1122 static int
1123 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1124 {
1125         struct ip_vs_scheduler *sched, *old_sched;
1126         int ret = 0;
1127
1128         /*
1129          * Lookup the scheduler, by 'u->sched_name'
1130          */
1131         sched = ip_vs_scheduler_get(u->sched_name);
1132         if (sched == NULL) {
1133                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1134                            u->sched_name);
1135                 return -ENOENT;
1136         }
1137         old_sched = sched;
1138
1139         write_lock_bh(&__ip_vs_svc_lock);
1140
1141         /*
1142          * Wait until all other svc users go away.
1143          */
1144         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1145
1146         /*
1147          * Set the flags and timeout value
1148          */
1149         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1150         svc->timeout = u->timeout * HZ;
1151         svc->netmask = u->netmask;
1152
1153         old_sched = svc->scheduler;
1154         if (sched != old_sched) {
1155                 /*
1156                  * Unbind the old scheduler
1157                  */
1158                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1159                         old_sched = sched;
1160                         goto out;
1161                 }
1162
1163                 /*
1164                  * Bind the new scheduler
1165                  */
1166                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1167                         /*
1168                          * If ip_vs_bind_scheduler fails, restore the old
1169                          * scheduler.
1170                          * The most likely reason for failure is lack of memory.
1171                          *
1172                          * The open question is whether the old scheduler can
1173                          * always be restored.  TODO: if it ever cannot be
1174                          * restored, the service must be deleted, otherwise
1175                          * the system may crash.
1176                          */
1177                         ip_vs_bind_scheduler(svc, old_sched);
1178                         old_sched = sched;
1179                         goto out;
1180                 }
1181         }
1182
1183   out:
1184         write_unlock_bh(&__ip_vs_svc_lock);
1185
1186         if (old_sched)
1187                 ip_vs_scheduler_put(old_sched);
1188
1189         return ret;
1190 }
1191
1192
1193 /*
1194  *      Delete a service from the service list
1195  *      - The service must be unlinked, unlocked and not referenced!
1196  *      - We are called under _bh lock
1197  */
1198 static void __ip_vs_del_service(struct ip_vs_service *svc)
1199 {
1200         struct ip_vs_dest *dest, *nxt;
1201         struct ip_vs_scheduler *old_sched;
1202
1203         ip_vs_num_services--;
1204         ip_vs_kill_estimator(&svc->stats);
1205
1206         /* Unbind scheduler */
1207         old_sched = svc->scheduler;
1208         ip_vs_unbind_scheduler(svc);
1209         if (old_sched)
1210                 ip_vs_scheduler_put(old_sched);
1211
1212         /* Unbind app inc */
1213         if (svc->inc) {
1214                 ip_vs_app_inc_put(svc->inc);
1215                 svc->inc = NULL;
1216         }
1217
1218         /*
1219          *    Unlink the whole destination list
1220          */
1221         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1222                 __ip_vs_unlink_dest(svc, dest, 0);
1223                 __ip_vs_del_dest(dest);
1224         }
1225
1226         /*
1227          *    Update the virtual service counters
1228          */
1229         if (svc->port == FTPPORT)
1230                 atomic_dec(&ip_vs_ftpsvc_counter);
1231         else if (svc->port == 0)
1232                 atomic_dec(&ip_vs_nullsvc_counter);
1233
1234         /*
1235          *    Free the service if nobody refers to it
1236          */
1237         if (atomic_read(&svc->refcnt) == 0)
1238                 kfree(svc);
1239
1240         /* decrease the module use count */
1241         ip_vs_use_count_dec();
1242 }
1243
1244 /*
1245  *      Delete a service from the service list
1246  */
1247 static int ip_vs_del_service(struct ip_vs_service *svc)
1248 {
1249         if (svc == NULL)
1250                 return -EEXIST;
1251
1252         /*
1253          * Unhash it from the service table
1254          */
1255         write_lock_bh(&__ip_vs_svc_lock);
1256
1257         ip_vs_svc_unhash(svc);
1258
1259         /*
1260          * Wait until all the svc users go away.
1261          */
1262         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1263
1264         __ip_vs_del_service(svc);
1265
1266         write_unlock_bh(&__ip_vs_svc_lock);
1267
1268         return 0;
1269 }
1270
1271
1272 /*
1273  *      Flush all the virtual services
1274  */
1275 static int ip_vs_flush(void)
1276 {
1277         int idx;
1278         struct ip_vs_service *svc, *nxt;
1279
1280         /*
1281          * Flush the service table hashed by <protocol,addr,port>
1282          */
1283         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1284                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1285                         write_lock_bh(&__ip_vs_svc_lock);
1286                         ip_vs_svc_unhash(svc);
1287                         /*
1288                          * Wait until all the svc users go away.
1289                          */
1290                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1291                         __ip_vs_del_service(svc);
1292                         write_unlock_bh(&__ip_vs_svc_lock);
1293                 }
1294         }
1295
1296         /*
1297          * Flush the service table hashed by fwmark
1298          */
1299         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1300                 list_for_each_entry_safe(svc, nxt,
1301                                          &ip_vs_svc_fwm_table[idx], f_list) {
1302                         write_lock_bh(&__ip_vs_svc_lock);
1303                         ip_vs_svc_unhash(svc);
1304                         /*
1305                          * Wait until all the svc users go away.
1306                          */
1307                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1308                         __ip_vs_del_service(svc);
1309                         write_unlock_bh(&__ip_vs_svc_lock);
1310                 }
1311         }
1312
1313         return 0;
1314 }
1315
1316
1317 /*
1318  *      Zero counters in a service or all services
1319  */
1320 static int ip_vs_zero_service(struct ip_vs_service *svc)
1321 {
1322         struct ip_vs_dest *dest;
1323
1324         write_lock_bh(&__ip_vs_svc_lock);
1325         list_for_each_entry(dest, &svc->destinations, n_list) {
1326                 ip_vs_zero_stats(&dest->stats);
1327         }
1328         ip_vs_zero_stats(&svc->stats);
1329         write_unlock_bh(&__ip_vs_svc_lock);
1330         return 0;
1331 }
1332
1333 static int ip_vs_zero_all(void)
1334 {
1335         int idx;
1336         struct ip_vs_service *svc;
1337
1338         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1339                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1340                         ip_vs_zero_service(svc);
1341                 }
1342         }
1343
1344         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1345                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1346                         ip_vs_zero_service(svc);
1347                 }
1348         }
1349
1350         ip_vs_zero_stats(&ip_vs_stats);
1351         return 0;
1352 }
1353
1354
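/*
 * sysctl handler for the defense-mode variables (drop_entry, drop_packet,
 * secure_tcp): only values 0..3 are accepted, and any accepted change
 * triggers an immediate re-evaluation of the defense level.
 */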
1355 static int
1356 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1357                      void __user *buffer, size_t *lenp, loff_t *ppos)
1358 {
1359         int *valp = table->data;
1360         int val = *valp;
1361         int rc;
1362
1363         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1364         if (write && (*valp != val)) {
1365                 if ((*valp < 0) || (*valp > 3)) {
1366                         /* Restore the correct value */
1367                         *valp = val;
1368                 } else {
1369                         update_defense_level();
1370                 }
1371         }
1372         return rc;
1373 }
1374
1375
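/*
 * sysctl handler for sync_threshold: the pair must satisfy
 * 0 <= threshold < period, otherwise the previous values are restored.
 */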
1376 static int
1377 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1378                        void __user *buffer, size_t *lenp, loff_t *ppos)
1379 {
1380         int *valp = table->data;
1381         int val[2];
1382         int rc;
1383
1384         /* backup the value first */
1385         memcpy(val, valp, sizeof(val));
1386
1387         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1388         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1389                 /* Restore the correct value */
1390                 memcpy(valp, val, sizeof(val));
1391         }
1392         return rc;
1393 }
1394
1395
1396 /*
1397  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1398  */
1399
1400 static struct ctl_table vs_vars[] = {
1401         {
1402                 .ctl_name       = NET_IPV4_VS_AMEMTHRESH,
1403                 .procname       = "amemthresh",
1404                 .data           = &sysctl_ip_vs_amemthresh,
1405                 .maxlen         = sizeof(int),
1406                 .mode           = 0644,
1407                 .proc_handler   = &proc_dointvec,
1408         },
1409 #ifdef CONFIG_IP_VS_DEBUG
1410         {
1411                 .ctl_name       = NET_IPV4_VS_DEBUG_LEVEL,
1412                 .procname       = "debug_level",
1413                 .data           = &sysctl_ip_vs_debug_level,
1414                 .maxlen         = sizeof(int),
1415                 .mode           = 0644,
1416                 .proc_handler   = &proc_dointvec,
1417         },
1418 #endif
1419         {
1420                 .ctl_name       = NET_IPV4_VS_AMDROPRATE,
1421                 .procname       = "am_droprate",
1422                 .data           = &sysctl_ip_vs_am_droprate,
1423                 .maxlen         = sizeof(int),
1424                 .mode           = 0644,
1425                 .proc_handler   = &proc_dointvec,
1426         },
1427         {
1428                 .ctl_name       = NET_IPV4_VS_DROP_ENTRY,
1429                 .procname       = "drop_entry",
1430                 .data           = &sysctl_ip_vs_drop_entry,
1431                 .maxlen         = sizeof(int),
1432                 .mode           = 0644,
1433                 .proc_handler   = &proc_do_defense_mode,
1434         },
1435         {
1436                 .ctl_name       = NET_IPV4_VS_DROP_PACKET,
1437                 .procname       = "drop_packet",
1438                 .data           = &sysctl_ip_vs_drop_packet,
1439                 .maxlen         = sizeof(int),
1440                 .mode           = 0644,
1441                 .proc_handler   = &proc_do_defense_mode,
1442         },
1443         {
1444                 .ctl_name       = NET_IPV4_VS_SECURE_TCP,
1445                 .procname       = "secure_tcp",
1446                 .data           = &sysctl_ip_vs_secure_tcp,
1447                 .maxlen         = sizeof(int),
1448                 .mode           = 0644,
1449                 .proc_handler   = &proc_do_defense_mode,
1450         },
1451 #if 0
1452         {
1453                 .ctl_name       = NET_IPV4_VS_TO_ES,
1454                 .procname       = "timeout_established",
1455                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1456                 .maxlen         = sizeof(int),
1457                 .mode           = 0644,
1458                 .proc_handler   = &proc_dointvec_jiffies,
1459         },
1460         {
1461                 .ctl_name       = NET_IPV4_VS_TO_SS,
1462                 .procname       = "timeout_synsent",
1463                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1464                 .maxlen         = sizeof(int),
1465                 .mode           = 0644,
1466                 .proc_handler   = &proc_dointvec_jiffies,
1467         },
1468         {
1469                 .ctl_name       = NET_IPV4_VS_TO_SR,
1470                 .procname       = "timeout_synrecv",
1471                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1472                 .maxlen         = sizeof(int),
1473                 .mode           = 0644,
1474                 .proc_handler   = &proc_dointvec_jiffies,
1475         },
1476         {
1477                 .ctl_name       = NET_IPV4_VS_TO_FW,
1478                 .procname       = "timeout_finwait",
1479                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1480                 .maxlen         = sizeof(int),
1481                 .mode           = 0644,
1482                 .proc_handler   = &proc_dointvec_jiffies,
1483         },
1484         {
1485                 .ctl_name       = NET_IPV4_VS_TO_TW,
1486                 .procname       = "timeout_timewait",
1487                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1488                 .maxlen         = sizeof(int),
1489                 .mode           = 0644,
1490                 .proc_handler   = &proc_dointvec_jiffies,
1491         },
1492         {
1493                 .ctl_name       = NET_IPV4_VS_TO_CL,
1494                 .procname       = "timeout_close",
1495                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1496                 .maxlen         = sizeof(int),
1497                 .mode           = 0644,
1498                 .proc_handler   = &proc_dointvec_jiffies,
1499         },
1500         {
1501                 .ctl_name       = NET_IPV4_VS_TO_CW,
1502                 .procname       = "timeout_closewait",
1503                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1504                 .maxlen         = sizeof(int),
1505                 .mode           = 0644,
1506                 .proc_handler   = &proc_dointvec_jiffies,
1507         },
1508         {
1509                 .ctl_name       = NET_IPV4_VS_TO_LA,
1510                 .procname       = "timeout_lastack",
1511                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1512                 .maxlen         = sizeof(int),
1513                 .mode           = 0644,
1514                 .proc_handler   = &proc_dointvec_jiffies,
1515         },
1516         {
1517                 .ctl_name       = NET_IPV4_VS_TO_LI,
1518                 .procname       = "timeout_listen",
1519                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1520                 .maxlen         = sizeof(int),
1521                 .mode           = 0644,
1522                 .proc_handler   = &proc_dointvec_jiffies,
1523         },
1524         {
1525                 .ctl_name       = NET_IPV4_VS_TO_SA,
1526                 .procname       = "timeout_synack",
1527                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1528                 .maxlen         = sizeof(int),
1529                 .mode           = 0644,
1530                 .proc_handler   = &proc_dointvec_jiffies,
1531         },
1532         {
1533                 .ctl_name       = NET_IPV4_VS_TO_UDP,
1534                 .procname       = "timeout_udp",
1535                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1536                 .maxlen         = sizeof(int),
1537                 .mode           = 0644,
1538                 .proc_handler   = &proc_dointvec_jiffies,
1539         },
1540         {
1541                 .ctl_name       = NET_IPV4_VS_TO_ICMP,
1542                 .procname       = "timeout_icmp",
1543                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1544                 .maxlen         = sizeof(int),
1545                 .mode           = 0644,
1546                 .proc_handler   = &proc_dointvec_jiffies,
1547         },
1548 #endif
1549         {
1550                 .ctl_name       = NET_IPV4_VS_CACHE_BYPASS,
1551                 .procname       = "cache_bypass",
1552                 .data           = &sysctl_ip_vs_cache_bypass,
1553                 .maxlen         = sizeof(int),
1554                 .mode           = 0644,
1555                 .proc_handler   = &proc_dointvec,
1556         },
1557         {
1558                 .ctl_name       = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1559                 .procname       = "expire_nodest_conn",
1560                 .data           = &sysctl_ip_vs_expire_nodest_conn,
1561                 .maxlen         = sizeof(int),
1562                 .mode           = 0644,
1563                 .proc_handler   = &proc_dointvec,
1564         },
1565         {
1566                 .ctl_name       = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1567                 .procname       = "expire_quiescent_template",
1568                 .data           = &sysctl_ip_vs_expire_quiescent_template,
1569                 .maxlen         = sizeof(int),
1570                 .mode           = 0644,
1571                 .proc_handler   = &proc_dointvec,
1572         },
1573         {
1574                 .ctl_name       = NET_IPV4_VS_SYNC_THRESHOLD,
1575                 .procname       = "sync_threshold",
1576                 .data           = &sysctl_ip_vs_sync_threshold,
1577                 .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
1578                 .mode           = 0644,
1579                 .proc_handler   = &proc_do_sync_threshold,
1580         },
1581         {
1582                 .ctl_name       = NET_IPV4_VS_NAT_ICMP_SEND,
1583                 .procname       = "nat_icmp_send",
1584                 .data           = &sysctl_ip_vs_nat_icmp_send,
1585                 .maxlen         = sizeof(int),
1586                 .mode           = 0644,
1587                 .proc_handler   = &proc_dointvec,
1588         },
1589         { .ctl_name = 0 }
1590 };
1591
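/*
 * The nested tables below build the /proc/sys/net/ipv4/vs/ hierarchy:
 * net -> ipv4 -> vs -> vs_vars.
 */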
1592 static ctl_table vs_table[] = {
1593         {
1594                 .ctl_name       = NET_IPV4_VS,
1595                 .procname       = "vs",
1596                 .mode           = 0555,
1597                 .child          = vs_vars
1598         },
1599         { .ctl_name = 0 }
1600 };
1601
1602 static ctl_table ipvs_ipv4_table[] = {
1603         {
1604                 .ctl_name       = NET_IPV4,
1605                 .procname       = "ipv4",
1606                 .mode           = 0555,
1607                 .child          = vs_table,
1608         },
1609         { .ctl_name = 0 }
1610 };
1611
1612 static ctl_table vs_root_table[] = {
1613         {
1614                 .ctl_name       = CTL_NET,
1615                 .procname       = "net",
1616                 .mode           = 0555,
1617                 .child          = ipvs_ipv4_table,
1618         },
1619         { .ctl_name = 0 }
1620 };
1621
1622 static struct ctl_table_header * sysctl_header;
1623
1624 #ifdef CONFIG_PROC_FS
1625
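/* Iteration state for the IPVS services seq_file: current hash table and bucket. */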
1626 struct ip_vs_iter {
1627         struct list_head *table;
1628         int bucket;
1629 };
1630
1631 /*
1632  *      Write the contents of the VS rule table to a PROCfs file.
1633  *      (It is kept just for backward compatibility)
1634  */
1635 static inline const char *ip_vs_fwd_name(unsigned flags)
1636 {
1637         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1638         case IP_VS_CONN_F_LOCALNODE:
1639                 return "Local";
1640         case IP_VS_CONN_F_TUNNEL:
1641                 return "Tunnel";
1642         case IP_VS_CONN_F_DROUTE:
1643                 return "Route";
1644         default:
1645                 return "Masq";
1646         }
1647 }
1648
1649
1650 /* Get the Nth service entry across the two hash tables */
1651 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1652 {
1653         struct ip_vs_iter *iter = seq->private;
1654         int idx;
1655         struct ip_vs_service *svc;
1656
1657         /* look in hash by protocol */
1658         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1659                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1660                         if (pos-- == 0){
1661                                 iter->table = ip_vs_svc_table;
1662                                 iter->bucket = idx;
1663                                 return svc;
1664                         }
1665                 }
1666         }
1667
1668         /* continue in the table hashed by fwmark */
1669         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1670                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1671                         if (pos-- == 0) {
1672                                 iter->table = ip_vs_svc_fwm_table;
1673                                 iter->bucket = idx;
1674                                 return svc;
1675                         }
1676                 }
1677         }
1678
1679         return NULL;
1680 }
1681
1682 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1683 {
1684
1685         read_lock_bh(&__ip_vs_svc_lock);
1686         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1687 }
1688
1689
1690 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1691 {
1692         struct list_head *e;
1693         struct ip_vs_iter *iter;
1694         struct ip_vs_service *svc;
1695
1696         ++*pos;
1697         if (v == SEQ_START_TOKEN)
1698                 return ip_vs_info_array(seq, 0);
1699
1700         svc = v;
1701         iter = seq->private;
1702
1703         if (iter->table == ip_vs_svc_table) {
1704                 /* next service in table hashed by protocol */
1705                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1706                         return list_entry(e, struct ip_vs_service, s_list);
1707
1708
1709                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1710                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1711                                             s_list) {
1712                                 return svc;
1713                         }
1714                 }
1715
1716                 iter->table = ip_vs_svc_fwm_table;
1717                 iter->bucket = -1;
1718                 goto scan_fwmark;
1719         }
1720
1721         /* next service in the table hashed by fwmark */
1722         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1723                 return list_entry(e, struct ip_vs_service, f_list);
1724
1725  scan_fwmark:
1726         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1727                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1728                                     f_list)
1729                         return svc;
1730         }
1731
1732         return NULL;
1733 }
1734
1735 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1736 {
1737         read_unlock_bh(&__ip_vs_svc_lock);
1738 }
1739
1740
1741 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1742 {
1743         if (v == SEQ_START_TOKEN) {
1744                 seq_printf(seq,
1745                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1746                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1747                 seq_puts(seq,
1748                          "Prot LocalAddress:Port Scheduler Flags\n");
1749                 seq_puts(seq,
1750                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1751         } else {
1752                 const struct ip_vs_service *svc = v;
1753                 const struct ip_vs_iter *iter = seq->private;
1754                 const struct ip_vs_dest *dest;
1755
1756                 if (iter->table == ip_vs_svc_table)
1757                         seq_printf(seq, "%s  %08X:%04X %s ",
1758                                    ip_vs_proto_name(svc->protocol),
1759                                    ntohl(svc->addr),
1760                                    ntohs(svc->port),
1761                                    svc->scheduler->name);
1762                 else
1763                         seq_printf(seq, "FWM  %08X %s ",
1764                                    svc->fwmark, svc->scheduler->name);
1765
1766                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1767                         seq_printf(seq, "persistent %d %08X\n",
1768                                 svc->timeout,
1769                                 ntohl(svc->netmask));
1770                 else
1771                         seq_putc(seq, '\n');
1772
1773                 list_for_each_entry(dest, &svc->destinations, n_list) {
1774                         seq_printf(seq,
1775                                    "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1776                                    ntohl(dest->addr), ntohs(dest->port),
1777                                    ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1778                                    atomic_read(&dest->weight),
1779                                    atomic_read(&dest->activeconns),
1780                                    atomic_read(&dest->inactconns));
1781                 }
1782         }
1783         return 0;
1784 }
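
/*
 * Illustrative /proc/net/ip_vs output produced by the show routine above
 * (addresses and ports are printed in hex; the values below are made-up
 * example values):
 *
 *      IP Virtual Server version 1.2.1 (size=4096)
 *      Prot LocalAddress:Port Scheduler Flags
 *        -> RemoteAddress:Port Forward Weight ActiveConn InActConn
 *      TCP  C0A80001:0050 wlc persistent 300 FFFFFF00
 *        -> 0A000001:0050      Masq    1      0          0
 */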
1785
1786 static struct seq_operations ip_vs_info_seq_ops = {
1787         .start = ip_vs_info_seq_start,
1788         .next  = ip_vs_info_seq_next,
1789         .stop  = ip_vs_info_seq_stop,
1790         .show  = ip_vs_info_seq_show,
1791 };
1792
1793 static int ip_vs_info_open(struct inode *inode, struct file *file)
1794 {
1795         struct seq_file *seq;
1796         int rc = -ENOMEM;
1797         struct ip_vs_iter *s = kzalloc(sizeof(*s), GFP_KERNEL);
1798
1799         if (!s)
1800                 goto out;
1801
1802         rc = seq_open(file, &ip_vs_info_seq_ops);
1803         if (rc)
1804                 goto out_kfree;
1805
1806         seq          = file->private_data;
1807         seq->private = s;
1808 out:
1809         return rc;
1810 out_kfree:
1811         kfree(s);
1812         goto out;
1813 }
1814
1815 static struct file_operations ip_vs_info_fops = {
1816         .owner   = THIS_MODULE,
1817         .open    = ip_vs_info_open,
1818         .read    = seq_read,
1819         .llseek  = seq_lseek,
1820         .release = seq_release_private,
1821 };
1822
1823 #endif
1824
1825 struct ip_vs_stats ip_vs_stats;
1826
1827 #ifdef CONFIG_PROC_FS
1828 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1829 {
1830
1831 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1832         seq_puts(seq,
1833                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1834         seq_puts(seq,
1835                  "   Conns  Packets  Packets            Bytes            Bytes\n");
1836
1837         spin_lock_bh(&ip_vs_stats.lock);
1838         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1839                    ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1840                    (unsigned long long) ip_vs_stats.inbytes,
1841                    (unsigned long long) ip_vs_stats.outbytes);
1842
1843 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1844         seq_puts(seq,
1845                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1846         seq_printf(seq, "%8X %8X %8X %16X %16X\n",
1847                         ip_vs_stats.cps,
1848                         ip_vs_stats.inpps,
1849                         ip_vs_stats.outpps,
1850                         ip_vs_stats.inbps,
1851                         ip_vs_stats.outbps);
1852         spin_unlock_bh(&ip_vs_stats.lock);
1853
1854         return 0;
1855 }
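
/*
 * Note that all of the counters above are printed in hexadecimal
 * ("%8X"/"%16LX").  Illustrative output with made-up values:
 *
 *         Total Incoming Outgoing         Incoming         Outgoing
 *         Conns  Packets  Packets            Bytes            Bytes
 *            1A      4B3      2C1             4E8F2            1D4C0
 */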
1856
1857 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1858 {
1859         return single_open(file, ip_vs_stats_show, NULL);
1860 }
1861
1862 static struct file_operations ip_vs_stats_fops = {
1863         .owner = THIS_MODULE,
1864         .open = ip_vs_stats_seq_open,
1865         .read = seq_read,
1866         .llseek = seq_lseek,
1867         .release = single_release,
1868 };
1869
1870 #endif
1871
1872 /*
1873  *      Set timeout values for tcp, tcpfin and udp in the timeout_table.
1874  */
1875 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1876 {
1877         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1878                   u->tcp_timeout,
1879                   u->tcp_fin_timeout,
1880                   u->udp_timeout);
1881
1882 #ifdef CONFIG_IP_VS_PROTO_TCP
1883         if (u->tcp_timeout) {
1884                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1885                         = u->tcp_timeout * HZ;
1886         }
1887
1888         if (u->tcp_fin_timeout) {
1889                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1890                         = u->tcp_fin_timeout * HZ;
1891         }
1892 #endif
1893
1894 #ifdef CONFIG_IP_VS_PROTO_UDP
1895         if (u->udp_timeout) {
1896                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1897                         = u->udp_timeout * HZ;
1898         }
1899 #endif
1900         return 0;
1901 }
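
/*
 * The values are given in seconds and, as the checks above show, a value of
 * zero leaves the corresponding setting untouched.  Userspace reaches this
 * through IP_VS_SO_SET_TIMEOUT; e.g. roughly "ipvsadm --set 900 120 300"
 * for tcp/tcpfin/udp (the exact tool invocation is illustrative only).
 */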
1902
1903
1904 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
1905 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
1906 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
1907                                  sizeof(struct ip_vs_dest_user))
1908 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
1909 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
1910 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
1911
1912 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1913         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
1914         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
1915         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
1916         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
1917         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
1918         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
1919         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
1920         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
1921         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
1922         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
1923         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
1924 };
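
/*
 * Rough sketch of how userspace issues these commands (illustrative only,
 * error handling omitted; ipvsadm-style tools do essentially this):
 *
 *      struct ip_vs_service_user svc = { ... };
 *      int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *
 *      setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_ADD, &svc, sizeof(svc));
 *
 * len is checked against set_arglen[] in do_ip_vs_set_ctl() below before
 * the argument block is copied in from userspace.
 */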
1925
1926 static int
1927 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1928 {
1929         int ret;
1930         unsigned char arg[MAX_ARG_LEN];
1931         struct ip_vs_service_user *usvc;
1932         struct ip_vs_service *svc;
1933         struct ip_vs_dest_user *udest;
1934
1935         if (!capable(CAP_NET_ADMIN))
1936                 return -EPERM;
1937
1938         if (len != set_arglen[SET_CMDID(cmd)]) {
1939                 IP_VS_ERR("set_ctl: len %u != %u\n",
1940                           len, set_arglen[SET_CMDID(cmd)]);
1941                 return -EINVAL;
1942         }
1943
1944         if (copy_from_user(arg, user, len) != 0)
1945                 return -EFAULT;
1946
1947         /* increase the module use count */
1948         ip_vs_use_count_inc();
1949
1950         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
1951                 ret = -ERESTARTSYS;
1952                 goto out_dec;
1953         }
1954
1955         if (cmd == IP_VS_SO_SET_FLUSH) {
1956                 /* Flush the virtual service */
1957                 ret = ip_vs_flush();
1958                 goto out_unlock;
1959         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1960                 /* Set timeout values for (tcp tcpfin udp) */
1961                 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1962                 goto out_unlock;
1963         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1964                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1965                 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1966                 goto out_unlock;
1967         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1968                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1969                 ret = stop_sync_thread(dm->state);
1970                 goto out_unlock;
1971         }
1972
1973         usvc = (struct ip_vs_service_user *)arg;
1974         udest = (struct ip_vs_dest_user *)(usvc + 1);
1975
1976         if (cmd == IP_VS_SO_SET_ZERO) {
1977                 /* if no service address is set, zero the counters in all services */
1978                 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1979                         ret = ip_vs_zero_all();
1980                         goto out_unlock;
1981                 }
1982         }
1983
1984         /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1985         if (usvc->protocol != IPPROTO_TCP && usvc->protocol != IPPROTO_UDP) {
1986                 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1987                           usvc->protocol, NIPQUAD(usvc->addr),
1988                           ntohs(usvc->port), usvc->sched_name);
1989                 ret = -EFAULT;
1990                 goto out_unlock;
1991         }
1992
1993         /* Lookup the exact service by <protocol, addr, port> or fwmark */
1994         if (usvc->fwmark == 0)
1995                 svc = __ip_vs_service_get(usvc->protocol,
1996                                           usvc->addr, usvc->port);
1997         else
1998                 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1999
2000         if (cmd != IP_VS_SO_SET_ADD
2001             && (svc == NULL || svc->protocol != usvc->protocol)) {
2002                 ret = -ESRCH;
2003                 goto out_unlock;
2004         }
2005
2006         switch (cmd) {
2007         case IP_VS_SO_SET_ADD:
2008                 if (svc != NULL)
2009                         ret = -EEXIST;
2010                 else
2011                         ret = ip_vs_add_service(usvc, &svc);
2012                 break;
2013         case IP_VS_SO_SET_EDIT:
2014                 ret = ip_vs_edit_service(svc, usvc);
2015                 break;
2016         case IP_VS_SO_SET_DEL:
2017                 ret = ip_vs_del_service(svc);
2018                 if (!ret)
2019                         goto out_unlock;
2020                 break;
2021         case IP_VS_SO_SET_ZERO:
2022                 ret = ip_vs_zero_service(svc);
2023                 break;
2024         case IP_VS_SO_SET_ADDDEST:
2025                 ret = ip_vs_add_dest(svc, udest);
2026                 break;
2027         case IP_VS_SO_SET_EDITDEST:
2028                 ret = ip_vs_edit_dest(svc, udest);
2029                 break;
2030         case IP_VS_SO_SET_DELDEST:
2031                 ret = ip_vs_del_dest(svc, udest);
2032                 break;
2033         default:
2034                 ret = -EINVAL;
2035         }
2036
2037         if (svc)
2038                 ip_vs_service_put(svc);
2039
2040   out_unlock:
2041         mutex_unlock(&__ip_vs_mutex);
2042   out_dec:
2043         /* decrease the module use count */
2044         ip_vs_use_count_dec();
2045
2046         return ret;
2047 }
2048
2049
2050 static void
2051 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2052 {
2053         spin_lock_bh(&src->lock);
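        /*
         * Copy only the counter fields laid out before ->lock in
         * struct ip_vs_stats; this relies on the spinlock being declared
         * after all of the exported counters in that structure.
         */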
2054         memcpy(dst, src, (char*)&src->lock - (char*)src);
2055         spin_unlock_bh(&src->lock);
2056 }
2057
2058 static void
2059 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2060 {
2061         dst->protocol = src->protocol;
2062         dst->addr = src->addr;
2063         dst->port = src->port;
2064         dst->fwmark = src->fwmark;
2065         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2066         dst->flags = src->flags;
2067         dst->timeout = src->timeout / HZ;
2068         dst->netmask = src->netmask;
2069         dst->num_dests = src->num_dests;
2070         ip_vs_copy_stats(&dst->stats, &src->stats);
2071 }
2072
2073 static inline int
2074 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2075                             struct ip_vs_get_services __user *uptr)
2076 {
2077         int idx, count=0;
2078         struct ip_vs_service *svc;
2079         struct ip_vs_service_entry entry;
2080         int ret = 0;
2081
2082         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2083                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2084                         if (count >= get->num_services)
2085                                 goto out;
2086                         memset(&entry, 0, sizeof(entry));
2087                         ip_vs_copy_service(&entry, svc);
2088                         if (copy_to_user(&uptr->entrytable[count],
2089                                          &entry, sizeof(entry))) {
2090                                 ret = -EFAULT;
2091                                 goto out;
2092                         }
2093                         count++;
2094                 }
2095         }
2096
2097         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2098                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2099                         if (count >= get->num_services)
2100                                 goto out;
2101                         memset(&entry, 0, sizeof(entry));
2102                         ip_vs_copy_service(&entry, svc);
2103                         if (copy_to_user(&uptr->entrytable[count],
2104                                          &entry, sizeof(entry))) {
2105                                 ret = -EFAULT;
2106                                 goto out;
2107                         }
2108                         count++;
2109                 }
2110         }
2111   out:
2112         return ret;
2113 }
2114
2115 static inline int
2116 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2117                          struct ip_vs_get_dests __user *uptr)
2118 {
2119         struct ip_vs_service *svc;
2120         int ret = 0;
2121
2122         if (get->fwmark)
2123                 svc = __ip_vs_svc_fwm_get(get->fwmark);
2124         else
2125                 svc = __ip_vs_service_get(get->protocol,
2126                                           get->addr, get->port);
2127         if (svc) {
2128                 int count = 0;
2129                 struct ip_vs_dest *dest;
2130                 struct ip_vs_dest_entry entry;
2131
2132                 list_for_each_entry(dest, &svc->destinations, n_list) {
2133                         if (count >= get->num_dests)
2134                                 break;
2135
                        /* zero the entry so that structure padding is not
                         * copied to userspace uninitialised */
                        memset(&entry, 0, sizeof(entry));
2136                         entry.addr = dest->addr;
2137                         entry.port = dest->port;
2138                         entry.conn_flags = atomic_read(&dest->conn_flags);
2139                         entry.weight = atomic_read(&dest->weight);
2140                         entry.u_threshold = dest->u_threshold;
2141                         entry.l_threshold = dest->l_threshold;
2142                         entry.activeconns = atomic_read(&dest->activeconns);
2143                         entry.inactconns = atomic_read(&dest->inactconns);
2144                         entry.persistconns = atomic_read(&dest->persistconns);
2145                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2146                         if (copy_to_user(&uptr->entrytable[count],
2147                                          &entry, sizeof(entry))) {
2148                                 ret = -EFAULT;
2149                                 break;
2150                         }
2151                         count++;
2152                 }
2153                 ip_vs_service_put(svc);
2154         } else
2155                 ret = -ESRCH;
2156         return ret;
2157 }
2158
2159 static inline void
2160 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2161 {
2162 #ifdef CONFIG_IP_VS_PROTO_TCP
2163         u->tcp_timeout =
2164                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2165         u->tcp_fin_timeout =
2166                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2167 #endif
2168 #ifdef CONFIG_IP_VS_PROTO_UDP
2169         u->udp_timeout =
2170                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2171 #endif
2172 }
2173
2174
2175 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2176 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2177 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2178 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2179 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2180 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2181 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2182
2183 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2184         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2185         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2186         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2187         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2188         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2189         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2190         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2191 };
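
/*
 * Rough userspace sketch for the get side (illustrative only):
 *
 *      struct ip_vs_getinfo info;
 *      socklen_t len = sizeof(info);
 *
 *      getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_INFO, &info, &len);
 *
 * *len must be at least get_arglen[] for the command; for the variable
 * sized GET_SERVICES and GET_DESTS replies it must match the full reply
 * size computed in do_ip_vs_get_ctl() below.
 */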
2192
2193 static int
2194 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2195 {
2196         unsigned char arg[128];
2197         int ret = 0;
2198
2199         if (!capable(CAP_NET_ADMIN))
2200                 return -EPERM;
2201
2202         if (*len < get_arglen[GET_CMDID(cmd)]) {
2203                 IP_VS_ERR("get_ctl: len %u < %u\n",
2204                           *len, get_arglen[GET_CMDID(cmd)]);
2205                 return -EINVAL;
2206         }
2207
2208         if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2209                 return -EFAULT;
2210
2211         if (mutex_lock_interruptible(&__ip_vs_mutex))
2212                 return -ERESTARTSYS;
2213
2214         switch (cmd) {
2215         case IP_VS_SO_GET_VERSION:
2216         {
2217                 char buf[64];
2218
2219                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2220                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2221                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2222                         ret = -EFAULT;
2223                         goto out;
2224                 }
2225                 *len = strlen(buf)+1;
2226         }
2227         break;
2228
2229         case IP_VS_SO_GET_INFO:
2230         {
2231                 struct ip_vs_getinfo info;
2232                 info.version = IP_VS_VERSION_CODE;
2233                 info.size = IP_VS_CONN_TAB_SIZE;
2234                 info.num_services = ip_vs_num_services;
2235                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2236                         ret = -EFAULT;
2237         }
2238         break;
2239
2240         case IP_VS_SO_GET_SERVICES:
2241         {
2242                 struct ip_vs_get_services *get;
2243                 int size;
2244
2245                 get = (struct ip_vs_get_services *)arg;
2246                 size = sizeof(*get) +
2247                         sizeof(struct ip_vs_service_entry) * get->num_services;
2248                 if (*len != size) {
2249                         IP_VS_ERR("length: %u != %u\n", *len, size);
2250                         ret = -EINVAL;
2251                         goto out;
2252                 }
2253                 ret = __ip_vs_get_service_entries(get, user);
2254         }
2255         break;
2256
2257         case IP_VS_SO_GET_SERVICE:
2258         {
2259                 struct ip_vs_service_entry *entry;
2260                 struct ip_vs_service *svc;
2261
2262                 entry = (struct ip_vs_service_entry *)arg;
2263                 if (entry->fwmark)
2264                         svc = __ip_vs_svc_fwm_get(entry->fwmark);
2265                 else
2266                         svc = __ip_vs_service_get(entry->protocol,
2267                                                   entry->addr, entry->port);
2268                 if (svc) {
2269                         ip_vs_copy_service(entry, svc);
2270                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2271                                 ret = -EFAULT;
2272                         ip_vs_service_put(svc);
2273                 } else
2274                         ret = -ESRCH;
2275         }
2276         break;
2277
2278         case IP_VS_SO_GET_DESTS:
2279         {
2280                 struct ip_vs_get_dests *get;
2281                 int size;
2282
2283                 get = (struct ip_vs_get_dests *)arg;
2284                 size = sizeof(*get) +
2285                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2286                 if (*len != size) {
2287                         IP_VS_ERR("length: %u != %u\n", *len, size);
2288                         ret = -EINVAL;
2289                         goto out;
2290                 }
2291                 ret = __ip_vs_get_dest_entries(get, user);
2292         }
2293         break;
2294
2295         case IP_VS_SO_GET_TIMEOUT:
2296         {
2297                 struct ip_vs_timeout_user t;
2298
2299                 __ip_vs_get_timeouts(&t);
2300                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2301                         ret = -EFAULT;
2302         }
2303         break;
2304
2305         case IP_VS_SO_GET_DAEMON:
2306         {
2307                 struct ip_vs_daemon_user d[2];
2308
2309                 memset(&d, 0, sizeof(d));
2310                 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2311                         d[0].state = IP_VS_STATE_MASTER;
2312                         strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2313                         d[0].syncid = ip_vs_master_syncid;
2314                 }
2315                 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2316                         d[1].state = IP_VS_STATE_BACKUP;
2317                         strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2318                         d[1].syncid = ip_vs_backup_syncid;
2319                 }
2320                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2321                         ret = -EFAULT;
2322         }
2323         break;
2324
2325         default:
2326                 ret = -EINVAL;
2327         }
2328
2329   out:
2330         mutex_unlock(&__ip_vs_mutex);
2331         return ret;
2332 }
2333
2334
2335 static struct nf_sockopt_ops ip_vs_sockopts = {
2336         .pf             = PF_INET,
2337         .set_optmin     = IP_VS_BASE_CTL,
2338         .set_optmax     = IP_VS_SO_SET_MAX+1,
2339         .set            = do_ip_vs_set_ctl,
2340         .get_optmin     = IP_VS_BASE_CTL,
2341         .get_optmax     = IP_VS_SO_GET_MAX+1,
2342         .get            = do_ip_vs_get_ctl,
2343 };
2344
2345
2346 int ip_vs_control_init(void)
2347 {
2348         int ret;
2349         int idx;
2350
2351         EnterFunction(2);
2352
2353         ret = nf_register_sockopt(&ip_vs_sockopts);
2354         if (ret) {
2355                 IP_VS_ERR("cannot register sockopt.\n");
2356                 return ret;
2357         }
2358
2359         proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2360         proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2361
2362         sysctl_header = register_sysctl_table(vs_root_table, 0);
2363
2364         /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2365         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
2366                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2367                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2368         }
2369         for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
2370                 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2371         }
2372
2373         memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2374         spin_lock_init(&ip_vs_stats.lock);
2375         ip_vs_new_estimator(&ip_vs_stats);
2376
2377         /* Hook the defense timer */
2378         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2379
2380         LeaveFunction(2);
2381         return 0;
2382 }
2383
2384
2385 void ip_vs_control_cleanup(void)
2386 {
2387         EnterFunction(2);
2388         ip_vs_trash_cleanup();
2389         cancel_rearming_delayed_work(&defense_work);
2390         ip_vs_kill_estimator(&ip_vs_stats);
2391         unregister_sysctl_table(sysctl_header);
2392         proc_net_remove("ip_vs_stats");
2393         proc_net_remove("ip_vs");
2394         nf_unregister_sockopt(&ip_vs_sockopts);
2395         LeaveFunction(2);
2396 }