/home/lenb/src/to-linus branch 'acpi-2.6.12'
[linux-2.6] / net / ipv4 / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9  *
10  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11  *              Peter Kese <peter.kese@ijs.si>
12  *              Julian Anastasov <ja@ssi.bg>
13  *
14  *              This program is free software; you can redistribute it and/or
15  *              modify it under the terms of the GNU General Public License
16  *              as published by the Free Software Foundation; either version
17  *              2 of the License, or (at your option) any later version.
18  *
19  * Changes:
20  *
21  */
22
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
26 #include <linux/fs.h>
27 #include <linux/sysctl.h>
28 #include <linux/proc_fs.h>
29 #include <linux/workqueue.h>
30 #include <linux/swap.h>
31 #include <linux/proc_fs.h>
32 #include <linux/seq_file.h>
33
34 #include <linux/netfilter.h>
35 #include <linux/netfilter_ipv4.h>
36
37 #include <net/ip.h>
38 #include <net/sock.h>
39
40 #include <asm/uaccess.h>
41
42 #include <net/ip_vs.h>
43
44 /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
45 static DECLARE_MUTEX(__ip_vs_mutex);
46
47 /* lock for service table */
48 static DEFINE_RWLOCK(__ip_vs_svc_lock);
49
50 /* lock for table with the real services */
51 static DEFINE_RWLOCK(__ip_vs_rs_lock);
52
53 /* lock for state and timeout tables */
54 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
55
56 /* lock for drop entry handling */
57 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
58
59 /* lock for drop packet handling */
60 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
61
62 /* 1/rate drop and drop-entry variables */
63 int ip_vs_drop_rate = 0;
64 int ip_vs_drop_counter = 0;
65 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
66
67 /* number of virtual services */
68 static int ip_vs_num_services = 0;
69
70 /* sysctl variables */
71 static int sysctl_ip_vs_drop_entry = 0;
72 static int sysctl_ip_vs_drop_packet = 0;
73 static int sysctl_ip_vs_secure_tcp = 0;
74 static int sysctl_ip_vs_amemthresh = 1024;
75 static int sysctl_ip_vs_am_droprate = 10;
76 int sysctl_ip_vs_cache_bypass = 0;
77 int sysctl_ip_vs_expire_nodest_conn = 0;
78 int sysctl_ip_vs_expire_quiescent_template = 0;
79 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
80 int sysctl_ip_vs_nat_icmp_send = 0;
81
82
#ifdef CONFIG_IP_VS_DEBUG
/* Current debug verbosity; starts at zero. */
static int sysctl_ip_vs_debug_level;

/*
 *	Report the current IPVS debug level.
 *	Only built when CONFIG_IP_VS_DEBUG is set.
 */
int ip_vs_get_debug_level(void)
{
	int level = sysctl_ip_vs_debug_level;

	return level;
}
#endif
91
92 /*
93  *      update_defense_level is called from keventd and from sysctl,
94  *      so it needs to protect itself from softirqs
95  */
96 static void update_defense_level(void)
97 {
98         struct sysinfo i;
99         static int old_secure_tcp = 0;
100         int availmem;
101         int nomem;
102         int to_change = -1;
103
104         /* we only count free and buffered memory (in pages) */
105         si_meminfo(&i);
106         availmem = i.freeram + i.bufferram;
107         /* however in linux 2.5 the i.bufferram is total page cache size,
108            we need adjust it */
109         /* si_swapinfo(&i); */
110         /* availmem = availmem - (i.totalswap - i.freeswap); */
111
112         nomem = (availmem < sysctl_ip_vs_amemthresh);
113
114         local_bh_disable();
115
116         /* drop_entry */
117         spin_lock(&__ip_vs_dropentry_lock);
118         switch (sysctl_ip_vs_drop_entry) {
119         case 0:
120                 atomic_set(&ip_vs_dropentry, 0);
121                 break;
122         case 1:
123                 if (nomem) {
124                         atomic_set(&ip_vs_dropentry, 1);
125                         sysctl_ip_vs_drop_entry = 2;
126                 } else {
127                         atomic_set(&ip_vs_dropentry, 0);
128                 }
129                 break;
130         case 2:
131                 if (nomem) {
132                         atomic_set(&ip_vs_dropentry, 1);
133                 } else {
134                         atomic_set(&ip_vs_dropentry, 0);
135                         sysctl_ip_vs_drop_entry = 1;
136                 };
137                 break;
138         case 3:
139                 atomic_set(&ip_vs_dropentry, 1);
140                 break;
141         }
142         spin_unlock(&__ip_vs_dropentry_lock);
143
144         /* drop_packet */
145         spin_lock(&__ip_vs_droppacket_lock);
146         switch (sysctl_ip_vs_drop_packet) {
147         case 0:
148                 ip_vs_drop_rate = 0;
149                 break;
150         case 1:
151                 if (nomem) {
152                         ip_vs_drop_rate = ip_vs_drop_counter
153                                 = sysctl_ip_vs_amemthresh /
154                                 (sysctl_ip_vs_amemthresh-availmem);
155                         sysctl_ip_vs_drop_packet = 2;
156                 } else {
157                         ip_vs_drop_rate = 0;
158                 }
159                 break;
160         case 2:
161                 if (nomem) {
162                         ip_vs_drop_rate = ip_vs_drop_counter
163                                 = sysctl_ip_vs_amemthresh /
164                                 (sysctl_ip_vs_amemthresh-availmem);
165                 } else {
166                         ip_vs_drop_rate = 0;
167                         sysctl_ip_vs_drop_packet = 1;
168                 }
169                 break;
170         case 3:
171                 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
172                 break;
173         }
174         spin_unlock(&__ip_vs_droppacket_lock);
175
176         /* secure_tcp */
177         write_lock(&__ip_vs_securetcp_lock);
178         switch (sysctl_ip_vs_secure_tcp) {
179         case 0:
180                 if (old_secure_tcp >= 2)
181                         to_change = 0;
182                 break;
183         case 1:
184                 if (nomem) {
185                         if (old_secure_tcp < 2)
186                                 to_change = 1;
187                         sysctl_ip_vs_secure_tcp = 2;
188                 } else {
189                         if (old_secure_tcp >= 2)
190                                 to_change = 0;
191                 }
192                 break;
193         case 2:
194                 if (nomem) {
195                         if (old_secure_tcp < 2)
196                                 to_change = 1;
197                 } else {
198                         if (old_secure_tcp >= 2)
199                                 to_change = 0;
200                         sysctl_ip_vs_secure_tcp = 1;
201                 }
202                 break;
203         case 3:
204                 if (old_secure_tcp < 2)
205                         to_change = 1;
206                 break;
207         }
208         old_secure_tcp = sysctl_ip_vs_secure_tcp;
209         if (to_change >= 0)
210                 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
211         write_unlock(&__ip_vs_securetcp_lock);
212
213         local_bh_enable();
214 }
215
216
217 /*
218  *      Timer for checking the defense
219  */
220 #define DEFENSE_TIMER_PERIOD    1*HZ
221 static void defense_work_handler(void *data);
222 static DECLARE_WORK(defense_work, defense_work_handler, NULL);
223
224 static void defense_work_handler(void *data)
225 {
226         update_defense_level();
227         if (atomic_read(&ip_vs_dropentry))
228                 ip_vs_random_dropentry();
229
230         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
231 }
232
233 int
234 ip_vs_use_count_inc(void)
235 {
236         return try_module_get(THIS_MODULE);
237 }
238
239 void
240 ip_vs_use_count_dec(void)
241 {
242         module_put(THIS_MODULE);
243 }
244
245
246 /*
247  *      Hash table: for virtual service lookups
248  */
249 #define IP_VS_SVC_TAB_BITS 8
250 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
251 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
252
253 /* the service table hashed by <protocol, addr, port> */
254 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
255 /* the service table hashed by fwmark */
256 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
257
258 /*
259  *      Hash table: for real service lookups
260  */
261 #define IP_VS_RTAB_BITS 4
262 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
263 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
264
265 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
266
267 /*
268  *      Trash for destinations
269  */
270 static LIST_HEAD(ip_vs_dest_trash);
271
272 /*
273  *      FTP & NULL virtual service counters
274  */
275 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
276 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
277
278
279 /*
280  *      Returns hash value for virtual service
281  */
282 static __inline__ unsigned
283 ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
284 {
285         register unsigned porth = ntohs(port);
286
287         return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
288                 & IP_VS_SVC_TAB_MASK;
289 }
290
291 /*
292  *      Returns hash value of fwmark for virtual service lookup
293  */
294 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
295 {
296         return fwmark & IP_VS_SVC_TAB_MASK;
297 }
298
299 /*
300  *      Hashes a service in the ip_vs_svc_table by <proto,addr,port>
301  *      or in the ip_vs_svc_fwm_table by fwmark.
302  *      Should be called with locked tables.
303  */
304 static int ip_vs_svc_hash(struct ip_vs_service *svc)
305 {
306         unsigned hash;
307
308         if (svc->flags & IP_VS_SVC_F_HASHED) {
309                 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
310                           "called from %p\n", __builtin_return_address(0));
311                 return 0;
312         }
313
314         if (svc->fwmark == 0) {
315                 /*
316                  *  Hash it by <protocol,addr,port> in ip_vs_svc_table
317                  */
318                 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
319                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
320         } else {
321                 /*
322                  *  Hash it by fwmark in ip_vs_svc_fwm_table
323                  */
324                 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
325                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
326         }
327
328         svc->flags |= IP_VS_SVC_F_HASHED;
329         /* increase its refcnt because it is referenced by the svc table */
330         atomic_inc(&svc->refcnt);
331         return 1;
332 }
333
334
335 /*
336  *      Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
337  *      Should be called with locked tables.
338  */
339 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
340 {
341         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
342                 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
343                           "called from %p\n", __builtin_return_address(0));
344                 return 0;
345         }
346
347         if (svc->fwmark == 0) {
348                 /* Remove it from the ip_vs_svc_table table */
349                 list_del(&svc->s_list);
350         } else {
351                 /* Remove it from the ip_vs_svc_fwm_table table */
352                 list_del(&svc->f_list);
353         }
354
355         svc->flags &= ~IP_VS_SVC_F_HASHED;
356         atomic_dec(&svc->refcnt);
357         return 1;
358 }
359
360
361 /*
362  *      Get service by {proto,addr,port} in the service table.
363  */
364 static __inline__ struct ip_vs_service *
365 __ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
366 {
367         unsigned hash;
368         struct ip_vs_service *svc;
369
370         /* Check for "full" addressed entries */
371         hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
372
373         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
374                 if ((svc->addr == vaddr)
375                     && (svc->port == vport)
376                     && (svc->protocol == protocol)) {
377                         /* HIT */
378                         atomic_inc(&svc->usecnt);
379                         return svc;
380                 }
381         }
382
383         return NULL;
384 }
385
386
387 /*
388  *      Get service by {fwmark} in the service table.
389  */
390 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
391 {
392         unsigned hash;
393         struct ip_vs_service *svc;
394
395         /* Check for fwmark addressed entries */
396         hash = ip_vs_svc_fwm_hashkey(fwmark);
397
398         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
399                 if (svc->fwmark == fwmark) {
400                         /* HIT */
401                         atomic_inc(&svc->usecnt);
402                         return svc;
403                 }
404         }
405
406         return NULL;
407 }
408
409 struct ip_vs_service *
410 ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
411 {
412         struct ip_vs_service *svc;
413
414         read_lock(&__ip_vs_svc_lock);
415
416         /*
417          *      Check the table hashed by fwmark first
418          */
419         if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
420                 goto out;
421
422         /*
423          *      Check the table hashed by <protocol,addr,port>
424          *      for "full" addressed entries
425          */
426         svc = __ip_vs_service_get(protocol, vaddr, vport);
427
428         if (svc == NULL
429             && protocol == IPPROTO_TCP
430             && atomic_read(&ip_vs_ftpsvc_counter)
431             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
432                 /*
433                  * Check if ftp service entry exists, the packet
434                  * might belong to FTP data connections.
435                  */
436                 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
437         }
438
439         if (svc == NULL
440             && atomic_read(&ip_vs_nullsvc_counter)) {
441                 /*
442                  * Check if the catch-all port (port zero) exists
443                  */
444                 svc = __ip_vs_service_get(protocol, vaddr, 0);
445         }
446
447   out:
448         read_unlock(&__ip_vs_svc_lock);
449
450         IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
451                   fwmark, ip_vs_proto_name(protocol),
452                   NIPQUAD(vaddr), ntohs(vport),
453                   svc?"hit":"not hit");
454
455         return svc;
456 }
457
458
459 static inline void
460 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
461 {
462         atomic_inc(&svc->refcnt);
463         dest->svc = svc;
464 }
465
466 static inline void
467 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
468 {
469         struct ip_vs_service *svc = dest->svc;
470
471         dest->svc = NULL;
472         if (atomic_dec_and_test(&svc->refcnt))
473                 kfree(svc);
474 }
475
476
477 /*
478  *      Returns hash value for real service
479  */
480 static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
481 {
482         register unsigned porth = ntohs(port);
483
484         return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
485                 & IP_VS_RTAB_MASK;
486 }
487
488 /*
489  *      Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
490  *      should be called with locked tables.
491  */
492 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
493 {
494         unsigned hash;
495
496         if (!list_empty(&dest->d_list)) {
497                 return 0;
498         }
499
500         /*
501          *      Hash by proto,addr,port,
502          *      which are the parameters of the real service.
503          */
504         hash = ip_vs_rs_hashkey(dest->addr, dest->port);
505         list_add(&dest->d_list, &ip_vs_rtable[hash]);
506
507         return 1;
508 }
509
510 /*
511  *      UNhashes ip_vs_dest from ip_vs_rtable.
512  *      should be called with locked tables.
513  */
514 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
515 {
516         /*
517          * Remove it from the ip_vs_rtable table.
518          */
519         if (!list_empty(&dest->d_list)) {
520                 list_del(&dest->d_list);
521                 INIT_LIST_HEAD(&dest->d_list);
522         }
523
524         return 1;
525 }
526
527 /*
528  *      Lookup real service by <proto,addr,port> in the real service table.
529  */
530 struct ip_vs_dest *
531 ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
532 {
533         unsigned hash;
534         struct ip_vs_dest *dest;
535
536         /*
537          *      Check for "full" addressed entries
538          *      Return the first found entry
539          */
540         hash = ip_vs_rs_hashkey(daddr, dport);
541
542         read_lock(&__ip_vs_rs_lock);
543         list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
544                 if ((dest->addr == daddr)
545                     && (dest->port == dport)
546                     && ((dest->protocol == protocol) ||
547                         dest->vfwmark)) {
548                         /* HIT */
549                         read_unlock(&__ip_vs_rs_lock);
550                         return dest;
551                 }
552         }
553         read_unlock(&__ip_vs_rs_lock);
554
555         return NULL;
556 }
557
558 /*
559  *      Lookup destination by {addr,port} in the given service
560  */
561 static struct ip_vs_dest *
562 ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
563 {
564         struct ip_vs_dest *dest;
565
566         /*
567          * Find the destination for the given service
568          */
569         list_for_each_entry(dest, &svc->destinations, n_list) {
570                 if ((dest->addr == daddr) && (dest->port == dport)) {
571                         /* HIT */
572                         return dest;
573                 }
574         }
575
576         return NULL;
577 }
578
579
580 /*
581  *  Lookup dest by {svc,addr,port} in the destination trash.
582  *  The destination trash is used to hold the destinations that are removed
583  *  from the service table but are still referenced by some conn entries.
584  *  The reason to add the destination trash is when the dest is temporary
585  *  down (either by administrator or by monitor program), the dest can be
586  *  picked back from the trash, the remaining connections to the dest can
587  *  continue, and the counting information of the dest is also useful for
588  *  scheduling.
589  */
590 static struct ip_vs_dest *
591 ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
592 {
593         struct ip_vs_dest *dest, *nxt;
594
595         /*
596          * Find the destination in trash
597          */
598         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
599                 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
600                           "refcnt=%d\n",
601                           dest->vfwmark,
602                           NIPQUAD(dest->addr), ntohs(dest->port),
603                           atomic_read(&dest->refcnt));
604                 if (dest->addr == daddr &&
605                     dest->port == dport &&
606                     dest->vfwmark == svc->fwmark &&
607                     dest->protocol == svc->protocol &&
608                     (svc->fwmark ||
609                      (dest->vaddr == svc->addr &&
610                       dest->vport == svc->port))) {
611                         /* HIT */
612                         return dest;
613                 }
614
615                 /*
616                  * Try to purge the destination from trash if not referenced
617                  */
618                 if (atomic_read(&dest->refcnt) == 1) {
619                         IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
620                                   "from trash\n",
621                                   dest->vfwmark,
622                                   NIPQUAD(dest->addr), ntohs(dest->port));
623                         list_del(&dest->n_list);
624                         ip_vs_dst_reset(dest);
625                         __ip_vs_unbind_svc(dest);
626                         kfree(dest);
627                 }
628         }
629
630         return NULL;
631 }
632
633
634 /*
635  *  Clean up all the destinations in the trash
636  *  Called by the ip_vs_control_cleanup()
637  *
638  *  When the ip_vs_control_clearup is activated by ipvs module exit,
639  *  the service tables must have been flushed and all the connections
640  *  are expired, and the refcnt of each destination in the trash must
641  *  be 1, so we simply release them here.
642  */
643 static void ip_vs_trash_cleanup(void)
644 {
645         struct ip_vs_dest *dest, *nxt;
646
647         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
648                 list_del(&dest->n_list);
649                 ip_vs_dst_reset(dest);
650                 __ip_vs_unbind_svc(dest);
651                 kfree(dest);
652         }
653 }
654
655
656 static void
657 ip_vs_zero_stats(struct ip_vs_stats *stats)
658 {
659         spin_lock_bh(&stats->lock);
660         memset(stats, 0, (char *)&stats->lock - (char *)stats);
661         spin_unlock_bh(&stats->lock);
662         ip_vs_zero_estimator(stats);
663 }
664
665 /*
666  *      Update a destination in the given service
667  */
668 static void
669 __ip_vs_update_dest(struct ip_vs_service *svc,
670                     struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
671 {
672         int conn_flags;
673
674         /* set the weight and the flags */
675         atomic_set(&dest->weight, udest->weight);
676         conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
677
678         /* check if local node and update the flags */
679         if (inet_addr_type(udest->addr) == RTN_LOCAL) {
680                 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
681                         | IP_VS_CONN_F_LOCALNODE;
682         }
683
684         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
685         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
686                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
687         } else {
688                 /*
689                  *    Put the real service in ip_vs_rtable if not present.
690                  *    For now only for NAT!
691                  */
692                 write_lock_bh(&__ip_vs_rs_lock);
693                 ip_vs_rs_hash(dest);
694                 write_unlock_bh(&__ip_vs_rs_lock);
695         }
696         atomic_set(&dest->conn_flags, conn_flags);
697
698         /* bind the service */
699         if (!dest->svc) {
700                 __ip_vs_bind_svc(dest, svc);
701         } else {
702                 if (dest->svc != svc) {
703                         __ip_vs_unbind_svc(dest);
704                         ip_vs_zero_stats(&dest->stats);
705                         __ip_vs_bind_svc(dest, svc);
706                 }
707         }
708
709         /* set the dest status flags */
710         dest->flags |= IP_VS_DEST_F_AVAILABLE;
711
712         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
713                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
714         dest->u_threshold = udest->u_threshold;
715         dest->l_threshold = udest->l_threshold;
716 }
717
718
719 /*
720  *      Create a destination for the given service
721  */
722 static int
723 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
724                struct ip_vs_dest **dest_p)
725 {
726         struct ip_vs_dest *dest;
727         unsigned atype;
728
729         EnterFunction(2);
730
731         atype = inet_addr_type(udest->addr);
732         if (atype != RTN_LOCAL && atype != RTN_UNICAST)
733                 return -EINVAL;
734
735         dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
736         if (dest == NULL) {
737                 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
738                 return -ENOMEM;
739         }
740         memset(dest, 0, sizeof(struct ip_vs_dest));
741
742         dest->protocol = svc->protocol;
743         dest->vaddr = svc->addr;
744         dest->vport = svc->port;
745         dest->vfwmark = svc->fwmark;
746         dest->addr = udest->addr;
747         dest->port = udest->port;
748
749         atomic_set(&dest->activeconns, 0);
750         atomic_set(&dest->inactconns, 0);
751         atomic_set(&dest->persistconns, 0);
752         atomic_set(&dest->refcnt, 0);
753
754         INIT_LIST_HEAD(&dest->d_list);
755         spin_lock_init(&dest->dst_lock);
756         spin_lock_init(&dest->stats.lock);
757         __ip_vs_update_dest(svc, dest, udest);
758         ip_vs_new_estimator(&dest->stats);
759
760         *dest_p = dest;
761
762         LeaveFunction(2);
763         return 0;
764 }
765
766
767 /*
768  *      Add a destination into an existing service
769  */
770 static int
771 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
772 {
773         struct ip_vs_dest *dest;
774         __u32 daddr = udest->addr;
775         __u16 dport = udest->port;
776         int ret;
777
778         EnterFunction(2);
779
780         if (udest->weight < 0) {
781                 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
782                 return -ERANGE;
783         }
784
785         if (udest->l_threshold > udest->u_threshold) {
786                 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
787                           "upper threshold\n");
788                 return -ERANGE;
789         }
790
791         /*
792          * Check if the dest already exists in the list
793          */
794         dest = ip_vs_lookup_dest(svc, daddr, dport);
795         if (dest != NULL) {
796                 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
797                 return -EEXIST;
798         }
799
800         /*
801          * Check if the dest already exists in the trash and
802          * is from the same service
803          */
804         dest = ip_vs_trash_get_dest(svc, daddr, dport);
805         if (dest != NULL) {
806                 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
807                           "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
808                           NIPQUAD(daddr), ntohs(dport),
809                           atomic_read(&dest->refcnt),
810                           dest->vfwmark,
811                           NIPQUAD(dest->vaddr),
812                           ntohs(dest->vport));
813                 __ip_vs_update_dest(svc, dest, udest);
814
815                 /*
816                  * Get the destination from the trash
817                  */
818                 list_del(&dest->n_list);
819
820                 ip_vs_new_estimator(&dest->stats);
821
822                 write_lock_bh(&__ip_vs_svc_lock);
823
824                 /*
825                  * Wait until all other svc users go away.
826                  */
827                 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
828
829                 list_add(&dest->n_list, &svc->destinations);
830                 svc->num_dests++;
831
832                 /* call the update_service function of its scheduler */
833                 svc->scheduler->update_service(svc);
834
835                 write_unlock_bh(&__ip_vs_svc_lock);
836                 return 0;
837         }
838
839         /*
840          * Allocate and initialize the dest structure
841          */
842         ret = ip_vs_new_dest(svc, udest, &dest);
843         if (ret) {
844                 return ret;
845         }
846
847         /*
848          * Add the dest entry into the list
849          */
850         atomic_inc(&dest->refcnt);
851
852         write_lock_bh(&__ip_vs_svc_lock);
853
854         /*
855          * Wait until all other svc users go away.
856          */
857         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
858
859         list_add(&dest->n_list, &svc->destinations);
860         svc->num_dests++;
861
862         /* call the update_service function of its scheduler */
863         svc->scheduler->update_service(svc);
864
865         write_unlock_bh(&__ip_vs_svc_lock);
866
867         LeaveFunction(2);
868
869         return 0;
870 }
871
872
873 /*
874  *      Edit a destination in the given service
875  */
876 static int
877 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
878 {
879         struct ip_vs_dest *dest;
880         __u32 daddr = udest->addr;
881         __u16 dport = udest->port;
882
883         EnterFunction(2);
884
885         if (udest->weight < 0) {
886                 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
887                 return -ERANGE;
888         }
889
890         if (udest->l_threshold > udest->u_threshold) {
891                 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
892                           "upper threshold\n");
893                 return -ERANGE;
894         }
895
896         /*
897          *  Lookup the destination list
898          */
899         dest = ip_vs_lookup_dest(svc, daddr, dport);
900         if (dest == NULL) {
901                 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
902                 return -ENOENT;
903         }
904
905         __ip_vs_update_dest(svc, dest, udest);
906
907         write_lock_bh(&__ip_vs_svc_lock);
908
909         /* Wait until all other svc users go away */
910         while (atomic_read(&svc->usecnt) > 1) {};
911
912         /* call the update_service, because server weight may be changed */
913         svc->scheduler->update_service(svc);
914
915         write_unlock_bh(&__ip_vs_svc_lock);
916
917         LeaveFunction(2);
918
919         return 0;
920 }
921
922
923 /*
924  *      Delete a destination (must be already unlinked from the service)
925  */
926 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
927 {
928         ip_vs_kill_estimator(&dest->stats);
929
930         /*
931          *  Remove it from the d-linked list with the real services.
932          */
933         write_lock_bh(&__ip_vs_rs_lock);
934         ip_vs_rs_unhash(dest);
935         write_unlock_bh(&__ip_vs_rs_lock);
936
937         /*
938          *  Decrease the refcnt of the dest, and free the dest
939          *  if nobody refers to it (refcnt=0). Otherwise, throw
940          *  the destination into the trash.
941          */
942         if (atomic_dec_and_test(&dest->refcnt)) {
943                 ip_vs_dst_reset(dest);
944                 /* simply decrease svc->refcnt here, let the caller check
945                    and release the service if nobody refers to it.
946                    Only user context can release destination and service,
947                    and only one user context can update virtual service at a
948                    time, so the operation here is OK */
949                 atomic_dec(&dest->svc->refcnt);
950                 kfree(dest);
951         } else {
952                 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
953                           NIPQUAD(dest->addr), ntohs(dest->port),
954                           atomic_read(&dest->refcnt));
955                 list_add(&dest->n_list, &ip_vs_dest_trash);
956                 atomic_inc(&dest->refcnt);
957         }
958 }
959
960
961 /*
962  *      Unlink a destination from the given service
963  */
964 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
965                                 struct ip_vs_dest *dest,
966                                 int svcupd)
967 {
968         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
969
970         /*
971          *  Remove it from the d-linked destination list.
972          */
973         list_del(&dest->n_list);
974         svc->num_dests--;
975         if (svcupd) {
976                 /*
977                  *  Call the update_service function of its scheduler
978                  */
979                 svc->scheduler->update_service(svc);
980         }
981 }
982
983
984 /*
985  *      Delete a destination server in the given service
986  */
987 static int
988 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
989 {
990         struct ip_vs_dest *dest;
991         __u32 daddr = udest->addr;
992         __u16 dport = udest->port;
993
994         EnterFunction(2);
995
996         dest = ip_vs_lookup_dest(svc, daddr, dport);
997         if (dest == NULL) {
998                 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
999                 return -ENOENT;
1000         }
1001
1002         write_lock_bh(&__ip_vs_svc_lock);
1003
1004         /*
1005          *      Wait until all other svc users go away.
1006          */
1007         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1008
1009         /*
1010          *      Unlink dest from the service
1011          */
1012         __ip_vs_unlink_dest(svc, dest, 1);
1013
1014         write_unlock_bh(&__ip_vs_svc_lock);
1015
1016         /*
1017          *      Delete the destination
1018          */
1019         __ip_vs_del_dest(dest);
1020
1021         LeaveFunction(2);
1022
1023         return 0;
1024 }
1025
1026
1027 /*
1028  *      Add a service into the service hash table
1029  */
1030 static int
1031 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1032 {
1033         int ret = 0;
1034         struct ip_vs_scheduler *sched = NULL;
1035         struct ip_vs_service *svc = NULL;
1036
1037         /* increase the module use count */
1038         ip_vs_use_count_inc();
1039
1040         /* Lookup the scheduler by 'u->sched_name' */
1041         sched = ip_vs_scheduler_get(u->sched_name);
1042         if (sched == NULL) {
1043                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1044                            u->sched_name);
1045                 ret = -ENOENT;
1046                 goto out_mod_dec;
1047         }
1048
1049         svc = (struct ip_vs_service *)
1050                 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1051         if (svc == NULL) {
1052                 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1053                 ret = -ENOMEM;
1054                 goto out_err;
1055         }
1056         memset(svc, 0, sizeof(struct ip_vs_service));
1057
1058         /* I'm the first user of the service */
1059         atomic_set(&svc->usecnt, 1);
1060         atomic_set(&svc->refcnt, 0);
1061
1062         svc->protocol = u->protocol;
1063         svc->addr = u->addr;
1064         svc->port = u->port;
1065         svc->fwmark = u->fwmark;
1066         svc->flags = u->flags;
1067         svc->timeout = u->timeout * HZ;
1068         svc->netmask = u->netmask;
1069
1070         INIT_LIST_HEAD(&svc->destinations);
1071         rwlock_init(&svc->sched_lock);
1072         spin_lock_init(&svc->stats.lock);
1073
1074         /* Bind the scheduler */
1075         ret = ip_vs_bind_scheduler(svc, sched);
1076         if (ret)
1077                 goto out_err;
1078         sched = NULL;
1079
1080         /* Update the virtual service counters */
1081         if (svc->port == FTPPORT)
1082                 atomic_inc(&ip_vs_ftpsvc_counter);
1083         else if (svc->port == 0)
1084                 atomic_inc(&ip_vs_nullsvc_counter);
1085
1086         ip_vs_new_estimator(&svc->stats);
1087         ip_vs_num_services++;
1088
1089         /* Hash the service into the service table */
1090         write_lock_bh(&__ip_vs_svc_lock);
1091         ip_vs_svc_hash(svc);
1092         write_unlock_bh(&__ip_vs_svc_lock);
1093
1094         *svc_p = svc;
1095         return 0;
1096
1097   out_err:
1098         if (svc != NULL) {
1099                 if (svc->scheduler)
1100                         ip_vs_unbind_scheduler(svc);
1101                 if (svc->inc) {
1102                         local_bh_disable();
1103                         ip_vs_app_inc_put(svc->inc);
1104                         local_bh_enable();
1105                 }
1106                 kfree(svc);
1107         }
1108         ip_vs_scheduler_put(sched);
1109
1110   out_mod_dec:
1111         /* decrease the module use count */
1112         ip_vs_use_count_dec();
1113
1114         return ret;
1115 }
1116
1117
1118 /*
1119  *      Edit a service and bind it with a new scheduler
1120  */
1121 static int
1122 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1123 {
1124         struct ip_vs_scheduler *sched, *old_sched;
1125         int ret = 0;
1126
1127         /*
1128          * Lookup the scheduler, by 'u->sched_name'
1129          */
1130         sched = ip_vs_scheduler_get(u->sched_name);
1131         if (sched == NULL) {
1132                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1133                            u->sched_name);
1134                 return -ENOENT;
1135         }
1136         old_sched = sched;
1137
1138         write_lock_bh(&__ip_vs_svc_lock);
1139
1140         /*
1141          * Wait until all other svc users go away.
1142          */
1143         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1144
1145         /*
1146          * Set the flags and timeout value
1147          */
1148         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1149         svc->timeout = u->timeout * HZ;
1150         svc->netmask = u->netmask;
1151
1152         old_sched = svc->scheduler;
1153         if (sched != old_sched) {
1154                 /*
1155                  * Unbind the old scheduler
1156                  */
1157                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1158                         old_sched = sched;
1159                         goto out;
1160                 }
1161
1162                 /*
1163                  * Bind the new scheduler
1164                  */
1165                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1166                         /*
1167                          * If ip_vs_bind_scheduler fails, restore the old
1168                          * scheduler.
1169                          * The main reason of failure is out of memory.
1170                          *
1171                          * The question is if the old scheduler can be
1172                          * restored all the time. TODO: if it cannot be
1173                          * restored some time, we must delete the service,
1174                          * otherwise the system may crash.
1175                          */
1176                         ip_vs_bind_scheduler(svc, old_sched);
1177                         old_sched = sched;
1178                         goto out;
1179                 }
1180         }
1181
1182   out:
1183         write_unlock_bh(&__ip_vs_svc_lock);
1184
1185         if (old_sched)
1186                 ip_vs_scheduler_put(old_sched);
1187
1188         return ret;
1189 }
1190
1191
1192 /*
1193  *      Delete a service from the service list
1194  *      - The service must be unlinked, unlocked and not referenced!
1195  *      - We are called under _bh lock
1196  */
1197 static void __ip_vs_del_service(struct ip_vs_service *svc)
1198 {
1199         struct ip_vs_dest *dest, *nxt;
1200         struct ip_vs_scheduler *old_sched;
1201
1202         ip_vs_num_services--;
1203         ip_vs_kill_estimator(&svc->stats);
1204
1205         /* Unbind scheduler */
1206         old_sched = svc->scheduler;
1207         ip_vs_unbind_scheduler(svc);
1208         if (old_sched)
1209                 ip_vs_scheduler_put(old_sched);
1210
1211         /* Unbind app inc */
1212         if (svc->inc) {
1213                 ip_vs_app_inc_put(svc->inc);
1214                 svc->inc = NULL;
1215         }
1216
1217         /*
1218          *    Unlink the whole destination list
1219          */
1220         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1221                 __ip_vs_unlink_dest(svc, dest, 0);
1222                 __ip_vs_del_dest(dest);
1223         }
1224
1225         /*
1226          *    Update the virtual service counters
1227          */
1228         if (svc->port == FTPPORT)
1229                 atomic_dec(&ip_vs_ftpsvc_counter);
1230         else if (svc->port == 0)
1231                 atomic_dec(&ip_vs_nullsvc_counter);
1232
1233         /*
1234          *    Free the service if nobody refers to it
1235          */
1236         if (atomic_read(&svc->refcnt) == 0)
1237                 kfree(svc);
1238
1239         /* decrease the module use count */
1240         ip_vs_use_count_dec();
1241 }
1242
1243 /*
1244  *      Delete a service from the service list
1245  */
1246 static int ip_vs_del_service(struct ip_vs_service *svc)
1247 {
1248         if (svc == NULL)
1249                 return -EEXIST;
1250
1251         /*
1252          * Unhash it from the service table
1253          */
1254         write_lock_bh(&__ip_vs_svc_lock);
1255
1256         ip_vs_svc_unhash(svc);
1257
1258         /*
1259          * Wait until all the svc users go away.
1260          */
1261         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1262
1263         __ip_vs_del_service(svc);
1264
1265         write_unlock_bh(&__ip_vs_svc_lock);
1266
1267         return 0;
1268 }
1269
1270
1271 /*
1272  *      Flush all the virtual services
1273  */
1274 static int ip_vs_flush(void)
1275 {
1276         int idx;
1277         struct ip_vs_service *svc, *nxt;
1278
1279         /*
1280          * Flush the service table hashed by <protocol,addr,port>
1281          */
1282         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1283                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1284                         write_lock_bh(&__ip_vs_svc_lock);
1285                         ip_vs_svc_unhash(svc);
1286                         /*
1287                          * Wait until all the svc users go away.
1288                          */
1289                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1290                         __ip_vs_del_service(svc);
1291                         write_unlock_bh(&__ip_vs_svc_lock);
1292                 }
1293         }
1294
1295         /*
1296          * Flush the service table hashed by fwmark
1297          */
1298         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1299                 list_for_each_entry_safe(svc, nxt,
1300                                          &ip_vs_svc_fwm_table[idx], f_list) {
1301                         write_lock_bh(&__ip_vs_svc_lock);
1302                         ip_vs_svc_unhash(svc);
1303                         /*
1304                          * Wait until all the svc users go away.
1305                          */
1306                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1307                         __ip_vs_del_service(svc);
1308                         write_unlock_bh(&__ip_vs_svc_lock);
1309                 }
1310         }
1311
1312         return 0;
1313 }
1314
1315
1316 /*
1317  *      Zero counters in a service or all services
1318  */
1319 static int ip_vs_zero_service(struct ip_vs_service *svc)
1320 {
1321         struct ip_vs_dest *dest;
1322
1323         write_lock_bh(&__ip_vs_svc_lock);
1324         list_for_each_entry(dest, &svc->destinations, n_list) {
1325                 ip_vs_zero_stats(&dest->stats);
1326         }
1327         ip_vs_zero_stats(&svc->stats);
1328         write_unlock_bh(&__ip_vs_svc_lock);
1329         return 0;
1330 }
1331
1332 static int ip_vs_zero_all(void)
1333 {
1334         int idx;
1335         struct ip_vs_service *svc;
1336
1337         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1338                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1339                         ip_vs_zero_service(svc);
1340                 }
1341         }
1342
1343         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1344                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1345                         ip_vs_zero_service(svc);
1346                 }
1347         }
1348
1349         ip_vs_zero_stats(&ip_vs_stats);
1350         return 0;
1351 }
1352
1353
1354 static int
1355 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1356                      void __user *buffer, size_t *lenp, loff_t *ppos)
1357 {
1358         int *valp = table->data;
1359         int val = *valp;
1360         int rc;
1361
1362         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1363         if (write && (*valp != val)) {
1364                 if ((*valp < 0) || (*valp > 3)) {
1365                         /* Restore the correct value */
1366                         *valp = val;
1367                 } else {
1368                         update_defense_level();
1369                 }
1370         }
1371         return rc;
1372 }
1373
1374
1375 static int
1376 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1377                        void __user *buffer, size_t *lenp, loff_t *ppos)
1378 {
1379         int *valp = table->data;
1380         int val[2];
1381         int rc;
1382
1383         /* backup the value first */
1384         memcpy(val, valp, sizeof(val));
1385
1386         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1387         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1388                 /* Restore the correct value */
1389                 memcpy(valp, val, sizeof(val));
1390         }
1391         return rc;
1392 }
1393
1394
1395 /*
1396  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1397  */
1398
1399 static struct ctl_table vs_vars[] = {
1400         {
1401                 .ctl_name       = NET_IPV4_VS_AMEMTHRESH,
1402                 .procname       = "amemthresh",
1403                 .data           = &sysctl_ip_vs_amemthresh,
1404                 .maxlen         = sizeof(int),
1405                 .mode           = 0644,
1406                 .proc_handler   = &proc_dointvec,
1407         },
1408 #ifdef CONFIG_IP_VS_DEBUG
1409         {
1410                 .ctl_name       = NET_IPV4_VS_DEBUG_LEVEL,
1411                 .procname       = "debug_level",
1412                 .data           = &sysctl_ip_vs_debug_level,
1413                 .maxlen         = sizeof(int),
1414                 .mode           = 0644,
1415                 .proc_handler   = &proc_dointvec,
1416         },
1417 #endif
1418         {
1419                 .ctl_name       = NET_IPV4_VS_AMDROPRATE,
1420                 .procname       = "am_droprate",
1421                 .data           = &sysctl_ip_vs_am_droprate,
1422                 .maxlen         = sizeof(int),
1423                 .mode           = 0644,
1424                 .proc_handler   = &proc_dointvec,
1425         },
1426         {
1427                 .ctl_name       = NET_IPV4_VS_DROP_ENTRY,
1428                 .procname       = "drop_entry",
1429                 .data           = &sysctl_ip_vs_drop_entry,
1430                 .maxlen         = sizeof(int),
1431                 .mode           = 0644,
1432                 .proc_handler   = &proc_do_defense_mode,
1433         },
1434         {
1435                 .ctl_name       = NET_IPV4_VS_DROP_PACKET,
1436                 .procname       = "drop_packet",
1437                 .data           = &sysctl_ip_vs_drop_packet,
1438                 .maxlen         = sizeof(int),
1439                 .mode           = 0644,
1440                 .proc_handler   = &proc_do_defense_mode,
1441         },
1442         {
1443                 .ctl_name       = NET_IPV4_VS_SECURE_TCP,
1444                 .procname       = "secure_tcp",
1445                 .data           = &sysctl_ip_vs_secure_tcp,
1446                 .maxlen         = sizeof(int),
1447                 .mode           = 0644,
1448                 .proc_handler   = &proc_do_defense_mode,
1449         },
1450 #if 0
1451         {
1452                 .ctl_name       = NET_IPV4_VS_TO_ES,
1453                 .procname       = "timeout_established",
1454                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1455                 .maxlen         = sizeof(int),
1456                 .mode           = 0644,
1457                 .proc_handler   = &proc_dointvec_jiffies,
1458         },
1459         {
1460                 .ctl_name       = NET_IPV4_VS_TO_SS,
1461                 .procname       = "timeout_synsent",
1462                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1463                 .maxlen         = sizeof(int),
1464                 .mode           = 0644,
1465                 .proc_handler   = &proc_dointvec_jiffies,
1466         },
1467         {
1468                 .ctl_name       = NET_IPV4_VS_TO_SR,
1469                 .procname       = "timeout_synrecv",
1470                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1471                 .maxlen         = sizeof(int),
1472                 .mode           = 0644,
1473                 .proc_handler   = &proc_dointvec_jiffies,
1474         },
1475         {
1476                 .ctl_name       = NET_IPV4_VS_TO_FW,
1477                 .procname       = "timeout_finwait",
1478                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1479                 .maxlen         = sizeof(int),
1480                 .mode           = 0644,
1481                 .proc_handler   = &proc_dointvec_jiffies,
1482         },
1483         {
1484                 .ctl_name       = NET_IPV4_VS_TO_TW,
1485                 .procname       = "timeout_timewait",
1486                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1487                 .maxlen         = sizeof(int),
1488                 .mode           = 0644,
1489                 .proc_handler   = &proc_dointvec_jiffies,
1490         },
1491         {
1492                 .ctl_name       = NET_IPV4_VS_TO_CL,
1493                 .procname       = "timeout_close",
1494                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1495                 .maxlen         = sizeof(int),
1496                 .mode           = 0644,
1497                 .proc_handler   = &proc_dointvec_jiffies,
1498         },
1499         {
1500                 .ctl_name       = NET_IPV4_VS_TO_CW,
1501                 .procname       = "timeout_closewait",
1502                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1503                 .maxlen         = sizeof(int),
1504                 .mode           = 0644,
1505                 .proc_handler   = &proc_dointvec_jiffies,
1506         },
1507         {
1508                 .ctl_name       = NET_IPV4_VS_TO_LA,
1509                 .procname       = "timeout_lastack",
1510                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1511                 .maxlen         = sizeof(int),
1512                 .mode           = 0644,
1513                 .proc_handler   = &proc_dointvec_jiffies,
1514         },
1515         {
1516                 .ctl_name       = NET_IPV4_VS_TO_LI,
1517                 .procname       = "timeout_listen",
1518                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1519                 .maxlen         = sizeof(int),
1520                 .mode           = 0644,
1521                 .proc_handler   = &proc_dointvec_jiffies,
1522         },
1523         {
1524                 .ctl_name       = NET_IPV4_VS_TO_SA,
1525                 .procname       = "timeout_synack",
1526                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1527                 .maxlen         = sizeof(int),
1528                 .mode           = 0644,
1529                 .proc_handler   = &proc_dointvec_jiffies,
1530         },
1531         {
1532                 .ctl_name       = NET_IPV4_VS_TO_UDP,
1533                 .procname       = "timeout_udp",
1534                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1535                 .maxlen         = sizeof(int),
1536                 .mode           = 0644,
1537                 .proc_handler   = &proc_dointvec_jiffies,
1538         },
1539         {
1540                 .ctl_name       = NET_IPV4_VS_TO_ICMP,
1541                 .procname       = "timeout_icmp",
1542                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1543                 .maxlen         = sizeof(int),
1544                 .mode           = 0644,
1545                 .proc_handler   = &proc_dointvec_jiffies,
1546         },
1547 #endif
1548         {
1549                 .ctl_name       = NET_IPV4_VS_CACHE_BYPASS,
1550                 .procname       = "cache_bypass",
1551                 .data           = &sysctl_ip_vs_cache_bypass,
1552                 .maxlen         = sizeof(int),
1553                 .mode           = 0644,
1554                 .proc_handler   = &proc_dointvec,
1555         },
1556         {
1557                 .ctl_name       = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1558                 .procname       = "expire_nodest_conn",
1559                 .data           = &sysctl_ip_vs_expire_nodest_conn,
1560                 .maxlen         = sizeof(int),
1561                 .mode           = 0644,
1562                 .proc_handler   = &proc_dointvec,
1563         },
1564         {
1565                 .ctl_name       = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1566                 .procname       = "expire_quiescent_template",
1567                 .data           = &sysctl_ip_vs_expire_quiescent_template,
1568                 .maxlen         = sizeof(int),
1569                 .mode           = 0644,
1570                 .proc_handler   = &proc_dointvec,
1571         },
1572         {
1573                 .ctl_name       = NET_IPV4_VS_SYNC_THRESHOLD,
1574                 .procname       = "sync_threshold",
1575                 .data           = &sysctl_ip_vs_sync_threshold,
1576                 .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
1577                 .mode           = 0644,
1578                 .proc_handler   = &proc_do_sync_threshold,
1579         },
1580         {
1581                 .ctl_name       = NET_IPV4_VS_NAT_ICMP_SEND,
1582                 .procname       = "nat_icmp_send",
1583                 .data           = &sysctl_ip_vs_nat_icmp_send,
1584                 .maxlen         = sizeof(int),
1585                 .mode           = 0644,
1586                 .proc_handler   = &proc_dointvec,
1587         },
1588         { .ctl_name = 0 }
1589 };
1590
1591 static ctl_table vs_table[] = {
1592         {
1593                 .ctl_name       = NET_IPV4_VS,
1594                 .procname       = "vs",
1595                 .mode           = 0555,
1596                 .child          = vs_vars
1597         },
1598         { .ctl_name = 0 }
1599 };
1600
1601 static ctl_table ipv4_table[] = {
1602         {
1603                 .ctl_name       = NET_IPV4,
1604                 .procname       = "ipv4",
1605                 .mode           = 0555,
1606                 .child          = vs_table,
1607         },
1608         { .ctl_name = 0 }
1609 };
1610
1611 static ctl_table vs_root_table[] = {
1612         {
1613                 .ctl_name       = CTL_NET,
1614                 .procname       = "net",
1615                 .mode           = 0555,
1616                 .child          = ipv4_table,
1617         },
1618         { .ctl_name = 0 }
1619 };
1620
/*
 * Header for the IPVS sysctl tables.
 * NOTE(review): presumably the handle obtained when vs_root_table is
 * registered; the registration code is outside this part of the file.
 */
1621 static struct ctl_table_header * sysctl_header;
1622
1623 #ifdef CONFIG_PROC_FS
1624
/*
 * Position of the /proc service listing: which of the two service hash
 * tables is being walked and which bucket of it the current service
 * was found in.
 */
1625 struct ip_vs_iter {
1626         struct list_head *table;       /* ip_vs_svc_table or ip_vs_svc_fwm_table */
1627         int bucket;                    /* bucket index within that table */
1628 };
1629
1630 /*
1631  *      Write the contents of the VS rule table to a PROCfs file.
1632  *      (It is kept just for backward compatibility)
1633  */
1634 static inline const char *ip_vs_fwd_name(unsigned flags)
1635 {
1636         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1637         case IP_VS_CONN_F_LOCALNODE:
1638                 return "Local";
1639         case IP_VS_CONN_F_TUNNEL:
1640                 return "Tunnel";
1641         case IP_VS_CONN_F_DROUTE:
1642                 return "Route";
1643         default:
1644                 return "Masq";
1645         }
1646 }
1647
1648
1649 /* Get the Nth entry in the two lists */
1650 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1651 {
1652         struct ip_vs_iter *iter = seq->private;
1653         int idx;
1654         struct ip_vs_service *svc;
1655
1656         /* look in hash by protocol */
1657         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1658                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1659                         if (pos-- == 0){
1660                                 iter->table = ip_vs_svc_table;
1661                                 iter->bucket = idx;
1662                                 return svc;
1663                         }
1664                 }
1665         }
1666
1667         /* keep looking in fwmark */
1668         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1669                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1670                         if (pos-- == 0) {
1671                                 iter->table = ip_vs_svc_fwm_table;
1672                                 iter->bucket = idx;
1673                                 return svc;
1674                         }
1675                 }
1676         }
1677
1678         return NULL;
1679 }
1680
1681 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1682 {
1683
1684         read_lock_bh(&__ip_vs_svc_lock);
1685         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1686 }
1687
1688
1689 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1690 {
1691         struct list_head *e;
1692         struct ip_vs_iter *iter;
1693         struct ip_vs_service *svc;
1694
1695         ++*pos;
1696         if (v == SEQ_START_TOKEN)
1697                 return ip_vs_info_array(seq,0);
1698
1699         svc = v;
1700         iter = seq->private;
1701
1702         if (iter->table == ip_vs_svc_table) {
1703                 /* next service in table hashed by protocol */
1704                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1705                         return list_entry(e, struct ip_vs_service, s_list);
1706
1707
1708                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1709                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1710                                             s_list) {
1711                                 return svc;
1712                         }
1713                 }
1714
1715                 iter->table = ip_vs_svc_fwm_table;
1716                 iter->bucket = -1;
1717                 goto scan_fwmark;
1718         }
1719
1720         /* next service in hashed by fwmark */
1721         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1722                 return list_entry(e, struct ip_vs_service, f_list);
1723
1724  scan_fwmark:
1725         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1726                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1727                                     f_list)
1728                         return svc;
1729         }
1730
1731         return NULL;
1732 }
1733
/*
 * seq_file stop callback: drops the service table read lock that
 * ip_vs_info_seq_start() took. Both arguments are unused.
 */
1734 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1735 {
1736         read_unlock_bh(&__ip_vs_svc_lock);
1737 }
1738
1739
1740 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1741 {
1742         if (v == SEQ_START_TOKEN) {
1743                 seq_printf(seq,
1744                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1745                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1746                 seq_puts(seq,
1747                          "Prot LocalAddress:Port Scheduler Flags\n");
1748                 seq_puts(seq,
1749                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1750         } else {
1751                 const struct ip_vs_service *svc = v;
1752                 const struct ip_vs_iter *iter = seq->private;
1753                 const struct ip_vs_dest *dest;
1754
1755                 if (iter->table == ip_vs_svc_table)
1756                         seq_printf(seq, "%s  %08X:%04X %s ",
1757                                    ip_vs_proto_name(svc->protocol),
1758                                    ntohl(svc->addr),
1759                                    ntohs(svc->port),
1760                                    svc->scheduler->name);
1761                 else
1762                         seq_printf(seq, "FWM  %08X %s ",
1763                                    svc->fwmark, svc->scheduler->name);
1764
1765                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1766                         seq_printf(seq, "persistent %d %08X\n",
1767                                 svc->timeout,
1768                                 ntohl(svc->netmask));
1769                 else
1770                         seq_putc(seq, '\n');
1771
1772                 list_for_each_entry(dest, &svc->destinations, n_list) {
1773                         seq_printf(seq,
1774                                    "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1775                                    ntohl(dest->addr), ntohs(dest->port),
1776                                    ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1777                                    atomic_read(&dest->weight),
1778                                    atomic_read(&dest->activeconns),
1779                                    atomic_read(&dest->inactconns));
1780                 }
1781         }
1782         return 0;
1783 }
1784
/*
 * seq_file callbacks for the service listing; handed to seq_open() in
 * ip_vs_info_open().
 */
1785 static struct seq_operations ip_vs_info_seq_ops = {
1786         .start = ip_vs_info_seq_start,
1787         .next  = ip_vs_info_seq_next,
1788         .stop  = ip_vs_info_seq_stop,
1789         .show  = ip_vs_info_seq_show,
1790 };
1791
1792 static int ip_vs_info_open(struct inode *inode, struct file *file)
1793 {
1794         struct seq_file *seq;
1795         int rc = -ENOMEM;
1796         struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1797
1798         if (!s)
1799                 goto out;
1800
1801         rc = seq_open(file, &ip_vs_info_seq_ops);
1802         if (rc)
1803                 goto out_kfree;
1804
1805         seq          = file->private_data;
1806         seq->private = s;
1807         memset(s, 0, sizeof(*s));
1808 out:
1809         return rc;
1810 out_kfree:
1811         kfree(s);
1812         goto out;
1813 }
1814
/*
 * File operations of the service listing file. ip_vs_info_open() stores
 * a kmalloc'ed iterator in seq->private, and release is
 * seq_release_private.
 */
1815 static struct file_operations ip_vs_info_fops = {
1816         .owner   = THIS_MODULE,
1817         .open    = ip_vs_info_open,
1818         .read    = seq_read,
1819         .llseek  = seq_lseek,
1820         .release = seq_release_private,
1821 };
1822
1823 #endif
1824
/*
 * Global IPVS statistics. Not static, so visible outside this file.
 * Printed by ip_vs_stats_show() under ip_vs_stats.lock and zeroed by
 * ip_vs_zero_all().
 */
1825 struct ip_vs_stats ip_vs_stats;
1826
1827 #ifdef CONFIG_PROC_FS
1828 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1829 {
1830
1831 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1832         seq_puts(seq,
1833                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1834         seq_printf(seq,
1835                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1836
1837         spin_lock_bh(&ip_vs_stats.lock);
1838         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1839                    ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1840                    (unsigned long long) ip_vs_stats.inbytes,
1841                    (unsigned long long) ip_vs_stats.outbytes);
1842
1843 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1844         seq_puts(seq,
1845                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1846         seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1847                         ip_vs_stats.cps,
1848                         ip_vs_stats.inpps,
1849                         ip_vs_stats.outpps,
1850                         ip_vs_stats.inbps,
1851                         ip_vs_stats.outbps);
1852         spin_unlock_bh(&ip_vs_stats.lock);
1853
1854         return 0;
1855 }
1856
1857 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1858 {
1859         return single_open(file, ip_vs_stats_show, NULL);
1860 }
1861
1862 static struct file_operations ip_vs_stats_fops = {
1863         .owner = THIS_MODULE,
1864         .open = ip_vs_stats_seq_open,
1865         .read = seq_read,
1866         .llseek = seq_lseek,
1867         .release = single_release,
1868 };
1869
1870 #endif
1871
1872 /*
1873  *      Set timeout values for tcp tcpfin udp in the timeout_table.
1874  */
1875 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1876 {
1877         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1878                   u->tcp_timeout,
1879                   u->tcp_fin_timeout,
1880                   u->udp_timeout);
1881
1882 #ifdef CONFIG_IP_VS_PROTO_TCP
1883         if (u->tcp_timeout) {
1884                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1885                         = u->tcp_timeout * HZ;
1886         }
1887
1888         if (u->tcp_fin_timeout) {
1889                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1890                         = u->tcp_fin_timeout * HZ;
1891         }
1892 #endif
1893
1894 #ifdef CONFIG_IP_VS_PROTO_UDP
1895         if (u->udp_timeout) {
1896                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1897                         = u->udp_timeout * HZ;
1898         }
1899 #endif
1900         return 0;
1901 }
1902
1903
/*
 * Helpers for the setsockopt argument-length table.
 *
 * SET_CMDID() maps a sockopt command number to its index in
 * set_arglen[]; the argument is parenthesised so that an expression
 * argument is not re-associated with the subtraction.  The *_ARG_LEN
 * macros are the exact user buffer sizes expected per command family,
 * and MAX_ARG_LEN sizes the on-stack copy in do_ip_vs_set_ctl().
 */
#define SET_CMDID(cmd)          ((cmd) - IP_VS_BASE_CTL)
#define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
                                 sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
#define MAX_ARG_LEN             SVCDEST_ARG_LEN
1911
1912 static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1913         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
1914         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
1915         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
1916         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
1917         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
1918         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
1919         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
1920         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
1921         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
1922         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
1923         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
1924 };
1925
1926 static int
1927 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1928 {
1929         int ret;
1930         unsigned char arg[MAX_ARG_LEN];
1931         struct ip_vs_service_user *usvc;
1932         struct ip_vs_service *svc;
1933         struct ip_vs_dest_user *udest;
1934
1935         if (!capable(CAP_NET_ADMIN))
1936                 return -EPERM;
1937
1938         if (len != set_arglen[SET_CMDID(cmd)]) {
1939                 IP_VS_ERR("set_ctl: len %u != %u\n",
1940                           len, set_arglen[SET_CMDID(cmd)]);
1941                 return -EINVAL;
1942         }
1943
1944         if (copy_from_user(arg, user, len) != 0)
1945                 return -EFAULT;
1946
1947         /* increase the module use count */
1948         ip_vs_use_count_inc();
1949
1950         if (down_interruptible(&__ip_vs_mutex)) {
1951                 ret = -ERESTARTSYS;
1952                 goto out_dec;
1953         }
1954
1955         if (cmd == IP_VS_SO_SET_FLUSH) {
1956                 /* Flush the virtual service */
1957                 ret = ip_vs_flush();
1958                 goto out_unlock;
1959         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1960                 /* Set timeout values for (tcp tcpfin udp) */
1961                 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1962                 goto out_unlock;
1963         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1964                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1965                 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1966                 goto out_unlock;
1967         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1968                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1969                 ret = stop_sync_thread(dm->state);
1970                 goto out_unlock;
1971         }
1972
1973         usvc = (struct ip_vs_service_user *)arg;
1974         udest = (struct ip_vs_dest_user *)(usvc + 1);
1975
1976         if (cmd == IP_VS_SO_SET_ZERO) {
1977                 /* if no service address is set, zero counters in all */
1978                 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1979                         ret = ip_vs_zero_all();
1980                         goto out_unlock;
1981                 }
1982         }
1983
1984         /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1985         if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1986                 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1987                           usvc->protocol, NIPQUAD(usvc->addr),
1988                           ntohs(usvc->port), usvc->sched_name);
1989                 ret = -EFAULT;
1990                 goto out_unlock;
1991         }
1992
1993         /* Lookup the exact service by <protocol, addr, port> or fwmark */
1994         if (usvc->fwmark == 0)
1995                 svc = __ip_vs_service_get(usvc->protocol,
1996                                           usvc->addr, usvc->port);
1997         else
1998                 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1999
2000         if (cmd != IP_VS_SO_SET_ADD
2001             && (svc == NULL || svc->protocol != usvc->protocol)) {
2002                 ret = -ESRCH;
2003                 goto out_unlock;
2004         }
2005
2006         switch (cmd) {
2007         case IP_VS_SO_SET_ADD:
2008                 if (svc != NULL)
2009                         ret = -EEXIST;
2010                 else
2011                         ret = ip_vs_add_service(usvc, &svc);
2012                 break;
2013         case IP_VS_SO_SET_EDIT:
2014                 ret = ip_vs_edit_service(svc, usvc);
2015                 break;
2016         case IP_VS_SO_SET_DEL:
2017                 ret = ip_vs_del_service(svc);
2018                 if (!ret)
2019                         goto out_unlock;
2020                 break;
2021         case IP_VS_SO_SET_ZERO:
2022                 ret = ip_vs_zero_service(svc);
2023                 break;
2024         case IP_VS_SO_SET_ADDDEST:
2025                 ret = ip_vs_add_dest(svc, udest);
2026                 break;
2027         case IP_VS_SO_SET_EDITDEST:
2028                 ret = ip_vs_edit_dest(svc, udest);
2029                 break;
2030         case IP_VS_SO_SET_DELDEST:
2031                 ret = ip_vs_del_dest(svc, udest);
2032                 break;
2033         default:
2034                 ret = -EINVAL;
2035         }
2036
2037         if (svc)
2038                 ip_vs_service_put(svc);
2039
2040   out_unlock:
2041         up(&__ip_vs_mutex);
2042   out_dec:
2043         /* decrease the module use count */
2044         ip_vs_use_count_dec();
2045
2046         return ret;
2047 }
2048
2049
2050 static void
2051 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2052 {
2053         spin_lock_bh(&src->lock);
2054         memcpy(dst, src, (char*)&src->lock - (char*)src);
2055         spin_unlock_bh(&src->lock);
2056 }
2057
2058 static void
2059 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2060 {
2061         dst->protocol = src->protocol;
2062         dst->addr = src->addr;
2063         dst->port = src->port;
2064         dst->fwmark = src->fwmark;
2065         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2066         dst->flags = src->flags;
2067         dst->timeout = src->timeout / HZ;
2068         dst->netmask = src->netmask;
2069         dst->num_dests = src->num_dests;
2070         ip_vs_copy_stats(&dst->stats, &src->stats);
2071 }
2072
2073 static inline int
2074 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2075                             struct ip_vs_get_services __user *uptr)
2076 {
2077         int idx, count=0;
2078         struct ip_vs_service *svc;
2079         struct ip_vs_service_entry entry;
2080         int ret = 0;
2081
2082         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2083                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2084                         if (count >= get->num_services)
2085                                 goto out;
2086                         memset(&entry, 0, sizeof(entry));
2087                         ip_vs_copy_service(&entry, svc);
2088                         if (copy_to_user(&uptr->entrytable[count],
2089                                          &entry, sizeof(entry))) {
2090                                 ret = -EFAULT;
2091                                 goto out;
2092                         }
2093                         count++;
2094                 }
2095         }
2096
2097         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2098                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2099                         if (count >= get->num_services)
2100                                 goto out;
2101                         memset(&entry, 0, sizeof(entry));
2102                         ip_vs_copy_service(&entry, svc);
2103                         if (copy_to_user(&uptr->entrytable[count],
2104                                          &entry, sizeof(entry))) {
2105                                 ret = -EFAULT;
2106                                 goto out;
2107                         }
2108                         count++;
2109                 }
2110         }
2111   out:
2112         return ret;
2113 }
2114
2115 static inline int
2116 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2117                          struct ip_vs_get_dests __user *uptr)
2118 {
2119         struct ip_vs_service *svc;
2120         int ret = 0;
2121
2122         if (get->fwmark)
2123                 svc = __ip_vs_svc_fwm_get(get->fwmark);
2124         else
2125                 svc = __ip_vs_service_get(get->protocol,
2126                                           get->addr, get->port);
2127         if (svc) {
2128                 int count = 0;
2129                 struct ip_vs_dest *dest;
2130                 struct ip_vs_dest_entry entry;
2131
2132                 list_for_each_entry(dest, &svc->destinations, n_list) {
2133                         if (count >= get->num_dests)
2134                                 break;
2135
2136                         entry.addr = dest->addr;
2137                         entry.port = dest->port;
2138                         entry.conn_flags = atomic_read(&dest->conn_flags);
2139                         entry.weight = atomic_read(&dest->weight);
2140                         entry.u_threshold = dest->u_threshold;
2141                         entry.l_threshold = dest->l_threshold;
2142                         entry.activeconns = atomic_read(&dest->activeconns);
2143                         entry.inactconns = atomic_read(&dest->inactconns);
2144                         entry.persistconns = atomic_read(&dest->persistconns);
2145                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2146                         if (copy_to_user(&uptr->entrytable[count],
2147                                          &entry, sizeof(entry))) {
2148                                 ret = -EFAULT;
2149                                 break;
2150                         }
2151                         count++;
2152                 }
2153                 ip_vs_service_put(svc);
2154         } else
2155                 ret = -ESRCH;
2156         return ret;
2157 }
2158
2159 static inline void
2160 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2161 {
2162 #ifdef CONFIG_IP_VS_PROTO_TCP
2163         u->tcp_timeout =
2164                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2165         u->tcp_fin_timeout =
2166                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2167 #endif
2168 #ifdef CONFIG_IP_VS_PROTO_UDP
2169         u->udp_timeout =
2170                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2171 #endif
2172 }
2173
2174
/*
 * Helpers for the getsockopt argument-length table.
 *
 * GET_CMDID() maps a sockopt command number to its index in
 * get_arglen[]; the argument is parenthesised so that an expression
 * argument is not re-associated with the subtraction.  The
 * GET_*_ARG_LEN macros are the minimum user buffer sizes (the fixed
 * header part) for each command.
 */
#define GET_CMDID(cmd)          ((cmd) - IP_VS_BASE_CTL)
#define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
#define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
#define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
#define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
#define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
#define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2182
2183 static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2184         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2185         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2186         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2187         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2188         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2189         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2190         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2191 };
2192
2193 static int
2194 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2195 {
2196         unsigned char arg[128];
2197         int ret = 0;
2198
2199         if (!capable(CAP_NET_ADMIN))
2200                 return -EPERM;
2201
2202         if (*len < get_arglen[GET_CMDID(cmd)]) {
2203                 IP_VS_ERR("get_ctl: len %u < %u\n",
2204                           *len, get_arglen[GET_CMDID(cmd)]);
2205                 return -EINVAL;
2206         }
2207
2208         if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2209                 return -EFAULT;
2210
2211         if (down_interruptible(&__ip_vs_mutex))
2212                 return -ERESTARTSYS;
2213
2214         switch (cmd) {
2215         case IP_VS_SO_GET_VERSION:
2216         {
2217                 char buf[64];
2218
2219                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2220                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2221                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2222                         ret = -EFAULT;
2223                         goto out;
2224                 }
2225                 *len = strlen(buf)+1;
2226         }
2227         break;
2228
2229         case IP_VS_SO_GET_INFO:
2230         {
2231                 struct ip_vs_getinfo info;
2232                 info.version = IP_VS_VERSION_CODE;
2233                 info.size = IP_VS_CONN_TAB_SIZE;
2234                 info.num_services = ip_vs_num_services;
2235                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2236                         ret = -EFAULT;
2237         }
2238         break;
2239
2240         case IP_VS_SO_GET_SERVICES:
2241         {
2242                 struct ip_vs_get_services *get;
2243                 int size;
2244
2245                 get = (struct ip_vs_get_services *)arg;
2246                 size = sizeof(*get) +
2247                         sizeof(struct ip_vs_service_entry) * get->num_services;
2248                 if (*len != size) {
2249                         IP_VS_ERR("length: %u != %u\n", *len, size);
2250                         ret = -EINVAL;
2251                         goto out;
2252                 }
2253                 ret = __ip_vs_get_service_entries(get, user);
2254         }
2255         break;
2256
2257         case IP_VS_SO_GET_SERVICE:
2258         {
2259                 struct ip_vs_service_entry *entry;
2260                 struct ip_vs_service *svc;
2261
2262                 entry = (struct ip_vs_service_entry *)arg;
2263                 if (entry->fwmark)
2264                         svc = __ip_vs_svc_fwm_get(entry->fwmark);
2265                 else
2266                         svc = __ip_vs_service_get(entry->protocol,
2267                                                   entry->addr, entry->port);
2268                 if (svc) {
2269                         ip_vs_copy_service(entry, svc);
2270                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2271                                 ret = -EFAULT;
2272                         ip_vs_service_put(svc);
2273                 } else
2274                         ret = -ESRCH;
2275         }
2276         break;
2277
2278         case IP_VS_SO_GET_DESTS:
2279         {
2280                 struct ip_vs_get_dests *get;
2281                 int size;
2282
2283                 get = (struct ip_vs_get_dests *)arg;
2284                 size = sizeof(*get) +
2285                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2286                 if (*len != size) {
2287                         IP_VS_ERR("length: %u != %u\n", *len, size);
2288                         ret = -EINVAL;
2289                         goto out;
2290                 }
2291                 ret = __ip_vs_get_dest_entries(get, user);
2292         }
2293         break;
2294
2295         case IP_VS_SO_GET_TIMEOUT:
2296         {
2297                 struct ip_vs_timeout_user t;
2298
2299                 __ip_vs_get_timeouts(&t);
2300                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2301                         ret = -EFAULT;
2302         }
2303         break;
2304
2305         case IP_VS_SO_GET_DAEMON:
2306         {
2307                 struct ip_vs_daemon_user d[2];
2308
2309                 memset(&d, 0, sizeof(d));
2310                 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2311                         d[0].state = IP_VS_STATE_MASTER;
2312                         strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2313                         d[0].syncid = ip_vs_master_syncid;
2314                 }
2315                 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2316                         d[1].state = IP_VS_STATE_BACKUP;
2317                         strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2318                         d[1].syncid = ip_vs_backup_syncid;
2319                 }
2320                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2321                         ret = -EFAULT;
2322         }
2323         break;
2324
2325         default:
2326                 ret = -EINVAL;
2327         }
2328
2329   out:
2330         up(&__ip_vs_mutex);
2331         return ret;
2332 }
2333
2334
2335 static struct nf_sockopt_ops ip_vs_sockopts = {
2336         .pf             = PF_INET,
2337         .set_optmin     = IP_VS_BASE_CTL,
2338         .set_optmax     = IP_VS_SO_SET_MAX+1,
2339         .set            = do_ip_vs_set_ctl,
2340         .get_optmin     = IP_VS_BASE_CTL,
2341         .get_optmax     = IP_VS_SO_GET_MAX+1,
2342         .get            = do_ip_vs_get_ctl,
2343 };
2344
2345
2346 int ip_vs_control_init(void)
2347 {
2348         int ret;
2349         int idx;
2350
2351         EnterFunction(2);
2352
2353         ret = nf_register_sockopt(&ip_vs_sockopts);
2354         if (ret) {
2355                 IP_VS_ERR("cannot register sockopt.\n");
2356                 return ret;
2357         }
2358
2359         proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2360         proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2361
2362         sysctl_header = register_sysctl_table(vs_root_table, 0);
2363
2364         /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2365         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
2366                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2367                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2368         }
2369         for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
2370                 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2371         }
2372
2373         memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2374         spin_lock_init(&ip_vs_stats.lock);
2375         ip_vs_new_estimator(&ip_vs_stats);
2376
2377         /* Hook the defense timer */
2378         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2379
2380         LeaveFunction(2);
2381         return 0;
2382 }
2383
2384
2385 void ip_vs_control_cleanup(void)
2386 {
2387         EnterFunction(2);
2388         ip_vs_trash_cleanup();
2389         cancel_rearming_delayed_work(&defense_work);
2390         ip_vs_kill_estimator(&ip_vs_stats);
2391         unregister_sysctl_table(sysctl_header);
2392         proc_net_remove("ip_vs_stats");
2393         proc_net_remove("ip_vs");
2394         nf_unregister_sockopt(&ip_vs_sockopts);
2395         LeaveFunction(2);
2396 }