1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * Changes:
18  *
19  */
20
21 #include <linux/module.h>
22 #include <linux/init.h>
23 #include <linux/types.h>
24 #include <linux/capability.h>
25 #include <linux/fs.h>
26 #include <linux/sysctl.h>
27 #include <linux/proc_fs.h>
28 #include <linux/workqueue.h>
29 #include <linux/swap.h>
30 #include <linux/seq_file.h>
31
32 #include <linux/netfilter.h>
33 #include <linux/netfilter_ipv4.h>
34 #include <linux/mutex.h>
35
36 #include <net/net_namespace.h>
37 #include <net/ip.h>
38 #include <net/route.h>
39 #include <net/sock.h>
40
41 #include <asm/uaccess.h>
42
43 #include <net/ip_vs.h>
44
45 /* mutex for IPVS sockopts; [gs]etsockopt may sleep */
46 static DEFINE_MUTEX(__ip_vs_mutex);
47
48 /* lock for service table */
49 static DEFINE_RWLOCK(__ip_vs_svc_lock);
50
51 /* lock for table with the real services */
52 static DEFINE_RWLOCK(__ip_vs_rs_lock);
53
54 /* lock for state and timeout tables */
55 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
56
57 /* lock for drop entry handling */
58 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
59
60 /* lock for drop packet handling */
61 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
62
63 /* 1/rate drop and drop-entry variables */
64 int ip_vs_drop_rate = 0;
65 int ip_vs_drop_counter = 0;
66 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
67
68 /* number of virtual services */
69 static int ip_vs_num_services = 0;
70
71 /* sysctl variables */
72 static int sysctl_ip_vs_drop_entry = 0;
73 static int sysctl_ip_vs_drop_packet = 0;
74 static int sysctl_ip_vs_secure_tcp = 0;
75 static int sysctl_ip_vs_amemthresh = 1024;
76 static int sysctl_ip_vs_am_droprate = 10;
77 int sysctl_ip_vs_cache_bypass = 0;
78 int sysctl_ip_vs_expire_nodest_conn = 0;
79 int sysctl_ip_vs_expire_quiescent_template = 0;
80 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
81 int sysctl_ip_vs_nat_icmp_send = 0;
82
83
84 #ifdef CONFIG_IP_VS_DEBUG
85 static int sysctl_ip_vs_debug_level = 0;
86
87 int ip_vs_get_debug_level(void)
88 {
89         return sysctl_ip_vs_debug_level;
90 }
91 #endif
92
93 /*
94  *      update_defense_level is called from keventd and from sysctl,
95  *      so it needs to protect itself from softirqs
96  */
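/*
 *      Note on the defense modes handled below: for drop_entry, drop_packet
 *      and secure_tcp a sysctl value of 0 means never defend and 3 means
 *      always defend; 1 enables the defense only under memory pressure
 *      (the value is then advanced to 2), and 2 falls back to 1 once
 *      enough memory is available again.
 */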
97 static void update_defense_level(void)
98 {
99         struct sysinfo i;
100         static int old_secure_tcp = 0;
101         int availmem;
102         int nomem;
103         int to_change = -1;
104
105         /* we only count free and buffered memory (in pages) */
106         si_meminfo(&i);
107         availmem = i.freeram + i.bufferram;
108         /* however, in linux 2.5 i.bufferram is the total page cache size,
109            so we need to adjust it */
110         /* si_swapinfo(&i); */
111         /* availmem = availmem - (i.totalswap - i.freeswap); */
112
113         nomem = (availmem < sysctl_ip_vs_amemthresh);
114
115         local_bh_disable();
116
117         /* drop_entry */
118         spin_lock(&__ip_vs_dropentry_lock);
119         switch (sysctl_ip_vs_drop_entry) {
120         case 0:
121                 atomic_set(&ip_vs_dropentry, 0);
122                 break;
123         case 1:
124                 if (nomem) {
125                         atomic_set(&ip_vs_dropentry, 1);
126                         sysctl_ip_vs_drop_entry = 2;
127                 } else {
128                         atomic_set(&ip_vs_dropentry, 0);
129                 }
130                 break;
131         case 2:
132                 if (nomem) {
133                         atomic_set(&ip_vs_dropentry, 1);
134                 } else {
135                         atomic_set(&ip_vs_dropentry, 0);
136                         sysctl_ip_vs_drop_entry = 1;
137                 }
138                 break;
139         case 3:
140                 atomic_set(&ip_vs_dropentry, 1);
141                 break;
142         }
143         spin_unlock(&__ip_vs_dropentry_lock);
144
145         /* drop_packet */
146         spin_lock(&__ip_vs_droppacket_lock);
147         switch (sysctl_ip_vs_drop_packet) {
148         case 0:
149                 ip_vs_drop_rate = 0;
150                 break;
151         case 1:
152                 if (nomem) {
153                         ip_vs_drop_rate = ip_vs_drop_counter
154                                 = sysctl_ip_vs_amemthresh /
155                                 (sysctl_ip_vs_amemthresh-availmem);
156                         sysctl_ip_vs_drop_packet = 2;
157                 } else {
158                         ip_vs_drop_rate = 0;
159                 }
160                 break;
161         case 2:
162                 if (nomem) {
163                         ip_vs_drop_rate = ip_vs_drop_counter
164                                 = sysctl_ip_vs_amemthresh /
165                                 (sysctl_ip_vs_amemthresh-availmem);
166                 } else {
167                         ip_vs_drop_rate = 0;
168                         sysctl_ip_vs_drop_packet = 1;
169                 }
170                 break;
171         case 3:
172                 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
173                 break;
174         }
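        /*
         * Worked example (illustrative only): with amemthresh = 1024 and
         * availmem = 512, the computation above gives
         * ip_vs_drop_rate = 1024 / (1024 - 512) = 2, i.e. roughly one of
         * every two packets is dropped while memory stays low.
         */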
175         spin_unlock(&__ip_vs_droppacket_lock);
176
177         /* secure_tcp */
178         write_lock(&__ip_vs_securetcp_lock);
179         switch (sysctl_ip_vs_secure_tcp) {
180         case 0:
181                 if (old_secure_tcp >= 2)
182                         to_change = 0;
183                 break;
184         case 1:
185                 if (nomem) {
186                         if (old_secure_tcp < 2)
187                                 to_change = 1;
188                         sysctl_ip_vs_secure_tcp = 2;
189                 } else {
190                         if (old_secure_tcp >= 2)
191                                 to_change = 0;
192                 }
193                 break;
194         case 2:
195                 if (nomem) {
196                         if (old_secure_tcp < 2)
197                                 to_change = 1;
198                 } else {
199                         if (old_secure_tcp >= 2)
200                                 to_change = 0;
201                         sysctl_ip_vs_secure_tcp = 1;
202                 }
203                 break;
204         case 3:
205                 if (old_secure_tcp < 2)
206                         to_change = 1;
207                 break;
208         }
209         old_secure_tcp = sysctl_ip_vs_secure_tcp;
210         if (to_change >= 0)
211                 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
212         write_unlock(&__ip_vs_securetcp_lock);
213
214         local_bh_enable();
215 }
216
217
218 /*
219  *      Timer for checking the defense
220  */
221 #define DEFENSE_TIMER_PERIOD    (1*HZ)
222 static void defense_work_handler(struct work_struct *work);
223 static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
224
225 static void defense_work_handler(struct work_struct *work)
226 {
227         update_defense_level();
228         if (atomic_read(&ip_vs_dropentry))
229                 ip_vs_random_dropentry();
230
231         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
232 }
233
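/*
 *      Module use counting: taken when a virtual service is added and
 *      released when it is deleted, so the ipvs module cannot be
 *      unloaded while services are still configured.
 */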
234 int
235 ip_vs_use_count_inc(void)
236 {
237         return try_module_get(THIS_MODULE);
238 }
239
240 void
241 ip_vs_use_count_dec(void)
242 {
243         module_put(THIS_MODULE);
244 }
245
246
247 /*
248  *      Hash table: for virtual service lookups
249  */
250 #define IP_VS_SVC_TAB_BITS 8
251 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
252 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
253
254 /* the service table hashed by <protocol, addr, port> */
255 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
256 /* the service table hashed by fwmark */
257 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
258
259 /*
260  *      Hash table: for real service lookups
261  */
262 #define IP_VS_RTAB_BITS 4
263 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
264 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
265
266 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
267
268 /*
269  *      Trash for destinations
270  */
271 static LIST_HEAD(ip_vs_dest_trash);
272
273 /*
274  *      FTP & NULL virtual service counters
275  */
276 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
277 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
278
279
280 /*
281  *      Returns hash value for virtual service
282  */
283 static __inline__ unsigned
284 ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port)
285 {
286         register unsigned porth = ntohs(port);
287
288         return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
289                 & IP_VS_SVC_TAB_MASK;
290 }
291
292 /*
293  *      Returns hash value of fwmark for virtual service lookup
294  */
295 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
296 {
297         return fwmark & IP_VS_SVC_TAB_MASK;
298 }
299
300 /*
301  *      Hashes a service in the ip_vs_svc_table by <proto,addr,port>
302  *      or in the ip_vs_svc_fwm_table by fwmark.
303  *      Should be called with locked tables.
304  */
305 static int ip_vs_svc_hash(struct ip_vs_service *svc)
306 {
307         unsigned hash;
308
309         if (svc->flags & IP_VS_SVC_F_HASHED) {
310                 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
311                           "called from %p\n", __builtin_return_address(0));
312                 return 0;
313         }
314
315         if (svc->fwmark == 0) {
316                 /*
317                  *  Hash it by <protocol,addr,port> in ip_vs_svc_table
318                  */
319                 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
320                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
321         } else {
322                 /*
323                  *  Hash it by fwmark in ip_vs_svc_fwm_table
324                  */
325                 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
326                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
327         }
328
329         svc->flags |= IP_VS_SVC_F_HASHED;
330         /* increase its refcnt because it is referenced by the svc table */
331         atomic_inc(&svc->refcnt);
332         return 1;
333 }
334
335
336 /*
337  *      Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
338  *      Should be called with locked tables.
339  */
340 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
341 {
342         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
343                 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
344                           "called from %p\n", __builtin_return_address(0));
345                 return 0;
346         }
347
348         if (svc->fwmark == 0) {
349                 /* Remove it from the ip_vs_svc_table table */
350                 list_del(&svc->s_list);
351         } else {
352                 /* Remove it from the ip_vs_svc_fwm_table table */
353                 list_del(&svc->f_list);
354         }
355
356         svc->flags &= ~IP_VS_SVC_F_HASHED;
357         atomic_dec(&svc->refcnt);
358         return 1;
359 }
360
361
362 /*
363  *      Get service by {proto,addr,port} in the service table.
364  */
365 static __inline__ struct ip_vs_service *
366 __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
367 {
368         unsigned hash;
369         struct ip_vs_service *svc;
370
371         /* Check for "full" addressed entries */
372         hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
373
374         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
375                 if ((svc->addr == vaddr)
376                     && (svc->port == vport)
377                     && (svc->protocol == protocol)) {
378                         /* HIT */
379                         atomic_inc(&svc->usecnt);
380                         return svc;
381                 }
382         }
383
384         return NULL;
385 }
386
387
388 /*
389  *      Get service by {fwmark} in the service table.
390  */
391 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
392 {
393         unsigned hash;
394         struct ip_vs_service *svc;
395
396         /* Check for fwmark addressed entries */
397         hash = ip_vs_svc_fwm_hashkey(fwmark);
398
399         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
400                 if (svc->fwmark == fwmark) {
401                         /* HIT */
402                         atomic_inc(&svc->usecnt);
403                         return svc;
404                 }
405         }
406
407         return NULL;
408 }
409
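/*
 *      Lookup a virtual service: try the fwmark table first, then the
 *      <protocol,addr,port> table, then fall back to the FTP service
 *      (the packet may belong to an FTP data connection) and finally to
 *      the port-zero catch-all service.  On a hit the service usecnt is
 *      increased, so the caller must release it with ip_vs_service_put().
 */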
410 struct ip_vs_service *
411 ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
412 {
413         struct ip_vs_service *svc;
414
415         read_lock(&__ip_vs_svc_lock);
416
417         /*
418          *      Check the table hashed by fwmark first
419          */
420         if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
421                 goto out;
422
423         /*
424          *      Check the table hashed by <protocol,addr,port>
425          *      for "full" addressed entries
426          */
427         svc = __ip_vs_service_get(protocol, vaddr, vport);
428
429         if (svc == NULL
430             && protocol == IPPROTO_TCP
431             && atomic_read(&ip_vs_ftpsvc_counter)
432             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
433                 /*
434                  * Check if an FTP service entry exists; the packet
435                  * might belong to an FTP data connection.
436                  */
437                 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
438         }
439
440         if (svc == NULL
441             && atomic_read(&ip_vs_nullsvc_counter)) {
442                 /*
443                  * Check if the catch-all port (port zero) exists
444                  */
445                 svc = __ip_vs_service_get(protocol, vaddr, 0);
446         }
447
448   out:
449         read_unlock(&__ip_vs_svc_lock);
450
451         IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
452                   fwmark, ip_vs_proto_name(protocol),
453                   NIPQUAD(vaddr), ntohs(vport),
454                   svc?"hit":"not hit");
455
456         return svc;
457 }
458
459
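/*
 *      Bind/unbind a destination to its virtual service.  Binding takes a
 *      reference on the service; unbinding drops it and frees the service
 *      once the last reference is gone.
 */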
460 static inline void
461 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
462 {
463         atomic_inc(&svc->refcnt);
464         dest->svc = svc;
465 }
466
467 static inline void
468 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
469 {
470         struct ip_vs_service *svc = dest->svc;
471
472         dest->svc = NULL;
473         if (atomic_dec_and_test(&svc->refcnt))
474                 kfree(svc);
475 }
476
477
478 /*
479  *      Returns hash value for real service
480  */
481 static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port)
482 {
483         register unsigned porth = ntohs(port);
484
485         return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
486                 & IP_VS_RTAB_MASK;
487 }
488
489 /*
490  *      Hashes ip_vs_dest in ip_vs_rtable by <addr,port>.
491  *      Should be called with locked tables.
492  */
493 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
494 {
495         unsigned hash;
496
497         if (!list_empty(&dest->d_list)) {
498                 return 0;
499         }
500
501         /*
502          *      Hash by proto,addr,port,
503          *      which are the parameters of the real service.
504          */
505         hash = ip_vs_rs_hashkey(dest->addr, dest->port);
506         list_add(&dest->d_list, &ip_vs_rtable[hash]);
507
508         return 1;
509 }
510
511 /*
512  *      Unhashes ip_vs_dest from ip_vs_rtable.
513  *      Should be called with locked tables.
514  */
515 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
516 {
517         /*
518          * Remove it from the ip_vs_rtable table.
519          */
520         if (!list_empty(&dest->d_list)) {
521                 list_del(&dest->d_list);
522                 INIT_LIST_HEAD(&dest->d_list);
523         }
524
525         return 1;
526 }
527
528 /*
529  *      Lookup real service by <proto,addr,port> in the real service table.
530  */
531 struct ip_vs_dest *
532 ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
533 {
534         unsigned hash;
535         struct ip_vs_dest *dest;
536
537         /*
538          *      Check for "full" addressed entries
539          *      Return the first found entry
540          */
541         hash = ip_vs_rs_hashkey(daddr, dport);
542
543         read_lock(&__ip_vs_rs_lock);
544         list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
545                 if ((dest->addr == daddr)
546                     && (dest->port == dport)
547                     && ((dest->protocol == protocol) ||
548                         dest->vfwmark)) {
549                         /* HIT */
550                         read_unlock(&__ip_vs_rs_lock);
551                         return dest;
552                 }
553         }
554         read_unlock(&__ip_vs_rs_lock);
555
556         return NULL;
557 }
558
559 /*
560  *      Lookup destination by {addr,port} in the given service
561  */
562 static struct ip_vs_dest *
563 ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
564 {
565         struct ip_vs_dest *dest;
566
567         /*
568          * Find the destination for the given service
569          */
570         list_for_each_entry(dest, &svc->destinations, n_list) {
571                 if ((dest->addr == daddr) && (dest->port == dport)) {
572                         /* HIT */
573                         return dest;
574                 }
575         }
576
577         return NULL;
578 }
579
580 /*
581  * Find destination by {daddr,dport,vaddr,protocol}
582  * Created to be used in ip_vs_process_message() in
583  * the backup synchronization daemon. It finds the
584  * destination to be bound to the received connection
585  * on the backup.
586  *
587  * ip_vs_lookup_real_service() looked promising, but
588  * does not seem to work as expected.
589  */
590 struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport,
591                                     __be32 vaddr, __be16 vport, __u16 protocol)
592 {
593         struct ip_vs_dest *dest;
594         struct ip_vs_service *svc;
595
596         svc = ip_vs_service_get(0, protocol, vaddr, vport);
597         if (!svc)
598                 return NULL;
599         dest = ip_vs_lookup_dest(svc, daddr, dport);
600         if (dest)
601                 atomic_inc(&dest->refcnt);
602         ip_vs_service_put(svc);
603         return dest;
604 }
605
606 /*
607  *  Lookup dest by {svc,addr,port} in the destination trash.
608  *  The destination trash is used to hold the destinations that are removed
609  *  from the service table but are still referenced by some conn entries.
610  *  The trash exists because, when a dest is temporarily taken
611  *  down (either by the administrator or by a monitor program), it can be
612  *  picked back from the trash, the remaining connections to it can
613  *  continue, and its counting information is still useful for
614  *  scheduling.
615  */
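/*
 *  While scanning the trash, entries whose refcnt has dropped to 1 (only
 *  the trash itself references them) are freed on the fly; anything left
 *  over is released by ip_vs_trash_cleanup() at module cleanup time.
 */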
616 static struct ip_vs_dest *
617 ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
618 {
619         struct ip_vs_dest *dest, *nxt;
620
621         /*
622          * Find the destination in trash
623          */
624         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
625                 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
626                           "dest->refcnt=%d\n",
627                           dest->vfwmark,
628                           NIPQUAD(dest->addr), ntohs(dest->port),
629                           atomic_read(&dest->refcnt));
630                 if (dest->addr == daddr &&
631                     dest->port == dport &&
632                     dest->vfwmark == svc->fwmark &&
633                     dest->protocol == svc->protocol &&
634                     (svc->fwmark ||
635                      (dest->vaddr == svc->addr &&
636                       dest->vport == svc->port))) {
637                         /* HIT */
638                         return dest;
639                 }
640
641                 /*
642                  * Try to purge the destination from trash if not referenced
643                  */
644                 if (atomic_read(&dest->refcnt) == 1) {
645                         IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
646                                   "from trash\n",
647                                   dest->vfwmark,
648                                   NIPQUAD(dest->addr), ntohs(dest->port));
649                         list_del(&dest->n_list);
650                         ip_vs_dst_reset(dest);
651                         __ip_vs_unbind_svc(dest);
652                         kfree(dest);
653                 }
654         }
655
656         return NULL;
657 }
658
659
660 /*
661  *  Clean up all the destinations in the trash
662  *  Called by ip_vs_control_cleanup()
663  *
664  *  When ip_vs_control_cleanup() runs at ipvs module exit,
665  *  the service tables must have been flushed and all the connections
666  *  must have expired, so the refcnt of each destination in the trash
667  *  is 1 and we simply release them here.
668  */
669 static void ip_vs_trash_cleanup(void)
670 {
671         struct ip_vs_dest *dest, *nxt;
672
673         list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
674                 list_del(&dest->n_list);
675                 ip_vs_dst_reset(dest);
676                 __ip_vs_unbind_svc(dest);
677                 kfree(dest);
678         }
679 }
680
681
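/*
 *      Reset all counters and rate estimates in one stats block
 */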
682 static void
683 ip_vs_zero_stats(struct ip_vs_stats *stats)
684 {
685         spin_lock_bh(&stats->lock);
686
687         stats->conns = 0;
688         stats->inpkts = 0;
689         stats->outpkts = 0;
690         stats->inbytes = 0;
691         stats->outbytes = 0;
692
693         stats->cps = 0;
694         stats->inpps = 0;
695         stats->outpps = 0;
696         stats->inbps = 0;
697         stats->outbps = 0;
698
699         ip_vs_zero_estimator(stats);
700
701         spin_unlock_bh(&stats->lock);
702 }
703
704 /*
705  *      Update a destination in the given service
706  */
707 static void
708 __ip_vs_update_dest(struct ip_vs_service *svc,
709                     struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
710 {
711         int conn_flags;
712
713         /* set the weight and the flags */
714         atomic_set(&dest->weight, udest->weight);
715         conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
716
717         /* check if local node and update the flags */
718         if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) {
719                 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
720                         | IP_VS_CONN_F_LOCALNODE;
721         }
722
723         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
724         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
725                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
726         } else {
727                 /*
728                  *    Put the real service in ip_vs_rtable if not present.
729                  *    For now only for NAT!
730                  */
731                 write_lock_bh(&__ip_vs_rs_lock);
732                 ip_vs_rs_hash(dest);
733                 write_unlock_bh(&__ip_vs_rs_lock);
734         }
735         atomic_set(&dest->conn_flags, conn_flags);
736
737         /* bind the service */
738         if (!dest->svc) {
739                 __ip_vs_bind_svc(dest, svc);
740         } else {
741                 if (dest->svc != svc) {
742                         __ip_vs_unbind_svc(dest);
743                         ip_vs_zero_stats(&dest->stats);
744                         __ip_vs_bind_svc(dest, svc);
745                 }
746         }
747
748         /* set the dest status flags */
749         dest->flags |= IP_VS_DEST_F_AVAILABLE;
750
751         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
752                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
753         dest->u_threshold = udest->u_threshold;
754         dest->l_threshold = udest->l_threshold;
755 }
756
757
758 /*
759  *      Create a destination for the given service
760  */
761 static int
762 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
763                struct ip_vs_dest **dest_p)
764 {
765         struct ip_vs_dest *dest;
766         unsigned atype;
767
768         EnterFunction(2);
769
770         atype = inet_addr_type(&init_net, udest->addr);
771         if (atype != RTN_LOCAL && atype != RTN_UNICAST)
772                 return -EINVAL;
773
774         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
775         if (dest == NULL) {
776                 IP_VS_ERR("ip_vs_new_dest: kzalloc failed.\n");
777                 return -ENOMEM;
778         }
779
780         dest->protocol = svc->protocol;
781         dest->vaddr = svc->addr;
782         dest->vport = svc->port;
783         dest->vfwmark = svc->fwmark;
784         dest->addr = udest->addr;
785         dest->port = udest->port;
786
787         atomic_set(&dest->activeconns, 0);
788         atomic_set(&dest->inactconns, 0);
789         atomic_set(&dest->persistconns, 0);
790         atomic_set(&dest->refcnt, 0);
791
792         INIT_LIST_HEAD(&dest->d_list);
793         spin_lock_init(&dest->dst_lock);
794         spin_lock_init(&dest->stats.lock);
795         __ip_vs_update_dest(svc, dest, udest);
796         ip_vs_new_estimator(&dest->stats);
797
798         *dest_p = dest;
799
800         LeaveFunction(2);
801         return 0;
802 }
803
804
805 /*
806  *      Add a destination into an existing service
807  */
808 static int
809 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
810 {
811         struct ip_vs_dest *dest;
812         __be32 daddr = udest->addr;
813         __be16 dport = udest->port;
814         int ret;
815
816         EnterFunction(2);
817
818         if (udest->weight < 0) {
819                 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
820                 return -ERANGE;
821         }
822
823         if (udest->l_threshold > udest->u_threshold) {
824                 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
825                           "upper threshold\n");
826                 return -ERANGE;
827         }
828
829         /*
830          * Check if the dest already exists in the list
831          */
832         dest = ip_vs_lookup_dest(svc, daddr, dport);
833         if (dest != NULL) {
834                 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
835                 return -EEXIST;
836         }
837
838         /*
839          * Check if the dest already exists in the trash and
840          * is from the same service
841          */
842         dest = ip_vs_trash_get_dest(svc, daddr, dport);
843         if (dest != NULL) {
844                 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
845                           "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
846                           NIPQUAD(daddr), ntohs(dport),
847                           atomic_read(&dest->refcnt),
848                           dest->vfwmark,
849                           NIPQUAD(dest->vaddr),
850                           ntohs(dest->vport));
851                 __ip_vs_update_dest(svc, dest, udest);
852
853                 /*
854                  * Get the destination from the trash
855                  */
856                 list_del(&dest->n_list);
857
858                 ip_vs_new_estimator(&dest->stats);
859
860                 write_lock_bh(&__ip_vs_svc_lock);
861
862                 /*
863                  * Wait until all other svc users go away.
864                  */
865                 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
866
867                 list_add(&dest->n_list, &svc->destinations);
868                 svc->num_dests++;
869
870                 /* call the update_service function of its scheduler */
871                 svc->scheduler->update_service(svc);
872
873                 write_unlock_bh(&__ip_vs_svc_lock);
874                 return 0;
875         }
876
877         /*
878          * Allocate and initialize the dest structure
879          */
880         ret = ip_vs_new_dest(svc, udest, &dest);
881         if (ret) {
882                 return ret;
883         }
884
885         /*
886          * Add the dest entry into the list
887          */
888         atomic_inc(&dest->refcnt);
889
890         write_lock_bh(&__ip_vs_svc_lock);
891
892         /*
893          * Wait until all other svc users go away.
894          */
895         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
896
897         list_add(&dest->n_list, &svc->destinations);
898         svc->num_dests++;
899
900         /* call the update_service function of its scheduler */
901         svc->scheduler->update_service(svc);
902
903         write_unlock_bh(&__ip_vs_svc_lock);
904
905         LeaveFunction(2);
906
907         return 0;
908 }
909
910
911 /*
912  *      Edit a destination in the given service
913  */
914 static int
915 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
916 {
917         struct ip_vs_dest *dest;
918         __be32 daddr = udest->addr;
919         __be16 dport = udest->port;
920
921         EnterFunction(2);
922
923         if (udest->weight < 0) {
924                 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
925                 return -ERANGE;
926         }
927
928         if (udest->l_threshold > udest->u_threshold) {
929                 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
930                           "upper threshold\n");
931                 return -ERANGE;
932         }
933
934         /*
935          *  Lookup the destination list
936          */
937         dest = ip_vs_lookup_dest(svc, daddr, dport);
938         if (dest == NULL) {
939                 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
940                 return -ENOENT;
941         }
942
943         __ip_vs_update_dest(svc, dest, udest);
944
945         write_lock_bh(&__ip_vs_svc_lock);
946
947         /* Wait until all other svc users go away */
948         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
949
950         /* call the update_service, because server weight may be changed */
951         svc->scheduler->update_service(svc);
952
953         write_unlock_bh(&__ip_vs_svc_lock);
954
955         LeaveFunction(2);
956
957         return 0;
958 }
959
960
961 /*
962  *      Delete a destination (must be already unlinked from the service)
963  */
964 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
965 {
966         ip_vs_kill_estimator(&dest->stats);
967
968         /*
969          *  Remove it from the doubly-linked list of real services.
970          */
971         write_lock_bh(&__ip_vs_rs_lock);
972         ip_vs_rs_unhash(dest);
973         write_unlock_bh(&__ip_vs_rs_lock);
974
975         /*
976          *  Decrease the refcnt of the dest, and free the dest
977          *  if nobody refers to it (refcnt=0). Otherwise, throw
978          *  the destination into the trash.
979          */
980         if (atomic_dec_and_test(&dest->refcnt)) {
981                 ip_vs_dst_reset(dest);
982                 /* simply decrease svc->refcnt here, let the caller check
983                    and release the service if nobody refers to it.
984                    Only user context can release destination and service,
985                    and only one user context can update virtual service at a
986                    time, so the operation here is OK */
987                 atomic_dec(&dest->svc->refcnt);
988                 kfree(dest);
989         } else {
990                 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
991                           "dest->refcnt=%d\n",
992                           NIPQUAD(dest->addr), ntohs(dest->port),
993                           atomic_read(&dest->refcnt));
994                 list_add(&dest->n_list, &ip_vs_dest_trash);
995                 atomic_inc(&dest->refcnt);
996         }
997 }
998
999
1000 /*
1001  *      Unlink a destination from the given service
1002  */
1003 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1004                                 struct ip_vs_dest *dest,
1005                                 int svcupd)
1006 {
1007         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1008
1009         /*
1010          *  Remove it from the doubly-linked destination list.
1011          */
1012         list_del(&dest->n_list);
1013         svc->num_dests--;
1014         if (svcupd) {
1015                 /*
1016                  *  Call the update_service function of its scheduler
1017                  */
1018                 svc->scheduler->update_service(svc);
1019         }
1020 }
1021
1022
1023 /*
1024  *      Delete a destination server in the given service
1025  */
1026 static int
1027 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
1028 {
1029         struct ip_vs_dest *dest;
1030         __be32 daddr = udest->addr;
1031         __be16 dport = udest->port;
1032
1033         EnterFunction(2);
1034
1035         dest = ip_vs_lookup_dest(svc, daddr, dport);
1036         if (dest == NULL) {
1037                 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1038                 return -ENOENT;
1039         }
1040
1041         write_lock_bh(&__ip_vs_svc_lock);
1042
1043         /*
1044          *      Wait until all other svc users go away.
1045          */
1046         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1047
1048         /*
1049          *      Unlink dest from the service
1050          */
1051         __ip_vs_unlink_dest(svc, dest, 1);
1052
1053         write_unlock_bh(&__ip_vs_svc_lock);
1054
1055         /*
1056          *      Delete the destination
1057          */
1058         __ip_vs_del_dest(dest);
1059
1060         LeaveFunction(2);
1061
1062         return 0;
1063 }
1064
1065
1066 /*
1067  *      Add a service into the service hash table
1068  */
1069 static int
1070 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1071 {
1072         int ret = 0;
1073         struct ip_vs_scheduler *sched = NULL;
1074         struct ip_vs_service *svc = NULL;
1075
1076         /* increase the module use count */
1077         ip_vs_use_count_inc();
1078
1079         /* Lookup the scheduler by 'u->sched_name' */
1080         sched = ip_vs_scheduler_get(u->sched_name);
1081         if (sched == NULL) {
1082                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1083                            u->sched_name);
1084                 ret = -ENOENT;
1085                 goto out_mod_dec;
1086         }
1087
1088         svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1089         if (svc == NULL) {
1090                 IP_VS_DBG(1, "ip_vs_add_service: kzalloc failed.\n");
1091                 ret = -ENOMEM;
1092                 goto out_err;
1093         }
1094
1095         /* I'm the first user of the service */
1096         atomic_set(&svc->usecnt, 1);
1097         atomic_set(&svc->refcnt, 0);
1098
1099         svc->protocol = u->protocol;
1100         svc->addr = u->addr;
1101         svc->port = u->port;
1102         svc->fwmark = u->fwmark;
1103         svc->flags = u->flags;
1104         svc->timeout = u->timeout * HZ;
1105         svc->netmask = u->netmask;
1106
1107         INIT_LIST_HEAD(&svc->destinations);
1108         rwlock_init(&svc->sched_lock);
1109         spin_lock_init(&svc->stats.lock);
1110
1111         /* Bind the scheduler */
1112         ret = ip_vs_bind_scheduler(svc, sched);
1113         if (ret)
1114                 goto out_err;
1115         sched = NULL;
1116
1117         /* Update the virtual service counters */
1118         if (svc->port == FTPPORT)
1119                 atomic_inc(&ip_vs_ftpsvc_counter);
1120         else if (svc->port == 0)
1121                 atomic_inc(&ip_vs_nullsvc_counter);
1122
1123         ip_vs_new_estimator(&svc->stats);
1124         ip_vs_num_services++;
1125
1126         /* Hash the service into the service table */
1127         write_lock_bh(&__ip_vs_svc_lock);
1128         ip_vs_svc_hash(svc);
1129         write_unlock_bh(&__ip_vs_svc_lock);
1130
1131         *svc_p = svc;
1132         return 0;
1133
1134   out_err:
1135         if (svc != NULL) {
1136                 if (svc->scheduler)
1137                         ip_vs_unbind_scheduler(svc);
1138                 if (svc->inc) {
1139                         local_bh_disable();
1140                         ip_vs_app_inc_put(svc->inc);
1141                         local_bh_enable();
1142                 }
1143                 kfree(svc);
1144         }
1145         ip_vs_scheduler_put(sched);
1146
1147   out_mod_dec:
1148         /* decrease the module use count */
1149         ip_vs_use_count_dec();
1150
1151         return ret;
1152 }
1153
1154
1155 /*
1156  *      Edit a service and bind it with a new scheduler
1157  */
1158 static int
1159 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1160 {
1161         struct ip_vs_scheduler *sched, *old_sched;
1162         int ret = 0;
1163
1164         /*
1165          * Lookup the scheduler, by 'u->sched_name'
1166          */
1167         sched = ip_vs_scheduler_get(u->sched_name);
1168         if (sched == NULL) {
1169                 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1170                            u->sched_name);
1171                 return -ENOENT;
1172         }
1173         old_sched = sched;
1174
1175         write_lock_bh(&__ip_vs_svc_lock);
1176
1177         /*
1178          * Wait until all other svc users go away.
1179          */
1180         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1181
1182         /*
1183          * Set the flags and timeout value
1184          */
1185         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1186         svc->timeout = u->timeout * HZ;
1187         svc->netmask = u->netmask;
1188
1189         old_sched = svc->scheduler;
1190         if (sched != old_sched) {
1191                 /*
1192                  * Unbind the old scheduler
1193                  */
1194                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1195                         old_sched = sched;
1196                         goto out;
1197                 }
1198
1199                 /*
1200                  * Bind the new scheduler
1201                  */
1202                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1203                         /*
1204                          * If ip_vs_bind_scheduler fails, restore the old
1205                          * scheduler.
1206                          * The most likely reason for failure is lack of memory.
1207                          *
1208                          * The question is whether the old scheduler can
1209                          * always be restored. TODO: if at some point it
1210                          * cannot be restored, we must delete the service,
1211                          * otherwise the system may crash.
1212                          */
1213                         ip_vs_bind_scheduler(svc, old_sched);
1214                         old_sched = sched;
1215                         goto out;
1216                 }
1217         }
1218
1219   out:
1220         write_unlock_bh(&__ip_vs_svc_lock);
1221
1222         if (old_sched)
1223                 ip_vs_scheduler_put(old_sched);
1224
1225         return ret;
1226 }
1227
1228
1229 /*
1230  *      Delete a service from the service list
1231  *      - The service must be unlinked, unlocked and not referenced!
1232  *      - We are called under _bh lock
1233  */
1234 static void __ip_vs_del_service(struct ip_vs_service *svc)
1235 {
1236         struct ip_vs_dest *dest, *nxt;
1237         struct ip_vs_scheduler *old_sched;
1238
1239         ip_vs_num_services--;
1240         ip_vs_kill_estimator(&svc->stats);
1241
1242         /* Unbind scheduler */
1243         old_sched = svc->scheduler;
1244         ip_vs_unbind_scheduler(svc);
1245         if (old_sched)
1246                 ip_vs_scheduler_put(old_sched);
1247
1248         /* Unbind app inc */
1249         if (svc->inc) {
1250                 ip_vs_app_inc_put(svc->inc);
1251                 svc->inc = NULL;
1252         }
1253
1254         /*
1255          *    Unlink the whole destination list
1256          */
1257         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1258                 __ip_vs_unlink_dest(svc, dest, 0);
1259                 __ip_vs_del_dest(dest);
1260         }
1261
1262         /*
1263          *    Update the virtual service counters
1264          */
1265         if (svc->port == FTPPORT)
1266                 atomic_dec(&ip_vs_ftpsvc_counter);
1267         else if (svc->port == 0)
1268                 atomic_dec(&ip_vs_nullsvc_counter);
1269
1270         /*
1271          *    Free the service if nobody refers to it
1272          */
1273         if (atomic_read(&svc->refcnt) == 0)
1274                 kfree(svc);
1275
1276         /* decrease the module use count */
1277         ip_vs_use_count_dec();
1278 }
1279
1280 /*
1281  *      Delete a service from the service list
1282  */
1283 static int ip_vs_del_service(struct ip_vs_service *svc)
1284 {
1285         if (svc == NULL)
1286                 return -EEXIST;
1287
1288         /*
1289          * Unhash it from the service table
1290          */
1291         write_lock_bh(&__ip_vs_svc_lock);
1292
1293         ip_vs_svc_unhash(svc);
1294
1295         /*
1296          * Wait until all the svc users go away.
1297          */
1298         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1299
1300         __ip_vs_del_service(svc);
1301
1302         write_unlock_bh(&__ip_vs_svc_lock);
1303
1304         return 0;
1305 }
1306
1307
1308 /*
1309  *      Flush all the virtual services
1310  */
1311 static int ip_vs_flush(void)
1312 {
1313         int idx;
1314         struct ip_vs_service *svc, *nxt;
1315
1316         /*
1317          * Flush the service table hashed by <protocol,addr,port>
1318          */
1319         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1320                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1321                         write_lock_bh(&__ip_vs_svc_lock);
1322                         ip_vs_svc_unhash(svc);
1323                         /*
1324                          * Wait until all the svc users go away.
1325                          */
1326                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1327                         __ip_vs_del_service(svc);
1328                         write_unlock_bh(&__ip_vs_svc_lock);
1329                 }
1330         }
1331
1332         /*
1333          * Flush the service table hashed by fwmark
1334          */
1335         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1336                 list_for_each_entry_safe(svc, nxt,
1337                                          &ip_vs_svc_fwm_table[idx], f_list) {
1338                         write_lock_bh(&__ip_vs_svc_lock);
1339                         ip_vs_svc_unhash(svc);
1340                         /*
1341                          * Wait until all the svc users go away.
1342                          */
1343                         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1344                         __ip_vs_del_service(svc);
1345                         write_unlock_bh(&__ip_vs_svc_lock);
1346                 }
1347         }
1348
1349         return 0;
1350 }
1351
1352
1353 /*
1354  *      Zero counters in a service or all services
1355  */
1356 static int ip_vs_zero_service(struct ip_vs_service *svc)
1357 {
1358         struct ip_vs_dest *dest;
1359
1360         write_lock_bh(&__ip_vs_svc_lock);
1361         list_for_each_entry(dest, &svc->destinations, n_list) {
1362                 ip_vs_zero_stats(&dest->stats);
1363         }
1364         ip_vs_zero_stats(&svc->stats);
1365         write_unlock_bh(&__ip_vs_svc_lock);
1366         return 0;
1367 }
1368
1369 static int ip_vs_zero_all(void)
1370 {
1371         int idx;
1372         struct ip_vs_service *svc;
1373
1374         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1375                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1376                         ip_vs_zero_service(svc);
1377                 }
1378         }
1379
1380         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1381                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1382                         ip_vs_zero_service(svc);
1383                 }
1384         }
1385
1386         ip_vs_zero_stats(&ip_vs_stats);
1387         return 0;
1388 }
1389
1390
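/*
 *      Sysctl handler for the defense mode variables (drop_entry,
 *      drop_packet, secure_tcp): only values 0..3 are accepted, and any
 *      accepted change triggers an immediate update_defense_level().
 */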
1391 static int
1392 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1393                      void __user *buffer, size_t *lenp, loff_t *ppos)
1394 {
1395         int *valp = table->data;
1396         int val = *valp;
1397         int rc;
1398
1399         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1400         if (write && (*valp != val)) {
1401                 if ((*valp < 0) || (*valp > 3)) {
1402                         /* Restore the correct value */
1403                         *valp = val;
1404                 } else {
1405                         update_defense_level();
1406                 }
1407         }
1408         return rc;
1409 }
1410
1411
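/*
 *      Sysctl handler for sync_threshold: both values must be
 *      non-negative and the first strictly smaller than the second,
 *      otherwise the previous pair is restored.
 */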
1412 static int
1413 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1414                        void __user *buffer, size_t *lenp, loff_t *ppos)
1415 {
1416         int *valp = table->data;
1417         int val[2];
1418         int rc;
1419
1420         /* backup the value first */
1421         memcpy(val, valp, sizeof(val));
1422
1423         rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1424         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1425                 /* Restore the correct value */
1426                 memcpy(valp, val, sizeof(val));
1427         }
1428         return rc;
1429 }
1430
1431
1432 /*
1433  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1434  */
1435
1436 static struct ctl_table vs_vars[] = {
1437         {
1438                 .procname       = "amemthresh",
1439                 .data           = &sysctl_ip_vs_amemthresh,
1440                 .maxlen         = sizeof(int),
1441                 .mode           = 0644,
1442                 .proc_handler   = &proc_dointvec,
1443         },
1444 #ifdef CONFIG_IP_VS_DEBUG
1445         {
1446                 .procname       = "debug_level",
1447                 .data           = &sysctl_ip_vs_debug_level,
1448                 .maxlen         = sizeof(int),
1449                 .mode           = 0644,
1450                 .proc_handler   = &proc_dointvec,
1451         },
1452 #endif
1453         {
1454                 .procname       = "am_droprate",
1455                 .data           = &sysctl_ip_vs_am_droprate,
1456                 .maxlen         = sizeof(int),
1457                 .mode           = 0644,
1458                 .proc_handler   = &proc_dointvec,
1459         },
1460         {
1461                 .procname       = "drop_entry",
1462                 .data           = &sysctl_ip_vs_drop_entry,
1463                 .maxlen         = sizeof(int),
1464                 .mode           = 0644,
1465                 .proc_handler   = &proc_do_defense_mode,
1466         },
1467         {
1468                 .procname       = "drop_packet",
1469                 .data           = &sysctl_ip_vs_drop_packet,
1470                 .maxlen         = sizeof(int),
1471                 .mode           = 0644,
1472                 .proc_handler   = &proc_do_defense_mode,
1473         },
1474         {
1475                 .procname       = "secure_tcp",
1476                 .data           = &sysctl_ip_vs_secure_tcp,
1477                 .maxlen         = sizeof(int),
1478                 .mode           = 0644,
1479                 .proc_handler   = &proc_do_defense_mode,
1480         },
1481 #if 0
1482         {
1483                 .procname       = "timeout_established",
1484                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1485                 .maxlen         = sizeof(int),
1486                 .mode           = 0644,
1487                 .proc_handler   = &proc_dointvec_jiffies,
1488         },
1489         {
1490                 .procname       = "timeout_synsent",
1491                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1492                 .maxlen         = sizeof(int),
1493                 .mode           = 0644,
1494                 .proc_handler   = &proc_dointvec_jiffies,
1495         },
1496         {
1497                 .procname       = "timeout_synrecv",
1498                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1499                 .maxlen         = sizeof(int),
1500                 .mode           = 0644,
1501                 .proc_handler   = &proc_dointvec_jiffies,
1502         },
1503         {
1504                 .procname       = "timeout_finwait",
1505                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1506                 .maxlen         = sizeof(int),
1507                 .mode           = 0644,
1508                 .proc_handler   = &proc_dointvec_jiffies,
1509         },
1510         {
1511                 .procname       = "timeout_timewait",
1512                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1513                 .maxlen         = sizeof(int),
1514                 .mode           = 0644,
1515                 .proc_handler   = &proc_dointvec_jiffies,
1516         },
1517         {
1518                 .procname       = "timeout_close",
1519                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1520                 .maxlen         = sizeof(int),
1521                 .mode           = 0644,
1522                 .proc_handler   = &proc_dointvec_jiffies,
1523         },
1524         {
1525                 .procname       = "timeout_closewait",
1526                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1527                 .maxlen         = sizeof(int),
1528                 .mode           = 0644,
1529                 .proc_handler   = &proc_dointvec_jiffies,
1530         },
1531         {
1532                 .procname       = "timeout_lastack",
1533                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1534                 .maxlen         = sizeof(int),
1535                 .mode           = 0644,
1536                 .proc_handler   = &proc_dointvec_jiffies,
1537         },
1538         {
1539                 .procname       = "timeout_listen",
1540                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1541                 .maxlen         = sizeof(int),
1542                 .mode           = 0644,
1543                 .proc_handler   = &proc_dointvec_jiffies,
1544         },
1545         {
1546                 .procname       = "timeout_synack",
1547                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1548                 .maxlen         = sizeof(int),
1549                 .mode           = 0644,
1550                 .proc_handler   = &proc_dointvec_jiffies,
1551         },
1552         {
1553                 .procname       = "timeout_udp",
1554                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1555                 .maxlen         = sizeof(int),
1556                 .mode           = 0644,
1557                 .proc_handler   = &proc_dointvec_jiffies,
1558         },
1559         {
1560                 .procname       = "timeout_icmp",
1561                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1562                 .maxlen         = sizeof(int),
1563                 .mode           = 0644,
1564                 .proc_handler   = &proc_dointvec_jiffies,
1565         },
1566 #endif
1567         {
1568                 .procname       = "cache_bypass",
1569                 .data           = &sysctl_ip_vs_cache_bypass,
1570                 .maxlen         = sizeof(int),
1571                 .mode           = 0644,
1572                 .proc_handler   = &proc_dointvec,
1573         },
1574         {
1575                 .procname       = "expire_nodest_conn",
1576                 .data           = &sysctl_ip_vs_expire_nodest_conn,
1577                 .maxlen         = sizeof(int),
1578                 .mode           = 0644,
1579                 .proc_handler   = &proc_dointvec,
1580         },
1581         {
1582                 .procname       = "expire_quiescent_template",
1583                 .data           = &sysctl_ip_vs_expire_quiescent_template,
1584                 .maxlen         = sizeof(int),
1585                 .mode           = 0644,
1586                 .proc_handler   = &proc_dointvec,
1587         },
1588         {
1589                 .procname       = "sync_threshold",
1590                 .data           = &sysctl_ip_vs_sync_threshold,
1591                 .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
1592                 .mode           = 0644,
1593                 .proc_handler   = &proc_do_sync_threshold,
1594         },
1595         {
1596                 .procname       = "nat_icmp_send",
1597                 .data           = &sysctl_ip_vs_nat_icmp_send,
1598                 .maxlen         = sizeof(int),
1599                 .mode           = 0644,
1600                 .proc_handler   = &proc_dointvec,
1601         },
1602         { .ctl_name = 0 }
1603 };
1604
1605 const struct ctl_path net_vs_ctl_path[] = {
1606         { .procname = "net", .ctl_name = CTL_NET, },
1607         { .procname = "ipv4", .ctl_name = NET_IPV4, },
1608         { .procname = "vs", },
1609         { }
1610 };
1611 EXPORT_SYMBOL_GPL(net_vs_ctl_path);
1612
1613 static struct ctl_table_header * sysctl_header;
1614
1615 #ifdef CONFIG_PROC_FS
1616
1617 struct ip_vs_iter {
1618         struct list_head *table;
1619         int bucket;
1620 };
1621
1622 /*
1623  *      Write the contents of the VS rule table to a PROCfs file.
1624  *      (It is kept just for backward compatibility)
1625  */
1626 static inline const char *ip_vs_fwd_name(unsigned flags)
1627 {
1628         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1629         case IP_VS_CONN_F_LOCALNODE:
1630                 return "Local";
1631         case IP_VS_CONN_F_TUNNEL:
1632                 return "Tunnel";
1633         case IP_VS_CONN_F_DROUTE:
1634                 return "Route";
1635         default:
1636                 return "Masq";
1637         }
1638 }
1639
1640
1641 /* Get the Nth entry in the two lists */
1642 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1643 {
1644         struct ip_vs_iter *iter = seq->private;
1645         int idx;
1646         struct ip_vs_service *svc;
1647
1648         /* look in hash by protocol */
1649         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1650                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1651                         if (pos-- == 0){
1652                                 iter->table = ip_vs_svc_table;
1653                                 iter->bucket = idx;
1654                                 return svc;
1655                         }
1656                 }
1657         }
1658
1659         /* keep looking in the fwmark hash table */
1660         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1661                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1662                         if (pos-- == 0) {
1663                                 iter->table = ip_vs_svc_fwm_table;
1664                                 iter->bucket = idx;
1665                                 return svc;
1666                         }
1667                 }
1668         }
1669
1670         return NULL;
1671 }
1672
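/*
 * seq_file start: take the service table read lock for the whole dump
 * and return SEQ_START_TOKEN at position 0 so that ->show can emit the
 * header lines first.
 */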
1673 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1674 {
1675
1676         read_lock_bh(&__ip_vs_svc_lock);
1677         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1678 }
1679
1680
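/*
 * Advance to the next service: continue in the current bucket, then in
 * the following buckets of the protocol hash table, and finally fall
 * through to the fwmark hash table.
 */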
1681 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1682 {
1683         struct list_head *e;
1684         struct ip_vs_iter *iter;
1685         struct ip_vs_service *svc;
1686
1687         ++*pos;
1688         if (v == SEQ_START_TOKEN)
1689                 return ip_vs_info_array(seq, 0);
1690
1691         svc = v;
1692         iter = seq->private;
1693
1694         if (iter->table == ip_vs_svc_table) {
1695                 /* next service in table hashed by protocol */
1696                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1697                         return list_entry(e, struct ip_vs_service, s_list);
1698
1699
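                /*
                 * Current bucket exhausted: the loop body below returns
                 * the first entry of the next non-empty bucket, then we
                 * fall through to the fwmark table.
                 */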
1700                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1701                         list_for_each_entry(svc, &ip_vs_svc_table[iter->bucket],
1702                                             s_list) {
1703                                 return svc;
1704                         }
1705                 }
1706
1707                 iter->table = ip_vs_svc_fwm_table;
1708                 iter->bucket = -1;
1709                 goto scan_fwmark;
1710         }
1711
1712         /* next service in the table hashed by fwmark */
1713         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1714                 return list_entry(e, struct ip_vs_service, f_list);
1715
1716  scan_fwmark:
1717         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1718                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1719                                     f_list)
1720                         return svc;
1721         }
1722
1723         return NULL;
1724 }
1725
1726 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1727 {
1728         read_unlock_bh(&__ip_vs_svc_lock);
1729 }
1730
1731
1732 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1733 {
1734         if (v == SEQ_START_TOKEN) {
1735                 seq_printf(seq,
1736                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1737                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1738                 seq_puts(seq,
1739                          "Prot LocalAddress:Port Scheduler Flags\n");
1740                 seq_puts(seq,
1741                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1742         } else {
1743                 const struct ip_vs_service *svc = v;
1744                 const struct ip_vs_iter *iter = seq->private;
1745                 const struct ip_vs_dest *dest;
1746
1747                 if (iter->table == ip_vs_svc_table)
1748                         seq_printf(seq, "%s  %08X:%04X %s ",
1749                                    ip_vs_proto_name(svc->protocol),
1750                                    ntohl(svc->addr),
1751                                    ntohs(svc->port),
1752                                    svc->scheduler->name);
1753                 else
1754                         seq_printf(seq, "FWM  %08X %s ",
1755                                    svc->fwmark, svc->scheduler->name);
1756
1757                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1758                         seq_printf(seq, "persistent %d %08X\n",
1759                                 svc->timeout,
1760                                 ntohl(svc->netmask));
1761                 else
1762                         seq_putc(seq, '\n');
1763
1764                 list_for_each_entry(dest, &svc->destinations, n_list) {
1765                         seq_printf(seq,
1766                                    "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1767                                    ntohl(dest->addr), ntohs(dest->port),
1768                                    ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1769                                    atomic_read(&dest->weight),
1770                                    atomic_read(&dest->activeconns),
1771                                    atomic_read(&dest->inactconns));
1772                 }
1773         }
1774         return 0;
1775 }
1776
1777 static const struct seq_operations ip_vs_info_seq_ops = {
1778         .start = ip_vs_info_seq_start,
1779         .next  = ip_vs_info_seq_next,
1780         .stop  = ip_vs_info_seq_stop,
1781         .show  = ip_vs_info_seq_show,
1782 };
1783
1784 static int ip_vs_info_open(struct inode *inode, struct file *file)
1785 {
1786         return seq_open_private(file, &ip_vs_info_seq_ops,
1787                         sizeof(struct ip_vs_iter));
1788 }
1789
1790 static const struct file_operations ip_vs_info_fops = {
1791         .owner   = THIS_MODULE,
1792         .open    = ip_vs_info_open,
1793         .read    = seq_read,
1794         .llseek  = seq_lseek,
1795         .release = seq_release_private,
1796 };
1797
1798 #endif
1799
1800 struct ip_vs_stats ip_vs_stats = {
1801         .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
1802 };
1803
1804 #ifdef CONFIG_PROC_FS
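/*
 * Dump the global IPVS statistics to /proc/net/ip_vs_stats: first the
 * running totals (connections, packets, bytes), then the current rate
 * estimates, all printed in hexadecimal.
 */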
1805 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1806 {
1807
1808 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1809         seq_puts(seq,
1810                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1811         seq_puts(seq,
1812                  "   Conns  Packets  Packets            Bytes            Bytes\n");
1813
1814         spin_lock_bh(&ip_vs_stats.lock);
1815         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1816                    ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1817                    (unsigned long long) ip_vs_stats.inbytes,
1818                    (unsigned long long) ip_vs_stats.outbytes);
1819
1820 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1821         seq_puts(seq,
1822                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1823         seq_printf(seq, "%8X %8X %8X %16X %16X\n",
1824                         ip_vs_stats.cps,
1825                         ip_vs_stats.inpps,
1826                         ip_vs_stats.outpps,
1827                         ip_vs_stats.inbps,
1828                         ip_vs_stats.outbps);
1829         spin_unlock_bh(&ip_vs_stats.lock);
1830
1831         return 0;
1832 }
1833
1834 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1835 {
1836         return single_open(file, ip_vs_stats_show, NULL);
1837 }
1838
1839 static const struct file_operations ip_vs_stats_fops = {
1840         .owner = THIS_MODULE,
1841         .open = ip_vs_stats_seq_open,
1842         .read = seq_read,
1843         .llseek = seq_lseek,
1844         .release = single_release,
1845 };
1846
1847 #endif
1848
1849 /*
1850  *      Set timeout values for tcp, tcpfin and udp in the per-protocol timeout tables.
1851  */
1852 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1853 {
1854         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1855                   u->tcp_timeout,
1856                   u->tcp_fin_timeout,
1857                   u->udp_timeout);
1858
1859 #ifdef CONFIG_IP_VS_PROTO_TCP
1860         if (u->tcp_timeout) {
1861                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1862                         = u->tcp_timeout * HZ;
1863         }
1864
1865         if (u->tcp_fin_timeout) {
1866                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1867                         = u->tcp_fin_timeout * HZ;
1868         }
1869 #endif
1870
1871 #ifdef CONFIG_IP_VS_PROTO_UDP
1872         if (u->udp_timeout) {
1873                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1874                         = u->udp_timeout * HZ;
1875         }
1876 #endif
1877         return 0;
1878 }
1879
1880
1881 #define SET_CMDID(cmd)          ((cmd) - IP_VS_BASE_CTL)
1882 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
1883 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
1884                                  sizeof(struct ip_vs_dest_user))
1885 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
1886 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
1887 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
1888
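/*
 * Expected argument length for each IP_VS_SO_SET_* command, indexed by
 * its offset from IP_VS_BASE_CTL; used below to validate the length
 * passed in from setsockopt before copying the argument.
 */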
1889 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1890         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
1891         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
1892         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
1893         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
1894         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
1895         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
1896         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
1897         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
1898         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
1899         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
1900         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
1901 };
1902
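/*
 * Handler for all IP_VS_SO_SET_* setsockopt commands (the configuration
 * interface used by ipvsadm).  The argument is validated against
 * set_arglen[], copied in, and dispatched while holding __ip_vs_mutex.
 *
 * Illustrative userspace sketch only (not part of this file): flushing
 * all virtual services through this interface would look roughly like
 *
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *	setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_FLUSH, NULL, 0);
 *
 * which requires CAP_NET_ADMIN; error handling is omitted.
 */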
1903 static int
1904 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1905 {
1906         int ret;
1907         unsigned char arg[MAX_ARG_LEN];
1908         struct ip_vs_service_user *usvc;
1909         struct ip_vs_service *svc;
1910         struct ip_vs_dest_user *udest;
1911
1912         if (!capable(CAP_NET_ADMIN))
1913                 return -EPERM;
1914
1915         if (len != set_arglen[SET_CMDID(cmd)]) {
1916                 IP_VS_ERR("set_ctl: len %u != %u\n",
1917                           len, set_arglen[SET_CMDID(cmd)]);
1918                 return -EINVAL;
1919         }
1920
1921         if (copy_from_user(arg, user, len) != 0)
1922                 return -EFAULT;
1923
1924         /* increase the module use count */
1925         ip_vs_use_count_inc();
1926
1927         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
1928                 ret = -ERESTARTSYS;
1929                 goto out_dec;
1930         }
1931
1932         if (cmd == IP_VS_SO_SET_FLUSH) {
1933                 /* Flush all virtual services */
1934                 ret = ip_vs_flush();
1935                 goto out_unlock;
1936         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1937                 /* Set timeout values for (tcp tcpfin udp) */
1938                 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1939                 goto out_unlock;
1940         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1941                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1942                 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1943                 goto out_unlock;
1944         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1945                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1946                 ret = stop_sync_thread(dm->state);
1947                 goto out_unlock;
1948         }
1949
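        /*
         * For the remaining commands the argument starts with a
         * struct ip_vs_service_user; the *DEST commands append a
         * struct ip_vs_dest_user immediately after it.
         */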
1950         usvc = (struct ip_vs_service_user *)arg;
1951         udest = (struct ip_vs_dest_user *)(usvc + 1);
1952
1953         if (cmd == IP_VS_SO_SET_ZERO) {
1954                 /* if no service address is given, zero the counters of all services */
1955                 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1956                         ret = ip_vs_zero_all();
1957                         goto out_unlock;
1958                 }
1959         }
1960
1961         /* Check for valid protocol: TCP or UDP, even for fwmark != 0 */
1962         if (usvc->protocol != IPPROTO_TCP && usvc->protocol != IPPROTO_UDP) {
1963                 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1964                           usvc->protocol, NIPQUAD(usvc->addr),
1965                           ntohs(usvc->port), usvc->sched_name);
1966                 ret = -EFAULT;
1967                 goto out_unlock;
1968         }
1969
1970         /* Lookup the exact service by <protocol, addr, port> or fwmark */
1971         if (usvc->fwmark == 0)
1972                 svc = __ip_vs_service_get(usvc->protocol,
1973                                           usvc->addr, usvc->port);
1974         else
1975                 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1976
1977         if (cmd != IP_VS_SO_SET_ADD
1978             && (svc == NULL || svc->protocol != usvc->protocol)) {
1979                 ret = -ESRCH;
1980                 goto out_unlock;
1981         }
1982
1983         switch (cmd) {
1984         case IP_VS_SO_SET_ADD:
1985                 if (svc != NULL)
1986                         ret = -EEXIST;
1987                 else
1988                         ret = ip_vs_add_service(usvc, &svc);
1989                 break;
1990         case IP_VS_SO_SET_EDIT:
1991                 ret = ip_vs_edit_service(svc, usvc);
1992                 break;
1993         case IP_VS_SO_SET_DEL:
1994                 ret = ip_vs_del_service(svc);
1995                 if (!ret)
1996                         goto out_unlock;
1997                 break;
1998         case IP_VS_SO_SET_ZERO:
1999                 ret = ip_vs_zero_service(svc);
2000                 break;
2001         case IP_VS_SO_SET_ADDDEST:
2002                 ret = ip_vs_add_dest(svc, udest);
2003                 break;
2004         case IP_VS_SO_SET_EDITDEST:
2005                 ret = ip_vs_edit_dest(svc, udest);
2006                 break;
2007         case IP_VS_SO_SET_DELDEST:
2008                 ret = ip_vs_del_dest(svc, udest);
2009                 break;
2010         default:
2011                 ret = -EINVAL;
2012         }
2013
2014         if (svc)
2015                 ip_vs_service_put(svc);
2016
2017   out_unlock:
2018         mutex_unlock(&__ip_vs_mutex);
2019   out_dec:
2020         /* decrease the module use count */
2021         ip_vs_use_count_dec();
2022
2023         return ret;
2024 }
2025
2026
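/*
 * Copy the counters into the user-visible struct ip_vs_stats_user.
 * This relies on the layout of struct ip_vs_stats: everything that
 * precedes the 'lock' member is copied verbatim under the lock.
 */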
2027 static void
2028 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2029 {
2030         spin_lock_bh(&src->lock);
2031         memcpy(dst, src, (char*)&src->lock - (char*)src);
2032         spin_unlock_bh(&src->lock);
2033 }
2034
2035 static void
2036 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2037 {
2038         dst->protocol = src->protocol;
2039         dst->addr = src->addr;
2040         dst->port = src->port;
2041         dst->fwmark = src->fwmark;
2042         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2043         dst->flags = src->flags;
2044         dst->timeout = src->timeout / HZ;
2045         dst->netmask = src->netmask;
2046         dst->num_dests = src->num_dests;
2047         ip_vs_copy_stats(&dst->stats, &src->stats);
2048 }
2049
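/*
 * Copy up to get->num_services service entries to userspace, walking
 * the protocol hash table first and then the fwmark hash table.
 */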
2050 static inline int
2051 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2052                             struct ip_vs_get_services __user *uptr)
2053 {
2054         int idx, count=0;
2055         struct ip_vs_service *svc;
2056         struct ip_vs_service_entry entry;
2057         int ret = 0;
2058
2059         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2060                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2061                         if (count >= get->num_services)
2062                                 goto out;
2063                         memset(&entry, 0, sizeof(entry));
2064                         ip_vs_copy_service(&entry, svc);
2065                         if (copy_to_user(&uptr->entrytable[count],
2066                                          &entry, sizeof(entry))) {
2067                                 ret = -EFAULT;
2068                                 goto out;
2069                         }
2070                         count++;
2071                 }
2072         }
2073
2074         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2075                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2076                         if (count >= get->num_services)
2077                                 goto out;
2078                         memset(&entry, 0, sizeof(entry));
2079                         ip_vs_copy_service(&entry, svc);
2080                         if (copy_to_user(&uptr->entrytable[count],
2081                                          &entry, sizeof(entry))) {
2082                                 ret = -EFAULT;
2083                                 goto out;
2084                         }
2085                         count++;
2086                 }
2087         }
2088   out:
2089         return ret;
2090 }
2091
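/*
 * Copy up to get->num_dests destination entries of one service to
 * userspace; the service is looked up by fwmark or by
 * <protocol, addr, port>.
 */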
2092 static inline int
2093 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2094                          struct ip_vs_get_dests __user *uptr)
2095 {
2096         struct ip_vs_service *svc;
2097         int ret = 0;
2098
2099         if (get->fwmark)
2100                 svc = __ip_vs_svc_fwm_get(get->fwmark);
2101         else
2102                 svc = __ip_vs_service_get(get->protocol,
2103                                           get->addr, get->port);
2104         if (svc) {
2105                 int count = 0;
2106                 struct ip_vs_dest *dest;
2107                 struct ip_vs_dest_entry entry;
2108
2109                 list_for_each_entry(dest, &svc->destinations, n_list) {
2110                         if (count >= get->num_dests)
2111                                 break;
2112
2113                         entry.addr = dest->addr;
2114                         entry.port = dest->port;
2115                         entry.conn_flags = atomic_read(&dest->conn_flags);
2116                         entry.weight = atomic_read(&dest->weight);
2117                         entry.u_threshold = dest->u_threshold;
2118                         entry.l_threshold = dest->l_threshold;
2119                         entry.activeconns = atomic_read(&dest->activeconns);
2120                         entry.inactconns = atomic_read(&dest->inactconns);
2121                         entry.persistconns = atomic_read(&dest->persistconns);
2122                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2123                         if (copy_to_user(&uptr->entrytable[count],
2124                                          &entry, sizeof(entry))) {
2125                                 ret = -EFAULT;
2126                                 break;
2127                         }
2128                         count++;
2129                 }
2130                 ip_vs_service_put(svc);
2131         } else
2132                 ret = -ESRCH;
2133         return ret;
2134 }
2135
2136 static inline void
2137 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2138 {
2139 #ifdef CONFIG_IP_VS_PROTO_TCP
2140         u->tcp_timeout =
2141                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2142         u->tcp_fin_timeout =
2143                 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2144 #endif
2145 #ifdef CONFIG_IP_VS_PROTO_UDP
2146         u->udp_timeout =
2147                 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2148 #endif
2149 }
2150
2151
2152 #define GET_CMDID(cmd)          ((cmd) - IP_VS_BASE_CTL)
2153 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2154 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2155 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2156 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2157 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2158 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2159
2160 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2161         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2162         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2163         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2164         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2165         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2166         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2167         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2168 };
2169
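/*
 * Handler for all IP_VS_SO_GET_* getsockopt commands.  The supplied
 * length is checked against get_arglen[] as a minimum, and the result
 * is copied back to userspace under __ip_vs_mutex.
 *
 * Illustrative userspace sketch only (not part of this file): querying
 * the version string via this interface would look roughly like
 *
 *	char buf[64];
 *	socklen_t len = sizeof(buf);
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *	getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_VERSION, buf, &len);
 *
 * again requiring CAP_NET_ADMIN, with error handling omitted.
 */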
2170 static int
2171 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2172 {
2173         unsigned char arg[128];
2174         int ret = 0;
2175
2176         if (!capable(CAP_NET_ADMIN))
2177                 return -EPERM;
2178
2179         if (*len < get_arglen[GET_CMDID(cmd)]) {
2180                 IP_VS_ERR("get_ctl: len %u < %u\n",
2181                           *len, get_arglen[GET_CMDID(cmd)]);
2182                 return -EINVAL;
2183         }
2184
2185         if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2186                 return -EFAULT;
2187
2188         if (mutex_lock_interruptible(&__ip_vs_mutex))
2189                 return -ERESTARTSYS;
2190
2191         switch (cmd) {
2192         case IP_VS_SO_GET_VERSION:
2193         {
2194                 char buf[64];
2195
2196                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2197                         NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2198                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2199                         ret = -EFAULT;
2200                         goto out;
2201                 }
2202                 *len = strlen(buf)+1;
2203         }
2204         break;
2205
2206         case IP_VS_SO_GET_INFO:
2207         {
2208                 struct ip_vs_getinfo info;
2209                 info.version = IP_VS_VERSION_CODE;
2210                 info.size = IP_VS_CONN_TAB_SIZE;
2211                 info.num_services = ip_vs_num_services;
2212                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2213                         ret = -EFAULT;
2214         }
2215         break;
2216
2217         case IP_VS_SO_GET_SERVICES:
2218         {
2219                 struct ip_vs_get_services *get;
2220                 int size;
2221
2222                 get = (struct ip_vs_get_services *)arg;
2223                 size = sizeof(*get) +
2224                         sizeof(struct ip_vs_service_entry) * get->num_services;
2225                 if (*len != size) {
2226                         IP_VS_ERR("length: %u != %u\n", *len, size);
2227                         ret = -EINVAL;
2228                         goto out;
2229                 }
2230                 ret = __ip_vs_get_service_entries(get, user);
2231         }
2232         break;
2233
2234         case IP_VS_SO_GET_SERVICE:
2235         {
2236                 struct ip_vs_service_entry *entry;
2237                 struct ip_vs_service *svc;
2238
2239                 entry = (struct ip_vs_service_entry *)arg;
2240                 if (entry->fwmark)
2241                         svc = __ip_vs_svc_fwm_get(entry->fwmark);
2242                 else
2243                         svc = __ip_vs_service_get(entry->protocol,
2244                                                   entry->addr, entry->port);
2245                 if (svc) {
2246                         ip_vs_copy_service(entry, svc);
2247                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2248                                 ret = -EFAULT;
2249                         ip_vs_service_put(svc);
2250                 } else
2251                         ret = -ESRCH;
2252         }
2253         break;
2254
2255         case IP_VS_SO_GET_DESTS:
2256         {
2257                 struct ip_vs_get_dests *get;
2258                 int size;
2259
2260                 get = (struct ip_vs_get_dests *)arg;
2261                 size = sizeof(*get) +
2262                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2263                 if (*len != size) {
2264                         IP_VS_ERR("length: %u != %u\n", *len, size);
2265                         ret = -EINVAL;
2266                         goto out;
2267                 }
2268                 ret = __ip_vs_get_dest_entries(get, user);
2269         }
2270         break;
2271
2272         case IP_VS_SO_GET_TIMEOUT:
2273         {
2274                 struct ip_vs_timeout_user t;
2275
2276                 __ip_vs_get_timeouts(&t);
2277                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2278                         ret = -EFAULT;
2279         }
2280         break;
2281
2282         case IP_VS_SO_GET_DAEMON:
2283         {
2284                 struct ip_vs_daemon_user d[2];
2285
2286                 memset(&d, 0, sizeof(d));
2287                 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2288                         d[0].state = IP_VS_STATE_MASTER;
2289                         strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2290                         d[0].syncid = ip_vs_master_syncid;
2291                 }
2292                 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2293                         d[1].state = IP_VS_STATE_BACKUP;
2294                         strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2295                         d[1].syncid = ip_vs_backup_syncid;
2296                 }
2297                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2298                         ret = -EFAULT;
2299         }
2300         break;
2301
2302         default:
2303                 ret = -EINVAL;
2304         }
2305
2306   out:
2307         mutex_unlock(&__ip_vs_mutex);
2308         return ret;
2309 }
2310
2311
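/*
 * Register the IP_VS_BASE_CTL..IP_VS_SO_*_MAX sockopt ranges with
 * netfilter so that the get/set handlers above receive the control
 * requests issued on PF_INET sockets.
 */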
2312 static struct nf_sockopt_ops ip_vs_sockopts = {
2313         .pf             = PF_INET,
2314         .set_optmin     = IP_VS_BASE_CTL,
2315         .set_optmax     = IP_VS_SO_SET_MAX+1,
2316         .set            = do_ip_vs_set_ctl,
2317         .get_optmin     = IP_VS_BASE_CTL,
2318         .get_optmax     = IP_VS_SO_GET_MAX+1,
2319         .get            = do_ip_vs_get_ctl,
2320         .owner          = THIS_MODULE,
2321 };
2322
2323
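/*
 * Initialise the control interface: register the sockopts, create the
 * /proc entries and the sysctl table, initialise the service and real
 * server hash tables, attach the estimator for the global stats, and
 * start the periodic defense work.
 */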
2324 int __init ip_vs_control_init(void)
2325 {
2326         int ret;
2327         int idx;
2328
2329         EnterFunction(2);
2330
2331         ret = nf_register_sockopt(&ip_vs_sockopts);
2332         if (ret) {
2333                 IP_VS_ERR("cannot register sockopt.\n");
2334                 return ret;
2335         }
2336
2337         proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
2338         proc_net_fops_create(&init_net, "ip_vs_stats", 0, &ip_vs_stats_fops);
2339
2340         sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
2341
2342         /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2343         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2344                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2345                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2346         }
2347         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2348                 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2349         }
2350
2351         ip_vs_new_estimator(&ip_vs_stats);
2352
2353         /* Hook the defense timer */
2354         schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2355
2356         LeaveFunction(2);
2357         return 0;
2358 }
2359
2360
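/*
 * Tear down the control interface: flush the destination trash list,
 * stop the defense work, remove the estimator, and unregister the
 * sysctl table, the /proc entries and the sockopts.
 */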
2361 void ip_vs_control_cleanup(void)
2362 {
2363         EnterFunction(2);
2364         ip_vs_trash_cleanup();
2365         cancel_rearming_delayed_work(&defense_work);
2366         cancel_work_sync(&defense_work.work);
2367         ip_vs_kill_estimator(&ip_vs_stats);
2368         unregister_sysctl_table(sysctl_header);
2369         proc_net_remove(&init_net, "ip_vs_stats");
2370         proc_net_remove(&init_net, "ip_vs");
2371         nf_unregister_sockopt(&ip_vs_sockopts);
2372         LeaveFunction(2);
2373 }