2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
26 #include <linux/capability.h>
28 #include <linux/sysctl.h>
29 #include <linux/proc_fs.h>
30 #include <linux/workqueue.h>
31 #include <linux/swap.h>
32 #include <linux/seq_file.h>
34 #include <linux/netfilter.h>
35 #include <linux/netfilter_ipv4.h>
36 #include <linux/mutex.h>
38 #include <net/net_namespace.h>
40 #include <net/route.h>
43 #include <asm/uaccess.h>
45 #include <net/ip_vs.h>
47 /* mutex for IPVS sockopts. And, [gs]etsockopt may sleep. */
48 static DEFINE_MUTEX(__ip_vs_mutex);
50 /* lock for service table */
51 static DEFINE_RWLOCK(__ip_vs_svc_lock);
53 /* lock for table with the real services */
54 static DEFINE_RWLOCK(__ip_vs_rs_lock);
56 /* lock for state and timeout tables */
57 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
59 /* lock for drop entry handling */
60 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
62 /* lock for drop packet handling */
63 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
65 /* 1/rate drop and drop-entry variables */
/*
 * ip_vs_drop_rate and ip_vs_drop_counter are written by
 * update_defense_level() while it holds __ip_vs_droppacket_lock.
 * ip_vs_dropentry is set by update_defense_level() and read by the
 * defense work handler to decide whether to call ip_vs_random_dropentry().
 */
66 int ip_vs_drop_rate = 0;
67 int ip_vs_drop_counter = 0;
68 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
70 /* number of virtual services */
71 static int ip_vs_num_services = 0;
73 /* sysctl variables */
/*
 * drop_entry, drop_packet and secure_tcp are the defense modes handled by
 * update_defense_level(); their sysctl handler accepts values 0..3.
 * amemthresh is compared against the available memory (in pages).
 * am_droprate is copied into ip_vs_drop_rate by update_defense_level().
 * sync_threshold is a pair validated by proc_do_sync_threshold():
 * both values non-negative and [0] < [1].
 */
74 static int sysctl_ip_vs_drop_entry = 0;
75 static int sysctl_ip_vs_drop_packet = 0;
76 static int sysctl_ip_vs_secure_tcp = 0;
77 static int sysctl_ip_vs_amemthresh = 1024;
78 static int sysctl_ip_vs_am_droprate = 10;
79 int sysctl_ip_vs_cache_bypass = 0;
80 int sysctl_ip_vs_expire_nodest_conn = 0;
81 int sysctl_ip_vs_expire_quiescent_template = 0;
82 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
83 int sysctl_ip_vs_nat_icmp_send = 0;
/* debug level, exported through the "debug_level" sysctl entry */
86 #ifdef CONFIG_IP_VS_DEBUG
87 static int sysctl_ip_vs_debug_level = 0;
/*
 *	Return the current value of the debug_level sysctl variable.
 *	Only built when CONFIG_IP_VS_DEBUG is defined.
 */
89 int ip_vs_get_debug_level(void)
91 return sysctl_ip_vs_debug_level;
96 * update_defense_level is called from keventd and from sysctl,
97 * so it needs to protect itself from softirqs
/*
 *	Recompute the state of the three defense strategies (drop_entry,
 *	drop_packet, secure_tcp) from their sysctl values and from memory
 *	pressure.
 *
 *	Memory pressure ("nomem") means availmem < sysctl_ip_vs_amemthresh,
 *	where availmem is i.freeram + i.bufferram.  Each strategy is handled
 *	by its own switch under its own lock, and the handling may rewrite
 *	the sysctl value itself (between 1 and 2).
 *
 *	NOTE(review): the case labels, local declarations and braces of this
 *	function are not visible in this view, so which statement belongs to
 *	which sysctl value cannot be confirmed from here.
 */
99 static void update_defense_level(void)
102 static int old_secure_tcp = 0;
107 /* we only count free and buffered memory (in pages) */
109 availmem = i.freeram + i.bufferram;
110 /* however in linux 2.5 the i.bufferram is total page cache size,
112 /* si_swapinfo(&i); */
113 /* availmem = availmem - (i.totalswap - i.freeswap); */
115 nomem = (availmem < sysctl_ip_vs_amemthresh);
/* drop_entry: only toggles the ip_vs_dropentry flag read by the work handler */
120 spin_lock(&__ip_vs_dropentry_lock);
121 switch (sysctl_ip_vs_drop_entry) {
123 atomic_set(&ip_vs_dropentry, 0);
127 atomic_set(&ip_vs_dropentry, 1);
128 sysctl_ip_vs_drop_entry = 2;
130 atomic_set(&ip_vs_dropentry, 0);
135 atomic_set(&ip_vs_dropentry, 1);
137 atomic_set(&ip_vs_dropentry, 0);
138 sysctl_ip_vs_drop_entry = 1;
142 atomic_set(&ip_vs_dropentry, 1);
145 spin_unlock(&__ip_vs_dropentry_lock);
/*
 * drop_packet: the rate is amemthresh / (amemthresh - availmem), or the
 * fixed sysctl_ip_vs_am_droprate
 */
148 spin_lock(&__ip_vs_droppacket_lock);
149 switch (sysctl_ip_vs_drop_packet) {
155 ip_vs_drop_rate = ip_vs_drop_counter
156 = sysctl_ip_vs_amemthresh /
157 (sysctl_ip_vs_amemthresh-availmem);
158 sysctl_ip_vs_drop_packet = 2;
165 ip_vs_drop_rate = ip_vs_drop_counter
166 = sysctl_ip_vs_amemthresh /
167 (sysctl_ip_vs_amemthresh-availmem);
170 sysctl_ip_vs_drop_packet = 1;
174 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
177 spin_unlock(&__ip_vs_droppacket_lock);
/*
 * secure_tcp: old_secure_tcp remembers the previous mode; the protocol
 * timeouts are switched with the argument (secure_tcp > 1)
 */
180 write_lock(&__ip_vs_securetcp_lock);
181 switch (sysctl_ip_vs_secure_tcp) {
183 if (old_secure_tcp >= 2)
188 if (old_secure_tcp < 2)
190 sysctl_ip_vs_secure_tcp = 2;
192 if (old_secure_tcp >= 2)
198 if (old_secure_tcp < 2)
201 if (old_secure_tcp >= 2)
203 sysctl_ip_vs_secure_tcp = 1;
207 if (old_secure_tcp < 2)
211 old_secure_tcp = sysctl_ip_vs_secure_tcp;
213 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
214 write_unlock(&__ip_vs_securetcp_lock);
221 * Timer for checking the defense
/*
 * Period of the defense work: defense_work_handler() re-arms itself with
 * this delay.  The expansion is parenthesized so that the macro stays a
 * single operand wherever it is used inside a larger expression.
 */
223 #define DEFENSE_TIMER_PERIOD (1*HZ)
224 static void defense_work_handler(struct work_struct *work);
225 static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
/*
 *	Delayed-work callback for the defense check: refresh the defense
 *	level, call ip_vs_random_dropentry() when the ip_vs_dropentry flag is
 *	set, then schedule itself again after DEFENSE_TIMER_PERIOD.
 */
227 static void defense_work_handler(struct work_struct *work)
229 update_defense_level();
230 if (atomic_read(&ip_vs_dropentry))
231 ip_vs_random_dropentry();
233 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
/*
 *	Take a reference on this module; returns the result of
 *	try_module_get(THIS_MODULE).
 */
237 ip_vs_use_count_inc(void)
239 return try_module_get(THIS_MODULE);
/*
 *	Drop a module reference taken with ip_vs_use_count_inc().
 */
243 ip_vs_use_count_dec(void)
245 module_put(THIS_MODULE);
250 * Hash table: for virtual service lookups
/* 8 bits give 1 << 8 = 256 buckets in each virtual service table */
252 #define IP_VS_SVC_TAB_BITS 8
253 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
254 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
256 /* the service table hashed by <protocol, addr, port> */
257 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
258 /* the service table hashed by fwmark */
259 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
262 * Hash table: for real service lookups
/* 4 bits give 1 << 4 = 16 buckets in the real service table */
264 #define IP_VS_RTAB_BITS 4
265 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
266 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
268 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
271 * Trash for destinations
/* destinations are linked here through their n_list member */
273 static LIST_HEAD(ip_vs_dest_trash);
276 * FTP & NULL virtual service counters
/*
 * Incremented in ip_vs_add_service() and decremented in
 * __ip_vs_del_service() for services with port FTPPORT or port 0;
 * ip_vs_service_get() reads them before trying its fallback lookups.
 */
278 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
279 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
283 * Returns hash value for virtual service
/*
 *	Bucket index for ip_vs_svc_table: XOR of the protocol, the address
 *	in host byte order, the port in host byte order and that port shifted
 *	right by IP_VS_SVC_TAB_BITS, masked with IP_VS_SVC_TAB_MASK.
 */
285 static __inline__ unsigned
286 ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port)
288 register unsigned porth = ntohs(port);
290 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
291 & IP_VS_SVC_TAB_MASK;
295 * Returns hash value of fwmark for virtual service lookup
/*
 *	Bucket index for ip_vs_svc_fwm_table: the low bits of the fwmark
 *	selected by IP_VS_SVC_TAB_MASK.
 */
297 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
299 return fwmark & IP_VS_SVC_TAB_MASK;
303 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
304 * or in the ip_vs_svc_fwm_table by fwmark.
305 * Should be called with locked tables.
/*
 *	Link a service into one of the two service tables: a service with
 *	fwmark == 0 is linked through s_list into ip_vs_svc_table, and the
 *	fwmark variant is linked through f_list into ip_vs_svc_fwm_table.
 *	A service already flagged IP_VS_SVC_F_HASHED is reported with
 *	IP_VS_ERR.  After linking, the HASHED flag is set and svc->refcnt
 *	is incremented.
 *
 *	NOTE(review): the return statements are not visible in this view.
 */
307 static int ip_vs_svc_hash(struct ip_vs_service *svc)
311 if (svc->flags & IP_VS_SVC_F_HASHED) {
312 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
313 "called from %p\n", __builtin_return_address(0));
317 if (svc->fwmark == 0) {
319 * Hash it by <protocol,addr,port> in ip_vs_svc_table
321 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
322 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
325 * Hash it by fwmark in ip_vs_svc_fwm_table
327 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
328 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
331 svc->flags |= IP_VS_SVC_F_HASHED;
332 /* increase its refcnt because it is referenced by the svc table */
333 atomic_inc(&svc->refcnt);
339 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
340 * Should be called with locked tables.
/*
 *	Reverse of ip_vs_svc_hash(): unlink the service from the table it
 *	was hashed into (s_list when fwmark == 0, f_list for the fwmark
 *	variant), clear IP_VS_SVC_F_HASHED and drop the table's reference
 *	on svc->refcnt.  A service that is not flagged HASHED is reported
 *	with IP_VS_ERR.
 *
 *	NOTE(review): the return statements are not visible in this view.
 */
342 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
344 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
345 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
346 "called from %p\n", __builtin_return_address(0));
350 if (svc->fwmark == 0) {
351 /* Remove it from the ip_vs_svc_table table */
352 list_del(&svc->s_list);
354 /* Remove it from the ip_vs_svc_fwm_table table */
355 list_del(&svc->f_list);
358 svc->flags &= ~IP_VS_SVC_F_HASHED;
359 atomic_dec(&svc->refcnt);
365 * Get service by {proto,addr,port} in the service table.
/*
 *	Walk one bucket of ip_vs_svc_table looking for an entry whose addr,
 *	port and protocol all equal the arguments.  svc->usecnt is
 *	incremented for the matching entry.  No lock is taken here; the
 *	visible caller ip_vs_service_get() holds __ip_vs_svc_lock for reading.
 *
 *	NOTE(review): the return statements are not visible in this view;
 *	presumably the match (or NULL) is returned.
 */
367 static __inline__ struct ip_vs_service *
368 __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
371 struct ip_vs_service *svc;
373 /* Check for "full" addressed entries */
374 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
376 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
377 if ((svc->addr == vaddr)
378 && (svc->port == vport)
379 && (svc->protocol == protocol)) {
381 atomic_inc(&svc->usecnt);
391 * Get service by {fwmark} in the service table.
/*
 *	Walk one bucket of ip_vs_svc_fwm_table looking for an entry with the
 *	given fwmark; svc->usecnt is incremented for the matching entry.
 *
 *	NOTE(review): the return statements are not visible in this view;
 *	presumably the match (or NULL) is returned.
 */
393 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
396 struct ip_vs_service *svc;
398 /* Check for fwmark addressed entries */
399 hash = ip_vs_svc_fwm_hashkey(fwmark);
401 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
402 if (svc->fwmark == fwmark) {
404 atomic_inc(&svc->usecnt);
/*
 *	Look up a virtual service while holding __ip_vs_svc_lock for reading.
 *
 *	Lookup order as far as visible here:
 *	  1. by fwmark, when fwmark is non-zero;
 *	  2. by <protocol, vaddr, vport>;
 *	  3. by <protocol, vaddr, FTPPORT>, for TCP when ip_vs_ftpsvc_counter
 *	     is non-zero and vport is FTPDATA or >= PROT_SOCK;
 *	  4. by <protocol, vaddr, 0> (catch-all port), when
 *	     ip_vs_nullsvc_counter is non-zero.
 *	The result is logged at debug level 9 after the lock is released.
 *
 *	NOTE(review): the first halves of the conditions guarding steps 3
 *	and 4, and the return statement, are not visible in this view.
 */
412 struct ip_vs_service *
413 ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
415 struct ip_vs_service *svc;
417 read_lock(&__ip_vs_svc_lock);
420 * Check the table hashed by fwmark first
422 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
426 * Check the table hashed by <protocol,addr,port>
427 * for "full" addressed entries
429 svc = __ip_vs_service_get(protocol, vaddr, vport);
432 && protocol == IPPROTO_TCP
433 && atomic_read(&ip_vs_ftpsvc_counter)
434 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
436 * Check if ftp service entry exists, the packet
437 * might belong to FTP data connections.
439 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
443 && atomic_read(&ip_vs_nullsvc_counter)) {
445 * Check if the catch-all port (port zero) exists
447 svc = __ip_vs_service_get(protocol, vaddr, 0);
451 read_unlock(&__ip_vs_svc_lock);
453 IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
454 fwmark, ip_vs_proto_name(protocol),
455 NIPQUAD(vaddr), ntohs(vport),
456 svc?"hit":"not hit");
/*
 *	Bind a destination to a service: takes a reference on svc->refcnt.
 *
 *	NOTE(review): presumably dest->svc is also set here, since
 *	__ip_vs_unbind_svc() reads it; the assignment is not visible in
 *	this view.
 */
463 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
465 atomic_inc(&svc->refcnt);
/*
 *	Drop the destination's reference on its service (dest->svc).
 *
 *	NOTE(review): the action taken when svc->refcnt reaches zero is not
 *	visible in this view.
 */
470 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
472 struct ip_vs_service *svc = dest->svc;
475 if (atomic_dec_and_test(&svc->refcnt))
481 * Returns hash value for real service
/*
 *	Hash for the real service table: XOR of the address in host byte
 *	order, the port in host byte order and that port shifted right by
 *	IP_VS_RTAB_BITS.  The protocol is not an input.
 *
 *	NOTE(review): the end of the return expression is not visible in
 *	this view; presumably the value is masked with IP_VS_RTAB_MASK.
 */
483 static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port)
485 register unsigned porth = ntohs(port);
487 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
492 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
493 * should be called with locked tables.
/*
 *	Link a destination into ip_vs_rtable through its d_list member.
 *	A non-empty d_list means the destination is already linked and gets
 *	separate handling.  The bucket is computed from dest->addr and
 *	dest->port only.
 *
 *	NOTE(review): the body of the already-linked branch and the return
 *	statements are not visible in this view.
 */
495 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
499 if (!list_empty(&dest->d_list)) {
504 * Hash by proto,addr,port,
505 * which are the parameters of the real service.
507 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
508 list_add(&dest->d_list, &ip_vs_rtable[hash]);
514 * UNhashes ip_vs_dest from ip_vs_rtable.
515 * should be called with locked tables.
/*
 *	Unlink a destination from ip_vs_rtable if it is currently linked.
 *	d_list is re-initialized afterwards so that list_empty() reports the
 *	destination as unlinked again.
 *
 *	NOTE(review): the return statement is not visible in this view.
 */
517 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
520 * Remove it from the ip_vs_rtable table.
522 if (!list_empty(&dest->d_list)) {
523 list_del(&dest->d_list);
524 INIT_LIST_HEAD(&dest->d_list);
531 * Lookup real service by <proto,addr,port> in the real service table.
/*
 *	Search ip_vs_rtable for a destination matching daddr and dport, with
 *	an additional protocol condition.  __ip_vs_rs_lock is held for
 *	reading during the walk and is released on two separate paths.
 *
 *	NOTE(review): the second alternative of the protocol condition and
 *	the return statements are not visible in this view.
 */
534 ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
537 struct ip_vs_dest *dest;
540 * Check for "full" addressed entries
541 * Return the first found entry
543 hash = ip_vs_rs_hashkey(daddr, dport);
545 read_lock(&__ip_vs_rs_lock);
546 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
547 if ((dest->addr == daddr)
548 && (dest->port == dport)
549 && ((dest->protocol == protocol) ||
552 read_unlock(&__ip_vs_rs_lock);
556 read_unlock(&__ip_vs_rs_lock);
562 * Lookup destination by {addr,port} in the given service
/*
 *	Walk svc->destinations (linked through n_list) for an entry whose
 *	addr and port equal daddr and dport.  No lock is taken here.
 *
 *	NOTE(review): the return statements are not visible in this view.
 */
564 static struct ip_vs_dest *
565 ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
567 struct ip_vs_dest *dest;
570 * Find the destination for the given service
572 list_for_each_entry(dest, &svc->destinations, n_list) {
573 if ((dest->addr == daddr) && (dest->port == dport)) {
583 * Find destination by {daddr,dport,vaddr,protocol}
584 * Created to be used in ip_vs_process_message() in
585 * the backup synchronization daemon. It finds the
586 * destination to be bound to the received connection
589 * ip_vs_lookup_real_service() looked promising, but
590 * seems not working as expected.
/*
 *	Resolve the service with ip_vs_service_get() (fwmark 0), then look
 *	up the destination inside it.  dest->refcnt is incremented for the
 *	caller and the service reference taken by the lookup is released
 *	with ip_vs_service_put().
 *
 *	NOTE(review): the NULL checks on svc and dest and the return
 *	statements are not visible in this view.
 */
592 struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport,
593 __be32 vaddr, __be16 vport, __u16 protocol)
595 struct ip_vs_dest *dest;
596 struct ip_vs_service *svc;
598 svc = ip_vs_service_get(0, protocol, vaddr, vport);
601 dest = ip_vs_lookup_dest(svc, daddr, dport);
603 atomic_inc(&dest->refcnt);
604 ip_vs_service_put(svc);
609 * Lookup dest by {svc,addr,port} in the destination trash.
610 * The destination trash is used to hold the destinations that are removed
611 * from the service table but are still referenced by some conn entries.
612 * The reason to add the destination trash is when the dest is temporarily
613 * down (either by administrator or by monitor program), the dest can be
614 * picked back from the trash, the remaining connections to the dest can
615 * continue, and the counting information of the dest is also useful for
/*
 *	Scan ip_vs_dest_trash for a destination that matches daddr, dport
 *	and the service's fwmark and protocol, plus a condition involving
 *	the service's vaddr and vport.
 *
 *	As a side effect of the scan, a trash entry whose refcnt is exactly
 *	1 is purged: it is unlinked, its dst is reset and it is unbound from
 *	its service.  The _safe iterator is used because entries may be
 *	removed during the walk.
 *
 *	NOTE(review): part of the match condition, the freeing of purged
 *	entries and the return statements are not visible in this view.
 */
618 static struct ip_vs_dest *
619 ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
621 struct ip_vs_dest *dest, *nxt;
624 * Find the destination in trash
626 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
627 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
630 NIPQUAD(dest->addr), ntohs(dest->port),
631 atomic_read(&dest->refcnt));
632 if (dest->addr == daddr &&
633 dest->port == dport &&
634 dest->vfwmark == svc->fwmark &&
635 dest->protocol == svc->protocol &&
637 (dest->vaddr == svc->addr &&
638 dest->vport == svc->port))) {
644 * Try to purge the destination from trash if not referenced
646 if (atomic_read(&dest->refcnt) == 1) {
647 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
650 NIPQUAD(dest->addr), ntohs(dest->port));
651 list_del(&dest->n_list);
652 ip_vs_dst_reset(dest);
653 __ip_vs_unbind_svc(dest);
663 * Clean up all the destinations in the trash
664 * Called by the ip_vs_control_cleanup()
666 * When the ip_vs_control_cleanup is activated by ipvs module exit,
667 * the service tables must have been flushed and all the connections
668 * are expired, and the refcnt of each destination in the trash must
669 * be 1, so we simply release them here.
/*
 *	Empty ip_vs_dest_trash unconditionally: every entry is unlinked, its
 *	dst is reset and it is unbound from its service.  The refcnt is not
 *	checked here.
 *
 *	NOTE(review): the freeing of the destination is not visible in this
 *	view.
 */
671 static void ip_vs_trash_cleanup(void)
673 struct ip_vs_dest *dest, *nxt;
675 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
676 list_del(&dest->n_list);
677 ip_vs_dst_reset(dest);
678 __ip_vs_unbind_svc(dest);
/*
 *	Zero a statistics block.  Under stats->lock (with bottom halves
 *	disabled) every byte from the start of the structure up to, but not
 *	including, the lock member is cleared; this relies on all members to
 *	be zeroed being laid out before "lock".  The estimator is zeroed
 *	afterwards, outside the lock.
 */
685 ip_vs_zero_stats(struct ip_vs_stats *stats)
687 spin_lock_bh(&stats->lock);
688 memset(stats, 0, (char *)&stats->lock - (char *)stats);
689 spin_unlock_bh(&stats->lock);
690 ip_vs_zero_estimator(stats);
694 * Update a destination in the given service
/*
 *	Apply the user-supplied settings in udest to dest and (re)bind dest
 *	to svc.
 *
 *	- weight is copied from udest;
 *	- conn_flags start from udest->conn_flags | IP_VS_CONN_F_INACTIVE;
 *	  when udest->addr is a local address the forwarding method bits are
 *	  replaced by IP_VS_CONN_F_LOCALNODE;
 *	- a non-zero forwarding method gets IP_VS_CONN_F_NOOUTPUT; the
 *	  __ip_vs_rs_lock section is presumably the other branch (NAT), where
 *	  the destination is put into ip_vs_rtable;
 *	- when dest is bound to a different service it is unbound, its stats
 *	  are zeroed and it is bound to svc;
 *	- IP_VS_DEST_F_AVAILABLE is set, and IP_VS_DEST_F_OVERLOAD is cleared
 *	  when the new upper threshold is 0 or larger than the old one.
 *
 *	NOTE(review): the statement inside the __ip_vs_rs_lock section and
 *	the condition selecting the first __ip_vs_bind_svc() call are not
 *	visible in this view.
 */
697 __ip_vs_update_dest(struct ip_vs_service *svc,
698 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
702 /* set the weight and the flags */
703 atomic_set(&dest->weight, udest->weight);
704 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
706 /* check if local node and update the flags */
707 if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) {
708 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
709 | IP_VS_CONN_F_LOCALNODE;
712 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
713 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
714 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
717 * Put the real service in ip_vs_rtable if not present.
718 * For now only for NAT!
720 write_lock_bh(&__ip_vs_rs_lock);
722 write_unlock_bh(&__ip_vs_rs_lock);
724 atomic_set(&dest->conn_flags, conn_flags);
726 /* bind the service */
728 __ip_vs_bind_svc(dest, svc);
730 if (dest->svc != svc) {
731 __ip_vs_unbind_svc(dest);
732 ip_vs_zero_stats(&dest->stats);
733 __ip_vs_bind_svc(dest, svc);
737 /* set the dest status flags */
738 dest->flags |= IP_VS_DEST_F_AVAILABLE;
740 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
741 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
742 dest->u_threshold = udest->u_threshold;
743 dest->l_threshold = udest->l_threshold;
748 * Create a destination for the given service
/*
 *	Allocate and initialize a destination for svc from the user request
 *	udest.
 *
 *	The address must be of type RTN_LOCAL or RTN_UNICAST.  The structure
 *	is allocated zeroed with GFP_ATOMIC, the virtual service identity
 *	(protocol, vaddr, vport, vfwmark) is copied from svc, the counters
 *	and refcnt start at 0, and the user settings are then applied with
 *	__ip_vs_update_dest() before a new estimator is set up for its stats.
 *
 *	NOTE(review): the error returns and the store through dest_p are not
 *	visible in this view.
 */
751 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
752 struct ip_vs_dest **dest_p)
754 struct ip_vs_dest *dest;
759 atype = inet_addr_type(&init_net, udest->addr);
760 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
763 dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
765 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
769 dest->protocol = svc->protocol;
770 dest->vaddr = svc->addr;
771 dest->vport = svc->port;
772 dest->vfwmark = svc->fwmark;
773 dest->addr = udest->addr;
774 dest->port = udest->port;
776 atomic_set(&dest->activeconns, 0);
777 atomic_set(&dest->inactconns, 0);
778 atomic_set(&dest->persistconns, 0);
779 atomic_set(&dest->refcnt, 0);
781 INIT_LIST_HEAD(&dest->d_list);
782 spin_lock_init(&dest->dst_lock);
783 spin_lock_init(&dest->stats.lock);
784 __ip_vs_update_dest(svc, dest, udest);
785 ip_vs_new_estimator(&dest->stats);
795 * Add a destination into an existing service
/*
 *	Add the destination described by udest to svc.
 *
 *	The request is checked first: the weight must not be negative and
 *	l_threshold must not exceed u_threshold.  Then, in order:
 *	  1. the destination is looked up in svc (the "already exists" case);
 *	  2. it is looked up in the trash; a hit is updated from udest,
 *	     removed from the trash and given a new estimator;
 *	  3. failing both, a new one is created with ip_vs_new_dest() and
 *	     its refcnt is incremented.
 *	In cases 2 and 3 the destination is linked into svc->destinations
 *	under the write side of __ip_vs_svc_lock, after waiting for other
 *	users of the service (usecnt > 1) to go away, and the scheduler's
 *	update_service() hook is called.
 *
 *	NOTE(review): the return statements and the NULL checks selecting
 *	between the three cases are not visible in this view.
 */
798 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
800 struct ip_vs_dest *dest;
801 __be32 daddr = udest->addr;
802 __be16 dport = udest->port;
807 if (udest->weight < 0) {
808 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
812 if (udest->l_threshold > udest->u_threshold) {
813 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
814 "upper threshold\n");
819 * Check if the dest already exists in the list
821 dest = ip_vs_lookup_dest(svc, daddr, dport);
823 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
828 * Check if the dest already exists in the trash and
829 * is from the same service
831 dest = ip_vs_trash_get_dest(svc, daddr, dport);
833 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
834 "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
835 NIPQUAD(daddr), ntohs(dport),
836 atomic_read(&dest->refcnt),
838 NIPQUAD(dest->vaddr),
840 __ip_vs_update_dest(svc, dest, udest);
843 * Get the destination from the trash
845 list_del(&dest->n_list);
847 ip_vs_new_estimator(&dest->stats);
849 write_lock_bh(&__ip_vs_svc_lock);
852 * Wait until all other svc users go away.
854 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
856 list_add(&dest->n_list, &svc->destinations);
859 /* call the update_service function of its scheduler */
860 svc->scheduler->update_service(svc);
862 write_unlock_bh(&__ip_vs_svc_lock);
867 * Allocate and initialize the dest structure
869 ret = ip_vs_new_dest(svc, udest, &dest);
875 * Add the dest entry into the list
877 atomic_inc(&dest->refcnt);
879 write_lock_bh(&__ip_vs_svc_lock);
882 * Wait until all other svc users go away.
884 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
886 list_add(&dest->n_list, &svc->destinations);
889 /* call the update_service function of its scheduler */
890 svc->scheduler->update_service(svc);
892 write_unlock_bh(&__ip_vs_svc_lock);
901 * Edit a destination in the given service
/*
 *	Change the settings of an existing destination of svc.
 *
 *	The same request checks as in ip_vs_add_dest() apply (weight not
 *	negative, l_threshold not above u_threshold).  The destination is
 *	looked up in svc, updated with __ip_vs_update_dest(), and then the
 *	scheduler's update_service() hook is called under the write side of
 *	__ip_vs_svc_lock once other users of the service (usecnt > 1) are
 *	gone.
 *
 *	NOTE(review): the return statements and the NULL check on the lookup
 *	result are not visible in this view.
 */
904 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
906 struct ip_vs_dest *dest;
907 __be32 daddr = udest->addr;
908 __be16 dport = udest->port;
912 if (udest->weight < 0) {
913 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
917 if (udest->l_threshold > udest->u_threshold) {
918 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
919 "upper threshold\n");
924 * Lookup the destination list
926 dest = ip_vs_lookup_dest(svc, daddr, dport);
928 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
932 __ip_vs_update_dest(svc, dest, udest);
934 write_lock_bh(&__ip_vs_svc_lock);
936 /* Wait until all other svc users go away */
937 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
939 /* call the update_service, because server weight may be changed */
940 svc->scheduler->update_service(svc);
942 write_unlock_bh(&__ip_vs_svc_lock);
951 * Delete a destination (must be already unlinked from the service)
/*
 *	Dispose of a destination: its estimator is killed and it is removed
 *	from ip_vs_rtable under __ip_vs_rs_lock.  Then its refcnt is dropped.
 *	When that was the last reference, the dst is reset and the service's
 *	refcnt is decremented.  Otherwise the destination is added to
 *	ip_vs_dest_trash and its refcnt is incremented again for the trash
 *	list.
 *
 *	NOTE(review): the freeing of the destination in the last-reference
 *	branch is not visible in this view.
 */
953 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
955 ip_vs_kill_estimator(&dest->stats);
958 * Remove it from the d-linked list with the real services.
960 write_lock_bh(&__ip_vs_rs_lock);
961 ip_vs_rs_unhash(dest);
962 write_unlock_bh(&__ip_vs_rs_lock);
965 * Decrease the refcnt of the dest, and free the dest
966 * if nobody refers to it (refcnt=0). Otherwise, throw
967 * the destination into the trash.
969 if (atomic_dec_and_test(&dest->refcnt)) {
970 ip_vs_dst_reset(dest);
971 /* simply decrease svc->refcnt here, let the caller check
972 and release the service if nobody refers to it.
973 Only user context can release destination and service,
974 and only one user context can update virtual service at a
975 time, so the operation here is OK */
976 atomic_dec(&dest->svc->refcnt);
979 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
981 NIPQUAD(dest->addr), ntohs(dest->port),
982 atomic_read(&dest->refcnt));
983 list_add(&dest->n_list, &ip_vs_dest_trash);
984 atomic_inc(&dest->refcnt);
990 * Unlink a destination from the given service
/*
 *	Take a destination out of its service: IP_VS_DEST_F_AVAILABLE is
 *	cleared, the destination is removed from the service's destination
 *	list, and the scheduler's update_service() hook is called.
 *
 *	NOTE(review): the third parameter and the condition guarding the
 *	update_service() call are not visible in this view; the visible
 *	callers pass 1 (ip_vs_del_dest) and 0 (__ip_vs_del_service).
 */
992 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
993 struct ip_vs_dest *dest,
996 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
999 * Remove it from the d-linked destination list.
1001 list_del(&dest->n_list);
1005 * Call the update_service function of its scheduler
1007 svc->scheduler->update_service(svc);
1013 * Delete a destination server in the given service
/*
 *	Remove the destination identified by udest->addr and udest->port from
 *	svc.  The destination is unlinked under the write side of
 *	__ip_vs_svc_lock, after other users of the service (usecnt > 1) are
 *	gone, and is then handed to __ip_vs_del_dest() outside the lock.
 *
 *	NOTE(review): the return statements and the NULL check on the lookup
 *	result are not visible in this view.
 */
1016 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
1018 struct ip_vs_dest *dest;
1019 __be32 daddr = udest->addr;
1020 __be16 dport = udest->port;
1024 dest = ip_vs_lookup_dest(svc, daddr, dport);
1026 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1030 write_lock_bh(&__ip_vs_svc_lock);
1033 * Wait until all other svc users go away.
1035 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1038 * Unlink dest from the service
1040 __ip_vs_unlink_dest(svc, dest, 1);
1042 write_unlock_bh(&__ip_vs_svc_lock);
1045 * Delete the destination
1047 __ip_vs_del_dest(dest);
1056 * Add a service into the service hash table
/*
 *	Create a virtual service from the user request u and hash it into
 *	the service table.
 *
 *	Steps visible here: the module use count is raised, the scheduler
 *	named by u->sched_name is looked up, the service is allocated zeroed
 *	(GFP_ATOMIC) with usecnt 1 and refcnt 0, its fields are copied from
 *	u (the timeout is multiplied by HZ), the scheduler is bound, the
 *	FTP / port-zero counters are updated, an estimator is created,
 *	ip_vs_num_services is incremented and the service is hashed under
 *	the write side of __ip_vs_svc_lock.
 *
 *	The trailing statements (unbind scheduler, put app inc, put
 *	scheduler, drop the module use count) are the error unwinding.
 *
 *	NOTE(review): the labels, the conditions of the unwinding path, the
 *	store through svc_p and the return statements are not visible in
 *	this view.
 */
1059 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1062 struct ip_vs_scheduler *sched = NULL;
1063 struct ip_vs_service *svc = NULL;
1065 /* increase the module use count */
1066 ip_vs_use_count_inc();
1068 /* Lookup the scheduler by 'u->sched_name' */
1069 sched = ip_vs_scheduler_get(u->sched_name);
1070 if (sched == NULL) {
1071 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1077 svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1079 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1084 /* I'm the first user of the service */
1085 atomic_set(&svc->usecnt, 1);
1086 atomic_set(&svc->refcnt, 0);
1088 svc->protocol = u->protocol;
1089 svc->addr = u->addr;
1090 svc->port = u->port;
1091 svc->fwmark = u->fwmark;
1092 svc->flags = u->flags;
1093 svc->timeout = u->timeout * HZ;
1094 svc->netmask = u->netmask;
1096 INIT_LIST_HEAD(&svc->destinations);
1097 rwlock_init(&svc->sched_lock);
1098 spin_lock_init(&svc->stats.lock);
1100 /* Bind the scheduler */
1101 ret = ip_vs_bind_scheduler(svc, sched);
1106 /* Update the virtual service counters */
1107 if (svc->port == FTPPORT)
1108 atomic_inc(&ip_vs_ftpsvc_counter);
1109 else if (svc->port == 0)
1110 atomic_inc(&ip_vs_nullsvc_counter);
1112 ip_vs_new_estimator(&svc->stats);
1113 ip_vs_num_services++;
1115 /* Hash the service into the service table */
1116 write_lock_bh(&__ip_vs_svc_lock);
1117 ip_vs_svc_hash(svc);
1118 write_unlock_bh(&__ip_vs_svc_lock);
1126 ip_vs_unbind_scheduler(svc);
1129 ip_vs_app_inc_put(svc->inc);
1134 ip_vs_scheduler_put(sched);
1137 /* decrease the module use count */
1138 ip_vs_use_count_dec();
1145 * Edit a service and bind it with a new scheduler
/*
 *	Update an existing service from the user request u, switching its
 *	scheduler if a different one is named.
 *
 *	The new scheduler is looked up by u->sched_name.  Under the write
 *	side of __ip_vs_svc_lock, and once other users of the service
 *	(usecnt > 1) are gone, the flags (with IP_VS_SVC_F_HASHED forced
 *	on), the timeout (multiplied by HZ) and the netmask are replaced.
 *	When the scheduler differs, the old one is unbound and the new one
 *	bound; if binding the new one fails, the old scheduler is bound
 *	again.  A scheduler reference is dropped through old_sched at the
 *	end.
 *
 *	NOTE(review): the error paths (including any reassignment of
 *	old_sched), the condition guarding the final put and the return
 *	statements are not visible in this view.
 */
1148 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1150 struct ip_vs_scheduler *sched, *old_sched;
1154 * Lookup the scheduler, by 'u->sched_name'
1156 sched = ip_vs_scheduler_get(u->sched_name);
1157 if (sched == NULL) {
1158 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1164 write_lock_bh(&__ip_vs_svc_lock);
1167 * Wait until all other svc users go away.
1169 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1172 * Set the flags and timeout value
1174 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1175 svc->timeout = u->timeout * HZ;
1176 svc->netmask = u->netmask;
1178 old_sched = svc->scheduler;
1179 if (sched != old_sched) {
1181 * Unbind the old scheduler
1183 if ((ret = ip_vs_unbind_scheduler(svc))) {
1189 * Bind the new scheduler
1191 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1193 * If ip_vs_bind_scheduler fails, restore the old
1195 * The main reason of failure is out of memory.
1197 * The question is if the old scheduler can be
1198 * restored all the time. TODO: if it cannot be
1199 * restored some time, we must delete the service,
1200 * otherwise the system may crash.
1202 ip_vs_bind_scheduler(svc, old_sched);
1209 write_unlock_bh(&__ip_vs_svc_lock);
1212 ip_vs_scheduler_put(old_sched);
1219 * Delete a service from the service list
1220 * - The service must be unlinked, unlocked and not referenced!
1221 * - We are called under _bh lock
/*
 *	Tear down a service that has already been unhashed.
 *
 *	Steps visible here: ip_vs_num_services is decremented, the estimator
 *	is killed, the scheduler is unbound and its reference dropped, the
 *	app inc reference is dropped, every destination is unlinked and
 *	passed to __ip_vs_del_dest(), the FTP / port-zero counters are
 *	updated, the service refcnt is checked against 0 and the module use
 *	count is dropped.
 *
 *	NOTE(review): the conditions guarding the scheduler and app puts and
 *	the statement executed when refcnt is 0 are not visible in this view.
 */
1223 static void __ip_vs_del_service(struct ip_vs_service *svc)
1225 struct ip_vs_dest *dest, *nxt;
1226 struct ip_vs_scheduler *old_sched;
1228 ip_vs_num_services--;
1229 ip_vs_kill_estimator(&svc->stats);
1231 /* Unbind scheduler */
1232 old_sched = svc->scheduler;
1233 ip_vs_unbind_scheduler(svc);
1235 ip_vs_scheduler_put(old_sched);
1237 /* Unbind app inc */
1239 ip_vs_app_inc_put(svc->inc);
1244 * Unlink the whole destination list
1246 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1247 __ip_vs_unlink_dest(svc, dest, 0);
1248 __ip_vs_del_dest(dest);
1252 * Update the virtual service counters
1254 if (svc->port == FTPPORT)
1255 atomic_dec(&ip_vs_ftpsvc_counter);
1256 else if (svc->port == 0)
1257 atomic_dec(&ip_vs_nullsvc_counter);
1260 * Free the service if nobody refers to it
1262 if (atomic_read(&svc->refcnt) == 0)
1265 /* decrease the module use count */
1266 ip_vs_use_count_dec();
1270 * Delete a service from the service list
/*
 *	Delete one service: under the write side of __ip_vs_svc_lock it is
 *	unhashed, other users (usecnt > 1) are waited for, and the service
 *	is torn down with __ip_vs_del_service().
 *
 *	NOTE(review): any argument check and the return statements are not
 *	visible in this view.
 */
1272 static int ip_vs_del_service(struct ip_vs_service *svc)
1278 * Unhash it from the service table
1280 write_lock_bh(&__ip_vs_svc_lock);
1282 ip_vs_svc_unhash(svc);
1285 * Wait until all the svc users go away.
1287 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1289 __ip_vs_del_service(svc);
1291 write_unlock_bh(&__ip_vs_svc_lock);
1298 * Flush all the virtual services
/*
 *	Delete every service in both service tables.  Each entry is unhashed
 *	and torn down under the write side of __ip_vs_svc_lock, which is
 *	taken and released per service.  The _safe iterators are used
 *	because entries are removed during the walk.
 *
 *	Unlike ip_vs_del_service(), the wait condition here is usecnt > 0,
 *	i.e. it waits until no user at all is left.
 *
 *	NOTE(review): the return statement is not visible in this view.
 */
1300 static int ip_vs_flush(void)
1303 struct ip_vs_service *svc, *nxt;
1306 * Flush the service table hashed by <protocol,addr,port>
1308 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1309 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1310 write_lock_bh(&__ip_vs_svc_lock);
1311 ip_vs_svc_unhash(svc);
1313 * Wait until all the svc users go away.
1315 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1316 __ip_vs_del_service(svc);
1317 write_unlock_bh(&__ip_vs_svc_lock);
1322 * Flush the service table hashed by fwmark
1324 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1325 list_for_each_entry_safe(svc, nxt,
1326 &ip_vs_svc_fwm_table[idx], f_list) {
1327 write_lock_bh(&__ip_vs_svc_lock);
1328 ip_vs_svc_unhash(svc);
1330 * Wait until all the svc users go away.
1332 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1333 __ip_vs_del_service(svc);
1334 write_unlock_bh(&__ip_vs_svc_lock);
1343 * Zero counters in a service or all services
/*
 *	Zero the statistics of every destination of svc and of svc itself,
 *	while holding the write side of __ip_vs_svc_lock.
 *
 *	NOTE(review): the return statement is not visible in this view.
 */
1345 static int ip_vs_zero_service(struct ip_vs_service *svc)
1347 struct ip_vs_dest *dest;
1349 write_lock_bh(&__ip_vs_svc_lock);
1350 list_for_each_entry(dest, &svc->destinations, n_list) {
1351 ip_vs_zero_stats(&dest->stats);
1353 ip_vs_zero_stats(&svc->stats);
1354 write_unlock_bh(&__ip_vs_svc_lock);
/*
 *	Zero the statistics of every service in both service tables and
 *	then the global ip_vs_stats.  The tables are walked without taking
 *	a lock in this function; ip_vs_zero_service() takes
 *	__ip_vs_svc_lock for each service.
 *
 *	NOTE(review): the return statement is not visible in this view.
 */
1358 static int ip_vs_zero_all(void)
1361 struct ip_vs_service *svc;
1363 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1364 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1365 ip_vs_zero_service(svc);
1369 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1370 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1371 ip_vs_zero_service(svc);
1375 ip_vs_zero_stats(&ip_vs_stats);
/*
 *	sysctl handler for the defense mode entries (drop_entry,
 *	drop_packet, secure_tcp).  The value is read or written with
 *	proc_dointvec().  After a write that changed the value, the new
 *	value is range-checked against 0..3 and update_defense_level() is
 *	called.
 *
 *	NOTE(review): the saving of the old value, its restoration for an
 *	out-of-range write and the branch structure around
 *	update_defense_level() are not visible in this view.
 */
1381 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1382 void __user *buffer, size_t *lenp, loff_t *ppos)
1384 int *valp = table->data;
1388 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1389 if (write && (*valp != val)) {
1390 if ((*valp < 0) || (*valp > 3)) {
1391 /* Restore the correct value */
1394 update_defense_level();
/*
 *	sysctl handler for sync_threshold.  The current pair is saved before
 *	proc_dointvec() runs.  After a write, the saved pair is copied back
 *	unless both new values are non-negative and the first is smaller
 *	than the second.
 *
 *	NOTE(review): the declaration of the backup array and the return
 *	statement are not visible in this view.
 */
1402 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1403 void __user *buffer, size_t *lenp, loff_t *ppos)
1405 int *valp = table->data;
1409 /* backup the value first */
1410 memcpy(val, valp, sizeof(val));
1412 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1413 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1414 /* Restore the correct value */
1415 memcpy(valp, val, sizeof(val));
1422 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
/*
 * Each entry exposes one variable under its procname:
 *  - plain integers use proc_dointvec;
 *  - drop_entry, drop_packet and secure_tcp use proc_do_defense_mode;
 *  - the timeout_* entries point into vs_timeout_table_dos.timeout[] and
 *    use proc_dointvec_jiffies;
 *  - sync_threshold covers the whole two-element array (maxlen is the
 *    size of the array) and uses proc_do_sync_threshold.
 *
 * NOTE(review): the entry braces, the mode fields and the preprocessor
 * lines delimiting conditional entries are not visible in this view.
 */
1425 static struct ctl_table vs_vars[] = {
1427 .procname = "amemthresh",
1428 .data = &sysctl_ip_vs_amemthresh,
1429 .maxlen = sizeof(int),
1431 .proc_handler = &proc_dointvec,
1433 #ifdef CONFIG_IP_VS_DEBUG
1435 .procname = "debug_level",
1436 .data = &sysctl_ip_vs_debug_level,
1437 .maxlen = sizeof(int),
1439 .proc_handler = &proc_dointvec,
1443 .procname = "am_droprate",
1444 .data = &sysctl_ip_vs_am_droprate,
1445 .maxlen = sizeof(int),
1447 .proc_handler = &proc_dointvec,
1450 .procname = "drop_entry",
1451 .data = &sysctl_ip_vs_drop_entry,
1452 .maxlen = sizeof(int),
1454 .proc_handler = &proc_do_defense_mode,
1457 .procname = "drop_packet",
1458 .data = &sysctl_ip_vs_drop_packet,
1459 .maxlen = sizeof(int),
1461 .proc_handler = &proc_do_defense_mode,
1464 .procname = "secure_tcp",
1465 .data = &sysctl_ip_vs_secure_tcp,
1466 .maxlen = sizeof(int),
1468 .proc_handler = &proc_do_defense_mode,
1472 .procname = "timeout_established",
1473 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1474 .maxlen = sizeof(int),
1476 .proc_handler = &proc_dointvec_jiffies,
1479 .procname = "timeout_synsent",
1480 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1481 .maxlen = sizeof(int),
1483 .proc_handler = &proc_dointvec_jiffies,
1486 .procname = "timeout_synrecv",
1487 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1488 .maxlen = sizeof(int),
1490 .proc_handler = &proc_dointvec_jiffies,
1493 .procname = "timeout_finwait",
1494 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1495 .maxlen = sizeof(int),
1497 .proc_handler = &proc_dointvec_jiffies,
1500 .procname = "timeout_timewait",
1501 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1502 .maxlen = sizeof(int),
1504 .proc_handler = &proc_dointvec_jiffies,
1507 .procname = "timeout_close",
1508 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1509 .maxlen = sizeof(int),
1511 .proc_handler = &proc_dointvec_jiffies,
1514 .procname = "timeout_closewait",
1515 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1516 .maxlen = sizeof(int),
1518 .proc_handler = &proc_dointvec_jiffies,
1521 .procname = "timeout_lastack",
1522 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1523 .maxlen = sizeof(int),
1525 .proc_handler = &proc_dointvec_jiffies,
1528 .procname = "timeout_listen",
1529 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1530 .maxlen = sizeof(int),
1532 .proc_handler = &proc_dointvec_jiffies,
1535 .procname = "timeout_synack",
1536 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1537 .maxlen = sizeof(int),
1539 .proc_handler = &proc_dointvec_jiffies,
1542 .procname = "timeout_udp",
1543 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1544 .maxlen = sizeof(int),
1546 .proc_handler = &proc_dointvec_jiffies,
1549 .procname = "timeout_icmp",
1550 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1551 .maxlen = sizeof(int),
1553 .proc_handler = &proc_dointvec_jiffies,
1557 .procname = "cache_bypass",
1558 .data = &sysctl_ip_vs_cache_bypass,
1559 .maxlen = sizeof(int),
1561 .proc_handler = &proc_dointvec,
1564 .procname = "expire_nodest_conn",
1565 .data = &sysctl_ip_vs_expire_nodest_conn,
1566 .maxlen = sizeof(int),
1568 .proc_handler = &proc_dointvec,
1571 .procname = "expire_quiescent_template",
1572 .data = &sysctl_ip_vs_expire_quiescent_template,
1573 .maxlen = sizeof(int),
1575 .proc_handler = &proc_dointvec,
1578 .procname = "sync_threshold",
1579 .data = &sysctl_ip_vs_sync_threshold,
1580 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1582 .proc_handler = &proc_do_sync_threshold,
1585 .procname = "nat_icmp_send",
1586 .data = &sysctl_ip_vs_nat_icmp_send,
1587 .maxlen = sizeof(int),
1589 .proc_handler = &proc_dointvec,
/*
 * Sysctl path components for the IPVS tunables: "net", "ipv4", "vs".
 * Passed to register_sysctl_paths() together with vs_vars in
 * ip_vs_control_init(), and exported to GPL modules.
 */
1594 struct ctl_path net_vs_ctl_path[] = {
1595 { .procname = "net", .ctl_name = CTL_NET, },
1596 { .procname = "ipv4", .ctl_name = NET_IPV4, },
/* unlike the two entries above, "vs" sets no .ctl_name */
1597 { .procname = "vs", },
1600 EXPORT_SYMBOL_GPL(net_vs_ctl_path);
/*
 * Sysctl registration handle: assigned from register_sysctl_paths() in
 * ip_vs_control_init() and handed to unregister_sysctl_table() in
 * ip_vs_control_cleanup().
 */
1602 static struct ctl_table_header * sysctl_header;
1604 #ifdef CONFIG_PROC_FS
/*
 * Iterator state member: the service table currently being walked by
 * the seq_file code below.  It is set to either ip_vs_svc_table or
 * ip_vs_svc_fwm_table and compared against them to tell the two apart.
 */
1607 struct list_head *table;
1612 * Write the contents of the VS rule table to a PROCfs file.
1613 * (It is kept just for backward compatibility)
/*
 * Return a constant name string for the forwarding method selected by
 * (flags & IP_VS_CONN_F_FWD_MASK).  The local-node, tunnel and
 * direct-route methods are handled by explicit cases.
 * ip_vs_info_seq_show() prints the result in the "Forward" column.
 */
1615 static inline const char *ip_vs_fwd_name(unsigned flags)
1617 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1618 case IP_VS_CONN_F_LOCALNODE:
1620 case IP_VS_CONN_F_TUNNEL:
1622 case IP_VS_CONN_F_DROUTE:
1630 /* Get the Nth entry in the two lists */
/*
 * Walk every bucket of ip_vs_svc_table and then every bucket of
 * ip_vs_svc_fwm_table looking for the entry at position pos, and record
 * in the seq_file's private ip_vs_iter which table the walk is in.
 * NOTE(review): the position test and the return statements are not
 * visible in this excerpt; presumably NULL is returned when pos is past
 * the last service -- confirm against the full source.
 */
1631 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1633 struct ip_vs_iter *iter = seq->private;
1635 struct ip_vs_service *svc;
1637 /* look in hash by protocol */
1638 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1639 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1641 iter->table = ip_vs_svc_table;
1648 /* keep looking in fwmark */
1649 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1650 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1652 iter->table = ip_vs_svc_fwm_table;
/*
 * seq_file .start: take the service-table read lock (bottom halves
 * disabled); it is released in ip_vs_info_seq_stop().  Position 0 yields
 * SEQ_START_TOKEN, position N > 0 yields service number N - 1.
 */
1662 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1665 read_lock_bh(&__ip_vs_svc_lock);
1666 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
/*
 * seq_file .next: advance from v to the following service.
 * After the start token the walk begins at entry 0.  Otherwise the rest
 * of the current bucket's list is tried first, then later buckets of the
 * same table; once the protocol-hashed table is exhausted the iterator
 * is switched to the fwmark table and the same scheme is applied there.
 */
1670 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1672 struct list_head *e;
1673 struct ip_vs_iter *iter;
1674 struct ip_vs_service *svc;
1677 if (v == SEQ_START_TOKEN)
1678 return ip_vs_info_array(seq,0);
1681 iter = seq->private;
1683 if (iter->table == ip_vs_svc_table) {
1684 /* next service in table hashed by protocol */
/* reaching the bucket head again means this bucket's list is finished */
1685 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1686 return list_entry(e, struct ip_vs_service, s_list);
1689 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1690 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1696 iter->table = ip_vs_svc_fwm_table;
1701 /* next service in hashed by fwmark */
1702 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1703 return list_entry(e, struct ip_vs_service, f_list);
1706 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1707 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
/* seq_file .stop: drop the read lock taken in ip_vs_info_seq_start(). */
1715 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1717 read_unlock_bh(&__ip_vs_svc_lock);
/*
 * seq_file .show: for the start token print the version banner and the
 * two column-header lines.  For a service print one line describing it
 * (protocol/address/port form when iterating ip_vs_svc_table, or the
 * "FWM" fwmark form), persistence data when IP_VS_SVC_F_PERSISTENT is
 * set, and then one "->" line per entry on svc->destinations.
 */
1721 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1723 if (v == SEQ_START_TOKEN) {
1725 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1726 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1728 "Prot LocalAddress:Port Scheduler Flags\n");
1730 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1732 const struct ip_vs_service *svc = v;
1733 const struct ip_vs_iter *iter = seq->private;
1734 const struct ip_vs_dest *dest;
/* the iterator's table tells which of the two line formats applies */
1736 if (iter->table == ip_vs_svc_table)
1737 seq_printf(seq, "%s %08X:%04X %s ",
1738 ip_vs_proto_name(svc->protocol),
1741 svc->scheduler->name);
1743 seq_printf(seq, "FWM %08X %s ",
1744 svc->fwmark, svc->scheduler->name);
1746 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1747 seq_printf(seq, "persistent %d %08X\n",
1749 ntohl(svc->netmask));
1751 seq_putc(seq, '\n');
/* one line per destination; address and port are converted to host order */
1753 list_for_each_entry(dest, &svc->destinations, n_list) {
1755 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1756 ntohl(dest->addr), ntohs(dest->port),
1757 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1758 atomic_read(&dest->weight),
1759 atomic_read(&dest->activeconns),
1760 atomic_read(&dest->inactconns));
/*
 * seq_file callbacks for the service listing; installed by
 * ip_vs_info_open() via seq_open_private().
 */
1766 static const struct seq_operations ip_vs_info_seq_ops = {
1767 .start = ip_vs_info_seq_start,
1768 .next = ip_vs_info_seq_next,
1769 .stop = ip_vs_info_seq_stop,
1770 .show = ip_vs_info_seq_show,
/*
 * open() handler for the service listing: sets up a seq_file using
 * ip_vs_info_seq_ops with a private area sized for one struct ip_vs_iter.
 */
1773 static int ip_vs_info_open(struct inode *inode, struct file *file)
1775 return seq_open_private(file, &ip_vs_info_seq_ops,
1776 sizeof(struct ip_vs_iter));
/*
 * File operations for the "ip_vs" proc entry created in
 * ip_vs_control_init().  Release uses seq_release_private to match the
 * seq_open_private() call made in ip_vs_info_open().
 */
1779 static const struct file_operations ip_vs_info_fops = {
1780 .owner = THIS_MODULE,
1781 .open = ip_vs_info_open,
1783 .llseek = seq_lseek,
1784 .release = seq_release_private,
/*
 * Global IPVS statistics.  ip_vs_control_init() zeroes it, initialises
 * its lock and attaches an estimator; ip_vs_stats_show() prints it.
 */
1789 struct ip_vs_stats ip_vs_stats;
1791 #ifdef CONFIG_PROC_FS
/*
 * single_open() show callback for the "ip_vs_stats" proc entry: prints
 * the column headers, then the global totals and the per-second rates
 * in hexadecimal.  ip_vs_stats.lock is held (bottom halves disabled)
 * while the counters are read.
 */
1792 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1795 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1797 " Total Incoming Outgoing Incoming Outgoing\n");
1799 " Conns Packets Packets Bytes Bytes\n");
1801 spin_lock_bh(&ip_vs_stats.lock);
/* byte counters are cast to unsigned long long to match %LX */
1802 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1803 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1804 (unsigned long long) ip_vs_stats.inbytes,
1805 (unsigned long long) ip_vs_stats.outbytes);
1807 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1809 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1810 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1815 ip_vs_stats.outbps);
1816 spin_unlock_bh(&ip_vs_stats.lock);
/*
 * open() handler for the statistics entry: a single-record seq_file
 * rendered by ip_vs_stats_show(), with no private data.
 */
1821 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1823 return single_open(file, ip_vs_stats_show, NULL);
/*
 * File operations for the "ip_vs_stats" proc entry created in
 * ip_vs_control_init().  Release uses single_release to match the
 * single_open() call made in ip_vs_stats_seq_open().
 */
1826 static const struct file_operations ip_vs_stats_fops = {
1827 .owner = THIS_MODULE,
1828 .open = ip_vs_stats_seq_open,
1830 .llseek = seq_lseek,
1831 .release = single_release,
1837 * Set timeout values for tcp tcpfin udp in the timeout_table.
1839 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1841 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1846 #ifdef CONFIG_IP_VS_PROTO_TCP
1847 if (u->tcp_timeout) {
1848 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1849 = u->tcp_timeout * HZ;
1852 if (u->tcp_fin_timeout) {
1853 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1854 = u->tcp_fin_timeout * HZ;
1858 #ifdef CONFIG_IP_VS_PROTO_UDP
1859 if (u->udp_timeout) {
1860 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1861 = u->udp_timeout * HZ;
/*
 * Expected setsockopt() argument length for each IPVS "set" command,
 * indexed by the command number relative to IP_VS_BASE_CTL.
 * do_ip_vs_set_ctl() rejects any call whose length differs from the
 * table entry, and MAX_ARG_LEN sizes that function's on-stack buffer.
 * NOTE(review): the table elements are unsigned char, so every
 * *_ARG_LEN value must stay below 256 -- confirm if the user
 * structures ever grow.
 */
1868 #define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
1869 #define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
1870 #define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
1871 sizeof(struct ip_vs_dest_user))
1872 #define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
1873 #define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
/* largest of the lengths above: service plus destination */
1874 #define MAX_ARG_LEN SVCDEST_ARG_LEN
1876 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1877 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1878 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1879 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1880 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1881 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1882 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1883 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1884 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1885 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1886 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1887 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
/*
 * setsockopt() handler for the IPVS "set" commands (installed as .set in
 * ip_vs_sockopts).
 *
 * The caller needs CAP_NET_ADMIN and must pass exactly the argument
 * length listed in set_arglen[] for cmd.  The argument is copied into a
 * local buffer, the module use count is raised and __ip_vs_mutex is
 * taken for the duration of the command.  FLUSH, TIMEOUT, STARTDAEMON
 * and STOPDAEMON are handled directly; every other command interprets
 * the buffer as a struct ip_vs_service_user optionally followed by a
 * struct ip_vs_dest_user, looks the service up and dispatches on cmd.
 */
1891 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1894 unsigned char arg[MAX_ARG_LEN];
1895 struct ip_vs_service_user *usvc;
1896 struct ip_vs_service *svc;
1897 struct ip_vs_dest_user *udest;
1899 if (!capable(CAP_NET_ADMIN))
/* the length must match the per-command table entry exactly */
1902 if (len != set_arglen[SET_CMDID(cmd)]) {
1903 IP_VS_ERR("set_ctl: len %u != %u\n",
1904 len, set_arglen[SET_CMDID(cmd)]);
/* len was validated above, so it fits in arg[] as long as no table entry exceeds MAX_ARG_LEN */
1908 if (copy_from_user(arg, user, len) != 0)
1911 /* increase the module use count */
1912 ip_vs_use_count_inc();
/* the lock wait can be interrupted by a signal */
1914 if (mutex_lock_interruptible(&__ip_vs_mutex)) {
1919 if (cmd == IP_VS_SO_SET_FLUSH) {
1920 /* Flush the virtual service */
1921 ret = ip_vs_flush();
1923 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1924 /* Set timeout values for (tcp tcpfin udp) */
1925 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1927 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1928 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1929 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1931 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1932 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1933 ret = stop_sync_thread(dm->state);
/* remaining commands: service description first, destination right after it */
1937 usvc = (struct ip_vs_service_user *)arg;
1938 udest = (struct ip_vs_dest_user *)(usvc + 1);
1940 if (cmd == IP_VS_SO_SET_ZERO) {
1941 /* if no service address is set, zero counters in all */
1942 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1943 ret = ip_vs_zero_all();
/*
 * NOTE(review): usvc->sched_name comes straight from user space and is
 * printed with %s below; no NUL-termination check is visible in this
 * excerpt -- confirm it cannot be read past the end of the field.
 */
1948 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1949 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1950 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1951 usvc->protocol, NIPQUAD(usvc->addr),
1952 ntohs(usvc->port), usvc->sched_name);
1957 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1958 if (usvc->fwmark == 0)
1959 svc = __ip_vs_service_get(usvc->protocol,
1960 usvc->addr, usvc->port);
1962 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
/* every command except ADD needs an existing service of the same protocol */
1964 if (cmd != IP_VS_SO_SET_ADD
1965 && (svc == NULL || svc->protocol != usvc->protocol)) {
1971 case IP_VS_SO_SET_ADD:
1975 ret = ip_vs_add_service(usvc, &svc);
1977 case IP_VS_SO_SET_EDIT:
1978 ret = ip_vs_edit_service(svc, usvc);
1980 case IP_VS_SO_SET_DEL:
1981 ret = ip_vs_del_service(svc);
1985 case IP_VS_SO_SET_ZERO:
1986 ret = ip_vs_zero_service(svc);
1988 case IP_VS_SO_SET_ADDDEST:
1989 ret = ip_vs_add_dest(svc, udest);
1991 case IP_VS_SO_SET_EDITDEST:
1992 ret = ip_vs_edit_dest(svc, udest);
1994 case IP_VS_SO_SET_DELDEST:
1995 ret = ip_vs_del_dest(svc, udest);
/* release the service obtained by the lookup above */
2002 ip_vs_service_put(svc);
2005 mutex_unlock(&__ip_vs_mutex);
2007 /* decrease the module use count */
2008 ip_vs_use_count_dec();
/*
 * Snapshot src into the user-visible statistics structure while holding
 * src->lock.  Only the bytes of *src that precede its lock member are
 * copied.
 * NOTE(review): this relies on struct ip_vs_stats_user having the same
 * layout as the leading part of struct ip_vs_stats -- confirm in the
 * header that declares both.
 */
2015 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2017 spin_lock_bh(&src->lock);
2018 memcpy(dst, src, (char*)&src->lock - (char*)src);
2019 spin_unlock_bh(&src->lock);
/*
 * Fill the user-visible service entry dst from the kernel service src:
 * identity (protocol, addr, port, fwmark), scheduler name, flags,
 * netmask, destination count and a statistics snapshot.  The timeout is
 * divided by HZ before being stored.
 */
2023 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2025 dst->protocol = src->protocol;
2026 dst->addr = src->addr;
2027 dst->port = src->port;
2028 dst->fwmark = src->fwmark;
/* bounded copy: strlcpy() limits the write to sizeof(dst->sched_name) */
2029 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2030 dst->flags = src->flags;
2031 dst->timeout = src->timeout / HZ;
2032 dst->netmask = src->netmask;
2033 dst->num_dests = src->num_dests;
2034 ip_vs_copy_stats(&dst->stats, &src->stats);
/*
 * Copy service entries to the user buffer uptr->entrytable[]: first the
 * services in ip_vs_svc_table, then those in ip_vs_svc_fwm_table.  Both
 * loops test the running count against get->num_services.  Each entry
 * is built in a zeroed local structure before being copied out, so
 * bytes not set by ip_vs_copy_service() reach user space as zero.
 */
2038 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2039 struct ip_vs_get_services __user *uptr)
2042 struct ip_vs_service *svc;
2043 struct ip_vs_service_entry entry;
/* services hashed by <protocol, addr, port> */
2046 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2047 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2048 if (count >= get->num_services)
2050 memset(&entry, 0, sizeof(entry));
2051 ip_vs_copy_service(&entry, svc);
2052 if (copy_to_user(&uptr->entrytable[count],
2053 &entry, sizeof(entry))) {
/* services hashed by fwmark */
2061 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2062 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2063 if (count >= get->num_services)
2065 memset(&entry, 0, sizeof(entry));
2066 ip_vs_copy_service(&entry, svc);
2067 if (copy_to_user(&uptr->entrytable[count],
2068 &entry, sizeof(entry))) {
2080 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2081 struct ip_vs_get_dests __user *uptr)
2083 struct ip_vs_service *svc;
2087 svc = __ip_vs_svc_fwm_get(get->fwmark);
2089 svc = __ip_vs_service_get(get->protocol,
2090 get->addr, get->port);
2093 struct ip_vs_dest *dest;
2094 struct ip_vs_dest_entry entry;
2096 list_for_each_entry(dest, &svc->destinations, n_list) {
2097 if (count >= get->num_dests)
2100 entry.addr = dest->addr;
2101 entry.port = dest->port;
2102 entry.conn_flags = atomic_read(&dest->conn_flags);
2103 entry.weight = atomic_read(&dest->weight);
2104 entry.u_threshold = dest->u_threshold;
2105 entry.l_threshold = dest->l_threshold;
2106 entry.activeconns = atomic_read(&dest->activeconns);
2107 entry.inactconns = atomic_read(&dest->inactconns);
2108 entry.persistconns = atomic_read(&dest->persistconns);
2109 ip_vs_copy_stats(&entry.stats, &dest->stats);
2110 if (copy_to_user(&uptr->entrytable[count],
2111 &entry, sizeof(entry))) {
2117 ip_vs_service_put(svc);
2124 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2126 #ifdef CONFIG_IP_VS_PROTO_TCP
2128 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2129 u->tcp_fin_timeout =
2130 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2132 #ifdef CONFIG_IP_VS_PROTO_UDP
2134 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
/*
 * Minimum getsockopt() argument length for each IPVS "get" command,
 * indexed by the command number relative to IP_VS_BASE_CTL.
 * do_ip_vs_get_ctl() rejects a shorter *len and copies exactly this
 * many bytes of the request from user space.
 * NOTE(review): entries are unsigned char and are used as the copy
 * length into a 128-byte buffer in do_ip_vs_get_ctl() -- confirm that
 * every value here is at most 128.
 */
2139 #define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2140 #define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2141 #define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2142 #define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2143 #define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2144 #define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
/* room for two daemon records; do_ip_vs_get_ctl() returns a d[2] array */
2145 #define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2147 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2148 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2149 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2150 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2151 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2152 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2153 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2154 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
/*
 * getsockopt() handler for the IPVS "get" commands (installed as .get in
 * ip_vs_sockopts).
 *
 * The caller needs CAP_NET_ADMIN and *len must be at least the value in
 * get_arglen[] for cmd.  That many bytes of the request are copied into
 * a local buffer, __ip_vs_mutex is taken, and the command is dispatched:
 * version string, general info, service list, single service,
 * destination list, timeouts or sync-daemon state.  Results are written
 * back to the user buffer.
 */
2158 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2160 unsigned char arg[128];
2163 if (!capable(CAP_NET_ADMIN))
2166 if (*len < get_arglen[GET_CMDID(cmd)]) {
2167 IP_VS_ERR("get_ctl: len %u < %u\n",
2168 *len, get_arglen[GET_CMDID(cmd)]);
/*
 * NOTE(review): the copy length comes from get_arglen[], not from the
 * size of arg[]; this is only safe while every table entry is <= 128.
 */
2172 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2175 if (mutex_lock_interruptible(&__ip_vs_mutex))
2176 return -ERESTARTSYS;
2179 case IP_VS_SO_GET_VERSION:
/* same banner text as the first line of the "ip_vs" proc file, without the newline */
2183 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2184 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2185 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
/* report the number of bytes written, including the terminating NUL */
2189 *len = strlen(buf)+1;
2193 case IP_VS_SO_GET_INFO:
2195 struct ip_vs_getinfo info;
2196 info.version = IP_VS_VERSION_CODE;
2197 info.size = IP_VS_CONN_TAB_SIZE;
2198 info.num_services = ip_vs_num_services;
2199 if (copy_to_user(user, &info, sizeof(info)) != 0)
2204 case IP_VS_SO_GET_SERVICES:
2206 struct ip_vs_get_services *get;
2209 get = (struct ip_vs_get_services *)arg;
/* expected buffer size: header plus num_services entries (see the error text below) */
2210 size = sizeof(*get) +
2211 sizeof(struct ip_vs_service_entry) * get->num_services;
2213 IP_VS_ERR("length: %u != %u\n", *len, size);
2217 ret = __ip_vs_get_service_entries(get, user);
2221 case IP_VS_SO_GET_SERVICE:
2223 struct ip_vs_service_entry *entry;
2224 struct ip_vs_service *svc;
/* the request buffer doubles as the reply: it is overwritten in place and copied back */
2226 entry = (struct ip_vs_service_entry *)arg;
2228 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2230 svc = __ip_vs_service_get(entry->protocol,
2231 entry->addr, entry->port);
2233 ip_vs_copy_service(entry, svc);
2234 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2236 ip_vs_service_put(svc);
2242 case IP_VS_SO_GET_DESTS:
2244 struct ip_vs_get_dests *get;
2247 get = (struct ip_vs_get_dests *)arg;
/* expected buffer size: header plus num_dests entries (see the error text below) */
2248 size = sizeof(*get) +
2249 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2251 IP_VS_ERR("length: %u != %u\n", *len, size);
2255 ret = __ip_vs_get_dest_entries(get, user);
2259 case IP_VS_SO_GET_TIMEOUT:
2261 struct ip_vs_timeout_user t;
/* t is not initialised here; all of it is copied out after the helper returns */
2263 __ip_vs_get_timeouts(&t);
2264 if (copy_to_user(user, &t, sizeof(t)) != 0)
2269 case IP_VS_SO_GET_DAEMON:
2271 struct ip_vs_daemon_user d[2];
/* d[0] describes the master daemon, d[1] the backup; unused slots stay zero */
2273 memset(&d, 0, sizeof(d));
2274 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2275 d[0].state = IP_VS_STATE_MASTER;
2276 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2277 d[0].syncid = ip_vs_master_syncid;
2279 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2280 d[1].state = IP_VS_STATE_BACKUP;
2281 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2282 d[1].syncid = ip_vs_backup_syncid;
2284 if (copy_to_user(user, &d, sizeof(d)) != 0)
2294 mutex_unlock(&__ip_vs_mutex);
/*
 * Netfilter sockopt registration for IPVS: routes set commands in
 * [IP_VS_BASE_CTL, IP_VS_SO_SET_MAX] to do_ip_vs_set_ctl() and get
 * commands in [IP_VS_BASE_CTL, IP_VS_SO_GET_MAX] to do_ip_vs_get_ctl().
 * The optmax fields are given as MAX+1.  Registered in
 * ip_vs_control_init() and unregistered in ip_vs_control_cleanup().
 */
2299 static struct nf_sockopt_ops ip_vs_sockopts = {
2301 .set_optmin = IP_VS_BASE_CTL,
2302 .set_optmax = IP_VS_SO_SET_MAX+1,
2303 .set = do_ip_vs_set_ctl,
2304 .get_optmin = IP_VS_BASE_CTL,
2305 .get_optmax = IP_VS_SO_GET_MAX+1,
2306 .get = do_ip_vs_get_ctl,
2307 .owner = THIS_MODULE,
/*
 * Set up the IPVS control plane: register the sockopt interface, create
 * the "ip_vs" and "ip_vs_stats" proc entries, register the sysctl table,
 * initialise the list heads of the service, fwmark and real-server hash
 * tables, reset the global statistics and attach an estimator to them,
 * and schedule the first run of the defense work.
 * NOTE(review): in the visible lines only nf_register_sockopt() has its
 * result examined; the results of proc_net_fops_create() and
 * register_sysctl_paths() are not checked.
 */
2311 int ip_vs_control_init(void)
2318 ret = nf_register_sockopt(&ip_vs_sockopts);
2320 IP_VS_ERR("cannot register sockopt.\n");
2324 proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
2325 proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
2327 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
2329 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2330 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2331 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2332 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2334 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2335 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
/* zero the global stats first, then set up the lock the memset just cleared */
2338 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2339 spin_lock_init(&ip_vs_stats.lock);
2340 ip_vs_new_estimator(&ip_vs_stats);
2342 /* Hook the defense timer */
2343 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
/*
 * Tear down what ip_vs_control_init() set up: clean the trash, stop the
 * defense work, kill the global statistics estimator, then remove the
 * sysctl table, the two proc entries and the sockopt registration.  The
 * sysctl, proc and sockopt steps run in the reverse of their
 * registration order.
 */
2350 void ip_vs_control_cleanup(void)
2353 ip_vs_trash_cleanup();
/* cancel the delayed work, then wait for the underlying work item */
2354 cancel_rearming_delayed_work(&defense_work);
2355 cancel_work_sync(&defense_work.work);
2356 ip_vs_kill_estimator(&ip_vs_stats);
2357 unregister_sysctl_table(sysctl_header);
2358 proc_net_remove(&init_net, "ip_vs_stats");
2359 proc_net_remove(&init_net, "ip_vs");
2360 nf_unregister_sockopt(&ip_vs_sockopts);