2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
27 #include <linux/sysctl.h>
28 #include <linux/proc_fs.h>
29 #include <linux/workqueue.h>
30 #include <linux/swap.h>
31 #include <linux/proc_fs.h>
32 #include <linux/seq_file.h>
34 #include <linux/netfilter.h>
35 #include <linux/netfilter_ipv4.h>
38 #include <net/route.h>
41 #include <asm/uaccess.h>
43 #include <net/ip_vs.h>
45 /* semaphore for IPVS sockopts; needed because [gs]etsockopt may sleep. */
46 static DECLARE_MUTEX(__ip_vs_mutex);
48 /* lock for service table */
49 static DEFINE_RWLOCK(__ip_vs_svc_lock);
51 /* lock for table with the real services */
52 static DEFINE_RWLOCK(__ip_vs_rs_lock);
54 /* lock for state and timeout tables */
55 static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
57 /* lock for drop entry handling */
58 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
60 /* lock for drop packet handling */
61 static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
63 /* 1/rate drop and drop-entry variables */
64 int ip_vs_drop_rate = 0;
65 int ip_vs_drop_counter = 0;
66 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
68 /* number of virtual services */
69 static int ip_vs_num_services = 0;
71 /* sysctl variables */
72 static int sysctl_ip_vs_drop_entry = 0;
73 static int sysctl_ip_vs_drop_packet = 0;
74 static int sysctl_ip_vs_secure_tcp = 0;
75 static int sysctl_ip_vs_amemthresh = 1024;
76 static int sysctl_ip_vs_am_droprate = 10;
77 int sysctl_ip_vs_cache_bypass = 0;
78 int sysctl_ip_vs_expire_nodest_conn = 0;
79 int sysctl_ip_vs_expire_quiescent_template = 0;
80 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
81 int sysctl_ip_vs_nat_icmp_send = 0;
84 #ifdef CONFIG_IP_VS_DEBUG
85 static int sysctl_ip_vs_debug_level = 0;
87 int ip_vs_get_debug_level(void)
89 return sysctl_ip_vs_debug_level;
94 * update_defense_level is called from keventd and from sysctl,
95 * so it needs to protect itself from softirqs
97 static void update_defense_level(void)
100 static int old_secure_tcp = 0;
105 /* we only count free and buffered memory (in pages) */
107 availmem = i.freeram + i.bufferram;
108 /* however in linux 2.5 the i.bufferram is total page cache size,
110 /* si_swapinfo(&i); */
111 /* availmem = availmem - (i.totalswap - i.freeswap); */
113 nomem = (availmem < sysctl_ip_vs_amemthresh);
118 spin_lock(&__ip_vs_dropentry_lock);
119 switch (sysctl_ip_vs_drop_entry) {
121 atomic_set(&ip_vs_dropentry, 0);
125 atomic_set(&ip_vs_dropentry, 1);
126 sysctl_ip_vs_drop_entry = 2;
128 atomic_set(&ip_vs_dropentry, 0);
133 atomic_set(&ip_vs_dropentry, 1);
135 atomic_set(&ip_vs_dropentry, 0);
136 sysctl_ip_vs_drop_entry = 1;
140 atomic_set(&ip_vs_dropentry, 1);
143 spin_unlock(&__ip_vs_dropentry_lock);
146 spin_lock(&__ip_vs_droppacket_lock);
147 switch (sysctl_ip_vs_drop_packet) {
153 ip_vs_drop_rate = ip_vs_drop_counter
154 = sysctl_ip_vs_amemthresh /
155 (sysctl_ip_vs_amemthresh-availmem);
156 sysctl_ip_vs_drop_packet = 2;
163 ip_vs_drop_rate = ip_vs_drop_counter
164 = sysctl_ip_vs_amemthresh /
165 (sysctl_ip_vs_amemthresh-availmem);
168 sysctl_ip_vs_drop_packet = 1;
172 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
175 spin_unlock(&__ip_vs_droppacket_lock);
178 write_lock(&__ip_vs_securetcp_lock);
179 switch (sysctl_ip_vs_secure_tcp) {
181 if (old_secure_tcp >= 2)
186 if (old_secure_tcp < 2)
188 sysctl_ip_vs_secure_tcp = 2;
190 if (old_secure_tcp >= 2)
196 if (old_secure_tcp < 2)
199 if (old_secure_tcp >= 2)
201 sysctl_ip_vs_secure_tcp = 1;
205 if (old_secure_tcp < 2)
209 old_secure_tcp = sysctl_ip_vs_secure_tcp;
211 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
212 write_unlock(&__ip_vs_securetcp_lock);
219 * Timer for checking the defense
/*
 * Delay, in jiffies, between two runs of the defense work handler.
 * The expansion is parenthesized so that it behaves as a single operand
 * wherever it is used (e.g. as a divisor or next to a higher-precedence
 * operator).
 */
#define DEFENSE_TIMER_PERIOD	(1*HZ)
222 static void defense_work_handler(void *data);
223 static DECLARE_WORK(defense_work, defense_work_handler, NULL);
225 static void defense_work_handler(void *data)
227 update_defense_level();
228 if (atomic_read(&ip_vs_dropentry))
229 ip_vs_random_dropentry();
231 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
235 ip_vs_use_count_inc(void)
237 return try_module_get(THIS_MODULE);
241 ip_vs_use_count_dec(void)
243 module_put(THIS_MODULE);
248 * Hash table: for virtual service lookups
250 #define IP_VS_SVC_TAB_BITS 8
251 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
252 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
254 /* the service table hashed by <protocol, addr, port> */
255 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
256 /* the service table hashed by fwmark */
257 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
260 * Hash table: for real service lookups
262 #define IP_VS_RTAB_BITS 4
263 #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
264 #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
266 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
269 * Trash for destinations
271 static LIST_HEAD(ip_vs_dest_trash);
274 * FTP & NULL virtual service counters
276 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
277 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
281 * Returns hash value for virtual service
283 static __inline__ unsigned
284 ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
286 register unsigned porth = ntohs(port);
288 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
289 & IP_VS_SVC_TAB_MASK;
293 * Returns hash value of fwmark for virtual service lookup
295 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
297 return fwmark & IP_VS_SVC_TAB_MASK;
301 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
302 * or in the ip_vs_svc_fwm_table by fwmark.
303 * Should be called with locked tables.
305 static int ip_vs_svc_hash(struct ip_vs_service *svc)
309 if (svc->flags & IP_VS_SVC_F_HASHED) {
310 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
311 "called from %p\n", __builtin_return_address(0));
315 if (svc->fwmark == 0) {
317 * Hash it by <protocol,addr,port> in ip_vs_svc_table
319 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
320 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
323 * Hash it by fwmark in ip_vs_svc_fwm_table
325 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
326 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
329 svc->flags |= IP_VS_SVC_F_HASHED;
330 /* increase its refcnt because it is referenced by the svc table */
331 atomic_inc(&svc->refcnt);
337 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
338 * Should be called with locked tables.
340 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
342 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
343 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
344 "called from %p\n", __builtin_return_address(0));
348 if (svc->fwmark == 0) {
349 /* Remove it from the ip_vs_svc_table table */
350 list_del(&svc->s_list);
352 /* Remove it from the ip_vs_svc_fwm_table table */
353 list_del(&svc->f_list);
356 svc->flags &= ~IP_VS_SVC_F_HASHED;
357 atomic_dec(&svc->refcnt);
363 * Get service by {proto,addr,port} in the service table.
365 static __inline__ struct ip_vs_service *
366 __ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
369 struct ip_vs_service *svc;
371 /* Check for "full" addressed entries */
372 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
374 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
375 if ((svc->addr == vaddr)
376 && (svc->port == vport)
377 && (svc->protocol == protocol)) {
379 atomic_inc(&svc->usecnt);
389 * Get service by {fwmark} in the service table.
391 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
394 struct ip_vs_service *svc;
396 /* Check for fwmark addressed entries */
397 hash = ip_vs_svc_fwm_hashkey(fwmark);
399 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
400 if (svc->fwmark == fwmark) {
402 atomic_inc(&svc->usecnt);
410 struct ip_vs_service *
411 ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
413 struct ip_vs_service *svc;
415 read_lock(&__ip_vs_svc_lock);
418 * Check the table hashed by fwmark first
420 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
424 * Check the table hashed by <protocol,addr,port>
425 * for "full" addressed entries
427 svc = __ip_vs_service_get(protocol, vaddr, vport);
430 && protocol == IPPROTO_TCP
431 && atomic_read(&ip_vs_ftpsvc_counter)
432 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
434 * Check if ftp service entry exists, the packet
435 * might belong to FTP data connections.
437 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
441 && atomic_read(&ip_vs_nullsvc_counter)) {
443 * Check if the catch-all port (port zero) exists
445 svc = __ip_vs_service_get(protocol, vaddr, 0);
449 read_unlock(&__ip_vs_svc_lock);
451 IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
452 fwmark, ip_vs_proto_name(protocol),
453 NIPQUAD(vaddr), ntohs(vport),
454 svc?"hit":"not hit");
461 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
463 atomic_inc(&svc->refcnt);
468 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
470 struct ip_vs_service *svc = dest->svc;
473 if (atomic_dec_and_test(&svc->refcnt))
479 * Returns hash value for real service
481 static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
483 register unsigned porth = ntohs(port);
485 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
490 * Hashes ip_vs_dest in ip_vs_rtable by <addr,port>.
491 * should be called with locked tables.
493 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
497 if (!list_empty(&dest->d_list)) {
502 * Hash by proto,addr,port,
503 * which are the parameters of the real service.
505 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
506 list_add(&dest->d_list, &ip_vs_rtable[hash]);
512 * UNhashes ip_vs_dest from ip_vs_rtable.
513 * should be called with locked tables.
515 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
518 * Remove it from the ip_vs_rtable table.
520 if (!list_empty(&dest->d_list)) {
521 list_del(&dest->d_list);
522 INIT_LIST_HEAD(&dest->d_list);
529 * Lookup real service by <proto,addr,port> in the real service table.
532 ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
535 struct ip_vs_dest *dest;
538 * Check for "full" addressed entries
539 * Return the first found entry
541 hash = ip_vs_rs_hashkey(daddr, dport);
543 read_lock(&__ip_vs_rs_lock);
544 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
545 if ((dest->addr == daddr)
546 && (dest->port == dport)
547 && ((dest->protocol == protocol) ||
550 read_unlock(&__ip_vs_rs_lock);
554 read_unlock(&__ip_vs_rs_lock);
560 * Lookup destination by {addr,port} in the given service
562 static struct ip_vs_dest *
563 ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
565 struct ip_vs_dest *dest;
568 * Find the destination for the given service
570 list_for_each_entry(dest, &svc->destinations, n_list) {
571 if ((dest->addr == daddr) && (dest->port == dport)) {
582 * Lookup dest by {svc,addr,port} in the destination trash.
583 * The destination trash is used to hold the destinations that are removed
584 * from the service table but are still referenced by some conn entries.
585 * The reason to add the destination trash is when the dest is temporarily
586 * down (either by administrator or by monitor program), the dest can be
587 * picked back from the trash, the remaining connections to the dest can
588 * continue, and the counting information of the dest is also useful for
591 static struct ip_vs_dest *
592 ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
594 struct ip_vs_dest *dest, *nxt;
597 * Find the destination in trash
599 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
600 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
603 NIPQUAD(dest->addr), ntohs(dest->port),
604 atomic_read(&dest->refcnt));
605 if (dest->addr == daddr &&
606 dest->port == dport &&
607 dest->vfwmark == svc->fwmark &&
608 dest->protocol == svc->protocol &&
610 (dest->vaddr == svc->addr &&
611 dest->vport == svc->port))) {
617 * Try to purge the destination from trash if not referenced
619 if (atomic_read(&dest->refcnt) == 1) {
620 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
623 NIPQUAD(dest->addr), ntohs(dest->port));
624 list_del(&dest->n_list);
625 ip_vs_dst_reset(dest);
626 __ip_vs_unbind_svc(dest);
636 * Clean up all the destinations in the trash
637 * Called by the ip_vs_control_cleanup()
639 * When the ip_vs_control_cleanup is activated by ipvs module exit,
640 * the service tables must have been flushed and all the connections
641 * are expired, and the refcnt of each destination in the trash must
642 * be 1, so we simply release them here.
644 static void ip_vs_trash_cleanup(void)
646 struct ip_vs_dest *dest, *nxt;
648 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
649 list_del(&dest->n_list);
650 ip_vs_dst_reset(dest);
651 __ip_vs_unbind_svc(dest);
658 ip_vs_zero_stats(struct ip_vs_stats *stats)
660 spin_lock_bh(&stats->lock);
661 memset(stats, 0, (char *)&stats->lock - (char *)stats);
662 spin_unlock_bh(&stats->lock);
663 ip_vs_zero_estimator(stats);
667 * Update a destination in the given service
670 __ip_vs_update_dest(struct ip_vs_service *svc,
671 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
675 /* set the weight and the flags */
676 atomic_set(&dest->weight, udest->weight);
677 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
679 /* check if local node and update the flags */
680 if (inet_addr_type(udest->addr) == RTN_LOCAL) {
681 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
682 | IP_VS_CONN_F_LOCALNODE;
685 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
686 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
687 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
690 * Put the real service in ip_vs_rtable if not present.
691 * For now only for NAT!
693 write_lock_bh(&__ip_vs_rs_lock);
695 write_unlock_bh(&__ip_vs_rs_lock);
697 atomic_set(&dest->conn_flags, conn_flags);
699 /* bind the service */
701 __ip_vs_bind_svc(dest, svc);
703 if (dest->svc != svc) {
704 __ip_vs_unbind_svc(dest);
705 ip_vs_zero_stats(&dest->stats);
706 __ip_vs_bind_svc(dest, svc);
710 /* set the dest status flags */
711 dest->flags |= IP_VS_DEST_F_AVAILABLE;
713 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
714 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
715 dest->u_threshold = udest->u_threshold;
716 dest->l_threshold = udest->l_threshold;
721 * Create a destination for the given service
724 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
725 struct ip_vs_dest **dest_p)
727 struct ip_vs_dest *dest;
732 atype = inet_addr_type(udest->addr);
733 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
736 dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
738 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
741 memset(dest, 0, sizeof(struct ip_vs_dest));
743 dest->protocol = svc->protocol;
744 dest->vaddr = svc->addr;
745 dest->vport = svc->port;
746 dest->vfwmark = svc->fwmark;
747 dest->addr = udest->addr;
748 dest->port = udest->port;
750 atomic_set(&dest->activeconns, 0);
751 atomic_set(&dest->inactconns, 0);
752 atomic_set(&dest->persistconns, 0);
753 atomic_set(&dest->refcnt, 0);
755 INIT_LIST_HEAD(&dest->d_list);
756 spin_lock_init(&dest->dst_lock);
757 spin_lock_init(&dest->stats.lock);
758 __ip_vs_update_dest(svc, dest, udest);
759 ip_vs_new_estimator(&dest->stats);
769 * Add a destination into an existing service
772 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
774 struct ip_vs_dest *dest;
775 __u32 daddr = udest->addr;
776 __u16 dport = udest->port;
781 if (udest->weight < 0) {
782 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
786 if (udest->l_threshold > udest->u_threshold) {
787 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
788 "upper threshold\n");
793 * Check if the dest already exists in the list
795 dest = ip_vs_lookup_dest(svc, daddr, dport);
797 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
802 * Check if the dest already exists in the trash and
803 * is from the same service
805 dest = ip_vs_trash_get_dest(svc, daddr, dport);
807 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
808 "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
809 NIPQUAD(daddr), ntohs(dport),
810 atomic_read(&dest->refcnt),
812 NIPQUAD(dest->vaddr),
814 __ip_vs_update_dest(svc, dest, udest);
817 * Get the destination from the trash
819 list_del(&dest->n_list);
821 ip_vs_new_estimator(&dest->stats);
823 write_lock_bh(&__ip_vs_svc_lock);
826 * Wait until all other svc users go away.
828 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
830 list_add(&dest->n_list, &svc->destinations);
833 /* call the update_service function of its scheduler */
834 svc->scheduler->update_service(svc);
836 write_unlock_bh(&__ip_vs_svc_lock);
841 * Allocate and initialize the dest structure
843 ret = ip_vs_new_dest(svc, udest, &dest);
849 * Add the dest entry into the list
851 atomic_inc(&dest->refcnt);
853 write_lock_bh(&__ip_vs_svc_lock);
856 * Wait until all other svc users go away.
858 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
860 list_add(&dest->n_list, &svc->destinations);
863 /* call the update_service function of its scheduler */
864 svc->scheduler->update_service(svc);
866 write_unlock_bh(&__ip_vs_svc_lock);
875 * Edit a destination in the given service
878 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
880 struct ip_vs_dest *dest;
881 __u32 daddr = udest->addr;
882 __u16 dport = udest->port;
886 if (udest->weight < 0) {
887 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
891 if (udest->l_threshold > udest->u_threshold) {
892 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
893 "upper threshold\n");
898 * Lookup the destination list
900 dest = ip_vs_lookup_dest(svc, daddr, dport);
902 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
906 __ip_vs_update_dest(svc, dest, udest);
908 write_lock_bh(&__ip_vs_svc_lock);
910 /* Wait until all other svc users go away */
911 while (atomic_read(&svc->usecnt) > 1) {};
913 /* call the update_service, because server weight may be changed */
914 svc->scheduler->update_service(svc);
916 write_unlock_bh(&__ip_vs_svc_lock);
925 * Delete a destination (must be already unlinked from the service)
927 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
929 ip_vs_kill_estimator(&dest->stats);
932 * Remove it from the d-linked list with the real services.
934 write_lock_bh(&__ip_vs_rs_lock);
935 ip_vs_rs_unhash(dest);
936 write_unlock_bh(&__ip_vs_rs_lock);
939 * Decrease the refcnt of the dest, and free the dest
940 * if nobody refers to it (refcnt=0). Otherwise, throw
941 * the destination into the trash.
943 if (atomic_dec_and_test(&dest->refcnt)) {
944 ip_vs_dst_reset(dest);
945 /* simply decrease svc->refcnt here, let the caller check
946 and release the service if nobody refers to it.
947 Only user context can release destination and service,
948 and only one user context can update virtual service at a
949 time, so the operation here is OK */
950 atomic_dec(&dest->svc->refcnt);
953 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
955 NIPQUAD(dest->addr), ntohs(dest->port),
956 atomic_read(&dest->refcnt));
957 list_add(&dest->n_list, &ip_vs_dest_trash);
958 atomic_inc(&dest->refcnt);
964 * Unlink a destination from the given service
966 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
967 struct ip_vs_dest *dest,
970 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
973 * Remove it from the d-linked destination list.
975 list_del(&dest->n_list);
979 * Call the update_service function of its scheduler
981 svc->scheduler->update_service(svc);
987 * Delete a destination server in the given service
990 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
992 struct ip_vs_dest *dest;
993 __u32 daddr = udest->addr;
994 __u16 dport = udest->port;
998 dest = ip_vs_lookup_dest(svc, daddr, dport);
1000 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1004 write_lock_bh(&__ip_vs_svc_lock);
1007 * Wait until all other svc users go away.
1009 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1012 * Unlink dest from the service
1014 __ip_vs_unlink_dest(svc, dest, 1);
1016 write_unlock_bh(&__ip_vs_svc_lock);
1019 * Delete the destination
1021 __ip_vs_del_dest(dest);
1030 * Add a service into the service hash table
1033 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1036 struct ip_vs_scheduler *sched = NULL;
1037 struct ip_vs_service *svc = NULL;
1039 /* increase the module use count */
1040 ip_vs_use_count_inc();
1042 /* Lookup the scheduler by 'u->sched_name' */
1043 sched = ip_vs_scheduler_get(u->sched_name);
1044 if (sched == NULL) {
1045 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1051 svc = (struct ip_vs_service *)
1052 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1054 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1058 memset(svc, 0, sizeof(struct ip_vs_service));
1060 /* I'm the first user of the service */
1061 atomic_set(&svc->usecnt, 1);
1062 atomic_set(&svc->refcnt, 0);
1064 svc->protocol = u->protocol;
1065 svc->addr = u->addr;
1066 svc->port = u->port;
1067 svc->fwmark = u->fwmark;
1068 svc->flags = u->flags;
1069 svc->timeout = u->timeout * HZ;
1070 svc->netmask = u->netmask;
1072 INIT_LIST_HEAD(&svc->destinations);
1073 rwlock_init(&svc->sched_lock);
1074 spin_lock_init(&svc->stats.lock);
1076 /* Bind the scheduler */
1077 ret = ip_vs_bind_scheduler(svc, sched);
1082 /* Update the virtual service counters */
1083 if (svc->port == FTPPORT)
1084 atomic_inc(&ip_vs_ftpsvc_counter);
1085 else if (svc->port == 0)
1086 atomic_inc(&ip_vs_nullsvc_counter);
1088 ip_vs_new_estimator(&svc->stats);
1089 ip_vs_num_services++;
1091 /* Hash the service into the service table */
1092 write_lock_bh(&__ip_vs_svc_lock);
1093 ip_vs_svc_hash(svc);
1094 write_unlock_bh(&__ip_vs_svc_lock);
1102 ip_vs_unbind_scheduler(svc);
1105 ip_vs_app_inc_put(svc->inc);
1110 ip_vs_scheduler_put(sched);
1113 /* decrease the module use count */
1114 ip_vs_use_count_dec();
1121 * Edit a service and bind it with a new scheduler
1124 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1126 struct ip_vs_scheduler *sched, *old_sched;
1130 * Lookup the scheduler, by 'u->sched_name'
1132 sched = ip_vs_scheduler_get(u->sched_name);
1133 if (sched == NULL) {
1134 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1140 write_lock_bh(&__ip_vs_svc_lock);
1143 * Wait until all other svc users go away.
1145 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1148 * Set the flags and timeout value
1150 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1151 svc->timeout = u->timeout * HZ;
1152 svc->netmask = u->netmask;
1154 old_sched = svc->scheduler;
1155 if (sched != old_sched) {
1157 * Unbind the old scheduler
1159 if ((ret = ip_vs_unbind_scheduler(svc))) {
1165 * Bind the new scheduler
1167 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1169 * If ip_vs_bind_scheduler fails, restore the old
1171 * The main reason of failure is out of memory.
1173 * The question is if the old scheduler can be
1174 * restored all the time. TODO: if it cannot be
1175 * restored some time, we must delete the service,
1176 * otherwise the system may crash.
1178 ip_vs_bind_scheduler(svc, old_sched);
1185 write_unlock_bh(&__ip_vs_svc_lock);
1188 ip_vs_scheduler_put(old_sched);
1195 * Delete a service from the service list
1196 * - The service must be unlinked, unlocked and not referenced!
1197 * - We are called under _bh lock
1199 static void __ip_vs_del_service(struct ip_vs_service *svc)
1201 struct ip_vs_dest *dest, *nxt;
1202 struct ip_vs_scheduler *old_sched;
1204 ip_vs_num_services--;
1205 ip_vs_kill_estimator(&svc->stats);
1207 /* Unbind scheduler */
1208 old_sched = svc->scheduler;
1209 ip_vs_unbind_scheduler(svc);
1211 ip_vs_scheduler_put(old_sched);
1213 /* Unbind app inc */
1215 ip_vs_app_inc_put(svc->inc);
1220 * Unlink the whole destination list
1222 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1223 __ip_vs_unlink_dest(svc, dest, 0);
1224 __ip_vs_del_dest(dest);
1228 * Update the virtual service counters
1230 if (svc->port == FTPPORT)
1231 atomic_dec(&ip_vs_ftpsvc_counter);
1232 else if (svc->port == 0)
1233 atomic_dec(&ip_vs_nullsvc_counter);
1236 * Free the service if nobody refers to it
1238 if (atomic_read(&svc->refcnt) == 0)
1241 /* decrease the module use count */
1242 ip_vs_use_count_dec();
1246 * Delete a service from the service list
1248 static int ip_vs_del_service(struct ip_vs_service *svc)
1254 * Unhash it from the service table
1256 write_lock_bh(&__ip_vs_svc_lock);
1258 ip_vs_svc_unhash(svc);
1261 * Wait until all the svc users go away.
1263 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1265 __ip_vs_del_service(svc);
1267 write_unlock_bh(&__ip_vs_svc_lock);
1274 * Flush all the virtual services
1276 static int ip_vs_flush(void)
1279 struct ip_vs_service *svc, *nxt;
1282 * Flush the service table hashed by <protocol,addr,port>
1284 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1285 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1286 write_lock_bh(&__ip_vs_svc_lock);
1287 ip_vs_svc_unhash(svc);
1289 * Wait until all the svc users go away.
1291 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1292 __ip_vs_del_service(svc);
1293 write_unlock_bh(&__ip_vs_svc_lock);
1298 * Flush the service table hashed by fwmark
1300 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1301 list_for_each_entry_safe(svc, nxt,
1302 &ip_vs_svc_fwm_table[idx], f_list) {
1303 write_lock_bh(&__ip_vs_svc_lock);
1304 ip_vs_svc_unhash(svc);
1306 * Wait until all the svc users go away.
1308 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1309 __ip_vs_del_service(svc);
1310 write_unlock_bh(&__ip_vs_svc_lock);
1319 * Zero counters in a service or all services
1321 static int ip_vs_zero_service(struct ip_vs_service *svc)
1323 struct ip_vs_dest *dest;
1325 write_lock_bh(&__ip_vs_svc_lock);
1326 list_for_each_entry(dest, &svc->destinations, n_list) {
1327 ip_vs_zero_stats(&dest->stats);
1329 ip_vs_zero_stats(&svc->stats);
1330 write_unlock_bh(&__ip_vs_svc_lock);
1334 static int ip_vs_zero_all(void)
1337 struct ip_vs_service *svc;
1339 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1340 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1341 ip_vs_zero_service(svc);
1345 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1346 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1347 ip_vs_zero_service(svc);
1351 ip_vs_zero_stats(&ip_vs_stats);
1357 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1358 void __user *buffer, size_t *lenp, loff_t *ppos)
1360 int *valp = table->data;
1364 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1365 if (write && (*valp != val)) {
1366 if ((*valp < 0) || (*valp > 3)) {
1367 /* Restore the correct value */
1370 update_defense_level();
1378 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1379 void __user *buffer, size_t *lenp, loff_t *ppos)
1381 int *valp = table->data;
1385 /* backup the value first */
1386 memcpy(val, valp, sizeof(val));
1388 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1389 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1390 /* Restore the correct value */
1391 memcpy(valp, val, sizeof(val));
1398 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1401 static struct ctl_table vs_vars[] = {
1403 .ctl_name = NET_IPV4_VS_AMEMTHRESH,
1404 .procname = "amemthresh",
1405 .data = &sysctl_ip_vs_amemthresh,
1406 .maxlen = sizeof(int),
1408 .proc_handler = &proc_dointvec,
1410 #ifdef CONFIG_IP_VS_DEBUG
1412 .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
1413 .procname = "debug_level",
1414 .data = &sysctl_ip_vs_debug_level,
1415 .maxlen = sizeof(int),
1417 .proc_handler = &proc_dointvec,
1421 .ctl_name = NET_IPV4_VS_AMDROPRATE,
1422 .procname = "am_droprate",
1423 .data = &sysctl_ip_vs_am_droprate,
1424 .maxlen = sizeof(int),
1426 .proc_handler = &proc_dointvec,
1429 .ctl_name = NET_IPV4_VS_DROP_ENTRY,
1430 .procname = "drop_entry",
1431 .data = &sysctl_ip_vs_drop_entry,
1432 .maxlen = sizeof(int),
1434 .proc_handler = &proc_do_defense_mode,
1437 .ctl_name = NET_IPV4_VS_DROP_PACKET,
1438 .procname = "drop_packet",
1439 .data = &sysctl_ip_vs_drop_packet,
1440 .maxlen = sizeof(int),
1442 .proc_handler = &proc_do_defense_mode,
1445 .ctl_name = NET_IPV4_VS_SECURE_TCP,
1446 .procname = "secure_tcp",
1447 .data = &sysctl_ip_vs_secure_tcp,
1448 .maxlen = sizeof(int),
1450 .proc_handler = &proc_do_defense_mode,
1454 .ctl_name = NET_IPV4_VS_TO_ES,
1455 .procname = "timeout_established",
1456 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1457 .maxlen = sizeof(int),
1459 .proc_handler = &proc_dointvec_jiffies,
1462 .ctl_name = NET_IPV4_VS_TO_SS,
1463 .procname = "timeout_synsent",
1464 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1465 .maxlen = sizeof(int),
1467 .proc_handler = &proc_dointvec_jiffies,
1470 .ctl_name = NET_IPV4_VS_TO_SR,
1471 .procname = "timeout_synrecv",
1472 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1473 .maxlen = sizeof(int),
1475 .proc_handler = &proc_dointvec_jiffies,
1478 .ctl_name = NET_IPV4_VS_TO_FW,
1479 .procname = "timeout_finwait",
1480 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1481 .maxlen = sizeof(int),
1483 .proc_handler = &proc_dointvec_jiffies,
1486 .ctl_name = NET_IPV4_VS_TO_TW,
1487 .procname = "timeout_timewait",
1488 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1489 .maxlen = sizeof(int),
1491 .proc_handler = &proc_dointvec_jiffies,
1494 .ctl_name = NET_IPV4_VS_TO_CL,
1495 .procname = "timeout_close",
1496 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1497 .maxlen = sizeof(int),
1499 .proc_handler = &proc_dointvec_jiffies,
1502 .ctl_name = NET_IPV4_VS_TO_CW,
1503 .procname = "timeout_closewait",
1504 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1505 .maxlen = sizeof(int),
1507 .proc_handler = &proc_dointvec_jiffies,
1510 .ctl_name = NET_IPV4_VS_TO_LA,
1511 .procname = "timeout_lastack",
1512 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1513 .maxlen = sizeof(int),
1515 .proc_handler = &proc_dointvec_jiffies,
1518 .ctl_name = NET_IPV4_VS_TO_LI,
1519 .procname = "timeout_listen",
1520 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1521 .maxlen = sizeof(int),
1523 .proc_handler = &proc_dointvec_jiffies,
1526 .ctl_name = NET_IPV4_VS_TO_SA,
1527 .procname = "timeout_synack",
1528 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1529 .maxlen = sizeof(int),
1531 .proc_handler = &proc_dointvec_jiffies,
1534 .ctl_name = NET_IPV4_VS_TO_UDP,
1535 .procname = "timeout_udp",
1536 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1537 .maxlen = sizeof(int),
1539 .proc_handler = &proc_dointvec_jiffies,
1542 .ctl_name = NET_IPV4_VS_TO_ICMP,
1543 .procname = "timeout_icmp",
1544 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1545 .maxlen = sizeof(int),
1547 .proc_handler = &proc_dointvec_jiffies,
1551 .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
1552 .procname = "cache_bypass",
1553 .data = &sysctl_ip_vs_cache_bypass,
1554 .maxlen = sizeof(int),
1556 .proc_handler = &proc_dointvec,
1559 .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1560 .procname = "expire_nodest_conn",
1561 .data = &sysctl_ip_vs_expire_nodest_conn,
1562 .maxlen = sizeof(int),
1564 .proc_handler = &proc_dointvec,
1567 .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1568 .procname = "expire_quiescent_template",
1569 .data = &sysctl_ip_vs_expire_quiescent_template,
1570 .maxlen = sizeof(int),
1572 .proc_handler = &proc_dointvec,
1575 .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
1576 .procname = "sync_threshold",
1577 .data = &sysctl_ip_vs_sync_threshold,
1578 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1580 .proc_handler = &proc_do_sync_threshold,
1583 .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
1584 .procname = "nat_icmp_send",
1585 .data = &sysctl_ip_vs_nat_icmp_send,
1586 .maxlen = sizeof(int),
1588 .proc_handler = &proc_dointvec,
/*
 * sysctl tables for the IPVS tunables.  Each table below is tagged with
 * one .ctl_name (NET_IPV4_VS, NET_IPV4, CTL_NET); presumably they nest
 * to form one directory chain, but only the root -> ipvs_ipv4_table
 * link is visible in this excerpt.
 */
1593 static ctl_table vs_table[] = {
1595 .ctl_name = NET_IPV4_VS,
1603 static ctl_table ipvs_ipv4_table[] = {
1605 .ctl_name = NET_IPV4,
/* Root table: this is the one handed to register_sysctl_table(). */
1613 static ctl_table vs_root_table[] = {
1615 .ctl_name = CTL_NET,
1618 .child = ipvs_ipv4_table,
/* Value returned by register_sysctl_table(); later passed to unregister_sysctl_table(). */
1623 static struct ctl_table_header * sysctl_header;
1625 #ifdef CONFIG_PROC_FS
/*
 * Iterator state member: the seq_file callbacks set this to either
 * ip_vs_svc_table or ip_vs_svc_fwm_table to record which service hash
 * table is currently being walked.
 */
1628 struct list_head *table;
1633 * Write the contents of the VS rule table to a PROCfs file.
1634 * (It is kept just for backward compatibility)
/*
 * Translate the forwarding-method bits of a connection flag word into a
 * string; the result is printed with "%-7s" in the per-destination line of
 * the /proc listing.  The strings returned for each case are not visible
 * in this excerpt.
 */
1636 static inline const char *ip_vs_fwd_name(unsigned flags)
/* Only the bits selected by IP_VS_CONN_F_FWD_MASK are examined. */
1638 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1639 case IP_VS_CONN_F_LOCALNODE:
1641 case IP_VS_CONN_F_TUNNEL:
1643 case IP_VS_CONN_F_DROUTE:
1651 /* Get the Nth entry in the two lists */
/*
 * Walks every bucket of the protocol-hashed service table and then every
 * bucket of the fwmark-hashed table.  iter->table (in seq->private) is
 * set to the table being scanned so the other seq_file callbacks know
 * which list linkage (s_list or f_list) applies to the returned service.
 * NOTE(review): the position counting and return statements are not
 * visible in this excerpt.
 */
1652 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1654 struct ip_vs_iter *iter = seq->private;
1656 struct ip_vs_service *svc;
1658 /* look in hash by protocol */
1659 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1660 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1662 iter->table = ip_vs_svc_table;
1669 /* keep looking in fwmark */
1670 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1671 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1673 iter->table = ip_vs_svc_fwm_table;
/*
 * seq_file .start callback for the service listing.
 * Takes __ip_vs_svc_lock for reading with bottom halves disabled; the
 * matching unlock is in ip_vs_info_seq_stop().
 */
1683 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1686 read_lock_bh(&__ip_vs_svc_lock);
/* Position 0 is the header token; position N > 0 maps to service N-1. */
1687 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
/*
 * seq_file .next callback: step from the current element to the next
 * service, first through the protocol-hashed table and then through the
 * fwmark-hashed table, using iter->table and iter->bucket (seq->private)
 * to remember where the walk is.
 * NOTE(review): several statements (including the end-of-walk return)
 * are not visible in this excerpt.
 */
1691 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1693 struct list_head *e;
1694 struct ip_vs_iter *iter;
1695 struct ip_vs_service *svc;
/* Stepping off the header token: start with the first service. */
1698 if (v == SEQ_START_TOKEN)
1699 return ip_vs_info_array(seq,0);
1702 iter = seq->private;
1704 if (iter->table == ip_vs_svc_table) {
1705 /* next service in table hashed by protocol */
/* The bucket's list head marks the end of the current chain. */
1706 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1707 return list_entry(e, struct ip_vs_service, s_list);
/* Chain exhausted: look through the following buckets. */
1710 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1711 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1717 iter->table = ip_vs_svc_fwm_table;
1722 /* next service in hashed by fwmark */
1723 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1724 return list_entry(e, struct ip_vs_service, f_list);
1727 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1728 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
/* seq_file .stop callback: drops the read lock taken in ip_vs_info_seq_start(). */
1736 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1738 read_unlock_bh(&__ip_vs_svc_lock);
/*
 * seq_file .show callback: emit one element of the service listing.
 * For the start token it prints the version banner and the two column
 * header lines; for a service it prints one summary line followed by one
 * "  ->" line per entry on svc->destinations.
 */
1742 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1744 if (v == SEQ_START_TOKEN) {
1746 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1747 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1749 "Prot LocalAddress:Port Scheduler Flags\n");
1751 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1753 const struct ip_vs_service *svc = v;
1754 const struct ip_vs_iter *iter = seq->private;
1755 const struct ip_vs_dest *dest;
/* iter->table tells whether this service came from the protocol table or the fwmark table. */
1757 if (iter->table == ip_vs_svc_table)
1758 seq_printf(seq, "%s %08X:%04X %s ",
1759 ip_vs_proto_name(svc->protocol),
1762 svc->scheduler->name);
1764 seq_printf(seq, "FWM %08X %s ",
1765 svc->fwmark, svc->scheduler->name);
/* Persistent services get extra fields appended; others just end the line. */
1767 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1768 seq_printf(seq, "persistent %d %08X\n",
1770 ntohl(svc->netmask));
1772 seq_putc(seq, '\n');
/* Address and port are converted to host byte order for the hex output. */
1774 list_for_each_entry(dest, &svc->destinations, n_list) {
1776 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1777 ntohl(dest->addr), ntohs(dest->port),
1778 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1779 atomic_read(&dest->weight),
1780 atomic_read(&dest->activeconns),
1781 atomic_read(&dest->inactconns));
/* seq_file operations for the service listing; passed to seq_open() in ip_vs_info_open(). */
1787 static struct seq_operations ip_vs_info_seq_ops = {
1788 .start = ip_vs_info_seq_start,
1789 .next = ip_vs_info_seq_next,
1790 .stop = ip_vs_info_seq_stop,
1791 .show = ip_vs_info_seq_show,
/*
 * open() handler for the service listing file: allocates a struct
 * ip_vs_iter, opens the seq_file with ip_vs_info_seq_ops and zeroes the
 * iterator.
 * NOTE(review): the allocation-failure / seq_open()-failure handling and
 * the assignment of the iterator to seq->private are not visible in this
 * excerpt; presumably seq_release_private() in the fops frees it.
 */
1794 static int ip_vs_info_open(struct inode *inode, struct file *file)
1796 struct seq_file *seq;
1798 struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1803 rc = seq_open(file, &ip_vs_info_seq_ops);
/* seq_open() stored its struct seq_file in file->private_data. */
1807 seq = file->private_data;
1809 memset(s, 0, sizeof(*s));
/* File operations for the "ip_vs" proc entry created in ip_vs_control_init(). */
1817 static struct file_operations ip_vs_info_fops = {
1818 .owner = THIS_MODULE,
1819 .open = ip_vs_info_open,
1821 .llseek = seq_lseek,
1822 .release = seq_release_private,
/*
 * Global IPVS statistics.  Zeroed, lock-initialised and handed to
 * ip_vs_new_estimator() in ip_vs_control_init(); readers take
 * ip_vs_stats.lock.
 */
1827 struct ip_vs_stats ip_vs_stats;
1829 #ifdef CONFIG_PROC_FS
/*
 * single_open() show routine for the global statistics file.
 * Prints the totals (connections, packets, bytes) and then the rate
 * figures, all in hexadecimal, while holding ip_vs_stats.lock with
 * bottom halves disabled so both rows come from one consistent snapshot.
 */
1830 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1833 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1835 " Total Incoming Outgoing Incoming Outgoing\n");
1837 " Conns Packets Packets Bytes Bytes\n");
1839 spin_lock_bh(&ip_vs_stats.lock);
/* Byte counters are cast to unsigned long long to match the %LX conversions. */
1840 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1841 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1842 (unsigned long long) ip_vs_stats.inbytes,
1843 (unsigned long long) ip_vs_stats.outbytes);
1845 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1847 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1848 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1853 ip_vs_stats.outbps);
1854 spin_unlock_bh(&ip_vs_stats.lock);
/* open() handler for the statistics file: a single-record seq_file driven by ip_vs_stats_show(), with no private data. */
1859 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1861 return single_open(file, ip_vs_stats_show, NULL);
/* File operations for the "ip_vs_stats" proc entry created in ip_vs_control_init(). */
1864 static struct file_operations ip_vs_stats_fops = {
1865 .owner = THIS_MODULE,
1866 .open = ip_vs_stats_seq_open,
1868 .llseek = seq_lseek,
/* Pairs with single_open() in ip_vs_stats_seq_open(). */
1869 .release = single_release,
1875 * Set timeout values for tcp tcpfin udp in the timeout_table.
1877 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1879 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1884 #ifdef CONFIG_IP_VS_PROTO_TCP
1885 if (u->tcp_timeout) {
1886 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1887 = u->tcp_timeout * HZ;
1890 if (u->tcp_fin_timeout) {
1891 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1892 = u->tcp_fin_timeout * HZ;
1896 #ifdef CONFIG_IP_VS_PROTO_UDP
1897 if (u->udp_timeout) {
1898 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1899 = u->udp_timeout * HZ;
/*
 * Helpers for validating setsockopt() arguments.
 *
 * SET_CMDID() turns a command number into a zero-based index (offset from
 * IP_VS_BASE_CTL) for the set_arglen[] table; its argument is
 * parenthesized so that any expression may be passed safely.
 * The *_ARG_LEN macros give the exact user-buffer size each command
 * family expects; MAX_ARG_LEN is the largest of them and sizes the
 * on-stack argument buffer in do_ip_vs_set_ctl().
 */
#define SET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
				 sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
#define MAX_ARG_LEN		SVCDEST_ARG_LEN
/*
 * Expected user-buffer length for each setsockopt() command, indexed by
 * SET_CMDID(cmd).  do_ip_vs_set_ctl() rejects any request whose length
 * differs from the entry here.  Entries are unsigned char, so every
 * argument size must stay below 256 bytes.
 */
1914 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1915 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1916 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1917 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1918 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1919 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1920 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1921 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1922 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1923 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1924 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1925 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
/*
 * setsockopt() handler for the IPVS control commands.
 *
 * Requires CAP_NET_ADMIN.  The user buffer must be exactly
 * set_arglen[SET_CMDID(cmd)] bytes long; it is copied into the on-stack
 * 'arg' buffer and then interpreted according to cmd.  Command processing
 * is bracketed by ip_vs_use_count_inc()/ip_vs_use_count_dec() and runs
 * after __ip_vs_mutex has been taken with down_interruptible().
 *
 * NOTE(review): usvc->sched_name and dm->mcast_ifn come straight from user
 * space via copy_from_user(); no NUL-termination check is visible before
 * they are used as C strings (the "%s" in the error message below and the
 * start_sync_thread() call) — confirm and bound them if necessary.
 * NOTE(review): the error returns, goto labels and mutex release are not
 * visible in this excerpt.
 */
1929 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1932 unsigned char arg[MAX_ARG_LEN];
1933 struct ip_vs_service_user *usvc;
1934 struct ip_vs_service *svc;
1935 struct ip_vs_dest_user *udest;
1937 if (!capable(CAP_NET_ADMIN))
/* Exact-length check: this is also what keeps len within sizeof(arg). */
1940 if (len != set_arglen[SET_CMDID(cmd)]) {
1941 IP_VS_ERR("set_ctl: len %u != %u\n",
1942 len, set_arglen[SET_CMDID(cmd)]);
1946 if (copy_from_user(arg, user, len) != 0)
1949 /* increase the module use count */
1950 ip_vs_use_count_inc();
1952 if (down_interruptible(&__ip_vs_mutex)) {
/* Commands that do not address a particular service are handled first. */
1957 if (cmd == IP_VS_SO_SET_FLUSH) {
1958 /* Flush the virtual service */
1959 ret = ip_vs_flush();
1961 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1962 /* Set timeout values for (tcp tcpfin udp) */
1963 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1965 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1966 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1967 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1969 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1970 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1971 ret = stop_sync_thread(dm->state);
/* Remaining commands: arg holds a service description, followed (for the *DEST commands) by a destination description. */
1975 usvc = (struct ip_vs_service_user *)arg;
1976 udest = (struct ip_vs_dest_user *)(usvc + 1);
1978 if (cmd == IP_VS_SO_SET_ZERO) {
1979 /* if no service address is set, zero counters in all */
1980 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1981 ret = ip_vs_zero_all();
1986 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1987 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1988 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1989 usvc->protocol, NIPQUAD(usvc->addr),
1990 ntohs(usvc->port), usvc->sched_name);
1995 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1996 if (usvc->fwmark == 0)
1997 svc = __ip_vs_service_get(usvc->protocol,
1998 usvc->addr, usvc->port);
2000 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
/* Every command except ADD needs an existing service whose protocol matches the request. */
2002 if (cmd != IP_VS_SO_SET_ADD
2003 && (svc == NULL || svc->protocol != usvc->protocol)) {
2009 case IP_VS_SO_SET_ADD:
2013 ret = ip_vs_add_service(usvc, &svc);
2015 case IP_VS_SO_SET_EDIT:
2016 ret = ip_vs_edit_service(svc, usvc);
2018 case IP_VS_SO_SET_DEL:
2019 ret = ip_vs_del_service(svc);
2023 case IP_VS_SO_SET_ZERO:
2024 ret = ip_vs_zero_service(svc);
2026 case IP_VS_SO_SET_ADDDEST:
2027 ret = ip_vs_add_dest(svc, udest);
2029 case IP_VS_SO_SET_EDITDEST:
2030 ret = ip_vs_edit_dest(svc, udest);
2032 case IP_VS_SO_SET_DELDEST:
2033 ret = ip_vs_del_dest(svc, udest);
/* Release the service obtained by the lookup above. */
2040 ip_vs_service_put(svc);
2045 /* decrease the module use count */
2046 ip_vs_use_count_dec();
2053 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2055 spin_lock_bh(&src->lock);
2056 memcpy(dst, src, (char*)&src->lock - (char*)src);
2057 spin_unlock_bh(&src->lock);
2061 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2063 dst->protocol = src->protocol;
2064 dst->addr = src->addr;
2065 dst->port = src->port;
2066 dst->fwmark = src->fwmark;
2067 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2068 dst->flags = src->flags;
2069 dst->timeout = src->timeout / HZ;
2070 dst->netmask = src->netmask;
2071 dst->num_dests = src->num_dests;
2072 ip_vs_copy_stats(&dst->stats, &src->stats);
/*
 * Copy service entries to the user buffer @uptr, first from the
 * protocol-hashed table and then from the fwmark-hashed table, stopping
 * once get->num_services entries have been produced.  Each entry is
 * zeroed before it is filled so that no stale stack bytes reach user
 * space.
 * NOTE(review): the error code used when copy_to_user() fails and the
 * final return are not visible in this excerpt.
 */
2076 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2077 struct ip_vs_get_services __user *uptr)
2080 struct ip_vs_service *svc;
2081 struct ip_vs_service_entry entry;
2084 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2085 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2086 if (count >= get->num_services)
2088 memset(&entry, 0, sizeof(entry));
2089 ip_vs_copy_service(&entry, svc);
2090 if (copy_to_user(&uptr->entrytable[count],
2091 &entry, sizeof(entry))) {
/* Second pass: services identified by firewall mark. */
2099 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2100 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2101 if (count >= get->num_services)
2103 memset(&entry, 0, sizeof(entry));
2104 ip_vs_copy_service(&entry, svc);
2105 if (copy_to_user(&uptr->entrytable[count],
2106 &entry, sizeof(entry))) {
2118 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2119 struct ip_vs_get_dests __user *uptr)
2121 struct ip_vs_service *svc;
2125 svc = __ip_vs_svc_fwm_get(get->fwmark);
2127 svc = __ip_vs_service_get(get->protocol,
2128 get->addr, get->port);
2131 struct ip_vs_dest *dest;
2132 struct ip_vs_dest_entry entry;
2134 list_for_each_entry(dest, &svc->destinations, n_list) {
2135 if (count >= get->num_dests)
2138 entry.addr = dest->addr;
2139 entry.port = dest->port;
2140 entry.conn_flags = atomic_read(&dest->conn_flags);
2141 entry.weight = atomic_read(&dest->weight);
2142 entry.u_threshold = dest->u_threshold;
2143 entry.l_threshold = dest->l_threshold;
2144 entry.activeconns = atomic_read(&dest->activeconns);
2145 entry.inactconns = atomic_read(&dest->inactconns);
2146 entry.persistconns = atomic_read(&dest->persistconns);
2147 ip_vs_copy_stats(&entry.stats, &dest->stats);
2148 if (copy_to_user(&uptr->entrytable[count],
2149 &entry, sizeof(entry))) {
2155 ip_vs_service_put(svc);
2162 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2164 #ifdef CONFIG_IP_VS_PROTO_TCP
2166 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2167 u->tcp_fin_timeout =
2168 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2170 #ifdef CONFIG_IP_VS_PROTO_UDP
2172 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
/*
 * Helpers for validating getsockopt() arguments.
 *
 * GET_CMDID() turns a command number into a zero-based index (offset from
 * IP_VS_BASE_CTL) for the get_arglen[] table; its argument is
 * parenthesized so that any expression may be passed safely.
 * The GET_*_ARG_LEN macros give the minimum request size per command;
 * the daemon query reports two ip_vs_daemon_user records.
 */
#define GET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)
/*
 * Minimum request length for each getsockopt() command, indexed by
 * GET_CMDID(cmd).  do_ip_vs_get_ctl() rejects shorter requests and copies
 * exactly this many bytes from user space.  Entries are unsigned char, so
 * every value must stay below 256.
 */
2185 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2186 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2187 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2188 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2189 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2190 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2191 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2192 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
/*
 * getsockopt() handler for the IPVS query commands.
 *
 * Requires CAP_NET_ADMIN.  *len must be at least
 * get_arglen[GET_CMDID(cmd)]; that many bytes of the request are copied
 * into the on-stack 'arg' buffer, __ip_vs_mutex is taken with
 * down_interruptible() (-ERESTARTSYS if interrupted), and cmd is then
 * dispatched.  Results are written back to @user with copy_to_user().
 *
 * NOTE(review): 'arg' is 128 bytes while the copy length comes from
 * get_arglen[]; no explicit check against sizeof(arg) is visible here —
 * confirm every table entry is <= 128.
 * NOTE(review): the error returns, 'break' statements and mutex release
 * are not visible in this excerpt.
 */
2196 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2198 unsigned char arg[128];
2201 if (!capable(CAP_NET_ADMIN))
2204 if (*len < get_arglen[GET_CMDID(cmd)]) {
2205 IP_VS_ERR("get_ctl: len %u < %u\n",
2206 *len, get_arglen[GET_CMDID(cmd)]);
2210 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2213 if (down_interruptible(&__ip_vs_mutex))
2214 return -ERESTARTSYS;
/* Version banner as a NUL-terminated string; *len is updated to its length including the terminator. */
2217 case IP_VS_SO_GET_VERSION:
2221 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2222 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2223 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2227 *len = strlen(buf)+1;
/* Version code, connection table size and current number of services. */
2231 case IP_VS_SO_GET_INFO:
2233 struct ip_vs_getinfo info;
2234 info.version = IP_VS_VERSION_CODE;
2235 info.size = IP_VS_CONN_TAB_SIZE;
2236 info.num_services = ip_vs_num_services;
2237 if (copy_to_user(user, &info, sizeof(info)) != 0)
/* Service table dump: the expected size is the header plus num_services entries and is compared against *len. */
2242 case IP_VS_SO_GET_SERVICES:
2244 struct ip_vs_get_services *get;
2247 get = (struct ip_vs_get_services *)arg;
2248 size = sizeof(*get) +
2249 sizeof(struct ip_vs_service_entry) * get->num_services;
2251 IP_VS_ERR("length: %u != %u\n", *len, size);
2255 ret = __ip_vs_get_service_entries(get, user);
/* Single service: the request entry names the service and is overwritten in place with the result. */
2259 case IP_VS_SO_GET_SERVICE:
2261 struct ip_vs_service_entry *entry;
2262 struct ip_vs_service *svc;
2264 entry = (struct ip_vs_service_entry *)arg;
2266 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2268 svc = __ip_vs_service_get(entry->protocol,
2269 entry->addr, entry->port);
2271 ip_vs_copy_service(entry, svc);
2272 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2274 ip_vs_service_put(svc);
/* Destination dump for one service: same size check pattern as the service table dump. */
2280 case IP_VS_SO_GET_DESTS:
2282 struct ip_vs_get_dests *get;
2285 get = (struct ip_vs_get_dests *)arg;
2286 size = sizeof(*get) +
2287 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2289 IP_VS_ERR("length: %u != %u\n", *len, size);
2293 ret = __ip_vs_get_dest_entries(get, user);
2297 case IP_VS_SO_GET_TIMEOUT:
2299 struct ip_vs_timeout_user t;
2301 __ip_vs_get_timeouts(&t);
2302 if (copy_to_user(user, &t, sizeof(t)) != 0)
/* Sync daemon state: slot 0 describes the master, slot 1 the backup; slots for inactive roles stay zeroed. */
2307 case IP_VS_SO_GET_DAEMON:
2309 struct ip_vs_daemon_user d[2];
2311 memset(&d, 0, sizeof(d));
2312 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2313 d[0].state = IP_VS_STATE_MASTER;
2314 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2315 d[0].syncid = ip_vs_master_syncid;
2317 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2318 d[1].state = IP_VS_STATE_BACKUP;
2319 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2320 d[1].syncid = ip_vs_backup_syncid;
2322 if (copy_to_user(user, &d, sizeof(d)) != 0)
/*
 * Socket-option registration record handed to nf_register_sockopt().
 * Both ranges start at IP_VS_BASE_CTL; the max values are one past the
 * highest command (IP_VS_SO_SET_MAX / IP_VS_SO_GET_MAX), which suggests
 * an exclusive upper bound — confirm against the nf_sockopt API.
 */
2337 static struct nf_sockopt_ops ip_vs_sockopts = {
2339 .set_optmin = IP_VS_BASE_CTL,
2340 .set_optmax = IP_VS_SO_SET_MAX+1,
2341 .set = do_ip_vs_set_ctl,
2342 .get_optmin = IP_VS_BASE_CTL,
2343 .get_optmax = IP_VS_SO_GET_MAX+1,
2344 .get = do_ip_vs_get_ctl,
/*
 * Set up the IPVS control plane: register the sockopt interface, create
 * the "ip_vs" and "ip_vs_stats" proc entries, register the sysctl tree,
 * initialise the service and real-server hash tables and the global
 * statistics, and start the periodic defense work.
 * NOTE(review): in the visible lines the results of
 * proc_net_fops_create() and register_sysctl_table() are not checked.
 */
2348 int ip_vs_control_init(void)
2355 ret = nf_register_sockopt(&ip_vs_sockopts);
2357 IP_VS_ERR("cannot register sockopt.\n");
2361 proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2362 proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2364 sysctl_header = register_sysctl_table(vs_root_table, 0);
2366 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2367 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2368 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2369 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2371 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2372 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
/* Global statistics: clear, initialise the lock, then attach a rate estimator. */
2375 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2376 spin_lock_init(&ip_vs_stats.lock);
2377 ip_vs_new_estimator(&ip_vs_stats);
2379 /* Hook the defense timer */
2380 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
/*
 * Tear down what ip_vs_control_init() set up, in reverse order: stop the
 * defense work, kill the statistics estimator, unregister the sysctl
 * tree, remove both proc entries and finally unregister the sockopt
 * interface.  ip_vs_trash_cleanup() runs first.
 */
2387 void ip_vs_control_cleanup(void)
2390 ip_vs_trash_cleanup();
2391 cancel_rearming_delayed_work(&defense_work);
2392 ip_vs_kill_estimator(&ip_vs_stats);
2393 unregister_sysctl_table(sysctl_header);
2394 proc_net_remove("ip_vs_stats");
2395 proc_net_remove("ip_vs");
2396 nf_unregister_sockopt(&ip_vs_sockopts);