2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/types.h>
27 #include <linux/sysctl.h>
28 #include <linux/proc_fs.h>
29 #include <linux/workqueue.h>
30 #include <linux/swap.h>
31 #include <linux/proc_fs.h>
32 #include <linux/seq_file.h>
34 #include <linux/netfilter.h>
35 #include <linux/netfilter_ipv4.h>
40 #include <asm/uaccess.h>
42 #include <net/ip_vs.h>
/*
 * Semaphore for IPVS sockopts.  A sleeping lock is used because
 * [gs]etsockopt may sleep.
 */
static DECLARE_MUTEX(__ip_vs_mutex);

/*
 * Lock for the service tables.  Lookups in ip_vs_service_get() take it
 * for reading; the add/edit/delete/flush/zero paths take it for writing
 * with bottom halves disabled.
 */
static DEFINE_RWLOCK(__ip_vs_svc_lock);

/* lock for the table with the real services (ip_vs_rtable) */
static DEFINE_RWLOCK(__ip_vs_rs_lock);

/*
 * Lock for state and timeout tables; write-held by update_defense_level()
 * while it evaluates secure_tcp and calls ip_vs_protocol_timeout_change().
 */
static DEFINE_RWLOCK(__ip_vs_securetcp_lock);

/* lock for drop entry handling (drop_entry sysctl / ip_vs_dropentry) */
static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);

/* lock for drop packet handling (drop_packet sysctl / ip_vs_drop_rate) */
static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
62 /* 1/rate drop and drop-entry variables */
63 int ip_vs_drop_rate = 0;
64 int ip_vs_drop_counter = 0;
65 static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
67 /* number of virtual services */
68 static int ip_vs_num_services = 0;
/*
 * sysctl variables (exposed through the vs_vars table below).
 * drop_entry, drop_packet and secure_tcp are the defense modes evaluated
 * by update_defense_level(); amemthresh is the available-memory threshold
 * (in pages) it compares against, and am_droprate is the drop rate it
 * copies into ip_vs_drop_rate.
 */
static int sysctl_ip_vs_drop_entry = 0;
static int sysctl_ip_vs_drop_packet = 0;
static int sysctl_ip_vs_secure_tcp = 0;
static int sysctl_ip_vs_amemthresh = 1024;
static int sysctl_ip_vs_am_droprate = 10;
int sysctl_ip_vs_cache_bypass = 0;
int sysctl_ip_vs_expire_nodest_conn = 0;
int sysctl_ip_vs_expire_quiescent_template = 0;
/* two values; proc_do_sync_threshold() rejects writes unless 0 <= [0] < [1] */
int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
int sysctl_ip_vs_nat_icmp_send = 0;
#ifdef CONFIG_IP_VS_DEBUG
/* debug verbosity, exposed as the "debug_level" sysctl */
static int sysctl_ip_vs_debug_level = 0;

/*
 * Returns the current value of the debug_level sysctl.
 * Only built when CONFIG_IP_VS_DEBUG is defined.
 */
int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
#endif
93 * update_defense_level is called from keventd and from sysctl.
95 static void update_defense_level(void)
98 static int old_secure_tcp = 0;
103 /* we only count free and buffered memory (in pages) */
105 availmem = i.freeram + i.bufferram;
106 /* however in linux 2.5 the i.bufferram is total page cache size,
108 /* si_swapinfo(&i); */
109 /* availmem = availmem - (i.totalswap - i.freeswap); */
111 nomem = (availmem < sysctl_ip_vs_amemthresh);
114 spin_lock(&__ip_vs_dropentry_lock);
115 switch (sysctl_ip_vs_drop_entry) {
117 atomic_set(&ip_vs_dropentry, 0);
121 atomic_set(&ip_vs_dropentry, 1);
122 sysctl_ip_vs_drop_entry = 2;
124 atomic_set(&ip_vs_dropentry, 0);
129 atomic_set(&ip_vs_dropentry, 1);
131 atomic_set(&ip_vs_dropentry, 0);
132 sysctl_ip_vs_drop_entry = 1;
136 atomic_set(&ip_vs_dropentry, 1);
139 spin_unlock(&__ip_vs_dropentry_lock);
142 spin_lock(&__ip_vs_droppacket_lock);
143 switch (sysctl_ip_vs_drop_packet) {
149 ip_vs_drop_rate = ip_vs_drop_counter
150 = sysctl_ip_vs_amemthresh /
151 (sysctl_ip_vs_amemthresh-availmem);
152 sysctl_ip_vs_drop_packet = 2;
159 ip_vs_drop_rate = ip_vs_drop_counter
160 = sysctl_ip_vs_amemthresh /
161 (sysctl_ip_vs_amemthresh-availmem);
164 sysctl_ip_vs_drop_packet = 1;
168 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
171 spin_unlock(&__ip_vs_droppacket_lock);
174 write_lock(&__ip_vs_securetcp_lock);
175 switch (sysctl_ip_vs_secure_tcp) {
177 if (old_secure_tcp >= 2)
182 if (old_secure_tcp < 2)
184 sysctl_ip_vs_secure_tcp = 2;
186 if (old_secure_tcp >= 2)
192 if (old_secure_tcp < 2)
195 if (old_secure_tcp >= 2)
197 sysctl_ip_vs_secure_tcp = 1;
201 if (old_secure_tcp < 2)
205 old_secure_tcp = sysctl_ip_vs_secure_tcp;
207 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
208 write_unlock(&__ip_vs_securetcp_lock);
/*
 *	Timer for checking the defense
 *
 *	Period of the defense check, in jiffies (one second).  The
 *	replacement list is parenthesised so that the macro expands to a
 *	single operand wherever it is used (e.g. x / DEFENSE_TIMER_PERIOD).
 */
#define DEFENSE_TIMER_PERIOD	(1*HZ)
216 static void defense_work_handler(void *data);
217 static DECLARE_WORK(defense_work, defense_work_handler, NULL);
219 static void defense_work_handler(void *data)
221 update_defense_level();
222 if (atomic_read(&ip_vs_dropentry))
223 ip_vs_random_dropentry();
225 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
229 ip_vs_use_count_inc(void)
231 return try_module_get(THIS_MODULE);
235 ip_vs_use_count_dec(void)
237 module_put(THIS_MODULE);
/*
 *	Hash table: for virtual service lookups
 *
 *	The table has 2^IP_VS_SVC_TAB_BITS buckets; IP_VS_SVC_TAB_MASK
 *	reduces a hash value to a valid bucket index.
 */
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
248 /* the service table hashed by <protocol, addr, port> */
249 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
250 /* the service table hashed by fwmark */
251 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
/*
 *	Hash table: for real service lookups
 *
 *	The table has 2^IP_VS_RTAB_BITS buckets; IP_VS_RTAB_MASK reduces
 *	a hash value to a valid bucket index.
 */
#define IP_VS_RTAB_BITS 4
#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
260 static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
263 * Trash for destinations
265 static LIST_HEAD(ip_vs_dest_trash);
268 * FTP & NULL virtual service counters
270 static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
271 static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
275 * Returns hash value for virtual service
277 static __inline__ unsigned
278 ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
280 register unsigned porth = ntohs(port);
282 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
283 & IP_VS_SVC_TAB_MASK;
287 * Returns hash value of fwmark for virtual service lookup
289 static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
291 return fwmark & IP_VS_SVC_TAB_MASK;
295 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
296 * or in the ip_vs_svc_fwm_table by fwmark.
297 * Should be called with locked tables.
299 static int ip_vs_svc_hash(struct ip_vs_service *svc)
303 if (svc->flags & IP_VS_SVC_F_HASHED) {
304 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
305 "called from %p\n", __builtin_return_address(0));
309 if (svc->fwmark == 0) {
311 * Hash it by <protocol,addr,port> in ip_vs_svc_table
313 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
314 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
317 * Hash it by fwmark in ip_vs_svc_fwm_table
319 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
320 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
323 svc->flags |= IP_VS_SVC_F_HASHED;
324 /* increase its refcnt because it is referenced by the svc table */
325 atomic_inc(&svc->refcnt);
331 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
332 * Should be called with locked tables.
334 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
336 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
337 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
338 "called from %p\n", __builtin_return_address(0));
342 if (svc->fwmark == 0) {
343 /* Remove it from the ip_vs_svc_table table */
344 list_del(&svc->s_list);
346 /* Remove it from the ip_vs_svc_fwm_table table */
347 list_del(&svc->f_list);
350 svc->flags &= ~IP_VS_SVC_F_HASHED;
351 atomic_dec(&svc->refcnt);
357 * Get service by {proto,addr,port} in the service table.
359 static __inline__ struct ip_vs_service *
360 __ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
363 struct ip_vs_service *svc;
365 /* Check for "full" addressed entries */
366 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
368 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
369 if ((svc->addr == vaddr)
370 && (svc->port == vport)
371 && (svc->protocol == protocol)) {
373 atomic_inc(&svc->usecnt);
383 * Get service by {fwmark} in the service table.
385 static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
388 struct ip_vs_service *svc;
390 /* Check for fwmark addressed entries */
391 hash = ip_vs_svc_fwm_hashkey(fwmark);
393 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
394 if (svc->fwmark == fwmark) {
396 atomic_inc(&svc->usecnt);
404 struct ip_vs_service *
405 ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
407 struct ip_vs_service *svc;
409 read_lock(&__ip_vs_svc_lock);
412 * Check the table hashed by fwmark first
414 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
418 * Check the table hashed by <protocol,addr,port>
419 * for "full" addressed entries
421 svc = __ip_vs_service_get(protocol, vaddr, vport);
424 && protocol == IPPROTO_TCP
425 && atomic_read(&ip_vs_ftpsvc_counter)
426 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
428 * Check if ftp service entry exists, the packet
429 * might belong to FTP data connections.
431 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
435 && atomic_read(&ip_vs_nullsvc_counter)) {
437 * Check if the catch-all port (port zero) exists
439 svc = __ip_vs_service_get(protocol, vaddr, 0);
443 read_unlock(&__ip_vs_svc_lock);
445 IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
446 fwmark, ip_vs_proto_name(protocol),
447 NIPQUAD(vaddr), ntohs(vport),
448 svc?"hit":"not hit");
455 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
457 atomic_inc(&svc->refcnt);
462 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
464 struct ip_vs_service *svc = dest->svc;
467 if (atomic_dec_and_test(&svc->refcnt))
473 * Returns hash value for real service
475 static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
477 register unsigned porth = ntohs(port);
479 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
/*
 *	Hashes ip_vs_dest in ip_vs_rtable by <addr,port>.
 *	(The protocol is not part of the hash key; it is compared when
 *	the table is searched.)
 *	Should be called with locked tables.
 */
487 static int ip_vs_rs_hash(struct ip_vs_dest *dest)
491 if (!list_empty(&dest->d_list)) {
496 * Hash by proto,addr,port,
497 * which are the parameters of the real service.
499 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
500 list_add(&dest->d_list, &ip_vs_rtable[hash]);
/*
 *	Unhashes ip_vs_dest from ip_vs_rtable.
 *	Should be called with locked tables.
 */
509 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
512 * Remove it from the ip_vs_rtable table.
514 if (!list_empty(&dest->d_list)) {
515 list_del(&dest->d_list);
516 INIT_LIST_HEAD(&dest->d_list);
523 * Lookup real service by <proto,addr,port> in the real service table.
526 ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
529 struct ip_vs_dest *dest;
532 * Check for "full" addressed entries
533 * Return the first found entry
535 hash = ip_vs_rs_hashkey(daddr, dport);
537 read_lock(&__ip_vs_rs_lock);
538 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
539 if ((dest->addr == daddr)
540 && (dest->port == dport)
541 && ((dest->protocol == protocol) ||
544 read_unlock(&__ip_vs_rs_lock);
548 read_unlock(&__ip_vs_rs_lock);
554 * Lookup destination by {addr,port} in the given service
556 static struct ip_vs_dest *
557 ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
559 struct ip_vs_dest *dest;
562 * Find the destination for the given service
564 list_for_each_entry(dest, &svc->destinations, n_list) {
565 if ((dest->addr == daddr) && (dest->port == dport)) {
576 * Lookup dest by {svc,addr,port} in the destination trash.
577 * The destination trash is used to hold the destinations that are removed
578 * from the service table but are still referenced by some conn entries.
579 * The reason to add the destination trash is when the dest is temporary
580 * down (either by administrator or by monitor program), the dest can be
581 * picked back from the trash, the remaining connections to the dest can
582 * continue, and the counting information of the dest is also useful for
585 static struct ip_vs_dest *
586 ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
588 struct ip_vs_dest *dest, *nxt;
591 * Find the destination in trash
593 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
594 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
597 NIPQUAD(dest->addr), ntohs(dest->port),
598 atomic_read(&dest->refcnt));
599 if (dest->addr == daddr &&
600 dest->port == dport &&
601 dest->vfwmark == svc->fwmark &&
602 dest->protocol == svc->protocol &&
604 (dest->vaddr == svc->addr &&
605 dest->vport == svc->port))) {
611 * Try to purge the destination from trash if not referenced
613 if (atomic_read(&dest->refcnt) == 1) {
614 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
617 NIPQUAD(dest->addr), ntohs(dest->port));
618 list_del(&dest->n_list);
619 ip_vs_dst_reset(dest);
620 __ip_vs_unbind_svc(dest);
/*
 *	Clean up all the destinations in the trash.
 *	Called by ip_vs_control_cleanup().
 *
 *	When ip_vs_control_cleanup() is run at ipvs module exit, the
 *	service tables must already have been flushed and all the
 *	connections must have expired, so the refcnt of each destination
 *	in the trash must be 1; we simply release them here.
 */
638 static void ip_vs_trash_cleanup(void)
640 struct ip_vs_dest *dest, *nxt;
642 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
643 list_del(&dest->n_list);
644 ip_vs_dst_reset(dest);
645 __ip_vs_unbind_svc(dest);
652 ip_vs_zero_stats(struct ip_vs_stats *stats)
654 spin_lock_bh(&stats->lock);
655 memset(stats, 0, (char *)&stats->lock - (char *)stats);
656 spin_unlock_bh(&stats->lock);
657 ip_vs_zero_estimator(stats);
661 * Update a destination in the given service
664 __ip_vs_update_dest(struct ip_vs_service *svc,
665 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
669 /* set the weight and the flags */
670 atomic_set(&dest->weight, udest->weight);
671 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
673 /* check if local node and update the flags */
674 if (inet_addr_type(udest->addr) == RTN_LOCAL) {
675 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
676 | IP_VS_CONN_F_LOCALNODE;
679 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
680 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
681 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
684 * Put the real service in ip_vs_rtable if not present.
685 * For now only for NAT!
687 write_lock_bh(&__ip_vs_rs_lock);
689 write_unlock_bh(&__ip_vs_rs_lock);
691 atomic_set(&dest->conn_flags, conn_flags);
693 /* bind the service */
695 __ip_vs_bind_svc(dest, svc);
697 if (dest->svc != svc) {
698 __ip_vs_unbind_svc(dest);
699 ip_vs_zero_stats(&dest->stats);
700 __ip_vs_bind_svc(dest, svc);
704 /* set the dest status flags */
705 dest->flags |= IP_VS_DEST_F_AVAILABLE;
707 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
708 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
709 dest->u_threshold = udest->u_threshold;
710 dest->l_threshold = udest->l_threshold;
715 * Create a destination for the given service
718 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
719 struct ip_vs_dest **dest_p)
721 struct ip_vs_dest *dest;
726 atype = inet_addr_type(udest->addr);
727 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
730 dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
732 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
735 memset(dest, 0, sizeof(struct ip_vs_dest));
737 dest->protocol = svc->protocol;
738 dest->vaddr = svc->addr;
739 dest->vport = svc->port;
740 dest->vfwmark = svc->fwmark;
741 dest->addr = udest->addr;
742 dest->port = udest->port;
744 atomic_set(&dest->activeconns, 0);
745 atomic_set(&dest->inactconns, 0);
746 atomic_set(&dest->persistconns, 0);
747 atomic_set(&dest->refcnt, 0);
749 INIT_LIST_HEAD(&dest->d_list);
750 spin_lock_init(&dest->dst_lock);
751 spin_lock_init(&dest->stats.lock);
752 __ip_vs_update_dest(svc, dest, udest);
753 ip_vs_new_estimator(&dest->stats);
763 * Add a destination into an existing service
766 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
768 struct ip_vs_dest *dest;
769 __u32 daddr = udest->addr;
770 __u16 dport = udest->port;
775 if (udest->weight < 0) {
776 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
780 if (udest->l_threshold > udest->u_threshold) {
781 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
782 "upper threshold\n");
787 * Check if the dest already exists in the list
789 dest = ip_vs_lookup_dest(svc, daddr, dport);
791 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
796 * Check if the dest already exists in the trash and
797 * is from the same service
799 dest = ip_vs_trash_get_dest(svc, daddr, dport);
801 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
802 "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
803 NIPQUAD(daddr), ntohs(dport),
804 atomic_read(&dest->refcnt),
806 NIPQUAD(dest->vaddr),
808 __ip_vs_update_dest(svc, dest, udest);
811 * Get the destination from the trash
813 list_del(&dest->n_list);
815 ip_vs_new_estimator(&dest->stats);
817 write_lock_bh(&__ip_vs_svc_lock);
820 * Wait until all other svc users go away.
822 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
824 list_add(&dest->n_list, &svc->destinations);
827 /* call the update_service function of its scheduler */
828 svc->scheduler->update_service(svc);
830 write_unlock_bh(&__ip_vs_svc_lock);
835 * Allocate and initialize the dest structure
837 ret = ip_vs_new_dest(svc, udest, &dest);
843 * Add the dest entry into the list
845 atomic_inc(&dest->refcnt);
847 write_lock_bh(&__ip_vs_svc_lock);
850 * Wait until all other svc users go away.
852 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
854 list_add(&dest->n_list, &svc->destinations);
857 /* call the update_service function of its scheduler */
858 svc->scheduler->update_service(svc);
860 write_unlock_bh(&__ip_vs_svc_lock);
869 * Edit a destination in the given service
872 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
874 struct ip_vs_dest *dest;
875 __u32 daddr = udest->addr;
876 __u16 dport = udest->port;
880 if (udest->weight < 0) {
881 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
885 if (udest->l_threshold > udest->u_threshold) {
886 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
887 "upper threshold\n");
892 * Lookup the destination list
894 dest = ip_vs_lookup_dest(svc, daddr, dport);
896 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
900 __ip_vs_update_dest(svc, dest, udest);
902 write_lock_bh(&__ip_vs_svc_lock);
904 /* Wait until all other svc users go away */
905 while (atomic_read(&svc->usecnt) > 1) {};
907 /* call the update_service, because server weight may be changed */
908 svc->scheduler->update_service(svc);
910 write_unlock_bh(&__ip_vs_svc_lock);
919 * Delete a destination (must be already unlinked from the service)
921 static void __ip_vs_del_dest(struct ip_vs_dest *dest)
923 ip_vs_kill_estimator(&dest->stats);
926 * Remove it from the d-linked list with the real services.
928 write_lock_bh(&__ip_vs_rs_lock);
929 ip_vs_rs_unhash(dest);
930 write_unlock_bh(&__ip_vs_rs_lock);
933 * Decrease the refcnt of the dest, and free the dest
934 * if nobody refers to it (refcnt=0). Otherwise, throw
935 * the destination into the trash.
937 if (atomic_dec_and_test(&dest->refcnt)) {
938 ip_vs_dst_reset(dest);
939 /* simply decrease svc->refcnt here, let the caller check
940 and release the service if nobody refers to it.
941 Only user context can release destination and service,
942 and only one user context can update virtual service at a
943 time, so the operation here is OK */
944 atomic_dec(&dest->svc->refcnt);
947 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
948 NIPQUAD(dest->addr), ntohs(dest->port),
949 atomic_read(&dest->refcnt));
950 list_add(&dest->n_list, &ip_vs_dest_trash);
951 atomic_inc(&dest->refcnt);
957 * Unlink a destination from the given service
959 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
960 struct ip_vs_dest *dest,
963 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
966 * Remove it from the d-linked destination list.
968 list_del(&dest->n_list);
972 * Call the update_service function of its scheduler
974 svc->scheduler->update_service(svc);
980 * Delete a destination server in the given service
983 ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
985 struct ip_vs_dest *dest;
986 __u32 daddr = udest->addr;
987 __u16 dport = udest->port;
991 dest = ip_vs_lookup_dest(svc, daddr, dport);
993 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
997 write_lock_bh(&__ip_vs_svc_lock);
1000 * Wait until all other svc users go away.
1002 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1005 * Unlink dest from the service
1007 __ip_vs_unlink_dest(svc, dest, 1);
1009 write_unlock_bh(&__ip_vs_svc_lock);
1012 * Delete the destination
1014 __ip_vs_del_dest(dest);
1023 * Add a service into the service hash table
1026 ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1029 struct ip_vs_scheduler *sched = NULL;
1030 struct ip_vs_service *svc = NULL;
1032 /* increase the module use count */
1033 ip_vs_use_count_inc();
1035 /* Lookup the scheduler by 'u->sched_name' */
1036 sched = ip_vs_scheduler_get(u->sched_name);
1037 if (sched == NULL) {
1038 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1044 svc = (struct ip_vs_service *)
1045 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1047 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1051 memset(svc, 0, sizeof(struct ip_vs_service));
1053 /* I'm the first user of the service */
1054 atomic_set(&svc->usecnt, 1);
1055 atomic_set(&svc->refcnt, 0);
1057 svc->protocol = u->protocol;
1058 svc->addr = u->addr;
1059 svc->port = u->port;
1060 svc->fwmark = u->fwmark;
1061 svc->flags = u->flags;
1062 svc->timeout = u->timeout * HZ;
1063 svc->netmask = u->netmask;
1065 INIT_LIST_HEAD(&svc->destinations);
1066 rwlock_init(&svc->sched_lock);
1067 spin_lock_init(&svc->stats.lock);
1069 /* Bind the scheduler */
1070 ret = ip_vs_bind_scheduler(svc, sched);
1075 /* Update the virtual service counters */
1076 if (svc->port == FTPPORT)
1077 atomic_inc(&ip_vs_ftpsvc_counter);
1078 else if (svc->port == 0)
1079 atomic_inc(&ip_vs_nullsvc_counter);
1081 ip_vs_new_estimator(&svc->stats);
1082 ip_vs_num_services++;
1084 /* Hash the service into the service table */
1085 write_lock_bh(&__ip_vs_svc_lock);
1086 ip_vs_svc_hash(svc);
1087 write_unlock_bh(&__ip_vs_svc_lock);
1095 ip_vs_unbind_scheduler(svc);
1098 ip_vs_app_inc_put(svc->inc);
1103 ip_vs_scheduler_put(sched);
1106 /* decrease the module use count */
1107 ip_vs_use_count_dec();
1114 * Edit a service and bind it with a new scheduler
1117 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1119 struct ip_vs_scheduler *sched, *old_sched;
1123 * Lookup the scheduler, by 'u->sched_name'
1125 sched = ip_vs_scheduler_get(u->sched_name);
1126 if (sched == NULL) {
1127 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1133 write_lock_bh(&__ip_vs_svc_lock);
1136 * Wait until all other svc users go away.
1138 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1141 * Set the flags and timeout value
1143 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1144 svc->timeout = u->timeout * HZ;
1145 svc->netmask = u->netmask;
1147 old_sched = svc->scheduler;
1148 if (sched != old_sched) {
1150 * Unbind the old scheduler
1152 if ((ret = ip_vs_unbind_scheduler(svc))) {
1158 * Bind the new scheduler
1160 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1162 * If ip_vs_bind_scheduler fails, restore the old
1164 * The main reason of failure is out of memory.
1166 * The question is if the old scheduler can be
1167 * restored all the time. TODO: if it cannot be
1168 * restored some time, we must delete the service,
1169 * otherwise the system may crash.
1171 ip_vs_bind_scheduler(svc, old_sched);
1178 write_unlock_bh(&__ip_vs_svc_lock);
1181 ip_vs_scheduler_put(old_sched);
1188 * Delete a service from the service list
1189 * - The service must be unlinked, unlocked and not referenced!
1190 * - We are called under _bh lock
1192 static void __ip_vs_del_service(struct ip_vs_service *svc)
1194 struct ip_vs_dest *dest, *nxt;
1195 struct ip_vs_scheduler *old_sched;
1197 ip_vs_num_services--;
1198 ip_vs_kill_estimator(&svc->stats);
1200 /* Unbind scheduler */
1201 old_sched = svc->scheduler;
1202 ip_vs_unbind_scheduler(svc);
1204 ip_vs_scheduler_put(old_sched);
1206 /* Unbind app inc */
1208 ip_vs_app_inc_put(svc->inc);
1213 * Unlink the whole destination list
1215 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1216 __ip_vs_unlink_dest(svc, dest, 0);
1217 __ip_vs_del_dest(dest);
1221 * Update the virtual service counters
1223 if (svc->port == FTPPORT)
1224 atomic_dec(&ip_vs_ftpsvc_counter);
1225 else if (svc->port == 0)
1226 atomic_dec(&ip_vs_nullsvc_counter);
1229 * Free the service if nobody refers to it
1231 if (atomic_read(&svc->refcnt) == 0)
1234 /* decrease the module use count */
1235 ip_vs_use_count_dec();
1239 * Delete a service from the service list
1241 static int ip_vs_del_service(struct ip_vs_service *svc)
1247 * Unhash it from the service table
1249 write_lock_bh(&__ip_vs_svc_lock);
1251 ip_vs_svc_unhash(svc);
1254 * Wait until all the svc users go away.
1256 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1258 __ip_vs_del_service(svc);
1260 write_unlock_bh(&__ip_vs_svc_lock);
1267 * Flush all the virtual services
1269 static int ip_vs_flush(void)
1272 struct ip_vs_service *svc, *nxt;
1275 * Flush the service table hashed by <protocol,addr,port>
1277 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1278 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1279 write_lock_bh(&__ip_vs_svc_lock);
1280 ip_vs_svc_unhash(svc);
1282 * Wait until all the svc users go away.
1284 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1285 __ip_vs_del_service(svc);
1286 write_unlock_bh(&__ip_vs_svc_lock);
1291 * Flush the service table hashed by fwmark
1293 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1294 list_for_each_entry_safe(svc, nxt,
1295 &ip_vs_svc_fwm_table[idx], f_list) {
1296 write_lock_bh(&__ip_vs_svc_lock);
1297 ip_vs_svc_unhash(svc);
1299 * Wait until all the svc users go away.
1301 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1302 __ip_vs_del_service(svc);
1303 write_unlock_bh(&__ip_vs_svc_lock);
1312 * Zero counters in a service or all services
1314 static int ip_vs_zero_service(struct ip_vs_service *svc)
1316 struct ip_vs_dest *dest;
1318 write_lock_bh(&__ip_vs_svc_lock);
1319 list_for_each_entry(dest, &svc->destinations, n_list) {
1320 ip_vs_zero_stats(&dest->stats);
1322 ip_vs_zero_stats(&svc->stats);
1323 write_unlock_bh(&__ip_vs_svc_lock);
1327 static int ip_vs_zero_all(void)
1330 struct ip_vs_service *svc;
1332 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1333 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1334 ip_vs_zero_service(svc);
1338 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1339 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1340 ip_vs_zero_service(svc);
1344 ip_vs_zero_stats(&ip_vs_stats);
1350 proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1351 void __user *buffer, size_t *lenp, loff_t *ppos)
1353 int *valp = table->data;
1357 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1358 if (write && (*valp != val)) {
1359 if ((*valp < 0) || (*valp > 3)) {
1360 /* Restore the correct value */
1364 update_defense_level();
1373 proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1374 void __user *buffer, size_t *lenp, loff_t *ppos)
1376 int *valp = table->data;
1380 /* backup the value first */
1381 memcpy(val, valp, sizeof(val));
1383 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1384 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1385 /* Restore the correct value */
1386 memcpy(valp, val, sizeof(val));
1393 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1396 static struct ctl_table vs_vars[] = {
1398 .ctl_name = NET_IPV4_VS_AMEMTHRESH,
1399 .procname = "amemthresh",
1400 .data = &sysctl_ip_vs_amemthresh,
1401 .maxlen = sizeof(int),
1403 .proc_handler = &proc_dointvec,
1405 #ifdef CONFIG_IP_VS_DEBUG
1407 .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
1408 .procname = "debug_level",
1409 .data = &sysctl_ip_vs_debug_level,
1410 .maxlen = sizeof(int),
1412 .proc_handler = &proc_dointvec,
1416 .ctl_name = NET_IPV4_VS_AMDROPRATE,
1417 .procname = "am_droprate",
1418 .data = &sysctl_ip_vs_am_droprate,
1419 .maxlen = sizeof(int),
1421 .proc_handler = &proc_dointvec,
1424 .ctl_name = NET_IPV4_VS_DROP_ENTRY,
1425 .procname = "drop_entry",
1426 .data = &sysctl_ip_vs_drop_entry,
1427 .maxlen = sizeof(int),
1429 .proc_handler = &proc_do_defense_mode,
1432 .ctl_name = NET_IPV4_VS_DROP_PACKET,
1433 .procname = "drop_packet",
1434 .data = &sysctl_ip_vs_drop_packet,
1435 .maxlen = sizeof(int),
1437 .proc_handler = &proc_do_defense_mode,
1440 .ctl_name = NET_IPV4_VS_SECURE_TCP,
1441 .procname = "secure_tcp",
1442 .data = &sysctl_ip_vs_secure_tcp,
1443 .maxlen = sizeof(int),
1445 .proc_handler = &proc_do_defense_mode,
1449 .ctl_name = NET_IPV4_VS_TO_ES,
1450 .procname = "timeout_established",
1451 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1452 .maxlen = sizeof(int),
1454 .proc_handler = &proc_dointvec_jiffies,
1457 .ctl_name = NET_IPV4_VS_TO_SS,
1458 .procname = "timeout_synsent",
1459 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1460 .maxlen = sizeof(int),
1462 .proc_handler = &proc_dointvec_jiffies,
1465 .ctl_name = NET_IPV4_VS_TO_SR,
1466 .procname = "timeout_synrecv",
1467 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1468 .maxlen = sizeof(int),
1470 .proc_handler = &proc_dointvec_jiffies,
1473 .ctl_name = NET_IPV4_VS_TO_FW,
1474 .procname = "timeout_finwait",
1475 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1476 .maxlen = sizeof(int),
1478 .proc_handler = &proc_dointvec_jiffies,
1481 .ctl_name = NET_IPV4_VS_TO_TW,
1482 .procname = "timeout_timewait",
1483 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1484 .maxlen = sizeof(int),
1486 .proc_handler = &proc_dointvec_jiffies,
1489 .ctl_name = NET_IPV4_VS_TO_CL,
1490 .procname = "timeout_close",
1491 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1492 .maxlen = sizeof(int),
1494 .proc_handler = &proc_dointvec_jiffies,
1497 .ctl_name = NET_IPV4_VS_TO_CW,
1498 .procname = "timeout_closewait",
1499 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1500 .maxlen = sizeof(int),
1502 .proc_handler = &proc_dointvec_jiffies,
1505 .ctl_name = NET_IPV4_VS_TO_LA,
1506 .procname = "timeout_lastack",
1507 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1508 .maxlen = sizeof(int),
1510 .proc_handler = &proc_dointvec_jiffies,
1513 .ctl_name = NET_IPV4_VS_TO_LI,
1514 .procname = "timeout_listen",
1515 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1516 .maxlen = sizeof(int),
1518 .proc_handler = &proc_dointvec_jiffies,
1521 .ctl_name = NET_IPV4_VS_TO_SA,
1522 .procname = "timeout_synack",
1523 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1524 .maxlen = sizeof(int),
1526 .proc_handler = &proc_dointvec_jiffies,
1529 .ctl_name = NET_IPV4_VS_TO_UDP,
1530 .procname = "timeout_udp",
1531 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1532 .maxlen = sizeof(int),
1534 .proc_handler = &proc_dointvec_jiffies,
1537 .ctl_name = NET_IPV4_VS_TO_ICMP,
1538 .procname = "timeout_icmp",
1539 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1540 .maxlen = sizeof(int),
1542 .proc_handler = &proc_dointvec_jiffies,
1546 .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
1547 .procname = "cache_bypass",
1548 .data = &sysctl_ip_vs_cache_bypass,
1549 .maxlen = sizeof(int),
1551 .proc_handler = &proc_dointvec,
1554 .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1555 .procname = "expire_nodest_conn",
1556 .data = &sysctl_ip_vs_expire_nodest_conn,
1557 .maxlen = sizeof(int),
1559 .proc_handler = &proc_dointvec,
1562 .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1563 .procname = "expire_quiescent_template",
1564 .data = &sysctl_ip_vs_expire_quiescent_template,
1565 .maxlen = sizeof(int),
1567 .proc_handler = &proc_dointvec,
1570 .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
1571 .procname = "sync_threshold",
1572 .data = &sysctl_ip_vs_sync_threshold,
1573 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1575 .proc_handler = &proc_do_sync_threshold,
1578 .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
1579 .procname = "nat_icmp_send",
1580 .data = &sysctl_ip_vs_nat_icmp_send,
1581 .maxlen = sizeof(int),
1583 .proc_handler = &proc_dointvec,
/* sysctl table whose visible entry carries the NET_IPV4_VS control name. */
1588 static ctl_table vs_table[] = {
1590 .ctl_name = NET_IPV4_VS,
/*
 * sysctl table whose visible entry carries the NET_IPV4 control name;
 * it is the child of the CTL_NET entry in vs_root_table.
 */
1598 static ctl_table ipv4_table[] = {
1600 .ctl_name = NET_IPV4,
/*
 * Top of the sysctl hierarchy handed to register_sysctl_table() in
 * ip_vs_control_init(): a CTL_NET entry whose child is ipv4_table.
 */
1608 static ctl_table vs_root_table[] = {
1610 .ctl_name = CTL_NET,
1613 .child = ipv4_table,
/*
 * Handle returned by register_sysctl_table() in ip_vs_control_init();
 * ip_vs_control_cleanup() passes it to unregister_sysctl_table().
 */
1618 static struct ctl_table_header * sysctl_header;
/*
 * Per-open iterator state for the seq_file walk over the services.
 * table points at the hash table currently being walked, either
 * ip_vs_svc_table or ip_vs_svc_fwm_table.
 * NOTE(review): the walkers also use iter->bucket; its declaration is
 * not visible in this listing.
 */
1620 #ifdef CONFIG_PROC_FS
1623 struct list_head *table;
1628 * Write the contents of the VS rule table to a PROCfs file.
1629 * (It is kept just for backward compatibility)
/*
 * Select a display name for the forwarding method encoded in a
 * connection flags word.  Only the bits under IP_VS_CONN_F_FWD_MASK
 * are examined.  ip_vs_info_seq_show() prints the result in the
 * Forward column of each destination line.
 * NOTE(review): the returned strings are not visible in this listing.
 */
1631 static inline const char *ip_vs_fwd_name(unsigned flags)
1633 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1634 case IP_VS_CONN_F_LOCALNODE:
1636 case IP_VS_CONN_F_TUNNEL:
1638 case IP_VS_CONN_F_DROUTE:
1646 /* Get the Nth entry in the two lists */
/*
 * seq:  seq_file whose private data is the struct ip_vs_iter
 * pos:  position of the wanted service
 *
 * Both hash tables are scanned bucket by bucket, the protocol table
 * first and the fwmark table second.  iter->table is set to the table
 * the walk is currently in so that the next and show callbacks know
 * which list linkage (s_list or f_list) applies.
 * NOTE(review): the position test and the return statements are not
 * visible in this listing.
 */
1647 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1649 struct ip_vs_iter *iter = seq->private;
1651 struct ip_vs_service *svc;
1653 /* look in hash by protocol */
1654 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1655 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1657 iter->table = ip_vs_svc_table;
1664 /* keep looking in fwmark */
1665 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1666 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1668 iter->table = ip_vs_svc_fwm_table;
/*
 * seq_file start callback.  Takes __ip_vs_svc_lock for reading; the
 * matching unlock is in ip_vs_info_seq_stop().
 * Position 0 yields SEQ_START_TOKEN (the header), any other position
 * yields the service at index *pos - 1.
 */
1678 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1681 read_lock_bh(&__ip_vs_svc_lock);
1682 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
/*
 * seq_file next callback: step from entry v to the following service.
 *
 * After the header token the walk starts at the first service.  While
 * iter->table is ip_vs_svc_table the walk follows s_list inside the
 * current bucket, then the later buckets; after that it moves to
 * ip_vs_svc_fwm_table and follows f_list the same way.
 * NOTE(review): the assignment of svc from v, the position update and
 * the loop bodies are not visible in this listing.
 */
1686 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1688 struct list_head *e;
1689 struct ip_vs_iter *iter;
1690 struct ip_vs_service *svc;
/* first real entry after the header token */
1693 if (v == SEQ_START_TOKEN)
1694 return ip_vs_info_array(seq,0);
1697 iter = seq->private;
1699 if (iter->table == ip_vs_svc_table) {
1700 /* next service in table hashed by protocol */
1701 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1702 return list_entry(e, struct ip_vs_service, s_list);
/* current bucket exhausted: look at the remaining protocol buckets */
1705 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1706 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
/* switch the walk over to the fwmark table */
1712 iter->table = ip_vs_svc_fwm_table;
1717 /* next service in hashed by fwmark */
1718 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1719 return list_entry(e, struct ip_vs_service, f_list);
1722 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1723 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
/*
 * seq_file stop callback: releases the read lock on __ip_vs_svc_lock
 * that ip_vs_info_seq_start() took.
 */
1731 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1733 read_unlock_bh(&__ip_vs_svc_lock);
/*
 * seq_file show callback for the service listing.
 *
 * For SEQ_START_TOKEN the version banner and the two column header
 * lines are printed.  For a service, one summary line is printed
 * followed by one line per destination on svc->destinations.
 * Addresses and ports are printed in hexadecimal.
 */
1737 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1739 if (v == SEQ_START_TOKEN) {
1741 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1742 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1744 "Prot LocalAddress:Port Scheduler Flags\n");
1746 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1748 const struct ip_vs_service *svc = v;
1749 const struct ip_vs_iter *iter = seq->private;
1750 const struct ip_vs_dest *dest;
/* the table the iterator is in decides the layout of the summary line */
1752 if (iter->table == ip_vs_svc_table)
1753 seq_printf(seq, "%s %08X:%04X %s ",
1754 ip_vs_proto_name(svc->protocol),
1757 svc->scheduler->name);
1759 seq_printf(seq, "FWM %08X %s ",
1760 svc->fwmark, svc->scheduler->name);
/* persistent services get extra fields, others just end the line */
1762 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1763 seq_printf(seq, "persistent %d %08X\n",
1765 ntohl(svc->netmask));
1767 seq_putc(seq, '\n');
/* one line per real server attached to this service */
1769 list_for_each_entry(dest, &svc->destinations, n_list) {
1771 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1772 ntohl(dest->addr), ntohs(dest->port),
1773 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1774 atomic_read(&dest->weight),
1775 atomic_read(&dest->activeconns),
1776 atomic_read(&dest->inactconns));
/* seq_file callbacks for the service listing; used by ip_vs_info_open(). */
1782 static struct seq_operations ip_vs_info_seq_ops = {
1783 .start = ip_vs_info_seq_start,
1784 .next = ip_vs_info_seq_next,
1785 .stop = ip_vs_info_seq_stop,
1786 .show = ip_vs_info_seq_show,
/*
 * open handler of ip_vs_info_fops: allocates a struct ip_vs_iter for
 * the walk, opens the seq_file with ip_vs_info_seq_ops and clears the
 * iterator.
 * NOTE(review): the allocation-failure check, the handling of a failed
 * seq_open() and the store of s into the seq_file are not visible in
 * this listing - confirm s is freed on every error path.
 */
1789 static int ip_vs_info_open(struct inode *inode, struct file *file)
1791 struct seq_file *seq;
1793 struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1798 rc = seq_open(file, &ip_vs_info_seq_ops);
1802 seq = file->private_data;
1804 memset(s, 0, sizeof(*s));
/*
 * File operations of the proc entry created for ip_vs in
 * ip_vs_control_init().  Release goes through seq_release_private.
 */
1812 static struct file_operations ip_vs_info_fops = {
1813 .owner = THIS_MODULE,
1814 .open = ip_vs_info_open,
1816 .llseek = seq_lseek,
1817 .release = seq_release_private,
/*
 * Global statistics.  Zeroed, given an initialised lock and attached to
 * an estimator in ip_vs_control_init(); printed by ip_vs_stats_show().
 */
1822 struct ip_vs_stats ip_vs_stats;
/*
 * single_open() show routine for the global statistics file.
 * Prints the totals and then the per-second rates of ip_vs_stats, all
 * in hexadecimal.  The counters are read while holding
 * ip_vs_stats.lock (spin_lock_bh), so one call reports a consistent
 * snapshot.
 */
1824 #ifdef CONFIG_PROC_FS
1825 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1828 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1830 " Total Incoming Outgoing Incoming Outgoing\n");
1832 " Conns Packets Packets Bytes Bytes\n");
1834 spin_lock_bh(&ip_vs_stats.lock);
1835 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1836 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1837 (unsigned long long) ip_vs_stats.inbytes,
1838 (unsigned long long) ip_vs_stats.outbytes);
1840 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1842 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1843 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1848 ip_vs_stats.outbps);
1849 spin_unlock_bh(&ip_vs_stats.lock);
/*
 * open handler of ip_vs_stats_fops: a single-record seq_file backed by
 * ip_vs_stats_show(), with no private data.
 */
1854 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1856 return single_open(file, ip_vs_stats_show, NULL);
/*
 * File operations of the proc entry created for ip_vs_stats in
 * ip_vs_control_init(); paired with single_open() via single_release.
 */
1859 static struct file_operations ip_vs_stats_fops = {
1860 .owner = THIS_MODULE,
1861 .open = ip_vs_stats_seq_open,
1863 .llseek = seq_lseek,
1864 .release = single_release,
1870 * Set timeout values for tcp tcpfin udp in the timeout_table.
/*
 * Apply the timeouts requested through IP_VS_SO_SET_TIMEOUT.
 *
 * u: values supplied by user space.  Each non-zero field is multiplied
 *    by HZ and stored in the matching protocol timeout table; a field
 *    equal to 0 leaves the current value untouched.
 *
 * The TCP and UDP parts exist only when the respective
 * CONFIG_IP_VS_PROTO_* option is enabled.
 * NOTE(review): no range check on the user-supplied values is visible
 * before the multiplication by HZ - confirm that negative or very
 * large values are rejected somewhere.
 */
1872 static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1874 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1879 #ifdef CONFIG_IP_VS_PROTO_TCP
1880 if (u->tcp_timeout) {
1881 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1882 = u->tcp_timeout * HZ;
1885 if (u->tcp_fin_timeout) {
1886 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1887 = u->tcp_fin_timeout * HZ;
1891 #ifdef CONFIG_IP_VS_PROTO_UDP
1892 if (u->udp_timeout) {
1893 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1894 = u->udp_timeout * HZ;
/* Zero-based index of a set command, used to index set_arglen. */
1901 #define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
/* Argument sizes of the set commands, by argument layout. */
1902 #define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
1903 #define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
1904 sizeof(struct ip_vs_dest_user))
1905 #define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
1906 #define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
/* Size of the on-stack argument buffer in do_ip_vs_set_ctl(). */
1907 #define MAX_ARG_LEN SVCDEST_ARG_LEN
/*
 * Exact argument length required for each set command;
 * do_ip_vs_set_ctl() rejects any other length.  The element type is
 * unsigned char, so every length stored here has to fit in 8 bits.
 */
1909 static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1910 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1911 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1912 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1913 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1914 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1915 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1916 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1917 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1918 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1919 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1920 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
/*
 * setsockopt handler, installed as ip_vs_sockopts.set.
 *
 * sk:    socket the option was issued on
 * cmd:   one of the IP_VS_SO_SET_* commands
 * user:  user-space pointer to the command argument
 * len:   argument length; must equal set_arglen[SET_CMDID(cmd)]
 *
 * The caller needs CAP_NET_ADMIN.  The argument is copied into the
 * on-stack buffer arg, the command runs under __ip_vs_mutex, and the
 * module use count is raised around the whole operation.
 * NOTE(review): cmd indexes set_arglen without a range check in the
 * visible lines; presumably the set_optmin/set_optmax bounds in
 * ip_vs_sockopts keep it in range - confirm.
 */
1924 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1927 unsigned char arg[MAX_ARG_LEN];
1928 struct ip_vs_service_user *usvc;
1929 struct ip_vs_service *svc;
1930 struct ip_vs_dest_user *udest;
1932 if (!capable(CAP_NET_ADMIN))
/* exact-length match also bounds the copy into arg below */
1935 if (len != set_arglen[SET_CMDID(cmd)]) {
1936 IP_VS_ERR("set_ctl: len %u != %u\n",
1937 len, set_arglen[SET_CMDID(cmd)]);
1941 if (copy_from_user(arg, user, len) != 0)
1944 /* increase the module use count */
1945 ip_vs_use_count_inc();
1947 if (down_interruptible(&__ip_vs_mutex)) {
/* commands that do not address a single service come first */
1952 if (cmd == IP_VS_SO_SET_FLUSH) {
1953 /* Flush the virtual service */
1954 ret = ip_vs_flush();
1956 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1957 /* Set timeout values for (tcp tcpfin udp) */
1958 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1960 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1961 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1962 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1964 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1965 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1966 ret = stop_sync_thread(dm->state);
/*
 * Remaining commands: arg holds a service description, followed
 * directly by a destination description for the *DEST commands.
 */
1970 usvc = (struct ip_vs_service_user *)arg;
1971 udest = (struct ip_vs_dest_user *)(usvc + 1);
1973 if (cmd == IP_VS_SO_SET_ZERO) {
1974 /* if no service address is set, zero counters in all */
1975 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1976 ret = ip_vs_zero_all();
1981 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1982 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
/*
 * NOTE(review): sched_name comes straight from user space and is
 * printed with %s; no NUL termination is visible before this point -
 * confirm.
 */
1983 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1984 usvc->protocol, NIPQUAD(usvc->addr),
1985 ntohs(usvc->port), usvc->sched_name);
1990 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1991 if (usvc->fwmark == 0)
1992 svc = __ip_vs_service_get(usvc->protocol,
1993 usvc->addr, usvc->port);
1995 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
/* every command except ADD needs an existing service of that protocol */
1997 if (cmd != IP_VS_SO_SET_ADD
1998 && (svc == NULL || svc->protocol != usvc->protocol)) {
2004 case IP_VS_SO_SET_ADD:
2008 ret = ip_vs_add_service(usvc, &svc);
2010 case IP_VS_SO_SET_EDIT:
2011 ret = ip_vs_edit_service(svc, usvc);
2013 case IP_VS_SO_SET_DEL:
2014 ret = ip_vs_del_service(svc);
2018 case IP_VS_SO_SET_ZERO:
2019 ret = ip_vs_zero_service(svc);
2021 case IP_VS_SO_SET_ADDDEST:
2022 ret = ip_vs_add_dest(svc, udest);
2024 case IP_VS_SO_SET_EDITDEST:
2025 ret = ip_vs_edit_dest(svc, udest);
2027 case IP_VS_SO_SET_DELDEST:
2028 ret = ip_vs_del_dest(svc, udest);
/* presumably drops the reference taken by the lookup above - confirm */
2035 ip_vs_service_put(svc);
2040 /* decrease the module use count */
2041 ip_vs_use_count_dec();
/*
 * Snapshot src into the user-visible statistics structure dst.
 * The copy covers the bytes of *src that precede its lock member and
 * is made while holding src->lock.
 * NOTE(review): assumes the members of struct ip_vs_stats in front of
 * lock have the same layout as struct ip_vs_stats_user - confirm in
 * the header.
 */
2048 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2050 spin_lock_bh(&src->lock);
2051 memcpy(dst, src, (char*)&src->lock - (char*)src);
2052 spin_unlock_bh(&src->lock);
/*
 * Fill the user-visible entry dst from the kernel service src.
 * The scheduler name is copied with strlcpy bounded by the size of
 * dst->sched_name, the timeout is divided by HZ, and the statistics
 * are copied through ip_vs_copy_stats().
 */
2056 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2058 dst->protocol = src->protocol;
2059 dst->addr = src->addr;
2060 dst->port = src->port;
2061 dst->fwmark = src->fwmark;
2062 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2063 dst->flags = src->flags;
2064 dst->timeout = src->timeout / HZ;
2065 dst->netmask = src->netmask;
2066 dst->num_dests = src->num_dests;
2067 ip_vs_copy_stats(&dst->stats, &src->stats);
/*
 * Copy service entries to the user buffer for IP_VS_SO_GET_SERVICES.
 *
 * get:   kernel copy of the request; num_services caps the entry count
 * uptr:  user-space request; entries are written to uptr->entrytable
 *
 * Services hashed by protocol are emitted first, then the fwmark ones;
 * count indexes the next free user slot.  Each entry is zeroed before
 * ip_vs_copy_service() fills it, so bytes that function does not write
 * reach user space as zero.
 */
2071 __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2072 struct ip_vs_get_services __user *uptr)
2075 struct ip_vs_service *svc;
2076 struct ip_vs_service_entry entry;
2079 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2081 if (count >= get->num_services)
2083 memset(&entry, 0, sizeof(entry));
2084 ip_vs_copy_service(&entry, svc);
2085 if (copy_to_user(&uptr->entrytable[count],
2086 &entry, sizeof(entry))) {
2094 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2095 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2096 if (count >= get->num_services)
2098 memset(&entry, 0, sizeof(entry));
2099 ip_vs_copy_service(&entry, svc);
2100 if (copy_to_user(&uptr->entrytable[count],
2101 &entry, sizeof(entry))) {
/*
 * Copy the destinations of one service to the user buffer for
 * IP_VS_SO_GET_DESTS.
 *
 * get:   kernel copy of the request; selects the service by fwmark or
 *        by protocol, addr and port, and caps the count with num_dests
 * uptr:  user-space request; entries are written to uptr->entrytable
 *
 * NOTE(review): entry is filled member by member and then copied to
 * user space in full, but unlike __ip_vs_get_service_entries() no
 * memset of entry is visible here.  If struct ip_vs_dest_entry has
 * padding, those bytes would carry stale stack contents - confirm and
 * zero the entry first.
 */
2113 __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2114 struct ip_vs_get_dests __user *uptr)
2116 struct ip_vs_service *svc;
2120 svc = __ip_vs_svc_fwm_get(get->fwmark);
2122 svc = __ip_vs_service_get(get->protocol,
2123 get->addr, get->port);
2126 struct ip_vs_dest *dest;
2127 struct ip_vs_dest_entry entry;
2129 list_for_each_entry(dest, &svc->destinations, n_list) {
2130 if (count >= get->num_dests)
2133 entry.addr = dest->addr;
2134 entry.port = dest->port;
2135 entry.conn_flags = atomic_read(&dest->conn_flags);
2136 entry.weight = atomic_read(&dest->weight);
2137 entry.u_threshold = dest->u_threshold;
2138 entry.l_threshold = dest->l_threshold;
2139 entry.activeconns = atomic_read(&dest->activeconns);
2140 entry.inactconns = atomic_read(&dest->inactconns);
2141 entry.persistconns = atomic_read(&dest->persistconns);
2142 ip_vs_copy_stats(&entry.stats, &dest->stats);
2143 if (copy_to_user(&uptr->entrytable[count],
2144 &entry, sizeof(entry))) {
/* presumably drops the reference taken by the lookup above - confirm */
2150 ip_vs_service_put(svc);
2157 __ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2159 #ifdef CONFIG_IP_VS_PROTO_TCP
2161 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2162 u->tcp_fin_timeout =
2163 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2165 #ifdef CONFIG_IP_VS_PROTO_UDP
2167 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
/* Zero-based index of a get command, used to index get_arglen. */
2172 #define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
/* Sizes of the fixed part of each get request. */
2173 #define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2174 #define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2175 #define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2176 #define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2177 #define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
/* the daemon query returns two records, see IP_VS_SO_GET_DAEMON */
2178 #define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
/*
 * Minimum user buffer length for each get command, and the number of
 * bytes do_ip_vs_get_ctl() copies in from user space.  The element type
 * is unsigned char, so every length stored here has to fit in 8 bits.
 */
2180 static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2181 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2182 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2183 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2184 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2185 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2186 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2187 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
/*
 * getsockopt handler, installed as ip_vs_sockopts.get.
 *
 * sk:    socket the option was issued on
 * cmd:   one of the IP_VS_SO_GET_* commands
 * user:  user-space buffer holding the request and receiving the reply
 * len:   in: size of the user buffer; updated for IP_VS_SO_GET_VERSION
 *
 * The caller needs CAP_NET_ADMIN.  The fixed part of the request is
 * copied into the on-stack buffer arg and the command runs under
 * __ip_vs_mutex.
 * NOTE(review): the copy length comes from get_arglen, not from the
 * size of arg; nothing visible checks that every table value is at
 * most 128, nor that cmd is inside the table (presumably guaranteed by
 * get_optmin/get_optmax in ip_vs_sockopts) - confirm.
 */
2191 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2193 unsigned char arg[128];
2196 if (!capable(CAP_NET_ADMIN))
/* the user buffer must hold at least the fixed part of the request */
2199 if (*len < get_arglen[GET_CMDID(cmd)]) {
2200 IP_VS_ERR("get_ctl: len %u < %u\n",
2201 *len, get_arglen[GET_CMDID(cmd)]);
2205 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2208 if (down_interruptible(&__ip_vs_mutex))
2209 return -ERESTARTSYS;
/* version banner as a NUL-terminated string; *len becomes its size */
2212 case IP_VS_SO_GET_VERSION:
2216 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2217 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2218 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2222 *len = strlen(buf)+1;
/* version code, connection table size and current service count */
2226 case IP_VS_SO_GET_INFO:
2228 struct ip_vs_getinfo info;
2229 info.version = IP_VS_VERSION_CODE;
2230 info.size = IP_VS_CONN_TAB_SIZE;
2231 info.num_services = ip_vs_num_services;
2232 if (copy_to_user(user, &info, sizeof(info)) != 0)
/*
 * All services: the expected buffer size is derived from the
 * num_services value in the request.
 */
2237 case IP_VS_SO_GET_SERVICES:
2239 struct ip_vs_get_services *get;
2242 get = (struct ip_vs_get_services *)arg;
2243 size = sizeof(*get) +
2244 sizeof(struct ip_vs_service_entry) * get->num_services;
2246 IP_VS_ERR("length: %u != %u\n", *len, size);
2250 ret = __ip_vs_get_service_entries(get, user);
/* one service, looked up by fwmark or by protocol, addr and port */
2254 case IP_VS_SO_GET_SERVICE:
2256 struct ip_vs_service_entry *entry;
2257 struct ip_vs_service *svc;
2259 entry = (struct ip_vs_service_entry *)arg;
2261 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2263 svc = __ip_vs_service_get(entry->protocol,
2264 entry->addr, entry->port);
2266 ip_vs_copy_service(entry, svc);
2267 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2269 ip_vs_service_put(svc);
/*
 * Destinations of one service: the expected buffer size is derived
 * from the num_dests value in the request.
 */
2275 case IP_VS_SO_GET_DESTS:
2277 struct ip_vs_get_dests *get;
2280 get = (struct ip_vs_get_dests *)arg;
2281 size = sizeof(*get) +
2282 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2284 IP_VS_ERR("length: %u != %u\n", *len, size);
2288 ret = __ip_vs_get_dest_entries(get, user);
/*
 * NOTE(review): t is not initialised at this call site, and the whole
 * structure is copied to user space; whether every byte is written
 * depends on __ip_vs_get_timeouts() - confirm no stack data can leak.
 */
2292 case IP_VS_SO_GET_TIMEOUT:
2294 struct ip_vs_timeout_user t;
2296 __ip_vs_get_timeouts(&t);
2297 if (copy_to_user(user, &t, sizeof(t)) != 0)
/*
 * Sync daemon state: d[0] describes the master, d[1] the backup; a
 * record stays all zero when that role is not active.
 */
2302 case IP_VS_SO_GET_DAEMON:
2304 struct ip_vs_daemon_user d[2];
2306 memset(&d, 0, sizeof(d));
2307 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2308 d[0].state = IP_VS_STATE_MASTER;
2309 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2310 d[0].syncid = ip_vs_master_syncid;
2312 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2313 d[1].state = IP_VS_STATE_BACKUP;
2314 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2315 d[1].syncid = ip_vs_backup_syncid;
2317 if (copy_to_user(user, &d, sizeof(d)) != 0)
/*
 * Socket option registration record: routes the option range starting
 * at IP_VS_BASE_CTL to do_ip_vs_set_ctl() and do_ip_vs_get_ctl().
 * The optmax values are written as one past the last command.
 */
2332 static struct nf_sockopt_ops ip_vs_sockopts = {
2334 .set_optmin = IP_VS_BASE_CTL,
2335 .set_optmax = IP_VS_SO_SET_MAX+1,
2336 .set = do_ip_vs_set_ctl,
2337 .get_optmin = IP_VS_BASE_CTL,
2338 .get_optmax = IP_VS_SO_GET_MAX+1,
2339 .get = do_ip_vs_get_ctl,
/*
 * Set up the control interface: socket options, the two proc files,
 * the sysctl tree, the service and real-server hash tables, the global
 * statistics and the periodic defense work.
 * NOTE(review): the results of proc_net_fops_create() and
 * register_sysctl_table() are not checked in the visible lines.
 * NOTE(review): the sockopt and proc interfaces are registered before
 * the list heads of the hash tables are initialised below; confirm
 * that no user request can reach the tables in between.
 */
2343 int ip_vs_control_init(void)
2350 ret = nf_register_sockopt(&ip_vs_sockopts);
2352 IP_VS_ERR("cannot register sockopt.\n");
2356 proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2357 proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2359 sysctl_header = register_sysctl_table(vs_root_table, 0);
2361 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2362 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2363 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2364 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2366 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2367 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
/* global statistics start from zero with a fresh lock and estimator */
2370 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2371 spin_lock_init(&ip_vs_stats.lock);
2372 ip_vs_new_estimator(&ip_vs_stats);
2374 /* Hook the defense timer */
2375 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
/*
 * Tear down what ip_vs_control_init() set up, in the opposite order of
 * registration: defense work, estimator, sysctl tree, proc files and
 * finally the socket options.  The trash cleanup runs first.
 */
2382 void ip_vs_control_cleanup(void)
2385 ip_vs_trash_cleanup();
2386 cancel_rearming_delayed_work(&defense_work);
2387 ip_vs_kill_estimator(&ip_vs_stats);
2388 unregister_sysctl_table(sysctl_header);
2389 proc_net_remove("ip_vs_stats");
2390 proc_net_remove("ip_vs");
2391 nf_unregister_sockopt(&ip_vs_sockopts);