/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <ja@ssi.bg>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/swap.h>
#include <linux/seq_file.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <asm/uaccess.h>

#include <net/ip_vs.h>
/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
static DECLARE_MUTEX(__ip_vs_mutex);

/* lock for service table */
static DEFINE_RWLOCK(__ip_vs_svc_lock);

/* lock for table with the real services */
static DEFINE_RWLOCK(__ip_vs_rs_lock);

/* lock for state and timeout tables */
static DEFINE_RWLOCK(__ip_vs_securetcp_lock);

/* lock for drop entry handling */
static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);

/* lock for drop packet handling */
static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);

/* 1/rate drop and drop-entry variables */
int ip_vs_drop_rate = 0;
int ip_vs_drop_counter = 0;
static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);

/* number of virtual services */
static int ip_vs_num_services = 0;

/* sysctl variables */
static int sysctl_ip_vs_drop_entry = 0;
static int sysctl_ip_vs_drop_packet = 0;
static int sysctl_ip_vs_secure_tcp = 0;
static int sysctl_ip_vs_amemthresh = 1024;
static int sysctl_ip_vs_am_droprate = 10;
int sysctl_ip_vs_cache_bypass = 0;
int sysctl_ip_vs_expire_nodest_conn = 0;
int sysctl_ip_vs_expire_quiescent_template = 0;
int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
int sysctl_ip_vs_nat_icmp_send = 0;
#ifdef CONFIG_IP_VS_DEBUG
static int sysctl_ip_vs_debug_level = 0;

int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
#endif
/*
 * update_defense_level is called from keventd and from sysctl,
 * so it needs to protect itself from softirqs
 */
static void update_defense_level(void)
{
	struct sysinfo i;
	static int old_secure_tcp = 0;
	int availmem, nomem, to_change = -1;

	/* we only count free and buffered memory (in pages) */
	si_meminfo(&i);
	availmem = i.freeram + i.bufferram;
	/* however in linux 2.5 the i.bufferram is total page cache size,
	   we need to adjust it */
	/* si_swapinfo(&i); */
	/* availmem = availmem - (i.totalswap - i.freeswap); */

	nomem = (availmem < sysctl_ip_vs_amemthresh);

	local_bh_disable();

	/* drop_entry */
	spin_lock(&__ip_vs_dropentry_lock);
	switch (sysctl_ip_vs_drop_entry) {
	case 0:
		atomic_set(&ip_vs_dropentry, 0);
		break;
	case 1:
		if (nomem) {
			atomic_set(&ip_vs_dropentry, 1);
			sysctl_ip_vs_drop_entry = 2;
		} else
			atomic_set(&ip_vs_dropentry, 0);
		break;
	case 2:
		if (nomem)
			atomic_set(&ip_vs_dropentry, 1);
		else {
			atomic_set(&ip_vs_dropentry, 0);
			sysctl_ip_vs_drop_entry = 1;
		}
		break;
	case 3:
		atomic_set(&ip_vs_dropentry, 1);
		break;
	}
	spin_unlock(&__ip_vs_dropentry_lock);

	/* drop_packet */
	spin_lock(&__ip_vs_droppacket_lock);
	switch (sysctl_ip_vs_drop_packet) {
	case 0:
		ip_vs_drop_rate = 0;
		break;
	case 1:
		if (nomem) {
			ip_vs_drop_rate = ip_vs_drop_counter
				= sysctl_ip_vs_amemthresh /
				(sysctl_ip_vs_amemthresh-availmem);
			sysctl_ip_vs_drop_packet = 2;
		} else
			ip_vs_drop_rate = 0;
		break;
	case 2:
		if (nomem) {
			ip_vs_drop_rate = ip_vs_drop_counter
				= sysctl_ip_vs_amemthresh /
				(sysctl_ip_vs_amemthresh-availmem);
		} else {
			ip_vs_drop_rate = 0;
			sysctl_ip_vs_drop_packet = 1;
		}
		break;
	case 3:
		ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
		break;
	}
	spin_unlock(&__ip_vs_droppacket_lock);

	/* secure_tcp */
	write_lock(&__ip_vs_securetcp_lock);
	switch (sysctl_ip_vs_secure_tcp) {
	case 0:
		if (old_secure_tcp >= 2)
			to_change = 0;
		break;
	case 1:
		if (nomem) {
			if (old_secure_tcp < 2)
				to_change = 1;
			sysctl_ip_vs_secure_tcp = 2;
		} else {
			if (old_secure_tcp >= 2)
				to_change = 0;
		}
		break;
	case 2:
		if (nomem) {
			if (old_secure_tcp < 2)
				to_change = 1;
		} else {
			if (old_secure_tcp >= 2)
				to_change = 0;
			sysctl_ip_vs_secure_tcp = 1;
		}
		break;
	case 3:
		if (old_secure_tcp < 2)
			to_change = 1;
		break;
	}
	old_secure_tcp = sysctl_ip_vs_secure_tcp;
	if (to_change >= 0)
		ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
	write_unlock(&__ip_vs_securetcp_lock);

	local_bh_enable();
}
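/*
 * Worked example for the drop_packet rate above (illustrative values):
 * with the default amemthresh of 1024 pages and, say, 960 pages
 * available, the automatic mode computes
 *
 *	ip_vs_drop_rate = 1024 / (1024 - 960) = 16
 *
 * i.e. roughly one packet in sixteen is dropped ("1/rate drop"); the
 * closer availmem is to amemthresh, the gentler the rate, and the
 * mode backs off again once memory recovers.
 */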
/*
 * Timer for checking the defense
 */
#define DEFENSE_TIMER_PERIOD	1*HZ
static void defense_work_handler(void *data);
static DECLARE_WORK(defense_work, defense_work_handler, NULL);

static void defense_work_handler(void *data)
{
	update_defense_level();
	if (atomic_read(&ip_vs_dropentry))
		ip_vs_random_dropentry();

	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
}
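/*
 * A minimal sketch of the self-rearming pattern used above (the old
 * pre-2.6.20 workqueue API, where the handler takes a void *): the
 * handler does its periodic job and re-queues itself, so it runs once
 * per period until cancel_rearming_delayed_work() stops it.
 * Illustrative only; names are made up and the block is not built.
 */
#if 0
static void my_work_handler(void *data);
static DECLARE_WORK(my_work, my_work_handler, NULL);

static void my_work_handler(void *data)
{
	/* ... periodic job ... */
	schedule_delayed_work(&my_work, HZ);	/* run again in ~1s */
}
#endif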
/*
 * increase/decrease the module use count
 */
static int
ip_vs_use_count_inc(void)
{
	return try_module_get(THIS_MODULE);
}

static void
ip_vs_use_count_dec(void)
{
	module_put(THIS_MODULE);
}
/*
 * Hash table: for virtual service lookups
 */
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)

/* the service table hashed by <protocol, addr, port> */
static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];

/*
 * Hash table: for real service lookups
 */
#define IP_VS_RTAB_BITS 4
#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)

static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];

/*
 * Trash for destinations
 */
static LIST_HEAD(ip_vs_dest_trash);

/*
 * FTP & NULL virtual service counters
 */
static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
/*
 * Returns hash value for virtual service
 */
static __inline__ unsigned
ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
{
	register unsigned porth = ntohs(port);

	return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
		& IP_VS_SVC_TAB_MASK;
}

/*
 * Returns hash value of fwmark for virtual service lookup
 */
static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
{
	return fwmark & IP_VS_SVC_TAB_MASK;
}
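/*
 * Worked example for ip_vs_svc_hashkey (illustrative): a TCP (proto 6)
 * service on 10.0.0.1:80 hashes as
 *
 *	porth = 80
 *	6 ^ 0x0a000001 ^ (80 >> 8) ^ 80  =  0x0a000057
 *	0x0a000057 & IP_VS_SVC_TAB_MASK  =  0x57
 *
 * so the service lands in bucket 0x57 of the 256-bucket table.
 */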
/*
 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
 * or in the ip_vs_svc_fwm_table by fwmark.
 * Should be called with locked tables.
 */
static int ip_vs_svc_hash(struct ip_vs_service *svc)
{
	unsigned hash;

	if (svc->flags & IP_VS_SVC_F_HASHED) {
		IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
			  "called from %p\n", __builtin_return_address(0));
		return 0;
	}

	if (svc->fwmark == 0) {
		/*
		 * Hash it by <protocol,addr,port> in ip_vs_svc_table
		 */
		hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
		list_add(&svc->s_list, &ip_vs_svc_table[hash]);
	} else {
		/*
		 * Hash it by fwmark in ip_vs_svc_fwm_table
		 */
		hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
		list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
	}

	svc->flags |= IP_VS_SVC_F_HASHED;
	/* increase its refcnt because it is referenced by the svc table */
	atomic_inc(&svc->refcnt);
	return 1;
}
/*
 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
 * Should be called with locked tables.
 */
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
{
	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
		IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
			  "called from %p\n", __builtin_return_address(0));
		return 0;
	}

	if (svc->fwmark == 0) {
		/* Remove it from the ip_vs_svc_table table */
		list_del(&svc->s_list);
	} else {
		/* Remove it from the ip_vs_svc_fwm_table table */
		list_del(&svc->f_list);
	}

	svc->flags &= ~IP_VS_SVC_F_HASHED;
	atomic_dec(&svc->refcnt);
	return 1;
}
/*
 * Get service by {proto,addr,port} in the service table.
 */
static __inline__ struct ip_vs_service *
__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
{
	unsigned hash;
	struct ip_vs_service *svc;

	/* Check for "full" addressed entries */
	hash = ip_vs_svc_hashkey(protocol, vaddr, vport);

	list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
		if ((svc->addr == vaddr)
		    && (svc->port == vport)
		    && (svc->protocol == protocol)) {
			/* HIT */
			atomic_inc(&svc->usecnt);
			return svc;
		}
	}

	return NULL;
}
/*
 * Get service by {fwmark} in the service table.
 */
static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
{
	unsigned hash;
	struct ip_vs_service *svc;

	/* Check for fwmark addressed entries */
	hash = ip_vs_svc_fwm_hashkey(fwmark);

	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
		if (svc->fwmark == fwmark) {
			/* HIT */
			atomic_inc(&svc->usecnt);
			return svc;
		}
	}

	return NULL;
}
struct ip_vs_service *
ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
{
	struct ip_vs_service *svc;

	read_lock(&__ip_vs_svc_lock);

	/*
	 * Check the table hashed by fwmark first
	 */
	if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
		goto out;

	/*
	 * Check the table hashed by <protocol,addr,port>
	 * for "full" addressed entries
	 */
	svc = __ip_vs_service_get(protocol, vaddr, vport);

	if (svc == NULL
	    && protocol == IPPROTO_TCP
	    && atomic_read(&ip_vs_ftpsvc_counter)
	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
		/*
		 * Check if ftp service entry exists, the packet
		 * might belong to FTP data connections.
		 */
		svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
	}

	if (svc == NULL
	    && atomic_read(&ip_vs_nullsvc_counter)) {
		/*
		 * Check if the catch-all port (port zero) exists
		 */
		svc = __ip_vs_service_get(protocol, vaddr, 0);
	}

  out:
	read_unlock(&__ip_vs_svc_lock);

	IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
		  fwmark, ip_vs_proto_name(protocol),
		  NIPQUAD(vaddr), ntohs(vport),
		  svc ? "hit" : "not hit");

	return svc;
}
static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	atomic_inc(&svc->refcnt);
	dest->svc = svc;
}

static inline void
__ip_vs_unbind_svc(struct ip_vs_dest *dest)
{
	struct ip_vs_service *svc = dest->svc;

	dest->svc = NULL;
	if (atomic_dec_and_test(&svc->refcnt))
		kfree(svc);
}
/*
 * Returns hash value for real service
 */
static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
{
	register unsigned porth = ntohs(port);

	return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
		& IP_VS_RTAB_MASK;
}
/*
 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
 * should be called with locked tables.
 */
static int ip_vs_rs_hash(struct ip_vs_dest *dest)
{
	unsigned hash;

	if (!list_empty(&dest->d_list)) {
		return 0;
	}

	/*
	 * Hash by proto,addr,port,
	 * which are the parameters of the real service.
	 */
	hash = ip_vs_rs_hashkey(dest->addr, dest->port);
	list_add(&dest->d_list, &ip_vs_rtable[hash]);

	return 1;
}
/*
 * UNhashes ip_vs_dest from ip_vs_rtable.
 * should be called with locked tables.
 */
static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
	/*
	 * Remove it from the ip_vs_rtable table.
	 */
	if (!list_empty(&dest->d_list)) {
		list_del(&dest->d_list);
		INIT_LIST_HEAD(&dest->d_list);
	}

	return 1;
}
/*
 * Lookup real service by <proto,addr,port> in the real service table.
 */
struct ip_vs_dest *
ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
{
	unsigned hash;
	struct ip_vs_dest *dest;

	/*
	 * Check for "full" addressed entries
	 * Return the first found entry
	 */
	hash = ip_vs_rs_hashkey(daddr, dport);

	read_lock(&__ip_vs_rs_lock);
	list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
		if ((dest->addr == daddr)
		    && (dest->port == dport)
		    && ((dest->protocol == protocol) ||
			dest->vfwmark)) {
			/* HIT */
			read_unlock(&__ip_vs_rs_lock);
			return dest;
		}
	}
	read_unlock(&__ip_vs_rs_lock);

	return NULL;
}
/*
 * Lookup destination by {addr,port} in the given service
 */
static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
{
	struct ip_vs_dest *dest;

	/*
	 * Find the destination for the given service
	 */
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if ((dest->addr == daddr) && (dest->port == dport)) {
			/* HIT */
			return dest;
		}
	}

	return NULL;
}
/*
 * Lookup dest by {svc,addr,port} in the destination trash.
 * The destination trash is used to hold the destinations that are removed
 * from the service table but are still referenced by some conn entries.
 * The reason to add the destination trash is that when the dest is
 * temporarily down (either by administrator or by monitor program), the
 * dest can be picked back from the trash, the remaining connections to
 * the dest can continue, and the counting information of the dest is also
 * useful for scheduling.
 */
static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
{
	struct ip_vs_dest *dest, *nxt;

	/*
	 * Find the destination in trash
	 */
	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
		IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
			  "refcnt=%d\n",
			  dest->vfwmark,
			  NIPQUAD(dest->addr), ntohs(dest->port),
			  atomic_read(&dest->refcnt));
		if (dest->addr == daddr &&
		    dest->port == dport &&
		    dest->vfwmark == svc->fwmark &&
		    dest->protocol == svc->protocol &&
		    (svc->fwmark ||
		     (dest->vaddr == svc->addr &&
		      dest->vport == svc->port))) {
			/* HIT */
			return dest;
		}

		/*
		 * Try to purge the destination from trash if not referenced
		 */
		if (atomic_read(&dest->refcnt) == 1) {
			IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
				  "from trash\n",
				  dest->vfwmark,
				  NIPQUAD(dest->addr), ntohs(dest->port));
			list_del(&dest->n_list);
			ip_vs_dst_reset(dest);
			__ip_vs_unbind_svc(dest);
			kfree(dest);
		}
	}

	return NULL;
}
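/*
 * Trash lifecycle in short: __ip_vs_del_dest() below parks a
 * still-referenced dest here with an extra refcnt taken for the trash
 * list itself; ip_vs_trash_get_dest() then either revives it for a
 * matching service, or frees it once only the trash reference
 * (refcnt == 1) remains.
 */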
/*
 * Clean up all the destinations in the trash
 * Called by the ip_vs_control_cleanup()
 *
 * When ip_vs_control_cleanup is activated by ipvs module exit,
 * the service tables must have been flushed and all the connections
 * are expired, and the refcnt of each destination in the trash must
 * be 1, so we simply release them here.
 */
static void ip_vs_trash_cleanup(void)
{
	struct ip_vs_dest *dest, *nxt;

	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
		list_del(&dest->n_list);
		ip_vs_dst_reset(dest);
		__ip_vs_unbind_svc(dest);
		kfree(dest);
	}
}
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
	spin_lock_bh(&stats->lock);
	memset(stats, 0, (char *)&stats->lock - (char *)stats);
	spin_unlock_bh(&stats->lock);
	ip_vs_zero_estimator(stats);
}
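/*
 * The memset above relies on struct ip_vs_stats keeping all of its
 * counters and rate estimates in front of the embedded spinlock, e.g.
 * (abbreviated, illustrative layout):
 *
 *	struct ip_vs_stats {
 *		__u32 conns, inpkts, outpkts;	// scheduled conns, packets
 *		__u64 inbytes, outbytes;	// byte counters
 *		__u32 cps, inpps, outpps, ...;	// estimated rates
 *		spinlock_t lock;		// zeroing stops here
 *	};
 */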
/*
 * Update a destination in the given service
 */
static void
__ip_vs_update_dest(struct ip_vs_service *svc,
		    struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
{
	int conn_flags;

	/* set the weight and the flags */
	atomic_set(&dest->weight, udest->weight);
	conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;

	/* check if local node and update the flags */
	if (inet_addr_type(udest->addr) == RTN_LOCAL) {
		conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
			| IP_VS_CONN_F_LOCALNODE;
	}

	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
	} else {
		/*
		 * Put the real service in ip_vs_rtable if not present.
		 * For now only for NAT!
		 */
		write_lock_bh(&__ip_vs_rs_lock);
		ip_vs_rs_hash(dest);
		write_unlock_bh(&__ip_vs_rs_lock);
	}
	atomic_set(&dest->conn_flags, conn_flags);

	/* bind the service */
	if (!dest->svc) {
		__ip_vs_bind_svc(dest, svc);
	} else {
		if (dest->svc != svc) {
			__ip_vs_unbind_svc(dest);
			ip_vs_zero_stats(&dest->stats);
			__ip_vs_bind_svc(dest, svc);
		}
	}

	/* set the dest status flags */
	dest->flags |= IP_VS_DEST_F_AVAILABLE;

	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	dest->u_threshold = udest->u_threshold;
	dest->l_threshold = udest->l_threshold;
}
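/*
 * Forwarding-method example (illustrative): for a NAT destination the
 * user's conn_flags carry IP_VS_CONN_F_MASQ (0) in the FWD_MASK bits,
 * so the else branch hashes the dest into ip_vs_rtable and NOOUTPUT
 * stays clear; for direct routing the mask bits select
 * IP_VS_CONN_F_DROUTE, the dest is not hashed, and
 * IP_VS_CONN_F_NOOUTPUT is set because replies bypass the director.
 */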
/*
 * Create a destination for the given service
 */
static int
ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
	       struct ip_vs_dest **dest_p)
{
	struct ip_vs_dest *dest;
	unsigned atype;

	atype = inet_addr_type(udest->addr);
	if (atype != RTN_LOCAL && atype != RTN_UNICAST)
		return -EINVAL;

	dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
	if (dest == NULL) {
		IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
		return -ENOMEM;
	}
	memset(dest, 0, sizeof(struct ip_vs_dest));

	dest->protocol = svc->protocol;
	dest->vaddr = svc->addr;
	dest->vport = svc->port;
	dest->vfwmark = svc->fwmark;
	dest->addr = udest->addr;
	dest->port = udest->port;

	atomic_set(&dest->activeconns, 0);
	atomic_set(&dest->inactconns, 0);
	atomic_set(&dest->persistconns, 0);
	atomic_set(&dest->refcnt, 0);

	INIT_LIST_HEAD(&dest->d_list);
	spin_lock_init(&dest->dst_lock);
	spin_lock_init(&dest->stats.lock);
	__ip_vs_update_dest(svc, dest, udest);
	ip_vs_new_estimator(&dest->stats);

	*dest_p = dest;

	return 0;
}
/*
 * Add a destination into an existing service
 */
static int
ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
{
	struct ip_vs_dest *dest;
	__u32 daddr = udest->addr;
	__u16 dport = udest->port;
	int ret;

	if (udest->weight < 0) {
		IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
		return -ERANGE;
	}

	if (udest->l_threshold > udest->u_threshold) {
		IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
			  "upper threshold\n");
		return -ERANGE;
	}

	/*
	 * Check if the dest already exists in the list
	 */
	dest = ip_vs_lookup_dest(svc, daddr, dport);
	if (dest != NULL) {
		IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
		return -EEXIST;
	}

	/*
	 * Check if the dest already exists in the trash and
	 * is from the same service
	 */
	dest = ip_vs_trash_get_dest(svc, daddr, dport);
	if (dest != NULL) {
		IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
			  "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
			  NIPQUAD(daddr), ntohs(dport),
			  atomic_read(&dest->refcnt),
			  dest->vfwmark,
			  NIPQUAD(dest->vaddr),
			  ntohs(dest->vport));
		__ip_vs_update_dest(svc, dest, udest);

		/*
		 * Get the destination from the trash
		 */
		list_del(&dest->n_list);

		ip_vs_new_estimator(&dest->stats);

		write_lock_bh(&__ip_vs_svc_lock);

		/*
		 * Wait until all other svc users go away.
		 */
		IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);

		list_add(&dest->n_list, &svc->destinations);
		svc->num_dests++;

		/* call the update_service function of its scheduler */
		svc->scheduler->update_service(svc);

		write_unlock_bh(&__ip_vs_svc_lock);
		return 0;
	}

	/*
	 * Allocate and initialize the dest structure
	 */
	ret = ip_vs_new_dest(svc, udest, &dest);
	if (ret)
		return ret;

	/*
	 * Add the dest entry into the list
	 */
	atomic_inc(&dest->refcnt);

	write_lock_bh(&__ip_vs_svc_lock);

	/*
	 * Wait until all other svc users go away.
	 */
	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);

	list_add(&dest->n_list, &svc->destinations);
	svc->num_dests++;

	/* call the update_service function of its scheduler */
	svc->scheduler->update_service(svc);

	write_unlock_bh(&__ip_vs_svc_lock);

	return 0;
}
/*
 * Edit a destination in the given service
 */
static int
ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
{
	struct ip_vs_dest *dest;
	__u32 daddr = udest->addr;
	__u16 dport = udest->port;

	if (udest->weight < 0) {
		IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
		return -ERANGE;
	}

	if (udest->l_threshold > udest->u_threshold) {
		IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
			  "upper threshold\n");
		return -ERANGE;
	}

	/*
	 * Lookup the destination list
	 */
	dest = ip_vs_lookup_dest(svc, daddr, dport);
	if (dest == NULL) {
		IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
		return -ENOENT;
	}

	__ip_vs_update_dest(svc, dest, udest);

	write_lock_bh(&__ip_vs_svc_lock);

	/* Wait until all other svc users go away */
	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);

	/* call the update_service, because server weight may be changed */
	svc->scheduler->update_service(svc);

	write_unlock_bh(&__ip_vs_svc_lock);

	return 0;
}
/*
 * Delete a destination (must be already unlinked from the service)
 */
static void __ip_vs_del_dest(struct ip_vs_dest *dest)
{
	ip_vs_kill_estimator(&dest->stats);

	/*
	 * Remove it from the d-linked list with the real services.
	 */
	write_lock_bh(&__ip_vs_rs_lock);
	ip_vs_rs_unhash(dest);
	write_unlock_bh(&__ip_vs_rs_lock);

	/*
	 * Decrease the refcnt of the dest, and free the dest
	 * if nobody refers to it (refcnt=0). Otherwise, throw
	 * the destination into the trash.
	 */
	if (atomic_dec_and_test(&dest->refcnt)) {
		ip_vs_dst_reset(dest);
		/* simply decrease svc->refcnt here, let the caller check
		   and release the service if nobody refers to it.
		   Only user context can release destination and service,
		   and only one user context can update virtual service at a
		   time, so the operation here is OK */
		atomic_dec(&dest->svc->refcnt);
		kfree(dest);
	} else {
		IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
			  "refcnt=%d\n",
			  NIPQUAD(dest->addr), ntohs(dest->port),
			  atomic_read(&dest->refcnt));
		list_add(&dest->n_list, &ip_vs_dest_trash);
		atomic_inc(&dest->refcnt);
	}
}
/*
 * Unlink a destination from the given service
 */
static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
				struct ip_vs_dest *dest,
				int svcupd)
{
	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;

	/*
	 * Remove it from the d-linked destination list.
	 */
	list_del(&dest->n_list);
	svc->num_dests--;
	if (svcupd) {
		/*
		 * Call the update_service function of its scheduler
		 */
		svc->scheduler->update_service(svc);
	}
}
/*
 * Delete a destination server in the given service
 */
static int
ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
{
	struct ip_vs_dest *dest;
	__u32 daddr = udest->addr;
	__u16 dport = udest->port;

	dest = ip_vs_lookup_dest(svc, daddr, dport);
	if (dest == NULL) {
		IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
		return -ENOENT;
	}

	write_lock_bh(&__ip_vs_svc_lock);

	/*
	 * Wait until all other svc users go away.
	 */
	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);

	/*
	 * Unlink dest from the service
	 */
	__ip_vs_unlink_dest(svc, dest, 1);

	write_unlock_bh(&__ip_vs_svc_lock);

	/*
	 * Delete the destination
	 */
	__ip_vs_del_dest(dest);

	return 0;
}
/*
 * Add a service into the service hash table
 */
static int
ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
{
	int ret = 0;
	struct ip_vs_scheduler *sched = NULL;
	struct ip_vs_service *svc = NULL;

	/* increase the module use count */
	ip_vs_use_count_inc();

	/* Lookup the scheduler by 'u->sched_name' */
	sched = ip_vs_scheduler_get(u->sched_name);
	if (sched == NULL) {
		IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
			   u->sched_name);
		ret = -ENOENT;
		goto out_mod_dec;
	}

	svc = (struct ip_vs_service *)
		kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
	if (svc == NULL) {
		IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
		ret = -ENOMEM;
		goto out_err;
	}
	memset(svc, 0, sizeof(struct ip_vs_service));

	/* I'm the first user of the service */
	atomic_set(&svc->usecnt, 1);
	atomic_set(&svc->refcnt, 0);

	svc->protocol = u->protocol;
	svc->addr = u->addr;
	svc->port = u->port;
	svc->fwmark = u->fwmark;
	svc->flags = u->flags;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;

	INIT_LIST_HEAD(&svc->destinations);
	rwlock_init(&svc->sched_lock);
	spin_lock_init(&svc->stats.lock);

	/* Bind the scheduler */
	ret = ip_vs_bind_scheduler(svc, sched);
	if (ret)
		goto out_err;
	sched = NULL;

	/* Update the virtual service counters */
	if (svc->port == FTPPORT)
		atomic_inc(&ip_vs_ftpsvc_counter);
	else if (svc->port == 0)
		atomic_inc(&ip_vs_nullsvc_counter);

	ip_vs_new_estimator(&svc->stats);
	ip_vs_num_services++;

	/* Hash the service into the service table */
	write_lock_bh(&__ip_vs_svc_lock);
	ip_vs_svc_hash(svc);
	write_unlock_bh(&__ip_vs_svc_lock);

	*svc_p = svc;
	return 0;

  out_err:
	if (svc != NULL) {
		if (svc->scheduler)
			ip_vs_unbind_scheduler(svc);
		if (svc->inc) {
			local_bh_disable();
			ip_vs_app_inc_put(svc->inc);
			local_bh_enable();
		}
		kfree(svc);
	}
	ip_vs_scheduler_put(sched);

  out_mod_dec:
	/* decrease the module use count */
	ip_vs_use_count_dec();

	return ret;
}
/*
 * Edit a service and bind it with a new scheduler
 */
static int
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
{
	struct ip_vs_scheduler *sched, *old_sched;
	int ret = 0;

	/*
	 * Lookup the scheduler, by 'u->sched_name'
	 */
	sched = ip_vs_scheduler_get(u->sched_name);
	if (sched == NULL) {
		IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
			   u->sched_name);
		return -ENOENT;
	}

	write_lock_bh(&__ip_vs_svc_lock);

	/*
	 * Wait until all other svc users go away.
	 */
	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);

	/*
	 * Set the flags and timeout value
	 */
	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;

	old_sched = svc->scheduler;
	if (sched != old_sched) {
		/*
		 * Unbind the old scheduler
		 */
		if ((ret = ip_vs_unbind_scheduler(svc))) {
			old_sched = sched;
			goto out;
		}

		/*
		 * Bind the new scheduler
		 */
		if ((ret = ip_vs_bind_scheduler(svc, sched))) {
			/*
			 * If ip_vs_bind_scheduler fails, restore the old
			 * scheduler.
			 * The main reason of failure is out of memory.
			 *
			 * The question is if the old scheduler can be
			 * restored all the time. TODO: if it cannot be
			 * restored some time, we must delete the service,
			 * otherwise the system may crash.
			 */
			ip_vs_bind_scheduler(svc, old_sched);
			old_sched = sched;
			goto out;
		}
	}

  out:
	write_unlock_bh(&__ip_vs_svc_lock);

	if (old_sched)
		ip_vs_scheduler_put(old_sched);

	return ret;
}
/*
 * Delete a service from the service list
 * - The service must be unlinked, unlocked and not referenced!
 * - We are called under _bh lock
 */
static void __ip_vs_del_service(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest, *nxt;
	struct ip_vs_scheduler *old_sched;

	ip_vs_num_services--;
	ip_vs_kill_estimator(&svc->stats);

	/* Unbind scheduler */
	old_sched = svc->scheduler;
	ip_vs_unbind_scheduler(svc);
	if (old_sched)
		ip_vs_scheduler_put(old_sched);

	/* Unbind app inc */
	if (svc->inc) {
		ip_vs_app_inc_put(svc->inc);
		svc->inc = NULL;
	}

	/*
	 * Unlink the whole destination list
	 */
	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
		__ip_vs_unlink_dest(svc, dest, 0);
		__ip_vs_del_dest(dest);
	}

	/*
	 * Update the virtual service counters
	 */
	if (svc->port == FTPPORT)
		atomic_dec(&ip_vs_ftpsvc_counter);
	else if (svc->port == 0)
		atomic_dec(&ip_vs_nullsvc_counter);

	/*
	 * Free the service if nobody refers to it
	 */
	if (atomic_read(&svc->refcnt) == 0)
		kfree(svc);

	/* decrease the module use count */
	ip_vs_use_count_dec();
}
/*
 * Delete a service from the service list
 */
static int ip_vs_del_service(struct ip_vs_service *svc)
{
	if (svc == NULL)
		return -EEXIST;

	/*
	 * Unhash it from the service table
	 */
	write_lock_bh(&__ip_vs_svc_lock);

	ip_vs_svc_unhash(svc);

	/*
	 * Wait until all the svc users go away.
	 */
	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);

	__ip_vs_del_service(svc);

	write_unlock_bh(&__ip_vs_svc_lock);

	return 0;
}
/*
 * Flush all the virtual services
 */
static int ip_vs_flush(void)
{
	int idx;
	struct ip_vs_service *svc, *nxt;

	/*
	 * Flush the service table hashed by <protocol,addr,port>
	 */
	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
			write_lock_bh(&__ip_vs_svc_lock);
			ip_vs_svc_unhash(svc);
			/*
			 * Wait until all the svc users go away.
			 */
			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
			__ip_vs_del_service(svc);
			write_unlock_bh(&__ip_vs_svc_lock);
		}
	}

	/*
	 * Flush the service table hashed by fwmark
	 */
	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry_safe(svc, nxt,
					 &ip_vs_svc_fwm_table[idx], f_list) {
			write_lock_bh(&__ip_vs_svc_lock);
			ip_vs_svc_unhash(svc);
			/*
			 * Wait until all the svc users go away.
			 */
			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
			__ip_vs_del_service(svc);
			write_unlock_bh(&__ip_vs_svc_lock);
		}
	}

	return 0;
}
/*
 * Zero counters in a service or all services
 */
static int ip_vs_zero_service(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest;

	write_lock_bh(&__ip_vs_svc_lock);
	list_for_each_entry(dest, &svc->destinations, n_list) {
		ip_vs_zero_stats(&dest->stats);
	}
	ip_vs_zero_stats(&svc->stats);
	write_unlock_bh(&__ip_vs_svc_lock);
	return 0;
}

static int ip_vs_zero_all(void)
{
	int idx;
	struct ip_vs_service *svc;

	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
			ip_vs_zero_service(svc);
		}
	}

	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
			ip_vs_zero_service(svc);
		}
	}

	ip_vs_zero_stats(&ip_vs_stats);
	return 0;
}
static int
proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
		     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val = *valp;
	int rc;

	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
	if (write && (*valp != val)) {
		if ((*valp < 0) || (*valp > 3)) {
			/* Restore the correct value */
			*valp = val;
		} else {
			update_defense_level();
		}
	}
	return rc;
}
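/*
 * Example (illustrative): writing 4 to /proc/sys/net/ipv4/vs/drop_entry
 * fails the 0..3 range check above and the previous value is restored;
 * writing a valid new mode, e.g. 1, takes effect and
 * update_defense_level() re-evaluates the defense state immediately
 * rather than waiting for the next defense timer tick.
 */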
static int
proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
		       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val[2];
	int rc;

	/* backup the value first */
	memcpy(val, valp, sizeof(val));

	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
	if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
		/* Restore the correct value */
		memcpy(valp, val, sizeof(val));
	}
	return rc;
}
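/*
 * Example (illustrative): "echo 3 50 > /proc/sys/net/ipv4/vs/sync_threshold"
 * keeps the pair, since 0 <= 3 < 50 (the sync code starts synchronizing
 * a connection after the first value's packet count and repeats with the
 * second's period), while "echo 50 3" fails the valp[0] >= valp[1] check
 * above and the previous values are restored.
 */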
/*
 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
 */

static struct ctl_table vs_vars[] = {
	{
		.ctl_name	= NET_IPV4_VS_AMEMTHRESH,
		.procname	= "amemthresh",
		.data		= &sysctl_ip_vs_amemthresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
#ifdef CONFIG_IP_VS_DEBUG
	{
		.ctl_name	= NET_IPV4_VS_DEBUG_LEVEL,
		.procname	= "debug_level",
		.data		= &sysctl_ip_vs_debug_level,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
#endif
	{
		.ctl_name	= NET_IPV4_VS_AMDROPRATE,
		.procname	= "am_droprate",
		.data		= &sysctl_ip_vs_am_droprate,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_VS_DROP_ENTRY,
		.procname	= "drop_entry",
		.data		= &sysctl_ip_vs_drop_entry,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_do_defense_mode,
	},
	{
		.ctl_name	= NET_IPV4_VS_DROP_PACKET,
		.procname	= "drop_packet",
		.data		= &sysctl_ip_vs_drop_packet,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_do_defense_mode,
	},
	{
		.ctl_name	= NET_IPV4_VS_SECURE_TCP,
		.procname	= "secure_tcp",
		.data		= &sysctl_ip_vs_secure_tcp,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_do_defense_mode,
	},
#if 0	/* vs_timeout_table_dos is not defined here; kept compiled out */
	{
		.ctl_name	= NET_IPV4_VS_TO_ES,
		.procname	= "timeout_established",
		.data		= &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_SS,
		.procname	= "timeout_synsent",
		.data		= &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_SR,
		.procname	= "timeout_synrecv",
		.data		= &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_FW,
		.procname	= "timeout_finwait",
		.data		= &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_TW,
		.procname	= "timeout_timewait",
		.data		= &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_CL,
		.procname	= "timeout_close",
		.data		= &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_CW,
		.procname	= "timeout_closewait",
		.data		= &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_LA,
		.procname	= "timeout_lastack",
		.data		= &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_LI,
		.procname	= "timeout_listen",
		.data		= &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_SA,
		.procname	= "timeout_synack",
		.data		= &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_UDP,
		.procname	= "timeout_udp",
		.data		= &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_ICMP,
		.procname	= "timeout_icmp",
		.data		= &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
#endif
	{
		.ctl_name	= NET_IPV4_VS_CACHE_BYPASS,
		.procname	= "cache_bypass",
		.data		= &sysctl_ip_vs_cache_bypass,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_VS_EXPIRE_NODEST_CONN,
		.procname	= "expire_nodest_conn",
		.data		= &sysctl_ip_vs_expire_nodest_conn,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
		.procname	= "expire_quiescent_template",
		.data		= &sysctl_ip_vs_expire_quiescent_template,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_VS_SYNC_THRESHOLD,
		.procname	= "sync_threshold",
		.data		= &sysctl_ip_vs_sync_threshold,
		.maxlen		= sizeof(sysctl_ip_vs_sync_threshold),
		.mode		= 0644,
		.proc_handler	= &proc_do_sync_threshold,
	},
	{
		.ctl_name	= NET_IPV4_VS_NAT_ICMP_SEND,
		.procname	= "nat_icmp_send",
		.data		= &sysctl_ip_vs_nat_icmp_send,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};
static ctl_table vs_table[] = {
	{
		.ctl_name	= NET_IPV4_VS,
		.procname	= "vs",
		.mode		= 0555,
		.child		= vs_vars,
	},
	{ .ctl_name = 0 }
};

static ctl_table ipvs_ipv4_table[] = {
	{
		.ctl_name	= NET_IPV4,
		.procname	= "ipv4",
		.mode		= 0555,
		.child		= vs_table,
	},
	{ .ctl_name = 0 }
};

static ctl_table vs_root_table[] = {
	{
		.ctl_name	= CTL_NET,
		.procname	= "net",
		.mode		= 0555,
		.child		= ipvs_ipv4_table,
	},
	{ .ctl_name = 0 }
};

static struct ctl_table_header * sysctl_header;
#ifdef CONFIG_PROC_FS

struct ip_vs_iter {
	struct list_head *table;
	int bucket;
};

/*
 * Write the contents of the VS rule table to a PROCfs file.
 * (It is kept just for backward compatibility)
 */
static inline const char *ip_vs_fwd_name(unsigned flags)
{
	switch (flags & IP_VS_CONN_F_FWD_MASK) {
	case IP_VS_CONN_F_LOCALNODE:
		return "Local";
	case IP_VS_CONN_F_TUNNEL:
		return "Tunnel";
	case IP_VS_CONN_F_DROUTE:
		return "Route";
	default:
		return "Masq";
	}
}
/* Get the Nth entry in the two lists */
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
	struct ip_vs_iter *iter = seq->private;
	int idx;
	struct ip_vs_service *svc;

	/* look in hash by protocol */
	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
			if (pos-- == 0) {
				iter->table = ip_vs_svc_table;
				iter->bucket = idx;
				return svc;
			}
		}
	}

	/* keep looking in fwmark */
	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
			if (pos-- == 0) {
				iter->table = ip_vs_svc_fwm_table;
				iter->bucket = idx;
				return svc;
			}
		}
	}

	return NULL;
}
static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock_bh(&__ip_vs_svc_lock);
	return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct list_head *e;
	struct ip_vs_iter *iter;
	struct ip_vs_service *svc;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_info_array(seq,0);

	svc = v;
	iter = seq->private;

	if (iter->table == ip_vs_svc_table) {
		/* next service in table hashed by protocol */
		if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
			return list_entry(e, struct ip_vs_service, s_list);

		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
			list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
					    s_list) {
				return svc;
			}
		}

		iter->table = ip_vs_svc_fwm_table;
		iter->bucket = -1;
		goto scan_fwmark;
	}

	/* next service in hashed by fwmark */
	if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
		return list_entry(e, struct ip_vs_service, f_list);

 scan_fwmark:
	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
		list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
				    f_list)
			return svc;
	}

	return NULL;
}
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock_bh(&__ip_vs_svc_lock);
}
static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq,
			"IP Virtual Server version %d.%d.%d (size=%d)\n",
			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
		seq_puts(seq,
			 "Prot LocalAddress:Port Scheduler Flags\n");
		seq_puts(seq,
			 "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
	} else {
		const struct ip_vs_service *svc = v;
		const struct ip_vs_iter *iter = seq->private;
		const struct ip_vs_dest *dest;

		if (iter->table == ip_vs_svc_table)
			seq_printf(seq, "%s  %08X:%04X %s ",
				   ip_vs_proto_name(svc->protocol),
				   ntohl(svc->addr),
				   ntohs(svc->port),
				   svc->scheduler->name);
		else
			seq_printf(seq, "FWM  %08X %s ",
				   svc->fwmark, svc->scheduler->name);

		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
			seq_printf(seq, "persistent %d %08X\n",
				   svc->timeout,
				   ntohl(svc->netmask));
		else
			seq_putc(seq, '\n');

		list_for_each_entry(dest, &svc->destinations, n_list) {
			seq_printf(seq,
				   "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
				   ntohl(dest->addr), ntohs(dest->port),
				   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
				   atomic_read(&dest->weight),
				   atomic_read(&dest->activeconns),
				   atomic_read(&dest->inactconns));
		}
	}
	return 0;
}
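/*
 * Sample /proc/net/ip_vs output (addresses in hex; values illustrative):
 *
 *	IP Virtual Server version 1.2.1 (size=4096)
 *	Prot LocalAddress:Port Scheduler Flags
 *	  -> RemoteAddress:Port Forward Weight ActiveConn InActConn
 *	TCP  0A000001:0050 wlc
 *	  -> 0A000002:0050      Masq    1      3          12
 */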
static struct seq_operations ip_vs_info_seq_ops = {
	.start = ip_vs_info_seq_start,
	.next  = ip_vs_info_seq_next,
	.stop  = ip_vs_info_seq_stop,
	.show  = ip_vs_info_seq_show,
};

static int ip_vs_info_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;

	rc = seq_open(file, &ip_vs_info_seq_ops);
	if (rc)
		goto out_kfree;

	seq = file->private_data;
	seq->private = s;
	memset(s, 0, sizeof(*s));
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

static struct file_operations ip_vs_info_fops = {
	.owner	 = THIS_MODULE,
	.open    = ip_vs_info_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};

#endif
struct ip_vs_stats ip_vs_stats;

#ifdef CONFIG_PROC_FS
static int ip_vs_stats_show(struct seq_file *seq, void *v)
{

/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		 "   Total Incoming Outgoing         Incoming         Outgoing\n");
	seq_puts(seq,
		 "   Conns  Packets  Packets            Bytes            Bytes\n");

	spin_lock_bh(&ip_vs_stats.lock);
	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
		   ip_vs_stats.inpkts, ip_vs_stats.outpkts,
		   (unsigned long long) ip_vs_stats.inbytes,
		   (unsigned long long) ip_vs_stats.outbytes);

/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		 " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
	seq_printf(seq,"%8X %8X %8X %16X %16X\n",
		   ip_vs_stats.cps,
		   ip_vs_stats.inpps,
		   ip_vs_stats.outpps,
		   ip_vs_stats.inbps,
		   ip_vs_stats.outbps);
	spin_unlock_bh(&ip_vs_stats.lock);

	return 0;
}
static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open(file, ip_vs_stats_show, NULL);
}

static struct file_operations ip_vs_stats_fops = {
	.owner	 = THIS_MODULE,
	.open    = ip_vs_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

#endif
/*
 * Set timeout values for tcp tcpfin udp in the timeout_table.
 */
static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
{
	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
		  u->tcp_timeout,
		  u->tcp_fin_timeout,
		  u->udp_timeout);

#ifdef CONFIG_IP_VS_PROTO_TCP
	if (u->tcp_timeout) {
		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
			= u->tcp_timeout * HZ;
	}

	if (u->tcp_fin_timeout) {
		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
			= u->tcp_fin_timeout * HZ;
	}
#endif

#ifdef CONFIG_IP_VS_PROTO_UDP
	if (u->udp_timeout) {
		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
			= u->udp_timeout * HZ;
	}
#endif
	return 0;
}
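/*
 * Example (illustrative): a request of {tcp_timeout = 900,
 * tcp_fin_timeout = 120, udp_timeout = 300} sets the TCP ESTABLISHED,
 * TCP FIN_WAIT and UDP NORMAL state timers to 900*HZ, 120*HZ and
 * 300*HZ jiffies respectively; any field left at zero keeps the
 * current timeout unchanged.
 */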
#define SET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
				 sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
#define MAX_ARG_LEN		SVCDEST_ARG_LEN

static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
	[SET_CMDID(IP_VS_SO_SET_ADD)]		= SERVICE_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_EDIT)]		= SERVICE_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_DEL)]		= SERVICE_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_FLUSH)]		= 0,
	[SET_CMDID(IP_VS_SO_SET_ADDDEST)]	= SVCDEST_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_DELDEST)]	= SVCDEST_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_EDITDEST)]	= SVCDEST_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_TIMEOUT)]	= TIMEOUT_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]	= DAEMON_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]	= DAEMON_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_ZERO)]		= SERVICE_ARG_LEN,
};
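/*
 * Userspace sketch (illustrative; not kernel code, so compiled out):
 * commands arrive via setsockopt() on a raw IP socket and len must
 * equal the per-command entry in set_arglen[].  The address, port and
 * helper name below are made up; the struct definitions normally come
 * from the ipvsadm sources or an exported ip_vs.h.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int add_http_service(void)
{
	struct ip_vs_service_user usvc = {
		.protocol   = IPPROTO_TCP,
		.addr       = htonl(0x0a000001),	/* 10.0.0.1 */
		.port       = htons(80),
		.sched_name = "wlc",
	};
	int sockfd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	/* len must equal SERVICE_ARG_LEN for IP_VS_SO_SET_ADD */
	return setsockopt(sockfd, IPPROTO_IP, IP_VS_SO_SET_ADD,
			  &usvc, sizeof(usvc));
}
#endif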
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
	int ret;
	unsigned char arg[MAX_ARG_LEN];
	struct ip_vs_service_user *usvc;
	struct ip_vs_service *svc;
	struct ip_vs_dest_user *udest;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (len != set_arglen[SET_CMDID(cmd)]) {
		IP_VS_ERR("set_ctl: len %u != %u\n",
			  len, set_arglen[SET_CMDID(cmd)]);
		return -EINVAL;
	}

	if (copy_from_user(arg, user, len) != 0)
		return -EFAULT;

	/* increase the module use count */
	ip_vs_use_count_inc();

	if (down_interruptible(&__ip_vs_mutex)) {
		ret = -ERESTARTSYS;
		goto out_dec;
	}

	if (cmd == IP_VS_SO_SET_FLUSH) {
		/* Flush the virtual service */
		ret = ip_vs_flush();
		goto out_unlock;
	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
		/* Set timeout values for (tcp tcpfin udp) */
		ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
		goto out_unlock;
	} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
		ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
		goto out_unlock;
	} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
		ret = stop_sync_thread(dm->state);
		goto out_unlock;
	}

	usvc = (struct ip_vs_service_user *)arg;
	udest = (struct ip_vs_dest_user *)(usvc + 1);

	if (cmd == IP_VS_SO_SET_ZERO) {
		/* if no service address is set, zero counters in all */
		if (!usvc->fwmark && !usvc->addr && !usvc->port) {
			ret = ip_vs_zero_all();
			goto out_unlock;
		}
	}

	/* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
	if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
		IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
			  usvc->protocol, NIPQUAD(usvc->addr),
			  ntohs(usvc->port), usvc->sched_name);
		ret = -EFAULT;
		goto out_unlock;
	}

	/* Lookup the exact service by <protocol, addr, port> or fwmark */
	if (usvc->fwmark == 0)
		svc = __ip_vs_service_get(usvc->protocol,
					  usvc->addr, usvc->port);
	else
		svc = __ip_vs_svc_fwm_get(usvc->fwmark);

	if (cmd != IP_VS_SO_SET_ADD
	    && (svc == NULL || svc->protocol != usvc->protocol)) {
		ret = -ESRCH;
		goto out_unlock;
	}

	switch (cmd) {
	case IP_VS_SO_SET_ADD:
		if (svc != NULL)
			ret = -EEXIST;
		else
			ret = ip_vs_add_service(usvc, &svc);
		break;
	case IP_VS_SO_SET_EDIT:
		ret = ip_vs_edit_service(svc, usvc);
		break;
	case IP_VS_SO_SET_DEL:
		ret = ip_vs_del_service(svc);
		if (!ret)
			goto out_unlock;
		break;
	case IP_VS_SO_SET_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	case IP_VS_SO_SET_ADDDEST:
		ret = ip_vs_add_dest(svc, udest);
		break;
	case IP_VS_SO_SET_EDITDEST:
		ret = ip_vs_edit_dest(svc, udest);
		break;
	case IP_VS_SO_SET_DELDEST:
		ret = ip_vs_del_dest(svc, udest);
		break;
	default:
		ret = -EINVAL;
	}

	if (svc)
		ip_vs_service_put(svc);

  out_unlock:
	up(&__ip_vs_mutex);
  out_dec:
	/* decrease the module use count */
	ip_vs_use_count_dec();

	return ret;
}
static void
ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
{
	spin_lock_bh(&src->lock);
	memcpy(dst, src, (char*)&src->lock - (char*)src);
	spin_unlock_bh(&src->lock);
}

static void
ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
{
	dst->protocol = src->protocol;
	dst->addr = src->addr;
	dst->port = src->port;
	dst->fwmark = src->fwmark;
	strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
	dst->flags = src->flags;
	dst->timeout = src->timeout / HZ;
	dst->netmask = src->netmask;
	dst->num_dests = src->num_dests;
	ip_vs_copy_stats(&dst->stats, &src->stats);
}
static inline int
__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
			    struct ip_vs_get_services __user *uptr)
{
	int idx, count = 0;
	struct ip_vs_service *svc;
	struct ip_vs_service_entry entry;
	int ret = 0;

	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
			if (count >= get->num_services)
				goto out;
			memset(&entry, 0, sizeof(entry));
			ip_vs_copy_service(&entry, svc);
			if (copy_to_user(&uptr->entrytable[count],
					 &entry, sizeof(entry))) {
				ret = -EFAULT;
				goto out;
			}
			count++;
		}
	}

	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
			if (count >= get->num_services)
				goto out;
			memset(&entry, 0, sizeof(entry));
			ip_vs_copy_service(&entry, svc);
			if (copy_to_user(&uptr->entrytable[count],
					 &entry, sizeof(entry))) {
				ret = -EFAULT;
				goto out;
			}
			count++;
		}
	}
  out:
	return ret;
}
static inline int
__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
			 struct ip_vs_get_dests __user *uptr)
{
	struct ip_vs_service *svc;
	int ret = 0;

	if (get->fwmark)
		svc = __ip_vs_svc_fwm_get(get->fwmark);
	else
		svc = __ip_vs_service_get(get->protocol,
					  get->addr, get->port);
	if (svc) {
		int count = 0;
		struct ip_vs_dest *dest;
		struct ip_vs_dest_entry entry;

		list_for_each_entry(dest, &svc->destinations, n_list) {
			if (count >= get->num_dests)
				break;

			entry.addr = dest->addr;
			entry.port = dest->port;
			entry.conn_flags = atomic_read(&dest->conn_flags);
			entry.weight = atomic_read(&dest->weight);
			entry.u_threshold = dest->u_threshold;
			entry.l_threshold = dest->l_threshold;
			entry.activeconns = atomic_read(&dest->activeconns);
			entry.inactconns = atomic_read(&dest->inactconns);
			entry.persistconns = atomic_read(&dest->persistconns);
			ip_vs_copy_stats(&entry.stats, &dest->stats);
			if (copy_to_user(&uptr->entrytable[count],
					 &entry, sizeof(entry))) {
				ret = -EFAULT;
				break;
			}
			count++;
		}
		ip_vs_service_put(svc);
	} else
		ret = -ESRCH;
	return ret;
}
static inline void
__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
{
#ifdef CONFIG_IP_VS_PROTO_TCP
	u->tcp_timeout =
		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
	u->tcp_fin_timeout =
		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
#endif

#ifdef CONFIG_IP_VS_PROTO_UDP
	u->udp_timeout =
		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
#endif
}
#define GET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)

static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
	[GET_CMDID(IP_VS_SO_GET_VERSION)]	= 64,
	[GET_CMDID(IP_VS_SO_GET_INFO)]		= GET_INFO_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_SERVICES)]	= GET_SERVICES_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_SERVICE)]	= GET_SERVICE_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_DESTS)]		= GET_DESTS_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_TIMEOUT)]	= GET_TIMEOUT_ARG_LEN,
	[GET_CMDID(IP_VS_SO_GET_DAEMON)]	= GET_DAEMON_ARG_LEN,
};
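/*
 * Userspace sketch (illustrative fragment; not kernel code, compiled
 * out): reads go through getsockopt() with *len preloaded to the
 * expected size.  'sockfd' is assumed to be a raw IP socket as in the
 * set_ctl sketch above.
 */
#if 0
	struct ip_vs_getinfo info;
	socklen_t len = sizeof(info);

	if (getsockopt(sockfd, IPPROTO_IP, IP_VS_SO_GET_INFO,
		       &info, &len) == 0)
		printf("IPVS version %d, conn table size %d, %d services\n",
		       info.version, info.size, info.num_services);
#endif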
static int
do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
	unsigned char arg[128];
	int ret = 0;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (*len < get_arglen[GET_CMDID(cmd)]) {
		IP_VS_ERR("get_ctl: len %u < %u\n",
			  *len, get_arglen[GET_CMDID(cmd)]);
		return -EINVAL;
	}

	if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
		return -EFAULT;

	if (down_interruptible(&__ip_vs_mutex))
		return -ERESTARTSYS;

	switch (cmd) {
	case IP_VS_SO_GET_VERSION:
	{
		char buf[64];

		sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
		if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
			ret = -EFAULT;
			goto out;
		}
		*len = strlen(buf)+1;
	}
	break;

	case IP_VS_SO_GET_INFO:
	{
		struct ip_vs_getinfo info;
		info.version = IP_VS_VERSION_CODE;
		info.size = IP_VS_CONN_TAB_SIZE;
		info.num_services = ip_vs_num_services;
		if (copy_to_user(user, &info, sizeof(info)) != 0)
			ret = -EFAULT;
	}
	break;

	case IP_VS_SO_GET_SERVICES:
	{
		struct ip_vs_get_services *get;
		int size;

		get = (struct ip_vs_get_services *)arg;
		size = sizeof(*get) +
			sizeof(struct ip_vs_service_entry) * get->num_services;
		if (*len != size) {
			IP_VS_ERR("length: %u != %u\n", *len, size);
			ret = -EINVAL;
			goto out;
		}
		ret = __ip_vs_get_service_entries(get, user);
	}
	break;

	case IP_VS_SO_GET_SERVICE:
	{
		struct ip_vs_service_entry *entry;
		struct ip_vs_service *svc;

		entry = (struct ip_vs_service_entry *)arg;
		if (entry->fwmark)
			svc = __ip_vs_svc_fwm_get(entry->fwmark);
		else
			svc = __ip_vs_service_get(entry->protocol,
						  entry->addr, entry->port);
		if (svc) {
			ip_vs_copy_service(entry, svc);
			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
				ret = -EFAULT;
			ip_vs_service_put(svc);
		} else
			ret = -ESRCH;
	}
	break;

	case IP_VS_SO_GET_DESTS:
	{
		struct ip_vs_get_dests *get;
		int size;

		get = (struct ip_vs_get_dests *)arg;
		size = sizeof(*get) +
			sizeof(struct ip_vs_dest_entry) * get->num_dests;
		if (*len != size) {
			IP_VS_ERR("length: %u != %u\n", *len, size);
			ret = -EINVAL;
			goto out;
		}
		ret = __ip_vs_get_dest_entries(get, user);
	}
	break;

	case IP_VS_SO_GET_TIMEOUT:
	{
		struct ip_vs_timeout_user t;

		__ip_vs_get_timeouts(&t);
		if (copy_to_user(user, &t, sizeof(t)) != 0)
			ret = -EFAULT;
	}
	break;

	case IP_VS_SO_GET_DAEMON:
	{
		struct ip_vs_daemon_user d[2];

		memset(&d, 0, sizeof(d));
		if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
			d[0].state = IP_VS_STATE_MASTER;
			strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
			d[0].syncid = ip_vs_master_syncid;
		}
		if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
			d[1].state = IP_VS_STATE_BACKUP;
			strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
			d[1].syncid = ip_vs_backup_syncid;
		}
		if (copy_to_user(user, &d, sizeof(d)) != 0)
			ret = -EFAULT;
	}
	break;

	default:
		ret = -EINVAL;
	}

  out:
	up(&__ip_vs_mutex);
	return ret;
}
static struct nf_sockopt_ops ip_vs_sockopts = {
	.pf		= PF_INET,
	.set_optmin	= IP_VS_BASE_CTL,
	.set_optmax	= IP_VS_SO_SET_MAX+1,
	.set		= do_ip_vs_set_ctl,
	.get_optmin	= IP_VS_BASE_CTL,
	.get_optmax	= IP_VS_SO_GET_MAX+1,
	.get		= do_ip_vs_get_ctl,
};
int ip_vs_control_init(void)
{
	int ret;
	int idx;

	ret = nf_register_sockopt(&ip_vs_sockopts);
	if (ret) {
		IP_VS_ERR("cannot register sockopt.\n");
		return ret;
	}

	proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
	proc_net_fops_create("ip_vs_stats", 0, &ip_vs_stats_fops);

	sysctl_header = register_sysctl_table(vs_root_table, 0);

	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
	}
	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
	}

	memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
	spin_lock_init(&ip_vs_stats.lock);
	ip_vs_new_estimator(&ip_vs_stats);

	/* Hook the defense timer */
	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);

	return 0;
}
void ip_vs_control_cleanup(void)
{
	ip_vs_trash_cleanup();
	cancel_rearming_delayed_work(&defense_work);
	ip_vs_kill_estimator(&ip_vs_stats);
	unregister_sysctl_table(sysctl_header);
	proc_net_remove("ip_vs_stats");
	proc_net_remove("ip_vs");
	nf_unregister_sockopt(&ip_vs_sockopts);
}