Merge branch 'for-2.6.27' of git://linux-nfs.org/~bfields/linux
[linux-2.6] / net / ipv4 / ipvs / ip_vs_xmit.c
1 /*
2  * ip_vs_xmit.c: various packet transmitters for IPVS
3  *
4  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
5  *              Julian Anastasov <ja@ssi.bg>
6  *
7  *              This program is free software; you can redistribute it and/or
8  *              modify it under the terms of the GNU General Public License
9  *              as published by the Free Software Foundation; either version
10  *              2 of the License, or (at your option) any later version.
11  *
12  * Changes:
13  *
14  */
15
16 #include <linux/kernel.h>
17 #include <linux/tcp.h>                  /* for tcphdr */
18 #include <net/ip.h>
19 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
20 #include <net/udp.h>
21 #include <net/icmp.h>                   /* for icmp_send */
22 #include <net/route.h>                  /* for ip_route_output */
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25
26 #include <net/ip_vs.h>
27
28
29 /*
30  *      Destination cache to speed up outgoing route lookup
31  */
32 static inline void
33 __ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
34 {
35         struct dst_entry *old_dst;
36
37         old_dst = dest->dst_cache;
38         dest->dst_cache = dst;
39         dest->dst_rtos = rtos;
40         dst_release(old_dst);
41 }
42
43 static inline struct dst_entry *
44 __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
45 {
46         struct dst_entry *dst = dest->dst_cache;
47
48         if (!dst)
49                 return NULL;
50         if ((dst->obsolete || rtos != dest->dst_rtos) &&
51             dst->ops->check(dst, cookie) == NULL) {
52                 dest->dst_cache = NULL;
53                 dst_release(dst);
54                 return NULL;
55         }
56         dst_hold(dst);
57         return dst;
58 }
59
60 static struct rtable *
61 __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
62 {
63         struct rtable *rt;                      /* Route to the other host */
64         struct ip_vs_dest *dest = cp->dest;
65
66         if (dest) {
67                 spin_lock(&dest->dst_lock);
68                 if (!(rt = (struct rtable *)
69                       __ip_vs_dst_check(dest, rtos, 0))) {
70                         struct flowi fl = {
71                                 .oif = 0,
72                                 .nl_u = {
73                                         .ip4_u = {
74                                                 .daddr = dest->addr,
75                                                 .saddr = 0,
76                                                 .tos = rtos, } },
77                         };
78
79                         if (ip_route_output_key(&init_net, &rt, &fl)) {
80                                 spin_unlock(&dest->dst_lock);
81                                 IP_VS_DBG_RL("ip_route_output error, "
82                                              "dest: %u.%u.%u.%u\n",
83                                              NIPQUAD(dest->addr));
84                                 return NULL;
85                         }
86                         __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
87                         IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
88                                   NIPQUAD(dest->addr),
89                                   atomic_read(&rt->u.dst.__refcnt), rtos);
90                 }
91                 spin_unlock(&dest->dst_lock);
92         } else {
93                 struct flowi fl = {
94                         .oif = 0,
95                         .nl_u = {
96                                 .ip4_u = {
97                                         .daddr = cp->daddr,
98                                         .saddr = 0,
99                                         .tos = rtos, } },
100                 };
101
102                 if (ip_route_output_key(&init_net, &rt, &fl)) {
103                         IP_VS_DBG_RL("ip_route_output error, dest: "
104                                      "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
105                         return NULL;
106                 }
107         }
108
109         return rt;
110 }
111
112
113 /*
114  *      Release dest->dst_cache before a dest is removed
115  */
116 void
117 ip_vs_dst_reset(struct ip_vs_dest *dest)
118 {
119         struct dst_entry *old_dst;
120
121         old_dst = dest->dst_cache;
122         dest->dst_cache = NULL;
123         dst_release(old_dst);
124 }
125
/*
 *      Common transmit tail: flag @skb with ipvs_property, call
 *      skb_forward_csum() on it and pass it through the
 *      NF_INET_LOCAL_OUT hook with the device of route @rt as output
 *      device and dst_output as the continuation.
 *
 *      @skb is expanded more than once, so pass a plain variable.
 */
#define IP_VS_XMIT(skb, rt)                                             \
do {                                                                    \
        (skb)->ipvs_property = 1;                                       \
        skb_forward_csum(skb);                                          \
        NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL,                \
                (rt)->u.dst.dev, dst_output);                           \
} while (0)
133
134
135 /*
136  *      NULL transmitter (do nothing except return NF_ACCEPT)
137  */
138 int
139 ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
140                 struct ip_vs_protocol *pp)
141 {
142         /* we do not touch skb and do not need pskb ptr */
143         return NF_ACCEPT;
144 }
145
146
147 /*
148  *      Bypass transmitter
149  *      Let packets bypass the destination when the destination is not
150  *      available, it may be only used in transparent cache cluster.
151  */
152 int
153 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
154                   struct ip_vs_protocol *pp)
155 {
156         struct rtable *rt;                      /* Route to the other host */
157         struct iphdr  *iph = ip_hdr(skb);
158         u8     tos = iph->tos;
159         int    mtu;
160         struct flowi fl = {
161                 .oif = 0,
162                 .nl_u = {
163                         .ip4_u = {
164                                 .daddr = iph->daddr,
165                                 .saddr = 0,
166                                 .tos = RT_TOS(tos), } },
167         };
168
169         EnterFunction(10);
170
171         if (ip_route_output_key(&init_net, &rt, &fl)) {
172                 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
173                              "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
174                 goto tx_error_icmp;
175         }
176
177         /* MTU checking */
178         mtu = dst_mtu(&rt->u.dst);
179         if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
180                 ip_rt_put(rt);
181                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
182                 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
183                 goto tx_error;
184         }
185
186         /*
187          * Call ip_send_check because we are not sure it is called
188          * after ip_defrag. Is copy-on-write needed?
189          */
190         if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
191                 ip_rt_put(rt);
192                 return NF_STOLEN;
193         }
194         ip_send_check(ip_hdr(skb));
195
196         /* drop old route */
197         dst_release(skb->dst);
198         skb->dst = &rt->u.dst;
199
200         /* Another hack: avoid icmp_send in ip_fragment */
201         skb->local_df = 1;
202
203         IP_VS_XMIT(skb, rt);
204
205         LeaveFunction(10);
206         return NF_STOLEN;
207
208  tx_error_icmp:
209         dst_link_failure(skb);
210  tx_error:
211         kfree_skb(skb);
212         LeaveFunction(10);
213         return NF_STOLEN;
214 }
215
216
217 /*
218  *      NAT transmitter (only for outside-to-inside nat forwarding)
219  *      Not used for related ICMP
220  */
221 int
222 ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
223                struct ip_vs_protocol *pp)
224 {
225         struct rtable *rt;              /* Route to the other host */
226         int mtu;
227         struct iphdr *iph = ip_hdr(skb);
228
229         EnterFunction(10);
230
231         /* check if it is a connection of no-client-port */
232         if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
233                 __be16 _pt, *p;
234                 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
235                 if (p == NULL)
236                         goto tx_error;
237                 ip_vs_conn_fill_cport(cp, *p);
238                 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
239         }
240
241         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
242                 goto tx_error_icmp;
243
244         /* MTU checking */
245         mtu = dst_mtu(&rt->u.dst);
246         if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
247                 ip_rt_put(rt);
248                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
249                 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
250                 goto tx_error;
251         }
252
253         /* copy-on-write the packet before mangling it */
254         if (!skb_make_writable(skb, sizeof(struct iphdr)))
255                 goto tx_error_put;
256
257         if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
258                 goto tx_error_put;
259
260         /* drop old route */
261         dst_release(skb->dst);
262         skb->dst = &rt->u.dst;
263
264         /* mangle the packet */
265         if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
266                 goto tx_error;
267         ip_hdr(skb)->daddr = cp->daddr;
268         ip_send_check(ip_hdr(skb));
269
270         IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
271
272         /* FIXME: when application helper enlarges the packet and the length
273            is larger than the MTU of outgoing device, there will be still
274            MTU problem. */
275
276         /* Another hack: avoid icmp_send in ip_fragment */
277         skb->local_df = 1;
278
279         IP_VS_XMIT(skb, rt);
280
281         LeaveFunction(10);
282         return NF_STOLEN;
283
284   tx_error_icmp:
285         dst_link_failure(skb);
286   tx_error:
287         LeaveFunction(10);
288         kfree_skb(skb);
289         return NF_STOLEN;
290   tx_error_put:
291         ip_rt_put(rt);
292         goto tx_error;
293 }
294
295
296 /*
297  *   IP Tunneling transmitter
298  *
299  *   This function encapsulates the packet in a new IP packet, its
300  *   destination will be set to cp->daddr. Most code of this function
301  *   is taken from ipip.c.
302  *
303  *   It is used in VS/TUN cluster. The load balancer selects a real
304  *   server from a cluster based on a scheduling algorithm,
305  *   encapsulates the request packet and forwards it to the selected
306  *   server. For example, all real servers are configured with
307  *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
308  *   the encapsulated packet, it will decapsulate the packet, processe
309  *   the request and return the response packets directly to the client
310  *   without passing the load balancer. This can greatly increase the
311  *   scalability of virtual server.
312  *
313  *   Used for ANY protocol
314  */
315 int
316 ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
317                   struct ip_vs_protocol *pp)
318 {
319         struct rtable *rt;                      /* Route to the other host */
320         struct net_device *tdev;                /* Device to other host */
321         struct iphdr  *old_iph = ip_hdr(skb);
322         u8     tos = old_iph->tos;
323         __be16 df = old_iph->frag_off;
324         sk_buff_data_t old_transport_header = skb->transport_header;
325         struct iphdr  *iph;                     /* Our new IP header */
326         unsigned int max_headroom;              /* The extra header space needed */
327         int    mtu;
328
329         EnterFunction(10);
330
331         if (skb->protocol != htons(ETH_P_IP)) {
332                 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
333                              "ETH_P_IP: %d, skb protocol: %d\n",
334                              htons(ETH_P_IP), skb->protocol);
335                 goto tx_error;
336         }
337
338         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
339                 goto tx_error_icmp;
340
341         tdev = rt->u.dst.dev;
342
343         mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
344         if (mtu < 68) {
345                 ip_rt_put(rt);
346                 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
347                 goto tx_error;
348         }
349         if (skb->dst)
350                 skb->dst->ops->update_pmtu(skb->dst, mtu);
351
352         df |= (old_iph->frag_off & htons(IP_DF));
353
354         if ((old_iph->frag_off & htons(IP_DF))
355             && mtu < ntohs(old_iph->tot_len)) {
356                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
357                 ip_rt_put(rt);
358                 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
359                 goto tx_error;
360         }
361
362         /*
363          * Okay, now see if we can stuff it in the buffer as-is.
364          */
365         max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
366
367         if (skb_headroom(skb) < max_headroom
368             || skb_cloned(skb) || skb_shared(skb)) {
369                 struct sk_buff *new_skb =
370                         skb_realloc_headroom(skb, max_headroom);
371                 if (!new_skb) {
372                         ip_rt_put(rt);
373                         kfree_skb(skb);
374                         IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
375                         return NF_STOLEN;
376                 }
377                 kfree_skb(skb);
378                 skb = new_skb;
379                 old_iph = ip_hdr(skb);
380         }
381
382         skb->transport_header = old_transport_header;
383
384         /* fix old IP header checksum */
385         ip_send_check(old_iph);
386
387         skb_push(skb, sizeof(struct iphdr));
388         skb_reset_network_header(skb);
389         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
390
391         /* drop old route */
392         dst_release(skb->dst);
393         skb->dst = &rt->u.dst;
394
395         /*
396          *      Push down and install the IPIP header.
397          */
398         iph                     =       ip_hdr(skb);
399         iph->version            =       4;
400         iph->ihl                =       sizeof(struct iphdr)>>2;
401         iph->frag_off           =       df;
402         iph->protocol           =       IPPROTO_IPIP;
403         iph->tos                =       tos;
404         iph->daddr              =       rt->rt_dst;
405         iph->saddr              =       rt->rt_src;
406         iph->ttl                =       old_iph->ttl;
407         ip_select_ident(iph, &rt->u.dst, NULL);
408
409         /* Another hack: avoid icmp_send in ip_fragment */
410         skb->local_df = 1;
411
412         ip_local_out(skb);
413
414         LeaveFunction(10);
415
416         return NF_STOLEN;
417
418   tx_error_icmp:
419         dst_link_failure(skb);
420   tx_error:
421         kfree_skb(skb);
422         LeaveFunction(10);
423         return NF_STOLEN;
424 }
425
426
427 /*
428  *      Direct Routing transmitter
429  *      Used for ANY protocol
430  */
431 int
432 ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
433               struct ip_vs_protocol *pp)
434 {
435         struct rtable *rt;                      /* Route to the other host */
436         struct iphdr  *iph = ip_hdr(skb);
437         int    mtu;
438
439         EnterFunction(10);
440
441         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
442                 goto tx_error_icmp;
443
444         /* MTU checking */
445         mtu = dst_mtu(&rt->u.dst);
446         if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
447                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
448                 ip_rt_put(rt);
449                 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
450                 goto tx_error;
451         }
452
453         /*
454          * Call ip_send_check because we are not sure it is called
455          * after ip_defrag. Is copy-on-write needed?
456          */
457         if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
458                 ip_rt_put(rt);
459                 return NF_STOLEN;
460         }
461         ip_send_check(ip_hdr(skb));
462
463         /* drop old route */
464         dst_release(skb->dst);
465         skb->dst = &rt->u.dst;
466
467         /* Another hack: avoid icmp_send in ip_fragment */
468         skb->local_df = 1;
469
470         IP_VS_XMIT(skb, rt);
471
472         LeaveFunction(10);
473         return NF_STOLEN;
474
475   tx_error_icmp:
476         dst_link_failure(skb);
477   tx_error:
478         kfree_skb(skb);
479         LeaveFunction(10);
480         return NF_STOLEN;
481 }
482
483
484 /*
485  *      ICMP packet transmitter
486  *      called by the ip_vs_in_icmp
487  */
488 int
489 ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
490                 struct ip_vs_protocol *pp, int offset)
491 {
492         struct rtable   *rt;    /* Route to the other host */
493         int mtu;
494         int rc;
495
496         EnterFunction(10);
497
498         /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
499            forwarded directly here, because there is no need to
500            translate address/port back */
501         if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
502                 if (cp->packet_xmit)
503                         rc = cp->packet_xmit(skb, cp, pp);
504                 else
505                         rc = NF_ACCEPT;
506                 /* do not touch skb anymore */
507                 atomic_inc(&cp->in_pkts);
508                 goto out;
509         }
510
511         /*
512          * mangle and send the packet here (only for VS/NAT)
513          */
514
515         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
516                 goto tx_error_icmp;
517
518         /* MTU checking */
519         mtu = dst_mtu(&rt->u.dst);
520         if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
521                 ip_rt_put(rt);
522                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
523                 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
524                 goto tx_error;
525         }
526
527         /* copy-on-write the packet before mangling it */
528         if (!skb_make_writable(skb, offset))
529                 goto tx_error_put;
530
531         if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
532                 goto tx_error_put;
533
534         /* drop the old route when skb is not shared */
535         dst_release(skb->dst);
536         skb->dst = &rt->u.dst;
537
538         ip_vs_nat_icmp(skb, pp, cp, 0);
539
540         /* Another hack: avoid icmp_send in ip_fragment */
541         skb->local_df = 1;
542
543         IP_VS_XMIT(skb, rt);
544
545         rc = NF_STOLEN;
546         goto out;
547
548   tx_error_icmp:
549         dst_link_failure(skb);
550   tx_error:
551         dev_kfree_skb(skb);
552         rc = NF_STOLEN;
553   out:
554         LeaveFunction(10);
555         return rc;
556   tx_error_put:
557         ip_rt_put(rt);
558         goto tx_error;
559 }