Merge nommu branch
[linux-2.6] / net / ipv4 / ipvs / ip_vs_xmit.c
1 /*
2  * ip_vs_xmit.c: various packet transmitters for IPVS
3  *
4  * Version:     $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
5  *
6  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
7  *              Julian Anastasov <ja@ssi.bg>
8  *
9  *              This program is free software; you can redistribute it and/or
10  *              modify it under the terms of the GNU General Public License
11  *              as published by the Free Software Foundation; either version
12  *              2 of the License, or (at your option) any later version.
13  *
14  * Changes:
15  *
16  */
17
18 #include <linux/kernel.h>
19 #include <linux/ip.h>
20 #include <linux/tcp.h>                  /* for tcphdr */
21 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
22 #include <net/udp.h>
23 #include <net/icmp.h>                   /* for icmp_send */
24 #include <net/route.h>                  /* for ip_route_output */
25 #include <linux/netfilter.h>
26 #include <linux/netfilter_ipv4.h>
27
28 #include <net/ip_vs.h>
29
30
31 /*
32  *      Destination cache to speed up outgoing route lookup
33  */
34 static inline void
35 __ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
36 {
37         struct dst_entry *old_dst;
38
39         old_dst = dest->dst_cache;
40         dest->dst_cache = dst;
41         dest->dst_rtos = rtos;
42         dst_release(old_dst);
43 }
44
45 static inline struct dst_entry *
46 __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
47 {
48         struct dst_entry *dst = dest->dst_cache;
49
50         if (!dst)
51                 return NULL;
52         if ((dst->obsolete || rtos != dest->dst_rtos) &&
53             dst->ops->check(dst, cookie) == NULL) {
54                 dest->dst_cache = NULL;
55                 dst_release(dst);
56                 return NULL;
57         }
58         dst_hold(dst);
59         return dst;
60 }
61
/*
 *	Look up (or reuse) the output route to the real server for
 *	connection @cp, keyed on the TOS bits @rtos.
 *
 *	When the connection is bound to a destination (cp->dest), the
 *	per-destination route cache is consulted under dest->dst_lock;
 *	on a miss a fresh route is looked up and a clone is installed in
 *	the cache.  Without a bound destination the route is resolved
 *	directly from cp->daddr and not cached.
 *
 *	Returns a route the caller owns a reference on (caller releases
 *	it, typically by attaching it to the skb), or NULL on lookup
 *	failure.
 */
static inline struct rtable *
__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
{
	struct rtable *rt;			/* Route to the other host */
	struct ip_vs_dest *dest = cp->dest;

	if (dest) {
		spin_lock(&dest->dst_lock);
		if (!(rt = (struct rtable *)
		      __ip_vs_dst_check(dest, rtos, 0))) {
			struct flowi fl = {
				.oif = 0,
				.nl_u = {
					.ip4_u = {
						.daddr = dest->addr,
						.saddr = 0,
						.tos = rtos, } },
			};

			if (ip_route_output_key(&rt, &fl)) {
				spin_unlock(&dest->dst_lock);
				IP_VS_DBG_RL("ip_route_output error, "
					     "dest: %u.%u.%u.%u\n",
					     NIPQUAD(dest->addr));
				return NULL;
			}
			/* cache a clone; rt itself goes back to the caller */
			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
			IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
				  NIPQUAD(dest->addr),
				  atomic_read(&rt->u.dst.__refcnt), rtos);
		}
		spin_unlock(&dest->dst_lock);
	} else {
		/* no bound destination: route to the connection address,
		 * uncached */
		struct flowi fl = {
			.oif = 0,
			.nl_u = {
				.ip4_u = {
					.daddr = cp->daddr,
					.saddr = 0,
					.tos = rtos, } },
		};

		if (ip_route_output_key(&rt, &fl)) {
			IP_VS_DBG_RL("ip_route_output error, dest: "
				     "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
			return NULL;
		}
	}

	return rt;
}
113
114
115 /*
116  *      Release dest->dst_cache before a dest is removed
117  */
118 void
119 ip_vs_dst_reset(struct ip_vs_dest *dest)
120 {
121         struct dst_entry *old_dst;
122
123         old_dst = dest->dst_cache;
124         dest->dst_cache = NULL;
125         dst_release(old_dst);
126 }
127
/*
 *	Hand a fully prepared skb to the local IP output path: mark it
 *	with ipvs_property so IPVS hooks skip it on the way out, reset
 *	the checksum state, and reinject at the LOCAL_OUT netfilter hook
 *	towards the route's device.  Evaluates skb/rt more than once, so
 *	pass plain variables, not expressions with side effects.
 */
#define IP_VS_XMIT(skb, rt)				\
do {							\
	(skb)->ipvs_property = 1;			\
	(skb)->ip_summed = CHECKSUM_NONE;		\
	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL,	\
		(rt)->u.dst.dev, dst_output);		\
} while (0)
135
136
137 /*
138  *      NULL transmitter (do nothing except return NF_ACCEPT)
139  */
140 int
141 ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
142                 struct ip_vs_protocol *pp)
143 {
144         /* we do not touch skb and do not need pskb ptr */
145         return NF_ACCEPT;
146 }
147
148
/*
 *	Bypass transmitter
 *	Let packets bypass the destination when the destination is not
 *	available, it may be only used in transparent cache cluster.
 *
 *	Routes the packet to its own original destination (iph->daddr),
 *	ignoring the connection's real server.  Returns NF_STOLEN on
 *	every path: the skb is either transmitted or freed here.
 */
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp)
{
	struct rtable *rt;			/* Route to the other host */
	struct iphdr  *iph = skb->nh.iph;
	u8     tos = iph->tos;
	int    mtu;
	struct flowi fl = {
		.oif = 0,
		.nl_u = {
			.ip4_u = {
				.daddr = iph->daddr,
				.saddr = 0,
				.tos = RT_TOS(tos), } },
	};

	EnterFunction(10);

	if (ip_route_output_key(&rt, &fl)) {
		IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
			     "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
		goto tx_error_icmp;
	}

	/* MTU checking: reject an oversized DF packet and tell the
	 * sender to fragment */
	mtu = dst_mtu(&rt->u.dst);
	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
		ip_rt_put(rt);
		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
		IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
		goto tx_error;
	}

	/*
	 * Call ip_send_check because we are not sure it is called
	 * after ip_defrag. Is copy-on-write needed?
	 */
	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
		/* skb_share_check freed our reference on failure */
		ip_rt_put(rt);
		return NF_STOLEN;
	}
	ip_send_check(skb->nh.iph);

	/* drop old route; skb now owns the reference on rt */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->local_df = 1;

	IP_VS_XMIT(skb, rt);

	LeaveFunction(10);
	return NF_STOLEN;

 tx_error_icmp:
	dst_link_failure(skb);
 tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}
217
218
/*
 *	NAT transmitter (only for outside-to-inside nat forwarding)
 *	Not used for related ICMP
 *
 *	Rewrites the destination address (and, via the protocol's
 *	dnat_handler, ports/payload) to the real server and reinjects
 *	the packet.  Returns NF_STOLEN on every path: the skb is either
 *	transmitted or freed here.
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	       struct ip_vs_protocol *pp)
{
	struct rtable *rt;		/* Route to the other host */
	int mtu;
	struct iphdr *iph = skb->nh.iph;

	EnterFunction(10);

	/* check if it is a connection of no-client-port: learn the
	 * client port from the first packet that carries one */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		__u16 _pt, *p;
		p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
		goto tx_error_icmp;

	/* MTU checking */
	mtu = dst_mtu(&rt->u.dst);
	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
		ip_rt_put(rt);
		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
		IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
		goto tx_error_put;

	/* make room for the output device's link-layer header */
	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
		goto tx_error_put;

	/* drop old route; from here the skb owns the reference on rt,
	 * so later error paths release it via kfree_skb, not ip_rt_put */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/* mangle the packet: protocol-level DNAT first, then the IP
	 * destination address and header checksum */
	if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
		goto tx_error;
	skb->nh.iph->daddr = cp->daddr;
	ip_send_check(skb->nh.iph);

	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");

	/* FIXME: when application helper enlarges the packet and the length
	   is larger than the MTU of outgoing device, there will be still
	   MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->local_df = 1;

	IP_VS_XMIT(skb, rt);

	LeaveFunction(10);
	return NF_STOLEN;

  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	LeaveFunction(10);
	kfree_skb(skb);
	return NF_STOLEN;
  tx_error_put:
	/* route not yet attached to the skb: drop our reference here */
	ip_rt_put(rt);
	goto tx_error;
}
296
297
/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing the load balancer. This can greatly increase the
 *   scalability of virtual server.
 *
 *   Used for ANY protocol
 *
 *   Returns NF_STOLEN on every path: the skb is either transmitted
 *   or freed here.
 */
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp)
{
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *old_iph = skb->nh.iph;
	u8     tos = old_iph->tos;
	__be16 df = old_iph->frag_off;
	struct iphdr  *iph;			/* Our new IP header */
	int    max_headroom;			/* The extra header space needed */
	int    mtu;

	EnterFunction(10);

	/* only IPv4 payloads can be IPIP-encapsulated here */
	if (skb->protocol != __constant_htons(ETH_P_IP)) {
		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
			     "ETH_P_IP: %d, skb protocol: %d\n",
			     __constant_htons(ETH_P_IP), skb->protocol);
		goto tx_error;
	}

	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
		goto tx_error_icmp;

	tdev = rt->u.dst.dev;

	/* inner MTU = path MTU minus the outer IP header we will add;
	 * 68 is the minimum IPv4 MTU (RFC 791) */
	mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
	if (mtu < 68) {
		ip_rt_put(rt);
		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
		goto tx_error;
	}
	if (skb->dst)
		skb->dst->ops->update_pmtu(skb->dst, mtu);

	/* propagate DF from the inner header to the outer header */
	df |= (old_iph->frag_off&__constant_htons(IP_DF));

	if ((old_iph->frag_off&__constant_htons(IP_DF))
	    && mtu < ntohs(old_iph->tot_len)) {
		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
		ip_rt_put(rt);
		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
		goto tx_error;
	}

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

	if (skb_headroom(skb) < max_headroom
	    || skb_cloned(skb) || skb_shared(skb)) {
		struct sk_buff *new_skb =
			skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			kfree_skb(skb);
			IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
			return NF_STOLEN;
		}
		kfree_skb(skb);
		skb = new_skb;
		/* headers moved with the copy: refresh the pointer */
		old_iph = skb->nh.iph;
	}

	/* the inner IP header becomes the transport header */
	skb->h.raw = (void *) old_iph;

	/* fix old IP header checksum */
	ip_send_check(old_iph);

	skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/* drop old route; skb now owns the reference on rt */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the IPIP header.
	 */
	iph			=	skb->nh.iph;
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr)>>2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_IPIP;
	iph->tos		=	tos;
	iph->daddr		=	rt->rt_dst;
	iph->saddr		=	rt->rt_src;
	iph->ttl		=	old_iph->ttl;
	iph->tot_len		=	htons(skb->len);
	ip_select_ident(iph, &rt->u.dst, NULL);
	ip_send_check(iph);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->local_df = 1;

	IP_VS_XMIT(skb, rt);

	LeaveFunction(10);

	return NF_STOLEN;

  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}
427
428
/*
 *	Direct Routing transmitter
 *	Used for ANY protocol
 *
 *	Forwards the packet unmodified to the real server (VS/DR); the
 *	server is expected to accept the virtual address locally.
 *	Returns NF_STOLEN on every path: the skb is either transmitted
 *	or freed here.
 */
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	      struct ip_vs_protocol *pp)
{
	struct rtable *rt;			/* Route to the other host */
	struct iphdr  *iph = skb->nh.iph;
	int    mtu;

	EnterFunction(10);

	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
		goto tx_error_icmp;

	/* MTU checking */
	mtu = dst_mtu(&rt->u.dst);
	if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
		ip_rt_put(rt);
		IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
		goto tx_error;
	}

	/*
	 * Call ip_send_check because we are not sure it is called
	 * after ip_defrag. Is copy-on-write needed?
	 */
	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
		/* skb_share_check freed our reference on failure */
		ip_rt_put(rt);
		return NF_STOLEN;
	}
	ip_send_check(skb->nh.iph);

	/* drop old route; skb now owns the reference on rt */
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->local_df = 1;

	IP_VS_XMIT(skb, rt);

	LeaveFunction(10);
	return NF_STOLEN;

  tx_error_icmp:
	dst_link_failure(skb);
  tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}
484
485
486 /*
487  *      ICMP packet transmitter
488  *      called by the ip_vs_in_icmp
489  */
490 int
491 ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
492                 struct ip_vs_protocol *pp, int offset)
493 {
494         struct rtable   *rt;    /* Route to the other host */
495         int mtu;
496         int rc;
497
498         EnterFunction(10);
499
500         /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
501            forwarded directly here, because there is no need to
502            translate address/port back */
503         if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
504                 if (cp->packet_xmit)
505                         rc = cp->packet_xmit(skb, cp, pp);
506                 else
507                         rc = NF_ACCEPT;
508                 /* do not touch skb anymore */
509                 atomic_inc(&cp->in_pkts);
510                 goto out;
511         }
512
513         /*
514          * mangle and send the packet here (only for VS/NAT)
515          */
516
517         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
518                 goto tx_error_icmp;
519
520         /* MTU checking */
521         mtu = dst_mtu(&rt->u.dst);
522         if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
523                 ip_rt_put(rt);
524                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
525                 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
526                 goto tx_error;
527         }
528
529         /* copy-on-write the packet before mangling it */
530         if (!ip_vs_make_skb_writable(&skb, offset))
531                 goto tx_error_put;
532
533         if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
534                 goto tx_error_put;
535
536         /* drop the old route when skb is not shared */
537         dst_release(skb->dst);
538         skb->dst = &rt->u.dst;
539
540         ip_vs_nat_icmp(skb, pp, cp, 0);
541
542         /* Another hack: avoid icmp_send in ip_fragment */
543         skb->local_df = 1;
544
545         IP_VS_XMIT(skb, rt);
546
547         rc = NF_STOLEN;
548         goto out;
549
550   tx_error_icmp:
551         dst_link_failure(skb);
552   tx_error:
553         dev_kfree_skb(skb);
554         rc = NF_STOLEN;
555   out:
556         LeaveFunction(10);
557         return rc;
558   tx_error_put:
559         ip_rt_put(rt);
560         goto tx_error;
561 }