Merge branch 'for-linus' of git://oss.sgi.com:8090/xfs/xfs-2.6
[linux-2.6] / net / ipv4 / ipvs / ip_vs_xmit.c
1 /*
2  * ip_vs_xmit.c: various packet transmitters for IPVS
3  *
4  * Version:     $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
5  *
6  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
7  *              Julian Anastasov <ja@ssi.bg>
8  *
9  *              This program is free software; you can redistribute it and/or
10  *              modify it under the terms of the GNU General Public License
11  *              as published by the Free Software Foundation; either version
12  *              2 of the License, or (at your option) any later version.
13  *
14  * Changes:
15  *
16  */
17
18 #include <linux/kernel.h>
19 #include <linux/ip.h>
20 #include <linux/tcp.h>                  /* for tcphdr */
21 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
22 #include <net/udp.h>
23 #include <net/icmp.h>                   /* for icmp_send */
24 #include <net/route.h>                  /* for ip_route_output */
25 #include <linux/netfilter.h>
26 #include <linux/netfilter_ipv4.h>
27
28 #include <net/ip_vs.h>
29
30
31 /*
32  *      Destination cache to speed up outgoing route lookup
33  */
34 static inline void
35 __ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
36 {
37         struct dst_entry *old_dst;
38
39         old_dst = dest->dst_cache;
40         dest->dst_cache = dst;
41         dest->dst_rtos = rtos;
42         dst_release(old_dst);
43 }
44
45 static inline struct dst_entry *
46 __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
47 {
48         struct dst_entry *dst = dest->dst_cache;
49
50         if (!dst)
51                 return NULL;
52         if ((dst->obsolete || rtos != dest->dst_rtos) &&
53             dst->ops->check(dst, cookie) == NULL) {
54                 dest->dst_cache = NULL;
55                 dst_release(dst);
56                 return NULL;
57         }
58         dst_hold(dst);
59         return dst;
60 }
61
62 static inline struct rtable *
63 __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
64 {
65         struct rtable *rt;                      /* Route to the other host */
66         struct ip_vs_dest *dest = cp->dest;
67
68         if (dest) {
69                 spin_lock(&dest->dst_lock);
70                 if (!(rt = (struct rtable *)
71                       __ip_vs_dst_check(dest, rtos, 0))) {
72                         struct flowi fl = {
73                                 .oif = 0,
74                                 .nl_u = {
75                                         .ip4_u = {
76                                                 .daddr = dest->addr,
77                                                 .saddr = 0,
78                                                 .tos = rtos, } },
79                         };
80
81                         if (ip_route_output_key(&rt, &fl)) {
82                                 spin_unlock(&dest->dst_lock);
83                                 IP_VS_DBG_RL("ip_route_output error, "
84                                              "dest: %u.%u.%u.%u\n",
85                                              NIPQUAD(dest->addr));
86                                 return NULL;
87                         }
88                         __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
89                         IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
90                                   NIPQUAD(dest->addr),
91                                   atomic_read(&rt->u.dst.__refcnt), rtos);
92                 }
93                 spin_unlock(&dest->dst_lock);
94         } else {
95                 struct flowi fl = {
96                         .oif = 0,
97                         .nl_u = {
98                                 .ip4_u = {
99                                         .daddr = cp->daddr,
100                                         .saddr = 0,
101                                         .tos = rtos, } },
102                 };
103
104                 if (ip_route_output_key(&rt, &fl)) {
105                         IP_VS_DBG_RL("ip_route_output error, dest: "
106                                      "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
107                         return NULL;
108                 }
109         }
110
111         return rt;
112 }
113
114
115 /*
116  *      Release dest->dst_cache before a dest is removed
117  */
118 void
119 ip_vs_dst_reset(struct ip_vs_dest *dest)
120 {
121         struct dst_entry *old_dst;
122
123         old_dst = dest->dst_cache;
124         dest->dst_cache = NULL;
125         dst_release(old_dst);
126 }
127
/*
 *      Common tail of the transmitters: flag the skb as IPVS property,
 *      call skb_forward_csum() on it and pass it through the
 *      NF_IP_LOCAL_OUT hook towards dst_output() on the device of rt.
 *      Both arguments are expanded more than once, so pass plain
 *      variables only.
 */
#define IP_VS_XMIT(skb, rt)						\
do {									\
	(skb)->ipvs_property = 1;					\
	skb_forward_csum(skb);						\
	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL,			\
		(rt)->u.dst.dev, dst_output);				\
} while (0)
135
136
137 /*
138  *      NULL transmitter (do nothing except return NF_ACCEPT)
139  */
140 int
141 ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
142                 struct ip_vs_protocol *pp)
143 {
144         /* we do not touch skb and do not need pskb ptr */
145         return NF_ACCEPT;
146 }
147
148
149 /*
150  *      Bypass transmitter
151  *      Let packets bypass the destination when the destination is not
152  *      available, it may be only used in transparent cache cluster.
153  */
154 int
155 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
156                   struct ip_vs_protocol *pp)
157 {
158         struct rtable *rt;                      /* Route to the other host */
159         struct iphdr  *iph = ip_hdr(skb);
160         u8     tos = iph->tos;
161         int    mtu;
162         struct flowi fl = {
163                 .oif = 0,
164                 .nl_u = {
165                         .ip4_u = {
166                                 .daddr = iph->daddr,
167                                 .saddr = 0,
168                                 .tos = RT_TOS(tos), } },
169         };
170
171         EnterFunction(10);
172
173         if (ip_route_output_key(&rt, &fl)) {
174                 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
175                              "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
176                 goto tx_error_icmp;
177         }
178
179         /* MTU checking */
180         mtu = dst_mtu(&rt->u.dst);
181         if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
182                 ip_rt_put(rt);
183                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
184                 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
185                 goto tx_error;
186         }
187
188         /*
189          * Call ip_send_check because we are not sure it is called
190          * after ip_defrag. Is copy-on-write needed?
191          */
192         if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
193                 ip_rt_put(rt);
194                 return NF_STOLEN;
195         }
196         ip_send_check(ip_hdr(skb));
197
198         /* drop old route */
199         dst_release(skb->dst);
200         skb->dst = &rt->u.dst;
201
202         /* Another hack: avoid icmp_send in ip_fragment */
203         skb->local_df = 1;
204
205         IP_VS_XMIT(skb, rt);
206
207         LeaveFunction(10);
208         return NF_STOLEN;
209
210  tx_error_icmp:
211         dst_link_failure(skb);
212  tx_error:
213         kfree_skb(skb);
214         LeaveFunction(10);
215         return NF_STOLEN;
216 }
217
218
219 /*
220  *      NAT transmitter (only for outside-to-inside nat forwarding)
221  *      Not used for related ICMP
222  */
223 int
224 ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
225                struct ip_vs_protocol *pp)
226 {
227         struct rtable *rt;              /* Route to the other host */
228         int mtu;
229         struct iphdr *iph = ip_hdr(skb);
230
231         EnterFunction(10);
232
233         /* check if it is a connection of no-client-port */
234         if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
235                 __be16 _pt, *p;
236                 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
237                 if (p == NULL)
238                         goto tx_error;
239                 ip_vs_conn_fill_cport(cp, *p);
240                 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
241         }
242
243         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
244                 goto tx_error_icmp;
245
246         /* MTU checking */
247         mtu = dst_mtu(&rt->u.dst);
248         if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
249                 ip_rt_put(rt);
250                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
251                 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
252                 goto tx_error;
253         }
254
255         /* copy-on-write the packet before mangling it */
256         if (!skb_make_writable(skb, sizeof(struct iphdr)))
257                 goto tx_error_put;
258
259         if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
260                 goto tx_error_put;
261
262         /* drop old route */
263         dst_release(skb->dst);
264         skb->dst = &rt->u.dst;
265
266         /* mangle the packet */
267         if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
268                 goto tx_error;
269         ip_hdr(skb)->daddr = cp->daddr;
270         ip_send_check(ip_hdr(skb));
271
272         IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
273
274         /* FIXME: when application helper enlarges the packet and the length
275            is larger than the MTU of outgoing device, there will be still
276            MTU problem. */
277
278         /* Another hack: avoid icmp_send in ip_fragment */
279         skb->local_df = 1;
280
281         IP_VS_XMIT(skb, rt);
282
283         LeaveFunction(10);
284         return NF_STOLEN;
285
286   tx_error_icmp:
287         dst_link_failure(skb);
288   tx_error:
289         LeaveFunction(10);
290         kfree_skb(skb);
291         return NF_STOLEN;
292   tx_error_put:
293         ip_rt_put(rt);
294         goto tx_error;
295 }
296
297
298 /*
299  *   IP Tunneling transmitter
300  *
301  *   This function encapsulates the packet in a new IP packet, its
302  *   destination will be set to cp->daddr. Most code of this function
303  *   is taken from ipip.c.
304  *
305  *   It is used in VS/TUN cluster. The load balancer selects a real
306  *   server from a cluster based on a scheduling algorithm,
307  *   encapsulates the request packet and forwards it to the selected
308  *   server. For example, all real servers are configured with
309  *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
310  *   the encapsulated packet, it will decapsulate the packet, processe
311  *   the request and return the response packets directly to the client
312  *   without passing the load balancer. This can greatly increase the
313  *   scalability of virtual server.
314  *
315  *   Used for ANY protocol
316  */
317 int
318 ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
319                   struct ip_vs_protocol *pp)
320 {
321         struct rtable *rt;                      /* Route to the other host */
322         struct net_device *tdev;                /* Device to other host */
323         struct iphdr  *old_iph = ip_hdr(skb);
324         u8     tos = old_iph->tos;
325         __be16 df = old_iph->frag_off;
326         sk_buff_data_t old_transport_header = skb->transport_header;
327         struct iphdr  *iph;                     /* Our new IP header */
328         unsigned int max_headroom;              /* The extra header space needed */
329         int    mtu;
330
331         EnterFunction(10);
332
333         if (skb->protocol != htons(ETH_P_IP)) {
334                 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
335                              "ETH_P_IP: %d, skb protocol: %d\n",
336                              htons(ETH_P_IP), skb->protocol);
337                 goto tx_error;
338         }
339
340         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
341                 goto tx_error_icmp;
342
343         tdev = rt->u.dst.dev;
344
345         mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
346         if (mtu < 68) {
347                 ip_rt_put(rt);
348                 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
349                 goto tx_error;
350         }
351         if (skb->dst)
352                 skb->dst->ops->update_pmtu(skb->dst, mtu);
353
354         df |= (old_iph->frag_off & htons(IP_DF));
355
356         if ((old_iph->frag_off & htons(IP_DF))
357             && mtu < ntohs(old_iph->tot_len)) {
358                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
359                 ip_rt_put(rt);
360                 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
361                 goto tx_error;
362         }
363
364         /*
365          * Okay, now see if we can stuff it in the buffer as-is.
366          */
367         max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
368
369         if (skb_headroom(skb) < max_headroom
370             || skb_cloned(skb) || skb_shared(skb)) {
371                 struct sk_buff *new_skb =
372                         skb_realloc_headroom(skb, max_headroom);
373                 if (!new_skb) {
374                         ip_rt_put(rt);
375                         kfree_skb(skb);
376                         IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
377                         return NF_STOLEN;
378                 }
379                 kfree_skb(skb);
380                 skb = new_skb;
381                 old_iph = ip_hdr(skb);
382         }
383
384         skb->transport_header = old_transport_header;
385
386         /* fix old IP header checksum */
387         ip_send_check(old_iph);
388
389         skb_push(skb, sizeof(struct iphdr));
390         skb_reset_network_header(skb);
391         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
392
393         /* drop old route */
394         dst_release(skb->dst);
395         skb->dst = &rt->u.dst;
396
397         /*
398          *      Push down and install the IPIP header.
399          */
400         iph                     =       ip_hdr(skb);
401         iph->version            =       4;
402         iph->ihl                =       sizeof(struct iphdr)>>2;
403         iph->frag_off           =       df;
404         iph->protocol           =       IPPROTO_IPIP;
405         iph->tos                =       tos;
406         iph->daddr              =       rt->rt_dst;
407         iph->saddr              =       rt->rt_src;
408         iph->ttl                =       old_iph->ttl;
409         iph->tot_len            =       htons(skb->len);
410         ip_select_ident(iph, &rt->u.dst, NULL);
411         ip_send_check(iph);
412
413         /* Another hack: avoid icmp_send in ip_fragment */
414         skb->local_df = 1;
415
416         IP_VS_XMIT(skb, rt);
417
418         LeaveFunction(10);
419
420         return NF_STOLEN;
421
422   tx_error_icmp:
423         dst_link_failure(skb);
424   tx_error:
425         kfree_skb(skb);
426         LeaveFunction(10);
427         return NF_STOLEN;
428 }
429
430
431 /*
432  *      Direct Routing transmitter
433  *      Used for ANY protocol
434  */
435 int
436 ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
437               struct ip_vs_protocol *pp)
438 {
439         struct rtable *rt;                      /* Route to the other host */
440         struct iphdr  *iph = ip_hdr(skb);
441         int    mtu;
442
443         EnterFunction(10);
444
445         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
446                 goto tx_error_icmp;
447
448         /* MTU checking */
449         mtu = dst_mtu(&rt->u.dst);
450         if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
451                 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
452                 ip_rt_put(rt);
453                 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
454                 goto tx_error;
455         }
456
457         /*
458          * Call ip_send_check because we are not sure it is called
459          * after ip_defrag. Is copy-on-write needed?
460          */
461         if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
462                 ip_rt_put(rt);
463                 return NF_STOLEN;
464         }
465         ip_send_check(ip_hdr(skb));
466
467         /* drop old route */
468         dst_release(skb->dst);
469         skb->dst = &rt->u.dst;
470
471         /* Another hack: avoid icmp_send in ip_fragment */
472         skb->local_df = 1;
473
474         IP_VS_XMIT(skb, rt);
475
476         LeaveFunction(10);
477         return NF_STOLEN;
478
479   tx_error_icmp:
480         dst_link_failure(skb);
481   tx_error:
482         kfree_skb(skb);
483         LeaveFunction(10);
484         return NF_STOLEN;
485 }
486
487
488 /*
489  *      ICMP packet transmitter
490  *      called by the ip_vs_in_icmp
491  */
492 int
493 ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
494                 struct ip_vs_protocol *pp, int offset)
495 {
496         struct rtable   *rt;    /* Route to the other host */
497         int mtu;
498         int rc;
499
500         EnterFunction(10);
501
502         /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
503            forwarded directly here, because there is no need to
504            translate address/port back */
505         if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
506                 if (cp->packet_xmit)
507                         rc = cp->packet_xmit(skb, cp, pp);
508                 else
509                         rc = NF_ACCEPT;
510                 /* do not touch skb anymore */
511                 atomic_inc(&cp->in_pkts);
512                 goto out;
513         }
514
515         /*
516          * mangle and send the packet here (only for VS/NAT)
517          */
518
519         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
520                 goto tx_error_icmp;
521
522         /* MTU checking */
523         mtu = dst_mtu(&rt->u.dst);
524         if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
525                 ip_rt_put(rt);
526                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
527                 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
528                 goto tx_error;
529         }
530
531         /* copy-on-write the packet before mangling it */
532         if (!skb_make_writable(skb, offset))
533                 goto tx_error_put;
534
535         if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
536                 goto tx_error_put;
537
538         /* drop the old route when skb is not shared */
539         dst_release(skb->dst);
540         skb->dst = &rt->u.dst;
541
542         ip_vs_nat_icmp(skb, pp, cp, 0);
543
544         /* Another hack: avoid icmp_send in ip_fragment */
545         skb->local_df = 1;
546
547         IP_VS_XMIT(skb, rt);
548
549         rc = NF_STOLEN;
550         goto out;
551
552   tx_error_icmp:
553         dst_link_failure(skb);
554   tx_error:
555         dev_kfree_skb(skb);
556         rc = NF_STOLEN;
557   out:
558         LeaveFunction(10);
559         return rc;
560   tx_error_put:
561         ip_rt_put(rt);
562         goto tx_error;
563 }