/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *      See ip_input.c for original log
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case the packet is not accepted by
 *                                      output firewall rules).
 *              Mike McLagan    :       Routing by source
 *              Alexey Kuznetsov:       use new route cache
 *              Andi Kleen:             Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after a year-long coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit paths
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
 *                                      silently drop skb instead of failing with -EPERM.
 *              Detlev Wengorz  :       Copy protocol for fragments.
 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
 *                                      datagrams.
 *              Hirokazu Takahashi:     sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}

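/*
 * Usage sketch (illustrative, not part of the original file): any code
 * that edits an already-built IP header must recompute the checksum
 * before the packet hits the wire. The helper name is hypothetical.
 */
static inline void example_refresh_ttl(struct iphdr *iph, __u8 new_ttl)
{
        iph->ttl = new_ttl;
        ip_send_check(iph);     /* iph->check is now valid again */
}
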
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        skb_reset_mac_header(newskb);
        __skb_pull(newskb, skb_network_offset(newskb));
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);
        netif_rx(newskb);
        return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = dst_metric(dst, RTAX_HOPLIMIT);
        return ttl;
}

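/*
 * Example (assumed semantics): a socket that never called
 * setsockopt(IP_TTL) keeps inet->uc_ttl at -1, so the TTL falls back to
 * the route's hop-limit metric, which normally reflects
 * sysctl_ip_default_ttl:
 *
 *      ttl = ip_select_ttl(inet, &rt->u.dst);  returns e.g. 64
 */
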
/*
 *              Add an IP header to an skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          __be32 saddr, __be32 daddr, struct ip_options *opt)
{
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = (struct rtable *)skb->dst;
        struct iphdr *iph;

        /* Build the IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = inet->tos;
        if (ip_dont_fragment(sk, &rt->u.dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        iph->tot_len  = htons(skb->len);
        ip_select_ident(iph, &rt->u.dst, sk);

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        /* Send it out. */
        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);
}

EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

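/*
 * Illustrative caller sketch (hypothetical helper): a connection-less
 * reply path, such as the one that emits TCP SYN-ACKs, routes the skb
 * first and then lets ip_build_and_send_pkt() prepend and fill in the
 * IP header in one shot.
 */
static inline int example_send_routed_pkt(struct sk_buff *skb, struct sock *sk,
                                          __be32 saddr, __be32 daddr)
{
        /* skb->dst must already hold a valid route at this point */
        return ip_build_and_send_pkt(skb, sk, saddr, daddr, NULL);
}
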
static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct rtable *rt = (struct rtable *)dst;
        struct net_device *dev = dst->dev;
        int hh_len = LL_RESERVED_SPACE(dev);

        if (rt->rt_type == RTN_MULTICAST)
                IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        else if (rt->rt_type == RTN_BROADCAST)
                IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

        if (dst->hh)
                return neigh_hh_output(dst->hh, skb);
        else if (dst->neighbour)
                return dst->neighbour->output(skb);

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
        struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

        return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
               skb->dst->dev->mtu : dst_mtu(skb->dst);
}

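/*
 * Example (assumed numbers): with IP_PMTUDISC_PROBE set on the socket,
 * the device MTU is reported even when the cached path MTU is smaller,
 * so the stack can probe for a larger path MTU:
 *
 *      dev->mtu = 9000, dst_mtu(skb->dst) = 1500
 *      ip_skb_dst_mtu(skb) == 9000     with IP_PMTUDISC_PROBE
 *      ip_skb_dst_mtu(skb) == 1500     otherwise
 */
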
static inline int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb->dst->xfrm != NULL) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(skb);
        }
#endif
        if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
                return ip_fragment(skb, ip_finish_output2);
        else
                return ip_finish_output2(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = (struct rtable*)skb->dst;
        struct net_device *dev = rt->u.dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */
        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that returned after forwarding; they will be dropped
                   by ip_mr_input in any case.
                   Note that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
                ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host */

                if (ip_hdr(skb)->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                newskb->dev, ip_dev_loopback_xmit);
        }

        return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct sk_buff *skb)
{
        struct net_device *dev = skb->dst->dev;

        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;
        struct rtable *rt;
        struct iphdr *iph;

        /* Skip all of this if the packet is already routed,
         * e.g. by something like SCTP.
         */
        rt = (struct rtable *) skb->dst;
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                __be32 daddr;

                /* Use the correct destination address if we have options. */
                daddr = inet->daddr;
                if (opt && opt->srr)
                        daddr = opt->faddr;

                {
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .nl_u = { .ip4_u =
                                                      { .daddr = daddr,
                                                        .saddr = inet->saddr,
                                                        .tos = RT_CONN_FLAGS(sk) } },
                                            .proto = sk->sk_protocol,
                                            .uli_u = { .ports =
                                                       { .sport = inet->sport,
                                                         .dport = inet->dport } } };

                        /* If this fails, the transport layer's retransmission
                         * mechanism will keep trying until a route appears or
                         * the connection times itself out.
                         */
                        security_sk_classify_flow(sk, &fl);
                        if (ip_route_output_flow(&rt, &fl, sk, 0))
                                goto no_route;
                }
                sk_setup_caps(sk, &rt->u.dst);
        }
        skb->dst = dst_clone(&rt->u.dst);

packet_routed:
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        iph->tot_len = htons(skb->len);
        if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = rt->rt_src;
        iph->daddr    = rt->rt_dst;
        /* The transport layer has already set skb->h.foo itself. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->daddr, rt, 0);
        }

        ip_select_ident_more(iph, &rt->u.dst, sk,
                             (skb_shinfo(skb)->gso_segs ?: 1) - 1);

        /* Add an IP checksum. */
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);

no_route:
        IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}

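/*
 * Illustrative caller sketch (hypothetical wrapper): connection-oriented
 * transports hand ip_queue_xmit() an skb whose transport header is
 * already built; TCP, for instance, calls it with ipfragok == 0 so that
 * DF is set whenever path MTU discovery wants it.
 */
static inline int example_transport_xmit(struct sk_buff *skb)
{
        /* skb->sk must point at the owning, connected socket */
        return ip_queue_xmit(skb, 0);
}
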
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;
        to->mark = from->mark;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
        to->ipvs_property = from->ipvs_property;
#endif
        skb_copy_secmark(to, from);
}

/*
 *      This IP datagram is too large to be sent in one piece.  Break it up
 *      into smaller pieces (each consisting of an IP header plus a block of
 *      the original payload) that will still fit in a single device frame,
 *      and queue such a frame for sending.
 */

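/*
 * Worked example (assumed numbers): a 4020-byte datagram (20-byte header
 * plus 4000 bytes of payload) leaving a 1500-byte-MTU device. The data
 * space per fragment is 1500 - 20 = 1480 bytes, already a multiple of 8,
 * so the slow path below emits:
 *
 *      frag 1: offset    0 (frag_off   0), 1480 data bytes, MF set
 *      frag 2: offset 1480 (frag_off 185), 1480 data bytes, MF set
 *      frag 3: offset 2960 (frag_off 370), 1040 data bytes, MF clear
 */
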
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
        struct iphdr *iph;
        int raw = 0;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs, pad;
        int offset;
        __be16 not_last_frag;
        struct rtable *rt = (struct rtable*)skb->dst;
        int err = 0;

        dev = rt->u.dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        iph = ip_hdr(skb);

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(ip_skb_dst_mtu(skb)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /*
         *      Setup starting values.
         */

        hlen = iph->ihl * 4;
        mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

        /* When frag_list is given, use it. First, check its validity:
         * some transformers could create a wrong frag_list or break an
         * existing one; that is not prohibited. In such a case fall back
         * to copying.
         *
         * LATER: this step can be merged into the real generation of
         * fragments; we can switch to copying on the first bad fragment.
         */
        if (skb_shinfo(skb)->frag_list) {
                struct sk_buff *frag;
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                /* Everything is OK. Generate! */

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off = htons(IP_MF);
                ip_send_check(iph);

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), iph, hlen);
                                iph = ip_hdr(frag);
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (!err)
                                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = raw + hlen;               /* Where to start from */

        /* for bridged IP traffic encapsulated inside e.g. a VLAN header,
         * we need to make room for the encapsulating header
         */
        pad = nf_bridge_pad(skb);
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
        mtu -= pad;

        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb_reset_network_header(skb2);
                skb2->transport_header = skb2->network_header + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
                        BUG();
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = ip_hdr(skb2);
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and make it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC: If we are fragmenting a fragment that's not
                 *                the last fragment then keep the MF bit set
                 *                on each fragment.
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;

                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
        }
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

EXPORT_SYMBOL(ip_fragment);

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                __wsum csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}

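/*
 * The getfrag callback contract (as used by ip_append_data below): copy
 * 'len' bytes starting at 'offset' of the caller's source into 'to',
 * folding them into skb->csum when the skb is not offloading checksums.
 * ip_generic_getfrag above handles userspace iovecs; ip_reply_glue_bits
 * further down is the kernel-memory counterpart. A minimal sketch for a
 * flat kernel buffer (hypothetical, no checksum folding, so only valid
 * for CHECKSUM_PARTIAL skbs) might look like:
 *
 *      static int flat_getfrag(void *from, char *to, int offset,
 *                              int len, int odd, struct sk_buff *skb)
 *      {
 *              memcpy(to, (char *)from + offset, len);
 *              return 0;
 *      }
 */
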
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        __wsum csum;
        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)
{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP fragmentation offload, so
         * create one single skb containing the complete UDP datagram.
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);

                if (skb == NULL)
                        return err;

                /* reserve space for the hardware header */
                skb_reserve(skb, hh_len);

                /* create space for the UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize the network header pointer */
                skb_reset_network_header(skb);

                /* initialize the protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;
                sk->sk_sndmsg_off = 0;
        }

        err = skb_append_datato_frags(sk, skb, getfrag, from,
                               (length - transhdrlen));
        if (!err) {
                /* specify the length of each IP datagram fragment */
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                __skb_queue_tail(&sk->sk_write_queue, skb);

                return 0;
        }
        /* UFO cannot be used here after all, so follow the normal path. */
        kfree_skb(skb);
        return err;
}

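/*
 * Worked example (assumed numbers): for a UDP datagram on a 1500-byte
 * path MTU with a plain 20-byte IP header, fragheaderlen is 20, so
 * gso_size is set to 1480. The UFO-capable device (or the software GSO
 * fallback) later slices the single oversized skb into on-wire IP
 * fragments carrying 1480 bytes of payload each, just as ip_fragment()
 * would have produced.
 */
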
/*
 *      ip_append_data() and ip_append_page() can make one large IP datagram
 *      from many pieces of data. Each piece will be held on the socket
 *      until ip_push_pending_frames() is called. Each piece can be a page
 *      or non-page data.
 *
 *      Not only UDP but other transport protocols - e.g. raw sockets - can
 *      potentially use this interface.
 *
 *      LATER: length must be adjusted by pad at tail, when it is required.
 */
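/*
 * Typical corked usage sketch (hypothetical, modelled on a UDP-style
 * sendmsg path; 'rt' and 'ipc' are assumed to have been set up by the
 * caller). Data accumulates on sk->sk_write_queue until the final call
 * without MSG_MORE pushes one datagram:
 *
 *      lock_sock(sk);
 *      err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
 *                           sizeof(struct udphdr), &ipc, rt, msg->msg_flags);
 *      if (!err && !(msg->msg_flags & MSG_MORE))
 *              err = ip_push_pending_frames(sk);
 *      release_sock(sk);
 */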
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable *rt,
                   unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = NULL;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking.
                 */
                opt = ipc->opt;
                if (opt) {
                        if (inet->cork.opt == NULL) {
                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                                if (unlikely(inet->cork.opt == NULL))
                                        return -ENOBUFS;
                        }
                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
                        inet->cork.flags |= IPCORK_OPT;
                        inet->cork.addr = ipc->addr;
                }
                dst_hold(&rt->u.dst);
                inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
                                            rt->u.dst.dev->mtu :
                                            dst_mtu(rt->u.dst.path);
                inet->cork.rt = rt;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                if ((exthdrlen = rt->u.dst.header_len) != 0) {
                        length += exthdrlen;
                        transhdrlen += exthdrlen;
                }
        } else {
                rt = inet->cork.rt;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = inet->cork.opt;

                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }
        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment and we wish
         * it not to be fragmented later.
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
            rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
            !exthdrlen)
                csummode = CHECKSUM_PARTIAL;

        inet->cork.length += length;
        if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
                        (rt->u.dst.dev->features & NETIF_F_UFO)) {

                err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
                                         fragheaderlen, transhdrlen, mtu,
                                         flags);
                if (err)
                        goto error;
                return 0;
        }

        /* So, what is going on in the loop below?
         *
         * We use the calculated fragment length to generate a chained skb;
         * each of its segments is an IP fragment ready for sending to the
         * network once the appropriate IP header has been added.
         */

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into the current packet. */
                copy = mtu - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;
                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If the remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;

                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /* The last fragment gets additional space at the
                         * tail. Note, with MSG_MORE we overallocate on
                         * fragments, because we have no idea which fragment
                         * will be the last.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        data += fragheaderlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        copy = datalen - transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL)  {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        inet->cork.length -= length;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen, fraggap;

        if (inet->hdrincl)
                return -EPERM;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        rt = inet->cork.rt;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->u.dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        inet->cork.length += size;
        if ((sk->sk_protocol == IPPROTO_UDP) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        }


        while (size > 0) {
                int i;

                if (skb_is_gso(skb))
                        len = size;
                else {

                        /* Check if the remaining data fits into the current packet. */
                        len = mtu - skb->len;
                        if (len < size)
                                len = maxfraglen - skb->len;
                }
                if (len <= 0) {
                        struct sk_buff *skb_prev;
                        int alloclen;

                        skb_prev = skb;
                        fraggap = skb_prev->len - maxfraglen;

                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        skb_put(skb, fragheaderlen + fraggap);
                        skb_reset_network_header(skb);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(skb_prev,
                                                                   maxfraglen,
                                                    skb_transport_header(skb),
                                                                   fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        __wsum csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                offset += len;
                size -= len;
        }
        return 0;

error:
        inet->cork.length -= size;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

/*
 *      Combine all pending IP fragments on the socket into one IP datagram
 *      and push it out.
 */
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = inet->cork.rt;
        struct iphdr *iph;
        __be16 df = 0;
        __u8 ttl;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to the ip header from the ext header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
         * allow fragmenting the frame generated here. No matter how
         * transforms change the size of the packet, it will come out.
         */
        if (inet->pmtudisc < IP_PMTUDISC_DO)
                skb->local_df = 1;

        /* The DF bit is set when we want to see DF on outgoing frames.
         * If local_df is set too, we still allow this frame to be
         * fragmented locally. */
        if (inet->pmtudisc >= IP_PMTUDISC_DO ||
            (skb->len <= dst_mtu(&rt->u.dst) &&
             ip_dont_fragment(sk, &rt->u.dst)))
                df = htons(IP_DF);

        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (rt->rt_type == RTN_MULTICAST)
                ttl = inet->mc_ttl;
        else
                ttl = ip_select_ttl(inet, &rt->u.dst);

        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        if (opt) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
        }
        iph->tos = inet->tos;
        iph->tot_len = htons(skb->len);
        iph->frag_off = df;
        ip_select_ident(iph, &rt->u.dst, sk);
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;
        ip_send_check(iph);

        skb->priority = sk->sk_priority;
        skb->dst = dst_clone(&rt->u.dst);

        /* Netfilter gets the whole, not yet fragmented skb. */
        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
                      skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = inet->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        kfree(inet->cork.opt);
        inet->cork.opt = NULL;
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
        return err;

error:
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        goto out;
}

/*
 *      Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
                kfree_skb(skb);

        inet->cork.flags &= ~IPCORK_OPT;
        kfree(inet->cork.opt);
        inet->cork.opt = NULL;
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
}

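/*
 * Error-path sketch (hypothetical): a sendmsg() implementation that
 * fails partway through appending discards everything queued so far
 * rather than pushing a truncated datagram:
 *
 *      err = ip_append_data(sk, getfrag, from, len, thlen, &ipc, rt, flags);
 *      if (err)
 *              ip_flush_pending_frames(sk);
 */
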
/*
 *      Fetch data from kernel space and fill in the checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        __wsum csum;

        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
        return 0;
}

/*
 *      Generic function to send a packet as a reply to another packet.
 *      Used to send TCP resets so far. ICMP should use this function too.
 *
 *      Should run single threaded per socket because it uses the sock
 *      structure to pass arguments.
 *
 *      LATER: switch from ip_build_xmit to ip_append_*
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct {
                struct ip_options       opt;
                char                    data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        __be32 daddr;
        struct rtable *rt = (struct rtable*)skb->dst;

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = NULL;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (ipc.opt->srr)
                        daddr = replyopts.opt.faddr;
        }

        {
                struct flowi fl = { .nl_u = { .ip4_u =
                                              { .daddr = daddr,
                                                .saddr = rt->rt_spec_dst,
                                                .tos = RT_TOS(ip_hdr(skb)->tos) } },
                                    /* Not quite clean, but right. */
                                    .uli_u = { .ports =
                                               { .sport = tcp_hdr(skb)->dest,
                                                 .dport = tcp_hdr(skb)->source } },
                                    .proto = sk->sk_protocol };
                security_skb_classify_flow(skb, &fl);
                if (ip_route_output_key(&rt, &fl))
                        return;
        }

        /* And let IP do all the hard work.

           This chunk is not reentrant, hence the spinlock.
           Note that it relies on the fact that this function is called
           with BHs locally disabled and that sk cannot already be
           spinlocked.
         */
        bh_lock_sock(sk);
        inet->tos = ip_hdr(skb)->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = ip_hdr(skb)->protocol;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((__sum16 *)skb_transport_header(skb) +
                          arg->csumoffset) = csum_fold(csum_add(skb->csum,
                                                                arg->csum));
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk);
        }

        bh_unlock_sock(sk);

        ip_rt_put(rt);
}

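/*
 * Usage note (sketch, not from this file): a reset-style caller prepares
 * a struct ip_reply_arg whose iov points at a pre-built TCP header and
 * whose csum/csumoffset describe where the final checksum gets folded
 * in, then hands it to ip_send_reply() together with the packet being
 * answered. Roughly:
 *
 *      arg.iov[0].iov_base = &rep.th;
 *      arg.iov[0].iov_len  = sizeof(rep.th);
 *      arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 *      ip_send_reply(ctl_sk, skb, &arg, arg.iov[0].iov_len);
 */
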
void __init ip_init(void)
{
        ip_rt_init();
        inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);