git.oblomov.eu Git - linux-2.6/blob - net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
  38  *                                      and more readibility.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/mm.h>
  53 #include <linux/string.h>
  54 #include <linux/errno.h>
  55 #include <linux/highmem.h>
  56
  57 #include <linux/socket.h>
  58 #include <linux/sockios.h>
  59 #include <linux/in.h>
  60 #include <linux/inet.h>
  61 #include <linux/netdevice.h>
  62 #include <linux/etherdevice.h>
  63 #include <linux/proc_fs.h>
  64 #include <linux/stat.h>
  65 #include <linux/init.h>
  66
  67 #include <net/snmp.h>
  68 #include <net/ip.h>
  69 #include <net/protocol.h>
  70 #include <net/route.h>
  71 #include <net/xfrm.h>
  72 #include <linux/skbuff.h>
  73 #include <net/sock.h>
  74 #include <net/arp.h>
  75 #include <net/icmp.h>
  76 #include <net/checksum.h>
  77 #include <net/inetpeer.h>
  78 #include <net/checksum.h>
  79 #include <linux/igmp.h>
  80 #include <linux/netfilter_ipv4.h>
  81 #include <linux/netfilter_bridge.h>
  82 #include <linux/mroute.h>
  83 #include <linux/netlink.h>
  84 #include <linux/tcp.h>
  85
  86 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
  87
  88 /* Generate a checksum for an outgoing IP datagram. */
  89 __inline__ void ip_send_check(struct iphdr *iph)
  90 {
  91         iph->check = 0;
  92         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  93 }
  94
  95 /* dev_loopback_xmit for use with netfilter. */
  96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
  97 {
  98         skb_reset_mac_header(newskb);
  99         __skb_pull(newskb, skb_network_offset(newskb));
 100         newskb->pkt_type = PACKET_LOOPBACK;
 101         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 102         BUG_TRAP(newskb->dst);
 103         netif_rx(newskb);
 104         return 0;
 105 }
 106
 107 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 108 {
 109         int ttl = inet->uc_ttl;
 110
 111         if (ttl < 0)
 112                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 113         return ttl;
 114 }
 115
 116 /*
 117  *              Add an ip header to a skbuff and send it out.
 118  *
 119  */
 120 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 121                           __be32 saddr, __be32 daddr, struct ip_options *opt)
 122 {
 123         struct inet_sock *inet = inet_sk(sk);
 124         struct rtable *rt = (struct rtable *)skb->dst;
 125         struct iphdr *iph;
 126
 127         /* Build the IP header. */
 128         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 129         skb_reset_network_header(skb);
 130         iph = ip_hdr(skb);
 131         iph->version  = 4;
 132         iph->ihl      = 5;
 133         iph->tos      = inet->tos;
 134         if (ip_dont_fragment(sk, &rt->u.dst))
 135                 iph->frag_off = htons(IP_DF);
 136         else
 137                 iph->frag_off = 0;
 138         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 139         iph->daddr    = rt->rt_dst;
 140         iph->saddr    = rt->rt_src;
 141         iph->protocol = sk->sk_protocol;
 142         iph->tot_len  = htons(skb->len);
 143         ip_select_ident(iph, &rt->u.dst, sk);
 144
 145         if (opt && opt->optlen) {
 146                 iph->ihl += opt->optlen>>2;
 147                 ip_options_build(skb, opt, daddr, rt, 0);
 148         }
 149         ip_send_check(iph);
 150
 151         skb->priority = sk->sk_priority;
 152
 153         /* Send it out. */
 154         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 155                        dst_output);
 156 }
 157
 158 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 159
 160 static inline int ip_finish_output2(struct sk_buff *skb)
 161 {
 162         struct dst_entry *dst = skb->dst;
 163         struct net_device *dev = dst->dev;
 164         int hh_len = LL_RESERVED_SPACE(dev);
 165
 166         /* Be paranoid, rather than too clever. */
 167         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 168                 struct sk_buff *skb2;
 169
 170                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 171                 if (skb2 == NULL) {
 172                         kfree_skb(skb);
 173                         return -ENOMEM;
 174                 }
 175                 if (skb->sk)
 176                         skb_set_owner_w(skb2, skb->sk);
 177                 kfree_skb(skb);
 178                 skb = skb2;
 179         }
 180
 181         if (dst->hh)
 182                 return neigh_hh_output(dst->hh, skb);
 183         else if (dst->neighbour)
 184                 return dst->neighbour->output(skb);
 185
 186         if (net_ratelimit())
 187                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 188         kfree_skb(skb);
 189         return -EINVAL;
 190 }
 191
 192 static inline int ip_finish_output(struct sk_buff *skb)
 193 {
 194 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 195         /* Policy lookup after SNAT yielded a new policy */
 196         if (skb->dst->xfrm != NULL) {
 197                 IPCB(skb)->flags |= IPSKB_REROUTED;
 198                 return dst_output(skb);
 199         }
 200 #endif
 201         if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
 202                 return ip_fragment(skb, ip_finish_output2);
 203         else
 204                 return ip_finish_output2(skb);
 205 }
 206
 207 int ip_mc_output(struct sk_buff *skb)
 208 {
 209         struct sock *sk = skb->sk;
 210         struct rtable *rt = (struct rtable*)skb->dst;
 211         struct net_device *dev = rt->u.dst.dev;
 212
 213         /*
 214          *      If the indicated interface is up and running, send the packet.
 215          */
 216         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 217
 218         skb->dev = dev;
 219         skb->protocol = htons(ETH_P_IP);
 220
 221         /*
 222          *      Multicasts are looped back for other local users
 223          */
 224
 225         if (rt->rt_flags&RTCF_MULTICAST) {
 226                 if ((!sk || inet_sk(sk)->mc_loop)
 227 #ifdef CONFIG_IP_MROUTE
 228                 /* Small optimization: do not loopback not local frames,
 229                    which returned after forwarding; they will be  dropped
 230                    by ip_mr_input in any case.
 231                    Note, that local frames are looped back to be delivered
 232                    to local recipients.
 233
 234                    This check is duplicated in ip_mr_input at the moment.
 235                  */
 236                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 237 #endif
 238                 ) {
 239                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 240                         if (newskb)
 241                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 242                                         newskb->dev,
 243                                         ip_dev_loopback_xmit);
 244                 }
 245
 246                 /* Multicasts with ttl 0 must not go beyond the host */
 247
 248                 if (ip_hdr(skb)->ttl == 0) {
 249                         kfree_skb(skb);
 250                         return 0;
 251                 }
 252         }
 253
 254         if (rt->rt_flags&RTCF_BROADCAST) {
 255                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 256                 if (newskb)
 257                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 258                                 newskb->dev, ip_dev_loopback_xmit);
 259         }
 260
 261         return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
 262                             ip_finish_output,
 263                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 264 }
 265
 266 int ip_output(struct sk_buff *skb)
 267 {
 268         struct net_device *dev = skb->dst->dev;
 269
 270         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 271
 272         skb->dev = dev;
 273         skb->protocol = htons(ETH_P_IP);
 274
 275         return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 276                             ip_finish_output,
 277                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 278 }
 279
 280 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 281 {
 282         struct sock *sk = skb->sk;
 283         struct inet_sock *inet = inet_sk(sk);
 284         struct ip_options *opt = inet->opt;
 285         struct rtable *rt;
 286         struct iphdr *iph;
 287
 288         /* Skip all of this if the packet is already routed,
 289          * f.e. by something like SCTP.
 290          */
 291         rt = (struct rtable *) skb->dst;
 292         if (rt != NULL)
 293                 goto packet_routed;
 294
 295         /* Make sure we can route this packet. */
 296         rt = (struct rtable *)__sk_dst_check(sk, 0);
 297         if (rt == NULL) {
 298                 __be32 daddr;
 299
 300                 /* Use correct destination address if we have options. */
 301                 daddr = inet->daddr;
 302                 if(opt && opt->srr)
 303                         daddr = opt->faddr;
 304
 305                 {
 306                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 307                                             .nl_u = { .ip4_u =
 308                                                       { .daddr = daddr,
 309                                                         .saddr = inet->saddr,
 310                                                         .tos = RT_CONN_FLAGS(sk) } },
 311                                             .proto = sk->sk_protocol,
 312                                             .uli_u = { .ports =
 313                                                        { .sport = inet->sport,
 314                                                          .dport = inet->dport } } };
 315
 316                         /* If this fails, retransmit mechanism of transport layer will
 317                          * keep trying until route appears or the connection times
 318                          * itself out.
 319                          */
 320                         security_sk_classify_flow(sk, &fl);
 321                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 322                                 goto no_route;
 323                 }
 324                 sk_setup_caps(sk, &rt->u.dst);
 325         }
 326         skb->dst = dst_clone(&rt->u.dst);
 327
 328 packet_routed:
 329         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 330                 goto no_route;
 331
 332         /* OK, we know where to send it, allocate and build IP header. */
 333         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 334         skb_reset_network_header(skb);
 335         iph = ip_hdr(skb);
 336         *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 337         iph->tot_len = htons(skb->len);
 338         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 339                 iph->frag_off = htons(IP_DF);
 340         else
 341                 iph->frag_off = 0;
 342         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 343         iph->protocol = sk->sk_protocol;
 344         iph->saddr    = rt->rt_src;
 345         iph->daddr    = rt->rt_dst;
 346         /* Transport layer set skb->h.foo itself. */
 347
 348         if (opt && opt->optlen) {
 349                 iph->ihl += opt->optlen >> 2;
 350                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 351         }
 352
 353         ip_select_ident_more(iph, &rt->u.dst, sk,
 354                              (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 355
 356         /* Add an IP checksum. */
 357         ip_send_check(iph);
 358
 359         skb->priority = sk->sk_priority;
 360
 361         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 362                        dst_output);
 363
 364 no_route:
 365         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 366         kfree_skb(skb);
 367         return -EHOSTUNREACH;
 368 }
 369
 370
 371 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 372 {
 373         to->pkt_type = from->pkt_type;
 374         to->priority = from->priority;
 375         to->protocol = from->protocol;
 376         dst_release(to->dst);
 377         to->dst = dst_clone(from->dst);
 378         to->dev = from->dev;
 379         to->mark = from->mark;
 380
 381         /* Copy the flags to each fragment. */
 382         IPCB(to)->flags = IPCB(from)->flags;
 383
 384 #ifdef CONFIG_NET_SCHED
 385         to->tc_index = from->tc_index;
 386 #endif
 387         nf_copy(to, from);
 388 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 389         to->ipvs_property = from->ipvs_property;
 390 #endif
 391         skb_copy_secmark(to, from);
 392 }
 393
 394 /*
 395  *      This IP datagram is too large to be sent in one piece.  Break it up into
 396  *      smaller pieces (each of size equal to IP header plus
 397  *      a block of the data of the original IP data part) that will yet fit in a
 398  *      single device frame, and queue such a frame for sending.
 399  */
 400
 401 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 402 {
 403         struct iphdr *iph;
 404         int raw = 0;
 405         int ptr;
 406         struct net_device *dev;
 407         struct sk_buff *skb2;
 408         unsigned int mtu, hlen, left, len, ll_rs, pad;
 409         int offset;
 410         __be16 not_last_frag;
 411         struct rtable *rt = (struct rtable*)skb->dst;
 412         int err = 0;
 413
 414         dev = rt->u.dst.dev;
 415
 416         /*
 417          *      Point into the IP datagram header.
 418          */
 419
 420         iph = ip_hdr(skb);
 421
 422         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 423                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 424                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 425                           htonl(dst_mtu(&rt->u.dst)));
 426                 kfree_skb(skb);
 427                 return -EMSGSIZE;
 428         }
 429
 430         /*
 431          *      Setup starting values.
 432          */
 433
 434         hlen = iph->ihl * 4;
 435         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 436         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 437
 438         /* When frag_list is given, use it. First, check its validity:
 439          * some transformers could create wrong frag_list or break existing
 440          * one, it is not prohibited. In this case fall back to copying.
 441          *
 442          * LATER: this step can be merged to real generation of fragments,
 443          * we can switch to copy when see the first bad fragment.
 444          */
 445         if (skb_shinfo(skb)->frag_list) {
 446                 struct sk_buff *frag;
 447                 int first_len = skb_pagelen(skb);
 448
 449                 if (first_len - hlen > mtu ||
 450                     ((first_len - hlen) & 7) ||
 451                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 452                     skb_cloned(skb))
 453                         goto slow_path;
 454
 455                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 456                         /* Correct geometry. */
 457                         if (frag->len > mtu ||
 458                             ((frag->len & 7) && frag->next) ||
 459                             skb_headroom(frag) < hlen)
 460                             goto slow_path;
 461
 462                         /* Partially cloned skb? */
 463                         if (skb_shared(frag))
 464                                 goto slow_path;
 465
 466                         BUG_ON(frag->sk);
 467                         if (skb->sk) {
 468                                 sock_hold(skb->sk);
 469                                 frag->sk = skb->sk;
 470                                 frag->destructor = sock_wfree;
 471                                 skb->truesize -= frag->truesize;
 472                         }
 473                 }
 474
 475                 /* Everything is OK. Generate! */
 476
 477                 err = 0;
 478                 offset = 0;
 479                 frag = skb_shinfo(skb)->frag_list;
 480                 skb_shinfo(skb)->frag_list = NULL;
 481                 skb->data_len = first_len - skb_headlen(skb);
 482                 skb->len = first_len;
 483                 iph->tot_len = htons(first_len);
 484                 iph->frag_off = htons(IP_MF);
 485                 ip_send_check(iph);
 486
 487                 for (;;) {
 488                         /* Prepare header of the next frame,
 489                          * before previous one went down. */
 490                         if (frag) {
 491                                 frag->ip_summed = CHECKSUM_NONE;
 492                                 skb_reset_transport_header(frag);
 493                                 __skb_push(frag, hlen);
 494                                 skb_reset_network_header(frag);
 495                                 memcpy(skb_network_header(frag), iph, hlen);
 496                                 iph = ip_hdr(frag);
 497                                 iph->tot_len = htons(frag->len);
 498                                 ip_copy_metadata(frag, skb);
 499                                 if (offset == 0)
 500                                         ip_options_fragment(frag);
 501                                 offset += skb->len - hlen;
 502                                 iph->frag_off = htons(offset>>3);
 503                                 if (frag->next != NULL)
 504                                         iph->frag_off |= htons(IP_MF);
 505                                 /* Ready, complete checksum */
 506                                 ip_send_check(iph);
 507                         }
 508
 509                         err = output(skb);
 510
 511                         if (!err)
 512                                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 513                         if (err || !frag)
 514                                 break;
 515
 516                         skb = frag;
 517                         frag = skb->next;
 518                         skb->next = NULL;
 519                 }
 520
 521                 if (err == 0) {
 522                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 523                         return 0;
 524                 }
 525
 526                 while (frag) {
 527                         skb = frag->next;
 528                         kfree_skb(frag);
 529                         frag = skb;
 530                 }
 531                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 532                 return err;
 533         }
 534
 535 slow_path:
 536         left = skb->len - hlen;         /* Space per frame */
 537         ptr = raw + hlen;               /* Where to start from */
 538
 539         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 540          * we need to make room for the encapsulating header
 541          */
 542         pad = nf_bridge_pad(skb);
 543         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
 544         mtu -= pad;
 545
 546         /*
 547          *      Fragment the datagram.
 548          */
 549
 550         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 551         not_last_frag = iph->frag_off & htons(IP_MF);
 552
 553         /*
 554          *      Keep copying data until we run out.
 555          */
 556
 557         while (left > 0) {
 558                 len = left;
 559                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 560                 if (len > mtu)
 561                         len = mtu;
 562                 /* IF: we are not sending upto and including the packet end
 563                    then align the next start on an eight byte boundary */
 564                 if (len < left) {
 565                         len &= ~7;
 566                 }
 567                 /*
 568                  *      Allocate buffer.
 569                  */
 570
 571                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 572                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 573                         err = -ENOMEM;
 574                         goto fail;
 575                 }
 576
 577                 /*
 578                  *      Set up data on packet
 579                  */
 580
 581                 ip_copy_metadata(skb2, skb);
 582                 skb_reserve(skb2, ll_rs);
 583                 skb_put(skb2, len + hlen);
 584                 skb_reset_network_header(skb2);
 585                 skb2->h.raw = skb2->nh.raw + hlen;
 586
 587                 /*
 588                  *      Charge the memory for the fragment to any owner
 589                  *      it might possess
 590                  */
 591
 592                 if (skb->sk)
 593                         skb_set_owner_w(skb2, skb->sk);
 594
 595                 /*
 596                  *      Copy the packet header into the new buffer.
 597                  */
 598
 599                 memcpy(skb_network_header(skb2), skb->data, hlen);
 600
 601                 /*
 602                  *      Copy a block of the IP datagram.
 603                  */
 604                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
 605                         BUG();
 606                 left -= len;
 607
 608                 /*
 609                  *      Fill in the new header fields.
 610                  */
 611                 iph = ip_hdr(skb2);
 612                 iph->frag_off = htons((offset >> 3));
 613
 614                 /* ANK: dirty, but effective trick. Upgrade options only if
 615                  * the segment to be fragmented was THE FIRST (otherwise,
 616                  * options are already fixed) and make it ONCE
 617                  * on the initial skb, so that all the following fragments
 618                  * will inherit fixed options.
 619                  */
 620                 if (offset == 0)
 621                         ip_options_fragment(skb);
 622
 623                 /*
 624                  *      Added AC : If we are fragmenting a fragment that's not the
 625                  *                 last fragment then keep MF on each bit
 626                  */
 627                 if (left > 0 || not_last_frag)
 628                         iph->frag_off |= htons(IP_MF);
 629                 ptr += len;
 630                 offset += len;
 631
 632                 /*
 633                  *      Put this fragment into the sending queue.
 634                  */
 635                 iph->tot_len = htons(len + hlen);
 636
 637                 ip_send_check(iph);
 638
 639                 err = output(skb2);
 640                 if (err)
 641                         goto fail;
 642
 643                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 644         }
 645         kfree_skb(skb);
 646         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 647         return err;
 648
 649 fail:
 650         kfree_skb(skb);
 651         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 652         return err;
 653 }
 654
 655 EXPORT_SYMBOL(ip_fragment);
 656
 657 int
 658 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 659 {
 660         struct iovec *iov = from;
 661
 662         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 663                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 664                         return -EFAULT;
 665         } else {
 666                 __wsum csum = 0;
 667                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 668                         return -EFAULT;
 669                 skb->csum = csum_block_add(skb->csum, csum, odd);
 670         }
 671         return 0;
 672 }
 673
 674 static inline __wsum
 675 csum_page(struct page *page, int offset, int copy)
 676 {
 677         char *kaddr;
 678         __wsum csum;
 679         kaddr = kmap(page);
 680         csum = csum_partial(kaddr + offset, copy, 0);
 681         kunmap(page);
 682         return csum;
 683 }
 684
 685 static inline int ip_ufo_append_data(struct sock *sk,
 686                         int getfrag(void *from, char *to, int offset, int len,
 687                                int odd, struct sk_buff *skb),
 688                         void *from, int length, int hh_len, int fragheaderlen,
 689                         int transhdrlen, int mtu,unsigned int flags)
 690 {
 691         struct sk_buff *skb;
 692         int err;
 693
 694         /* There is support for UDP fragmentation offload by network
 695          * device, so create one single skb packet containing complete
 696          * udp datagram
 697          */
 698         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
 699                 skb = sock_alloc_send_skb(sk,
 700                         hh_len + fragheaderlen + transhdrlen + 20,
 701                         (flags & MSG_DONTWAIT), &err);
 702
 703                 if (skb == NULL)
 704                         return err;
 705
 706                 /* reserve space for Hardware header */
 707                 skb_reserve(skb, hh_len);
 708
 709                 /* create space for UDP/IP header */
 710                 skb_put(skb,fragheaderlen + transhdrlen);
 711
 712                 /* initialize network header pointer */
 713                 skb_reset_network_header(skb);
 714
 715                 /* initialize protocol header pointer */
 716                 skb->h.raw = skb->nh.raw + fragheaderlen;
 717
 718                 skb->ip_summed = CHECKSUM_PARTIAL;
 719                 skb->csum = 0;
 720                 sk->sk_sndmsg_off = 0;
 721         }
 722
 723         err = skb_append_datato_frags(sk,skb, getfrag, from,
 724                                (length - transhdrlen));
 725         if (!err) {
 726                 /* specify the length of each IP datagram fragment*/
 727                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 728                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 729                 __skb_queue_tail(&sk->sk_write_queue, skb);
 730
 731                 return 0;
 732         }
 733         /* There is not enough support do UFO ,
 734          * so follow normal path
 735          */
 736         kfree_skb(skb);
 737         return err;
 738 }
 739
 740 /*
 741  *      ip_append_data() and ip_append_page() can make one large IP datagram
 742  *      from many pieces of data. Each pieces will be holded on the socket
 743  *      until ip_push_pending_frames() is called. Each piece can be a page
 744  *      or non-page data.
 745  *
 746  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 747  *      this interface potentially.
 748  *
 749  *      LATER: length must be adjusted by pad at tail, when it is required.
 750  */
 751 int ip_append_data(struct sock *sk,
 752                    int getfrag(void *from, char *to, int offset, int len,
 753                                int odd, struct sk_buff *skb),
 754                    void *from, int length, int transhdrlen,
 755                    struct ipcm_cookie *ipc, struct rtable *rt,
 756                    unsigned int flags)
 757 {
 758         struct inet_sock *inet = inet_sk(sk);
 759         struct sk_buff *skb;
 760
 761         struct ip_options *opt = NULL;
 762         int hh_len;
 763         int exthdrlen;
 764         int mtu;
 765         int copy;
 766         int err;
 767         int offset = 0;
 768         unsigned int maxfraglen, fragheaderlen;
 769         int csummode = CHECKSUM_NONE;
 770
 771         if (flags&MSG_PROBE)
 772                 return 0;
 773
 774         if (skb_queue_empty(&sk->sk_write_queue)) {
 775                 /*
 776                  * setup for corking.
 777                  */
 778                 opt = ipc->opt;
 779                 if (opt) {
 780                         if (inet->cork.opt == NULL) {
 781                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 782                                 if (unlikely(inet->cork.opt == NULL))
 783                                         return -ENOBUFS;
 784                         }
 785                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 786                         inet->cork.flags |= IPCORK_OPT;
 787                         inet->cork.addr = ipc->addr;
 788                 }
 789                 dst_hold(&rt->u.dst);
 790                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 791                 inet->cork.rt = rt;
 792                 inet->cork.length = 0;
 793                 sk->sk_sndmsg_page = NULL;
 794                 sk->sk_sndmsg_off = 0;
 795                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 796                         length += exthdrlen;
 797                         transhdrlen += exthdrlen;
 798                 }
 799         } else {
 800                 rt = inet->cork.rt;
 801                 if (inet->cork.flags & IPCORK_OPT)
 802                         opt = inet->cork.opt;
 803
 804                 transhdrlen = 0;
 805                 exthdrlen = 0;
 806                 mtu = inet->cork.fragsize;
 807         }
 808         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 809
 810         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 811         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 812
 813         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 814                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 815                 return -EMSGSIZE;
 816         }
 817
 818         /*
 819          * transhdrlen > 0 means that this is the first fragment and we wish
 820          * it won't be fragmented in the future.
 821          */
 822         if (transhdrlen &&
 823             length + fragheaderlen <= mtu &&
 824             rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
 825             !exthdrlen)
 826                 csummode = CHECKSUM_PARTIAL;
 827
 828         inet->cork.length += length;
 829         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
 830                         (rt->u.dst.dev->features & NETIF_F_UFO)) {
 831
 832                 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
 833                                          fragheaderlen, transhdrlen, mtu,
 834                                          flags);
 835                 if (err)
 836                         goto error;
 837                 return 0;
 838         }
 839
 840         /* So, what's going on in the loop below?
 841          *
 842          * We use calculated fragment length to generate chained skb,
 843          * each of segments is IP fragment ready for sending to network after
 844          * adding appropriate IP header.
 845          */
 846
 847         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 848                 goto alloc_new_skb;
 849
 850         while (length > 0) {
 851                 /* Check if the remaining data fits into current packet. */
 852                 copy = mtu - skb->len;
 853                 if (copy < length)
 854                         copy = maxfraglen - skb->len;
 855                 if (copy <= 0) {
 856                         char *data;
 857                         unsigned int datalen;
 858                         unsigned int fraglen;
 859                         unsigned int fraggap;
 860                         unsigned int alloclen;
 861                         struct sk_buff *skb_prev;
 862 alloc_new_skb:
 863                         skb_prev = skb;
 864                         if (skb_prev)
 865                                 fraggap = skb_prev->len - maxfraglen;
 866                         else
 867                                 fraggap = 0;
 868
 869                         /*
 870                          * If remaining data exceeds the mtu,
 871                          * we know we need more fragment(s).
 872                          */
 873                         datalen = length + fraggap;
 874                         if (datalen > mtu - fragheaderlen)
 875                                 datalen = maxfraglen - fragheaderlen;
 876                         fraglen = datalen + fragheaderlen;
 877
 878                         if ((flags & MSG_MORE) &&
 879                             !(rt->u.dst.dev->features&NETIF_F_SG))
 880                                 alloclen = mtu;
 881                         else
 882                                 alloclen = datalen + fragheaderlen;
 883
 884                         /* The last fragment gets additional space at tail.
 885                          * Note, with MSG_MORE we overallocate on fragments,
 886                          * because we have no idea what fragment will be
 887                          * the last.
 888                          */
 889                         if (datalen == length + fraggap)
 890                                 alloclen += rt->u.dst.trailer_len;
 891
 892                         if (transhdrlen) {
 893                                 skb = sock_alloc_send_skb(sk,
 894                                                 alloclen + hh_len + 15,
 895                                                 (flags & MSG_DONTWAIT), &err);
 896                         } else {
 897                                 skb = NULL;
 898                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 899                                     2 * sk->sk_sndbuf)
 900                                         skb = sock_wmalloc(sk,
 901                                                            alloclen + hh_len + 15, 1,
 902                                                            sk->sk_allocation);
 903                                 if (unlikely(skb == NULL))
 904                                         err = -ENOBUFS;
 905                         }
 906                         if (skb == NULL)
 907                                 goto error;
 908
 909                         /*
 910                          *      Fill in the control structures
 911                          */
 912                         skb->ip_summed = csummode;
 913                         skb->csum = 0;
 914                         skb_reserve(skb, hh_len);
 915
 916                         /*
 917                          *      Find where to start putting bytes.
 918                          */
 919                         data = skb_put(skb, fraglen);
 920                         skb_set_network_header(skb, exthdrlen);
 921                         skb->h.raw = skb->nh.raw + fragheaderlen;
 922                         data += fragheaderlen;
 923
 924                         if (fraggap) {
 925                                 skb->csum = skb_copy_and_csum_bits(
 926                                         skb_prev, maxfraglen,
 927                                         data + transhdrlen, fraggap, 0);
 928                                 skb_prev->csum = csum_sub(skb_prev->csum,
 929                                                           skb->csum);
 930                                 data += fraggap;
 931                                 pskb_trim_unique(skb_prev, maxfraglen);
 932                         }
 933
 934                         copy = datalen - transhdrlen - fraggap;
 935                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 936                                 err = -EFAULT;
 937                                 kfree_skb(skb);
 938                                 goto error;
 939                         }
 940
 941                         offset += copy;
 942                         length -= datalen - fraggap;
 943                         transhdrlen = 0;
 944                         exthdrlen = 0;
 945                         csummode = CHECKSUM_NONE;
 946
 947                         /*
 948                          * Put the packet on the pending queue.
 949                          */
 950                         __skb_queue_tail(&sk->sk_write_queue, skb);
 951                         continue;
 952                 }
 953
 954                 if (copy > length)
 955                         copy = length;
 956
 957                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 958                         unsigned int off;
 959
 960                         off = skb->len;
 961                         if (getfrag(from, skb_put(skb, copy),
 962                                         offset, copy, off, skb) < 0) {
 963                                 __skb_trim(skb, off);
 964                                 err = -EFAULT;
 965                                 goto error;
 966                         }
 967                 } else {
 968                         int i = skb_shinfo(skb)->nr_frags;
 969                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 970                         struct page *page = sk->sk_sndmsg_page;
 971                         int off = sk->sk_sndmsg_off;
 972                         unsigned int left;
 973
 974                         if (page && (left = PAGE_SIZE - off) > 0) {
 975                                 if (copy >= left)
 976                                         copy = left;
 977                                 if (page != frag->page) {
 978                                         if (i == MAX_SKB_FRAGS) {
 979                                                 err = -EMSGSIZE;
 980                                                 goto error;
 981                                         }
 982                                         get_page(page);
 983                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
 984                                         frag = &skb_shinfo(skb)->frags[i];
 985                                 }
 986                         } else if (i < MAX_SKB_FRAGS) {
 987                                 if (copy > PAGE_SIZE)
 988                                         copy = PAGE_SIZE;
 989                                 page = alloc_pages(sk->sk_allocation, 0);
 990                                 if (page == NULL)  {
 991                                         err = -ENOMEM;
 992                                         goto error;
 993                                 }
 994                                 sk->sk_sndmsg_page = page;
 995                                 sk->sk_sndmsg_off = 0;
 996
 997                                 skb_fill_page_desc(skb, i, page, 0, 0);
 998                                 frag = &skb_shinfo(skb)->frags[i];
 999                                 skb->truesize += PAGE_SIZE;
1000                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1001                         } else {
1002                                 err = -EMSGSIZE;
1003                                 goto error;
1004                         }
1005                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1006                                 err = -EFAULT;
1007                                 goto error;
1008                         }
1009                         sk->sk_sndmsg_off += copy;
1010                         frag->size += copy;
1011                         skb->len += copy;
1012                         skb->data_len += copy;
1013                 }
1014                 offset += copy;
1015                 length -= copy;
1016         }
1017
1018         return 0;
1019
1020 error:
1021         inet->cork.length -= length;
1022         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1023         return err;
1024 }
1025
1026 ssize_t ip_append_page(struct sock *sk, struct page *page,
1027                        int offset, size_t size, int flags)
1028 {
1029         struct inet_sock *inet = inet_sk(sk);
1030         struct sk_buff *skb;
1031         struct rtable *rt;
1032         struct ip_options *opt = NULL;
1033         int hh_len;
1034         int mtu;
1035         int len;
1036         int err;
1037         unsigned int maxfraglen, fragheaderlen, fraggap;
1038
1039         if (inet->hdrincl)
1040                 return -EPERM;
1041
1042         if (flags&MSG_PROBE)
1043                 return 0;
1044
1045         if (skb_queue_empty(&sk->sk_write_queue))
1046                 return -EINVAL;
1047
1048         rt = inet->cork.rt;
1049         if (inet->cork.flags & IPCORK_OPT)
1050                 opt = inet->cork.opt;
1051
1052         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1053                 return -EOPNOTSUPP;
1054
1055         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1056         mtu = inet->cork.fragsize;
1057
1058         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1059         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1060
1061         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1062                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1063                 return -EMSGSIZE;
1064         }
1065
1066         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1067                 return -EINVAL;
1068
1069         inet->cork.length += size;
1070         if ((sk->sk_protocol == IPPROTO_UDP) &&
1071             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1072                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1073                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1074         }
1075
1076
1077         while (size > 0) {
1078                 int i;
1079
1080                 if (skb_is_gso(skb))
1081                         len = size;
1082                 else {
1083
1084                         /* Check if the remaining data fits into current packet. */
1085                         len = mtu - skb->len;
1086                         if (len < size)
1087                                 len = maxfraglen - skb->len;
1088                 }
1089                 if (len <= 0) {
1090                         struct sk_buff *skb_prev;
1091                         int alloclen;
1092
1093                         skb_prev = skb;
1094                         fraggap = skb_prev->len - maxfraglen;
1095
1096                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1097                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1098                         if (unlikely(!skb)) {
1099                                 err = -ENOBUFS;
1100                                 goto error;
1101                         }
1102
1103                         /*
1104                          *      Fill in the control structures
1105                          */
1106                         skb->ip_summed = CHECKSUM_NONE;
1107                         skb->csum = 0;
1108                         skb_reserve(skb, hh_len);
1109
1110                         /*
1111                          *      Find where to start putting bytes.
1112                          */
1113                         skb_put(skb, fragheaderlen + fraggap);
1114                         skb_reset_network_header(skb);
1115                         skb->h.raw = skb->nh.raw + fragheaderlen;
1116
1117                         if (fraggap) {
1118                                 skb->csum = skb_copy_and_csum_bits(skb_prev,
1119                                                                    maxfraglen,
1120                                                     skb_transport_header(skb),
1121                                                                    fraggap, 0);
1122                                 skb_prev->csum = csum_sub(skb_prev->csum,
1123                                                           skb->csum);
1124                                 pskb_trim_unique(skb_prev, maxfraglen);
1125                         }
1126
1127                         /*
1128                          * Put the packet on the pending queue.
1129                          */
1130                         __skb_queue_tail(&sk->sk_write_queue, skb);
1131                         continue;
1132                 }
1133
1134                 i = skb_shinfo(skb)->nr_frags;
1135                 if (len > size)
1136                         len = size;
1137                 if (skb_can_coalesce(skb, i, page, offset)) {
1138                         skb_shinfo(skb)->frags[i-1].size += len;
1139                 } else if (i < MAX_SKB_FRAGS) {
1140                         get_page(page);
1141                         skb_fill_page_desc(skb, i, page, offset, len);
1142                 } else {
1143                         err = -EMSGSIZE;
1144                         goto error;
1145                 }
1146
1147                 if (skb->ip_summed == CHECKSUM_NONE) {
1148                         __wsum csum;
1149                         csum = csum_page(page, offset, len);
1150                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1151                 }
1152
1153                 skb->len += len;
1154                 skb->data_len += len;
1155                 offset += len;
1156                 size -= len;
1157         }
1158         return 0;
1159
1160 error:
1161         inet->cork.length -= size;
1162         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1163         return err;
1164 }
1165
1166 /*
1167  *      Combined all pending IP fragments on the socket as one IP datagram
1168  *      and push them out.
1169  */
1170 int ip_push_pending_frames(struct sock *sk)
1171 {
1172         struct sk_buff *skb, *tmp_skb;
1173         struct sk_buff **tail_skb;
1174         struct inet_sock *inet = inet_sk(sk);
1175         struct ip_options *opt = NULL;
1176         struct rtable *rt = inet->cork.rt;
1177         struct iphdr *iph;
1178         __be16 df = 0;
1179         __u8 ttl;
1180         int err = 0;
1181
1182         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1183                 goto out;
1184         tail_skb = &(skb_shinfo(skb)->frag_list);
1185
1186         /* move skb->data to ip header from ext header */
1187         if (skb->data < skb_network_header(skb))
1188                 __skb_pull(skb, skb_network_offset(skb));
1189         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1190                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1191                 *tail_skb = tmp_skb;
1192                 tail_skb = &(tmp_skb->next);
1193                 skb->len += tmp_skb->len;
1194                 skb->data_len += tmp_skb->len;
1195                 skb->truesize += tmp_skb->truesize;
1196                 __sock_put(tmp_skb->sk);
1197                 tmp_skb->destructor = NULL;
1198                 tmp_skb->sk = NULL;
1199         }
1200
1201         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1202          * to fragment the frame generated here. No matter, what transforms
1203          * how transforms change size of the packet, it will come out.
1204          */
1205         if (inet->pmtudisc != IP_PMTUDISC_DO)
1206                 skb->local_df = 1;
1207
1208         /* DF bit is set when we want to see DF on outgoing frames.
1209          * If local_df is set too, we still allow to fragment this frame
1210          * locally. */
1211         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1212             (skb->len <= dst_mtu(&rt->u.dst) &&
1213              ip_dont_fragment(sk, &rt->u.dst)))
1214                 df = htons(IP_DF);
1215
1216         if (inet->cork.flags & IPCORK_OPT)
1217                 opt = inet->cork.opt;
1218
1219         if (rt->rt_type == RTN_MULTICAST)
1220                 ttl = inet->mc_ttl;
1221         else
1222                 ttl = ip_select_ttl(inet, &rt->u.dst);
1223
1224         iph = (struct iphdr *)skb->data;
1225         iph->version = 4;
1226         iph->ihl = 5;
1227         if (opt) {
1228                 iph->ihl += opt->optlen>>2;
1229                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1230         }
1231         iph->tos = inet->tos;
1232         iph->tot_len = htons(skb->len);
1233         iph->frag_off = df;
1234         ip_select_ident(iph, &rt->u.dst, sk);
1235         iph->ttl = ttl;
1236         iph->protocol = sk->sk_protocol;
1237         iph->saddr = rt->rt_src;
1238         iph->daddr = rt->rt_dst;
1239         ip_send_check(iph);
1240
1241         skb->priority = sk->sk_priority;
1242         skb->dst = dst_clone(&rt->u.dst);
1243
1244         /* Netfilter gets whole the not fragmented skb. */
1245         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1246                       skb->dst->dev, dst_output);
1247         if (err) {
1248                 if (err > 0)
1249                         err = inet->recverr ? net_xmit_errno(err) : 0;
1250                 if (err)
1251                         goto error;
1252         }
1253
1254 out:
1255         inet->cork.flags &= ~IPCORK_OPT;
1256         kfree(inet->cork.opt);
1257         inet->cork.opt = NULL;
1258         if (inet->cork.rt) {
1259                 ip_rt_put(inet->cork.rt);
1260                 inet->cork.rt = NULL;
1261         }
1262         return err;
1263
1264 error:
1265         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1266         goto out;
1267 }
1268
1269 /*
1270  *      Throw away all pending data on the socket.
1271  */
1272 void ip_flush_pending_frames(struct sock *sk)
1273 {
1274         struct inet_sock *inet = inet_sk(sk);
1275         struct sk_buff *skb;
1276
1277         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1278                 kfree_skb(skb);
1279
1280         inet->cork.flags &= ~IPCORK_OPT;
1281         kfree(inet->cork.opt);
1282         inet->cork.opt = NULL;
1283         if (inet->cork.rt) {
1284                 ip_rt_put(inet->cork.rt);
1285                 inet->cork.rt = NULL;
1286         }
1287 }
1288
1289
1290 /*
1291  *      Fetch data from kernel space and fill in checksum if needed.
1292  */
1293 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1294                               int len, int odd, struct sk_buff *skb)
1295 {
1296         __wsum csum;
1297
1298         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1299         skb->csum = csum_block_add(skb->csum, csum, odd);
1300         return 0;
1301 }
1302
1303 /*
1304  *      Generic function to send a packet as reply to another packet.
1305  *      Used to send TCP resets so far. ICMP should use this function too.
1306  *
1307  *      Should run single threaded per socket because it uses the sock
1308  *      structure to pass arguments.
1309  *
1310  *      LATER: switch from ip_build_xmit to ip_append_*
1311  */
1312 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1313                    unsigned int len)
1314 {
1315         struct inet_sock *inet = inet_sk(sk);
1316         struct {
1317                 struct ip_options       opt;
1318                 char                    data[40];
1319         } replyopts;
1320         struct ipcm_cookie ipc;
1321         __be32 daddr;
1322         struct rtable *rt = (struct rtable*)skb->dst;
1323
1324         if (ip_options_echo(&replyopts.opt, skb))
1325                 return;
1326
1327         daddr = ipc.addr = rt->rt_src;
1328         ipc.opt = NULL;
1329
1330         if (replyopts.opt.optlen) {
1331                 ipc.opt = &replyopts.opt;
1332
1333                 if (ipc.opt->srr)
1334                         daddr = replyopts.opt.faddr;
1335         }
1336
1337         {
1338                 struct flowi fl = { .nl_u = { .ip4_u =
1339                                               { .daddr = daddr,
1340                                                 .saddr = rt->rt_spec_dst,
1341                                                 .tos = RT_TOS(ip_hdr(skb)->tos) } },
1342                                     /* Not quite clean, but right. */
1343                                     .uli_u = { .ports =
1344                                                { .sport = tcp_hdr(skb)->dest,
1345                                                  .dport = tcp_hdr(skb)->source } },
1346                                     .proto = sk->sk_protocol };
1347                 security_skb_classify_flow(skb, &fl);
1348                 if (ip_route_output_key(&rt, &fl))
1349                         return;
1350         }
1351
1352         /* And let IP do all the hard work.
1353
1354            This chunk is not reenterable, hence spinlock.
1355            Note that it uses the fact, that this function is called
1356            with locally disabled BH and that sk cannot be already spinlocked.
1357          */
1358         bh_lock_sock(sk);
1359         inet->tos = ip_hdr(skb)->tos;
1360         sk->sk_priority = skb->priority;
1361         sk->sk_protocol = ip_hdr(skb)->protocol;
1362         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1363                        &ipc, rt, MSG_DONTWAIT);
1364         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1365                 if (arg->csumoffset >= 0)
1366                         *((__sum16 *)skb_transport_header(skb) +
1367                           arg->csumoffset) = csum_fold(csum_add(skb->csum,
1368                                                                 arg->csum));
1369                 skb->ip_summed = CHECKSUM_NONE;
1370                 ip_push_pending_frames(sk);
1371         }
1372
1373         bh_unlock_sock(sk);
1374
1375         ip_rt_put(rt);
1376 }
1377
1378 void __init ip_init(void)
1379 {
1380         ip_rt_init();
1381         inet_initpeers();
1382
1383 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1384         igmp_mc_proc_init();
1385 #endif
1386 }
1387
1388 EXPORT_SYMBOL(ip_generic_getfrag);
1389 EXPORT_SYMBOL(ip_queue_xmit);
1390 EXPORT_SYMBOL(ip_send_check);