/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}
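
/*
 * __ip6_local_out() fills in the IPv6 header's payload length and runs
 * the netfilter LOCAL_OUT hook; ip6_local_out() then hands the packet
 * to dst_output() when the hook verdict allows it.
 */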
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
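
/*
 * Final output step: hand the packet to the neighbour layer, using the
 * cached hardware header when one exists, otherwise the neighbour's
 * output method. Packets without a usable neighbour are counted as
 * OUTNOROUTES and dropped.
 */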
static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!newskb->dst);

	netif_rx(newskb);
	return 0;
}
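
/*
 * Post-routing output. For multicast destinations this loops a clone of
 * the packet back to local listeners via ip6_dev_loopback_xmit() when
 * required, and discards packets whose hop limit is already zero.
 */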
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket(dev_net(dev)) &&
		      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				 skb->len);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}
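
/*
 * ip6_output() is the dst output method for IPv6: oversized packets
 * that are not GSO are handed to ip6_fragment(), everything else goes
 * straight to ip6_output2().
 */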
int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb->dst);

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(skb->dst->dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb->dst),
				 IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = skb->dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems, ND packets are sent through this
 *	routine. It's code duplication, but I really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance critical for us).
 */
int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}
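
/*
 * Deliver a clone of a Router Alert packet to every raw socket that has
 * registered (via ip6_ra_chain) for this alert value. Returns 1 when at
 * least one socket consumed the packet.
 */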
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
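
/*
 * Forward a packet that is not addressed to this host: check forwarding
 * policy, decrement the hop limit, possibly send a redirect or an
 * ICMPv6 error, and re-queue the packet through NF_INET_FORWARD.
 */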
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}
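
/*
 * Find the offset at which a fragment header would have to be inserted,
 * i.e. past the extension headers that belong to the unfragmentable
 * part, and point *nexthdr at the nexthdr byte that will need to be
 * rewritten to NEXTHDR_FRAGMENT.
 */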
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
			(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;

	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {
		switch (**nexthdr) {
		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb->dst->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.  (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);
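
	/*
	 * Fast path: the skb already carries a chain of properly sized
	 * fragments in frag_list, so each of them only needs the
	 * unfragmentable part plus a fragment header prepended. Anything
	 * that does not match this geometry falls through to the slow
	 * path below, which allocates and copies every fragment.
	 */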
	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;

		/*
		 *	Allocate buffer.
		 */
		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
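
/*
 * Helpers for validating a socket's cached route: ip6_rt_check() tells
 * whether a cached rt6key still matches the flow's address, and
 * ip6_sk_dst_check() releases the cached dst once it no longer applies
 * to the flow.
 */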
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account that
	 * we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
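
/*
 * UFO path for ip6_append_data(): instead of fragmenting in software,
 * build one oversized skb and let the device (or the software GSO
 * fallback) segment it. The fragment id is chosen up front so that all
 * resulting fragments share it.
 */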
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
				    int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}
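
/*
 * ip6_append_data() queues data on sk->sk_write_queue for a corked
 * socket, sizing each skb so that it maps to one fragment on the wire.
 * The typical datagram send path is roughly:
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, ...);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);
 *
 * which is, for example, what the UDPv6 sendmsg code does.
 */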
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
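
/*
 * Release the per-socket cork state: the duplicated extension headers,
 * the held dst entry, and the cached flow.
 */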
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}
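
/*
 * Splice all queued skbs into a single packet (chained via frag_list),
 * prepend the extension headers and the IPv6 header saved at cork time,
 * and send the result with ip6_local_out().
 */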
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
			 htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
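
/*
 * Drop whatever is still sitting on the write queue (e.g. after an
 * append error) and release the cork state.
 */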
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}