Merge branch 'for-linus' of git://brick.kernel.dk/data/git/linux-2.6-block
[linux-2.6] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation 
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *      Based on linux/net/ipv4/ip_output.c
11  *
12  *      This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *      Changes:
18  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
19  *                              extension headers are implemented.
20  *                              route changes now work.
21  *                              ip6_forward does not confuse sniffers.
22  *                              etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *      Imran Patel     :       frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *                      :       add ip6_append_data and related functions
28  *                              for datagram xmit
29  */
30
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
/*
 * Assign the next IPv6 fragmentation identification to @fhdr.
 *
 * A single global counter is shared by all flows; the spinlock
 * serialises access, and the value 0 is skipped so an on-the-wire
 * ID of zero is never produced.  The ID is stored in network byte
 * order (frag id must be in NBO — see the changelog at file top).
 *
 * NOTE(review): @skb is unused here; apparently kept only so callers
 * share one signature — confirm before changing.
 */
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;	/* wrap around, skipping 0 */
	spin_unlock_bh(&ip6_id_lock);
}
72
/*
 * Final output step: prepend the link-layer header and hand the skb
 * to the neighbour layer.
 *
 * If the dst carries a cached hard header (hh_cache), copy it in
 * front of the packet under hh_lock and use the cached output method;
 * otherwise fall back to the neighbour's output function.  With
 * neither available the packet cannot leave the host: count it as
 * OUTNOROUTES, free it and return -EINVAL.
 */
static inline int ip6_output_finish(struct sk_buff *skb)
{

	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		/* hh_data is kept at the aligned length; copy that many
		 * bytes below skb->data, then expose only hh_len of them. */
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;

}
96
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100         newskb->mac.raw = newskb->data;
101         __skb_pull(newskb, newskb->nh.raw - newskb->data);
102         newskb->pkt_type = PACKET_LOOPBACK;
103         newskb->ip_summed = CHECKSUM_UNNECESSARY;
104         BUG_TRAP(newskb->dst);
105
106         netif_rx(newskb);
107         return 0;
108 }
109
110
/*
 * Second-stage output: send the skb through the POST_ROUTING netfilter
 * hook towards ip6_output_finish.
 *
 * For multicast destinations, additionally loop a clone back to local
 * listeners when the socket permits it (mc_loop) and a local member of
 * the group exists on the device.  A hop limit of 0 on such a packet
 * suppresses the on-the-wire copy entirely — the loopback clone is the
 * only delivery.
 */
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
				&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip6_dev_loopback_xmit);

			/* hop_limit == 0: must not leave the host. */
			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
}
147
148 int ip6_output(struct sk_buff *skb)
149 {
150         if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
151                 return ip6_fragment(skb, ip6_output2);
152         else
153                 return ip6_output2(skb);
154 }
155
/*
 *	xmit an sk_buff (used by TCP)
 *
 *	Pushes the extension headers from @opt (if any) and the IPv6
 *	header onto @skb, then sends it through the LOCAL_OUT netfilter
 *	hook.  If the resulting packet exceeds the path MTU and
 *	@ipfragok is not set, an ICMPV6_PKT_TOOBIG is generated towards
 *	ourselves and -EMSGSIZE returned.  Consumes @skb in all cases.
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			/* The original skb is consumed either way; the
			 * reallocated copy (if any) replaces it. */
			kfree_skb(skb);
			skb = skb2;
			if (skb == NULL) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				return -ENOBUFS;
			}
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

	/*
	 *	Fill in the IPv6 header
	 */

	/* Hop limit: socket setting, else route metric, else device default. */
	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
	if (hlimit < 0)
		hlimit = ipv6_get_hoplimit(dst->dev);

	/* Traffic class: socket setting, else 0. */
	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	/* Version (6), traffic class and flow label in one 32-bit store. */
	*(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok) {
		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
				dst_output);
	}

	/* Over-MTU and fragmentation not allowed: notify ourselves
	 * (e.g. so TCP lowers its MSS) and drop. */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
243
244 /*
245  *      To avoid extra problems ND packets are send through this
246  *      routine. It's code duplication but I really want to avoid
247  *      extra checks since ipv6_build_header is used by TCP (which
248  *      is for us performance critical)
249  */
250
251 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
252                struct in6_addr *saddr, struct in6_addr *daddr,
253                int proto, int len)
254 {
255         struct ipv6_pinfo *np = inet6_sk(sk);
256         struct ipv6hdr *hdr;
257         int totlen;
258
259         skb->protocol = htons(ETH_P_IPV6);
260         skb->dev = dev;
261
262         totlen = len + sizeof(struct ipv6hdr);
263
264         hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
265         skb->nh.ipv6h = hdr;
266
267         *(u32*)hdr = htonl(0x60000000);
268
269         hdr->payload_len = htons(len);
270         hdr->nexthdr = proto;
271         hdr->hop_limit = np->hop_limit;
272
273         ipv6_addr_copy(&hdr->saddr, saddr);
274         ipv6_addr_copy(&hdr->daddr, daddr);
275
276         return 0;
277 }
278
/*
 * Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain whose RA selector matches @sel and whose device
 * binding (if any) matches the incoming interface.
 *
 * Each matching socket except the last receives a clone; the final
 * match consumes @skb itself.  Returns 1 if the packet was delivered
 * (and thus consumed), 0 if no listener matched.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				/* Previous match was not the last: give it
				 * a clone so the original can travel on. */
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
307
/* Tail of the NF_IP6_FORWARD hook: pass the skb to the route's output
 * path. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
312
/*
 * Forward a received IPv6 packet towards its next hop.
 *
 * Performs the forwarding checks (forwarding enabled, XFRM policy,
 * Router Alert delivery, hop limit, redirect generation, source-scope
 * sanity, MTU) and on success decrements the hop limit and queues the
 * packet through the NF_IP6_FORWARD hook.  Consumes @skb on every
 * path.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		/* ptr[2..3] carries the Router Alert value; a registered
		 * listener (ip6_call_ra_chain) consumes the packet. */
		u8 *ptr = skb->nh.raw + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have changed the route. */
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
						|IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow() may have reallocated the header. */
	hdr = skb->nh.ipv6h;

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
427
/*
 * Copy per-packet metadata from @from onto a fragment @to so the
 * fragments are routed, scheduled and conntracked like the original.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Drop whatever dst the fragment carried; take a fresh reference
	 * to the original's. */
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#endif
}
453
/*
 * Find the offset (from the start of the IPv6 header) at which a
 * Fragment header must be inserted: the first extension header that
 * is not part of the unfragmentable chain.  Hop-by-hop, Routing and
 * Destination option headers are walked over; a Destination options
 * header seen after a Routing header already belongs to the
 * fragmentable part and terminates the walk.  On return *nexthdr
 * points at the nexthdr byte that should be patched to
 * NEXTHDR_FRAGMENT.
 */
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
	unsigned int packet_len = skb->tail - skb->nh.raw;
	int found_rhdr = 0;
	*nexthdr = &skb->nh.ipv6h->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
		case NEXTHDR_ROUTING:
		case NEXTHDR_DEST:
			if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
			if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
			offset += ipv6_optlen(exthdr);
			*nexthdr = &exthdr->nexthdr;
			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
			break;
		default :
			return offset;
		}
	}

	return offset;
}
482
483 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
484 {
485         struct net_device *dev;
486         struct sk_buff *frag;
487         struct rt6_info *rt = (struct rt6_info*)skb->dst;
488         struct ipv6hdr *tmp_hdr;
489         struct frag_hdr *fh;
490         unsigned int mtu, hlen, left, len;
491         u32 frag_id = 0;
492         int ptr, offset = 0, err=0;
493         u8 *prevhdr, nexthdr = 0;
494
495         dev = rt->u.dst.dev;
496         hlen = ip6_find_1stfragopt(skb, &prevhdr);
497         nexthdr = *prevhdr;
498
499         mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
500
501         if (skb_shinfo(skb)->frag_list) {
502                 int first_len = skb_pagelen(skb);
503
504                 if (first_len - hlen > mtu ||
505                     ((first_len - hlen) & 7) ||
506                     skb_cloned(skb))
507                         goto slow_path;
508
509                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
510                         /* Correct geometry. */
511                         if (frag->len > mtu ||
512                             ((frag->len & 7) && frag->next) ||
513                             skb_headroom(frag) < hlen)
514                             goto slow_path;
515
516                         /* Partially cloned skb? */
517                         if (skb_shared(frag))
518                                 goto slow_path;
519
520                         BUG_ON(frag->sk);
521                         if (skb->sk) {
522                                 sock_hold(skb->sk);
523                                 frag->sk = skb->sk;
524                                 frag->destructor = sock_wfree;
525                                 skb->truesize -= frag->truesize;
526                         }
527                 }
528
529                 err = 0;
530                 offset = 0;
531                 frag = skb_shinfo(skb)->frag_list;
532                 skb_shinfo(skb)->frag_list = NULL;
533                 /* BUILD HEADER */
534
535                 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
536                 if (!tmp_hdr) {
537                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
538                         return -ENOMEM;
539                 }
540
541                 *prevhdr = NEXTHDR_FRAGMENT;
542                 memcpy(tmp_hdr, skb->nh.raw, hlen);
543                 __skb_pull(skb, hlen);
544                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
545                 skb->nh.raw = __skb_push(skb, hlen);
546                 memcpy(skb->nh.raw, tmp_hdr, hlen);
547
548                 ipv6_select_ident(skb, fh);
549                 fh->nexthdr = nexthdr;
550                 fh->reserved = 0;
551                 fh->frag_off = htons(IP6_MF);
552                 frag_id = fh->identification;
553
554                 first_len = skb_pagelen(skb);
555                 skb->data_len = first_len - skb_headlen(skb);
556                 skb->len = first_len;
557                 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
558  
559
560                 for (;;) {
561                         /* Prepare header of the next frame,
562                          * before previous one went down. */
563                         if (frag) {
564                                 frag->ip_summed = CHECKSUM_NONE;
565                                 frag->h.raw = frag->data;
566                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
567                                 frag->nh.raw = __skb_push(frag, hlen);
568                                 memcpy(frag->nh.raw, tmp_hdr, hlen);
569                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
570                                 fh->nexthdr = nexthdr;
571                                 fh->reserved = 0;
572                                 fh->frag_off = htons(offset);
573                                 if (frag->next != NULL)
574                                         fh->frag_off |= htons(IP6_MF);
575                                 fh->identification = frag_id;
576                                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
577                                 ip6_copy_metadata(frag, skb);
578                         }
579                         
580                         err = output(skb);
581                         if (err || !frag)
582                                 break;
583
584                         skb = frag;
585                         frag = skb->next;
586                         skb->next = NULL;
587                 }
588
589                 if (tmp_hdr)
590                         kfree(tmp_hdr);
591
592                 if (err == 0) {
593                         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
594                         return 0;
595                 }
596
597                 while (frag) {
598                         skb = frag->next;
599                         kfree_skb(frag);
600                         frag = skb;
601                 }
602
603                 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
604                 return err;
605         }
606
607 slow_path:
608         left = skb->len - hlen;         /* Space per frame */
609         ptr = hlen;                     /* Where to start from */
610
611         /*
612          *      Fragment the datagram.
613          */
614
615         *prevhdr = NEXTHDR_FRAGMENT;
616
617         /*
618          *      Keep copying data until we run out.
619          */
620         while(left > 0) {
621                 len = left;
622                 /* IF: it doesn't fit, use 'mtu' - the data space left */
623                 if (len > mtu)
624                         len = mtu;
625                 /* IF: we are not sending upto and including the packet end
626                    then align the next start on an eight byte boundary */
627                 if (len < left) {
628                         len &= ~7;
629                 }
630                 /*
631                  *      Allocate buffer.
632                  */
633
634                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
635                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
636                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
637                         err = -ENOMEM;
638                         goto fail;
639                 }
640
641                 /*
642                  *      Set up data on packet
643                  */
644
645                 ip6_copy_metadata(frag, skb);
646                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
647                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
648                 frag->nh.raw = frag->data;
649                 fh = (struct frag_hdr*)(frag->data + hlen);
650                 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
651
652                 /*
653                  *      Charge the memory for the fragment to any owner
654                  *      it might possess
655                  */
656                 if (skb->sk)
657                         skb_set_owner_w(frag, skb->sk);
658
659                 /*
660                  *      Copy the packet header into the new buffer.
661                  */
662                 memcpy(frag->nh.raw, skb->data, hlen);
663
664                 /*
665                  *      Build fragment header.
666                  */
667                 fh->nexthdr = nexthdr;
668                 fh->reserved = 0;
669                 if (!frag_id) {
670                         ipv6_select_ident(skb, fh);
671                         frag_id = fh->identification;
672                 } else
673                         fh->identification = frag_id;
674
675                 /*
676                  *      Copy a block of the IP datagram.
677                  */
678                 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
679                         BUG();
680                 left -= len;
681
682                 fh->frag_off = htons(offset);
683                 if (left > 0)
684                         fh->frag_off |= htons(IP6_MF);
685                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
686
687                 ptr += len;
688                 offset += len;
689
690                 /*
691                  *      Put this fragment into the sending queue.
692                  */
693
694                 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
695
696                 err = output(frag);
697                 if (err)
698                         goto fail;
699         }
700         kfree_skb(skb);
701         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
702         return err;
703
704 fail:
705         kfree_skb(skb); 
706         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
707         return err;
708 }
709
/*
 * Resolve the dst entry for flow @fl on behalf of @sk.
 *
 * A route cached on the socket is reused when it is still valid for
 * this flow (same host destination or cached daddr, and same oif);
 * otherwise a fresh routing lookup is performed.  A missing source
 * address in the flow is filled in from the chosen route.  On success
 * *dst holds a reference the caller must release; on failure *dst is
 * NULL and a negative errno is returned.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		*dst = sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info*)*dst;

				/* Yes, checking route validity in not connected
				   case is not very simple. Take into account,
				   that we do not support routing by source, TOS,
				   and MSG_DONTROUTE		--ANK (980726)

				   1. If route was host route, check that
				      cached destination is current.
				      If it is network route, we still may
				      check its validity using saved pointer
				      to the last used address: daddr_cache.
				      We do not want to save whole address now,
				      (because main consumer of this service
				       is tcp, which has not this problem),
				      so that the last trick works only on connected
				      sockets.
				   2. oif also should be the same.
				 */

			if (((rt->rt6i_dst.plen != 128 ||
			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				/* Cached route does not fit this flow. */
				dst_release(*dst);
				*dst = NULL;
			}
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		/* No source address supplied: derive one from the route. */
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

		if (err)
			goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}
771
772 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
773         int offset, int len, int odd, struct sk_buff *skb),
774         void *from, int length, int transhdrlen,
775         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
776         struct rt6_info *rt, unsigned int flags)
777 {
778         struct inet_sock *inet = inet_sk(sk);
779         struct ipv6_pinfo *np = inet6_sk(sk);
780         struct sk_buff *skb;
781         unsigned int maxfraglen, fragheaderlen;
782         int exthdrlen;
783         int hh_len;
784         int mtu;
785         int copy;
786         int err;
787         int offset = 0;
788         int csummode = CHECKSUM_NONE;
789
790         if (flags&MSG_PROBE)
791                 return 0;
792         if (skb_queue_empty(&sk->sk_write_queue)) {
793                 /*
794                  * setup for corking
795                  */
796                 if (opt) {
797                         if (np->cork.opt == NULL) {
798                                 np->cork.opt = kmalloc(opt->tot_len,
799                                                        sk->sk_allocation);
800                                 if (unlikely(np->cork.opt == NULL))
801                                         return -ENOBUFS;
802                         } else if (np->cork.opt->tot_len < opt->tot_len) {
803                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
804                                 return -EINVAL;
805                         }
806                         memcpy(np->cork.opt, opt, opt->tot_len);
807                         inet->cork.flags |= IPCORK_OPT;
808                         /* need source address above miyazawa*/
809                 }
810                 dst_hold(&rt->u.dst);
811                 np->cork.rt = rt;
812                 inet->cork.fl = *fl;
813                 np->cork.hop_limit = hlimit;
814                 np->cork.tclass = tclass;
815                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
816                 if (dst_allfrag(rt->u.dst.path))
817                         inet->cork.flags |= IPCORK_ALLFRAG;
818                 inet->cork.length = 0;
819                 sk->sk_sndmsg_page = NULL;
820                 sk->sk_sndmsg_off = 0;
821                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
822                 length += exthdrlen;
823                 transhdrlen += exthdrlen;
824         } else {
825                 rt = np->cork.rt;
826                 fl = &inet->cork.fl;
827                 if (inet->cork.flags & IPCORK_OPT)
828                         opt = np->cork.opt;
829                 transhdrlen = 0;
830                 exthdrlen = 0;
831                 mtu = inet->cork.fragsize;
832         }
833
834         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
835
836         fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
837         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
838
839         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
840                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
841                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
842                         return -EMSGSIZE;
843                 }
844         }
845
846         /*
847          * Let's try using as much space as possible.
848          * Use MTU if total length of the message fits into the MTU.
849          * Otherwise, we need to reserve fragment header and
850          * fragment alignment (= 8-15 octects, in total).
851          *
852          * Note that we may need to "move" the data from the tail of
853          * of the buffer to the new fragment when we split 
854          * the message.
855          *
856          * FIXME: It may be fragmented into multiple chunks 
857          *        at once if non-fragmentable extension headers
858          *        are too large.
859          * --yoshfuji 
860          */
861
862         inet->cork.length += length;
863
864         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
865                 goto alloc_new_skb;
866
867         while (length > 0) {
868                 /* Check if the remaining data fits into current packet. */
869                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
870                 if (copy < length)
871                         copy = maxfraglen - skb->len;
872
873                 if (copy <= 0) {
874                         char *data;
875                         unsigned int datalen;
876                         unsigned int fraglen;
877                         unsigned int fraggap;
878                         unsigned int alloclen;
879                         struct sk_buff *skb_prev;
880 alloc_new_skb:
881                         skb_prev = skb;
882
883                         /* There's no room in the current skb */
884                         if (skb_prev)
885                                 fraggap = skb_prev->len - maxfraglen;
886                         else
887                                 fraggap = 0;
888
889                         /*
890                          * If remaining data exceeds the mtu,
891                          * we know we need more fragment(s).
892                          */
893                         datalen = length + fraggap;
894                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
895                                 datalen = maxfraglen - fragheaderlen;
896
897                         fraglen = datalen + fragheaderlen;
898                         if ((flags & MSG_MORE) &&
899                             !(rt->u.dst.dev->features&NETIF_F_SG))
900                                 alloclen = mtu;
901                         else
902                                 alloclen = datalen + fragheaderlen;
903
904                         /*
905                          * The last fragment gets additional space at tail.
906                          * Note: we overallocate on fragments with MSG_MODE
907                          * because we have no idea if we're the last one.
908                          */
909                         if (datalen == length + fraggap)
910                                 alloclen += rt->u.dst.trailer_len;
911
912                         /*
913                          * We just reserve space for fragment header.
914                          * Note: this may be overallocation if the message 
915                          * (without MSG_MORE) fits into the MTU.
916                          */
917                         alloclen += sizeof(struct frag_hdr);
918
919                         if (transhdrlen) {
920                                 skb = sock_alloc_send_skb(sk,
921                                                 alloclen + hh_len,
922                                                 (flags & MSG_DONTWAIT), &err);
923                         } else {
924                                 skb = NULL;
925                                 if (atomic_read(&sk->sk_wmem_alloc) <=
926                                     2 * sk->sk_sndbuf)
927                                         skb = sock_wmalloc(sk,
928                                                            alloclen + hh_len, 1,
929                                                            sk->sk_allocation);
930                                 if (unlikely(skb == NULL))
931                                         err = -ENOBUFS;
932                         }
933                         if (skb == NULL)
934                                 goto error;
935                         /*
936                          *      Fill in the control structures
937                          */
938                         skb->ip_summed = csummode;
939                         skb->csum = 0;
940                         /* reserve for fragmentation */
941                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
942
943                         /*
944                          *      Find where to start putting bytes
945                          */
946                         data = skb_put(skb, fraglen);
947                         skb->nh.raw = data + exthdrlen;
948                         data += fragheaderlen;
949                         skb->h.raw = data + exthdrlen;
950
951                         if (fraggap) {
952                                 skb->csum = skb_copy_and_csum_bits(
953                                         skb_prev, maxfraglen,
954                                         data + transhdrlen, fraggap, 0);
955                                 skb_prev->csum = csum_sub(skb_prev->csum,
956                                                           skb->csum);
957                                 data += fraggap;
958                                 skb_trim(skb_prev, maxfraglen);
959                         }
960                         copy = datalen - transhdrlen - fraggap;
961                         if (copy < 0) {
962                                 err = -EINVAL;
963                                 kfree_skb(skb);
964                                 goto error;
965                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
966                                 err = -EFAULT;
967                                 kfree_skb(skb);
968                                 goto error;
969                         }
970
971                         offset += copy;
972                         length -= datalen - fraggap;
973                         transhdrlen = 0;
974                         exthdrlen = 0;
975                         csummode = CHECKSUM_NONE;
976
977                         /*
978                          * Put the packet on the pending queue
979                          */
980                         __skb_queue_tail(&sk->sk_write_queue, skb);
981                         continue;
982                 }
983
984                 if (copy > length)
985                         copy = length;
986
987                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
988                         unsigned int off;
989
990                         off = skb->len;
991                         if (getfrag(from, skb_put(skb, copy),
992                                                 offset, copy, off, skb) < 0) {
993                                 __skb_trim(skb, off);
994                                 err = -EFAULT;
995                                 goto error;
996                         }
997                 } else {
998                         int i = skb_shinfo(skb)->nr_frags;
999                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1000                         struct page *page = sk->sk_sndmsg_page;
1001                         int off = sk->sk_sndmsg_off;
1002                         unsigned int left;
1003
1004                         if (page && (left = PAGE_SIZE - off) > 0) {
1005                                 if (copy >= left)
1006                                         copy = left;
1007                                 if (page != frag->page) {
1008                                         if (i == MAX_SKB_FRAGS) {
1009                                                 err = -EMSGSIZE;
1010                                                 goto error;
1011                                         }
1012                                         get_page(page);
1013                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1014                                         frag = &skb_shinfo(skb)->frags[i];
1015                                 }
1016                         } else if(i < MAX_SKB_FRAGS) {
1017                                 if (copy > PAGE_SIZE)
1018                                         copy = PAGE_SIZE;
1019                                 page = alloc_pages(sk->sk_allocation, 0);
1020                                 if (page == NULL) {
1021                                         err = -ENOMEM;
1022                                         goto error;
1023                                 }
1024                                 sk->sk_sndmsg_page = page;
1025                                 sk->sk_sndmsg_off = 0;
1026
1027                                 skb_fill_page_desc(skb, i, page, 0, 0);
1028                                 frag = &skb_shinfo(skb)->frags[i];
1029                                 skb->truesize += PAGE_SIZE;
1030                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1031                         } else {
1032                                 err = -EMSGSIZE;
1033                                 goto error;
1034                         }
1035                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1036                                 err = -EFAULT;
1037                                 goto error;
1038                         }
1039                         sk->sk_sndmsg_off += copy;
1040                         frag->size += copy;
1041                         skb->len += copy;
1042                         skb->data_len += copy;
1043                 }
1044                 offset += copy;
1045                 length -= copy;
1046         }
1047         return 0;
1048 error:
1049         inet->cork.length -= length;
1050         IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1051         return err;
1052 }
1053
1054 int ip6_push_pending_frames(struct sock *sk)
1055 {
1056         struct sk_buff *skb, *tmp_skb;
1057         struct sk_buff **tail_skb;
1058         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1059         struct inet_sock *inet = inet_sk(sk);
1060         struct ipv6_pinfo *np = inet6_sk(sk);
1061         struct ipv6hdr *hdr;
1062         struct ipv6_txoptions *opt = np->cork.opt;
1063         struct rt6_info *rt = np->cork.rt;
1064         struct flowi *fl = &inet->cork.fl;
1065         unsigned char proto = fl->proto;
1066         int err = 0;
1067
1068         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1069                 goto out;
1070         tail_skb = &(skb_shinfo(skb)->frag_list);
1071
1072         /* move skb->data to ip header from ext header */
1073         if (skb->data < skb->nh.raw)
1074                 __skb_pull(skb, skb->nh.raw - skb->data);
1075         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1076                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1077                 *tail_skb = tmp_skb;
1078                 tail_skb = &(tmp_skb->next);
1079                 skb->len += tmp_skb->len;
1080                 skb->data_len += tmp_skb->len;
1081                 skb->truesize += tmp_skb->truesize;
1082                 __sock_put(tmp_skb->sk);
1083                 tmp_skb->destructor = NULL;
1084                 tmp_skb->sk = NULL;
1085         }
1086
1087         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1088         __skb_pull(skb, skb->h.raw - skb->nh.raw);
1089         if (opt && opt->opt_flen)
1090                 ipv6_push_frag_opts(skb, opt, &proto);
1091         if (opt && opt->opt_nflen)
1092                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1093
1094         skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1095         
1096         *(u32*)hdr = fl->fl6_flowlabel |
1097                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1098
1099         if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1100                 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1101         else
1102                 hdr->payload_len = 0;
1103         hdr->hop_limit = np->cork.hop_limit;
1104         hdr->nexthdr = proto;
1105         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1106         ipv6_addr_copy(&hdr->daddr, final_dst);
1107
1108         skb->dst = dst_clone(&rt->u.dst);
1109         IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 
1110         err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1111         if (err) {
1112                 if (err > 0)
1113                         err = np->recverr ? net_xmit_errno(err) : 0;
1114                 if (err)
1115                         goto error;
1116         }
1117
1118 out:
1119         inet->cork.flags &= ~IPCORK_OPT;
1120         if (np->cork.opt) {
1121                 kfree(np->cork.opt);
1122                 np->cork.opt = NULL;
1123         }
1124         if (np->cork.rt) {
1125                 dst_release(&np->cork.rt->u.dst);
1126                 np->cork.rt = NULL;
1127                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1128         }
1129         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1130         return err;
1131 error:
1132         goto out;
1133 }
1134
1135 void ip6_flush_pending_frames(struct sock *sk)
1136 {
1137         struct inet_sock *inet = inet_sk(sk);
1138         struct ipv6_pinfo *np = inet6_sk(sk);
1139         struct sk_buff *skb;
1140
1141         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1142                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1143                 kfree_skb(skb);
1144         }
1145
1146         inet->cork.flags &= ~IPCORK_OPT;
1147
1148         if (np->cork.opt) {
1149                 kfree(np->cork.opt);
1150                 np->cork.opt = NULL;
1151         }
1152         if (np->cork.rt) {
1153                 dst_release(&np->cork.rt->u.dst);
1154                 np->cork.rt = NULL;
1155                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1156         }
1157         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1158 }