/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

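/* Pick the next IPv6 fragmentation ID from a single lock-protected
 * global counter.  Zero is skipped, so callers (e.g. the slow path of
 * ip6_fragment) can use frag_id == 0 to mean "not yet selected".
 */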
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

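/* Final transmission step: use the cached hardware header (hh) when
 * one is available, otherwise resolve via the neighbour entry; with
 * neither, the packet is unroutable and is dropped.
 */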
static inline int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}

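/* Post-routing output: for multicast destinations, loop a copy of the
 * packet back to local listeners when required, then pass the packet
 * through the POST_ROUTING netfilter hook on its way to
 * ip6_output_finish().
 */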
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
					&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip6_dev_loopback_xmit);

			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

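/* Entry point from dst_output(): fragment when the packet exceeds the
 * path MTU and is not being segmented by the device (UFO), or when
 * the route demands fragmentation of every packet (dst_allfrag);
 * otherwise transmit directly.
 */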
int ip6_output(struct sk_buff *skb)
{
	if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
				dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			kfree_skb(skb);
			skb = skb2;
			if (skb == NULL) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				return -ENOBUFS;
			}
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
	if (hlimit < 0)
		hlimit = ipv6_get_hoplimit(dst->dev);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok) {
		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       struct in6_addr *saddr, struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
	skb->nh.ipv6h = hdr;

	*(u32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

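/* Deliver a Router Alert packet to every raw socket registered for
 * this RA selector: all matching sockets but the last get a clone,
 * the last one gets the original skb.  Returns 1 if any socket took
 * the packet, 0 otherwise.
 */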
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We DO NOT do any processing on RA packets, pushing them to
	 *	user level AS IS without any WARRANTY that the application
	 *	will be able to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP, we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb->nh.raw + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
						|IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = skb->nh.ipv6h;

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

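/* Copy per-packet metadata (packet type, priority, routing entry and
 * netfilter state) from the original skb to a newly built fragment.
 */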
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	nf_conntrack_put(to->nfct);
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	nf_conntrack_put_reasm(to->nfct_reasm);
	to->nfct_reasm = from->nfct_reasm;
	nf_conntrack_get_reasm(to->nfct_reasm);
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#endif
}

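/* Find the offset at which a fragment header must be inserted: skip
 * past hop-by-hop, routing and destination options headers, but stop
 * before a destination options header that follows a routing header.
 * On return *nexthdr points at the nexthdr byte that gets patched to
 * NEXTHDR_FRAGMENT.
 */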
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
	unsigned int packet_len = skb->tail - skb->nh.raw;
	int found_rhdr = 0;
	*nexthdr = &skb->nh.ipv6h->nexthdr;

	while (offset + 1 <= packet_len) {
		switch (**nexthdr) {
		case NEXTHDR_HOP:
		case NEXTHDR_ROUTING:
		case NEXTHDR_DEST:
			if (**nexthdr == NEXTHDR_ROUTING)
				found_rhdr = 1;
			if (**nexthdr == NEXTHDR_DEST && found_rhdr)
				return offset;
			offset += ipv6_optlen(exthdr);
			*nexthdr = &exthdr->nexthdr;
			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
			break;
		default:
			return offset;
		}
	}

	return offset;
}

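/* Fragment a packet that exceeds the path MTU.  The fast path turns
 * an existing frag_list into the fragment chain in place; if the
 * geometry does not fit (over-MTU entries, misaligned lengths, shared
 * or cloned skbs), fall back to the slow path, which copies the
 * payload into freshly allocated fragments.
 */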
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb->dst;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	u32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		*prevhdr = NEXTHDR_FRAGMENT;
		memcpy(tmp_hdr, skb->nh.raw, hlen);
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		skb->nh.raw = __skb_push(skb, hlen);
		memcpy(skb->nh.raw, tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				frag->h.raw = frag->data;
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				frag->nh.raw = __skb_push(frag, hlen);
				memcpy(frag->nh.raw, tmp_hdr, hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		frag->nh.raw = frag->data;
		fh = (struct frag_hdr*)(frag->data + hlen);
		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		memcpy(frag->nh.raw, skb->data, hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

		err = output(frag);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}

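/* Look up (or revalidate a cached) route for the flow described by
 * @fl and, if the caller left the source address unspecified, choose
 * one that matches the selected destination and device.
 */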
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		*dst = sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info*)*dst;

			/* Yes, checking route validity in the not-connected
			   case is not very simple. Take into account that
			   we do not support routing by source, TOS,
			   and MSG_DONTROUTE		--ANK (980726)

			   1. If route was host route, check that
			      cached destination is current.
			      If it is network route, we still may
			      check its validity using saved pointer
			      to the last used address: daddr_cache.
			      We do not want to save whole address now,
			      (because main consumer of this service
			       is tcp, which does not have this problem),
			      so that the last trick works only on connected
			      sockets.
			   2. oif also should be the same.
			 */
			if (((rt->rt6i_dst.plen != 128 ||
			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				dst_release(*dst);
				*dst = NULL;
			}
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}

EXPORT_SYMBOL_GPL(ip6_dst_lookup);

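/* UFO (UDP fragmentation offload) variant of append_data: queue the
 * whole datagram as one large skb and record the per-fragment payload
 * size and the fragment ID for the device to use when it segments the
 * packet.
 */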
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb->nh.raw = skb->data;

		/* initialize protocol header pointer */
		skb->h.raw = skb->data + fragheaderlen;

		skb->ip_summed = CHECKSUM_HW;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
						sizeof(struct frag_hdr);
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

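/* Append user data to the socket's pending (corked) write queue.  The
 * first call records routing, options and MTU state in the cork;
 * subsequent calls keep filling skbs sized so that
 * ip6_push_pending_frames() can later send them, fragmenting if
 * necessary.
 */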
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa*/
		}
		dst_hold(&rt->u.dst);
		np->cork.rt = rt;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = np->cork.rt;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
		if (ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					fragheaderlen, transhdrlen, mtu, flags))
			goto error;

		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

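/* Flush the corked write queue: chain the queued skbs onto the first
 * one's frag_list so they form a single packet, prepend extension
 * headers and the IPv6 header, and send the result through the
 * LOCAL_OUT netfilter hook.
 */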
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = np->cork.rt;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb->h.raw - skb->nh.raw);
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));

	*(u32*)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
	else
		hdr->payload_len = 0;
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
	return err;
error:
	goto out;
}

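/* Discard everything queued by ip6_append_data() (e.g. on an error
 * before the frames could be pushed) and reset the cork state.
 */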
void ip6_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	inet->cork.flags &= ~IPCORK_OPT;

	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}