Auto-update from upstream
[linux-2.6] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation 
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *      Based on linux/net/ipv4/ip_output.c
11  *
12  *      This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *      Changes:
18  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
19  *                              extension headers are implemented.
20  *                              route changes now work.
21  *                              ip6_forward does not confuse sniffers.
22  *                              etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *      Imran Patel     :       frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *                      :       add ip6_append_data and related functions
28  *                              for datagram xmit
29  */
30
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 {
63         static u32 ipv6_fragmentation_id = 1;
64         static DEFINE_SPINLOCK(ip6_id_lock);
65
66         spin_lock_bh(&ip6_id_lock);
67         fhdr->identification = htonl(ipv6_fragmentation_id);
68         if (++ipv6_fragmentation_id == 0)
69                 ipv6_fragmentation_id = 1;
70         spin_unlock_bh(&ip6_id_lock);
71 }
72
73 static inline int ip6_output_finish(struct sk_buff *skb)
74 {
75
76         struct dst_entry *dst = skb->dst;
77         struct hh_cache *hh = dst->hh;
78
79         if (hh) {
80                 int hh_alen;
81
82                 read_lock_bh(&hh->hh_lock);
83                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
84                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85                 read_unlock_bh(&hh->hh_lock);
86                 skb_push(skb, hh->hh_len);
87                 return hh->hh_output(skb);
88         } else if (dst->neighbour)
89                 return dst->neighbour->output(skb);
90
91         IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92         kfree_skb(skb);
93         return -EINVAL;
94
95 }
96
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100         newskb->mac.raw = newskb->data;
101         __skb_pull(newskb, newskb->nh.raw - newskb->data);
102         newskb->pkt_type = PACKET_LOOPBACK;
103         newskb->ip_summed = CHECKSUM_UNNECESSARY;
104         BUG_TRAP(newskb->dst);
105
106         netif_rx(newskb);
107         return 0;
108 }
109
110
111 static int ip6_output2(struct sk_buff *skb)
112 {
113         struct dst_entry *dst = skb->dst;
114         struct net_device *dev = dst->dev;
115
116         skb->protocol = htons(ETH_P_IPV6);
117         skb->dev = dev;
118
119         if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121
122                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123                     ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124                                 &skb->nh.ipv6h->saddr)) {
125                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126
127                         /* Do not check for IFF_ALLMULTI; multicast routing
128                            is not supported in any case.
129                          */
130                         if (newskb)
131                                 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132                                         newskb->dev,
133                                         ip6_dev_loopback_xmit);
134
135                         if (skb->nh.ipv6h->hop_limit == 0) {
136                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137                                 kfree_skb(skb);
138                                 return 0;
139                         }
140                 }
141
142                 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143         }
144
145         return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
146 }
147
148 int ip6_output(struct sk_buff *skb)
149 {
150         if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
151                                 dst_allfrag(skb->dst))
152                 return ip6_fragment(skb, ip6_output2);
153         else
154                 return ip6_output2(skb);
155 }
156
157 /*
158  *      xmit an sk_buff (used by TCP)
159  */
160
161 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
162              struct ipv6_txoptions *opt, int ipfragok)
163 {
164         struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
165         struct in6_addr *first_hop = &fl->fl6_dst;
166         struct dst_entry *dst = skb->dst;
167         struct ipv6hdr *hdr;
168         u8  proto = fl->proto;
169         int seg_len = skb->len;
170         int hlimit, tclass;
171         u32 mtu;
172
173         if (opt) {
174                 int head_room;
175
176                 /* First: exthdrs may take lots of space (~8K for now)
177                    MAX_HEADER is not enough.
178                  */
179                 head_room = opt->opt_nflen + opt->opt_flen;
180                 seg_len += head_room;
181                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
182
183                 if (skb_headroom(skb) < head_room) {
184                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
185                         kfree_skb(skb);
186                         skb = skb2;
187                         if (skb == NULL) {      
188                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
189                                 return -ENOBUFS;
190                         }
191                         if (sk)
192                                 skb_set_owner_w(skb, sk);
193                 }
194                 if (opt->opt_flen)
195                         ipv6_push_frag_opts(skb, opt, &proto);
196                 if (opt->opt_nflen)
197                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
198         }
199
200         hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
201
202         /*
203          *      Fill in the IPv6 header
204          */
205
206         hlimit = -1;
207         if (np)
208                 hlimit = np->hop_limit;
209         if (hlimit < 0)
210                 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
211         if (hlimit < 0)
212                 hlimit = ipv6_get_hoplimit(dst->dev);
213
214         tclass = -1;
215         if (np)
216                 tclass = np->tclass;
217         if (tclass < 0)
218                 tclass = 0;
219
220         *(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
221
222         hdr->payload_len = htons(seg_len);
223         hdr->nexthdr = proto;
224         hdr->hop_limit = hlimit;
225
226         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
227         ipv6_addr_copy(&hdr->daddr, first_hop);
228
229         mtu = dst_mtu(dst);
230         if ((skb->len <= mtu) || ipfragok) {
231                 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
232                 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
233                                 dst_output);
234         }
235
236         if (net_ratelimit())
237                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
238         skb->dev = dst->dev;
239         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
240         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
241         kfree_skb(skb);
242         return -EMSGSIZE;
243 }
244
245 /*
246  *      To avoid extra problems ND packets are send through this
247  *      routine. It's code duplication but I really want to avoid
248  *      extra checks since ipv6_build_header is used by TCP (which
249  *      is for us performance critical)
250  */
251
252 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
253                struct in6_addr *saddr, struct in6_addr *daddr,
254                int proto, int len)
255 {
256         struct ipv6_pinfo *np = inet6_sk(sk);
257         struct ipv6hdr *hdr;
258         int totlen;
259
260         skb->protocol = htons(ETH_P_IPV6);
261         skb->dev = dev;
262
263         totlen = len + sizeof(struct ipv6hdr);
264
265         hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
266         skb->nh.ipv6h = hdr;
267
268         *(u32*)hdr = htonl(0x60000000);
269
270         hdr->payload_len = htons(len);
271         hdr->nexthdr = proto;
272         hdr->hop_limit = np->hop_limit;
273
274         ipv6_addr_copy(&hdr->saddr, saddr);
275         ipv6_addr_copy(&hdr->daddr, daddr);
276
277         return 0;
278 }
279
280 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
281 {
282         struct ip6_ra_chain *ra;
283         struct sock *last = NULL;
284
285         read_lock(&ip6_ra_lock);
286         for (ra = ip6_ra_chain; ra; ra = ra->next) {
287                 struct sock *sk = ra->sk;
288                 if (sk && ra->sel == sel &&
289                     (!sk->sk_bound_dev_if ||
290                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
291                         if (last) {
292                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
293                                 if (skb2)
294                                         rawv6_rcv(last, skb2);
295                         }
296                         last = sk;
297                 }
298         }
299
300         if (last) {
301                 rawv6_rcv(last, skb);
302                 read_unlock(&ip6_ra_lock);
303                 return 1;
304         }
305         read_unlock(&ip6_ra_lock);
306         return 0;
307 }
308
309 static inline int ip6_forward_finish(struct sk_buff *skb)
310 {
311         return dst_output(skb);
312 }
313
314 int ip6_forward(struct sk_buff *skb)
315 {
316         struct dst_entry *dst = skb->dst;
317         struct ipv6hdr *hdr = skb->nh.ipv6h;
318         struct inet6_skb_parm *opt = IP6CB(skb);
319         
320         if (ipv6_devconf.forwarding == 0)
321                 goto error;
322
323         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
324                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
325                 goto drop;
326         }
327
328         skb->ip_summed = CHECKSUM_NONE;
329
330         /*
331          *      We DO NOT make any processing on
332          *      RA packets, pushing them to user level AS IS
333          *      without ane WARRANTY that application will be able
334          *      to interpret them. The reason is that we
335          *      cannot make anything clever here.
336          *
337          *      We are not end-node, so that if packet contains
338          *      AH/ESP, we cannot make anything.
339          *      Defragmentation also would be mistake, RA packets
340          *      cannot be fragmented, because there is no warranty
341          *      that different fragments will go along one path. --ANK
342          */
343         if (opt->ra) {
344                 u8 *ptr = skb->nh.raw + opt->ra;
345                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
346                         return 0;
347         }
348
349         /*
350          *      check and decrement ttl
351          */
352         if (hdr->hop_limit <= 1) {
353                 /* Force OUTPUT device used as source address */
354                 skb->dev = dst->dev;
355                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
356                             0, skb->dev);
357
358                 kfree_skb(skb);
359                 return -ETIMEDOUT;
360         }
361
362         if (!xfrm6_route_forward(skb)) {
363                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
364                 goto drop;
365         }
366         dst = skb->dst;
367
368         /* IPv6 specs say nothing about it, but it is clear that we cannot
369            send redirects to source routed frames.
370          */
371         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
372                 struct in6_addr *target = NULL;
373                 struct rt6_info *rt;
374                 struct neighbour *n = dst->neighbour;
375
376                 /*
377                  *      incoming and outgoing devices are the same
378                  *      send a redirect.
379                  */
380
381                 rt = (struct rt6_info *) dst;
382                 if ((rt->rt6i_flags & RTF_GATEWAY))
383                         target = (struct in6_addr*)&n->primary_key;
384                 else
385                         target = &hdr->daddr;
386
387                 /* Limit redirects both by destination (here)
388                    and by source (inside ndisc_send_redirect)
389                  */
390                 if (xrlim_allow(dst, 1*HZ))
391                         ndisc_send_redirect(skb, n, target);
392         } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
393                                                 |IPV6_ADDR_LINKLOCAL)) {
394                 /* This check is security critical. */
395                 goto error;
396         }
397
398         if (skb->len > dst_mtu(dst)) {
399                 /* Again, force OUTPUT device used as source address */
400                 skb->dev = dst->dev;
401                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
402                 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
403                 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
404                 kfree_skb(skb);
405                 return -EMSGSIZE;
406         }
407
408         if (skb_cow(skb, dst->dev->hard_header_len)) {
409                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
410                 goto drop;
411         }
412
413         hdr = skb->nh.ipv6h;
414
415         /* Mangling hops number delayed to point after skb COW */
416  
417         hdr->hop_limit--;
418
419         IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
420         return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
421
422 error:
423         IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
424 drop:
425         kfree_skb(skb);
426         return -EINVAL;
427 }
428
429 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
430 {
431         to->pkt_type = from->pkt_type;
432         to->priority = from->priority;
433         to->protocol = from->protocol;
434         dst_release(to->dst);
435         to->dst = dst_clone(from->dst);
436         to->dev = from->dev;
437
438 #ifdef CONFIG_NET_SCHED
439         to->tc_index = from->tc_index;
440 #endif
441 #ifdef CONFIG_NETFILTER
442         to->nfmark = from->nfmark;
443         /* Connection association is same as pre-frag packet */
444         nf_conntrack_put(to->nfct);
445         to->nfct = from->nfct;
446         nf_conntrack_get(to->nfct);
447         to->nfctinfo = from->nfctinfo;
448 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
449         nf_conntrack_put_reasm(to->nfct_reasm);
450         to->nfct_reasm = from->nfct_reasm;
451         nf_conntrack_get_reasm(to->nfct_reasm);
452 #endif
453 #ifdef CONFIG_BRIDGE_NETFILTER
454         nf_bridge_put(to->nf_bridge);
455         to->nf_bridge = from->nf_bridge;
456         nf_bridge_get(to->nf_bridge);
457 #endif
458 #endif
459 }
460
461 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
462 {
463         u16 offset = sizeof(struct ipv6hdr);
464         struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
465         unsigned int packet_len = skb->tail - skb->nh.raw;
466         int found_rhdr = 0;
467         *nexthdr = &skb->nh.ipv6h->nexthdr;
468
469         while (offset + 1 <= packet_len) {
470
471                 switch (**nexthdr) {
472
473                 case NEXTHDR_HOP:
474                 case NEXTHDR_ROUTING:
475                 case NEXTHDR_DEST:
476                         if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
477                         if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
478                         offset += ipv6_optlen(exthdr);
479                         *nexthdr = &exthdr->nexthdr;
480                         exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
481                         break;
482                 default :
483                         return offset;
484                 }
485         }
486
487         return offset;
488 }
489
490 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
491 {
492         struct net_device *dev;
493         struct sk_buff *frag;
494         struct rt6_info *rt = (struct rt6_info*)skb->dst;
495         struct ipv6hdr *tmp_hdr;
496         struct frag_hdr *fh;
497         unsigned int mtu, hlen, left, len;
498         u32 frag_id = 0;
499         int ptr, offset = 0, err=0;
500         u8 *prevhdr, nexthdr = 0;
501
502         dev = rt->u.dst.dev;
503         hlen = ip6_find_1stfragopt(skb, &prevhdr);
504         nexthdr = *prevhdr;
505
506         mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
507
508         if (skb_shinfo(skb)->frag_list) {
509                 int first_len = skb_pagelen(skb);
510
511                 if (first_len - hlen > mtu ||
512                     ((first_len - hlen) & 7) ||
513                     skb_cloned(skb))
514                         goto slow_path;
515
516                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
517                         /* Correct geometry. */
518                         if (frag->len > mtu ||
519                             ((frag->len & 7) && frag->next) ||
520                             skb_headroom(frag) < hlen)
521                             goto slow_path;
522
523                         /* Partially cloned skb? */
524                         if (skb_shared(frag))
525                                 goto slow_path;
526
527                         BUG_ON(frag->sk);
528                         if (skb->sk) {
529                                 sock_hold(skb->sk);
530                                 frag->sk = skb->sk;
531                                 frag->destructor = sock_wfree;
532                                 skb->truesize -= frag->truesize;
533                         }
534                 }
535
536                 err = 0;
537                 offset = 0;
538                 frag = skb_shinfo(skb)->frag_list;
539                 skb_shinfo(skb)->frag_list = NULL;
540                 /* BUILD HEADER */
541
542                 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
543                 if (!tmp_hdr) {
544                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
545                         return -ENOMEM;
546                 }
547
548                 *prevhdr = NEXTHDR_FRAGMENT;
549                 memcpy(tmp_hdr, skb->nh.raw, hlen);
550                 __skb_pull(skb, hlen);
551                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
552                 skb->nh.raw = __skb_push(skb, hlen);
553                 memcpy(skb->nh.raw, tmp_hdr, hlen);
554
555                 ipv6_select_ident(skb, fh);
556                 fh->nexthdr = nexthdr;
557                 fh->reserved = 0;
558                 fh->frag_off = htons(IP6_MF);
559                 frag_id = fh->identification;
560
561                 first_len = skb_pagelen(skb);
562                 skb->data_len = first_len - skb_headlen(skb);
563                 skb->len = first_len;
564                 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
565  
566
567                 for (;;) {
568                         /* Prepare header of the next frame,
569                          * before previous one went down. */
570                         if (frag) {
571                                 frag->ip_summed = CHECKSUM_NONE;
572                                 frag->h.raw = frag->data;
573                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
574                                 frag->nh.raw = __skb_push(frag, hlen);
575                                 memcpy(frag->nh.raw, tmp_hdr, hlen);
576                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
577                                 fh->nexthdr = nexthdr;
578                                 fh->reserved = 0;
579                                 fh->frag_off = htons(offset);
580                                 if (frag->next != NULL)
581                                         fh->frag_off |= htons(IP6_MF);
582                                 fh->identification = frag_id;
583                                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
584                                 ip6_copy_metadata(frag, skb);
585                         }
586                         
587                         err = output(skb);
588                         if (err || !frag)
589                                 break;
590
591                         skb = frag;
592                         frag = skb->next;
593                         skb->next = NULL;
594                 }
595
596                 kfree(tmp_hdr);
597
598                 if (err == 0) {
599                         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
600                         return 0;
601                 }
602
603                 while (frag) {
604                         skb = frag->next;
605                         kfree_skb(frag);
606                         frag = skb;
607                 }
608
609                 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
610                 return err;
611         }
612
613 slow_path:
614         left = skb->len - hlen;         /* Space per frame */
615         ptr = hlen;                     /* Where to start from */
616
617         /*
618          *      Fragment the datagram.
619          */
620
621         *prevhdr = NEXTHDR_FRAGMENT;
622
623         /*
624          *      Keep copying data until we run out.
625          */
626         while(left > 0) {
627                 len = left;
628                 /* IF: it doesn't fit, use 'mtu' - the data space left */
629                 if (len > mtu)
630                         len = mtu;
631                 /* IF: we are not sending upto and including the packet end
632                    then align the next start on an eight byte boundary */
633                 if (len < left) {
634                         len &= ~7;
635                 }
636                 /*
637                  *      Allocate buffer.
638                  */
639
640                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
641                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
642                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
643                         err = -ENOMEM;
644                         goto fail;
645                 }
646
647                 /*
648                  *      Set up data on packet
649                  */
650
651                 ip6_copy_metadata(frag, skb);
652                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
653                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
654                 frag->nh.raw = frag->data;
655                 fh = (struct frag_hdr*)(frag->data + hlen);
656                 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
657
658                 /*
659                  *      Charge the memory for the fragment to any owner
660                  *      it might possess
661                  */
662                 if (skb->sk)
663                         skb_set_owner_w(frag, skb->sk);
664
665                 /*
666                  *      Copy the packet header into the new buffer.
667                  */
668                 memcpy(frag->nh.raw, skb->data, hlen);
669
670                 /*
671                  *      Build fragment header.
672                  */
673                 fh->nexthdr = nexthdr;
674                 fh->reserved = 0;
675                 if (!frag_id) {
676                         ipv6_select_ident(skb, fh);
677                         frag_id = fh->identification;
678                 } else
679                         fh->identification = frag_id;
680
681                 /*
682                  *      Copy a block of the IP datagram.
683                  */
684                 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
685                         BUG();
686                 left -= len;
687
688                 fh->frag_off = htons(offset);
689                 if (left > 0)
690                         fh->frag_off |= htons(IP6_MF);
691                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
692
693                 ptr += len;
694                 offset += len;
695
696                 /*
697                  *      Put this fragment into the sending queue.
698                  */
699
700                 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
701
702                 err = output(frag);
703                 if (err)
704                         goto fail;
705         }
706         kfree_skb(skb);
707         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
708         return err;
709
710 fail:
711         kfree_skb(skb); 
712         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
713         return err;
714 }
715
716 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
717 {
718         int err = 0;
719
720         *dst = NULL;
721         if (sk) {
722                 struct ipv6_pinfo *np = inet6_sk(sk);
723         
724                 *dst = sk_dst_check(sk, np->dst_cookie);
725                 if (*dst) {
726                         struct rt6_info *rt = (struct rt6_info*)*dst;
727         
728                                 /* Yes, checking route validity in not connected
729                                    case is not very simple. Take into account,
730                                    that we do not support routing by source, TOS,
731                                    and MSG_DONTROUTE            --ANK (980726)
732         
733                                    1. If route was host route, check that
734                                       cached destination is current.
735                                       If it is network route, we still may
736                                       check its validity using saved pointer
737                                       to the last used address: daddr_cache.
738                                       We do not want to save whole address now,
739                                       (because main consumer of this service
740                                        is tcp, which has not this problem),
741                                       so that the last trick works only on connected
742                                       sockets.
743                                    2. oif also should be the same.
744                                  */
745         
746                         if (((rt->rt6i_dst.plen != 128 ||
747                               !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
748                              && (np->daddr_cache == NULL ||
749                                  !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
750                             || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
751                                 dst_release(*dst);
752                                 *dst = NULL;
753                         }
754                 }
755         }
756
757         if (*dst == NULL)
758                 *dst = ip6_route_output(sk, fl);
759
760         if ((err = (*dst)->error))
761                 goto out_err_release;
762
763         if (ipv6_addr_any(&fl->fl6_src)) {
764                 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
765
766                 if (err)
767                         goto out_err_release;
768         }
769
770         return 0;
771
772 out_err_release:
773         dst_release(*dst);
774         *dst = NULL;
775         return err;
776 }
777
778 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
779
780 static inline int ip6_ufo_append_data(struct sock *sk,
781                         int getfrag(void *from, char *to, int offset, int len,
782                         int odd, struct sk_buff *skb),
783                         void *from, int length, int hh_len, int fragheaderlen,
784                         int transhdrlen, int mtu,unsigned int flags)
785
786 {
787         struct sk_buff *skb;
788         int err;
789
790         /* There is support for UDP large send offload by network
791          * device, so create one single skb packet containing complete
792          * udp datagram
793          */
794         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
795                 skb = sock_alloc_send_skb(sk,
796                         hh_len + fragheaderlen + transhdrlen + 20,
797                         (flags & MSG_DONTWAIT), &err);
798                 if (skb == NULL)
799                         return -ENOMEM;
800
801                 /* reserve space for Hardware header */
802                 skb_reserve(skb, hh_len);
803
804                 /* create space for UDP/IP header */
805                 skb_put(skb,fragheaderlen + transhdrlen);
806
807                 /* initialize network header pointer */
808                 skb->nh.raw = skb->data;
809
810                 /* initialize protocol header pointer */
811                 skb->h.raw = skb->data + fragheaderlen;
812
813                 skb->ip_summed = CHECKSUM_HW;
814                 skb->csum = 0;
815                 sk->sk_sndmsg_off = 0;
816         }
817
818         err = skb_append_datato_frags(sk,skb, getfrag, from,
819                                       (length - transhdrlen));
820         if (!err) {
821                 struct frag_hdr fhdr;
822
823                 /* specify the length of each IP datagram fragment*/
824                 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) - 
825                                                 sizeof(struct frag_hdr);
826                 ipv6_select_ident(skb, &fhdr);
827                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
828                 __skb_queue_tail(&sk->sk_write_queue, skb);
829
830                 return 0;
831         }
832         /* There is not enough support do UPD LSO,
833          * so follow normal path
834          */
835         kfree_skb(skb);
836
837         return err;
838 }
839
840 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
841         int offset, int len, int odd, struct sk_buff *skb),
842         void *from, int length, int transhdrlen,
843         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
844         struct rt6_info *rt, unsigned int flags)
845 {
846         struct inet_sock *inet = inet_sk(sk);
847         struct ipv6_pinfo *np = inet6_sk(sk);
848         struct sk_buff *skb;
849         unsigned int maxfraglen, fragheaderlen;
850         int exthdrlen;
851         int hh_len;
852         int mtu;
853         int copy;
854         int err;
855         int offset = 0;
856         int csummode = CHECKSUM_NONE;
857
858         if (flags&MSG_PROBE)
859                 return 0;
860         if (skb_queue_empty(&sk->sk_write_queue)) {
861                 /*
862                  * setup for corking
863                  */
864                 if (opt) {
865                         if (np->cork.opt == NULL) {
866                                 np->cork.opt = kmalloc(opt->tot_len,
867                                                        sk->sk_allocation);
868                                 if (unlikely(np->cork.opt == NULL))
869                                         return -ENOBUFS;
870                         } else if (np->cork.opt->tot_len < opt->tot_len) {
871                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
872                                 return -EINVAL;
873                         }
874                         memcpy(np->cork.opt, opt, opt->tot_len);
875                         inet->cork.flags |= IPCORK_OPT;
876                         /* need source address above miyazawa*/
877                 }
878                 dst_hold(&rt->u.dst);
879                 np->cork.rt = rt;
880                 inet->cork.fl = *fl;
881                 np->cork.hop_limit = hlimit;
882                 np->cork.tclass = tclass;
883                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
884                 if (dst_allfrag(rt->u.dst.path))
885                         inet->cork.flags |= IPCORK_ALLFRAG;
886                 inet->cork.length = 0;
887                 sk->sk_sndmsg_page = NULL;
888                 sk->sk_sndmsg_off = 0;
889                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
890                 length += exthdrlen;
891                 transhdrlen += exthdrlen;
892         } else {
893                 rt = np->cork.rt;
894                 fl = &inet->cork.fl;
895                 if (inet->cork.flags & IPCORK_OPT)
896                         opt = np->cork.opt;
897                 transhdrlen = 0;
898                 exthdrlen = 0;
899                 mtu = inet->cork.fragsize;
900         }
901
902         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
903
904         fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
905         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
906
907         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
908                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
909                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
910                         return -EMSGSIZE;
911                 }
912         }
913
914         /*
915          * Let's try using as much space as possible.
916          * Use MTU if total length of the message fits into the MTU.
917          * Otherwise, we need to reserve fragment header and
918          * fragment alignment (= 8-15 octects, in total).
919          *
920          * Note that we may need to "move" the data from the tail of
921          * of the buffer to the new fragment when we split 
922          * the message.
923          *
924          * FIXME: It may be fragmented into multiple chunks 
925          *        at once if non-fragmentable extension headers
926          *        are too large.
927          * --yoshfuji 
928          */
929
930         inet->cork.length += length;
931         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
932             (rt->u.dst.dev->features & NETIF_F_UFO)) {
933
934                 if(ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
935                                 fragheaderlen, transhdrlen, mtu, flags))
936                         goto error;
937
938                 return 0;
939         }
940
941         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
942                 goto alloc_new_skb;
943
944         while (length > 0) {
945                 /* Check if the remaining data fits into current packet. */
946                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
947                 if (copy < length)
948                         copy = maxfraglen - skb->len;
949
950                 if (copy <= 0) {
951                         char *data;
952                         unsigned int datalen;
953                         unsigned int fraglen;
954                         unsigned int fraggap;
955                         unsigned int alloclen;
956                         struct sk_buff *skb_prev;
957 alloc_new_skb:
958                         skb_prev = skb;
959
960                         /* There's no room in the current skb */
961                         if (skb_prev)
962                                 fraggap = skb_prev->len - maxfraglen;
963                         else
964                                 fraggap = 0;
965
966                         /*
967                          * If remaining data exceeds the mtu,
968                          * we know we need more fragment(s).
969                          */
970                         datalen = length + fraggap;
971                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
972                                 datalen = maxfraglen - fragheaderlen;
973
974                         fraglen = datalen + fragheaderlen;
975                         if ((flags & MSG_MORE) &&
976                             !(rt->u.dst.dev->features&NETIF_F_SG))
977                                 alloclen = mtu;
978                         else
979                                 alloclen = datalen + fragheaderlen;
980
981                         /*
982                          * The last fragment gets additional space at tail.
983                          * Note: we overallocate on fragments with MSG_MODE
984                          * because we have no idea if we're the last one.
985                          */
986                         if (datalen == length + fraggap)
987                                 alloclen += rt->u.dst.trailer_len;
988
989                         /*
990                          * We just reserve space for fragment header.
991                          * Note: this may be overallocation if the message 
992                          * (without MSG_MORE) fits into the MTU.
993                          */
994                         alloclen += sizeof(struct frag_hdr);
995
996                         if (transhdrlen) {
997                                 skb = sock_alloc_send_skb(sk,
998                                                 alloclen + hh_len,
999                                                 (flags & MSG_DONTWAIT), &err);
1000                         } else {
1001                                 skb = NULL;
1002                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1003                                     2 * sk->sk_sndbuf)
1004                                         skb = sock_wmalloc(sk,
1005                                                            alloclen + hh_len, 1,
1006                                                            sk->sk_allocation);
1007                                 if (unlikely(skb == NULL))
1008                                         err = -ENOBUFS;
1009                         }
1010                         if (skb == NULL)
1011                                 goto error;
1012                         /*
1013                          *      Fill in the control structures
1014                          */
1015                         skb->ip_summed = csummode;
1016                         skb->csum = 0;
1017                         /* reserve for fragmentation */
1018                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1019
1020                         /*
1021                          *      Find where to start putting bytes
1022                          */
1023                         data = skb_put(skb, fraglen);
1024                         skb->nh.raw = data + exthdrlen;
1025                         data += fragheaderlen;
1026                         skb->h.raw = data + exthdrlen;
1027
1028                         if (fraggap) {
1029                                 skb->csum = skb_copy_and_csum_bits(
1030                                         skb_prev, maxfraglen,
1031                                         data + transhdrlen, fraggap, 0);
1032                                 skb_prev->csum = csum_sub(skb_prev->csum,
1033                                                           skb->csum);
1034                                 data += fraggap;
1035                                 skb_trim(skb_prev, maxfraglen);
1036                         }
1037                         copy = datalen - transhdrlen - fraggap;
1038                         if (copy < 0) {
1039                                 err = -EINVAL;
1040                                 kfree_skb(skb);
1041                                 goto error;
1042                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1043                                 err = -EFAULT;
1044                                 kfree_skb(skb);
1045                                 goto error;
1046                         }
1047
1048                         offset += copy;
1049                         length -= datalen - fraggap;
1050                         transhdrlen = 0;
1051                         exthdrlen = 0;
1052                         csummode = CHECKSUM_NONE;
1053
1054                         /*
1055                          * Put the packet on the pending queue
1056                          */
1057                         __skb_queue_tail(&sk->sk_write_queue, skb);
1058                         continue;
1059                 }
1060
1061                 if (copy > length)
1062                         copy = length;
1063
1064                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1065                         unsigned int off;
1066
1067                         off = skb->len;
1068                         if (getfrag(from, skb_put(skb, copy),
1069                                                 offset, copy, off, skb) < 0) {
1070                                 __skb_trim(skb, off);
1071                                 err = -EFAULT;
1072                                 goto error;
1073                         }
1074                 } else {
1075                         int i = skb_shinfo(skb)->nr_frags;
1076                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1077                         struct page *page = sk->sk_sndmsg_page;
1078                         int off = sk->sk_sndmsg_off;
1079                         unsigned int left;
1080
1081                         if (page && (left = PAGE_SIZE - off) > 0) {
1082                                 if (copy >= left)
1083                                         copy = left;
1084                                 if (page != frag->page) {
1085                                         if (i == MAX_SKB_FRAGS) {
1086                                                 err = -EMSGSIZE;
1087                                                 goto error;
1088                                         }
1089                                         get_page(page);
1090                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1091                                         frag = &skb_shinfo(skb)->frags[i];
1092                                 }
1093                         } else if(i < MAX_SKB_FRAGS) {
1094                                 if (copy > PAGE_SIZE)
1095                                         copy = PAGE_SIZE;
1096                                 page = alloc_pages(sk->sk_allocation, 0);
1097                                 if (page == NULL) {
1098                                         err = -ENOMEM;
1099                                         goto error;
1100                                 }
1101                                 sk->sk_sndmsg_page = page;
1102                                 sk->sk_sndmsg_off = 0;
1103
1104                                 skb_fill_page_desc(skb, i, page, 0, 0);
1105                                 frag = &skb_shinfo(skb)->frags[i];
1106                                 skb->truesize += PAGE_SIZE;
1107                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1108                         } else {
1109                                 err = -EMSGSIZE;
1110                                 goto error;
1111                         }
1112                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1113                                 err = -EFAULT;
1114                                 goto error;
1115                         }
1116                         sk->sk_sndmsg_off += copy;
1117                         frag->size += copy;
1118                         skb->len += copy;
1119                         skb->data_len += copy;
1120                 }
1121                 offset += copy;
1122                 length -= copy;
1123         }
1124         return 0;
1125 error:
1126         inet->cork.length -= length;
1127         IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1128         return err;
1129 }
1130
1131 int ip6_push_pending_frames(struct sock *sk)
1132 {
1133         struct sk_buff *skb, *tmp_skb;
1134         struct sk_buff **tail_skb;
1135         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1136         struct inet_sock *inet = inet_sk(sk);
1137         struct ipv6_pinfo *np = inet6_sk(sk);
1138         struct ipv6hdr *hdr;
1139         struct ipv6_txoptions *opt = np->cork.opt;
1140         struct rt6_info *rt = np->cork.rt;
1141         struct flowi *fl = &inet->cork.fl;
1142         unsigned char proto = fl->proto;
1143         int err = 0;
1144
1145         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1146                 goto out;
1147         tail_skb = &(skb_shinfo(skb)->frag_list);
1148
1149         /* move skb->data to ip header from ext header */
1150         if (skb->data < skb->nh.raw)
1151                 __skb_pull(skb, skb->nh.raw - skb->data);
1152         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1153                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1154                 *tail_skb = tmp_skb;
1155                 tail_skb = &(tmp_skb->next);
1156                 skb->len += tmp_skb->len;
1157                 skb->data_len += tmp_skb->len;
1158                 skb->truesize += tmp_skb->truesize;
1159                 __sock_put(tmp_skb->sk);
1160                 tmp_skb->destructor = NULL;
1161                 tmp_skb->sk = NULL;
1162         }
1163
1164         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1165         __skb_pull(skb, skb->h.raw - skb->nh.raw);
1166         if (opt && opt->opt_flen)
1167                 ipv6_push_frag_opts(skb, opt, &proto);
1168         if (opt && opt->opt_nflen)
1169                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1170
1171         skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1172         
1173         *(u32*)hdr = fl->fl6_flowlabel |
1174                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1175
1176         if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1177                 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1178         else
1179                 hdr->payload_len = 0;
1180         hdr->hop_limit = np->cork.hop_limit;
1181         hdr->nexthdr = proto;
1182         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1183         ipv6_addr_copy(&hdr->daddr, final_dst);
1184
1185         skb->dst = dst_clone(&rt->u.dst);
1186         IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 
1187         err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1188         if (err) {
1189                 if (err > 0)
1190                         err = np->recverr ? net_xmit_errno(err) : 0;
1191                 if (err)
1192                         goto error;
1193         }
1194
1195 out:
1196         inet->cork.flags &= ~IPCORK_OPT;
1197         kfree(np->cork.opt);
1198         np->cork.opt = NULL;
1199         if (np->cork.rt) {
1200                 dst_release(&np->cork.rt->u.dst);
1201                 np->cork.rt = NULL;
1202                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1203         }
1204         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1205         return err;
1206 error:
1207         goto out;
1208 }
1209
1210 void ip6_flush_pending_frames(struct sock *sk)
1211 {
1212         struct inet_sock *inet = inet_sk(sk);
1213         struct ipv6_pinfo *np = inet6_sk(sk);
1214         struct sk_buff *skb;
1215
1216         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1217                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1218                 kfree_skb(skb);
1219         }
1220
1221         inet->cork.flags &= ~IPCORK_OPT;
1222
1223         kfree(np->cork.opt);
1224         np->cork.opt = NULL;
1225         if (np->cork.rt) {
1226                 dst_release(&np->cork.rt->u.dst);
1227                 np->cork.rt = NULL;
1228                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1229         }
1230         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1231 }