/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
        static u32 ipv6_fragmentation_id = 1;
        static DEFINE_SPINLOCK(ip6_id_lock);

        spin_lock_bh(&ip6_id_lock);
        fhdr->identification = htonl(ipv6_fragmentation_id);
        if (++ipv6_fragmentation_id == 0)
                ipv6_fragmentation_id = 1;
        spin_unlock_bh(&ip6_id_lock);
}
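
/*
 * Note: the identification counter above wraps back to 1 and never hands
 * out 0, so callers can use a frag_id of 0 to mean "no ID chosen yet"
 * (see the slow path of ip6_fragment() below).
 */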

static inline int ip6_output_finish(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;

        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);

        netif_rx(newskb);
        return 0;
}

static int ip6_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
                struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

                if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
                    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
                                &skb->nh.ipv6h->saddr)) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip6_dev_loopback_xmit);

                        if (skb->nh.ipv6h->hop_limit == 0) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        }

        return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
                       ip6_output_finish);
}

int ip6_output(struct sk_buff *skb)
{
        if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
            dst_allfrag(skb->dst))
                return ip6_fragment(skb, ip6_output2);
        else
                return ip6_output2(skb);
}

/*
 *      xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
        struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit, tclass;
        u32 mtu;

        if (opt) {
                int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                return -ENOBUFS;
                        }
                        if (sk)
                                skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

        /*
         *      Fill in the IPv6 header
         */

        hlimit = -1;
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = dst_metric(dst, RTAX_HOPLIMIT);
        if (hlimit < 0)
                hlimit = ipv6_get_hoplimit(dst->dev);

        tclass = -1;
        if (np)
                tclass = np->tclass;
        if (tclass < 0)
                tclass = 0;

        /* First 32-bit word: version (6), traffic class, flow label. */
        *(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
                IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
                return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
                                dst_output);
        }

        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
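
/*
 * Illustrative sketch (not part of the original file): a typical caller
 * such as TCP sets up a struct flowi with the addresses and protocol,
 * routes it, attaches the resulting dst to the skb, and then does
 * something like:
 *
 *      err = ip6_xmit(sk, skb, &fl, np->opt, 0);
 *
 * with ipfragok == 0, so that an over-MTU segment is bounced back to the
 * sender via a local PKT_TOOBIG rather than fragmented.
 */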

/*
 *      To avoid extra problems ND packets are sent through this
 *      routine. It's code duplication but I really want to avoid
 *      extra checks since ipv6_build_header is used by TCP (which
 *      is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               struct in6_addr *saddr, struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        int totlen;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        totlen = len + sizeof(struct ipv6hdr);

        hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
        skb->nh.ipv6h = hdr;

        *(u32*)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        ipv6_addr_copy(&hdr->saddr, saddr);
        ipv6_addr_copy(&hdr->daddr, daddr);

        return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr = skb->nh.ipv6h;
        struct inet6_skb_parm *opt = IP6CB(skb);

        if (ipv6_devconf.forwarding == 0)
                goto error;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb->ip_summed = CHECKSUM_NONE;

        /*
         *      We do NOT do any processing on RA packets: they are pushed
         *      to user level AS IS, without any warranty that the
         *      application will be able to interpret them. The reason is
         *      that we cannot do anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything with it either.
         *      Defragmentation would also be a mistake; RA packets
         *      cannot be fragmented, because there is no warranty
         *      that different fragments will go along one path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb->nh.raw + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
                            0, skb->dev);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb->dst;

        /* The IPv6 specs say nothing about it, but it is clear that we
           cannot send redirects for source routed frames.
         */
        if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if ((rt->rt6i_flags & RTF_GATEWAY))
                        target = (struct in6_addr*)&n->primary_key;
                else
                        target = &hdr->daddr;

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (xrlim_allow(dst, 1*HZ))
                        ndisc_send_redirect(skb, n, target);
        } else if (ipv6_addr_type(&hdr->saddr) &
                   (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK |
                    IPV6_ADDR_LINKLOCAL)) {
                /* This check is security critical. */
                goto error;
        }

        if (skb->len > dst_mtu(dst)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
                IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = skb->nh.ipv6h;

        /* Mangling the hop limit is delayed to this point, after skb COW */
        hdr->hop_limit--;

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
        to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#endif
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
        unsigned int packet_len = skb->tail - skb->nh.raw;
        int found_rhdr = 0;
        *nexthdr = &skb->nh.ipv6h->nexthdr;

        while (offset + 1 <= packet_len) {

                switch (**nexthdr) {

                case NEXTHDR_HOP:
                case NEXTHDR_ROUTING:
                case NEXTHDR_DEST:
                        if (**nexthdr == NEXTHDR_ROUTING)
                                found_rhdr = 1;
                        if (**nexthdr == NEXTHDR_DEST && found_rhdr)
                                return offset;
                        offset += ipv6_optlen(exthdr);
                        *nexthdr = &exthdr->nexthdr;
                        exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
                        break;
                default:
                        return offset;
                }
        }

        return offset;
}
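
/*
 * Example (illustrative): for a packet with no extension headers the loop
 * above never advances, so the offset returned is 40 (sizeof(struct
 * ipv6hdr)) and *nexthdr points at the fixed header's nexthdr field; the
 * Fragment header is then inserted directly behind the IPv6 header. With
 * an 8-byte Hop-by-Hop options header in front, it returns 48 instead.
 */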

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct net_device *dev;
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info*)skb->dst;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        u32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        dev = rt->u.dst.dev;
        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

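        /*
         * Worked example (illustrative): with a 1500-byte path MTU and no
         * extension headers before the fragmentation point (hlen == 40),
         * each fragment may carry 1500 - 40 - 8 = 1452 bytes of payload;
         * every fragment except the last must additionally be a multiple
         * of 8 bytes, hence the "& 7" checks below.
         */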
        if (skb_shinfo(skb)->frag_list) {
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                            goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                /* BUILD HEADER */

                tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                *prevhdr = NEXTHDR_FRAGMENT;
                memcpy(tmp_hdr, skb->nh.raw, hlen);
                __skb_pull(skb, hlen);
                fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
                skb->nh.raw = __skb_push(skb, hlen);
                memcpy(skb->nh.raw, tmp_hdr, hlen);

                ipv6_select_ident(skb, fh);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, tmp_hdr, hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                      LL_RESERVED_SPACE(rt->u.dst.dev),
                                      GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                frag->nh.raw = frag->data;
                fh = (struct frag_hdr*)(frag->data + hlen);
                frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                memcpy(frag->nh.raw, skb->data, hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(skb, fh);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, frag->h.raw, len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                err = output(frag);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        int err = 0;

        *dst = NULL;
        if (sk) {
                struct ipv6_pinfo *np = inet6_sk(sk);

                *dst = sk_dst_check(sk, np->dst_cookie);
                if (*dst) {
                        struct rt6_info *rt = (struct rt6_info*)*dst;

                        /* Yes, checking route validity in the unconnected
                           case is not very simple. Take into account that
                           we do not support routing by source, TOS, and
                           MSG_DONTROUTE            --ANK (980726)

                           1. If the route was a host route, check that the
                              cached destination is current.
                              If it is a network route, we still may
                              check its validity using the saved pointer
                              to the last used address: daddr_cache.
                              We do not want to save the whole address now,
                              (because the main consumer of this service
                               is tcp, which does not have this problem),
                              so the last trick works only on connected
                              sockets.
                           2. oif also should be the same.
                         */

                        if (((rt->rt6i_dst.plen != 128 ||
                              !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
                             && (np->daddr_cache == NULL ||
                                 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
                            || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
                                dst_release(*dst);
                                *dst = NULL;
                        }
                }
        }

        if (*dst == NULL)
                *dst = ip6_route_output(sk, fl);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl->fl6_src)) {
                err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

                if (err)
                        goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;
        return err;
}

inline int ip6_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)
{
        struct sk_buff *skb;
        int err;

        /* There is support for UDP large send offload by the network
         * device, so create one single skb packet containing the complete
         * udp datagram
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (skb == NULL)
                        return -ENOMEM;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb->nh.raw = skb->data;

                /* initialize protocol header pointer */
                skb->h.raw = skb->data + fragheaderlen;

                skb->ip_summed = CHECKSUM_HW;
                skb->csum = 0;
                sk->sk_sndmsg_off = 0;
        }

        err = skb_append_datato_frags(sk, skb, getfrag, from,
                                      (length - transhdrlen));
        if (!err) {
                struct frag_hdr fhdr;

                /* specify the length of each IP datagram fragment */
                skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
                                                sizeof(struct frag_hdr);
                ipv6_select_ident(skb, &fhdr);
                skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
                __skb_queue_tail(&sk->sk_write_queue, skb);

                return 0;
        }
        /* There is not enough support to do UDP LSO,
         * so follow the normal path
         */
        kfree_skb(skb);

        return err;
}
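
/*
 * Note (illustrative): ufo_size is the per-fragment payload size a
 * UFO-capable device should use when it segments the large datagram in
 * hardware; it mirrors the software fragmentation arithmetic above,
 * i.e. MTU minus the unfragmentable headers and the 8-byte Fragment
 * header.
 */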

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
        int offset, int len, int odd, struct sk_buff *skb),
        void *from, int length, int transhdrlen,
        int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
        struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;

        if (flags & MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (np->cork.opt == NULL) {
                                np->cork.opt = kmalloc(opt->tot_len,
                                                       sk->sk_allocation);
                                if (unlikely(np->cork.opt == NULL))
                                        return -ENOBUFS;
                        } else if (np->cork.opt->tot_len < opt->tot_len) {
                                printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
                                return -EINVAL;
                        }
                        memcpy(np->cork.opt, opt, opt->tot_len);
                        inet->cork.flags |= IPCORK_OPT;
                        /* need source address above miyazawa */
                }
                dst_hold(&rt->u.dst);
                np->cork.rt = rt;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
                np->cork.tclass = tclass;
                inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
                if (dst_allfrag(rt->u.dst.path))
                        inet->cork.flags |= IPCORK_ALLFRAG;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                rt = np->cork.rt;
                fl = &inet->cork.fl;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

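        /*
         * Worked example (illustrative): with mtu = 1500 and a bare IPv6
         * header (fragheaderlen = 40), maxfraglen above works out to
         * ((1500 - 40) & ~7) + 40 - 8 = 1456 + 32 = 1488, i.e. the largest
         * packet length whose fragmentable part both leaves room for the
         * 8-byte Fragment header and stays a multiple of 8.
         */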
        inet->cork.length += length;
        if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {

                if (ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
                                fragheaderlen, transhdrlen, mtu, flags))
                        goto error;

                return 0;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;

                        /* There's no room in the current skb */
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;

                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features & NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /*
                         * The last fragment gets additional space at tail.
                         * Note: we overallocate on fragments with MSG_MORE
                         * because we have no idea if we're the last one.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                skb_trim(skb_prev, maxfraglen);
                        }
                        copy = datalen - transhdrlen - fraggap;
                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 && getfrag(from, data + transhdrlen,
                                                       offset, copy, fraggap,
                                                       skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from,
                                    page_address(frag->page) +
                                    frag->page_offset + frag->size,
                                    offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }
        return 0;
error:
        inet->cork.length -= length;
        IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

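/*
 * Note (illustrative): ip6_append_data(), ip6_push_pending_frames() and
 * ip6_flush_pending_frames() together implement datagram corking: data
 * accumulates on sk->sk_write_queue via ip6_append_data(), and is then
 * either built into packets and transmitted by ip6_push_pending_frames()
 * or discarded by ip6_flush_pending_frames().
 */
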
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
        struct rt6_info *rt = np->cork.rt;
        struct flowi *fl = &inet->cork.fl;
        unsigned char proto = fl->proto;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        ipv6_addr_copy(final_dst, &fl->fl6_dst);
        __skb_pull(skb, skb->h.raw - skb->nh.raw);
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));

        *(u32*)hdr = fl->fl6_flowlabel |
                     htonl(0x60000000 | ((int)np->cork.tclass << 20));

        if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
                hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
        else
                hdr->payload_len = 0;
        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, final_dst);

        skb->dst = dst_clone(&rt->u.dst);
        IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = np->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        kfree(np->cork.opt);
        np->cork.opt = NULL;
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
        return err;
error:
        goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        inet->cork.flags &= ~IPCORK_OPT;

        kfree(np->cork.opt);
        np->cork.opt = NULL;
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}