/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

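/*
 * Pick the Identification value for a fragment header.  A single global
 * counter is shared by all flows; the spinlock serializes updates, and
 * the wrap-around check keeps the value from ever being 0 (the counter
 * starts at 1 and is reset to 1 on overflow).
 */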
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
        static u32 ipv6_fragmentation_id = 1;
        static DEFINE_SPINLOCK(ip6_id_lock);

        spin_lock_bh(&ip6_id_lock);
        fhdr->identification = htonl(ipv6_fragmentation_id);
        if (++ipv6_fragmentation_id == 0)
                ipv6_fragmentation_id = 1;
        spin_unlock_bh(&ip6_id_lock);
}

static inline int ip6_output_finish(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;

        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);

        netif_rx(newskb);
        return 0;
}

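/*
 * ip6_output2() performs the link-layer stage of output.  For multicast
 * destinations it first loops a clone of the packet back to local
 * listeners (unless the sending socket cleared IPV6_MULTICAST_LOOP),
 * then hands the original to the NF_IP6_POST_ROUTING hook.
 */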
static int ip6_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
                struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

                if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
                    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
                                &skb->nh.ipv6h->saddr)) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip6_dev_loopback_xmit);

                        if (skb->nh.ipv6h->hop_limit == 0) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        }

        return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
                       ip6_output_finish);
}

int ip6_output(struct sk_buff *skb)
{
        if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
                return ip6_fragment(skb, ip6_output2);
        else
                return ip6_output2(skb);
}

#ifdef CONFIG_NETFILTER
int ip6_route_me_harder(struct sk_buff *skb)
{
        struct ipv6hdr *iph = skb->nh.ipv6h;
        struct dst_entry *dst;
        struct flowi fl = {
                .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
                .nl_u =
                { .ip6_u =
                  { .daddr = iph->daddr,
                    .saddr = iph->saddr, } },
                .proto = iph->nexthdr,
        };

        dst = ip6_route_output(skb->sk, &fl);

        if (dst->error) {
                IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
                LIMIT_NETDEBUG(
                        printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
                dst_release(dst);
                return -EINVAL;
        }

        /* Drop old route. */
        dst_release(skb->dst);

        skb->dst = dst;
        return 0;
}
#endif

static inline int ip6_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
        if (skb->nfcache & NFC_ALTERED) {
                if (ip6_route_me_harder(skb) != 0) {
                        kfree_skb(skb);
                        return -EINVAL;
                }
        }
#endif /* CONFIG_NETFILTER */
        return dst_output(skb);
}

/*
 *      xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
        struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit;
        u32 mtu;

        if (opt) {
                int head_room;

                /* First: exthdrs may take lots of space (~8K for now)
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                return -ENOBUFS;
                        }
                        if (sk)
                                skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

        /*
         *      Fill in the IPv6 header
         */

        *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
        hlimit = -1;
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = dst_metric(dst, RTAX_HOPLIMIT);
        if (hlimit < 0)
                hlimit = ipv6_get_hoplimit(dst->dev);

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
                IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
                return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
                               ip6_maybe_reroute);
        }

        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
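
/*
 * Illustrative only, not part of this file: a transport protocol that
 * has already built its own header typically hands the skb to ip6_xmit()
 * roughly as sketched below.  The surrounding setup is assumed to have
 * been done by the caller, as TCPv6 does on its transmit paths.
 *
 *      struct ipv6_pinfo *np = inet6_sk(sk);
 *      struct flowi fl;
 *
 *      memset(&fl, 0, sizeof(fl));
 *      fl.proto = IPPROTO_TCP;
 *      ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
 *      ipv6_addr_copy(&fl.fl6_src, &np->saddr);
 *
 *      err = ip6_xmit(sk, skb, &fl, np->opt, 0);
 */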

/*
 *      To avoid extra problems ND packets are sent through this
 *      routine. It's code duplication but I really want to avoid
 *      extra checks since ipv6_build_header is used by TCP (which
 *      is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               struct in6_addr *saddr, struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        int totlen;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        totlen = len + sizeof(struct ipv6hdr);

        hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
        skb->nh.ipv6h = hdr;

        *(u32*)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        ipv6_addr_copy(&hdr->saddr, saddr);
        ipv6_addr_copy(&hdr->daddr, daddr);

        return 0;
}

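/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered interest in this alert value (ip6_ra_chain).  All but
 * the last matching socket receive a clone; the last one consumes the
 * original skb, so a return of 1 means ownership of skb was passed on.
 */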
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr = skb->nh.ipv6h;
        struct inet6_skb_parm *opt = IP6CB(skb);

        if (ipv6_devconf.forwarding == 0)
                goto error;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb->ip_summed = CHECKSUM_NONE;

        /*
         *      We do NOT process RA packets; we push them to user level
         *      AS IS, with no warranty that the application will be
         *      able to interpret them.  The reason is that we
         *      cannot do anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP, we cannot touch it.
         *      Defragmentation would also be a mistake: RA packets
         *      cannot be fragmented, because there is no warranty
         *      that different fragments will travel along one path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb->nh.raw + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
                            0, skb->dev);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb->dst;

        /* The IPv6 specs say nothing about it, but it is clear that we
           cannot send redirects to source routed frames.
         */
        if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if ((rt->rt6i_flags & RTF_GATEWAY))
                        target = (struct in6_addr*)&n->primary_key;
                else
                        target = &hdr->daddr;

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (xrlim_allow(dst, 1*HZ))
                        ndisc_send_redirect(skb, n, target);
        } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
                                                |IPV6_ADDR_LINKLOCAL)) {
                /* This check is security critical. */
                goto error;
        }

        if (skb->len > dst_mtu(dst)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
                IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = skb->nh.ipv6h;

        /* Mangling the hop count is delayed to this point, after skb COW */
        hdr->hop_limit--;

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
        to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#endif
}

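/*
 * Walk the extension header chain to find where a Fragment header must
 * be inserted: after the "unfragmentable part" (Hop-by-Hop, Routing, and
 * any Destination Options header that precedes a Routing header), as
 * required by RFC 2460 section 4.5.  Returns the byte offset at which
 * the Fragment header goes and points *nexthdr at the Next Header field
 * that has to be patched to NEXTHDR_FRAGMENT.
 */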
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
        unsigned int packet_len = skb->tail - skb->nh.raw;
        int found_rhdr = 0;
        *nexthdr = &skb->nh.ipv6h->nexthdr;

        while (offset + 1 <= packet_len) {
                switch (**nexthdr) {
                case NEXTHDR_HOP:
                case NEXTHDR_ROUTING:
                case NEXTHDR_DEST:
                        if (**nexthdr == NEXTHDR_ROUTING)
                                found_rhdr = 1;
                        if (**nexthdr == NEXTHDR_DEST && found_rhdr)
                                return offset;
                        offset += ipv6_optlen(exthdr);
                        *nexthdr = &exthdr->nexthdr;
                        exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
                        break;
                default:
                        return offset;
                }
        }

        return offset;
}

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct net_device *dev;
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info*)skb->dst;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        u32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        dev = rt->u.dst.dev;
        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

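        /*
         * Fast path: if the packet is already split into a chain of
         * sub-skbs on frag_list, each sub-skb can be turned into a
         * fragment in place, with no data copy.  Fall back to the slow
         * (copying) path if any piece has the wrong geometry, lacks
         * headroom for the headers, or is shared.
         */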
        if (skb_shinfo(skb)->frag_list) {
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                /* BUILD HEADER */

                tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                *prevhdr = NEXTHDR_FRAGMENT;
                memcpy(tmp_hdr, skb->nh.raw, hlen);
                __skb_pull(skb, hlen);
                fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
                skb->nh.raw = __skb_push(skb, hlen);
                memcpy(skb->nh.raw, tmp_hdr, hlen);

                ipv6_select_ident(skb, fh);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, tmp_hdr, hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                frag->nh.raw = frag->data;
                fh = (struct frag_hdr*)(frag->data + hlen);
                frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                memcpy(frag->nh.raw, skb->data, hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(skb, fh);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, frag->h.raw, len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                err = output(frag);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}
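
/*
 * Worked example of the fragment size arithmetic above (illustrative
 * numbers, not from this file): with a 1500-byte path MTU and a 40-byte
 * unfragmentable part (hlen), the per-fragment data budget is
 * 1500 - 40 - 8 = 1452 bytes; the eight-byte alignment in the slow path
 * then trims every non-final fragment to 1448 bytes of payload.
 */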

int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        int err = 0;

        *dst = NULL;
        if (sk) {
                struct ipv6_pinfo *np = inet6_sk(sk);

                *dst = sk_dst_check(sk, np->dst_cookie);
                if (*dst) {
                        struct rt6_info *rt = (struct rt6_info*)*dst;

                        /* Yes, checking route validity in the unconnected
                           case is not very simple. Take into account that
                           we do not support routing by source, TOS, and
                           MSG_DONTROUTE            --ANK (980726)

                           1. If the route was a host route, check that the
                              cached destination is current.
                              If it is a network route, we still may check
                              its validity using the saved pointer to the
                              last used address: daddr_cache.
                              We do not want to save the whole address now
                              (because the main consumer of this service is
                              TCP, which does not have this problem), so
                              this last trick works only on connected
                              sockets.
                           2. oif also should be the same.
                         */

                        if (((rt->rt6i_dst.plen != 128 ||
                              !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
                             && (np->daddr_cache == NULL ||
                                 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
                            || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
                                dst_release(*dst);
                                *dst = NULL;
                        }
                }
        }

        if (*dst == NULL)
                *dst = ip6_route_output(sk, fl);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl->fl6_src)) {
                err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

                if (err)
                        goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;
        return err;
}

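/*
 * Queue data on the socket's write queue for a corked datagram socket.
 * The first call sets up the cork state (copies the options, holds the
 * route, records the flow and MTU); later calls reuse it.  getfrag()
 * copies caller data into each skb.  ip6_push_pending_frames() later
 * assembles the queued skbs into one IPv6 packet and sends it.
 */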
int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, int length, int transhdrlen,
                    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl,
                    struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (np->cork.opt == NULL) {
                                np->cork.opt = kmalloc(opt->tot_len,
                                                       sk->sk_allocation);
                                if (unlikely(np->cork.opt == NULL))
                                        return -ENOBUFS;
                        } else if (np->cork.opt->tot_len < opt->tot_len) {
                                printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
                                return -EINVAL;
                        }
                        memcpy(np->cork.opt, opt, opt->tot_len);
                        inet->cork.flags |= IPCORK_OPT;
                        /* need source address above --miyazawa */
                }
                dst_hold(&rt->u.dst);
                np->cork.rt = rt;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
                inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
                if (dst_allfrag(rt->u.dst.path))
                        inet->cork.flags |= IPCORK_ALLFRAG;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                rt = np->cork.rt;
                fl = &inet->cork.fl;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
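
        /*
         * Example with illustrative numbers: for mtu = 1500 and no
         * destination options (fragheaderlen = 40), maxfraglen is
         * ((1500 - 40) & ~7) + 40 - 8 = 1456 + 32 = 1488 bytes, i.e. the
         * largest skb->len that still leaves room for the Fragment
         * header if the packet must be fragmented later.
         */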

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        inet->cork.length += length;

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;

                        /* There's no room in the current skb */
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;

                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /*
                         * The last fragment gets additional space at tail.
                         * Note: we overallocate on fragments with MSG_MORE
                         * because we have no idea if we're the last one.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation */
                        skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                skb_trim(skb_prev, maxfraglen);
                        }
                        copy = datalen - transhdrlen - fraggap;
                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 && getfrag(from, data + transhdrlen,
                                                       offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from,
                                    page_address(frag->page) + frag->page_offset + frag->size,
                                    offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }
        return 0;
error:
        inet->cork.length -= length;
        IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}
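
/*
 * Illustrative only, not part of this file: a datagram protocol pairs
 * ip6_append_data() with ip6_push_pending_frames() roughly as below
 * (the getfrag callback name here is hypothetical; UDPv6 follows this
 * pattern with its own callback and header fixup before the push).
 *
 *      err = ip6_append_data(sk, my_getfrag, msg->msg_iov, len,
 *                            sizeof(struct udphdr), hlimit, opt,
 *                            fl, rt, msg->msg_flags);
 *      if (err)
 *              ip6_flush_pending_frames(sk);
 *      else if (!(msg->msg_flags & MSG_MORE))
 *              err = ip6_push_pending_frames(sk);
 */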

int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
        struct rt6_info *rt = np->cork.rt;
        struct flowi *fl = &inet->cork.fl;
        unsigned char proto = fl->proto;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        ipv6_addr_copy(final_dst, &fl->fl6_dst);
        __skb_pull(skb, skb->h.raw - skb->nh.raw);
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));

        *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);

        if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
                hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
        else
                hdr->payload_len = 0;
        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, final_dst);

        skb->dst = dst_clone(&rt->u.dst);
        IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = np->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
        return err;
error:
        goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        inet->cork.flags &= ~IPCORK_OPT;

        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}