/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

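/* Select the Identification value for a new fragment header.  A single
 * global counter, serialized by ip6_id_lock, is shared by all flows;
 * zero is skipped on wraparound so that a zero frag_id can be used to
 * mean "not yet chosen" (see the slow path in ip6_fragment() below).
 */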
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
        static u32 ipv6_fragmentation_id = 1;
        static DEFINE_SPINLOCK(ip6_id_lock);

        spin_lock_bh(&ip6_id_lock);
        fhdr->identification = htonl(ipv6_fragmentation_id);
        if (++ipv6_fragmentation_id == 0)
                ipv6_fragmentation_id = 1;
        spin_unlock_bh(&ip6_id_lock);
}

static inline int ip6_output_finish(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;

        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);

        netif_rx(newskb);
        return 0;
}

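/* Second half of the output path: if the destination is a multicast
 * group that this host has itself joined (and the socket has mc_loop
 * set), a clone of the packet is looped back through the POST_ROUTING
 * hook before the original is handed to ip6_output_finish().
 */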
static int ip6_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
                struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

                if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
                    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
                                &skb->nh.ipv6h->saddr)) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip6_dev_loopback_xmit);

                        if (skb->nh.ipv6h->hop_limit == 0) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        }

        return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
                       ip6_output_finish);
}

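/* Entry point installed as dst->output.  Oversized packets go through
 * ip6_fragment(), as do destinations flagged "allfrag" (a reported path
 * MTU fell below the IPv6 minimum of 1280, so every packet must carry a
 * fragment header); everything else is sent directly.
 */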
int ip6_output(struct sk_buff *skb)
{
        if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
                return ip6_fragment(skb, ip6_output2);
        else
                return ip6_output2(skb);
}

#ifdef CONFIG_NETFILTER
int ip6_route_me_harder(struct sk_buff *skb)
{
        struct ipv6hdr *iph = skb->nh.ipv6h;
        struct dst_entry *dst;
        struct flowi fl = {
                .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
                .nl_u =
                { .ip6_u =
                  { .daddr = iph->daddr,
                    .saddr = iph->saddr, } },
                .proto = iph->nexthdr,
        };

        dst = ip6_route_output(skb->sk, &fl);

        if (dst->error) {
                IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
                LIMIT_NETDEBUG(
                        printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
                dst_release(dst);
                return -EINVAL;
        }

        /* Drop old route. */
        dst_release(skb->dst);

        skb->dst = dst;
        return 0;
}
#endif

static inline int ip6_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
        if (skb->nfcache & NFC_ALTERED) {
                if (ip6_route_me_harder(skb) != 0) {
                        kfree_skb(skb);
                        return -EINVAL;
                }
        }
#endif /* CONFIG_NETFILTER */
        return dst_output(skb);
}

/*
 *      xmit an sk_buff (used by TCP)
 */

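/* Transmit a fully built transport segment: push any extension headers
 * supplied in *opt, fill in the IPv6 header, and hand the packet to the
 * LOCAL_OUT netfilter hook.  If the result exceeds the path MTU and the
 * caller did not set ipfragok, the packet is dropped and an ICMPv6
 * "packet too big" is delivered locally, so TCP can shrink its MSS.
 */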
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
        struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit;
        u32 mtu;

        if (opt) {
                int head_room;

                /* First: exthdrs may take lots of space (~8K for now)
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                return -ENOBUFS;
                        }
                        if (sk)
                                skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        hdr = skb->nh.ipv6h = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr));

        /*
         *      Fill in the IPv6 header
         */

        *(u32 *)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
        hlimit = -1;
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = dst_metric(dst, RTAX_HOPLIMIT);
        if (hlimit < 0)
                hlimit = ipv6_get_hoplimit(dst->dev);

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
                IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
                return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
                               ip6_maybe_reroute);
        }

        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}

/*
 *      To avoid extra problems, ND packets are sent through this
 *      routine.  It's code duplication, but I really want to avoid
 *      extra checks, since ipv6_build_header is used by TCP (which
 *      is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               struct in6_addr *saddr, struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        int totlen;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        totlen = len + sizeof(struct ipv6hdr);

        hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
        skb->nh.ipv6h = hdr;

        *(u32 *)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        ipv6_addr_copy(&hdr->saddr, saddr);
        ipv6_addr_copy(&hdr->daddr, daddr);

        return 0;
}

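/* Deliver a packet carrying a Router Alert option to every raw socket
 * that registered for this alert value (via the IPV6_ROUTER_ALERT
 * sockopt).  All but the last matching socket receive a clone; the last
 * one consumes the original skb, in which case 1 is returned and the
 * caller must not touch the packet again.
 */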
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

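/* Forwarding path.  The packet is dropped unless forwarding is enabled;
 * otherwise we run the XFRM forward-policy checks, hand Router Alert
 * packets to interested sockets, enforce and decrement the hop limit,
 * possibly emit a redirect, and bounce a "packet too big" error when the
 * frame does not fit the outgoing MTU (routers never fragment in IPv6).
 */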
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr = skb->nh.ipv6h;
        struct inet6_skb_parm *opt = IP6CB(skb);

        if (ipv6_devconf.forwarding == 0)
                goto error;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb->ip_summed = CHECKSUM_NONE;

        /*
         *      We do NOT process RA packets; we push them to user level
         *      AS IS, without any warranty that the application will be
         *      able to interpret them.  The reason is that we
         *      cannot do anything clever here.
         *
         *      We are not the end node, so if the packet contains
         *      AH/ESP, we cannot do anything.
         *      Defragmentation would also be a mistake: RA packets
         *      must not be fragmented, because there is no guarantee
         *      that different fragments will follow the same path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb->nh.raw + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
                            0, skb->dev);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb->dst;

        /* The IPv6 specs say nothing about it, but it is clear that we
           cannot send redirects to source routed frames.
         */
        if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if ((rt->rt6i_flags & RTF_GATEWAY))
                        target = (struct in6_addr *)&n->primary_key;
                else
                        target = &hdr->daddr;

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (xrlim_allow(dst, 1*HZ))
                        ndisc_send_redirect(skb, n, target);
        } else if (ipv6_addr_type(&hdr->saddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
                                                  |IPV6_ADDR_LINKLOCAL)) {
                /* This check is security critical. */
                goto error;
        }

        if (skb->len > dst_mtu(dst)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
                IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = skb->nh.ipv6h;

        /* Decrementing the hop limit is delayed until after the skb COW */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

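/* Propagate per-packet metadata (priority, dst, netfilter and traffic
 * control state) from the original skb to a freshly built fragment.
 */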
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        to->security = from->security;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
        to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#ifdef CONFIG_NETFILTER_DEBUG
        to->nf_debug = from->nf_debug;
#endif
#endif
}

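/* Walk the extension-header chain to find where a fragment header must
 * be inserted: after any hop-by-hop, routing, and pre-routing
 * destination-options headers, which per RFC 2460 are examined by nodes
 * along the path and therefore belong to the unfragmentable part.
 * Returns the offset of that point and sets *nexthdr to the nexthdr
 * byte that will be overwritten with NEXTHDR_FRAGMENT.
 */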
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(skb->nh.ipv6h + 1);
        unsigned int packet_len = skb->tail - skb->nh.raw;
        int found_rhdr = 0;
        *nexthdr = &skb->nh.ipv6h->nexthdr;

        while (offset + 1 <= packet_len) {

                switch (**nexthdr) {

                case NEXTHDR_HOP:
                case NEXTHDR_ROUTING:
                case NEXTHDR_DEST:
                        if (**nexthdr == NEXTHDR_ROUTING)
                                found_rhdr = 1;
                        if (**nexthdr == NEXTHDR_DEST && found_rhdr)
                                return offset;
                        offset += ipv6_optlen(exthdr);
                        *nexthdr = &exthdr->nexthdr;
                        exthdr = (struct ipv6_opt_hdr *)(skb->nh.raw + offset);
                        break;
                default:
                        return offset;
                }
        }

        return offset;
}

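/* Split a too-large packet into fragments.  Two strategies: a fast path
 * that reuses an existing frag_list (each list member already has room
 * for the headers, so the pieces are fixed up in place), and a slow path
 * that allocates a fresh skb per fragment and copies the payload into it
 * in 8-byte-aligned chunks.
 */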
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct net_device *dev;
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb->dst;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        u32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        dev = rt->u.dst.dev;
        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

        if (skb_shinfo(skb)->frag_list) {
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                /* BUILD HEADER */

                tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                *prevhdr = NEXTHDR_FRAGMENT;
                memcpy(tmp_hdr, skb->nh.raw, hlen);
                __skb_pull(skb, hlen);
                fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
                skb->nh.raw = __skb_push(skb, hlen);
                memcpy(skb->nh.raw, tmp_hdr, hlen);

                ipv6_select_ident(skb, fh);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, tmp_hdr, hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                      LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                frag->nh.raw = frag->data;
                fh = (struct frag_hdr *)(frag->data + hlen);
                frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                memcpy(frag->nh.raw, skb->data, hlen);

                /*
                 *      Build fragment header: pick a fresh identification
                 *      for the first fragment, then reuse it for the rest.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(skb, fh);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, frag->h.raw, len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                err = output(frag);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

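/* Resolve the destination route for a flow.  A dst cached on the socket
 * is reused only if it still matches the flow: for host routes the
 * cached destination must equal fl6_dst (daddr_cache covers connected
 * sockets using a network route), and the bound output interface must
 * agree.  Otherwise do a fresh routing lookup, and pick a source address
 * when the caller left fl6_src unspecified.
 */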
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        int err = 0;

        *dst = NULL;
        if (sk) {
                struct ipv6_pinfo *np = inet6_sk(sk);

                *dst = sk_dst_check(sk, np->dst_cookie);
                if (*dst) {
                        struct rt6_info *rt = (struct rt6_info *)*dst;

                        /* Yes, checking route validity in the not connected
                           case is not very simple. Take into account that
                           we do not support routing by source, TOS,
                           and MSG_DONTROUTE            --ANK (980726)

                           1. If the route was a host route, check that the
                              cached destination is current.
                              If it is a network route, we still may
                              check its validity using the saved pointer
                              to the last used address: daddr_cache.
                              We do not want to save the whole address now
                              (because the main consumer of this service
                               is tcp, which does not have this problem),
                              so the last trick works only on connected
                              sockets.
                           2. oif also should be the same.
                         */

                        if (((rt->rt6i_dst.plen != 128 ||
                              !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
                             && (np->daddr_cache == NULL ||
                                 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
                            || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
                                dst_release(*dst);
                                *dst = NULL;
                        }
                }
        }

        if (*dst == NULL)
                *dst = ip6_route_output(sk, fl);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl->fl6_src)) {
                err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

                if (err) {
#if IP6_DEBUG >= 2
                        printk(KERN_DEBUG "ip6_dst_lookup: "
                               "no available source address\n");
#endif
                        goto out_err_release;
                }
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;
        return err;
}

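/* Queue data on the socket's write queue, building MTU-sized skbs as we
 * go ("corking").  The first call records the route, options, and MTU in
 * the cork state; later calls keep appending until the caller invokes
 * ip6_push_pending_frames() to emit the queue, or
 * ip6_flush_pending_frames() to discard it.  getfrag() copies user data
 * into place and typically also accumulates the checksum.
 */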
int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, int length, int transhdrlen,
                    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl,
                    struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (np->cork.opt == NULL) {
                                np->cork.opt = kmalloc(opt->tot_len,
                                                       sk->sk_allocation);
                                if (unlikely(np->cork.opt == NULL))
                                        return -ENOBUFS;
                        } else if (np->cork.opt->tot_len < opt->tot_len) {
                                printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
                                return -EINVAL;
                        }
                        memcpy(np->cork.opt, opt, opt->tot_len);
                        inet->cork.flags |= IPCORK_OPT;
                        /* need source address above miyazawa*/
                }
                dst_hold(&rt->u.dst);
                np->cork.rt = rt;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
                inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
                if (dst_allfrag(rt->u.dst.path))
                        inet->cork.flags |= IPCORK_ALLFRAG;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                rt = np->cork.rt;
                fl = &inet->cork.fl;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        inet->cork.length += length;

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (inet->cork.length <= mtu &&
                        !(inet->cork.flags & IPCORK_ALLFRAG) ?
                        mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;

                        /* There's no room in the current skb */
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > (inet->cork.length <= mtu &&
                                       !(inet->cork.flags & IPCORK_ALLFRAG) ?
                                       mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;

                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /*
                         * The last fragment gets additional space at tail.
                         * Note: we overallocate on fragments with MSG_MORE
                         * because we have no idea if we're the last one.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                skb_trim(skb_prev, maxfraglen);
                        }
                        copy = datalen - transhdrlen - fraggap;
                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 &&
                                   getfrag(from, data + transhdrlen, offset,
                                           copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from,
                                    page_address(frag->page) + frag->page_offset +
                                    frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }
        return 0;
error:
        inet->cork.length -= length;
        IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

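/* Collapse the corked write queue into a single skb (the trailing skbs
 * become its frag_list), prepend the cached extension headers and the
 * IPv6 header, and send the result through the LOCAL_OUT hook.  The cork
 * state is cleared whether or not transmission succeeded.
 */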
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
        struct rt6_info *rt = np->cork.rt;
        struct flowi *fl = &inet->cork.fl;
        unsigned char proto = fl->proto;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        ipv6_addr_copy(final_dst, &fl->fl6_dst);
        __skb_pull(skb, skb->h.raw - skb->nh.raw);
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb->nh.ipv6h = hdr = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr));

        *(u32 *)hdr = fl->fl6_flowlabel | htonl(0x60000000);

        if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
                hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
        else
                hdr->payload_len = 0;
        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, final_dst);

        skb->dst = dst_clone(&rt->u.dst);
        IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev,
                      dst_output);
        if (err) {
                if (err > 0)
                        err = np->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
        return err;
error:
        goto out;
}

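/* Throw away anything still sitting on the corked write queue and reset
 * the cork state; used on error paths after ip6_append_data().
 */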
void ip6_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        inet->cork.flags &= ~IPCORK_OPT;

        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}