Merge branch 'for-linus' of git://brick.kernel.dk/data/git/linux-2.6-block
[linux-2.6] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation 
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *      Based on linux/net/ipv4/ip_output.c
11  *
12  *      This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *      Changes:
18  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
19  *                              extension headers are implemented.
20  *                              route changes now work.
21  *                              ip6_forward does not confuse sniffers.
22  *                              etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *      Imran Patel     :       frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *                      :       add ip6_append_data and related functions
28  *                              for datagram xmit
29  */
30
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
/*
 * Assign the next IPv6 fragmentation identification to @fhdr.
 *
 * A single global counter is shared by all flows; the spinlock
 * serialises access, and the value 0 is skipped so an on-the-wire
 * ID of zero is never produced.  The ID is stored in network byte
 * order (frag id must be in NBO — see the changelog at file top).
 *
 * NOTE(review): @skb is unused here; apparently kept only so callers
 * share one signature — confirm before changing.
 */
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;	/* wrap around, skipping 0 */
	spin_unlock_bh(&ip6_id_lock);
}
72
/*
 * Final output step: prepend the link-layer header and hand the skb
 * to the neighbour layer.
 *
 * If the dst carries a cached hard header (hh_cache), copy it in
 * front of the packet under hh_lock and use the cached output method;
 * otherwise fall back to the neighbour's output function.  With
 * neither available the packet cannot leave the host: count it as
 * OUTNOROUTES, free it and return -EINVAL.
 */
static inline int ip6_output_finish(struct sk_buff *skb)
{

	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		/* hh_data is kept at the aligned length; copy that many
		 * bytes below skb->data, then expose only hh_len of them. */
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;

}
96
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100         newskb->mac.raw = newskb->data;
101         __skb_pull(newskb, newskb->nh.raw - newskb->data);
102         newskb->pkt_type = PACKET_LOOPBACK;
103         newskb->ip_summed = CHECKSUM_UNNECESSARY;
104         BUG_TRAP(newskb->dst);
105
106         netif_rx(newskb);
107         return 0;
108 }
109
110
/*
 * Second-stage output: send the skb through the POST_ROUTING netfilter
 * hook towards ip6_output_finish.
 *
 * For multicast destinations, additionally loop a clone back to local
 * listeners when the socket permits it (mc_loop) and a local member of
 * the group exists on the device.  A hop limit of 0 on such a packet
 * suppresses the on-the-wire copy entirely — the loopback clone is the
 * only delivery.
 */
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
				&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip6_dev_loopback_xmit);

			/* hop_limit == 0: must not leave the host. */
			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
}
147
148 int ip6_output(struct sk_buff *skb)
149 {
150         if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
151                 return ip6_fragment(skb, ip6_output2);
152         else
153                 return ip6_output2(skb);
154 }
155
/*
 *	xmit an sk_buff (used by TCP)
 *
 *	Pushes the extension headers from @opt (if any) and the IPv6
 *	header onto @skb, then sends it through the LOCAL_OUT netfilter
 *	hook.  If the resulting packet exceeds the path MTU and
 *	@ipfragok is not set, an ICMPV6_PKT_TOOBIG is generated towards
 *	ourselves and -EMSGSIZE returned.  Consumes @skb in all cases.
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			/* The original skb is consumed either way; the
			 * reallocated copy (if any) replaces it. */
			kfree_skb(skb);
			skb = skb2;
			if (skb == NULL) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				return -ENOBUFS;
			}
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

	/*
	 *	Fill in the IPv6 header
	 */

	/* Hop limit: socket setting, else route metric, else device default. */
	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
	if (hlimit < 0)
		hlimit = ipv6_get_hoplimit(dst->dev);

	/* Traffic class: socket setting, else 0. */
	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	/* Version (6), traffic class and flow label in one 32-bit store. */
	*(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok) {
		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
				dst_output);
	}

	/* Over-MTU and fragmentation not allowed: notify ourselves
	 * (e.g. so TCP lowers its MSS) and drop. */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
243
244 /*
245  *      To avoid extra problems ND packets are send through this
246  *      routine. It's code duplication but I really want to avoid
247  *      extra checks since ipv6_build_header is used by TCP (which
248  *      is for us performance critical)
249  */
250
251 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
252                struct in6_addr *saddr, struct in6_addr *daddr,
253                int proto, int len)
254 {
255         struct ipv6_pinfo *np = inet6_sk(sk);
256         struct ipv6hdr *hdr;
257         int totlen;
258
259         skb->protocol = htons(ETH_P_IPV6);
260         skb->dev = dev;
261
262         totlen = len + sizeof(struct ipv6hdr);
263
264         hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
265         skb->nh.ipv6h = hdr;
266
267         *(u32*)hdr = htonl(0x60000000);
268
269         hdr->payload_len = htons(len);
270         hdr->nexthdr = proto;
271         hdr->hop_limit = np->hop_limit;
272
273         ipv6_addr_copy(&hdr->saddr, saddr);
274         ipv6_addr_copy(&hdr->daddr, daddr);
275
276         return 0;
277 }
278
/*
 * Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain whose RA selector matches @sel and whose device
 * binding (if any) matches the incoming interface.
 *
 * Each matching socket except the last receives a clone; the final
 * match consumes @skb itself.  Returns 1 if the packet was delivered
 * (and thus consumed), 0 if no listener matched.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				/* Previous match was not the last: give it
				 * a clone so the original can travel on. */
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
307
/* Tail of the NF_IP6_FORWARD hook: pass the skb to the route's output
 * path. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
312
/*
 * Forward a received IPv6 packet towards its next hop.
 *
 * Performs the forwarding checks (forwarding enabled, XFRM policy,
 * Router Alert delivery, hop limit, redirect generation, source-scope
 * sanity, MTU) and on success decrements the hop limit and queues the
 * packet through the NF_IP6_FORWARD hook.  Consumes @skb on every
 * path.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		/* ptr[2..3] carries the Router Alert value; a registered
		 * listener (ip6_call_ra_chain) consumes the packet. */
		u8 *ptr = skb->nh.raw + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have changed the route. */
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
						|IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow() may have reallocated the header. */
	hdr = skb->nh.ipv6h;

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
427
/*
 * Copy per-packet metadata from @from onto a fragment @to so the
 * fragments are routed, scheduled and conntracked like the original.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Drop whatever dst the fragment carried; take a fresh reference
	 * to the original's. */
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#endif
}
453
/*
 * Find the offset (from the start of the IPv6 header) at which a
 * Fragment header must be inserted: the first extension header that
 * is not part of the unfragmentable chain.  Hop-by-hop, Routing and
 * Destination option headers are walked over; a Destination options
 * header seen after a Routing header already belongs to the
 * fragmentable part and terminates the walk.  On return *nexthdr
 * points at the nexthdr byte that should be patched to
 * NEXTHDR_FRAGMENT.
 */
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
	unsigned int packet_len = skb->tail - skb->nh.raw;
	int found_rhdr = 0;
	*nexthdr = &skb->nh.ipv6h->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
		case NEXTHDR_ROUTING:
		case NEXTHDR_DEST:
			if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
			if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
			offset += ipv6_optlen(exthdr);
			*nexthdr = &exthdr->nexthdr;
			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
			break;
		default :
			return offset;
		}
	}

	return offset;
}
482
483 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
484 {
485         struct net_device *dev;
486         struct sk_buff *frag;
487         struct rt6_info *rt = (struct rt6_info*)skb->dst;
488         struct ipv6hdr *tmp_hdr;
489         struct frag_hdr *fh;
490         unsigned int mtu, hlen, left, len;
491         u32 frag_id = 0;
492         int ptr, offset = 0, err=0;
493         u8 *prevhdr, nexthdr = 0;
494
495         dev = rt->u.dst.dev;
496         hlen = ip6_find_1stfragopt(skb, &prevhdr);
497         nexthdr = *prevhdr;
498
499         mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
500
501         if (skb_shinfo(skb)->frag_list) {
502                 int first_len = skb_pagelen(skb);
503
504                 if (first_len - hlen > mtu ||
505                     ((first_len - hlen) & 7) ||
506                     skb_cloned(skb))
507                         goto slow_path;
508
509                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
510                         /* Correct geometry. */
511                         if (frag->len > mtu ||
512                             ((frag->len & 7) && frag->next) ||
513                             skb_headroom(frag) < hlen)
514                             goto slow_path;
515
516                         /* Partially cloned skb? */
517                         if (skb_shared(frag))
518                                 goto slow_path;
519
520                         BUG_ON(frag->sk);
521                         if (skb->sk) {
522                                 sock_hold(skb->sk);
523                                 frag->sk = skb->sk;
524                                 frag->destructor = sock_wfree;
525                                 skb->truesize -= frag->truesize;
526                         }
527                 }
528
529                 err = 0;
530                 offset = 0;
531                 frag = skb_shinfo(skb)->frag_list;
532                 skb_shinfo(skb)->frag_list = NULL;
533                 /* BUILD HEADER */
534
535                 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
536                 if (!tmp_hdr) {
537                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
538                         return -ENOMEM;
539                 }
540
541                 *prevhdr = NEXTHDR_FRAGMENT;
542                 memcpy(tmp_hdr, skb->nh.raw, hlen);
543                 __skb_pull(skb, hlen);
544                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
545                 skb->nh.raw = __skb_push(skb, hlen);
546                 memcpy(skb->nh.raw, tmp_hdr, hlen);
547
548                 ipv6_select_ident(skb, fh);
549                 fh->nexthdr = nexthdr;
550                 fh->reserved = 0;
551                 fh->frag_off = htons(IP6_MF);
552                 frag_id = fh->identification;
553
554                 first_len = skb_pagelen(skb);
555                 skb->data_len = first_len - skb_headlen(skb);
556                 skb->len = first_len;
557                 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
558  
559
560                 for (;;) {
561                         /* Prepare header of the next frame,
562                          * before previous one went down. */
563                         if (frag) {
564                                 frag->ip_summed = CHECKSUM_NONE;
565                                 frag->h.raw = frag->data;
566                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
567                                 frag->nh.raw = __skb_push(frag, hlen);
568                                 memcpy(frag->nh.raw, tmp_hdr, hlen);
569                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
570                                 fh->nexthdr = nexthdr;
571                                 fh->reserved = 0;
572                                 fh->frag_off = htons(offset);
573                                 if (frag->next != NULL)
574                                         fh->frag_off |= htons(IP6_MF);
575                                 fh->identification = frag_id;
576                                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
577                                 ip6_copy_metadata(frag, skb);
578                         }
579                         
580                         err = output(skb);
581                         if (err || !frag)
582                                 break;
583
584                         skb = frag;
585                         frag = skb->next;
586                         skb->next = NULL;
587                 }
588
589                 if (tmp_hdr)
590                         kfree(tmp_hdr);
591
592                 if (err == 0) {
593                         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
594                         return 0;
595                 }
596
597                 while (frag) {
598                         skb = frag->next;
599                         kfree_skb(frag);
600                         frag = skb;
601                 }
602
603                 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
604                 return err;
605         }
606
607 slow_path:
608         left = skb->len - hlen;         /* Space per frame */
609         ptr = hlen;                     /* Where to start from */
610
611         /*
612          *      Fragment the datagram.
613          */
614
615         *prevhdr = NEXTHDR_FRAGMENT;
616
617         /*
618          *      Keep copying data until we run out.
619          */
620         while(left > 0) {
621                 len = left;
622                 /* IF: it doesn't fit, use 'mtu' - the data space left */
623                 if (len > mtu)
624                         len = mtu;
625                 /* IF: we are not sending upto and including the packet end
626                    then align the next start on an eight byte boundary */
627                 if (len < left) {
628                         len &= ~7;
629                 }
630                 /*
631                  *      Allocate buffer.
632                  */
633
634                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
635                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
636                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
637                         err = -ENOMEM;
638                         goto fail;
639                 }
640
641                 /*
642                  *      Set up data on packet
643                  */
644
645                 ip6_copy_metadata(frag, skb);
646                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
647                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
648                 frag->nh.raw = frag->data;
649                 fh = (struct frag_hdr*)(frag->data + hlen);
650                 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
651
652                 /*
653                  *      Charge the memory for the fragment to any owner
654                  *      it might possess
655                  */
656                 if (skb->sk)
657                         skb_set_owner_w(frag, skb->sk);
658
659                 /*
660                  *      Copy the packet header into the new buffer.
661                  */
662                 memcpy(frag->nh.raw, skb->data, hlen);
663
664                 /*
665                  *      Build fragment header.
666                  */
667                 fh->nexthdr = nexthdr;
668                 fh->reserved = 0;
669                 if (!frag_id) {
670                         ipv6_select_ident(skb, fh);
671                         frag_id = fh->identification;
672                 } else
673                         fh->identification = frag_id;
674
675                 /*
676                  *      Copy a block of the IP datagram.
677                  */
678                 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
679                         BUG();
680                 left -= len;
681
682                 fh->frag_off = htons(offset);
683                 if (left > 0)
684                         fh->frag_off |= htons(IP6_MF);
685                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
686
687                 ptr += len;
688                 offset += len;
689
690                 /*
691                  *      Put this fragment into the sending queue.
692                  */
693
694                 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
695
696                 err = output(frag);
697                 if (err)
698                         goto fail;
699         }
700         kfree_skb(skb);
701         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
702         return err;
703
704 fail:
705         kfree_skb(skb); 
706         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
707         return err;
708 }
709
/*
 * Resolve the dst entry for flow @fl on behalf of @sk.
 *
 * A route cached on the socket is reused when it is still valid for
 * this flow (same host destination or cached daddr, and same oif);
 * otherwise a fresh routing lookup is performed.  A missing source
 * address in the flow is filled in from the chosen route.  On success
 * *dst holds a reference the caller must release; on failure *dst is
 * NULL and a negative errno is returned.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		*dst = sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info*)*dst;

				/* Yes, checking route validity in not connected
				   case is not very simple. Take into account,
				   that we do not support routing by source, TOS,
				   and MSG_DONTROUTE		--ANK (980726)

				   1. If route was host route, check that
				      cached destination is current.
				      If it is network route, we still may
				      check its validity using saved pointer
				      to the last used address: daddr_cache.
				      We do not want to save whole address now,
				      (because main consumer of this service
				       is tcp, which has not this problem),
				      so that the last trick works only on connected
				      sockets.
				   2. oif also should be the same.
				 */

			if (((rt->rt6i_dst.plen != 128 ||
			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				/* Cached route does not fit this flow. */
				dst_release(*dst);
				*dst = NULL;
			}
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		/* No source address supplied: derive one from the route. */
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

		if (err)
			goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}
771
772 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
773         int offset, int len, int odd, struct sk_buff *skb),
774         void *from, int length, int transhdrlen,
775         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
776         struct rt6_info *rt, unsigned int flags)
777 {
778         struct inet_sock *inet = inet_sk(sk);
779         struct ipv6_pinfo *np = inet6_sk(sk);
780         struct sk_buff *skb;
781         unsigned int maxfraglen, fragheaderlen;
782         int exthdrlen;
783         int hh_len;
784         int mtu;
785         int copy;
786         int err;
787         int offset = 0;
788         int csummode = CHECKSUM_NONE;
789
790         if (flags&MSG_PROBE)
791                 return 0;
792         if (skb_queue_empty(&sk->sk_write_queue)) {
793                 /*
794                  * setup for corking
795                  */
796                 if (opt) {
797                         if (np->cork.opt == NULL) {
798                                 np->cork.opt = kmalloc(opt->tot_len,
799                                                        sk->sk_allocation);
800                                 if (unlikely(np->cork.opt == NULL))
801                                         return -ENOBUFS;
802                         } else if (np->cork.opt->tot_len < opt->tot_len) {
803                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
804                                 return -EINVAL;
805                         }
806                         memcpy(np->cork.opt, opt, opt->tot_len);
807                         inet->cork.flags |= IPCORK_OPT;
808                         /* need source address above miyazawa*/
809                 }
810                 dst_hold(&rt->u.dst);
811                 np->cork.rt = rt;
812                 inet->cork.fl = *fl;
813                 np->cork.hop_limit = hlimit;
814                 np->cork.tclass = tclass;
815                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
816                 if (dst_allfrag(rt->u.dst.path))
817                         inet->cork.flags |= IPCORK_ALLFRAG;
818                 inet->cork.length = 0;
819                 sk->sk_sndmsg_page = NULL;
820                 sk->sk_sndmsg_off = 0;
821                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
822                 length += exthdrlen;
823                 transhdrlen += exthdrlen;
824         } else {
825                 rt = np->cork.rt;
826                 fl = &inet->cork.fl;
827                 if (inet->cork.flags & IPCORK_OPT)
828                         opt = np->cork.opt;
829                 transhdrlen = 0;
830                 exthdrlen = 0;
831                 mtu = inet->cork.fragsize;
832         }
833
834         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
835
836         fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
837         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
838
839         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
840                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
841                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
842                         return -EMSGSIZE;
843                 }
844         }
845
846         /*
847          * Let's try using as much space as possible.
848          * Use MTU if total length of the message fits into the MTU.
849          * Otherwise, we need to reserve fragment header and
850          * fragment alignment (= 8-15 octects, in total).
851          *
852          * Note that we may need to "move" the data from the tail of
853          * of the buffer to the new fragment when we split 
854          * the message.
855          *
856          * FIXME: It may be fragmented into multiple chunks 
857          *        at once if non-fragmentable extension headers
858          *        are too large.
859          * --yoshfuji 
860          */
861
862         inet->cork.length += length;
863
864         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
865                 goto alloc_new_skb;
866
867         while (length > 0) {
868                 /* Check if the remaining data fits into current packet. */
869                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
870                 if (copy < length)
871                         copy = maxfraglen - skb->len;
872
873                 if (copy <= 0) {
874                         char *data;
875                         unsigned int datalen;
876                         unsigned int fraglen;
877                         unsigned int fraggap;
878                         unsigned int alloclen;
879                         struct sk_buff *skb_prev;
880 alloc_new_skb:
881                         skb_prev = skb;
882
883                         /* There's no room in the current skb */
884                         if (skb_prev)
885                                 fraggap = skb_prev->len - maxfraglen;
886                         else
887                                 fraggap = 0;
888
889                         /*
890                          * If remaining data exceeds the mtu,
891                          * we know we need more fragment(s).
892                          */
893                         datalen = length + fraggap;
894                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
895                                 datalen = maxfraglen - fragheaderlen;
896
897                         fraglen = datalen + fragheaderlen;
898                         if ((flags & MSG_MORE) &&
899                             !(rt->u.dst.dev->features&NETIF_F_SG))
900                                 alloclen = mtu;
901                         else
902                                 alloclen = datalen + fragheaderlen;
903
904                         /*
905                          * The last fragment gets additional space at tail.
906                          * Note: we overallocate on fragments with MSG_MODE
907                          * because we have no idea if we're the last one.
908                          */
909                         if (datalen == length + fraggap)
910                                 alloclen += rt->u.dst.trailer_len;
911
912                         /*
913                          * We just reserve space for fragment header.
914                          * Note: this may be overallocation if the message 
915                          * (without MSG_MORE) fits into the MTU.
916                          */
917                         alloclen += sizeof(struct frag_hdr);
918
919                         if (transhdrlen) {
920                                 skb = sock_alloc_send_skb(sk,
921                                                 alloclen + hh_len,
922                                                 (flags & MSG_DONTWAIT), &err);
923                         } else {
924                                 skb = NULL;
925                                 if (atomic_read(&sk->sk_wmem_alloc) <=
926                                     2 * sk->sk_sndbuf)
927                                         skb = sock_wmalloc(sk,
928                                                            alloclen + hh_len, 1,
929                                                            sk->sk_allocation);
930                                 if (unlikely(skb == NULL))
931                                         err = -ENOBUFS;
932                         }
933                         if (skb == NULL)
934                                 goto error;
935                         /*
936                          *      Fill in the control structures
937                          */
938                         skb->ip_summed = csummode;
939                         skb->csum = 0;
940                         /* reserve for fragmentation */
941                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
942
943                         /*
944                          *      Find where to start putting bytes
945                          */
946                         data = skb_put(skb, fraglen);
947                         skb->nh.raw = data + exthdrlen;
948                         data += fragheaderlen;
949                         skb->h.raw = data + exthdrlen;
950
951                         if (fraggap) {
952                                 skb->csum = skb_copy_and_csum_bits(
953                                         skb_prev, maxfraglen,
954                                         data + transhdrlen, fraggap, 0);
955                                 skb_prev->csum = csum_sub(skb_prev->csum,
956                                                           skb->csum);
957                                 data += fraggap;
958                                 skb_trim(skb_prev, maxfraglen);
959                         }
960                         copy = datalen - transhdrlen - fraggap;
961                         if (copy < 0) {
962                                 err = -EINVAL;
963                                 kfree_skb(skb);
964                                 goto error;
965                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
966                                 err = -EFAULT;
967                                 kfree_skb(skb);
968                                 goto error;
969                         }
970
971                         offset += copy;
972                         length -= datalen - fraggap;
973                         transhdrlen = 0;
974                         exthdrlen = 0;
975                         csummode = CHECKSUM_NONE;
976
977                         /*
978                          * Put the packet on the pending queue
979                          */
980                         __skb_queue_tail(&sk->sk_write_queue, skb);
981                         continue;
982                 }
983
984                 if (copy > length)
985                         copy = length;
986
987                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
988                         unsigned int off;
989
990                         off = skb->len;
991                         if (getfrag(from, skb_put(skb, copy),
992                                                 offset, copy, off, skb) < 0) {
993                                 __skb_trim(skb, off);
994                                 err = -EFAULT;
995                                 goto error;
996                         }
997                 } else {
998                         int i = skb_shinfo(skb)->nr_frags;
999                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1000                         struct page *page = sk->sk_sndmsg_page;
1001                         int off = sk->sk_sndmsg_off;
1002                         unsigned int left;
1003
1004                         if (page && (left = PAGE_SIZE - off) > 0) {
1005                                 if (copy >= left)
1006                                         copy = left;
1007                                 if (page != frag->page) {
1008                                         if (i == MAX_SKB_FRAGS) {
1009                                                 err = -EMSGSIZE;
1010                                                 goto error;
1011                                         }
1012                                         get_page(page);
1013                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1014                                         frag = &skb_shinfo(skb)->frags[i];
1015                                 }
1016                         } else if(i < MAX_SKB_FRAGS) {
1017                                 if (copy > PAGE_SIZE)
1018                                         copy = PAGE_SIZE;
1019                                 page = alloc_pages(sk->sk_allocation, 0);
1020                                 if (page == NULL) {
1021                                         err = -ENOMEM;
1022                                         goto error;
1023                                 }
1024                                 sk->sk_sndmsg_page = page;
1025                                 sk->sk_sndmsg_off = 0;
1026
1027                                 skb_fill_page_desc(skb, i, page, 0, 0);
1028                                 frag = &skb_shinfo(skb)->frags[i];
1029                                 skb->truesize += PAGE_SIZE;
1030                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1031                         } else {
1032                                 err = -EMSGSIZE;
1033                                 goto error;
1034                         }
1035                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1036                                 err = -EFAULT;
1037                                 goto error;
1038                         }
1039                         sk->sk_sndmsg_off += copy;
1040                         frag->size += copy;
1041                         skb->len += copy;
1042                         skb->data_len += copy;
1043                 }
1044                 offset += copy;
1045                 length -= copy;
1046         }
1047         return 0;
1048 error:
1049         inet->cork.length -= length;
1050         IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1051         return err;
1052 }
1053
1054 int ip6_push_pending_frames(struct sock *sk)
1055 {
1056         struct sk_buff *skb, *tmp_skb;
1057         struct sk_buff **tail_skb;
1058         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1059         struct inet_sock *inet = inet_sk(sk);
1060         struct ipv6_pinfo *np = inet6_sk(sk);
1061         struct ipv6hdr *hdr;
1062         struct ipv6_txoptions *opt = np->cork.opt;
1063         struct rt6_info *rt = np->cork.rt;
1064         struct flowi *fl = &inet->cork.fl;
1065         unsigned char proto = fl->proto;
1066         int err = 0;
1067
1068         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1069                 goto out;
1070         tail_skb = &(skb_shinfo(skb)->frag_list);
1071
1072         /* move skb->data to ip header from ext header */
1073         if (skb->data < skb->nh.raw)
1074                 __skb_pull(skb, skb->nh.raw - skb->data);
1075         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1076                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1077                 *tail_skb = tmp_skb;
1078                 tail_skb = &(tmp_skb->next);
1079                 skb->len += tmp_skb->len;
1080                 skb->data_len += tmp_skb->len;
1081                 skb->truesize += tmp_skb->truesize;
1082                 __sock_put(tmp_skb->sk);
1083                 tmp_skb->destructor = NULL;
1084                 tmp_skb->sk = NULL;
1085         }
1086
1087         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1088         __skb_pull(skb, skb->h.raw - skb->nh.raw);
1089         if (opt && opt->opt_flen)
1090                 ipv6_push_frag_opts(skb, opt, &proto);
1091         if (opt && opt->opt_nflen)
1092                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1093
1094         skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1095         
1096         *(u32*)hdr = fl->fl6_flowlabel |
1097                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1098
1099         if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1100                 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1101         else
1102                 hdr->payload_len = 0;
1103         hdr->hop_limit = np->cork.hop_limit;
1104         hdr->nexthdr = proto;
1105         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1106         ipv6_addr_copy(&hdr->daddr, final_dst);
1107
1108         skb->dst = dst_clone(&rt->u.dst);
1109         IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 
1110         err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1111         if (err) {
1112                 if (err > 0)
1113                         err = np->recverr ? net_xmit_errno(err) : 0;
1114                 if (err)
1115                         goto error;
1116         }
1117
1118 out:
1119         inet->cork.flags &= ~IPCORK_OPT;
1120         if (np->cork.opt) {
1121                 kfree(np->cork.opt);
1122                 np->cork.opt = NULL;
1123         }
1124         if (np->cork.rt) {
1125                 dst_release(&np->cork.rt->u.dst);
1126                 np->cork.rt = NULL;
1127                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1128         }
1129         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1130         return err;
1131 error:
1132         goto out;
1133 }
1134
1135 void ip6_flush_pending_frames(struct sock *sk)
1136 {
1137         struct inet_sock *inet = inet_sk(sk);
1138         struct ipv6_pinfo *np = inet6_sk(sk);
1139         struct sk_buff *skb;
1140
1141         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1142                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1143                 kfree_skb(skb);
1144         }
1145
1146         inet->cork.flags &= ~IPCORK_OPT;
1147
1148         if (np->cork.opt) {
1149                 kfree(np->cork.opt);
1150                 np->cork.opt = NULL;
1151         }
1152         if (np->cork.rt) {
1153                 dst_release(&np->cork.rt->u.dst);
1154                 np->cork.rt = NULL;
1155                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1156         }
1157         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1158 }