[IPV6]: ADDRCONF: Add accept_ra_pinfo sysctl.
[linux-2.6] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation 
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *      Based on linux/net/ipv4/ip_output.c
11  *
12  *      This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *      Changes:
18  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
19  *                              extension headers are implemented.
20  *                              route changes now work.
21  *                              ip6_forward does not confuse sniffers.
22  *                              etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *      Imran Patel     :       frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *                      :       add ip6_append_data and related functions
28  *                              for datagram xmit
29  */
30
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 {
63         static u32 ipv6_fragmentation_id = 1;
64         static DEFINE_SPINLOCK(ip6_id_lock);
65
66         spin_lock_bh(&ip6_id_lock);
67         fhdr->identification = htonl(ipv6_fragmentation_id);
68         if (++ipv6_fragmentation_id == 0)
69                 ipv6_fragmentation_id = 1;
70         spin_unlock_bh(&ip6_id_lock);
71 }
72
73 static inline int ip6_output_finish(struct sk_buff *skb)
74 {
75
76         struct dst_entry *dst = skb->dst;
77         struct hh_cache *hh = dst->hh;
78
79         if (hh) {
80                 int hh_alen;
81
82                 read_lock_bh(&hh->hh_lock);
83                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
84                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85                 read_unlock_bh(&hh->hh_lock);
86                 skb_push(skb, hh->hh_len);
87                 return hh->hh_output(skb);
88         } else if (dst->neighbour)
89                 return dst->neighbour->output(skb);
90
91         IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92         kfree_skb(skb);
93         return -EINVAL;
94
95 }
96
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100         newskb->mac.raw = newskb->data;
101         __skb_pull(newskb, newskb->nh.raw - newskb->data);
102         newskb->pkt_type = PACKET_LOOPBACK;
103         newskb->ip_summed = CHECKSUM_UNNECESSARY;
104         BUG_TRAP(newskb->dst);
105
106         netif_rx(newskb);
107         return 0;
108 }
109
110
111 static int ip6_output2(struct sk_buff *skb)
112 {
113         struct dst_entry *dst = skb->dst;
114         struct net_device *dev = dst->dev;
115
116         skb->protocol = htons(ETH_P_IPV6);
117         skb->dev = dev;
118
119         if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121
122                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123                     ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124                                 &skb->nh.ipv6h->saddr)) {
125                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126
127                         /* Do not check for IFF_ALLMULTI; multicast routing
128                            is not supported in any case.
129                          */
130                         if (newskb)
131                                 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132                                         newskb->dev,
133                                         ip6_dev_loopback_xmit);
134
135                         if (skb->nh.ipv6h->hop_limit == 0) {
136                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137                                 kfree_skb(skb);
138                                 return 0;
139                         }
140                 }
141
142                 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143         }
144
145         return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
146 }
147
148 int ip6_output(struct sk_buff *skb)
149 {
150         if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
151                                 dst_allfrag(skb->dst))
152                 return ip6_fragment(skb, ip6_output2);
153         else
154                 return ip6_output2(skb);
155 }
156
157 /*
158  *      xmit an sk_buff (used by TCP)
159  */
160
161 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
162              struct ipv6_txoptions *opt, int ipfragok)
163 {
164         struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
165         struct in6_addr *first_hop = &fl->fl6_dst;
166         struct dst_entry *dst = skb->dst;
167         struct ipv6hdr *hdr;
168         u8  proto = fl->proto;
169         int seg_len = skb->len;
170         int hlimit, tclass;
171         u32 mtu;
172
173         if (opt) {
174                 int head_room;
175
176                 /* First: exthdrs may take lots of space (~8K for now)
177                    MAX_HEADER is not enough.
178                  */
179                 head_room = opt->opt_nflen + opt->opt_flen;
180                 seg_len += head_room;
181                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
182
183                 if (skb_headroom(skb) < head_room) {
184                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
185                         kfree_skb(skb);
186                         skb = skb2;
187                         if (skb == NULL) {      
188                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
189                                 return -ENOBUFS;
190                         }
191                         if (sk)
192                                 skb_set_owner_w(skb, sk);
193                 }
194                 if (opt->opt_flen)
195                         ipv6_push_frag_opts(skb, opt, &proto);
196                 if (opt->opt_nflen)
197                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
198         }
199
200         hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
201
202         /*
203          *      Fill in the IPv6 header
204          */
205
206         hlimit = -1;
207         if (np)
208                 hlimit = np->hop_limit;
209         if (hlimit < 0)
210                 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
211         if (hlimit < 0)
212                 hlimit = ipv6_get_hoplimit(dst->dev);
213
214         tclass = -1;
215         if (np)
216                 tclass = np->tclass;
217         if (tclass < 0)
218                 tclass = 0;
219
220         *(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
221
222         hdr->payload_len = htons(seg_len);
223         hdr->nexthdr = proto;
224         hdr->hop_limit = hlimit;
225
226         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
227         ipv6_addr_copy(&hdr->daddr, first_hop);
228
229         skb->priority = sk->sk_priority;
230
231         mtu = dst_mtu(dst);
232         if ((skb->len <= mtu) || ipfragok) {
233                 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
234                 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
235                                 dst_output);
236         }
237
238         if (net_ratelimit())
239                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
240         skb->dev = dst->dev;
241         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
242         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
243         kfree_skb(skb);
244         return -EMSGSIZE;
245 }
246
247 /*
248  *      To avoid extra problems ND packets are send through this
249  *      routine. It's code duplication but I really want to avoid
250  *      extra checks since ipv6_build_header is used by TCP (which
251  *      is for us performance critical)
252  */
253
254 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
255                struct in6_addr *saddr, struct in6_addr *daddr,
256                int proto, int len)
257 {
258         struct ipv6_pinfo *np = inet6_sk(sk);
259         struct ipv6hdr *hdr;
260         int totlen;
261
262         skb->protocol = htons(ETH_P_IPV6);
263         skb->dev = dev;
264
265         totlen = len + sizeof(struct ipv6hdr);
266
267         hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
268         skb->nh.ipv6h = hdr;
269
270         *(u32*)hdr = htonl(0x60000000);
271
272         hdr->payload_len = htons(len);
273         hdr->nexthdr = proto;
274         hdr->hop_limit = np->hop_limit;
275
276         ipv6_addr_copy(&hdr->saddr, saddr);
277         ipv6_addr_copy(&hdr->daddr, daddr);
278
279         return 0;
280 }
281
282 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
283 {
284         struct ip6_ra_chain *ra;
285         struct sock *last = NULL;
286
287         read_lock(&ip6_ra_lock);
288         for (ra = ip6_ra_chain; ra; ra = ra->next) {
289                 struct sock *sk = ra->sk;
290                 if (sk && ra->sel == sel &&
291                     (!sk->sk_bound_dev_if ||
292                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
293                         if (last) {
294                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
295                                 if (skb2)
296                                         rawv6_rcv(last, skb2);
297                         }
298                         last = sk;
299                 }
300         }
301
302         if (last) {
303                 rawv6_rcv(last, skb);
304                 read_unlock(&ip6_ra_lock);
305                 return 1;
306         }
307         read_unlock(&ip6_ra_lock);
308         return 0;
309 }
310
311 static inline int ip6_forward_finish(struct sk_buff *skb)
312 {
313         return dst_output(skb);
314 }
315
316 int ip6_forward(struct sk_buff *skb)
317 {
318         struct dst_entry *dst = skb->dst;
319         struct ipv6hdr *hdr = skb->nh.ipv6h;
320         struct inet6_skb_parm *opt = IP6CB(skb);
321         
322         if (ipv6_devconf.forwarding == 0)
323                 goto error;
324
325         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
326                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
327                 goto drop;
328         }
329
330         skb->ip_summed = CHECKSUM_NONE;
331
332         /*
333          *      We DO NOT make any processing on
334          *      RA packets, pushing them to user level AS IS
335          *      without ane WARRANTY that application will be able
336          *      to interpret them. The reason is that we
337          *      cannot make anything clever here.
338          *
339          *      We are not end-node, so that if packet contains
340          *      AH/ESP, we cannot make anything.
341          *      Defragmentation also would be mistake, RA packets
342          *      cannot be fragmented, because there is no warranty
343          *      that different fragments will go along one path. --ANK
344          */
345         if (opt->ra) {
346                 u8 *ptr = skb->nh.raw + opt->ra;
347                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
348                         return 0;
349         }
350
351         /*
352          *      check and decrement ttl
353          */
354         if (hdr->hop_limit <= 1) {
355                 /* Force OUTPUT device used as source address */
356                 skb->dev = dst->dev;
357                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
358                             0, skb->dev);
359
360                 kfree_skb(skb);
361                 return -ETIMEDOUT;
362         }
363
364         if (!xfrm6_route_forward(skb)) {
365                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
366                 goto drop;
367         }
368         dst = skb->dst;
369
370         /* IPv6 specs say nothing about it, but it is clear that we cannot
371            send redirects to source routed frames.
372          */
373         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
374                 struct in6_addr *target = NULL;
375                 struct rt6_info *rt;
376                 struct neighbour *n = dst->neighbour;
377
378                 /*
379                  *      incoming and outgoing devices are the same
380                  *      send a redirect.
381                  */
382
383                 rt = (struct rt6_info *) dst;
384                 if ((rt->rt6i_flags & RTF_GATEWAY))
385                         target = (struct in6_addr*)&n->primary_key;
386                 else
387                         target = &hdr->daddr;
388
389                 /* Limit redirects both by destination (here)
390                    and by source (inside ndisc_send_redirect)
391                  */
392                 if (xrlim_allow(dst, 1*HZ))
393                         ndisc_send_redirect(skb, n, target);
394         } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
395                                                 |IPV6_ADDR_LINKLOCAL)) {
396                 /* This check is security critical. */
397                 goto error;
398         }
399
400         if (skb->len > dst_mtu(dst)) {
401                 /* Again, force OUTPUT device used as source address */
402                 skb->dev = dst->dev;
403                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
404                 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
405                 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
406                 kfree_skb(skb);
407                 return -EMSGSIZE;
408         }
409
410         if (skb_cow(skb, dst->dev->hard_header_len)) {
411                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
412                 goto drop;
413         }
414
415         hdr = skb->nh.ipv6h;
416
417         /* Mangling hops number delayed to point after skb COW */
418  
419         hdr->hop_limit--;
420
421         IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
422         return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
423
424 error:
425         IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
426 drop:
427         kfree_skb(skb);
428         return -EINVAL;
429 }
430
431 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
432 {
433         to->pkt_type = from->pkt_type;
434         to->priority = from->priority;
435         to->protocol = from->protocol;
436         dst_release(to->dst);
437         to->dst = dst_clone(from->dst);
438         to->dev = from->dev;
439
440 #ifdef CONFIG_NET_SCHED
441         to->tc_index = from->tc_index;
442 #endif
443 #ifdef CONFIG_NETFILTER
444         to->nfmark = from->nfmark;
445         /* Connection association is same as pre-frag packet */
446         nf_conntrack_put(to->nfct);
447         to->nfct = from->nfct;
448         nf_conntrack_get(to->nfct);
449         to->nfctinfo = from->nfctinfo;
450 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
451         nf_conntrack_put_reasm(to->nfct_reasm);
452         to->nfct_reasm = from->nfct_reasm;
453         nf_conntrack_get_reasm(to->nfct_reasm);
454 #endif
455 #ifdef CONFIG_BRIDGE_NETFILTER
456         nf_bridge_put(to->nf_bridge);
457         to->nf_bridge = from->nf_bridge;
458         nf_bridge_get(to->nf_bridge);
459 #endif
460 #endif
461 }
462
463 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
464 {
465         u16 offset = sizeof(struct ipv6hdr);
466         struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
467         unsigned int packet_len = skb->tail - skb->nh.raw;
468         int found_rhdr = 0;
469         *nexthdr = &skb->nh.ipv6h->nexthdr;
470
471         while (offset + 1 <= packet_len) {
472
473                 switch (**nexthdr) {
474
475                 case NEXTHDR_HOP:
476                 case NEXTHDR_ROUTING:
477                 case NEXTHDR_DEST:
478                         if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
479                         if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
480                         offset += ipv6_optlen(exthdr);
481                         *nexthdr = &exthdr->nexthdr;
482                         exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
483                         break;
484                 default :
485                         return offset;
486                 }
487         }
488
489         return offset;
490 }
491
492 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
493 {
494         struct net_device *dev;
495         struct sk_buff *frag;
496         struct rt6_info *rt = (struct rt6_info*)skb->dst;
497         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
498         struct ipv6hdr *tmp_hdr;
499         struct frag_hdr *fh;
500         unsigned int mtu, hlen, left, len;
501         u32 frag_id = 0;
502         int ptr, offset = 0, err=0;
503         u8 *prevhdr, nexthdr = 0;
504
505         dev = rt->u.dst.dev;
506         hlen = ip6_find_1stfragopt(skb, &prevhdr);
507         nexthdr = *prevhdr;
508
509         mtu = dst_mtu(&rt->u.dst);
510         if (np && np->frag_size < mtu) {
511                 if (np->frag_size)
512                         mtu = np->frag_size;
513         }
514         mtu -= hlen + sizeof(struct frag_hdr);
515
516         if (skb_shinfo(skb)->frag_list) {
517                 int first_len = skb_pagelen(skb);
518
519                 if (first_len - hlen > mtu ||
520                     ((first_len - hlen) & 7) ||
521                     skb_cloned(skb))
522                         goto slow_path;
523
524                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
525                         /* Correct geometry. */
526                         if (frag->len > mtu ||
527                             ((frag->len & 7) && frag->next) ||
528                             skb_headroom(frag) < hlen)
529                             goto slow_path;
530
531                         /* Partially cloned skb? */
532                         if (skb_shared(frag))
533                                 goto slow_path;
534
535                         BUG_ON(frag->sk);
536                         if (skb->sk) {
537                                 sock_hold(skb->sk);
538                                 frag->sk = skb->sk;
539                                 frag->destructor = sock_wfree;
540                                 skb->truesize -= frag->truesize;
541                         }
542                 }
543
544                 err = 0;
545                 offset = 0;
546                 frag = skb_shinfo(skb)->frag_list;
547                 skb_shinfo(skb)->frag_list = NULL;
548                 /* BUILD HEADER */
549
550                 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
551                 if (!tmp_hdr) {
552                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
553                         return -ENOMEM;
554                 }
555
556                 *prevhdr = NEXTHDR_FRAGMENT;
557                 memcpy(tmp_hdr, skb->nh.raw, hlen);
558                 __skb_pull(skb, hlen);
559                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
560                 skb->nh.raw = __skb_push(skb, hlen);
561                 memcpy(skb->nh.raw, tmp_hdr, hlen);
562
563                 ipv6_select_ident(skb, fh);
564                 fh->nexthdr = nexthdr;
565                 fh->reserved = 0;
566                 fh->frag_off = htons(IP6_MF);
567                 frag_id = fh->identification;
568
569                 first_len = skb_pagelen(skb);
570                 skb->data_len = first_len - skb_headlen(skb);
571                 skb->len = first_len;
572                 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
573  
574
575                 for (;;) {
576                         /* Prepare header of the next frame,
577                          * before previous one went down. */
578                         if (frag) {
579                                 frag->ip_summed = CHECKSUM_NONE;
580                                 frag->h.raw = frag->data;
581                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
582                                 frag->nh.raw = __skb_push(frag, hlen);
583                                 memcpy(frag->nh.raw, tmp_hdr, hlen);
584                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
585                                 fh->nexthdr = nexthdr;
586                                 fh->reserved = 0;
587                                 fh->frag_off = htons(offset);
588                                 if (frag->next != NULL)
589                                         fh->frag_off |= htons(IP6_MF);
590                                 fh->identification = frag_id;
591                                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
592                                 ip6_copy_metadata(frag, skb);
593                         }
594                         
595                         err = output(skb);
596                         if (err || !frag)
597                                 break;
598
599                         skb = frag;
600                         frag = skb->next;
601                         skb->next = NULL;
602                 }
603
604                 kfree(tmp_hdr);
605
606                 if (err == 0) {
607                         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
608                         return 0;
609                 }
610
611                 while (frag) {
612                         skb = frag->next;
613                         kfree_skb(frag);
614                         frag = skb;
615                 }
616
617                 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
618                 return err;
619         }
620
621 slow_path:
622         left = skb->len - hlen;         /* Space per frame */
623         ptr = hlen;                     /* Where to start from */
624
625         /*
626          *      Fragment the datagram.
627          */
628
629         *prevhdr = NEXTHDR_FRAGMENT;
630
631         /*
632          *      Keep copying data until we run out.
633          */
634         while(left > 0) {
635                 len = left;
636                 /* IF: it doesn't fit, use 'mtu' - the data space left */
637                 if (len > mtu)
638                         len = mtu;
639                 /* IF: we are not sending upto and including the packet end
640                    then align the next start on an eight byte boundary */
641                 if (len < left) {
642                         len &= ~7;
643                 }
644                 /*
645                  *      Allocate buffer.
646                  */
647
648                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
649                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
650                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
651                         err = -ENOMEM;
652                         goto fail;
653                 }
654
655                 /*
656                  *      Set up data on packet
657                  */
658
659                 ip6_copy_metadata(frag, skb);
660                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
661                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
662                 frag->nh.raw = frag->data;
663                 fh = (struct frag_hdr*)(frag->data + hlen);
664                 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
665
666                 /*
667                  *      Charge the memory for the fragment to any owner
668                  *      it might possess
669                  */
670                 if (skb->sk)
671                         skb_set_owner_w(frag, skb->sk);
672
673                 /*
674                  *      Copy the packet header into the new buffer.
675                  */
676                 memcpy(frag->nh.raw, skb->data, hlen);
677
678                 /*
679                  *      Build fragment header.
680                  */
681                 fh->nexthdr = nexthdr;
682                 fh->reserved = 0;
683                 if (!frag_id) {
684                         ipv6_select_ident(skb, fh);
685                         frag_id = fh->identification;
686                 } else
687                         fh->identification = frag_id;
688
689                 /*
690                  *      Copy a block of the IP datagram.
691                  */
692                 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
693                         BUG();
694                 left -= len;
695
696                 fh->frag_off = htons(offset);
697                 if (left > 0)
698                         fh->frag_off |= htons(IP6_MF);
699                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
700
701                 ptr += len;
702                 offset += len;
703
704                 /*
705                  *      Put this fragment into the sending queue.
706                  */
707
708                 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
709
710                 err = output(frag);
711                 if (err)
712                         goto fail;
713         }
714         kfree_skb(skb);
715         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
716         return err;
717
718 fail:
719         kfree_skb(skb); 
720         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
721         return err;
722 }
723
724 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
725 {
726         int err = 0;
727
728         *dst = NULL;
729         if (sk) {
730                 struct ipv6_pinfo *np = inet6_sk(sk);
731         
732                 *dst = sk_dst_check(sk, np->dst_cookie);
733                 if (*dst) {
734                         struct rt6_info *rt = (struct rt6_info*)*dst;
735         
736                                 /* Yes, checking route validity in not connected
737                                    case is not very simple. Take into account,
738                                    that we do not support routing by source, TOS,
739                                    and MSG_DONTROUTE            --ANK (980726)
740         
741                                    1. If route was host route, check that
742                                       cached destination is current.
743                                       If it is network route, we still may
744                                       check its validity using saved pointer
745                                       to the last used address: daddr_cache.
746                                       We do not want to save whole address now,
747                                       (because main consumer of this service
748                                        is tcp, which has not this problem),
749                                       so that the last trick works only on connected
750                                       sockets.
751                                    2. oif also should be the same.
752                                  */
753         
754                         if (((rt->rt6i_dst.plen != 128 ||
755                               !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
756                              && (np->daddr_cache == NULL ||
757                                  !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
758                             || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
759                                 dst_release(*dst);
760                                 *dst = NULL;
761                         }
762                 }
763         }
764
765         if (*dst == NULL)
766                 *dst = ip6_route_output(sk, fl);
767
768         if ((err = (*dst)->error))
769                 goto out_err_release;
770
771         if (ipv6_addr_any(&fl->fl6_src)) {
772                 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
773
774                 if (err)
775                         goto out_err_release;
776         }
777
778         return 0;
779
780 out_err_release:
781         dst_release(*dst);
782         *dst = NULL;
783         return err;
784 }
785
786 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
787
788 static inline int ip6_ufo_append_data(struct sock *sk,
789                         int getfrag(void *from, char *to, int offset, int len,
790                         int odd, struct sk_buff *skb),
791                         void *from, int length, int hh_len, int fragheaderlen,
792                         int transhdrlen, int mtu,unsigned int flags)
793
794 {
795         struct sk_buff *skb;
796         int err;
797
798         /* There is support for UDP large send offload by network
799          * device, so create one single skb packet containing complete
800          * udp datagram
801          */
802         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
803                 skb = sock_alloc_send_skb(sk,
804                         hh_len + fragheaderlen + transhdrlen + 20,
805                         (flags & MSG_DONTWAIT), &err);
806                 if (skb == NULL)
807                         return -ENOMEM;
808
809                 /* reserve space for Hardware header */
810                 skb_reserve(skb, hh_len);
811
812                 /* create space for UDP/IP header */
813                 skb_put(skb,fragheaderlen + transhdrlen);
814
815                 /* initialize network header pointer */
816                 skb->nh.raw = skb->data;
817
818                 /* initialize protocol header pointer */
819                 skb->h.raw = skb->data + fragheaderlen;
820
821                 skb->ip_summed = CHECKSUM_HW;
822                 skb->csum = 0;
823                 sk->sk_sndmsg_off = 0;
824         }
825
826         err = skb_append_datato_frags(sk,skb, getfrag, from,
827                                       (length - transhdrlen));
828         if (!err) {
829                 struct frag_hdr fhdr;
830
831                 /* specify the length of each IP datagram fragment*/
832                 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) - 
833                                                 sizeof(struct frag_hdr);
834                 ipv6_select_ident(skb, &fhdr);
835                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
836                 __skb_queue_tail(&sk->sk_write_queue, skb);
837
838                 return 0;
839         }
840         /* There is not enough support do UPD LSO,
841          * so follow normal path
842          */
843         kfree_skb(skb);
844
845         return err;
846 }
847
848 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
849         int offset, int len, int odd, struct sk_buff *skb),
850         void *from, int length, int transhdrlen,
851         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
852         struct rt6_info *rt, unsigned int flags)
853 {
854         struct inet_sock *inet = inet_sk(sk);
855         struct ipv6_pinfo *np = inet6_sk(sk);
856         struct sk_buff *skb;
857         unsigned int maxfraglen, fragheaderlen;
858         int exthdrlen;
859         int hh_len;
860         int mtu;
861         int copy;
862         int err;
863         int offset = 0;
864         int csummode = CHECKSUM_NONE;
865
866         if (flags&MSG_PROBE)
867                 return 0;
868         if (skb_queue_empty(&sk->sk_write_queue)) {
869                 /*
870                  * setup for corking
871                  */
872                 if (opt) {
873                         if (np->cork.opt == NULL) {
874                                 np->cork.opt = kmalloc(opt->tot_len,
875                                                        sk->sk_allocation);
876                                 if (unlikely(np->cork.opt == NULL))
877                                         return -ENOBUFS;
878                         } else if (np->cork.opt->tot_len < opt->tot_len) {
879                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
880                                 return -EINVAL;
881                         }
882                         memcpy(np->cork.opt, opt, opt->tot_len);
883                         inet->cork.flags |= IPCORK_OPT;
884                         /* need source address above miyazawa*/
885                 }
886                 dst_hold(&rt->u.dst);
887                 np->cork.rt = rt;
888                 inet->cork.fl = *fl;
889                 np->cork.hop_limit = hlimit;
890                 np->cork.tclass = tclass;
891                 mtu = dst_mtu(rt->u.dst.path);
892                 if (np && np->frag_size < mtu) {
893                         if (np->frag_size)
894                                 mtu = np->frag_size;
895                 }
896                 inet->cork.fragsize = mtu;
897                 if (dst_allfrag(rt->u.dst.path))
898                         inet->cork.flags |= IPCORK_ALLFRAG;
899                 inet->cork.length = 0;
900                 sk->sk_sndmsg_page = NULL;
901                 sk->sk_sndmsg_off = 0;
902                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
903                 length += exthdrlen;
904                 transhdrlen += exthdrlen;
905         } else {
906                 rt = np->cork.rt;
907                 fl = &inet->cork.fl;
908                 if (inet->cork.flags & IPCORK_OPT)
909                         opt = np->cork.opt;
910                 transhdrlen = 0;
911                 exthdrlen = 0;
912                 mtu = inet->cork.fragsize;
913         }
914
915         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
916
917         fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
918         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
919
920         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
921                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
922                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
923                         return -EMSGSIZE;
924                 }
925         }
926
927         /*
928          * Let's try using as much space as possible.
929          * Use MTU if total length of the message fits into the MTU.
930          * Otherwise, we need to reserve fragment header and
931          * fragment alignment (= 8-15 octects, in total).
932          *
933          * Note that we may need to "move" the data from the tail of
934          * of the buffer to the new fragment when we split 
935          * the message.
936          *
937          * FIXME: It may be fragmented into multiple chunks 
938          *        at once if non-fragmentable extension headers
939          *        are too large.
940          * --yoshfuji 
941          */
942
943         inet->cork.length += length;
944         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
945             (rt->u.dst.dev->features & NETIF_F_UFO)) {
946
947                 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
948                                           fragheaderlen, transhdrlen, mtu,
949                                           flags);
950                 if (err)
951                         goto error;
952                 return 0;
953         }
954
955         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
956                 goto alloc_new_skb;
957
958         while (length > 0) {
959                 /* Check if the remaining data fits into current packet. */
960                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
961                 if (copy < length)
962                         copy = maxfraglen - skb->len;
963
964                 if (copy <= 0) {
965                         char *data;
966                         unsigned int datalen;
967                         unsigned int fraglen;
968                         unsigned int fraggap;
969                         unsigned int alloclen;
970                         struct sk_buff *skb_prev;
971 alloc_new_skb:
972                         skb_prev = skb;
973
974                         /* There's no room in the current skb */
975                         if (skb_prev)
976                                 fraggap = skb_prev->len - maxfraglen;
977                         else
978                                 fraggap = 0;
979
980                         /*
981                          * If remaining data exceeds the mtu,
982                          * we know we need more fragment(s).
983                          */
984                         datalen = length + fraggap;
985                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
986                                 datalen = maxfraglen - fragheaderlen;
987
988                         fraglen = datalen + fragheaderlen;
989                         if ((flags & MSG_MORE) &&
990                             !(rt->u.dst.dev->features&NETIF_F_SG))
991                                 alloclen = mtu;
992                         else
993                                 alloclen = datalen + fragheaderlen;
994
995                         /*
996                          * The last fragment gets additional space at tail.
997                          * Note: we overallocate on fragments with MSG_MODE
998                          * because we have no idea if we're the last one.
999                          */
1000                         if (datalen == length + fraggap)
1001                                 alloclen += rt->u.dst.trailer_len;
1002
1003                         /*
1004                          * We just reserve space for fragment header.
1005                          * Note: this may be overallocation if the message 
1006                          * (without MSG_MORE) fits into the MTU.
1007                          */
1008                         alloclen += sizeof(struct frag_hdr);
1009
1010                         if (transhdrlen) {
1011                                 skb = sock_alloc_send_skb(sk,
1012                                                 alloclen + hh_len,
1013                                                 (flags & MSG_DONTWAIT), &err);
1014                         } else {
1015                                 skb = NULL;
1016                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1017                                     2 * sk->sk_sndbuf)
1018                                         skb = sock_wmalloc(sk,
1019                                                            alloclen + hh_len, 1,
1020                                                            sk->sk_allocation);
1021                                 if (unlikely(skb == NULL))
1022                                         err = -ENOBUFS;
1023                         }
1024                         if (skb == NULL)
1025                                 goto error;
1026                         /*
1027                          *      Fill in the control structures
1028                          */
1029                         skb->ip_summed = csummode;
1030                         skb->csum = 0;
1031                         /* reserve for fragmentation */
1032                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1033
1034                         /*
1035                          *      Find where to start putting bytes
1036                          */
1037                         data = skb_put(skb, fraglen);
1038                         skb->nh.raw = data + exthdrlen;
1039                         data += fragheaderlen;
1040                         skb->h.raw = data + exthdrlen;
1041
1042                         if (fraggap) {
1043                                 skb->csum = skb_copy_and_csum_bits(
1044                                         skb_prev, maxfraglen,
1045                                         data + transhdrlen, fraggap, 0);
1046                                 skb_prev->csum = csum_sub(skb_prev->csum,
1047                                                           skb->csum);
1048                                 data += fraggap;
1049                                 skb_trim(skb_prev, maxfraglen);
1050                         }
1051                         copy = datalen - transhdrlen - fraggap;
1052                         if (copy < 0) {
1053                                 err = -EINVAL;
1054                                 kfree_skb(skb);
1055                                 goto error;
1056                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1057                                 err = -EFAULT;
1058                                 kfree_skb(skb);
1059                                 goto error;
1060                         }
1061
1062                         offset += copy;
1063                         length -= datalen - fraggap;
1064                         transhdrlen = 0;
1065                         exthdrlen = 0;
1066                         csummode = CHECKSUM_NONE;
1067
1068                         /*
1069                          * Put the packet on the pending queue
1070                          */
1071                         __skb_queue_tail(&sk->sk_write_queue, skb);
1072                         continue;
1073                 }
1074
1075                 if (copy > length)
1076                         copy = length;
1077
1078                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1079                         unsigned int off;
1080
1081                         off = skb->len;
1082                         if (getfrag(from, skb_put(skb, copy),
1083                                                 offset, copy, off, skb) < 0) {
1084                                 __skb_trim(skb, off);
1085                                 err = -EFAULT;
1086                                 goto error;
1087                         }
1088                 } else {
1089                         int i = skb_shinfo(skb)->nr_frags;
1090                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1091                         struct page *page = sk->sk_sndmsg_page;
1092                         int off = sk->sk_sndmsg_off;
1093                         unsigned int left;
1094
1095                         if (page && (left = PAGE_SIZE - off) > 0) {
1096                                 if (copy >= left)
1097                                         copy = left;
1098                                 if (page != frag->page) {
1099                                         if (i == MAX_SKB_FRAGS) {
1100                                                 err = -EMSGSIZE;
1101                                                 goto error;
1102                                         }
1103                                         get_page(page);
1104                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1105                                         frag = &skb_shinfo(skb)->frags[i];
1106                                 }
1107                         } else if(i < MAX_SKB_FRAGS) {
1108                                 if (copy > PAGE_SIZE)
1109                                         copy = PAGE_SIZE;
1110                                 page = alloc_pages(sk->sk_allocation, 0);
1111                                 if (page == NULL) {
1112                                         err = -ENOMEM;
1113                                         goto error;
1114                                 }
1115                                 sk->sk_sndmsg_page = page;
1116                                 sk->sk_sndmsg_off = 0;
1117
1118                                 skb_fill_page_desc(skb, i, page, 0, 0);
1119                                 frag = &skb_shinfo(skb)->frags[i];
1120                                 skb->truesize += PAGE_SIZE;
1121                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1122                         } else {
1123                                 err = -EMSGSIZE;
1124                                 goto error;
1125                         }
1126                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1127                                 err = -EFAULT;
1128                                 goto error;
1129                         }
1130                         sk->sk_sndmsg_off += copy;
1131                         frag->size += copy;
1132                         skb->len += copy;
1133                         skb->data_len += copy;
1134                 }
1135                 offset += copy;
1136                 length -= copy;
1137         }
1138         return 0;
1139 error:
1140         inet->cork.length -= length;
1141         IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1142         return err;
1143 }
1144
1145 int ip6_push_pending_frames(struct sock *sk)
1146 {
1147         struct sk_buff *skb, *tmp_skb;
1148         struct sk_buff **tail_skb;
1149         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1150         struct inet_sock *inet = inet_sk(sk);
1151         struct ipv6_pinfo *np = inet6_sk(sk);
1152         struct ipv6hdr *hdr;
1153         struct ipv6_txoptions *opt = np->cork.opt;
1154         struct rt6_info *rt = np->cork.rt;
1155         struct flowi *fl = &inet->cork.fl;
1156         unsigned char proto = fl->proto;
1157         int err = 0;
1158
1159         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1160                 goto out;
1161         tail_skb = &(skb_shinfo(skb)->frag_list);
1162
1163         /* move skb->data to ip header from ext header */
1164         if (skb->data < skb->nh.raw)
1165                 __skb_pull(skb, skb->nh.raw - skb->data);
1166         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1167                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1168                 *tail_skb = tmp_skb;
1169                 tail_skb = &(tmp_skb->next);
1170                 skb->len += tmp_skb->len;
1171                 skb->data_len += tmp_skb->len;
1172                 skb->truesize += tmp_skb->truesize;
1173                 __sock_put(tmp_skb->sk);
1174                 tmp_skb->destructor = NULL;
1175                 tmp_skb->sk = NULL;
1176         }
1177
1178         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1179         __skb_pull(skb, skb->h.raw - skb->nh.raw);
1180         if (opt && opt->opt_flen)
1181                 ipv6_push_frag_opts(skb, opt, &proto);
1182         if (opt && opt->opt_nflen)
1183                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1184
1185         skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1186         
1187         *(u32*)hdr = fl->fl6_flowlabel |
1188                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1189
1190         if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1191                 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1192         else
1193                 hdr->payload_len = 0;
1194         hdr->hop_limit = np->cork.hop_limit;
1195         hdr->nexthdr = proto;
1196         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1197         ipv6_addr_copy(&hdr->daddr, final_dst);
1198
1199         skb->priority = sk->sk_priority;
1200
1201         skb->dst = dst_clone(&rt->u.dst);
1202         IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 
1203         err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1204         if (err) {
1205                 if (err > 0)
1206                         err = np->recverr ? net_xmit_errno(err) : 0;
1207                 if (err)
1208                         goto error;
1209         }
1210
1211 out:
1212         inet->cork.flags &= ~IPCORK_OPT;
1213         kfree(np->cork.opt);
1214         np->cork.opt = NULL;
1215         if (np->cork.rt) {
1216                 dst_release(&np->cork.rt->u.dst);
1217                 np->cork.rt = NULL;
1218                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1219         }
1220         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1221         return err;
1222 error:
1223         goto out;
1224 }
1225
1226 void ip6_flush_pending_frames(struct sock *sk)
1227 {
1228         struct inet_sock *inet = inet_sk(sk);
1229         struct ipv6_pinfo *np = inet6_sk(sk);
1230         struct sk_buff *skb;
1231
1232         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1233                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1234                 kfree_skb(skb);
1235         }
1236
1237         inet->cork.flags &= ~IPCORK_OPT;
1238
1239         kfree(np->cork.opt);
1240         np->cork.opt = NULL;
1241         if (np->cork.rt) {
1242                 dst_release(&np->cork.rt->u.dst);
1243                 np->cork.rt = NULL;
1244                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1245         }
1246         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1247 }