Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/sparc-2.6
[linux-2.6] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation 
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *      Based on linux/net/ipv4/ip_output.c
11  *
12  *      This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *      Changes:
18  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
19  *                              extension headers are implemented.
20  *                              route changes now work.
21  *                              ip6_forward does not confuse sniffers.
22  *                              etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *      Imran Patel     :       frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *                      :       add ip6_append_data and related functions
28  *                              for datagram xmit
29  */
30
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 {
63         static u32 ipv6_fragmentation_id = 1;
64         static DEFINE_SPINLOCK(ip6_id_lock);
65
66         spin_lock_bh(&ip6_id_lock);
67         fhdr->identification = htonl(ipv6_fragmentation_id);
68         if (++ipv6_fragmentation_id == 0)
69                 ipv6_fragmentation_id = 1;
70         spin_unlock_bh(&ip6_id_lock);
71 }
72
73 static inline int ip6_output_finish(struct sk_buff *skb)
74 {
75
76         struct dst_entry *dst = skb->dst;
77         struct hh_cache *hh = dst->hh;
78
79         if (hh) {
80                 int hh_alen;
81
82                 read_lock_bh(&hh->hh_lock);
83                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
84                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85                 read_unlock_bh(&hh->hh_lock);
86                 skb_push(skb, hh->hh_len);
87                 return hh->hh_output(skb);
88         } else if (dst->neighbour)
89                 return dst->neighbour->output(skb);
90
91         IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92         kfree_skb(skb);
93         return -EINVAL;
94
95 }
96
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100         newskb->mac.raw = newskb->data;
101         __skb_pull(newskb, newskb->nh.raw - newskb->data);
102         newskb->pkt_type = PACKET_LOOPBACK;
103         newskb->ip_summed = CHECKSUM_UNNECESSARY;
104         BUG_TRAP(newskb->dst);
105
106         netif_rx(newskb);
107         return 0;
108 }
109
110
111 static int ip6_output2(struct sk_buff *skb)
112 {
113         struct dst_entry *dst = skb->dst;
114         struct net_device *dev = dst->dev;
115
116         skb->protocol = htons(ETH_P_IPV6);
117         skb->dev = dev;
118
119         if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121
122                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123                     ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124                                 &skb->nh.ipv6h->saddr)) {
125                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126
127                         /* Do not check for IFF_ALLMULTI; multicast routing
128                            is not supported in any case.
129                          */
130                         if (newskb)
131                                 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132                                         newskb->dev,
133                                         ip6_dev_loopback_xmit);
134
135                         if (skb->nh.ipv6h->hop_limit == 0) {
136                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137                                 kfree_skb(skb);
138                                 return 0;
139                         }
140                 }
141
142                 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143         }
144
145         return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
146 }
147
148 int ip6_output(struct sk_buff *skb)
149 {
150         if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
151                                 dst_allfrag(skb->dst))
152                 return ip6_fragment(skb, ip6_output2);
153         else
154                 return ip6_output2(skb);
155 }
156
157 /*
158  *      xmit an sk_buff (used by TCP)
159  */
160
161 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
162              struct ipv6_txoptions *opt, int ipfragok)
163 {
164         struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
165         struct in6_addr *first_hop = &fl->fl6_dst;
166         struct dst_entry *dst = skb->dst;
167         struct ipv6hdr *hdr;
168         u8  proto = fl->proto;
169         int seg_len = skb->len;
170         int hlimit, tclass;
171         u32 mtu;
172
173         if (opt) {
174                 int head_room;
175
176                 /* First: exthdrs may take lots of space (~8K for now)
177                    MAX_HEADER is not enough.
178                  */
179                 head_room = opt->opt_nflen + opt->opt_flen;
180                 seg_len += head_room;
181                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
182
183                 if (skb_headroom(skb) < head_room) {
184                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
185                         kfree_skb(skb);
186                         skb = skb2;
187                         if (skb == NULL) {      
188                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
189                                 return -ENOBUFS;
190                         }
191                         if (sk)
192                                 skb_set_owner_w(skb, sk);
193                 }
194                 if (opt->opt_flen)
195                         ipv6_push_frag_opts(skb, opt, &proto);
196                 if (opt->opt_nflen)
197                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
198         }
199
200         hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
201
202         /*
203          *      Fill in the IPv6 header
204          */
205
206         hlimit = -1;
207         if (np)
208                 hlimit = np->hop_limit;
209         if (hlimit < 0)
210                 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
211         if (hlimit < 0)
212                 hlimit = ipv6_get_hoplimit(dst->dev);
213
214         tclass = -1;
215         if (np)
216                 tclass = np->tclass;
217         if (tclass < 0)
218                 tclass = 0;
219
220         *(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
221
222         hdr->payload_len = htons(seg_len);
223         hdr->nexthdr = proto;
224         hdr->hop_limit = hlimit;
225
226         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
227         ipv6_addr_copy(&hdr->daddr, first_hop);
228
229         mtu = dst_mtu(dst);
230         if ((skb->len <= mtu) || ipfragok) {
231                 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
232                 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
233                                 dst_output);
234         }
235
236         if (net_ratelimit())
237                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
238         skb->dev = dst->dev;
239         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
240         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
241         kfree_skb(skb);
242         return -EMSGSIZE;
243 }
244
245 /*
246  *      To avoid extra problems ND packets are send through this
247  *      routine. It's code duplication but I really want to avoid
248  *      extra checks since ipv6_build_header is used by TCP (which
249  *      is for us performance critical)
250  */
251
252 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
253                struct in6_addr *saddr, struct in6_addr *daddr,
254                int proto, int len)
255 {
256         struct ipv6_pinfo *np = inet6_sk(sk);
257         struct ipv6hdr *hdr;
258         int totlen;
259
260         skb->protocol = htons(ETH_P_IPV6);
261         skb->dev = dev;
262
263         totlen = len + sizeof(struct ipv6hdr);
264
265         hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
266         skb->nh.ipv6h = hdr;
267
268         *(u32*)hdr = htonl(0x60000000);
269
270         hdr->payload_len = htons(len);
271         hdr->nexthdr = proto;
272         hdr->hop_limit = np->hop_limit;
273
274         ipv6_addr_copy(&hdr->saddr, saddr);
275         ipv6_addr_copy(&hdr->daddr, daddr);
276
277         return 0;
278 }
279
280 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
281 {
282         struct ip6_ra_chain *ra;
283         struct sock *last = NULL;
284
285         read_lock(&ip6_ra_lock);
286         for (ra = ip6_ra_chain; ra; ra = ra->next) {
287                 struct sock *sk = ra->sk;
288                 if (sk && ra->sel == sel &&
289                     (!sk->sk_bound_dev_if ||
290                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
291                         if (last) {
292                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
293                                 if (skb2)
294                                         rawv6_rcv(last, skb2);
295                         }
296                         last = sk;
297                 }
298         }
299
300         if (last) {
301                 rawv6_rcv(last, skb);
302                 read_unlock(&ip6_ra_lock);
303                 return 1;
304         }
305         read_unlock(&ip6_ra_lock);
306         return 0;
307 }
308
309 static inline int ip6_forward_finish(struct sk_buff *skb)
310 {
311         return dst_output(skb);
312 }
313
314 int ip6_forward(struct sk_buff *skb)
315 {
316         struct dst_entry *dst = skb->dst;
317         struct ipv6hdr *hdr = skb->nh.ipv6h;
318         struct inet6_skb_parm *opt = IP6CB(skb);
319         
320         if (ipv6_devconf.forwarding == 0)
321                 goto error;
322
323         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
324                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
325                 goto drop;
326         }
327
328         skb->ip_summed = CHECKSUM_NONE;
329
330         /*
331          *      We DO NOT make any processing on
332          *      RA packets, pushing them to user level AS IS
333          *      without ane WARRANTY that application will be able
334          *      to interpret them. The reason is that we
335          *      cannot make anything clever here.
336          *
337          *      We are not end-node, so that if packet contains
338          *      AH/ESP, we cannot make anything.
339          *      Defragmentation also would be mistake, RA packets
340          *      cannot be fragmented, because there is no warranty
341          *      that different fragments will go along one path. --ANK
342          */
343         if (opt->ra) {
344                 u8 *ptr = skb->nh.raw + opt->ra;
345                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
346                         return 0;
347         }
348
349         /*
350          *      check and decrement ttl
351          */
352         if (hdr->hop_limit <= 1) {
353                 /* Force OUTPUT device used as source address */
354                 skb->dev = dst->dev;
355                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
356                             0, skb->dev);
357
358                 kfree_skb(skb);
359                 return -ETIMEDOUT;
360         }
361
362         if (!xfrm6_route_forward(skb)) {
363                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
364                 goto drop;
365         }
366         dst = skb->dst;
367
368         /* IPv6 specs say nothing about it, but it is clear that we cannot
369            send redirects to source routed frames.
370          */
371         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
372                 struct in6_addr *target = NULL;
373                 struct rt6_info *rt;
374                 struct neighbour *n = dst->neighbour;
375
376                 /*
377                  *      incoming and outgoing devices are the same
378                  *      send a redirect.
379                  */
380
381                 rt = (struct rt6_info *) dst;
382                 if ((rt->rt6i_flags & RTF_GATEWAY))
383                         target = (struct in6_addr*)&n->primary_key;
384                 else
385                         target = &hdr->daddr;
386
387                 /* Limit redirects both by destination (here)
388                    and by source (inside ndisc_send_redirect)
389                  */
390                 if (xrlim_allow(dst, 1*HZ))
391                         ndisc_send_redirect(skb, n, target);
392         } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
393                                                 |IPV6_ADDR_LINKLOCAL)) {
394                 /* This check is security critical. */
395                 goto error;
396         }
397
398         if (skb->len > dst_mtu(dst)) {
399                 /* Again, force OUTPUT device used as source address */
400                 skb->dev = dst->dev;
401                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
402                 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
403                 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
404                 kfree_skb(skb);
405                 return -EMSGSIZE;
406         }
407
408         if (skb_cow(skb, dst->dev->hard_header_len)) {
409                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
410                 goto drop;
411         }
412
413         hdr = skb->nh.ipv6h;
414
415         /* Mangling hops number delayed to point after skb COW */
416  
417         hdr->hop_limit--;
418
419         IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
420         return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
421
422 error:
423         IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
424 drop:
425         kfree_skb(skb);
426         return -EINVAL;
427 }
428
429 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
430 {
431         to->pkt_type = from->pkt_type;
432         to->priority = from->priority;
433         to->protocol = from->protocol;
434         dst_release(to->dst);
435         to->dst = dst_clone(from->dst);
436         to->dev = from->dev;
437
438 #ifdef CONFIG_NET_SCHED
439         to->tc_index = from->tc_index;
440 #endif
441 #ifdef CONFIG_NETFILTER
442         to->nfmark = from->nfmark;
443         /* Connection association is same as pre-frag packet */
444         nf_conntrack_put(to->nfct);
445         to->nfct = from->nfct;
446         nf_conntrack_get(to->nfct);
447         to->nfctinfo = from->nfctinfo;
448 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
449         nf_conntrack_put_reasm(to->nfct_reasm);
450         to->nfct_reasm = from->nfct_reasm;
451         nf_conntrack_get_reasm(to->nfct_reasm);
452 #endif
453 #ifdef CONFIG_BRIDGE_NETFILTER
454         nf_bridge_put(to->nf_bridge);
455         to->nf_bridge = from->nf_bridge;
456         nf_bridge_get(to->nf_bridge);
457 #endif
458 #endif
459 }
460
461 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
462 {
463         u16 offset = sizeof(struct ipv6hdr);
464         struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
465         unsigned int packet_len = skb->tail - skb->nh.raw;
466         int found_rhdr = 0;
467         *nexthdr = &skb->nh.ipv6h->nexthdr;
468
469         while (offset + 1 <= packet_len) {
470
471                 switch (**nexthdr) {
472
473                 case NEXTHDR_HOP:
474                 case NEXTHDR_ROUTING:
475                 case NEXTHDR_DEST:
476                         if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
477                         if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
478                         offset += ipv6_optlen(exthdr);
479                         *nexthdr = &exthdr->nexthdr;
480                         exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
481                         break;
482                 default :
483                         return offset;
484                 }
485         }
486
487         return offset;
488 }
489
490 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
491 {
492         struct net_device *dev;
493         struct sk_buff *frag;
494         struct rt6_info *rt = (struct rt6_info*)skb->dst;
495         struct ipv6hdr *tmp_hdr;
496         struct frag_hdr *fh;
497         unsigned int mtu, hlen, left, len;
498         u32 frag_id = 0;
499         int ptr, offset = 0, err=0;
500         u8 *prevhdr, nexthdr = 0;
501
502         dev = rt->u.dst.dev;
503         hlen = ip6_find_1stfragopt(skb, &prevhdr);
504         nexthdr = *prevhdr;
505
506         mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
507
508         if (skb_shinfo(skb)->frag_list) {
509                 int first_len = skb_pagelen(skb);
510
511                 if (first_len - hlen > mtu ||
512                     ((first_len - hlen) & 7) ||
513                     skb_cloned(skb))
514                         goto slow_path;
515
516                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
517                         /* Correct geometry. */
518                         if (frag->len > mtu ||
519                             ((frag->len & 7) && frag->next) ||
520                             skb_headroom(frag) < hlen)
521                             goto slow_path;
522
523                         /* Partially cloned skb? */
524                         if (skb_shared(frag))
525                                 goto slow_path;
526
527                         BUG_ON(frag->sk);
528                         if (skb->sk) {
529                                 sock_hold(skb->sk);
530                                 frag->sk = skb->sk;
531                                 frag->destructor = sock_wfree;
532                                 skb->truesize -= frag->truesize;
533                         }
534                 }
535
536                 err = 0;
537                 offset = 0;
538                 frag = skb_shinfo(skb)->frag_list;
539                 skb_shinfo(skb)->frag_list = NULL;
540                 /* BUILD HEADER */
541
542                 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
543                 if (!tmp_hdr) {
544                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
545                         return -ENOMEM;
546                 }
547
548                 *prevhdr = NEXTHDR_FRAGMENT;
549                 memcpy(tmp_hdr, skb->nh.raw, hlen);
550                 __skb_pull(skb, hlen);
551                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
552                 skb->nh.raw = __skb_push(skb, hlen);
553                 memcpy(skb->nh.raw, tmp_hdr, hlen);
554
555                 ipv6_select_ident(skb, fh);
556                 fh->nexthdr = nexthdr;
557                 fh->reserved = 0;
558                 fh->frag_off = htons(IP6_MF);
559                 frag_id = fh->identification;
560
561                 first_len = skb_pagelen(skb);
562                 skb->data_len = first_len - skb_headlen(skb);
563                 skb->len = first_len;
564                 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
565  
566
567                 for (;;) {
568                         /* Prepare header of the next frame,
569                          * before previous one went down. */
570                         if (frag) {
571                                 frag->ip_summed = CHECKSUM_NONE;
572                                 frag->h.raw = frag->data;
573                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
574                                 frag->nh.raw = __skb_push(frag, hlen);
575                                 memcpy(frag->nh.raw, tmp_hdr, hlen);
576                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
577                                 fh->nexthdr = nexthdr;
578                                 fh->reserved = 0;
579                                 fh->frag_off = htons(offset);
580                                 if (frag->next != NULL)
581                                         fh->frag_off |= htons(IP6_MF);
582                                 fh->identification = frag_id;
583                                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
584                                 ip6_copy_metadata(frag, skb);
585                         }
586                         
587                         err = output(skb);
588                         if (err || !frag)
589                                 break;
590
591                         skb = frag;
592                         frag = skb->next;
593                         skb->next = NULL;
594                 }
595
596                 kfree(tmp_hdr);
597
598                 if (err == 0) {
599                         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
600                         return 0;
601                 }
602
603                 while (frag) {
604                         skb = frag->next;
605                         kfree_skb(frag);
606                         frag = skb;
607                 }
608
609                 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
610                 return err;
611         }
612
613 slow_path:
614         left = skb->len - hlen;         /* Space per frame */
615         ptr = hlen;                     /* Where to start from */
616
617         /*
618          *      Fragment the datagram.
619          */
620
621         *prevhdr = NEXTHDR_FRAGMENT;
622
623         /*
624          *      Keep copying data until we run out.
625          */
626         while(left > 0) {
627                 len = left;
628                 /* IF: it doesn't fit, use 'mtu' - the data space left */
629                 if (len > mtu)
630                         len = mtu;
631                 /* IF: we are not sending upto and including the packet end
632                    then align the next start on an eight byte boundary */
633                 if (len < left) {
634                         len &= ~7;
635                 }
636                 /*
637                  *      Allocate buffer.
638                  */
639
640                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
641                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
642                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
643                         err = -ENOMEM;
644                         goto fail;
645                 }
646
647                 /*
648                  *      Set up data on packet
649                  */
650
651                 ip6_copy_metadata(frag, skb);
652                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
653                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
654                 frag->nh.raw = frag->data;
655                 fh = (struct frag_hdr*)(frag->data + hlen);
656                 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
657
658                 /*
659                  *      Charge the memory for the fragment to any owner
660                  *      it might possess
661                  */
662                 if (skb->sk)
663                         skb_set_owner_w(frag, skb->sk);
664
665                 /*
666                  *      Copy the packet header into the new buffer.
667                  */
668                 memcpy(frag->nh.raw, skb->data, hlen);
669
670                 /*
671                  *      Build fragment header.
672                  */
673                 fh->nexthdr = nexthdr;
674                 fh->reserved = 0;
675                 if (!frag_id) {
676                         ipv6_select_ident(skb, fh);
677                         frag_id = fh->identification;
678                 } else
679                         fh->identification = frag_id;
680
681                 /*
682                  *      Copy a block of the IP datagram.
683                  */
684                 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
685                         BUG();
686                 left -= len;
687
688                 fh->frag_off = htons(offset);
689                 if (left > 0)
690                         fh->frag_off |= htons(IP6_MF);
691                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
692
693                 ptr += len;
694                 offset += len;
695
696                 /*
697                  *      Put this fragment into the sending queue.
698                  */
699
700                 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
701
702                 err = output(frag);
703                 if (err)
704                         goto fail;
705         }
706         kfree_skb(skb);
707         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
708         return err;
709
710 fail:
711         kfree_skb(skb); 
712         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
713         return err;
714 }
715
716 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
717 {
718         int err = 0;
719
720         *dst = NULL;
721         if (sk) {
722                 struct ipv6_pinfo *np = inet6_sk(sk);
723         
724                 *dst = sk_dst_check(sk, np->dst_cookie);
725                 if (*dst) {
726                         struct rt6_info *rt = (struct rt6_info*)*dst;
727         
728                                 /* Yes, checking route validity in not connected
729                                    case is not very simple. Take into account,
730                                    that we do not support routing by source, TOS,
731                                    and MSG_DONTROUTE            --ANK (980726)
732         
733                                    1. If route was host route, check that
734                                       cached destination is current.
735                                       If it is network route, we still may
736                                       check its validity using saved pointer
737                                       to the last used address: daddr_cache.
738                                       We do not want to save whole address now,
739                                       (because main consumer of this service
740                                        is tcp, which has not this problem),
741                                       so that the last trick works only on connected
742                                       sockets.
743                                    2. oif also should be the same.
744                                  */
745         
746                         if (((rt->rt6i_dst.plen != 128 ||
747                               !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
748                              && (np->daddr_cache == NULL ||
749                                  !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
750                             || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
751                                 dst_release(*dst);
752                                 *dst = NULL;
753                         }
754                 }
755         }
756
757         if (*dst == NULL)
758                 *dst = ip6_route_output(sk, fl);
759
760         if ((err = (*dst)->error))
761                 goto out_err_release;
762
763         if (ipv6_addr_any(&fl->fl6_src)) {
764                 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
765
766                 if (err)
767                         goto out_err_release;
768         }
769
770         return 0;
771
772 out_err_release:
773         dst_release(*dst);
774         *dst = NULL;
775         return err;
776 }
777
778 static inline int ip6_ufo_append_data(struct sock *sk,
779                         int getfrag(void *from, char *to, int offset, int len,
780                         int odd, struct sk_buff *skb),
781                         void *from, int length, int hh_len, int fragheaderlen,
782                         int transhdrlen, int mtu,unsigned int flags)
783
784 {
785         struct sk_buff *skb;
786         int err;
787
788         /* There is support for UDP large send offload by network
789          * device, so create one single skb packet containing complete
790          * udp datagram
791          */
792         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
793                 skb = sock_alloc_send_skb(sk,
794                         hh_len + fragheaderlen + transhdrlen + 20,
795                         (flags & MSG_DONTWAIT), &err);
796                 if (skb == NULL)
797                         return -ENOMEM;
798
799                 /* reserve space for Hardware header */
800                 skb_reserve(skb, hh_len);
801
802                 /* create space for UDP/IP header */
803                 skb_put(skb,fragheaderlen + transhdrlen);
804
805                 /* initialize network header pointer */
806                 skb->nh.raw = skb->data;
807
808                 /* initialize protocol header pointer */
809                 skb->h.raw = skb->data + fragheaderlen;
810
811                 skb->ip_summed = CHECKSUM_HW;
812                 skb->csum = 0;
813                 sk->sk_sndmsg_off = 0;
814         }
815
816         err = skb_append_datato_frags(sk,skb, getfrag, from,
817                                       (length - transhdrlen));
818         if (!err) {
819                 struct frag_hdr fhdr;
820
821                 /* specify the length of each IP datagram fragment*/
822                 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) - 
823                                                 sizeof(struct frag_hdr);
824                 ipv6_select_ident(skb, &fhdr);
825                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
826                 __skb_queue_tail(&sk->sk_write_queue, skb);
827
828                 return 0;
829         }
830         /* There is not enough support do UPD LSO,
831          * so follow normal path
832          */
833         kfree_skb(skb);
834
835         return err;
836 }
837
838 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
839         int offset, int len, int odd, struct sk_buff *skb),
840         void *from, int length, int transhdrlen,
841         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
842         struct rt6_info *rt, unsigned int flags)
843 {
844         struct inet_sock *inet = inet_sk(sk);
845         struct ipv6_pinfo *np = inet6_sk(sk);
846         struct sk_buff *skb;
847         unsigned int maxfraglen, fragheaderlen;
848         int exthdrlen;
849         int hh_len;
850         int mtu;
851         int copy;
852         int err;
853         int offset = 0;
854         int csummode = CHECKSUM_NONE;
855
856         if (flags&MSG_PROBE)
857                 return 0;
858         if (skb_queue_empty(&sk->sk_write_queue)) {
859                 /*
860                  * setup for corking
861                  */
862                 if (opt) {
863                         if (np->cork.opt == NULL) {
864                                 np->cork.opt = kmalloc(opt->tot_len,
865                                                        sk->sk_allocation);
866                                 if (unlikely(np->cork.opt == NULL))
867                                         return -ENOBUFS;
868                         } else if (np->cork.opt->tot_len < opt->tot_len) {
869                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
870                                 return -EINVAL;
871                         }
872                         memcpy(np->cork.opt, opt, opt->tot_len);
873                         inet->cork.flags |= IPCORK_OPT;
874                         /* need source address above miyazawa*/
875                 }
876                 dst_hold(&rt->u.dst);
877                 np->cork.rt = rt;
878                 inet->cork.fl = *fl;
879                 np->cork.hop_limit = hlimit;
880                 np->cork.tclass = tclass;
881                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
882                 if (dst_allfrag(rt->u.dst.path))
883                         inet->cork.flags |= IPCORK_ALLFRAG;
884                 inet->cork.length = 0;
885                 sk->sk_sndmsg_page = NULL;
886                 sk->sk_sndmsg_off = 0;
887                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
888                 length += exthdrlen;
889                 transhdrlen += exthdrlen;
890         } else {
891                 rt = np->cork.rt;
892                 fl = &inet->cork.fl;
893                 if (inet->cork.flags & IPCORK_OPT)
894                         opt = np->cork.opt;
895                 transhdrlen = 0;
896                 exthdrlen = 0;
897                 mtu = inet->cork.fragsize;
898         }
899
900         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
901
902         fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
903         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
904
905         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
906                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
907                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
908                         return -EMSGSIZE;
909                 }
910         }
911
912         /*
913          * Let's try using as much space as possible.
914          * Use MTU if total length of the message fits into the MTU.
915          * Otherwise, we need to reserve fragment header and
916          * fragment alignment (= 8-15 octects, in total).
917          *
918          * Note that we may need to "move" the data from the tail of
919          * of the buffer to the new fragment when we split 
920          * the message.
921          *
922          * FIXME: It may be fragmented into multiple chunks 
923          *        at once if non-fragmentable extension headers
924          *        are too large.
925          * --yoshfuji 
926          */
927
928         inet->cork.length += length;
929         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
930             (rt->u.dst.dev->features & NETIF_F_UFO)) {
931
932                 if(ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
933                                 fragheaderlen, transhdrlen, mtu, flags))
934                         goto error;
935
936                 return 0;
937         }
938
939         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
940                 goto alloc_new_skb;
941
942         while (length > 0) {
943                 /* Check if the remaining data fits into current packet. */
944                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
945                 if (copy < length)
946                         copy = maxfraglen - skb->len;
947
948                 if (copy <= 0) {
949                         char *data;
950                         unsigned int datalen;
951                         unsigned int fraglen;
952                         unsigned int fraggap;
953                         unsigned int alloclen;
954                         struct sk_buff *skb_prev;
955 alloc_new_skb:
956                         skb_prev = skb;
957
958                         /* There's no room in the current skb */
959                         if (skb_prev)
960                                 fraggap = skb_prev->len - maxfraglen;
961                         else
962                                 fraggap = 0;
963
964                         /*
965                          * If remaining data exceeds the mtu,
966                          * we know we need more fragment(s).
967                          */
968                         datalen = length + fraggap;
969                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
970                                 datalen = maxfraglen - fragheaderlen;
971
972                         fraglen = datalen + fragheaderlen;
973                         if ((flags & MSG_MORE) &&
974                             !(rt->u.dst.dev->features&NETIF_F_SG))
975                                 alloclen = mtu;
976                         else
977                                 alloclen = datalen + fragheaderlen;
978
979                         /*
980                          * The last fragment gets additional space at tail.
981                          * Note: we overallocate on fragments with MSG_MODE
982                          * because we have no idea if we're the last one.
983                          */
984                         if (datalen == length + fraggap)
985                                 alloclen += rt->u.dst.trailer_len;
986
987                         /*
988                          * We just reserve space for fragment header.
989                          * Note: this may be overallocation if the message 
990                          * (without MSG_MORE) fits into the MTU.
991                          */
992                         alloclen += sizeof(struct frag_hdr);
993
994                         if (transhdrlen) {
995                                 skb = sock_alloc_send_skb(sk,
996                                                 alloclen + hh_len,
997                                                 (flags & MSG_DONTWAIT), &err);
998                         } else {
999                                 skb = NULL;
1000                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1001                                     2 * sk->sk_sndbuf)
1002                                         skb = sock_wmalloc(sk,
1003                                                            alloclen + hh_len, 1,
1004                                                            sk->sk_allocation);
1005                                 if (unlikely(skb == NULL))
1006                                         err = -ENOBUFS;
1007                         }
1008                         if (skb == NULL)
1009                                 goto error;
1010                         /*
1011                          *      Fill in the control structures
1012                          */
1013                         skb->ip_summed = csummode;
1014                         skb->csum = 0;
1015                         /* reserve for fragmentation */
1016                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1017
1018                         /*
1019                          *      Find where to start putting bytes
1020                          */
1021                         data = skb_put(skb, fraglen);
1022                         skb->nh.raw = data + exthdrlen;
1023                         data += fragheaderlen;
1024                         skb->h.raw = data + exthdrlen;
1025
1026                         if (fraggap) {
1027                                 skb->csum = skb_copy_and_csum_bits(
1028                                         skb_prev, maxfraglen,
1029                                         data + transhdrlen, fraggap, 0);
1030                                 skb_prev->csum = csum_sub(skb_prev->csum,
1031                                                           skb->csum);
1032                                 data += fraggap;
1033                                 skb_trim(skb_prev, maxfraglen);
1034                         }
1035                         copy = datalen - transhdrlen - fraggap;
1036                         if (copy < 0) {
1037                                 err = -EINVAL;
1038                                 kfree_skb(skb);
1039                                 goto error;
1040                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1041                                 err = -EFAULT;
1042                                 kfree_skb(skb);
1043                                 goto error;
1044                         }
1045
1046                         offset += copy;
1047                         length -= datalen - fraggap;
1048                         transhdrlen = 0;
1049                         exthdrlen = 0;
1050                         csummode = CHECKSUM_NONE;
1051
1052                         /*
1053                          * Put the packet on the pending queue
1054                          */
1055                         __skb_queue_tail(&sk->sk_write_queue, skb);
1056                         continue;
1057                 }
1058
1059                 if (copy > length)
1060                         copy = length;
1061
1062                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1063                         unsigned int off;
1064
1065                         off = skb->len;
1066                         if (getfrag(from, skb_put(skb, copy),
1067                                                 offset, copy, off, skb) < 0) {
1068                                 __skb_trim(skb, off);
1069                                 err = -EFAULT;
1070                                 goto error;
1071                         }
1072                 } else {
1073                         int i = skb_shinfo(skb)->nr_frags;
1074                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1075                         struct page *page = sk->sk_sndmsg_page;
1076                         int off = sk->sk_sndmsg_off;
1077                         unsigned int left;
1078
1079                         if (page && (left = PAGE_SIZE - off) > 0) {
1080                                 if (copy >= left)
1081                                         copy = left;
1082                                 if (page != frag->page) {
1083                                         if (i == MAX_SKB_FRAGS) {
1084                                                 err = -EMSGSIZE;
1085                                                 goto error;
1086                                         }
1087                                         get_page(page);
1088                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1089                                         frag = &skb_shinfo(skb)->frags[i];
1090                                 }
1091                         } else if(i < MAX_SKB_FRAGS) {
1092                                 if (copy > PAGE_SIZE)
1093                                         copy = PAGE_SIZE;
1094                                 page = alloc_pages(sk->sk_allocation, 0);
1095                                 if (page == NULL) {
1096                                         err = -ENOMEM;
1097                                         goto error;
1098                                 }
1099                                 sk->sk_sndmsg_page = page;
1100                                 sk->sk_sndmsg_off = 0;
1101
1102                                 skb_fill_page_desc(skb, i, page, 0, 0);
1103                                 frag = &skb_shinfo(skb)->frags[i];
1104                                 skb->truesize += PAGE_SIZE;
1105                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1106                         } else {
1107                                 err = -EMSGSIZE;
1108                                 goto error;
1109                         }
1110                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1111                                 err = -EFAULT;
1112                                 goto error;
1113                         }
1114                         sk->sk_sndmsg_off += copy;
1115                         frag->size += copy;
1116                         skb->len += copy;
1117                         skb->data_len += copy;
1118                 }
1119                 offset += copy;
1120                 length -= copy;
1121         }
1122         return 0;
1123 error:
1124         inet->cork.length -= length;
1125         IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1126         return err;
1127 }
1128
1129 int ip6_push_pending_frames(struct sock *sk)
1130 {
1131         struct sk_buff *skb, *tmp_skb;
1132         struct sk_buff **tail_skb;
1133         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1134         struct inet_sock *inet = inet_sk(sk);
1135         struct ipv6_pinfo *np = inet6_sk(sk);
1136         struct ipv6hdr *hdr;
1137         struct ipv6_txoptions *opt = np->cork.opt;
1138         struct rt6_info *rt = np->cork.rt;
1139         struct flowi *fl = &inet->cork.fl;
1140         unsigned char proto = fl->proto;
1141         int err = 0;
1142
1143         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1144                 goto out;
1145         tail_skb = &(skb_shinfo(skb)->frag_list);
1146
1147         /* move skb->data to ip header from ext header */
1148         if (skb->data < skb->nh.raw)
1149                 __skb_pull(skb, skb->nh.raw - skb->data);
1150         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1151                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1152                 *tail_skb = tmp_skb;
1153                 tail_skb = &(tmp_skb->next);
1154                 skb->len += tmp_skb->len;
1155                 skb->data_len += tmp_skb->len;
1156                 skb->truesize += tmp_skb->truesize;
1157                 __sock_put(tmp_skb->sk);
1158                 tmp_skb->destructor = NULL;
1159                 tmp_skb->sk = NULL;
1160         }
1161
1162         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1163         __skb_pull(skb, skb->h.raw - skb->nh.raw);
1164         if (opt && opt->opt_flen)
1165                 ipv6_push_frag_opts(skb, opt, &proto);
1166         if (opt && opt->opt_nflen)
1167                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1168
1169         skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1170         
1171         *(u32*)hdr = fl->fl6_flowlabel |
1172                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1173
1174         if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1175                 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1176         else
1177                 hdr->payload_len = 0;
1178         hdr->hop_limit = np->cork.hop_limit;
1179         hdr->nexthdr = proto;
1180         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1181         ipv6_addr_copy(&hdr->daddr, final_dst);
1182
1183         skb->dst = dst_clone(&rt->u.dst);
1184         IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 
1185         err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1186         if (err) {
1187                 if (err > 0)
1188                         err = np->recverr ? net_xmit_errno(err) : 0;
1189                 if (err)
1190                         goto error;
1191         }
1192
1193 out:
1194         inet->cork.flags &= ~IPCORK_OPT;
1195         kfree(np->cork.opt);
1196         np->cork.opt = NULL;
1197         if (np->cork.rt) {
1198                 dst_release(&np->cork.rt->u.dst);
1199                 np->cork.rt = NULL;
1200                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1201         }
1202         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1203         return err;
1204 error:
1205         goto out;
1206 }
1207
1208 void ip6_flush_pending_frames(struct sock *sk)
1209 {
1210         struct inet_sock *inet = inet_sk(sk);
1211         struct ipv6_pinfo *np = inet6_sk(sk);
1212         struct sk_buff *skb;
1213
1214         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1215                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1216                 kfree_skb(skb);
1217         }
1218
1219         inet->cork.flags &= ~IPCORK_OPT;
1220
1221         kfree(np->cork.opt);
1222         np->cork.opt = NULL;
1223         if (np->cork.rt) {
1224                 dst_release(&np->cork.rt->u.dst);
1225                 np->cork.rt = NULL;
1226                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1227         }
1228         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1229 }