linux-2.6: net/ipv6/ip6_output.c
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

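/*
 * Pick an identification value for a fragment header.  A single global
 * counter is shared by all flows and protected by a spinlock; zero is
 * skipped when the counter wraps, so an ID of 0 never goes on the wire.
 */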
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
        static u32 ipv6_fragmentation_id = 1;
        static DEFINE_SPINLOCK(ip6_id_lock);

        spin_lock_bh(&ip6_id_lock);
        fhdr->identification = htonl(ipv6_fragmentation_id);
        if (++ipv6_fragmentation_id == 0)
                ipv6_fragmentation_id = 1;
        spin_unlock_bh(&ip6_id_lock);
}

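/*
 * Final output step: if the destination entry has a cached hardware
 * header, copy it in front of the packet and call the cached output
 * method; otherwise fall back to the neighbour's output function.
 * With neither available the packet cannot be sent and is dropped,
 * counted as OutNoRoutes.
 */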
static inline int ip6_output_finish(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;

        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);

        netif_rx(newskb);
        return 0;
}

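/*
 * Second-stage output.  For multicast destinations a clone of the
 * packet is looped back to the local stack when the sending socket
 * wants multicast loopback and the group is joined on the device;
 * the original is then passed through the NF_IP6_POST_ROUTING hook.
 */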
static int ip6_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
                struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

                if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
                    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
                                &skb->nh.ipv6h->saddr)) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip6_dev_loopback_xmit);

                        if (skb->nh.ipv6h->hop_limit == 0) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        }

        return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
                       ip6_output_finish);
}

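/*
 * Entry point from dst_output(): fragment when the packet exceeds the
 * path MTU (and is not a UFO packet) or when the route demands
 * fragmentation of every packet (dst_allfrag); otherwise send as is.
 */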
int ip6_output(struct sk_buff *skb)
{
        if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
                                dst_allfrag(skb->dst))
                return ip6_fragment(skb, ip6_output2);
        else
                return ip6_output2(skb);
}

/*
 *      xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
        struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit, tclass;
        u32 mtu;

        if (opt) {
                int head_room;

                /* First: exthdrs may take lots of space (~8K for now)
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                return -ENOBUFS;
                        }
                        if (sk)
                                skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

        /*
         *      Fill in the IPv6 header
         */

        hlimit = -1;
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = dst_metric(dst, RTAX_HOPLIMIT);
        if (hlimit < 0)
                hlimit = ipv6_get_hoplimit(dst->dev);

        tclass = -1;
        if (np)
                tclass = np->tclass;
        if (tclass < 0)
                tclass = 0;

        *(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
                IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
                return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
                                dst_output);
        }

        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}

/*
 *      To avoid extra problems ND packets are sent through this
 *      routine. It's code duplication but I really want to avoid
 *      extra checks since ipv6_build_header is used by TCP (which
 *      is performance-critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               struct in6_addr *saddr, struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        int totlen;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        totlen = len + sizeof(struct ipv6hdr);

        hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
        skb->nh.ipv6h = hdr;

        *(u32*)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        ipv6_addr_copy(&hdr->saddr, saddr);
        ipv6_addr_copy(&hdr->daddr, daddr);

        return 0;
}

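/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * registered for the given alert value (see IPV6_ROUTER_ALERT).  Each
 * matching socket but the last gets a clone; the last one consumes the
 * original skb.  Returns 1 if the packet was delivered to someone.
 */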
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

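/*
 * Forward path.  Checks forwarding policy and hop limit, lets the
 * Router Alert chain steal the packet if applicable, emits redirects
 * and Packet Too Big errors where required, then decrements hop_limit
 * (only after skb_cow(), so a shared header is never written to) and
 * runs the NF_IP6_FORWARD hook.
 */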
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr = skb->nh.ipv6h;
        struct inet6_skb_parm *opt = IP6CB(skb);

        if (ipv6_devconf.forwarding == 0)
                goto error;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb->ip_summed = CHECKSUM_NONE;

        /*
         *      We do NOT do any processing on RA packets: they are
         *      pushed to user level AS IS, without any guarantee that
         *      an application will be able to interpret them. The
         *      reason is that we cannot do anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything with it. Defragmenting it
         *      would also be a mistake: RA packets must not be
         *      fragmented, because there is no guarantee that different
         *      fragments will follow the same path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb->nh.raw + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
                            0, skb->dev);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb->dst;

        /* The IPv6 specs say nothing about it, but it is clear that we
           cannot send redirects to source routed frames.
         */
        if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if ((rt->rt6i_flags & RTF_GATEWAY))
                        target = (struct in6_addr*)&n->primary_key;
                else
                        target = &hdr->daddr;

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (xrlim_allow(dst, 1*HZ))
                        ndisc_send_redirect(skb, n, target);
        } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
                                                |IPV6_ADDR_LINKLOCAL)) {
                /* This check is security critical. */
                goto error;
        }

        if (skb->len > dst_mtu(dst)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
                IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = skb->nh.ipv6h;

        /* Mangling hops number delayed to point after skb COW */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

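/*
 * Propagate per-packet metadata (packet type, priority, device, dst,
 * traffic-control index and netfilter state) from the original packet
 * to a fragment, so each fragment is handled like the packet it came
 * from.
 */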
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
        to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#endif
}

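/*
 * Find the offset at which a Fragment header should be inserted: skip
 * past the per-fragment headers (Hop-by-Hop, Routing, Destination
 * Options), but stop at a Destination Options header that follows a
 * Routing header, since that one is only examined by the final
 * destination and belongs in the fragmentable part.  On return,
 * *nexthdr points at the nexthdr field preceding the insertion point.
 */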
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
        unsigned int packet_len = skb->tail - skb->nh.raw;
        int found_rhdr = 0;
        *nexthdr = &skb->nh.ipv6h->nexthdr;

        while (offset + 1 <= packet_len) {

                switch (**nexthdr) {

                case NEXTHDR_HOP:
                case NEXTHDR_ROUTING:
                case NEXTHDR_DEST:
                        if (**nexthdr == NEXTHDR_ROUTING)
                                found_rhdr = 1;
                        if (**nexthdr == NEXTHDR_DEST && found_rhdr)
                                return offset;
                        offset += ipv6_optlen(exthdr);
                        *nexthdr = &exthdr->nexthdr;
                        exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
                        break;
                default:
                        return offset;
                }
        }

        return offset;
}

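/*
 * Fragment an IPv6 packet.  If the data already sits in a frag_list of
 * suitably sized, unshared buffers, each element is turned into a
 * fragment in place (fast path); otherwise the slow path allocates a
 * fresh skb per fragment and copies the payload into it.
 */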
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct net_device *dev;
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info*)skb->dst;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        u32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        dev = rt->u.dst.dev;
        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

        if (skb_shinfo(skb)->frag_list) {
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                /* BUILD HEADER */

                tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                *prevhdr = NEXTHDR_FRAGMENT;
                memcpy(tmp_hdr, skb->nh.raw, hlen);
                __skb_pull(skb, hlen);
                fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
                skb->nh.raw = __skb_push(skb, hlen);
                memcpy(skb->nh.raw, tmp_hdr, hlen);

                ipv6_select_ident(skb, fh);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, tmp_hdr, hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                      LL_RESERVED_SPACE(rt->u.dst.dev),
                                      GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                frag->nh.raw = frag->data;
                fh = (struct frag_hdr*)(frag->data + hlen);
                frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                memcpy(frag->nh.raw, skb->data, hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(skb, fh);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, frag->h.raw, len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                err = output(frag);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

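/*
 * Resolve the route for a flow.  A route cached on the socket is
 * reused when it is still valid for the flow's destination and
 * outgoing interface; otherwise a fresh routing lookup is done, and a
 * source address is selected if the caller left it unspecified.
 */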
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        int err = 0;

        *dst = NULL;
        if (sk) {
                struct ipv6_pinfo *np = inet6_sk(sk);

                *dst = sk_dst_check(sk, np->dst_cookie);
                if (*dst) {
                        struct rt6_info *rt = (struct rt6_info*)*dst;

                        /* Yes, checking route validity in the unconnected
                           case is not very simple. Take into account that
                           we do not support routing by source, TOS, and
                           MSG_DONTROUTE.              --ANK (980726)

                           1. If route was host route, check that
                              cached destination is current.
                              If it is network route, we still may
                              check its validity using saved pointer
                              to the last used address: daddr_cache.
                              We do not want to save whole address now,
                              (because main consumer of this service
                               is tcp, which does not have this problem),
                              so that the last trick works only on connected
                              sockets.
                           2. oif also should be the same.
                         */

                        if (((rt->rt6i_dst.plen != 128 ||
                              !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
                             && (np->daddr_cache == NULL ||
                                 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
                            || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
                                dst_release(*dst);
                                *dst = NULL;
                        }
                }
        }

        if (*dst == NULL)
                *dst = ip6_route_output(sk, fl);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl->fl6_src)) {
                err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

                if (err)
                        goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;
        return err;
}

inline int ip6_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)
{
        struct sk_buff *skb;
        int err;

        /* There is support for UDP large send offload by network
         * device, so create one single skb packet containing complete
         * udp datagram
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (skb == NULL)
                        return -ENOMEM;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb->nh.raw = skb->data;

                /* initialize protocol header pointer */
                skb->h.raw = skb->data + fragheaderlen;

                skb->ip_summed = CHECKSUM_HW;
                skb->csum = 0;
                sk->sk_sndmsg_off = 0;
        }

        err = skb_append_datato_frags(sk, skb, getfrag, from,
                                      (length - transhdrlen));
        if (!err) {
                struct frag_hdr fhdr;

                /* specify the length of each IP datagram fragment */
                skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
                                                sizeof(struct frag_hdr);
                ipv6_select_ident(skb, &fhdr);
                skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
                __skb_queue_tail(&sk->sk_write_queue, skb);

                return 0;
        }
        /* There is not enough support to do UDP LSO,
         * so follow the normal path
         */
        kfree_skb(skb);

        return err;
}

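/*
 * Queue data on the socket's write queue for later transmission by
 * ip6_push_pending_frames().  The first call "corks" the socket,
 * capturing options, route and flow; subsequent calls append data,
 * growing the tail skb or starting a new one so that every queued
 * buffer fits within one fragment.
 */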
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
        int offset, int len, int odd, struct sk_buff *skb),
        void *from, int length, int transhdrlen,
        int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
        struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (np->cork.opt == NULL) {
                                np->cork.opt = kmalloc(opt->tot_len,
                                                       sk->sk_allocation);
                                if (unlikely(np->cork.opt == NULL))
                                        return -ENOBUFS;
                        } else if (np->cork.opt->tot_len < opt->tot_len) {
                                printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
                                return -EINVAL;
                        }
                        memcpy(np->cork.opt, opt, opt->tot_len);
                        inet->cork.flags |= IPCORK_OPT;
                        /* need source address above. --miyazawa */
                }
                dst_hold(&rt->u.dst);
                np->cork.rt = rt;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
                np->cork.tclass = tclass;
                inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
                if (dst_allfrag(rt->u.dst.path))
                        inet->cork.flags |= IPCORK_ALLFRAG;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                rt = np->cork.rt;
                fl = &inet->cork.fl;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) +
                    IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        inet->cork.length += length;
        if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {

                if (ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
                                fragheaderlen, transhdrlen, mtu, flags))
                        goto error;

                return 0;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;

                        /* There's no room in the current skb */
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;

                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /*
                         * The last fragment gets additional space at tail.
                         * Note: we overallocate on fragments with MSG_MORE
                         * because we have no idea if we're the last one.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                skb_trim(skb_prev, maxfraglen);
                        }
                        copy = datalen - transhdrlen - fraggap;
                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from,
                                    page_address(frag->page) +
                                    frag->page_offset + frag->size,
                                    offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }
        return 0;
error:
        inet->cork.length -= length;
        IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

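/*
 * Transmit everything queued by ip6_append_data(): chain the queued
 * buffers onto the first skb's frag_list, push the extension headers
 * and the IPv6 header, then send the result through NF_IP6_LOCAL_OUT.
 * The cork state is released on both success and failure.
 */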
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
        struct rt6_info *rt = np->cork.rt;
        struct flowi *fl = &inet->cork.fl;
        unsigned char proto = fl->proto;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        ipv6_addr_copy(final_dst, &fl->fl6_dst);
        __skb_pull(skb, skb->h.raw - skb->nh.raw);
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));

        *(u32*)hdr = fl->fl6_flowlabel |
                     htonl(0x60000000 | ((int)np->cork.tclass << 20));

        if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
                hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
        else
                hdr->payload_len = 0;
        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, final_dst);

        skb->dst = dst_clone(&rt->u.dst);
        IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev,
                      dst_output);
        if (err) {
                if (err > 0)
                        err = np->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
        return err;
error:
        goto out;
}

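/*
 * Abort a corked transmission: discard everything queued by
 * ip6_append_data() and release the cork state.
 */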
void ip6_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        inet->cork.flags &= ~IPCORK_OPT;

        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}