/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 */
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>

#include <net/checksum.h>
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
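/*
 * Pick the Identification value for a Fragment header.  A single global
 * counter, protected by ip6_id_lock, is shared by all flows; it wraps
 * from 0xffffffff back to 1 so that the value 0 is never handed out.
 */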
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}
static inline int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}
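/*
 * ip6_output2() handles the tail of the output path: multicast
 * destinations the local node has joined are looped back through
 * ip6_dev_loopback_xmit() (unless the sender cleared
 * IPV6_MULTICAST_LOOP), packets with a zero hop limit are discarded,
 * and everything else goes through the NF_IP6_POST_ROUTING hook to
 * ip6_output_finish().
 */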
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
					&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}
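/*
 * ip6_output(): packets larger than the path MTU (unless UFO will
 * segment them in hardware) and dsts marked "allfrag" are fragmented
 * via ip6_fragment(); everything else is passed straight to
 * ip6_output2().
 */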
int ip6_output(struct sk_buff *skb)
{
	if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}
/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	u8 proto = fl->proto;
	int seg_len = skb->len;

	/* First: exthdrs may take lots of space (~8K for now)
	   MAX_HEADER is not enough.
	 */
	head_room = opt->opt_nflen + opt->opt_flen;
	seg_len += head_room;
	head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

	if (skb_headroom(skb) < head_room) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (skb2 == NULL) {
			IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		kfree_skb(skb);
		skb = skb2;
		if (sk)
			skb_set_owner_w(skb, sk);
	}

	ipv6_push_frag_opts(skb, opt, &proto);
	ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);

	hdr = skb->nh.ipv6h = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr));

	/*
	 *	Fill in the IPv6 header
	 */
	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
	if (hlimit < 0)
		hlimit = ipv6_get_hoplimit(dst->dev);

	*(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok) {
		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       struct in6_addr *saddr, struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));

	*(u32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}
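/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * that registered interest in this alert value (via the
 * IPV6_ROUTER_ALERT socket option) and, if bound to a device, only on
 * that device.  All listeners but the last get a clone; returns 1 if
 * at least one listener took the packet.
 */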
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We DO NOT make any processing on RA packets, pushing them
	 *	to user level AS IS without any warranty that the
	 *	application will be able to interpret them. The reason is
	 *	that we cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains AH/ESP
	 *	we cannot do anything. Defragmentation would also be a
	 *	mistake; RA packets cannot be fragmented, because there is
	 *	no guarantee that different fragments will go along one
	 *	path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb->nh.raw + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */
		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
						|IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = skb->nh.ipv6h;

	/* Mangling the hop limit is delayed until after the skb COW */
	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
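/*
 * Copy the per-packet metadata (packet type, priority, dst reference,
 * traffic-control index and netfilter/conntrack state) from the
 * original skb to a freshly built fragment, so each fragment is
 * handled like the packet it came from.
 */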
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	nf_conntrack_put(to->nfct);
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	nf_conntrack_put_reasm(to->nfct_reasm);
	to->nfct_reasm = from->nfct_reasm;
	nf_conntrack_get_reasm(to->nfct_reasm);
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#endif
}
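/*
 * Walk the extension header chain and return the offset at which a
 * Fragment header should be inserted: past any Hop-by-Hop, Routing and
 * (pre-routing) Destination Options headers, i.e. roughly after the
 * unfragmentable part of the header chain.  *nexthdr is left pointing
 * at the nexthdr byte that will have to be rewritten.
 */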
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(skb->nh.ipv6h + 1);
	unsigned int packet_len = skb->tail - skb->nh.raw;
	int found_rhdr = 0;
	*nexthdr = &skb->nh.ipv6h->nexthdr;

	while (offset + 1 <= packet_len) {
		switch (**nexthdr) {
		case NEXTHDR_HOP:
		case NEXTHDR_ROUTING:
		case NEXTHDR_DEST:
			if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
			if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
			offset += ipv6_optlen(exthdr);
			*nexthdr = &exthdr->nexthdr;
			exthdr = (struct ipv6_opt_hdr *)(skb->nh.raw + offset);
			break;
		default:
			return offset;
		}
	}

	return offset;
}
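/*
 * Fragment an oversized packet.  Two paths: if the skb already carries
 * a suitable frag_list (e.g. built by ip6_append_data()), the
 * fragments are fixed up in place and sent one after another ("fast
 * path"); otherwise new skbs are allocated and the payload is copied
 * into them piece by piece ("slow path").  Either way, output() is
 * called for every resulting fragment.
 */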
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6hdr *tmp_hdr;
	unsigned int mtu, hlen, left, len;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	hlen = ip6_find_1stfragopt(skb, &prevhdr);

	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)

			/* Partially cloned skb? */
			if (skb_shared(frag))

			frag->destructor = sock_wfree;
			skb->truesize -= frag->truesize;
		}

		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;

		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);

		*prevhdr = NEXTHDR_FRAGMENT;
		memcpy(tmp_hdr, skb->nh.raw, hlen);
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		skb->nh.raw = __skb_push(skb, hlen);
		memcpy(skb->nh.raw, tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		skb->nh.ipv6h->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		/* Prepare the header of the next frame,
		 * before the previous one is sent. */
		frag->ip_summed = CHECKSUM_NONE;
		frag->h.raw = frag->data;
		fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
		frag->nh.raw = __skb_push(frag, hlen);
		memcpy(frag->nh.raw, tmp_hdr, hlen);
		offset += skb->len - hlen - sizeof(struct frag_hdr);
		fh->nexthdr = nexthdr;
		fh->frag_off = htons(offset);
		if (frag->next != NULL)
			fh->frag_off |= htons(IP6_MF);
		fh->identification = frag_id;
		frag->nh.ipv6h->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));
		ip6_copy_metadata(frag, skb);

		IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);

		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */

		/* IF: it doesn't fit, use 'mtu' - the data space left */

		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_RESERVED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		frag->nh.raw = frag->data;
		fh = (struct frag_hdr *)(frag->data + hlen);
		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

		/*
		 *	Charge the memory for the fragment to any owner
		 */
		skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		memcpy(frag->nh.raw, skb->data, hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;

		ipv6_select_ident(skb, fh);
		frag_id = fh->identification;

		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, frag->h.raw, len))

		fh->frag_off = htons(offset);

		fh->frag_off |= htons(IP6_MF);
		frag->nh.ipv6h->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		/*
		 *	Put this fragment into the sending queue.
		 */
		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);

	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
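/*
 * Resolve the dst entry for a flow.  A cached socket route is reused
 * only while it is still valid for this destination and interface (see
 * the comment inside); otherwise a fresh route lookup is done and, if
 * the flow still lacks a source address, one is selected for the
 * outgoing route.
 */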
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		*dst = sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info *)*dst;

			/* Yes, checking route validity in the not connected
			   case is not very simple. Take into account that
			   we do not support routing by source, TOS,
			   and MSG_DONTROUTE		--ANK (980726)

			   1. If route was host route, check that
			      cached destination is current.
			      If it is network route, we still may
			      check its validity using saved pointer
			      to the last used address: daddr_cache.
			      We do not want to save whole address now,
			      (because main consumer of this service
			      is tcp, which does not have this problem),
			      so that the last trick works only on connected
			      sockets.
			   2. oif also should be the same.
			 */
			if (((rt->rt6i_dst.plen != 128 ||
			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				dst_release(*dst);
				*dst = NULL;
			}
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}

EXPORT_SYMBOL_GPL(ip6_dst_lookup);
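/*
 * UFO path of ip6_append_data(): when the device can do UDP
 * fragmentation offload, the whole datagram is accumulated in a single
 * skb (payload in page frags) and the per-packet fragment size and
 * fragment ID are recorded in skb_shinfo() so they can be used when the
 * packet is finally segmented.
 */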
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
				    int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
					  hh_len + fragheaderlen + transhdrlen + 20,
					  (flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb->nh.raw = skb->data;

		/* initialize protocol header pointer */
		skb->h.raw = skb->data + fragheaderlen;

		skb->ip_summed = CHECKSUM_HW;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
					    sizeof(struct frag_hdr);
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	return err;
}
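/*
 * Append data to the pending queue of a corked socket (typical callers
 * are the UDP, ICMPv6 and raw socket sendmsg paths).  The first call
 * for a cork saves the options, route, hop limit and traffic class in
 * np->cork / inet->cork; subsequent calls keep adding data, growing the
 * tail skb or allocating new ones of up to maxfraglen bytes, until
 * ip6_push_pending_frames() finally builds and sends the packet.
 */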
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
			int offset, int len, int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    int hlimit, int tclass, struct ipv6_txoptions *opt,
		    struct flowi *fl, struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		if (np->cork.opt == NULL) {
			np->cork.opt = kmalloc(opt->tot_len,
			if (unlikely(np->cork.opt == NULL))
		} else if (np->cork.opt->tot_len < opt->tot_len) {
			printk(KERN_DEBUG "ip6_append_data: invalid option length\n");

		memcpy(np->cork.opt, opt, opt->tot_len);
		inet->cork.flags |= IPCORK_OPT;
		/* need source address above --miyazawa */

		dst_hold(&rt->u.dst);
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
		transhdrlen += exthdrlen;

	if (inet->cork.flags & IPCORK_OPT)

	mtu = inet->cork.fragsize;

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) +
		    IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 */
	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		if (ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					fragheaderlen, transhdrlen, mtu, flags))

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)

		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu &&
			!(inet->cork.flags & IPCORK_ALLFRAG) ?
			mtu : maxfraglen) - skb->len;

		copy = maxfraglen - skb->len;

			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;

			/* There's no room in the current skb */
			fraggap = skb_prev->len - maxfraglen;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu &&
				       !(inet->cork.flags & IPCORK_ALLFRAG) ?
				       mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))

			alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			skb = sock_alloc_send_skb(sk,
						  (flags & MSG_DONTWAIT), &err);

				if (atomic_read(&sk->sk_wmem_alloc) <=

					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,

				if (unlikely(skb == NULL))

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;

			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,

				skb_trim(skb_prev, maxfraglen);

			copy = datalen - transhdrlen - fraggap;

			} else if (copy > 0 &&
				   getfrag(from, data + transhdrlen, offset,
					   copy, fraggap, skb) < 0) {

			length -= datalen - fraggap;

			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {

			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);

			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;

			if (page && (left = PAGE_SIZE - off) > 0) {

				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {

					skb_fill_page_desc(skb, i, page,
							   sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];

			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)

				page = alloc_pages(sk->sk_allocation, 0);

				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);

			if (getfrag(from,
				    page_address(frag->page) +
				    frag->page_offset + frag->size,
				    offset, copy, skb->len, skb) < 0) {

			sk->sk_sndmsg_off += copy;

			skb->data_len += copy;

	inet->cork.length -= length;
	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
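/*
 * Take everything queued on sk->sk_write_queue by ip6_append_data(),
 * glue it into one skb (queued skbs become the frag_list), prepend the
 * extension headers and the IPv6 header saved in the cork, and hand
 * the result to the LOCAL_OUT netfilter hook / dst_output().  The cork
 * state is released afterwards.
 */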
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = np->cork.rt;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)

	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
	}

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb->h.raw - skb->nh.raw);
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb->nh.ipv6h = hdr = (struct ipv6hdr *)skb_push(skb,
						sizeof(struct ipv6hdr));

	*(u32 *)hdr = fl->fl6_flowlabel |
		      htonl(0x60000000 | ((int)np->cork.tclass << 20));

	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
	else
		hdr->payload_len = 0;
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev,
		      dst_output);

	err = np->recverr ? net_xmit_errno(err) : 0;

	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	dst_release(&np->cork.rt->u.dst);
	inet->cork.flags &= ~IPCORK_ALLFRAG;
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
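/*
 * Drop whatever is still sitting on the write queue (e.g. after an
 * error in ip6_append_data()) and release the cork state without
 * transmitting anything.
 */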
void ip6_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	inet->cork.flags &= ~IPCORK_OPT;

	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}