Merge HEAD from ../scsi-misc-2.6-old
[linux-2.6] / net / ipv4 / ip_output.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              The Internet Protocol (IP) output module.
7  *
8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Donald Becker, <becker@super.org>
13  *              Alan Cox, <Alan.Cox@linux.org>
14  *              Richard Underwood
15  *              Stefan Becker, <stefanb@yello.ping.de>
16  *              Jorge Cwik, <jorge@laser.satlink.net>
17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
19  *
20  *      See ip_input.c for original log
21  *
22  *      Fixes:
23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
25  *              Bradford Johnson:       Fix faulty handling of some frames when 
26  *                                      no route is found.
27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
28  *                                      (in case if packet not accepted by
29  *                                      output firewall rules)
30  *              Mike McLagan    :       Routing by source
31  *              Alexey Kuznetsov:       use new route cache
32  *              Andi Kleen:             Fix broken PMTU recovery and remove
33  *                                      some redundant tests.
34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path 
37  *                                      for decreased register pressure on x86 
38  *                                      and more readability.
39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
40  *                                      silently drop skb instead of failing with -EPERM.
41  *              Detlev Wengorz  :       Copy protocol for fragments.
42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
43  *                                      datagrams.
44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
45  */
46
47 #include <asm/uaccess.h>
48 #include <asm/system.h>
49 #include <linux/module.h>
50 #include <linux/types.h>
51 #include <linux/kernel.h>
52 #include <linux/sched.h>
53 #include <linux/mm.h>
54 #include <linux/string.h>
55 #include <linux/errno.h>
56 #include <linux/config.h>
57
58 #include <linux/socket.h>
59 #include <linux/sockios.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/etherdevice.h>
64 #include <linux/proc_fs.h>
65 #include <linux/stat.h>
66 #include <linux/init.h>
67
68 #include <net/snmp.h>
69 #include <net/ip.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <linux/skbuff.h>
73 #include <net/sock.h>
74 #include <net/arp.h>
75 #include <net/icmp.h>
76 #include <net/checksum.h>
77 #include <net/inetpeer.h>
78 #include <net/checksum.h>
79 #include <linux/igmp.h>
80 #include <linux/netfilter_ipv4.h>
81 #include <linux/netfilter_bridge.h>
82 #include <linux/mroute.h>
83 #include <linux/netlink.h>
84 #include <linux/tcp.h>
85
/*
 * Default TTL for IP packets, initialised to IPDEFTTL.
 * NOTE(review): the name suggests this is exposed as a sysctl tunable;
 * the registration is not visible in this part of the file - confirm.
 */
int sysctl_ip_default_ttl = IPDEFTTL;
87
88 /* Generate a checksum for an outgoing IP datagram. */
89 __inline__ void ip_send_check(struct iphdr *iph)
90 {
91         iph->check = 0;
92         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
93 }
94
95 /* dev_loopback_xmit for use with netfilter. */
96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
97 {
98         newskb->mac.raw = newskb->data;
99         __skb_pull(newskb, newskb->nh.raw - newskb->data);
100         newskb->pkt_type = PACKET_LOOPBACK;
101         newskb->ip_summed = CHECKSUM_UNNECESSARY;
102         BUG_TRAP(newskb->dst);
103         netif_rx(newskb);
104         return 0;
105 }
106
107 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
108 {
109         int ttl = inet->uc_ttl;
110
111         if (ttl < 0)
112                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
113         return ttl;
114 }
115
116 /* 
117  *              Add an ip header to a skbuff and send it out.
118  *
119  */
120 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
121                           u32 saddr, u32 daddr, struct ip_options *opt)
122 {
123         struct inet_sock *inet = inet_sk(sk);
124         struct rtable *rt = (struct rtable *)skb->dst;
125         struct iphdr *iph;
126
127         /* Build the IP header. */
128         if (opt)
129                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
130         else
131                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
132
133         iph->version  = 4;
134         iph->ihl      = 5;
135         iph->tos      = inet->tos;
136         if (ip_dont_fragment(sk, &rt->u.dst))
137                 iph->frag_off = htons(IP_DF);
138         else
139                 iph->frag_off = 0;
140         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
141         iph->daddr    = rt->rt_dst;
142         iph->saddr    = rt->rt_src;
143         iph->protocol = sk->sk_protocol;
144         iph->tot_len  = htons(skb->len);
145         ip_select_ident(iph, &rt->u.dst, sk);
146         skb->nh.iph   = iph;
147
148         if (opt && opt->optlen) {
149                 iph->ihl += opt->optlen>>2;
150                 ip_options_build(skb, opt, daddr, rt, 0);
151         }
152         ip_send_check(iph);
153
154         skb->priority = sk->sk_priority;
155
156         /* Send it out. */
157         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
158                        dst_output);
159 }
160
161 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
162
163 static inline int ip_finish_output2(struct sk_buff *skb)
164 {
165         struct dst_entry *dst = skb->dst;
166         struct hh_cache *hh = dst->hh;
167         struct net_device *dev = dst->dev;
168         int hh_len = LL_RESERVED_SPACE(dev);
169
170         /* Be paranoid, rather than too clever. */
171         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
172                 struct sk_buff *skb2;
173
174                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
175                 if (skb2 == NULL) {
176                         kfree_skb(skb);
177                         return -ENOMEM;
178                 }
179                 if (skb->sk)
180                         skb_set_owner_w(skb2, skb->sk);
181                 kfree_skb(skb);
182                 skb = skb2;
183         }
184
185         if (hh) {
186                 int hh_alen;
187
188                 read_lock_bh(&hh->hh_lock);
189                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
190                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
191                 read_unlock_bh(&hh->hh_lock);
192                 skb_push(skb, hh->hh_len);
193                 return hh->hh_output(skb);
194         } else if (dst->neighbour)
195                 return dst->neighbour->output(skb);
196
197         if (net_ratelimit())
198                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
199         kfree_skb(skb);
200         return -EINVAL;
201 }
202
203 static inline int ip_finish_output(struct sk_buff *skb)
204 {
205         struct net_device *dev = skb->dst->dev;
206
207         skb->dev = dev;
208         skb->protocol = htons(ETH_P_IP);
209
210         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
211                        ip_finish_output2);
212 }
213
214 int ip_mc_output(struct sk_buff *skb)
215 {
216         struct sock *sk = skb->sk;
217         struct rtable *rt = (struct rtable*)skb->dst;
218         struct net_device *dev = rt->u.dst.dev;
219
220         /*
221          *      If the indicated interface is up and running, send the packet.
222          */
223         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
224
225         skb->dev = dev;
226         skb->protocol = htons(ETH_P_IP);
227
228         /*
229          *      Multicasts are looped back for other local users
230          */
231
232         if (rt->rt_flags&RTCF_MULTICAST) {
233                 if ((!sk || inet_sk(sk)->mc_loop)
234 #ifdef CONFIG_IP_MROUTE
235                 /* Small optimization: do not loopback not local frames,
236                    which returned after forwarding; they will be  dropped
237                    by ip_mr_input in any case.
238                    Note, that local frames are looped back to be delivered
239                    to local recipients.
240
241                    This check is duplicated in ip_mr_input at the moment.
242                  */
243                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
244 #endif
245                 ) {
246                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
247                         if (newskb)
248                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
249                                         newskb->dev, 
250                                         ip_dev_loopback_xmit);
251                 }
252
253                 /* Multicasts with ttl 0 must not go beyond the host */
254
255                 if (skb->nh.iph->ttl == 0) {
256                         kfree_skb(skb);
257                         return 0;
258                 }
259         }
260
261         if (rt->rt_flags&RTCF_BROADCAST) {
262                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
263                 if (newskb)
264                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
265                                 newskb->dev, ip_dev_loopback_xmit);
266         }
267
268         if (skb->len > dst_mtu(&rt->u.dst))
269                 return ip_fragment(skb, ip_finish_output);
270         else
271                 return ip_finish_output(skb);
272 }
273
274 int ip_output(struct sk_buff *skb)
275 {
276         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
277
278         if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
279                 return ip_fragment(skb, ip_finish_output);
280         else
281                 return ip_finish_output(skb);
282 }
283
284 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
285 {
286         struct sock *sk = skb->sk;
287         struct inet_sock *inet = inet_sk(sk);
288         struct ip_options *opt = inet->opt;
289         struct rtable *rt;
290         struct iphdr *iph;
291
292         /* Skip all of this if the packet is already routed,
293          * f.e. by something like SCTP.
294          */
295         rt = (struct rtable *) skb->dst;
296         if (rt != NULL)
297                 goto packet_routed;
298
299         /* Make sure we can route this packet. */
300         rt = (struct rtable *)__sk_dst_check(sk, 0);
301         if (rt == NULL) {
302                 u32 daddr;
303
304                 /* Use correct destination address if we have options. */
305                 daddr = inet->daddr;
306                 if(opt && opt->srr)
307                         daddr = opt->faddr;
308
309                 {
310                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
311                                             .nl_u = { .ip4_u =
312                                                       { .daddr = daddr,
313                                                         .saddr = inet->saddr,
314                                                         .tos = RT_CONN_FLAGS(sk) } },
315                                             .proto = sk->sk_protocol,
316                                             .uli_u = { .ports =
317                                                        { .sport = inet->sport,
318                                                          .dport = inet->dport } } };
319
320                         /* If this fails, retransmit mechanism of transport layer will
321                          * keep trying until route appears or the connection times
322                          * itself out.
323                          */
324                         if (ip_route_output_flow(&rt, &fl, sk, 0))
325                                 goto no_route;
326                 }
327                 sk_setup_caps(sk, &rt->u.dst);
328         }
329         skb->dst = dst_clone(&rt->u.dst);
330
331 packet_routed:
332         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
333                 goto no_route;
334
335         /* OK, we know where to send it, allocate and build IP header. */
336         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
337         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
338         iph->tot_len = htons(skb->len);
339         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
340                 iph->frag_off = htons(IP_DF);
341         else
342                 iph->frag_off = 0;
343         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
344         iph->protocol = sk->sk_protocol;
345         iph->saddr    = rt->rt_src;
346         iph->daddr    = rt->rt_dst;
347         skb->nh.iph   = iph;
348         /* Transport layer set skb->h.foo itself. */
349
350         if (opt && opt->optlen) {
351                 iph->ihl += opt->optlen >> 2;
352                 ip_options_build(skb, opt, inet->daddr, rt, 0);
353         }
354
355         ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
356
357         /* Add an IP checksum. */
358         ip_send_check(iph);
359
360         skb->priority = sk->sk_priority;
361
362         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
363                        dst_output);
364
365 no_route:
366         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
367         kfree_skb(skb);
368         return -EHOSTUNREACH;
369 }
370
371
372 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
373 {
374         to->pkt_type = from->pkt_type;
375         to->priority = from->priority;
376         to->protocol = from->protocol;
377         dst_release(to->dst);
378         to->dst = dst_clone(from->dst);
379         to->dev = from->dev;
380
381         /* Copy the flags to each fragment. */
382         IPCB(to)->flags = IPCB(from)->flags;
383
384 #ifdef CONFIG_NET_SCHED
385         to->tc_index = from->tc_index;
386 #endif
387 #ifdef CONFIG_NETFILTER
388         to->nfmark = from->nfmark;
389         /* Connection association is same as pre-frag packet */
390         nf_conntrack_put(to->nfct);
391         to->nfct = from->nfct;
392         nf_conntrack_get(to->nfct);
393         to->nfctinfo = from->nfctinfo;
394 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
395         to->ipvs_property = from->ipvs_property;
396 #endif
397 #ifdef CONFIG_BRIDGE_NETFILTER
398         nf_bridge_put(to->nf_bridge);
399         to->nf_bridge = from->nf_bridge;
400         nf_bridge_get(to->nf_bridge);
401 #endif
402 #endif
403 }
404
405 /*
406  *      This IP datagram is too large to be sent in one piece.  Break it up into
407  *      smaller pieces (each of size equal to IP header plus
408  *      a block of the data of the original IP data part) that will yet fit in a
409  *      single device frame, and queue such a frame for sending.
410  */
411
412 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
413 {
414         struct iphdr *iph;
415         int raw = 0;
416         int ptr;
417         struct net_device *dev;
418         struct sk_buff *skb2;
419         unsigned int mtu, hlen, left, len, ll_rs;
420         int offset;
421         int not_last_frag;
422         struct rtable *rt = (struct rtable*)skb->dst;
423         int err = 0;
424
425         dev = rt->u.dst.dev;
426
427         /*
428          *      Point into the IP datagram header.
429          */
430
431         iph = skb->nh.iph;
432
433         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
434                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
435                           htonl(dst_mtu(&rt->u.dst)));
436                 kfree_skb(skb);
437                 return -EMSGSIZE;
438         }
439
440         /*
441          *      Setup starting values.
442          */
443
444         hlen = iph->ihl * 4;
445         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
446
447         /* When frag_list is given, use it. First, check its validity:
448          * some transformers could create wrong frag_list or break existing
449          * one, it is not prohibited. In this case fall back to copying.
450          *
451          * LATER: this step can be merged to real generation of fragments,
452          * we can switch to copy when see the first bad fragment.
453          */
454         if (skb_shinfo(skb)->frag_list) {
455                 struct sk_buff *frag;
456                 int first_len = skb_pagelen(skb);
457
458                 if (first_len - hlen > mtu ||
459                     ((first_len - hlen) & 7) ||
460                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
461                     skb_cloned(skb))
462                         goto slow_path;
463
464                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
465                         /* Correct geometry. */
466                         if (frag->len > mtu ||
467                             ((frag->len & 7) && frag->next) ||
468                             skb_headroom(frag) < hlen)
469                             goto slow_path;
470
471                         /* Partially cloned skb? */
472                         if (skb_shared(frag))
473                                 goto slow_path;
474
475                         BUG_ON(frag->sk);
476                         if (skb->sk) {
477                                 sock_hold(skb->sk);
478                                 frag->sk = skb->sk;
479                                 frag->destructor = sock_wfree;
480                                 skb->truesize -= frag->truesize;
481                         }
482                 }
483
484                 /* Everything is OK. Generate! */
485
486                 err = 0;
487                 offset = 0;
488                 frag = skb_shinfo(skb)->frag_list;
489                 skb_shinfo(skb)->frag_list = NULL;
490                 skb->data_len = first_len - skb_headlen(skb);
491                 skb->len = first_len;
492                 iph->tot_len = htons(first_len);
493                 iph->frag_off = htons(IP_MF);
494                 ip_send_check(iph);
495
496                 for (;;) {
497                         /* Prepare header of the next frame,
498                          * before previous one went down. */
499                         if (frag) {
500                                 frag->ip_summed = CHECKSUM_NONE;
501                                 frag->h.raw = frag->data;
502                                 frag->nh.raw = __skb_push(frag, hlen);
503                                 memcpy(frag->nh.raw, iph, hlen);
504                                 iph = frag->nh.iph;
505                                 iph->tot_len = htons(frag->len);
506                                 ip_copy_metadata(frag, skb);
507                                 if (offset == 0)
508                                         ip_options_fragment(frag);
509                                 offset += skb->len - hlen;
510                                 iph->frag_off = htons(offset>>3);
511                                 if (frag->next != NULL)
512                                         iph->frag_off |= htons(IP_MF);
513                                 /* Ready, complete checksum */
514                                 ip_send_check(iph);
515                         }
516
517                         err = output(skb);
518
519                         if (err || !frag)
520                                 break;
521
522                         skb = frag;
523                         frag = skb->next;
524                         skb->next = NULL;
525                 }
526
527                 if (err == 0) {
528                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
529                         return 0;
530                 }
531
532                 while (frag) {
533                         skb = frag->next;
534                         kfree_skb(frag);
535                         frag = skb;
536                 }
537                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
538                 return err;
539         }
540
541 slow_path:
542         left = skb->len - hlen;         /* Space per frame */
543         ptr = raw + hlen;               /* Where to start from */
544
545 #ifdef CONFIG_BRIDGE_NETFILTER
546         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
547          * we need to make room for the encapsulating header */
548         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
549         mtu -= nf_bridge_pad(skb);
550 #else
551         ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
552 #endif
553         /*
554          *      Fragment the datagram.
555          */
556
557         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
558         not_last_frag = iph->frag_off & htons(IP_MF);
559
560         /*
561          *      Keep copying data until we run out.
562          */
563
564         while(left > 0) {
565                 len = left;
566                 /* IF: it doesn't fit, use 'mtu' - the data space left */
567                 if (len > mtu)
568                         len = mtu;
569                 /* IF: we are not sending upto and including the packet end
570                    then align the next start on an eight byte boundary */
571                 if (len < left) {
572                         len &= ~7;
573                 }
574                 /*
575                  *      Allocate buffer.
576                  */
577
578                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
579                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
580                         err = -ENOMEM;
581                         goto fail;
582                 }
583
584                 /*
585                  *      Set up data on packet
586                  */
587
588                 ip_copy_metadata(skb2, skb);
589                 skb_reserve(skb2, ll_rs);
590                 skb_put(skb2, len + hlen);
591                 skb2->nh.raw = skb2->data;
592                 skb2->h.raw = skb2->data + hlen;
593
594                 /*
595                  *      Charge the memory for the fragment to any owner
596                  *      it might possess
597                  */
598
599                 if (skb->sk)
600                         skb_set_owner_w(skb2, skb->sk);
601
602                 /*
603                  *      Copy the packet header into the new buffer.
604                  */
605
606                 memcpy(skb2->nh.raw, skb->data, hlen);
607
608                 /*
609                  *      Copy a block of the IP datagram.
610                  */
611                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
612                         BUG();
613                 left -= len;
614
615                 /*
616                  *      Fill in the new header fields.
617                  */
618                 iph = skb2->nh.iph;
619                 iph->frag_off = htons((offset >> 3));
620
621                 /* ANK: dirty, but effective trick. Upgrade options only if
622                  * the segment to be fragmented was THE FIRST (otherwise,
623                  * options are already fixed) and make it ONCE
624                  * on the initial skb, so that all the following fragments
625                  * will inherit fixed options.
626                  */
627                 if (offset == 0)
628                         ip_options_fragment(skb);
629
630                 /*
631                  *      Added AC : If we are fragmenting a fragment that's not the
632                  *                 last fragment then keep MF on each bit
633                  */
634                 if (left > 0 || not_last_frag)
635                         iph->frag_off |= htons(IP_MF);
636                 ptr += len;
637                 offset += len;
638
639                 /*
640                  *      Put this fragment into the sending queue.
641                  */
642
643                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
644
645                 iph->tot_len = htons(len + hlen);
646
647                 ip_send_check(iph);
648
649                 err = output(skb2);
650                 if (err)
651                         goto fail;
652         }
653         kfree_skb(skb);
654         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
655         return err;
656
657 fail:
658         kfree_skb(skb); 
659         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
660         return err;
661 }
662
663 int
664 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
665 {
666         struct iovec *iov = from;
667
668         if (skb->ip_summed == CHECKSUM_HW) {
669                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
670                         return -EFAULT;
671         } else {
672                 unsigned int csum = 0;
673                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
674                         return -EFAULT;
675                 skb->csum = csum_block_add(skb->csum, csum, odd);
676         }
677         return 0;
678 }
679
/*
 * Checksum 'copy' bytes of 'page' starting at byte 'offset'.  The page is
 * mapped with kmap() only for the duration of the computation.
 */
static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
        char *mapped = kmap(page);
        unsigned int sum = csum_partial(mapped + offset, copy, 0);

        kunmap(page);
        return sum;
}
690
691 /*
692  *      ip_append_data() and ip_append_page() can make one large IP datagram
693  *      from many pieces of data. Each piece will be held on the socket
694  *      until ip_push_pending_frames() is called. Each piece can be a page
695  *      or non-page data.
696  *      
697  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
698  *      this interface potentially.
699  *
700  *      LATER: length must be adjusted by pad at tail, when it is required.
701  */
702 int ip_append_data(struct sock *sk,
703                    int getfrag(void *from, char *to, int offset, int len,
704                                int odd, struct sk_buff *skb),
705                    void *from, int length, int transhdrlen,
706                    struct ipcm_cookie *ipc, struct rtable *rt,
707                    unsigned int flags)
708 {
709         struct inet_sock *inet = inet_sk(sk);
710         struct sk_buff *skb;
711
712         struct ip_options *opt = NULL;
713         int hh_len;
714         int exthdrlen;
715         int mtu;
716         int copy;
717         int err;
718         int offset = 0;
719         unsigned int maxfraglen, fragheaderlen;
720         int csummode = CHECKSUM_NONE;
721
722         if (flags&MSG_PROBE)
723                 return 0;
724
725         if (skb_queue_empty(&sk->sk_write_queue)) {
726                 /*
727                  * setup for corking.
728                  */
729                 opt = ipc->opt;
730                 if (opt) {
731                         if (inet->cork.opt == NULL) {
732                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
733                                 if (unlikely(inet->cork.opt == NULL))
734                                         return -ENOBUFS;
735                         }
736                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
737                         inet->cork.flags |= IPCORK_OPT;
738                         inet->cork.addr = ipc->addr;
739                 }
740                 dst_hold(&rt->u.dst);
741                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
742                 inet->cork.rt = rt;
743                 inet->cork.length = 0;
744                 sk->sk_sndmsg_page = NULL;
745                 sk->sk_sndmsg_off = 0;
746                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
747                         length += exthdrlen;
748                         transhdrlen += exthdrlen;
749                 }
750         } else {
751                 rt = inet->cork.rt;
752                 if (inet->cork.flags & IPCORK_OPT)
753                         opt = inet->cork.opt;
754
755                 transhdrlen = 0;
756                 exthdrlen = 0;
757                 mtu = inet->cork.fragsize;
758         }
759         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
760
761         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
762         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
763
764         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
765                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
766                 return -EMSGSIZE;
767         }
768
769         /*
770          * transhdrlen > 0 means that this is the first fragment and we wish
771          * it won't be fragmented in the future.
772          */
773         if (transhdrlen &&
774             length + fragheaderlen <= mtu &&
775             rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
776             !exthdrlen)
777                 csummode = CHECKSUM_HW;
778
779         inet->cork.length += length;
780
781         /* So, what's going on in the loop below?
782          *
783          * We use calculated fragment length to generate chained skb,
784          * each of segments is IP fragment ready for sending to network after
785          * adding appropriate IP header.
786          */
787
788         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
789                 goto alloc_new_skb;
790
791         while (length > 0) {
792                 /* Check if the remaining data fits into current packet. */
793                 copy = mtu - skb->len;
794                 if (copy < length)
795                         copy = maxfraglen - skb->len;
796                 if (copy <= 0) {
797                         char *data;
798                         unsigned int datalen;
799                         unsigned int fraglen;
800                         unsigned int fraggap;
801                         unsigned int alloclen;
802                         struct sk_buff *skb_prev;
803 alloc_new_skb:
804                         skb_prev = skb;
805                         if (skb_prev)
806                                 fraggap = skb_prev->len - maxfraglen;
807                         else
808                                 fraggap = 0;
809
810                         /*
811                          * If remaining data exceeds the mtu,
812                          * we know we need more fragment(s).
813                          */
814                         datalen = length + fraggap;
815                         if (datalen > mtu - fragheaderlen)
816                                 datalen = maxfraglen - fragheaderlen;
817                         fraglen = datalen + fragheaderlen;
818
819                         if ((flags & MSG_MORE) && 
820                             !(rt->u.dst.dev->features&NETIF_F_SG))
821                                 alloclen = mtu;
822                         else
823                                 alloclen = datalen + fragheaderlen;
824
825                         /* The last fragment gets additional space at tail.
826                          * Note, with MSG_MORE we overallocate on fragments,
827                          * because we have no idea what fragment will be
828                          * the last.
829                          */
830                         if (datalen == length)
831                                 alloclen += rt->u.dst.trailer_len;
832
833                         if (transhdrlen) {
834                                 skb = sock_alloc_send_skb(sk, 
835                                                 alloclen + hh_len + 15,
836                                                 (flags & MSG_DONTWAIT), &err);
837                         } else {
838                                 skb = NULL;
839                                 if (atomic_read(&sk->sk_wmem_alloc) <=
840                                     2 * sk->sk_sndbuf)
841                                         skb = sock_wmalloc(sk, 
842                                                            alloclen + hh_len + 15, 1,
843                                                            sk->sk_allocation);
844                                 if (unlikely(skb == NULL))
845                                         err = -ENOBUFS;
846                         }
847                         if (skb == NULL)
848                                 goto error;
849
850                         /*
851                          *      Fill in the control structures
852                          */
853                         skb->ip_summed = csummode;
854                         skb->csum = 0;
855                         skb_reserve(skb, hh_len);
856
857                         /*
858                          *      Find where to start putting bytes.
859                          */
860                         data = skb_put(skb, fraglen);
861                         skb->nh.raw = data + exthdrlen;
862                         data += fragheaderlen;
863                         skb->h.raw = data + exthdrlen;
864
865                         if (fraggap) {
866                                 skb->csum = skb_copy_and_csum_bits(
867                                         skb_prev, maxfraglen,
868                                         data + transhdrlen, fraggap, 0);
869                                 skb_prev->csum = csum_sub(skb_prev->csum,
870                                                           skb->csum);
871                                 data += fraggap;
872                                 skb_trim(skb_prev, maxfraglen);
873                         }
874
875                         copy = datalen - transhdrlen - fraggap;
876                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
877                                 err = -EFAULT;
878                                 kfree_skb(skb);
879                                 goto error;
880                         }
881
882                         offset += copy;
883                         length -= datalen - fraggap;
884                         transhdrlen = 0;
885                         exthdrlen = 0;
886                         csummode = CHECKSUM_NONE;
887
888                         /*
889                          * Put the packet on the pending queue.
890                          */
891                         __skb_queue_tail(&sk->sk_write_queue, skb);
892                         continue;
893                 }
894
895                 if (copy > length)
896                         copy = length;
897
898                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
899                         unsigned int off;
900
901                         off = skb->len;
902                         if (getfrag(from, skb_put(skb, copy), 
903                                         offset, copy, off, skb) < 0) {
904                                 __skb_trim(skb, off);
905                                 err = -EFAULT;
906                                 goto error;
907                         }
908                 } else {
909                         int i = skb_shinfo(skb)->nr_frags;
910                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
911                         struct page *page = sk->sk_sndmsg_page;
912                         int off = sk->sk_sndmsg_off;
913                         unsigned int left;
914
915                         if (page && (left = PAGE_SIZE - off) > 0) {
916                                 if (copy >= left)
917                                         copy = left;
918                                 if (page != frag->page) {
919                                         if (i == MAX_SKB_FRAGS) {
920                                                 err = -EMSGSIZE;
921                                                 goto error;
922                                         }
923                                         get_page(page);
924                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
925                                         frag = &skb_shinfo(skb)->frags[i];
926                                 }
927                         } else if (i < MAX_SKB_FRAGS) {
928                                 if (copy > PAGE_SIZE)
929                                         copy = PAGE_SIZE;
930                                 page = alloc_pages(sk->sk_allocation, 0);
931                                 if (page == NULL)  {
932                                         err = -ENOMEM;
933                                         goto error;
934                                 }
935                                 sk->sk_sndmsg_page = page;
936                                 sk->sk_sndmsg_off = 0;
937
938                                 skb_fill_page_desc(skb, i, page, 0, 0);
939                                 frag = &skb_shinfo(skb)->frags[i];
940                                 skb->truesize += PAGE_SIZE;
941                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
942                         } else {
943                                 err = -EMSGSIZE;
944                                 goto error;
945                         }
946                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
947                                 err = -EFAULT;
948                                 goto error;
949                         }
950                         sk->sk_sndmsg_off += copy;
951                         frag->size += copy;
952                         skb->len += copy;
953                         skb->data_len += copy;
954                 }
955                 offset += copy;
956                 length -= copy;
957         }
958
959         return 0;
960
961 error:
962         inet->cork.length -= length;
963         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
964         return err; 
965 }
966
967 ssize_t ip_append_page(struct sock *sk, struct page *page,
968                        int offset, size_t size, int flags)
969 {
970         struct inet_sock *inet = inet_sk(sk);
971         struct sk_buff *skb;
972         struct rtable *rt;
973         struct ip_options *opt = NULL;
974         int hh_len;
975         int mtu;
976         int len;
977         int err;
978         unsigned int maxfraglen, fragheaderlen, fraggap;
979
980         if (inet->hdrincl)
981                 return -EPERM;
982
983         if (flags&MSG_PROBE)
984                 return 0;
985
986         if (skb_queue_empty(&sk->sk_write_queue))
987                 return -EINVAL;
988
989         rt = inet->cork.rt;
990         if (inet->cork.flags & IPCORK_OPT)
991                 opt = inet->cork.opt;
992
993         if (!(rt->u.dst.dev->features&NETIF_F_SG))
994                 return -EOPNOTSUPP;
995
996         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
997         mtu = inet->cork.fragsize;
998
999         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1000         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1001
1002         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1003                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1004                 return -EMSGSIZE;
1005         }
1006
1007         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1008                 return -EINVAL;
1009
1010         inet->cork.length += size;
1011
1012         while (size > 0) {
1013                 int i;
1014
1015                 /* Check if the remaining data fits into current packet. */
1016                 len = mtu - skb->len;
1017                 if (len < size)
1018                         len = maxfraglen - skb->len;
1019                 if (len <= 0) {
1020                         struct sk_buff *skb_prev;
1021                         char *data;
1022                         struct iphdr *iph;
1023                         int alloclen;
1024
1025                         skb_prev = skb;
1026                         fraggap = skb_prev->len - maxfraglen;
1027
1028                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1029                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1030                         if (unlikely(!skb)) {
1031                                 err = -ENOBUFS;
1032                                 goto error;
1033                         }
1034
1035                         /*
1036                          *      Fill in the control structures
1037                          */
1038                         skb->ip_summed = CHECKSUM_NONE;
1039                         skb->csum = 0;
1040                         skb_reserve(skb, hh_len);
1041
1042                         /*
1043                          *      Find where to start putting bytes.
1044                          */
1045                         data = skb_put(skb, fragheaderlen + fraggap);
1046                         skb->nh.iph = iph = (struct iphdr *)data;
1047                         data += fragheaderlen;
1048                         skb->h.raw = data;
1049
1050                         if (fraggap) {
1051                                 skb->csum = skb_copy_and_csum_bits(
1052                                         skb_prev, maxfraglen,
1053                                         data, fraggap, 0);
1054                                 skb_prev->csum = csum_sub(skb_prev->csum,
1055                                                           skb->csum);
1056                                 skb_trim(skb_prev, maxfraglen);
1057                         }
1058
1059                         /*
1060                          * Put the packet on the pending queue.
1061                          */
1062                         __skb_queue_tail(&sk->sk_write_queue, skb);
1063                         continue;
1064                 }
1065
1066                 i = skb_shinfo(skb)->nr_frags;
1067                 if (len > size)
1068                         len = size;
1069                 if (skb_can_coalesce(skb, i, page, offset)) {
1070                         skb_shinfo(skb)->frags[i-1].size += len;
1071                 } else if (i < MAX_SKB_FRAGS) {
1072                         get_page(page);
1073                         skb_fill_page_desc(skb, i, page, offset, len);
1074                 } else {
1075                         err = -EMSGSIZE;
1076                         goto error;
1077                 }
1078
1079                 if (skb->ip_summed == CHECKSUM_NONE) {
1080                         unsigned int csum;
1081                         csum = csum_page(page, offset, len);
1082                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1083                 }
1084
1085                 skb->len += len;
1086                 skb->data_len += len;
1087                 offset += len;
1088                 size -= len;
1089         }
1090         return 0;
1091
1092 error:
1093         inet->cork.length -= size;
1094         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1095         return err;
1096 }
1097
1098 /*
1099  *      Combined all pending IP fragments on the socket as one IP datagram
1100  *      and push them out.
1101  */
1102 int ip_push_pending_frames(struct sock *sk)
1103 {
1104         struct sk_buff *skb, *tmp_skb;
1105         struct sk_buff **tail_skb;
1106         struct inet_sock *inet = inet_sk(sk);
1107         struct ip_options *opt = NULL;
1108         struct rtable *rt = inet->cork.rt;
1109         struct iphdr *iph;
1110         int df = 0;
1111         __u8 ttl;
1112         int err = 0;
1113
1114         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1115                 goto out;
1116         tail_skb = &(skb_shinfo(skb)->frag_list);
1117
1118         /* move skb->data to ip header from ext header */
1119         if (skb->data < skb->nh.raw)
1120                 __skb_pull(skb, skb->nh.raw - skb->data);
1121         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1122                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1123                 *tail_skb = tmp_skb;
1124                 tail_skb = &(tmp_skb->next);
1125                 skb->len += tmp_skb->len;
1126                 skb->data_len += tmp_skb->len;
1127                 skb->truesize += tmp_skb->truesize;
1128                 __sock_put(tmp_skb->sk);
1129                 tmp_skb->destructor = NULL;
1130                 tmp_skb->sk = NULL;
1131         }
1132
1133         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1134          * to fragment the frame generated here. No matter, what transforms
1135          * how transforms change size of the packet, it will come out.
1136          */
1137         if (inet->pmtudisc != IP_PMTUDISC_DO)
1138                 skb->local_df = 1;
1139
1140         /* DF bit is set when we want to see DF on outgoing frames.
1141          * If local_df is set too, we still allow to fragment this frame
1142          * locally. */
1143         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1144             (skb->len <= dst_mtu(&rt->u.dst) &&
1145              ip_dont_fragment(sk, &rt->u.dst)))
1146                 df = htons(IP_DF);
1147
1148         if (inet->cork.flags & IPCORK_OPT)
1149                 opt = inet->cork.opt;
1150
1151         if (rt->rt_type == RTN_MULTICAST)
1152                 ttl = inet->mc_ttl;
1153         else
1154                 ttl = ip_select_ttl(inet, &rt->u.dst);
1155
1156         iph = (struct iphdr *)skb->data;
1157         iph->version = 4;
1158         iph->ihl = 5;
1159         if (opt) {
1160                 iph->ihl += opt->optlen>>2;
1161                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1162         }
1163         iph->tos = inet->tos;
1164         iph->tot_len = htons(skb->len);
1165         iph->frag_off = df;
1166         if (!df) {
1167                 __ip_select_ident(iph, &rt->u.dst, 0);
1168         } else {
1169                 iph->id = htons(inet->id++);
1170         }
1171         iph->ttl = ttl;
1172         iph->protocol = sk->sk_protocol;
1173         iph->saddr = rt->rt_src;
1174         iph->daddr = rt->rt_dst;
1175         ip_send_check(iph);
1176
1177         skb->priority = sk->sk_priority;
1178         skb->dst = dst_clone(&rt->u.dst);
1179
1180         /* Netfilter gets whole the not fragmented skb. */
1181         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, 
1182                       skb->dst->dev, dst_output);
1183         if (err) {
1184                 if (err > 0)
1185                         err = inet->recverr ? net_xmit_errno(err) : 0;
1186                 if (err)
1187                         goto error;
1188         }
1189
1190 out:
1191         inet->cork.flags &= ~IPCORK_OPT;
1192         if (inet->cork.opt) {
1193                 kfree(inet->cork.opt);
1194                 inet->cork.opt = NULL;
1195         }
1196         if (inet->cork.rt) {
1197                 ip_rt_put(inet->cork.rt);
1198                 inet->cork.rt = NULL;
1199         }
1200         return err;
1201
1202 error:
1203         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1204         goto out;
1205 }
1206
1207 /*
1208  *      Throw away all pending data on the socket.
1209  */
1210 void ip_flush_pending_frames(struct sock *sk)
1211 {
1212         struct inet_sock *inet = inet_sk(sk);
1213         struct sk_buff *skb;
1214
1215         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1216                 kfree_skb(skb);
1217
1218         inet->cork.flags &= ~IPCORK_OPT;
1219         if (inet->cork.opt) {
1220                 kfree(inet->cork.opt);
1221                 inet->cork.opt = NULL;
1222         }
1223         if (inet->cork.rt) {
1224                 ip_rt_put(inet->cork.rt);
1225                 inet->cork.rt = NULL;
1226         }
1227 }
1228
1229
1230 /*
1231  *      Fetch data from kernel space and fill in checksum if needed.
1232  */
1233 static int ip_reply_glue_bits(void *dptr, char *to, int offset, 
1234                               int len, int odd, struct sk_buff *skb)
1235 {
1236         unsigned int csum;
1237
1238         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1239         skb->csum = csum_block_add(skb->csum, csum, odd);
1240         return 0;  
1241 }
1242
1243 /* 
1244  *      Generic function to send a packet as reply to another packet.
1245  *      Used to send TCP resets so far. ICMP should use this function too.
1246  *
1247  *      Should run single threaded per socket because it uses the sock 
1248  *      structure to pass arguments.
1249  *
1250  *      LATER: switch from ip_build_xmit to ip_append_*
1251  */
1252 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1253                    unsigned int len)
1254 {
1255         struct inet_sock *inet = inet_sk(sk);
1256         struct {
1257                 struct ip_options       opt;
1258                 char                    data[40];
1259         } replyopts;
1260         struct ipcm_cookie ipc;
1261         u32 daddr;
1262         struct rtable *rt = (struct rtable*)skb->dst;
1263
1264         if (ip_options_echo(&replyopts.opt, skb))
1265                 return;
1266
1267         daddr = ipc.addr = rt->rt_src;
1268         ipc.opt = NULL;
1269
1270         if (replyopts.opt.optlen) {
1271                 ipc.opt = &replyopts.opt;
1272
1273                 if (ipc.opt->srr)
1274                         daddr = replyopts.opt.faddr;
1275         }
1276
1277         {
1278                 struct flowi fl = { .nl_u = { .ip4_u =
1279                                               { .daddr = daddr,
1280                                                 .saddr = rt->rt_spec_dst,
1281                                                 .tos = RT_TOS(skb->nh.iph->tos) } },
1282                                     /* Not quite clean, but right. */
1283                                     .uli_u = { .ports =
1284                                                { .sport = skb->h.th->dest,
1285                                                  .dport = skb->h.th->source } },
1286                                     .proto = sk->sk_protocol };
1287                 if (ip_route_output_key(&rt, &fl))
1288                         return;
1289         }
1290
1291         /* And let IP do all the hard work.
1292
1293            This chunk is not reenterable, hence spinlock.
1294            Note that it uses the fact, that this function is called
1295            with locally disabled BH and that sk cannot be already spinlocked.
1296          */
1297         bh_lock_sock(sk);
1298         inet->tos = skb->nh.iph->tos;
1299         sk->sk_priority = skb->priority;
1300         sk->sk_protocol = skb->nh.iph->protocol;
1301         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1302                        &ipc, rt, MSG_DONTWAIT);
1303         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1304                 if (arg->csumoffset >= 0)
1305                         *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1306                 skb->ip_summed = CHECKSUM_NONE;
1307                 ip_push_pending_frames(sk);
1308         }
1309
1310         bh_unlock_sock(sk);
1311
1312         ip_rt_put(rt);
1313 }
1314
1315 void __init ip_init(void)
1316 {
1317         ip_rt_init();
1318         inet_initpeers();
1319
1320 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1321         igmp_mc_proc_init();
1322 #endif
1323 }
1324
1325 EXPORT_SYMBOL(ip_fragment);
1326 EXPORT_SYMBOL(ip_generic_getfrag);
1327 EXPORT_SYMBOL(ip_queue_xmit);
1328 EXPORT_SYMBOL(ip_send_check);