Merge master.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-for-linus-2.6
[linux-2.6] / net / ipv4 / ip_output.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              The Internet Protocol (IP) output module.
7  *
8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Donald Becker, <becker@super.org>
13  *              Alan Cox, <Alan.Cox@linux.org>
14  *              Richard Underwood
15  *              Stefan Becker, <stefanb@yello.ping.de>
16  *              Jorge Cwik, <jorge@laser.satlink.net>
17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
19  *
20  *      See ip_input.c for original log
21  *
22  *      Fixes:
23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
25  *              Bradford Johnson:       Fix faulty handling of some frames when 
26  *                                      no route is found.
27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
28  *                                      (in case if packet not accepted by
29  *                                      output firewall rules)
30  *              Mike McLagan    :       Routing by source
31  *              Alexey Kuznetsov:       use new route cache
32  *              Andi Kleen:             Fix broken PMTU recovery and remove
33  *                                      some redundant tests.
34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path 
37  *                                      for decreased register pressure on x86 
38  *                                      and more readability.
39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
40  *                                      silently drop skb instead of failing with -EPERM.
41  *              Detlev Wengorz  :       Copy protocol for fragments.
42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
43  *                                      datagrams.
44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
45  */
46
47 #include <asm/uaccess.h>
48 #include <asm/system.h>
49 #include <linux/module.h>
50 #include <linux/types.h>
51 #include <linux/kernel.h>
52 #include <linux/sched.h>
53 #include <linux/mm.h>
54 #include <linux/string.h>
55 #include <linux/errno.h>
56 #include <linux/config.h>
57
58 #include <linux/socket.h>
59 #include <linux/sockios.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/etherdevice.h>
64 #include <linux/proc_fs.h>
65 #include <linux/stat.h>
66 #include <linux/init.h>
67
68 #include <net/snmp.h>
69 #include <net/ip.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <linux/skbuff.h>
73 #include <net/sock.h>
74 #include <net/arp.h>
75 #include <net/icmp.h>
76 #include <net/checksum.h>
77 #include <net/inetpeer.h>
78 #include <net/checksum.h>
79 #include <linux/igmp.h>
80 #include <linux/netfilter_ipv4.h>
81 #include <linux/netfilter_bridge.h>
82 #include <linux/mroute.h>
83 #include <linux/netlink.h>
84 #include <linux/tcp.h>
85
/*
 * Global default TTL value, initialised to IPDEFTTL.
 * NOTE(review): the sysctl_ prefix suggests this is tunable at run time
 * through a sysctl entry registered elsewhere - confirm.  Nothing in the
 * code visible here reads it directly.
 */
86 int sysctl_ip_default_ttl = IPDEFTTL;
87
88 /* Generate a checksum for an outgoing IP datagram. */
89 __inline__ void ip_send_check(struct iphdr *iph)
90 {
91         iph->check = 0;
92         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
93 }
94
95 /* dev_loopback_xmit for use with netfilter. */
96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
97 {
98         newskb->mac.raw = newskb->data;
99         __skb_pull(newskb, newskb->nh.raw - newskb->data);
100         newskb->pkt_type = PACKET_LOOPBACK;
101         newskb->ip_summed = CHECKSUM_UNNECESSARY;
102         BUG_TRAP(newskb->dst);
103         netif_rx(newskb);
104         return 0;
105 }
106
107 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
108 {
109         int ttl = inet->uc_ttl;
110
111         if (ttl < 0)
112                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
113         return ttl;
114 }
115
116 /* 
117  *              Add an ip header to a skbuff and send it out.
118  *
119  */
120 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
121                           u32 saddr, u32 daddr, struct ip_options *opt)
122 {
123         struct inet_sock *inet = inet_sk(sk);
124         struct rtable *rt = (struct rtable *)skb->dst;
125         struct iphdr *iph;
126
127         /* Build the IP header. */
128         if (opt)
129                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
130         else
131                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
132
133         iph->version  = 4;
134         iph->ihl      = 5;
135         iph->tos      = inet->tos;
136         if (ip_dont_fragment(sk, &rt->u.dst))
137                 iph->frag_off = htons(IP_DF);
138         else
139                 iph->frag_off = 0;
140         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
141         iph->daddr    = rt->rt_dst;
142         iph->saddr    = rt->rt_src;
143         iph->protocol = sk->sk_protocol;
144         iph->tot_len  = htons(skb->len);
145         ip_select_ident(iph, &rt->u.dst, sk);
146         skb->nh.iph   = iph;
147
148         if (opt && opt->optlen) {
149                 iph->ihl += opt->optlen>>2;
150                 ip_options_build(skb, opt, daddr, rt, 0);
151         }
152         ip_send_check(iph);
153
154         skb->priority = sk->sk_priority;
155
156         /* Send it out. */
157         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
158                        dst_output);
159 }
160
161 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
162
163 static inline int ip_finish_output2(struct sk_buff *skb)
164 {
165         struct dst_entry *dst = skb->dst;
166         struct hh_cache *hh = dst->hh;
167         struct net_device *dev = dst->dev;
168         int hh_len = LL_RESERVED_SPACE(dev);
169
170         /* Be paranoid, rather than too clever. */
171         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
172                 struct sk_buff *skb2;
173
174                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
175                 if (skb2 == NULL) {
176                         kfree_skb(skb);
177                         return -ENOMEM;
178                 }
179                 if (skb->sk)
180                         skb_set_owner_w(skb2, skb->sk);
181                 kfree_skb(skb);
182                 skb = skb2;
183         }
184
185         if (hh) {
186                 int hh_alen;
187
188                 read_lock_bh(&hh->hh_lock);
189                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
190                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
191                 read_unlock_bh(&hh->hh_lock);
192                 skb_push(skb, hh->hh_len);
193                 return hh->hh_output(skb);
194         } else if (dst->neighbour)
195                 return dst->neighbour->output(skb);
196
197         if (net_ratelimit())
198                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
199         kfree_skb(skb);
200         return -EINVAL;
201 }
202
203 static inline int ip_finish_output(struct sk_buff *skb)
204 {
205         struct net_device *dev = skb->dst->dev;
206
207         skb->dev = dev;
208         skb->protocol = htons(ETH_P_IP);
209
210         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
211                        ip_finish_output2);
212 }
213
214 int ip_mc_output(struct sk_buff *skb)
215 {
216         struct sock *sk = skb->sk;
217         struct rtable *rt = (struct rtable*)skb->dst;
218         struct net_device *dev = rt->u.dst.dev;
219
220         /*
221          *      If the indicated interface is up and running, send the packet.
222          */
223         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
224
225         skb->dev = dev;
226         skb->protocol = htons(ETH_P_IP);
227
228         /*
229          *      Multicasts are looped back for other local users
230          */
231
232         if (rt->rt_flags&RTCF_MULTICAST) {
233                 if ((!sk || inet_sk(sk)->mc_loop)
234 #ifdef CONFIG_IP_MROUTE
235                 /* Small optimization: do not loopback not local frames,
236                    which returned after forwarding; they will be  dropped
237                    by ip_mr_input in any case.
238                    Note, that local frames are looped back to be delivered
239                    to local recipients.
240
241                    This check is duplicated in ip_mr_input at the moment.
242                  */
243                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
244 #endif
245                 ) {
246                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
247                         if (newskb)
248                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
249                                         newskb->dev, 
250                                         ip_dev_loopback_xmit);
251                 }
252
253                 /* Multicasts with ttl 0 must not go beyond the host */
254
255                 if (skb->nh.iph->ttl == 0) {
256                         kfree_skb(skb);
257                         return 0;
258                 }
259         }
260
261         if (rt->rt_flags&RTCF_BROADCAST) {
262                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
263                 if (newskb)
264                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
265                                 newskb->dev, ip_dev_loopback_xmit);
266         }
267
268         if (skb->len > dst_mtu(&rt->u.dst))
269                 return ip_fragment(skb, ip_finish_output);
270         else
271                 return ip_finish_output(skb);
272 }
273
274 int ip_output(struct sk_buff *skb)
275 {
276         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
277
278         if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
279                 return ip_fragment(skb, ip_finish_output);
280         else
281                 return ip_finish_output(skb);
282 }
283
/*
 * Build the IP header for a socket-owned skb and send it.
 *
 * Route selection, in order: the route already attached to the skb, the
 * socket's cached route (__sk_dst_check), or a fresh lookup through
 * ip_route_output_flow().  The header, including any options found in
 * inet->opt, is then pushed in front of the existing data and the packet
 * is passed to the NF_IP_LOCAL_OUT hook with dst_output as continuation.
 *
 * skb:      packet to send; skb->sk is dereferenced unconditionally.
 * ipfragok: when non-zero, IP_DF is never set in the header.
 *
 * Returns the NF_HOOK() result, or -EHOSTUNREACH when no usable route
 * exists; in that case IPSTATS_MIB_OUTNOROUTES is incremented and the
 * skb is freed.
 */
284 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
285 {
286         struct sock *sk = skb->sk;
287         struct inet_sock *inet = inet_sk(sk);
288         struct ip_options *opt = inet->opt;
289         struct rtable *rt;
290         struct iphdr *iph;
291
292         /* Skip all of this if the packet is already routed,
293          * f.e. by something like SCTP.
294          */
295         rt = (struct rtable *) skb->dst;
296         if (rt != NULL)
297                 goto packet_routed;
298
299         /* Make sure we can route this packet. */
300         rt = (struct rtable *)__sk_dst_check(sk, 0);
301         if (rt == NULL) {
302                 u32 daddr;
303
304                 /* Use correct destination address if we have options. */
305                 daddr = inet->daddr;
306                 if(opt && opt->srr)
307                         daddr = opt->faddr;
308
309                 {
            /* Flow key for the lookup, filled in from the socket. */
310                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
311                                             .nl_u = { .ip4_u =
312                                                       { .daddr = daddr,
313                                                         .saddr = inet->saddr,
314                                                         .tos = RT_CONN_FLAGS(sk) } },
315                                             .proto = sk->sk_protocol,
316                                             .uli_u = { .ports =
317                                                        { .sport = inet->sport,
318                                                          .dport = inet->dport } } };
319
320                         /* If this fails, retransmit mechanism of transport layer will
321                          * keep trying until route appears or the connection times
322                          * itself out.
323                          */
324                         if (ip_route_output_flow(&rt, &fl, sk, 0))
325                                 goto no_route;
326                 }
327                 sk_setup_caps(sk, &rt->u.dst);
328         }
            /* Attach the route to the skb (dst_clone is applied to it first). */
329         skb->dst = dst_clone(&rt->u.dst);
330
331 packet_routed:
            /* With a strict source route the route's destination must equal
             * its gateway; otherwise the packet is dropped as unroutable. */
332         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
333                 goto no_route;
334
335         /* OK, we know where to send it, allocate and build IP header. */
336         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
            /* One 16-bit store sets version (4), ihl (5) and the TOS byte. */
337         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
338         iph->tot_len = htons(skb->len);
339         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
340                 iph->frag_off = htons(IP_DF);
341         else
342                 iph->frag_off = 0;
343         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
344         iph->protocol = sk->sk_protocol;
345         iph->saddr    = rt->rt_src;
346         iph->daddr    = rt->rt_dst;
347         skb->nh.iph   = iph;
348         /* Transport layer set skb->h.foo itself. */
349
            /* Options lengthen the header; optlen is converted to 4-byte units. */
350         if (opt && opt->optlen) {
351                 iph->ihl += opt->optlen >> 2;
352                 ip_options_build(skb, opt, inet->daddr, rt, 0);
353         }
354
            /* The skb's tso_segs value is passed along to the ident selection. */
355         ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
356
357         /* Add an IP checksum. */
358         ip_send_check(iph);
359
360         skb->priority = sk->sk_priority;
361
362         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
363                        dst_output);
364
365 no_route:
            /* No usable route: count it, drop the packet, report unreachable. */
366         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
367         kfree_skb(skb);
368         return -EHOSTUNREACH;
369 }
370
371
372 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
373 {
374         to->pkt_type = from->pkt_type;
375         to->priority = from->priority;
376         to->protocol = from->protocol;
377         dst_release(to->dst);
378         to->dst = dst_clone(from->dst);
379         to->dev = from->dev;
380
381         /* Copy the flags to each fragment. */
382         IPCB(to)->flags = IPCB(from)->flags;
383
384 #ifdef CONFIG_NET_SCHED
385         to->tc_index = from->tc_index;
386 #endif
387 #ifdef CONFIG_NETFILTER
388         to->nfmark = from->nfmark;
389         /* Connection association is same as pre-frag packet */
390         nf_conntrack_put(to->nfct);
391         to->nfct = from->nfct;
392         nf_conntrack_get(to->nfct);
393         to->nfctinfo = from->nfctinfo;
394 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
395         to->ipvs_property = from->ipvs_property;
396 #endif
397 #ifdef CONFIG_BRIDGE_NETFILTER
398         nf_bridge_put(to->nf_bridge);
399         to->nf_bridge = from->nf_bridge;
400         nf_bridge_get(to->nf_bridge);
401 #endif
402 #endif
403 }
404
405 /*
406  *      This IP datagram is too large to be sent in one piece.  Break it up into
407  *      smaller pieces (each of size equal to IP header plus
408  *      a block of the data of the original IP data part) that will yet fit in a
409  *      single device frame, and queue such a frame for sending.
410  */
411
/*
 * skb:    datagram to fragment; skb->dst is dereferenced unconditionally,
 *         so a route must be attached.
 * output: called once per fragment; a non-zero return stops the job.
 *
 * Two strategies are used:
 *  - fast path: when skb carries a frag_list whose geometry passes the
 *    checks below, each list member gets a copy of the IP header pushed
 *    in front of it and is handed to output() without copying payload;
 *  - slow path: otherwise new skbs are allocated and header plus payload
 *    are copied into them piece by piece, after which skb is freed.
 *
 * If IP_DF is set and skb->local_df is clear, an ICMP_FRAG_NEEDED message
 * is sent, skb is freed and -EMSGSIZE is returned (no FRAG* counter is
 * touched on that path).
 *
 * Returns 0 on success, -ENOMEM if a fragment cannot be allocated, or the
 * first non-zero value returned by output().  IPSTATS_MIB_FRAGOKS or
 * IPSTATS_MIB_FRAGFAILS is incremented accordingly.
 */
412 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
413 {
414         struct iphdr *iph;
415         int raw = 0;    /* never modified; only added to hlen to form ptr */
416         int ptr;
417         struct net_device *dev;
418         struct sk_buff *skb2;
419         unsigned int mtu, hlen, left, len, ll_rs;
420         int offset;
421         int not_last_frag;
422         struct rtable *rt = (struct rtable*)skb->dst;
423         int err = 0;
424
425         dev = rt->u.dst.dev;
426
427         /*
428          *      Point into the IP datagram header.
429          */
430
431         iph = skb->nh.iph;
432
433         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
434                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
435                           htonl(dst_mtu(&rt->u.dst)));
436                 kfree_skb(skb);
437                 return -EMSGSIZE;
438         }
439
440         /*
441          *      Setup starting values.
442          */
443
444         hlen = iph->ihl * 4;
445         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
446
447         /* When frag_list is given, use it. First, check its validity:
448          * some transformers could create wrong frag_list or break existing
449          * one, it is not prohibited. In this case fall back to copying.
450          *
451          * LATER: this step can be merged to real generation of fragments,
452          * we can switch to copy when see the first bad fragment.
453          */
454         if (skb_shinfo(skb)->frag_list) {
455                 struct sk_buff *frag;
456                 int first_len = skb_pagelen(skb);
457
                    /* The head must fit, end on an 8-byte boundary, not already
                     * be a fragment, and not be cloned. */
458                 if (first_len - hlen > mtu ||
459                     ((first_len - hlen) & 7) ||
460                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
461                     skb_cloned(skb))
462                         goto slow_path;
463
                    /* NOTE(review): list members visited before a later one
                     * bails out to slow_path keep the sk/destructor assigned
                     * below, and the head's truesize stays reduced - confirm
                     * the slow path copes with that. */
464                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
465                         /* Correct geometry. */
466                         if (frag->len > mtu ||
467                             ((frag->len & 7) && frag->next) ||
468                             skb_headroom(frag) < hlen)
469                             goto slow_path;
470
471                         /* Partially cloned skb? */
472                         if (skb_shared(frag))
473                                 goto slow_path;
474
475                         BUG_ON(frag->sk);
                            /* Give the fragment its own owner and destructor;
                             * its truesize is taken off the head skb. */
476                         if (skb->sk) {
477                                 sock_hold(skb->sk);
478                                 frag->sk = skb->sk;
479                                 frag->destructor = sock_wfree;
480                                 skb->truesize -= frag->truesize;
481                         }
482                 }
483
484                 /* Everything is OK. Generate! */
485
486                 err = 0;
487                 offset = 0;
                    /* Detach the list and turn the head into the first
                     * fragment: own data only, MF set, offset 0. */
488                 frag = skb_shinfo(skb)->frag_list;
489                 skb_shinfo(skb)->frag_list = NULL;
490                 skb->data_len = first_len - skb_headlen(skb);
491                 skb->len = first_len;
492                 iph->tot_len = htons(first_len);
493                 iph->frag_off = htons(IP_MF);
494                 ip_send_check(iph);
495
496                 for (;;) {
497                         /* Prepare header of the next frame,
498                          * before previous one went down. */
499                         if (frag) {
500                                 frag->ip_summed = CHECKSUM_NONE;
501                                 frag->h.raw = frag->data;
502                                 frag->nh.raw = __skb_push(frag, hlen);
503                                 memcpy(frag->nh.raw, iph, hlen);
504                                 iph = frag->nh.iph;
505                                 iph->tot_len = htons(frag->len);
506                                 ip_copy_metadata(frag, skb);
507                                 if (offset == 0)
508                                         ip_options_fragment(frag);
                                    /* skb is still the previous fragment here, so
                                     * this advances by its payload length. */
509                                 offset += skb->len - hlen;
510                                 iph->frag_off = htons(offset>>3);
511                                 if (frag->next != NULL)
512                                         iph->frag_off |= htons(IP_MF);
513                                 /* Ready, complete checksum */
514                                 ip_send_check(iph);
515                         }
516
517                         err = output(skb);
518
519                         if (err || !frag)
520                                 break;
521
522                         skb = frag;
523                         frag = skb->next;
524                         skb->next = NULL;
525                 }
526
527                 if (err == 0) {
528                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
529                         return 0;
530                 }
531
                    /* output() failed: free the fragments not yet handed over. */
532                 while (frag) {
533                         skb = frag->next;
534                         kfree_skb(frag);
535                         frag = skb;
536                 }
537                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
538                 return err;
539         }
540
541 slow_path:
542         left = skb->len - hlen;         /* Payload bytes still to be sent */
543         ptr = raw + hlen;               /* Where to start from */
544
545 #ifdef CONFIG_BRIDGE_NETFILTER
546         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
547          * we need to make room for the encapsulating header */
548         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
549         mtu -= nf_bridge_pad(skb);
550 #else
551         ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
552 #endif
553         /*
554          *      Fragment the datagram.
555          */
556
            /* Start from the offset and MF state the datagram already has, so
             * that fragmenting a fragment keeps the numbering consistent. */
557         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
558         not_last_frag = iph->frag_off & htons(IP_MF);
559
560         /*
561          *      Keep copying data until we run out.
562          */
563
564         while(left > 0) {
565                 len = left;
566                 /* IF: it doesn't fit, use 'mtu' - the data space left */
567                 if (len > mtu)
568                         len = mtu;
569                 /* IF: we are not sending upto and including the packet end
570                    then align the next start on an eight byte boundary */
571                 if (len < left) {
572                         len &= ~7;
573                 }
574                 /*
575                  *      Allocate buffer.
576                  */
577
578                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
579                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
580                         err = -ENOMEM;
581                         goto fail;
582                 }
583
584                 /*
585                  *      Set up data on packet
586                  */
587
588                 ip_copy_metadata(skb2, skb);
589                 skb_reserve(skb2, ll_rs);
590                 skb_put(skb2, len + hlen);
591                 skb2->nh.raw = skb2->data;
592                 skb2->h.raw = skb2->data + hlen;
593
594                 /*
595                  *      Charge the memory for the fragment to any owner
596                  *      it might possess
597                  */
598
599                 if (skb->sk)
600                         skb_set_owner_w(skb2, skb->sk);
601
602                 /*
603                  *      Copy the packet header into the new buffer.
604                  */
605
606                 memcpy(skb2->nh.raw, skb->data, hlen);
607
608                 /*
609                  *      Copy a block of the IP datagram.
610                  */
611                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
612                         BUG();
613                 left -= len;
614
615                 /*
616                  *      Fill in the new header fields.
617                  */
618                 iph = skb2->nh.iph;
619                 iph->frag_off = htons((offset >> 3));
620
621                 /* ANK: dirty, but effective trick. Upgrade options only if
622                  * the segment to be fragmented was THE FIRST (otherwise,
623                  * options are already fixed) and make it ONCE
624                  * on the initial skb, so that all the following fragments
625                  * will inherit fixed options.
626                  */
627                 if (offset == 0)
628                         ip_options_fragment(skb);
629
630                 /*
631                  *      Added AC : If we are fragmenting a fragment that's not the
632                  *                 last fragment then keep MF on each bit
633                  */
634                 if (left > 0 || not_last_frag)
635                         iph->frag_off |= htons(IP_MF);
636                 ptr += len;
637                 offset += len;
638
639                 /*
640                  *      Put this fragment into the sending queue.
641                  */
642
643                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
644
645                 iph->tot_len = htons(len + hlen);
646
647                 ip_send_check(iph);
648
649                 err = output(skb2);
650                 if (err)
651                         goto fail;
652         }
            /* All payload has been copied out; the original is no longer needed. */
653         kfree_skb(skb);
654         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
655         return err;
656
657 fail:
658         kfree_skb(skb); 
659         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
660         return err;
661 }
662
663 int
664 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
665 {
666         struct iovec *iov = from;
667
668         if (skb->ip_summed == CHECKSUM_HW) {
669                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
670                         return -EFAULT;
671         } else {
672                 unsigned int csum = 0;
673                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
674                         return -EFAULT;
675                 skb->csum = csum_block_add(skb->csum, csum, odd);
676         }
677         return 0;
678 }
679
/*
 * Return the partial checksum of @copy bytes located @offset bytes into
 * @page.  The page is mapped with kmap() for the duration of the
 * computation and unmapped again before returning.
 */
static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
        char *vaddr = kmap(page);
        unsigned int sum = csum_partial(vaddr + offset, copy, 0);

        kunmap(page);
        return sum;
}
690
691 /*
692  *      ip_append_data() and ip_append_page() can make one large IP datagram
693  *      from many pieces of data. Each piece will be held on the socket
694  *      until ip_push_pending_frames() is called. Each piece can be a page
695  *      or non-page data.
696  *      
697  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
698  *      this interface potentially.
699  *
700  *      LATER: length must be adjusted by pad at tail, when it is required.
701  */
702 int ip_append_data(struct sock *sk,
703                    int getfrag(void *from, char *to, int offset, int len,
704                                int odd, struct sk_buff *skb),
705                    void *from, int length, int transhdrlen,
706                    struct ipcm_cookie *ipc, struct rtable *rt,
707                    unsigned int flags)
708 {
709         struct inet_sock *inet = inet_sk(sk);
710         struct sk_buff *skb;
711
712         struct ip_options *opt = NULL;
713         int hh_len;
714         int exthdrlen;
715         int mtu;
716         int copy;
717         int err;
718         int offset = 0;
719         unsigned int maxfraglen, fragheaderlen;
720         int csummode = CHECKSUM_NONE;
721
722         if (flags&MSG_PROBE)
723                 return 0;
724
725         if (skb_queue_empty(&sk->sk_write_queue)) {
726                 /*
727                  * setup for corking.
728                  */
729                 opt = ipc->opt;
730                 if (opt) {
731                         if (inet->cork.opt == NULL) {
732                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
733                                 if (unlikely(inet->cork.opt == NULL))
734                                         return -ENOBUFS;
735                         }
736                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
737                         inet->cork.flags |= IPCORK_OPT;
738                         inet->cork.addr = ipc->addr;
739                 }
740                 dst_hold(&rt->u.dst);
741                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
742                 inet->cork.rt = rt;
743                 inet->cork.length = 0;
744                 sk->sk_sndmsg_page = NULL;
745                 sk->sk_sndmsg_off = 0;
746                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
747                         length += exthdrlen;
748                         transhdrlen += exthdrlen;
749                 }
750         } else {
751                 rt = inet->cork.rt;
752                 if (inet->cork.flags & IPCORK_OPT)
753                         opt = inet->cork.opt;
754
755                 transhdrlen = 0;
756                 exthdrlen = 0;
757                 mtu = inet->cork.fragsize;
758         }
759         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
760
761         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
762         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
763
764         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
765                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
766                 return -EMSGSIZE;
767         }
768
769         /*
770          * transhdrlen > 0 means that this is the first fragment and we wish
771          * it won't be fragmented in the future.
772          */
773         if (transhdrlen &&
774             length + fragheaderlen <= mtu &&
775             rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
776             !exthdrlen)
777                 csummode = CHECKSUM_HW;
778
779         inet->cork.length += length;
780
781         /* So, what's going on in the loop below?
782          *
783          * We use calculated fragment length to generate chained skb,
784          * each of segments is IP fragment ready for sending to network after
785          * adding appropriate IP header.
786          */
787
788         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
789                 goto alloc_new_skb;
790
791         while (length > 0) {
792                 /* Check if the remaining data fits into current packet. */
793                 copy = mtu - skb->len;
794                 if (copy < length)
795                         copy = maxfraglen - skb->len;
796                 if (copy <= 0) {
797                         char *data;
798                         unsigned int datalen;
799                         unsigned int fraglen;
800                         unsigned int fraggap;
801                         unsigned int alloclen;
802                         struct sk_buff *skb_prev;
803 alloc_new_skb:
804                         skb_prev = skb;
805                         if (skb_prev)
806                                 fraggap = skb_prev->len - maxfraglen;
807                         else
808                                 fraggap = 0;
809
810                         /*
811                          * If remaining data exceeds the mtu,
812                          * we know we need more fragment(s).
813                          */
814                         datalen = length + fraggap;
815                         if (datalen > mtu - fragheaderlen)
816                                 datalen = maxfraglen - fragheaderlen;
817                         fraglen = datalen + fragheaderlen;
818
819                         if ((flags & MSG_MORE) && 
820                             !(rt->u.dst.dev->features&NETIF_F_SG))
821                                 alloclen = mtu;
822                         else
823                                 alloclen = datalen + fragheaderlen;
824
825                         /* The last fragment gets additional space at tail.
826                          * Note, with MSG_MORE we overallocate on fragments,
827                          * because we have no idea what fragment will be
828                          * the last.
829                          */
830                         if (datalen == length)
831                                 alloclen += rt->u.dst.trailer_len;
832
833                         if (transhdrlen) {
834                                 skb = sock_alloc_send_skb(sk, 
835                                                 alloclen + hh_len + 15,
836                                                 (flags & MSG_DONTWAIT), &err);
837                         } else {
838                                 skb = NULL;
839                                 if (atomic_read(&sk->sk_wmem_alloc) <=
840                                     2 * sk->sk_sndbuf)
841                                         skb = sock_wmalloc(sk, 
842                                                            alloclen + hh_len + 15, 1,
843                                                            sk->sk_allocation);
844                                 if (unlikely(skb == NULL))
845                                         err = -ENOBUFS;
846                         }
847                         if (skb == NULL)
848                                 goto error;
849
850                         /*
851                          *      Fill in the control structures
852                          */
853                         skb->ip_summed = csummode;
854                         skb->csum = 0;
855                         skb_reserve(skb, hh_len);
856
857                         /*
858                          *      Find where to start putting bytes.
859                          */
860                         data = skb_put(skb, fraglen);
861                         skb->nh.raw = data + exthdrlen;
862                         data += fragheaderlen;
863                         skb->h.raw = data + exthdrlen;
864
865                         if (fraggap) {
866                                 skb->csum = skb_copy_and_csum_bits(
867                                         skb_prev, maxfraglen,
868                                         data + transhdrlen, fraggap, 0);
869                                 skb_prev->csum = csum_sub(skb_prev->csum,
870                                                           skb->csum);
871                                 data += fraggap;
872                                 skb_trim(skb_prev, maxfraglen);
873                         }
874
875                         copy = datalen - transhdrlen - fraggap;
876                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
877                                 err = -EFAULT;
878                                 kfree_skb(skb);
879                                 goto error;
880                         }
881
882                         offset += copy;
883                         length -= datalen - fraggap;
884                         transhdrlen = 0;
885                         exthdrlen = 0;
886                         csummode = CHECKSUM_NONE;
887
888                         /*
889                          * Put the packet on the pending queue.
890                          */
891                         __skb_queue_tail(&sk->sk_write_queue, skb);
892                         continue;
893                 }
894
895                 if (copy > length)
896                         copy = length;
897
898                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
899                         unsigned int off;
900
901                         off = skb->len;
902                         if (getfrag(from, skb_put(skb, copy), 
903                                         offset, copy, off, skb) < 0) {
904                                 __skb_trim(skb, off);
905                                 err = -EFAULT;
906                                 goto error;
907                         }
908                 } else {
909                         int i = skb_shinfo(skb)->nr_frags;
910                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
911                         struct page *page = sk->sk_sndmsg_page;
912                         int off = sk->sk_sndmsg_off;
913                         unsigned int left;
914
915                         if (page && (left = PAGE_SIZE - off) > 0) {
916                                 if (copy >= left)
917                                         copy = left;
918                                 if (page != frag->page) {
919                                         if (i == MAX_SKB_FRAGS) {
920                                                 err = -EMSGSIZE;
921                                                 goto error;
922                                         }
923                                         get_page(page);
924                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
925                                         frag = &skb_shinfo(skb)->frags[i];
926                                 }
927                         } else if (i < MAX_SKB_FRAGS) {
928                                 if (copy > PAGE_SIZE)
929                                         copy = PAGE_SIZE;
930                                 page = alloc_pages(sk->sk_allocation, 0);
931                                 if (page == NULL)  {
932                                         err = -ENOMEM;
933                                         goto error;
934                                 }
935                                 sk->sk_sndmsg_page = page;
936                                 sk->sk_sndmsg_off = 0;
937
938                                 skb_fill_page_desc(skb, i, page, 0, 0);
939                                 frag = &skb_shinfo(skb)->frags[i];
940                                 skb->truesize += PAGE_SIZE;
941                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
942                         } else {
943                                 err = -EMSGSIZE;
944                                 goto error;
945                         }
946                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
947                                 err = -EFAULT;
948                                 goto error;
949                         }
950                         sk->sk_sndmsg_off += copy;
951                         frag->size += copy;
952                         skb->len += copy;
953                         skb->data_len += copy;
954                 }
955                 offset += copy;
956                 length -= copy;
957         }
958
959         return 0;
960
961 error:
962         inet->cork.length -= length;
963         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
964         return err; 
965 }
966
967 ssize_t ip_append_page(struct sock *sk, struct page *page,
968                        int offset, size_t size, int flags)
969 {
970         struct inet_sock *inet = inet_sk(sk);
971         struct sk_buff *skb;
972         struct rtable *rt;
973         struct ip_options *opt = NULL;
974         int hh_len;
975         int mtu;
976         int len;
977         int err;
978         unsigned int maxfraglen, fragheaderlen, fraggap;
979
980         if (inet->hdrincl)
981                 return -EPERM;
982
983         if (flags&MSG_PROBE)
984                 return 0;
985
986         if (skb_queue_empty(&sk->sk_write_queue))
987                 return -EINVAL;
988
989         rt = inet->cork.rt;
990         if (inet->cork.flags & IPCORK_OPT)
991                 opt = inet->cork.opt;
992
993         if (!(rt->u.dst.dev->features&NETIF_F_SG))
994                 return -EOPNOTSUPP;
995
996         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
997         mtu = inet->cork.fragsize;
998
999         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1000         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1001
1002         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1003                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1004                 return -EMSGSIZE;
1005         }
1006
1007         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1008                 return -EINVAL;
1009
1010         inet->cork.length += size;
1011
1012         while (size > 0) {
1013                 int i;
1014
1015                 /* Check if the remaining data fits into current packet. */
1016                 len = mtu - skb->len;
1017                 if (len < size)
1018                         len = maxfraglen - skb->len;
1019                 if (len <= 0) {
1020                         struct sk_buff *skb_prev;
1021                         char *data;
1022                         struct iphdr *iph;
1023                         int alloclen;
1024
1025                         skb_prev = skb;
1026                         if (skb_prev)
1027                                 fraggap = skb_prev->len - maxfraglen;
1028                         else
1029                                 fraggap = 0;
1030
1031                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1032                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1033                         if (unlikely(!skb)) {
1034                                 err = -ENOBUFS;
1035                                 goto error;
1036                         }
1037
1038                         /*
1039                          *      Fill in the control structures
1040                          */
1041                         skb->ip_summed = CHECKSUM_NONE;
1042                         skb->csum = 0;
1043                         skb_reserve(skb, hh_len);
1044
1045                         /*
1046                          *      Find where to start putting bytes.
1047                          */
1048                         data = skb_put(skb, fragheaderlen + fraggap);
1049                         skb->nh.iph = iph = (struct iphdr *)data;
1050                         data += fragheaderlen;
1051                         skb->h.raw = data;
1052
1053                         if (fraggap) {
1054                                 skb->csum = skb_copy_and_csum_bits(
1055                                         skb_prev, maxfraglen,
1056                                         data, fraggap, 0);
1057                                 skb_prev->csum = csum_sub(skb_prev->csum,
1058                                                           skb->csum);
1059                                 skb_trim(skb_prev, maxfraglen);
1060                         }
1061
1062                         /*
1063                          * Put the packet on the pending queue.
1064                          */
1065                         __skb_queue_tail(&sk->sk_write_queue, skb);
1066                         continue;
1067                 }
1068
1069                 i = skb_shinfo(skb)->nr_frags;
1070                 if (len > size)
1071                         len = size;
1072                 if (skb_can_coalesce(skb, i, page, offset)) {
1073                         skb_shinfo(skb)->frags[i-1].size += len;
1074                 } else if (i < MAX_SKB_FRAGS) {
1075                         get_page(page);
1076                         skb_fill_page_desc(skb, i, page, offset, len);
1077                 } else {
1078                         err = -EMSGSIZE;
1079                         goto error;
1080                 }
1081
1082                 if (skb->ip_summed == CHECKSUM_NONE) {
1083                         unsigned int csum;
1084                         csum = csum_page(page, offset, len);
1085                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1086                 }
1087
1088                 skb->len += len;
1089                 skb->data_len += len;
1090                 offset += len;
1091                 size -= len;
1092         }
1093         return 0;
1094
1095 error:
1096         inet->cork.length -= size;
1097         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1098         return err;
1099 }
1100
1101 /*
1102  *      Combined all pending IP fragments on the socket as one IP datagram
1103  *      and push them out.
1104  */
1105 int ip_push_pending_frames(struct sock *sk)
1106 {
1107         struct sk_buff *skb, *tmp_skb;
1108         struct sk_buff **tail_skb;
1109         struct inet_sock *inet = inet_sk(sk);
1110         struct ip_options *opt = NULL;
1111         struct rtable *rt = inet->cork.rt;
1112         struct iphdr *iph;
1113         int df = 0;
1114         __u8 ttl;
1115         int err = 0;
1116
1117         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1118                 goto out;
1119         tail_skb = &(skb_shinfo(skb)->frag_list);
1120
1121         /* move skb->data to ip header from ext header */
1122         if (skb->data < skb->nh.raw)
1123                 __skb_pull(skb, skb->nh.raw - skb->data);
1124         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1125                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1126                 *tail_skb = tmp_skb;
1127                 tail_skb = &(tmp_skb->next);
1128                 skb->len += tmp_skb->len;
1129                 skb->data_len += tmp_skb->len;
1130                 skb->truesize += tmp_skb->truesize;
1131                 __sock_put(tmp_skb->sk);
1132                 tmp_skb->destructor = NULL;
1133                 tmp_skb->sk = NULL;
1134         }
1135
1136         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1137          * to fragment the frame generated here. No matter, what transforms
1138          * how transforms change size of the packet, it will come out.
1139          */
1140         if (inet->pmtudisc != IP_PMTUDISC_DO)
1141                 skb->local_df = 1;
1142
1143         /* DF bit is set when we want to see DF on outgoing frames.
1144          * If local_df is set too, we still allow to fragment this frame
1145          * locally. */
1146         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1147             (skb->len <= dst_mtu(&rt->u.dst) &&
1148              ip_dont_fragment(sk, &rt->u.dst)))
1149                 df = htons(IP_DF);
1150
1151         if (inet->cork.flags & IPCORK_OPT)
1152                 opt = inet->cork.opt;
1153
1154         if (rt->rt_type == RTN_MULTICAST)
1155                 ttl = inet->mc_ttl;
1156         else
1157                 ttl = ip_select_ttl(inet, &rt->u.dst);
1158
1159         iph = (struct iphdr *)skb->data;
1160         iph->version = 4;
1161         iph->ihl = 5;
1162         if (opt) {
1163                 iph->ihl += opt->optlen>>2;
1164                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1165         }
1166         iph->tos = inet->tos;
1167         iph->tot_len = htons(skb->len);
1168         iph->frag_off = df;
1169         if (!df) {
1170                 __ip_select_ident(iph, &rt->u.dst, 0);
1171         } else {
1172                 iph->id = htons(inet->id++);
1173         }
1174         iph->ttl = ttl;
1175         iph->protocol = sk->sk_protocol;
1176         iph->saddr = rt->rt_src;
1177         iph->daddr = rt->rt_dst;
1178         ip_send_check(iph);
1179
1180         skb->priority = sk->sk_priority;
1181         skb->dst = dst_clone(&rt->u.dst);
1182
1183         /* Netfilter gets whole the not fragmented skb. */
1184         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, 
1185                       skb->dst->dev, dst_output);
1186         if (err) {
1187                 if (err > 0)
1188                         err = inet->recverr ? net_xmit_errno(err) : 0;
1189                 if (err)
1190                         goto error;
1191         }
1192
1193 out:
1194         inet->cork.flags &= ~IPCORK_OPT;
1195         if (inet->cork.opt) {
1196                 kfree(inet->cork.opt);
1197                 inet->cork.opt = NULL;
1198         }
1199         if (inet->cork.rt) {
1200                 ip_rt_put(inet->cork.rt);
1201                 inet->cork.rt = NULL;
1202         }
1203         return err;
1204
1205 error:
1206         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1207         goto out;
1208 }
1209
1210 /*
1211  *      Throw away all pending data on the socket.
1212  */
1213 void ip_flush_pending_frames(struct sock *sk)
1214 {
1215         struct inet_sock *inet = inet_sk(sk);
1216         struct sk_buff *skb;
1217
1218         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1219                 kfree_skb(skb);
1220
1221         inet->cork.flags &= ~IPCORK_OPT;
1222         if (inet->cork.opt) {
1223                 kfree(inet->cork.opt);
1224                 inet->cork.opt = NULL;
1225         }
1226         if (inet->cork.rt) {
1227                 ip_rt_put(inet->cork.rt);
1228                 inet->cork.rt = NULL;
1229         }
1230 }
1231
1232
1233 /*
1234  *      Fetch data from kernel space and fill in checksum if needed.
1235  */
1236 static int ip_reply_glue_bits(void *dptr, char *to, int offset, 
1237                               int len, int odd, struct sk_buff *skb)
1238 {
1239         unsigned int csum;
1240
1241         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1242         skb->csum = csum_block_add(skb->csum, csum, odd);
1243         return 0;  
1244 }
1245
1246 /* 
1247  *      Generic function to send a packet as reply to another packet.
1248  *      Used to send TCP resets so far. ICMP should use this function too.
1249  *
1250  *      Should run single threaded per socket because it uses the sock 
1251  *      structure to pass arguments.
1252  *
1253  *      LATER: switch from ip_build_xmit to ip_append_*
1254  */
1255 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1256                    unsigned int len)
1257 {
1258         struct inet_sock *inet = inet_sk(sk);
1259         struct {
1260                 struct ip_options       opt;
1261                 char                    data[40];
1262         } replyopts;
1263         struct ipcm_cookie ipc;
1264         u32 daddr;
1265         struct rtable *rt = (struct rtable*)skb->dst;
1266
1267         if (ip_options_echo(&replyopts.opt, skb))
1268                 return;
1269
1270         daddr = ipc.addr = rt->rt_src;
1271         ipc.opt = NULL;
1272
1273         if (replyopts.opt.optlen) {
1274                 ipc.opt = &replyopts.opt;
1275
1276                 if (ipc.opt->srr)
1277                         daddr = replyopts.opt.faddr;
1278         }
1279
1280         {
1281                 struct flowi fl = { .nl_u = { .ip4_u =
1282                                               { .daddr = daddr,
1283                                                 .saddr = rt->rt_spec_dst,
1284                                                 .tos = RT_TOS(skb->nh.iph->tos) } },
1285                                     /* Not quite clean, but right. */
1286                                     .uli_u = { .ports =
1287                                                { .sport = skb->h.th->dest,
1288                                                  .dport = skb->h.th->source } },
1289                                     .proto = sk->sk_protocol };
1290                 if (ip_route_output_key(&rt, &fl))
1291                         return;
1292         }
1293
1294         /* And let IP do all the hard work.
1295
1296            This chunk is not reenterable, hence spinlock.
1297            Note that it uses the fact, that this function is called
1298            with locally disabled BH and that sk cannot be already spinlocked.
1299          */
1300         bh_lock_sock(sk);
1301         inet->tos = skb->nh.iph->tos;
1302         sk->sk_priority = skb->priority;
1303         sk->sk_protocol = skb->nh.iph->protocol;
1304         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1305                        &ipc, rt, MSG_DONTWAIT);
1306         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1307                 if (arg->csumoffset >= 0)
1308                         *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1309                 skb->ip_summed = CHECKSUM_NONE;
1310                 ip_push_pending_frames(sk);
1311         }
1312
1313         bh_unlock_sock(sk);
1314
1315         ip_rt_put(rt);
1316 }
1317
1318 void __init ip_init(void)
1319 {
1320         ip_rt_init();
1321         inet_initpeers();
1322
1323 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1324         igmp_mc_proc_init();
1325 #endif
1326 }
1327
1328 EXPORT_SYMBOL(ip_fragment);
1329 EXPORT_SYMBOL(ip_generic_getfrag);
1330 EXPORT_SYMBOL(ip_queue_xmit);
1331 EXPORT_SYMBOL(ip_send_check);