Merge with /pub/scm/linux/kernel/git/torvalds/linux-2.6.git
[linux-2.6] / net / ipv4 / ip_output.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              The Internet Protocol (IP) output module.
7  *
8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Donald Becker, <becker@super.org>
13  *              Alan Cox, <Alan.Cox@linux.org>
14  *              Richard Underwood
15  *              Stefan Becker, <stefanb@yello.ping.de>
16  *              Jorge Cwik, <jorge@laser.satlink.net>
17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
19  *
20  *      See ip_input.c for original log
21  *
22  *      Fixes:
23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
25  *              Bradford Johnson:       Fix faulty handling of some frames when 
26  *                                      no route is found.
27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
28  *                                      (in case if packet not accepted by
29  *                                      output firewall rules)
30  *              Mike McLagan    :       Routing by source
31  *              Alexey Kuznetsov:       use new route cache
32  *              Andi Kleen:             Fix broken PMTU recovery and remove
33  *                                      some redundant tests.
34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path 
37  *                                      for decreased register pressure on x86 
38  *                                      and more readability.
39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
40  *                                      silently drop skb instead of failing with -EPERM.
41  *              Detlev Wengorz  :       Copy protocol for fragments.
42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
43  *                                      datagrams.
44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
45  */
46
47 #include <asm/uaccess.h>
48 #include <asm/system.h>
49 #include <linux/module.h>
50 #include <linux/types.h>
51 #include <linux/kernel.h>
52 #include <linux/sched.h>
53 #include <linux/mm.h>
54 #include <linux/string.h>
55 #include <linux/errno.h>
56 #include <linux/config.h>
57
58 #include <linux/socket.h>
59 #include <linux/sockios.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/etherdevice.h>
64 #include <linux/proc_fs.h>
65 #include <linux/stat.h>
66 #include <linux/init.h>
67
68 #include <net/snmp.h>
69 #include <net/ip.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <linux/skbuff.h>
73 #include <net/sock.h>
74 #include <net/arp.h>
75 #include <net/icmp.h>
76 #include <net/checksum.h>
77 #include <net/inetpeer.h>
78 #include <net/checksum.h>
79 #include <linux/igmp.h>
80 #include <linux/netfilter_ipv4.h>
81 #include <linux/netfilter_bridge.h>
82 #include <linux/mroute.h>
83 #include <linux/netlink.h>
84 #include <linux/tcp.h>
85
86 int sysctl_ip_default_ttl = IPDEFTTL;	/* Starts at IPDEFTTL. NOTE(review): presumably the sysctl-tunable default TTL; confirm against the sysctl table. */
87
88 /* Generate a checksum for an outgoing IP datagram. */
89 __inline__ void ip_send_check(struct iphdr *iph)
90 {
91         iph->check = 0;
92         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
93 }
94
95 /* dev_loopback_xmit for use with netfilter. */
96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
97 {
98         newskb->mac.raw = newskb->data;
99         __skb_pull(newskb, newskb->nh.raw - newskb->data);
100         newskb->pkt_type = PACKET_LOOPBACK;
101         newskb->ip_summed = CHECKSUM_UNNECESSARY;
102         BUG_TRAP(newskb->dst);
103         netif_rx(newskb);
104         return 0;
105 }
106
107 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
108 {
109         int ttl = inet->uc_ttl;
110
111         if (ttl < 0)
112                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
113         return ttl;
114 }
115
116 /* 
117  *              Add an ip header to a skbuff and send it out.
118  *
119  */
120 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
121                           u32 saddr, u32 daddr, struct ip_options *opt)
122 {
123         struct inet_sock *inet = inet_sk(sk);
124         struct rtable *rt = (struct rtable *)skb->dst;
125         struct iphdr *iph;
126
127         /* Build the IP header. */
128         if (opt)
129                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
130         else
131                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
132
133         iph->version  = 4;
134         iph->ihl      = 5;
135         iph->tos      = inet->tos;
136         if (ip_dont_fragment(sk, &rt->u.dst))
137                 iph->frag_off = htons(IP_DF);
138         else
139                 iph->frag_off = 0;
140         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
141         iph->daddr    = rt->rt_dst;
142         iph->saddr    = rt->rt_src;
143         iph->protocol = sk->sk_protocol;
144         iph->tot_len  = htons(skb->len);
145         ip_select_ident(iph, &rt->u.dst, sk);
146         skb->nh.iph   = iph;
147
148         if (opt && opt->optlen) {
149                 iph->ihl += opt->optlen>>2;
150                 ip_options_build(skb, opt, daddr, rt, 0);
151         }
152         ip_send_check(iph);
153
154         skb->priority = sk->sk_priority;
155
156         /* Send it out. */
157         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
158                        dst_output);
159 }
160
161 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
162
163 static inline int ip_finish_output2(struct sk_buff *skb)
164 {
165         struct dst_entry *dst = skb->dst;
166         struct hh_cache *hh = dst->hh;
167         struct net_device *dev = dst->dev;
168         int hh_len = LL_RESERVED_SPACE(dev);
169
170         /* Be paranoid, rather than too clever. */
171         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
172                 struct sk_buff *skb2;
173
174                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
175                 if (skb2 == NULL) {
176                         kfree_skb(skb);
177                         return -ENOMEM;
178                 }
179                 if (skb->sk)
180                         skb_set_owner_w(skb2, skb->sk);
181                 kfree_skb(skb);
182                 skb = skb2;
183         }
184
185         if (hh) {
186                 int hh_alen;
187
188                 read_lock_bh(&hh->hh_lock);
189                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
190                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
191                 read_unlock_bh(&hh->hh_lock);
192                 skb_push(skb, hh->hh_len);
193                 return hh->hh_output(skb);
194         } else if (dst->neighbour)
195                 return dst->neighbour->output(skb);
196
197         if (net_ratelimit())
198                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
199         kfree_skb(skb);
200         return -EINVAL;
201 }
202
203 static inline int ip_finish_output(struct sk_buff *skb)
204 {
205         struct net_device *dev = skb->dst->dev;
206
207         skb->dev = dev;
208         skb->protocol = htons(ETH_P_IP);
209
210         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
211                        ip_finish_output2);
212 }
213
214 int ip_mc_output(struct sk_buff *skb)
215 {
216         struct sock *sk = skb->sk;
217         struct rtable *rt = (struct rtable*)skb->dst;
218         struct net_device *dev = rt->u.dst.dev;
219
220         /*
221          *      If the indicated interface is up and running, send the packet.
222          */
223         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
224
225         skb->dev = dev;
226         skb->protocol = htons(ETH_P_IP);
227
228         /*
229          *      Multicasts are looped back for other local users
230          */
231
232         if (rt->rt_flags&RTCF_MULTICAST) {
233                 if ((!sk || inet_sk(sk)->mc_loop)
234 #ifdef CONFIG_IP_MROUTE
235                 /* Small optimization: do not loopback not local frames,
236                    which returned after forwarding; they will be  dropped
237                    by ip_mr_input in any case.
238                    Note, that local frames are looped back to be delivered
239                    to local recipients.
240
241                    This check is duplicated in ip_mr_input at the moment.
242                  */
243                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
244 #endif
245                 ) {
246                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
247                         if (newskb)
248                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
249                                         newskb->dev, 
250                                         ip_dev_loopback_xmit);
251                 }
252
253                 /* Multicasts with ttl 0 must not go beyond the host */
254
255                 if (skb->nh.iph->ttl == 0) {
256                         kfree_skb(skb);
257                         return 0;
258                 }
259         }
260
261         if (rt->rt_flags&RTCF_BROADCAST) {
262                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
263                 if (newskb)
264                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
265                                 newskb->dev, ip_dev_loopback_xmit);
266         }
267
268         if (skb->len > dst_mtu(&rt->u.dst))
269                 return ip_fragment(skb, ip_finish_output);
270         else
271                 return ip_finish_output(skb);
272 }
273
274 int ip_output(struct sk_buff *skb)
275 {
276         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
277
278         if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
279                 return ip_fragment(skb, ip_finish_output);
280         else
281                 return ip_finish_output(skb);
282 }
283
284 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
285 {
286         struct sock *sk = skb->sk;
287         struct inet_sock *inet = inet_sk(sk);
288         struct ip_options *opt = inet->opt;
289         struct rtable *rt;
290         struct iphdr *iph;
291
292         /* Skip all of this if the packet is already routed,
293          * f.e. by something like SCTP.
294          */
295         rt = (struct rtable *) skb->dst;
296         if (rt != NULL)
297                 goto packet_routed;
298
299         /* Make sure we can route this packet. */
300         rt = (struct rtable *)__sk_dst_check(sk, 0);
301         if (rt == NULL) {
302                 u32 daddr;
303
304                 /* Use correct destination address if we have options. */
305                 daddr = inet->daddr;
306                 if(opt && opt->srr)
307                         daddr = opt->faddr;
308
309                 {
310                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
311                                             .nl_u = { .ip4_u =
312                                                       { .daddr = daddr,
313                                                         .saddr = inet->saddr,
314                                                         .tos = RT_CONN_FLAGS(sk) } },
315                                             .proto = sk->sk_protocol,
316                                             .uli_u = { .ports =
317                                                        { .sport = inet->sport,
318                                                          .dport = inet->dport } } };
319
320                         /* If this fails, retransmit mechanism of transport layer will
321                          * keep trying until route appears or the connection times
322                          * itself out.
323                          */
324                         if (ip_route_output_flow(&rt, &fl, sk, 0))
325                                 goto no_route;
326                 }
327                 sk_setup_caps(sk, &rt->u.dst);
328         }
329         skb->dst = dst_clone(&rt->u.dst);
330
331 packet_routed:
332         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
333                 goto no_route;
334
335         /* OK, we know where to send it, allocate and build IP header. */
336         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
337         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
338         iph->tot_len = htons(skb->len);
339         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
340                 iph->frag_off = htons(IP_DF);
341         else
342                 iph->frag_off = 0;
343         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
344         iph->protocol = sk->sk_protocol;
345         iph->saddr    = rt->rt_src;
346         iph->daddr    = rt->rt_dst;
347         skb->nh.iph   = iph;
348         /* Transport layer set skb->h.foo itself. */
349
350         if (opt && opt->optlen) {
351                 iph->ihl += opt->optlen >> 2;
352                 ip_options_build(skb, opt, inet->daddr, rt, 0);
353         }
354
355         ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
356
357         /* Add an IP checksum. */
358         ip_send_check(iph);
359
360         skb->priority = sk->sk_priority;
361
362         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
363                        dst_output);
364
365 no_route:
366         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
367         kfree_skb(skb);
368         return -EHOSTUNREACH;
369 }
370
371
372 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
373 {
374         to->pkt_type = from->pkt_type;
375         to->priority = from->priority;
376         to->protocol = from->protocol;
377         dst_release(to->dst);
378         to->dst = dst_clone(from->dst);
379         to->dev = from->dev;
380
381         /* Copy the flags to each fragment. */
382         IPCB(to)->flags = IPCB(from)->flags;
383
384 #ifdef CONFIG_NET_SCHED
385         to->tc_index = from->tc_index;
386 #endif
387 #ifdef CONFIG_NETFILTER
388         to->nfmark = from->nfmark;
389         /* Connection association is same as pre-frag packet */
390         nf_conntrack_put(to->nfct);
391         to->nfct = from->nfct;
392         nf_conntrack_get(to->nfct);
393         to->nfctinfo = from->nfctinfo;
394 #ifdef CONFIG_BRIDGE_NETFILTER
395         nf_bridge_put(to->nf_bridge);
396         to->nf_bridge = from->nf_bridge;
397         nf_bridge_get(to->nf_bridge);
398 #endif
399 #endif
400 }
401
402 /*
403  *      This IP datagram is too large to be sent in one piece.  Break it up into
404  *      smaller pieces (each of size equal to IP header plus
405  *      a block of the data of the original IP data part) that will yet fit in a
406  *      single device frame, and queue such a frame for sending.
407  */
408
409 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
410 {
411         struct iphdr *iph;
412         int raw = 0;
413         int ptr;
414         struct net_device *dev;
415         struct sk_buff *skb2;
416         unsigned int mtu, hlen, left, len, ll_rs;
417         int offset;
418         int not_last_frag;
419         struct rtable *rt = (struct rtable*)skb->dst;
420         int err = 0;
421
422         dev = rt->u.dst.dev;
423
424         /*
425          *      Point into the IP datagram header.
426          */
427
428         iph = skb->nh.iph;
429
430         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
431                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
432                           htonl(dst_mtu(&rt->u.dst)));
433                 kfree_skb(skb);
434                 return -EMSGSIZE;
435         }
436
437         /*
438          *      Setup starting values.
439          */
440
441         hlen = iph->ihl * 4;
442         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
443
444         /* When frag_list is given, use it. First, check its validity:
445          * some transformers could create wrong frag_list or break existing
446          * one, it is not prohibited. In this case fall back to copying.
447          *
448          * LATER: this step can be merged to real generation of fragments,
449          * we can switch to copy when see the first bad fragment.
450          */
451         if (skb_shinfo(skb)->frag_list) {
452                 struct sk_buff *frag;
453                 int first_len = skb_pagelen(skb);
454
455                 if (first_len - hlen > mtu ||
456                     ((first_len - hlen) & 7) ||
457                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
458                     skb_cloned(skb))
459                         goto slow_path;
460
461                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
462                         /* Correct geometry. */
463                         if (frag->len > mtu ||
464                             ((frag->len & 7) && frag->next) ||
465                             skb_headroom(frag) < hlen)
466                             goto slow_path;
467
468                         /* Partially cloned skb? */
469                         if (skb_shared(frag))
470                                 goto slow_path;
471
472                         BUG_ON(frag->sk);
473                         if (skb->sk) {
474                                 sock_hold(skb->sk);
475                                 frag->sk = skb->sk;
476                                 frag->destructor = sock_wfree;
477                                 skb->truesize -= frag->truesize;
478                         }
479                 }
480
481                 /* Everything is OK. Generate! */
482
483                 err = 0;
484                 offset = 0;
485                 frag = skb_shinfo(skb)->frag_list;
486                 skb_shinfo(skb)->frag_list = NULL;
487                 skb->data_len = first_len - skb_headlen(skb);
488                 skb->len = first_len;
489                 iph->tot_len = htons(first_len);
490                 iph->frag_off = htons(IP_MF);
491                 ip_send_check(iph);
492
493                 for (;;) {
494                         /* Prepare header of the next frame,
495                          * before previous one went down. */
496                         if (frag) {
497                                 frag->ip_summed = CHECKSUM_NONE;
498                                 frag->h.raw = frag->data;
499                                 frag->nh.raw = __skb_push(frag, hlen);
500                                 memcpy(frag->nh.raw, iph, hlen);
501                                 iph = frag->nh.iph;
502                                 iph->tot_len = htons(frag->len);
503                                 ip_copy_metadata(frag, skb);
504                                 if (offset == 0)
505                                         ip_options_fragment(frag);
506                                 offset += skb->len - hlen;
507                                 iph->frag_off = htons(offset>>3);
508                                 if (frag->next != NULL)
509                                         iph->frag_off |= htons(IP_MF);
510                                 /* Ready, complete checksum */
511                                 ip_send_check(iph);
512                         }
513
514                         err = output(skb);
515
516                         if (err || !frag)
517                                 break;
518
519                         skb = frag;
520                         frag = skb->next;
521                         skb->next = NULL;
522                 }
523
524                 if (err == 0) {
525                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
526                         return 0;
527                 }
528
529                 while (frag) {
530                         skb = frag->next;
531                         kfree_skb(frag);
532                         frag = skb;
533                 }
534                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
535                 return err;
536         }
537
538 slow_path:
539         left = skb->len - hlen;         /* Space per frame */
540         ptr = raw + hlen;               /* Where to start from */
541
542 #ifdef CONFIG_BRIDGE_NETFILTER
543         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
544          * we need to make room for the encapsulating header */
545         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
546         mtu -= nf_bridge_pad(skb);
547 #else
548         ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
549 #endif
550         /*
551          *      Fragment the datagram.
552          */
553
554         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
555         not_last_frag = iph->frag_off & htons(IP_MF);
556
557         /*
558          *      Keep copying data until we run out.
559          */
560
561         while(left > 0) {
562                 len = left;
563                 /* IF: it doesn't fit, use 'mtu' - the data space left */
564                 if (len > mtu)
565                         len = mtu;
566                 /* IF: we are not sending upto and including the packet end
567                    then align the next start on an eight byte boundary */
568                 if (len < left) {
569                         len &= ~7;
570                 }
571                 /*
572                  *      Allocate buffer.
573                  */
574
575                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
576                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
577                         err = -ENOMEM;
578                         goto fail;
579                 }
580
581                 /*
582                  *      Set up data on packet
583                  */
584
585                 ip_copy_metadata(skb2, skb);
586                 skb_reserve(skb2, ll_rs);
587                 skb_put(skb2, len + hlen);
588                 skb2->nh.raw = skb2->data;
589                 skb2->h.raw = skb2->data + hlen;
590
591                 /*
592                  *      Charge the memory for the fragment to any owner
593                  *      it might possess
594                  */
595
596                 if (skb->sk)
597                         skb_set_owner_w(skb2, skb->sk);
598
599                 /*
600                  *      Copy the packet header into the new buffer.
601                  */
602
603                 memcpy(skb2->nh.raw, skb->data, hlen);
604
605                 /*
606                  *      Copy a block of the IP datagram.
607                  */
608                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
609                         BUG();
610                 left -= len;
611
612                 /*
613                  *      Fill in the new header fields.
614                  */
615                 iph = skb2->nh.iph;
616                 iph->frag_off = htons((offset >> 3));
617
618                 /* ANK: dirty, but effective trick. Upgrade options only if
619                  * the segment to be fragmented was THE FIRST (otherwise,
620                  * options are already fixed) and make it ONCE
621                  * on the initial skb, so that all the following fragments
622                  * will inherit fixed options.
623                  */
624                 if (offset == 0)
625                         ip_options_fragment(skb);
626
627                 /*
628                  *      Added AC : If we are fragmenting a fragment that's not the
629                  *                 last fragment then keep MF on each bit
630                  */
631                 if (left > 0 || not_last_frag)
632                         iph->frag_off |= htons(IP_MF);
633                 ptr += len;
634                 offset += len;
635
636                 /*
637                  *      Put this fragment into the sending queue.
638                  */
639
640                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
641
642                 iph->tot_len = htons(len + hlen);
643
644                 ip_send_check(iph);
645
646                 err = output(skb2);
647                 if (err)
648                         goto fail;
649         }
650         kfree_skb(skb);
651         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
652         return err;
653
654 fail:
655         kfree_skb(skb); 
656         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
657         return err;
658 }
659
660 int
661 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
662 {
663         struct iovec *iov = from;
664
665         if (skb->ip_summed == CHECKSUM_HW) {
666                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
667                         return -EFAULT;
668         } else {
669                 unsigned int csum = 0;
670                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
671                         return -EFAULT;
672                 skb->csum = csum_block_add(skb->csum, csum, odd);
673         }
674         return 0;
675 }
676
/*
 * Checksum 'copy' bytes of 'page' starting at 'offset'. The page is
 * mapped with kmap() for the duration of the computation and
 * unmapped again before returning.
 */
static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr = kmap(page);
	unsigned int csum = csum_partial(kaddr + offset, copy, 0);

	kunmap(page);
	return csum;
}
687
688 /*
689  *      ip_append_data() and ip_append_page() can make one large IP datagram
690  *      from many pieces of data. Each piece will be held on the socket
691  *      until ip_push_pending_frames() is called. Each piece can be a page
692  *      or non-page data.
693  *      
694  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
695  *      this interface potentially.
696  *
697  *      LATER: length must be adjusted by pad at tail, when it is required.
698  */
699 int ip_append_data(struct sock *sk,
700                    int getfrag(void *from, char *to, int offset, int len,
701                                int odd, struct sk_buff *skb),
702                    void *from, int length, int transhdrlen,
703                    struct ipcm_cookie *ipc, struct rtable *rt,
704                    unsigned int flags)
705 {
706         struct inet_sock *inet = inet_sk(sk);
707         struct sk_buff *skb;
708
709         struct ip_options *opt = NULL;
710         int hh_len;
711         int exthdrlen;
712         int mtu;
713         int copy;
714         int err;
715         int offset = 0;
716         unsigned int maxfraglen, fragheaderlen;
717         int csummode = CHECKSUM_NONE;
718
719         if (flags&MSG_PROBE)
720                 return 0;
721
722         if (skb_queue_empty(&sk->sk_write_queue)) {
723                 /*
724                  * setup for corking.
725                  */
726                 opt = ipc->opt;
727                 if (opt) {
728                         if (inet->cork.opt == NULL) {
729                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
730                                 if (unlikely(inet->cork.opt == NULL))
731                                         return -ENOBUFS;
732                         }
733                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
734                         inet->cork.flags |= IPCORK_OPT;
735                         inet->cork.addr = ipc->addr;
736                 }
737                 dst_hold(&rt->u.dst);
738                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
739                 inet->cork.rt = rt;
740                 inet->cork.length = 0;
741                 sk->sk_sndmsg_page = NULL;
742                 sk->sk_sndmsg_off = 0;
743                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
744                         length += exthdrlen;
745                         transhdrlen += exthdrlen;
746                 }
747         } else {
748                 rt = inet->cork.rt;
749                 if (inet->cork.flags & IPCORK_OPT)
750                         opt = inet->cork.opt;
751
752                 transhdrlen = 0;
753                 exthdrlen = 0;
754                 mtu = inet->cork.fragsize;
755         }
756         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
757
758         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
759         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
760
761         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
762                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
763                 return -EMSGSIZE;
764         }
765
766         /*
767          * transhdrlen > 0 means that this is the first fragment and we wish
768          * it won't be fragmented in the future.
769          */
770         if (transhdrlen &&
771             length + fragheaderlen <= mtu &&
772             rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
773             !exthdrlen)
774                 csummode = CHECKSUM_HW;
775
776         inet->cork.length += length;
777
778         /* So, what's going on in the loop below?
779          *
780          * We use calculated fragment length to generate chained skb,
781          * each of segments is IP fragment ready for sending to network after
782          * adding appropriate IP header.
783          */
784
785         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
786                 goto alloc_new_skb;
787
788         while (length > 0) {
789                 /* Check if the remaining data fits into current packet. */
790                 copy = mtu - skb->len;
791                 if (copy < length)
792                         copy = maxfraglen - skb->len;
793                 if (copy <= 0) {
794                         char *data;
795                         unsigned int datalen;
796                         unsigned int fraglen;
797                         unsigned int fraggap;
798                         unsigned int alloclen;
799                         struct sk_buff *skb_prev;
800 alloc_new_skb:
801                         skb_prev = skb;
802                         if (skb_prev)
803                                 fraggap = skb_prev->len - maxfraglen;
804                         else
805                                 fraggap = 0;
806
807                         /*
808                          * If remaining data exceeds the mtu,
809                          * we know we need more fragment(s).
810                          */
811                         datalen = length + fraggap;
812                         if (datalen > mtu - fragheaderlen)
813                                 datalen = maxfraglen - fragheaderlen;
814                         fraglen = datalen + fragheaderlen;
815
816                         if ((flags & MSG_MORE) && 
817                             !(rt->u.dst.dev->features&NETIF_F_SG))
818                                 alloclen = mtu;
819                         else
820                                 alloclen = datalen + fragheaderlen;
821
822                         /* The last fragment gets additional space at tail.
823                          * Note, with MSG_MORE we overallocate on fragments,
824                          * because we have no idea what fragment will be
825                          * the last.
826                          */
827                         if (datalen == length)
828                                 alloclen += rt->u.dst.trailer_len;
829
830                         if (transhdrlen) {
831                                 skb = sock_alloc_send_skb(sk, 
832                                                 alloclen + hh_len + 15,
833                                                 (flags & MSG_DONTWAIT), &err);
834                         } else {
835                                 skb = NULL;
836                                 if (atomic_read(&sk->sk_wmem_alloc) <=
837                                     2 * sk->sk_sndbuf)
838                                         skb = sock_wmalloc(sk, 
839                                                            alloclen + hh_len + 15, 1,
840                                                            sk->sk_allocation);
841                                 if (unlikely(skb == NULL))
842                                         err = -ENOBUFS;
843                         }
844                         if (skb == NULL)
845                                 goto error;
846
847                         /*
848                          *      Fill in the control structures
849                          */
850                         skb->ip_summed = csummode;
851                         skb->csum = 0;
852                         skb_reserve(skb, hh_len);
853
854                         /*
855                          *      Find where to start putting bytes.
856                          */
857                         data = skb_put(skb, fraglen);
858                         skb->nh.raw = data + exthdrlen;
859                         data += fragheaderlen;
860                         skb->h.raw = data + exthdrlen;
861
862                         if (fraggap) {
863                                 skb->csum = skb_copy_and_csum_bits(
864                                         skb_prev, maxfraglen,
865                                         data + transhdrlen, fraggap, 0);
866                                 skb_prev->csum = csum_sub(skb_prev->csum,
867                                                           skb->csum);
868                                 data += fraggap;
869                                 skb_trim(skb_prev, maxfraglen);
870                         }
871
872                         copy = datalen - transhdrlen - fraggap;
873                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
874                                 err = -EFAULT;
875                                 kfree_skb(skb);
876                                 goto error;
877                         }
878
879                         offset += copy;
880                         length -= datalen - fraggap;
881                         transhdrlen = 0;
882                         exthdrlen = 0;
883                         csummode = CHECKSUM_NONE;
884
885                         /*
886                          * Put the packet on the pending queue.
887                          */
888                         __skb_queue_tail(&sk->sk_write_queue, skb);
889                         continue;
890                 }
891
892                 if (copy > length)
893                         copy = length;
894
895                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
896                         unsigned int off;
897
898                         off = skb->len;
899                         if (getfrag(from, skb_put(skb, copy), 
900                                         offset, copy, off, skb) < 0) {
901                                 __skb_trim(skb, off);
902                                 err = -EFAULT;
903                                 goto error;
904                         }
905                 } else {
906                         int i = skb_shinfo(skb)->nr_frags;
907                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
908                         struct page *page = sk->sk_sndmsg_page;
909                         int off = sk->sk_sndmsg_off;
910                         unsigned int left;
911
912                         if (page && (left = PAGE_SIZE - off) > 0) {
913                                 if (copy >= left)
914                                         copy = left;
915                                 if (page != frag->page) {
916                                         if (i == MAX_SKB_FRAGS) {
917                                                 err = -EMSGSIZE;
918                                                 goto error;
919                                         }
920                                         get_page(page);
921                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
922                                         frag = &skb_shinfo(skb)->frags[i];
923                                 }
924                         } else if (i < MAX_SKB_FRAGS) {
925                                 if (copy > PAGE_SIZE)
926                                         copy = PAGE_SIZE;
927                                 page = alloc_pages(sk->sk_allocation, 0);
928                                 if (page == NULL)  {
929                                         err = -ENOMEM;
930                                         goto error;
931                                 }
932                                 sk->sk_sndmsg_page = page;
933                                 sk->sk_sndmsg_off = 0;
934
935                                 skb_fill_page_desc(skb, i, page, 0, 0);
936                                 frag = &skb_shinfo(skb)->frags[i];
937                                 skb->truesize += PAGE_SIZE;
938                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
939                         } else {
940                                 err = -EMSGSIZE;
941                                 goto error;
942                         }
943                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
944                                 err = -EFAULT;
945                                 goto error;
946                         }
947                         sk->sk_sndmsg_off += copy;
948                         frag->size += copy;
949                         skb->len += copy;
950                         skb->data_len += copy;
951                 }
952                 offset += copy;
953                 length -= copy;
954         }
955
956         return 0;
957
958 error:
959         inet->cork.length -= length;
960         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
961         return err; 
962 }
963
964 ssize_t ip_append_page(struct sock *sk, struct page *page,
965                        int offset, size_t size, int flags)
966 {
967         struct inet_sock *inet = inet_sk(sk);
968         struct sk_buff *skb;
969         struct rtable *rt;
970         struct ip_options *opt = NULL;
971         int hh_len;
972         int mtu;
973         int len;
974         int err;
975         unsigned int maxfraglen, fragheaderlen, fraggap;
976
977         if (inet->hdrincl)
978                 return -EPERM;
979
980         if (flags&MSG_PROBE)
981                 return 0;
982
983         if (skb_queue_empty(&sk->sk_write_queue))
984                 return -EINVAL;
985
986         rt = inet->cork.rt;
987         if (inet->cork.flags & IPCORK_OPT)
988                 opt = inet->cork.opt;
989
990         if (!(rt->u.dst.dev->features&NETIF_F_SG))
991                 return -EOPNOTSUPP;
992
993         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
994         mtu = inet->cork.fragsize;
995
996         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
997         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
998
999         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1000                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1001                 return -EMSGSIZE;
1002         }
1003
1004         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1005                 return -EINVAL;
1006
1007         inet->cork.length += size;
1008
1009         while (size > 0) {
1010                 int i;
1011
1012                 /* Check if the remaining data fits into current packet. */
1013                 len = mtu - skb->len;
1014                 if (len < size)
1015                         len = maxfraglen - skb->len;
1016                 if (len <= 0) {
1017                         struct sk_buff *skb_prev;
1018                         char *data;
1019                         struct iphdr *iph;
1020                         int alloclen;
1021
1022                         skb_prev = skb;
1023                         if (skb_prev)
1024                                 fraggap = skb_prev->len - maxfraglen;
1025                         else
1026                                 fraggap = 0;
1027
1028                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1029                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1030                         if (unlikely(!skb)) {
1031                                 err = -ENOBUFS;
1032                                 goto error;
1033                         }
1034
1035                         /*
1036                          *      Fill in the control structures
1037                          */
1038                         skb->ip_summed = CHECKSUM_NONE;
1039                         skb->csum = 0;
1040                         skb_reserve(skb, hh_len);
1041
1042                         /*
1043                          *      Find where to start putting bytes.
1044                          */
1045                         data = skb_put(skb, fragheaderlen + fraggap);
1046                         skb->nh.iph = iph = (struct iphdr *)data;
1047                         data += fragheaderlen;
1048                         skb->h.raw = data;
1049
1050                         if (fraggap) {
1051                                 skb->csum = skb_copy_and_csum_bits(
1052                                         skb_prev, maxfraglen,
1053                                         data, fraggap, 0);
1054                                 skb_prev->csum = csum_sub(skb_prev->csum,
1055                                                           skb->csum);
1056                                 skb_trim(skb_prev, maxfraglen);
1057                         }
1058
1059                         /*
1060                          * Put the packet on the pending queue.
1061                          */
1062                         __skb_queue_tail(&sk->sk_write_queue, skb);
1063                         continue;
1064                 }
1065
1066                 i = skb_shinfo(skb)->nr_frags;
1067                 if (len > size)
1068                         len = size;
1069                 if (skb_can_coalesce(skb, i, page, offset)) {
1070                         skb_shinfo(skb)->frags[i-1].size += len;
1071                 } else if (i < MAX_SKB_FRAGS) {
1072                         get_page(page);
1073                         skb_fill_page_desc(skb, i, page, offset, len);
1074                 } else {
1075                         err = -EMSGSIZE;
1076                         goto error;
1077                 }
1078
1079                 if (skb->ip_summed == CHECKSUM_NONE) {
1080                         unsigned int csum;
1081                         csum = csum_page(page, offset, len);
1082                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1083                 }
1084
1085                 skb->len += len;
1086                 skb->data_len += len;
1087                 offset += len;
1088                 size -= len;
1089         }
1090         return 0;
1091
1092 error:
1093         inet->cork.length -= size;
1094         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1095         return err;
1096 }
1097
1098 /*
1099  *      Combined all pending IP fragments on the socket as one IP datagram
1100  *      and push them out.
1101  */
1102 int ip_push_pending_frames(struct sock *sk)
1103 {
1104         struct sk_buff *skb, *tmp_skb;
1105         struct sk_buff **tail_skb;
1106         struct inet_sock *inet = inet_sk(sk);
1107         struct ip_options *opt = NULL;
1108         struct rtable *rt = inet->cork.rt;
1109         struct iphdr *iph;
1110         int df = 0;
1111         __u8 ttl;
1112         int err = 0;
1113
1114         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1115                 goto out;
1116         tail_skb = &(skb_shinfo(skb)->frag_list);
1117
1118         /* move skb->data to ip header from ext header */
1119         if (skb->data < skb->nh.raw)
1120                 __skb_pull(skb, skb->nh.raw - skb->data);
1121         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1122                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1123                 *tail_skb = tmp_skb;
1124                 tail_skb = &(tmp_skb->next);
1125                 skb->len += tmp_skb->len;
1126                 skb->data_len += tmp_skb->len;
1127                 skb->truesize += tmp_skb->truesize;
1128                 __sock_put(tmp_skb->sk);
1129                 tmp_skb->destructor = NULL;
1130                 tmp_skb->sk = NULL;
1131         }
1132
1133         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1134          * to fragment the frame generated here. No matter, what transforms
1135          * how transforms change size of the packet, it will come out.
1136          */
1137         if (inet->pmtudisc != IP_PMTUDISC_DO)
1138                 skb->local_df = 1;
1139
1140         /* DF bit is set when we want to see DF on outgoing frames.
1141          * If local_df is set too, we still allow to fragment this frame
1142          * locally. */
1143         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1144             (skb->len <= dst_mtu(&rt->u.dst) &&
1145              ip_dont_fragment(sk, &rt->u.dst)))
1146                 df = htons(IP_DF);
1147
1148         if (inet->cork.flags & IPCORK_OPT)
1149                 opt = inet->cork.opt;
1150
1151         if (rt->rt_type == RTN_MULTICAST)
1152                 ttl = inet->mc_ttl;
1153         else
1154                 ttl = ip_select_ttl(inet, &rt->u.dst);
1155
1156         iph = (struct iphdr *)skb->data;
1157         iph->version = 4;
1158         iph->ihl = 5;
1159         if (opt) {
1160                 iph->ihl += opt->optlen>>2;
1161                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1162         }
1163         iph->tos = inet->tos;
1164         iph->tot_len = htons(skb->len);
1165         iph->frag_off = df;
1166         if (!df) {
1167                 __ip_select_ident(iph, &rt->u.dst, 0);
1168         } else {
1169                 iph->id = htons(inet->id++);
1170         }
1171         iph->ttl = ttl;
1172         iph->protocol = sk->sk_protocol;
1173         iph->saddr = rt->rt_src;
1174         iph->daddr = rt->rt_dst;
1175         ip_send_check(iph);
1176
1177         skb->priority = sk->sk_priority;
1178         skb->dst = dst_clone(&rt->u.dst);
1179
1180         /* Netfilter gets whole the not fragmented skb. */
1181         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, 
1182                       skb->dst->dev, dst_output);
1183         if (err) {
1184                 if (err > 0)
1185                         err = inet->recverr ? net_xmit_errno(err) : 0;
1186                 if (err)
1187                         goto error;
1188         }
1189
1190 out:
1191         inet->cork.flags &= ~IPCORK_OPT;
1192         if (inet->cork.opt) {
1193                 kfree(inet->cork.opt);
1194                 inet->cork.opt = NULL;
1195         }
1196         if (inet->cork.rt) {
1197                 ip_rt_put(inet->cork.rt);
1198                 inet->cork.rt = NULL;
1199         }
1200         return err;
1201
1202 error:
1203         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1204         goto out;
1205 }
1206
1207 /*
1208  *      Throw away all pending data on the socket.
1209  */
1210 void ip_flush_pending_frames(struct sock *sk)
1211 {
1212         struct inet_sock *inet = inet_sk(sk);
1213         struct sk_buff *skb;
1214
1215         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1216                 kfree_skb(skb);
1217
1218         inet->cork.flags &= ~IPCORK_OPT;
1219         if (inet->cork.opt) {
1220                 kfree(inet->cork.opt);
1221                 inet->cork.opt = NULL;
1222         }
1223         if (inet->cork.rt) {
1224                 ip_rt_put(inet->cork.rt);
1225                 inet->cork.rt = NULL;
1226         }
1227 }
1228
1229
1230 /*
1231  *      Fetch data from kernel space and fill in checksum if needed.
1232  */
1233 static int ip_reply_glue_bits(void *dptr, char *to, int offset, 
1234                               int len, int odd, struct sk_buff *skb)
1235 {
1236         unsigned int csum;
1237
1238         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1239         skb->csum = csum_block_add(skb->csum, csum, odd);
1240         return 0;  
1241 }
1242
1243 /* 
1244  *      Generic function to send a packet as reply to another packet.
1245  *      Used to send TCP resets so far. ICMP should use this function too.
1246  *
1247  *      Should run single threaded per socket because it uses the sock 
1248  *      structure to pass arguments.
1249  *
1250  *      LATER: switch from ip_build_xmit to ip_append_*
1251  */
1252 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1253                    unsigned int len)
1254 {
1255         struct inet_sock *inet = inet_sk(sk);
1256         struct {
1257                 struct ip_options       opt;
1258                 char                    data[40];
1259         } replyopts;
1260         struct ipcm_cookie ipc;
1261         u32 daddr;
1262         struct rtable *rt = (struct rtable*)skb->dst;
1263
1264         if (ip_options_echo(&replyopts.opt, skb))
1265                 return;
1266
1267         daddr = ipc.addr = rt->rt_src;
1268         ipc.opt = NULL;
1269
1270         if (replyopts.opt.optlen) {
1271                 ipc.opt = &replyopts.opt;
1272
1273                 if (ipc.opt->srr)
1274                         daddr = replyopts.opt.faddr;
1275         }
1276
1277         {
1278                 struct flowi fl = { .nl_u = { .ip4_u =
1279                                               { .daddr = daddr,
1280                                                 .saddr = rt->rt_spec_dst,
1281                                                 .tos = RT_TOS(skb->nh.iph->tos) } },
1282                                     /* Not quite clean, but right. */
1283                                     .uli_u = { .ports =
1284                                                { .sport = skb->h.th->dest,
1285                                                  .dport = skb->h.th->source } },
1286                                     .proto = sk->sk_protocol };
1287                 if (ip_route_output_key(&rt, &fl))
1288                         return;
1289         }
1290
1291         /* And let IP do all the hard work.
1292
1293            This chunk is not reenterable, hence spinlock.
1294            Note that it uses the fact, that this function is called
1295            with locally disabled BH and that sk cannot be already spinlocked.
1296          */
1297         bh_lock_sock(sk);
1298         inet->tos = skb->nh.iph->tos;
1299         sk->sk_priority = skb->priority;
1300         sk->sk_protocol = skb->nh.iph->protocol;
1301         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1302                        &ipc, rt, MSG_DONTWAIT);
1303         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1304                 if (arg->csumoffset >= 0)
1305                         *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1306                 skb->ip_summed = CHECKSUM_NONE;
1307                 ip_push_pending_frames(sk);
1308         }
1309
1310         bh_unlock_sock(sk);
1311
1312         ip_rt_put(rt);
1313 }
1314
1315 void __init ip_init(void)
1316 {
1317         ip_rt_init();
1318         inet_initpeers();
1319
1320 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1321         igmp_mc_proc_init();
1322 #endif
1323 }
1324
1325 EXPORT_SYMBOL(ip_fragment);
1326 EXPORT_SYMBOL(ip_generic_getfrag);
1327 EXPORT_SYMBOL(ip_queue_xmit);
1328 EXPORT_SYMBOL(ip_send_check);