Merge git://git.kernel.org/pub/scm/linux/kernel/git/hpa/linux-2.6-inttypes
[linux-2.6] / net / ipv4 / ipip.c
1 /*
2  *      Linux NET3:     IP/IP protocol decoder.
3  *
4  *      Version: $Id: ipip.c,v 1.50 2001/10/02 02:22:36 davem Exp $
5  *
6  *      Authors:
7  *              Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
8  *
9  *      Fixes:
10  *              Alan Cox        :       Merged and made usable non modular (it's so tiny it's silly as
11  *                                      a module taking up 2 pages).
12  *              Alan Cox        :       Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
13  *                                      to keep ip_forward happy.
14  *              Alan Cox        :       More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
15  *              Kai Schulte     :       Fixed #defines for IP_FIREWALL->FIREWALL
16  *              David Woodhouse :       Perform some basic ICMP handling.
17  *                                      IPIP Routing without decapsulation.
18  *              Carlos Picoto   :       GRE over IP support
19  *              Alexey Kuznetsov:       Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
20  *                                      I do not want to merge them together.
21  *
22  *      This program is free software; you can redistribute it and/or
23  *      modify it under the terms of the GNU General Public License
24  *      as published by the Free Software Foundation; either version
25  *      2 of the License, or (at your option) any later version.
26  *
27  */
28
29 /* tunnel.c: an IP tunnel driver
30
31         The purpose of this driver is to provide an IP tunnel through
32         which you can tunnel network traffic transparently across subnets.
33
34         This was written by looking at Nick Holloway's dummy driver
35         Thanks for the great code!
36
37                 -Sam Lantinga   (slouken@cs.ucdavis.edu)  02/01/95
38
39         Minor tweaks:
40                 Cleaned up the code a little and added some pre-1.3.0 tweaks.
41                 dev->hard_header/hard_header_len changed to use no headers.
42                 Comments/bracketing tweaked.
43                 Made the tunnels use dev->name not tunnel: when error reporting.
44                 Added tx_dropped stat
45
46                 -Alan Cox       (Alan.Cox@linux.org) 21 March 95
47
48         Reworked:
49                 Changed to tunnel to destination gateway in addition to the
50                         tunnel's pointopoint address
51                 Almost completely rewritten
52                 Note:  There is currently no firewall or ICMP handling done.
53
54                 -Sam Lantinga   (slouken@cs.ucdavis.edu) 02/13/96
55
56 */
57
58 /* Things I wish I had known when writing the tunnel driver:
59
60         When the tunnel_xmit() function is called, the skb contains the
61         packet to be sent (plus a great deal of extra info), and dev
62         contains the tunnel device that _we_ are.
63
64         When we are passed a packet, we are expected to fill in the
65         source address with our source IP address.
66
67         What is the proper way to allocate, copy and free a buffer?
68         After you allocate it, it is a "0 length" chunk of memory
69         starting at zero.  If you want to add headers to the buffer
70         later, you'll have to call "skb_reserve(skb, amount)" with
71         the amount of memory you want reserved.  Then, you call
72         "skb_put(skb, amount)" with the amount of space you want in
73         the buffer.  skb_put() returns a pointer to the top (#0) of
74         that buffer.  skb->len is set to the amount of space you have
75         "allocated" with skb_put().  You can then write up to skb->len
76         bytes to that buffer.  If you need more, you can call skb_put()
77         again with the additional amount of space you need.  You can
78         find out how much more space you can allocate by calling
79         "skb_tailroom(skb)".
80         Now, to add header space, call "skb_push(skb, header_len)".
81         This creates space at the beginning of the buffer and returns
82         a pointer to this new space.  If later you need to strip a
83         header from a buffer, call "skb_pull(skb, header_len)".
84         skb_headroom() will return how much space is left at the top
85         of the buffer (before the main data).  Remember, this headroom
86         space must be reserved before the skb_put() function is called.
87         */
88
89 /*
90    This version of net/ipv4/ipip.c is a clone of net/ipv4/ip_gre.c
91
92    For comments look at net/ipv4/ip_gre.c --ANK
93  */
94
95
96 #include <linux/capability.h>
97 #include <linux/module.h>
98 #include <linux/types.h>
99 #include <linux/kernel.h>
100 #include <asm/uaccess.h>
101 #include <linux/skbuff.h>
102 #include <linux/netdevice.h>
103 #include <linux/in.h>
104 #include <linux/tcp.h>
105 #include <linux/udp.h>
106 #include <linux/if_arp.h>
107 #include <linux/mroute.h>
108 #include <linux/init.h>
109 #include <linux/netfilter_ipv4.h>
110 #include <linux/if_ether.h>
111
112 #include <net/sock.h>
113 #include <net/ip.h>
114 #include <net/icmp.h>
115 #include <net/ipip.h>
116 #include <net/inet_ecn.h>
117 #include <net/xfrm.h>
118 #include <net/net_namespace.h>
119 #include <net/netns/generic.h>
120
/* Number of buckets in each address-keyed tunnel hash table. */
#define HASH_SIZE  16
/*
 * Fold an IPv4 address (__be32) into a bucket index in the range 0..15.
 * The argument is parenthesized so that an expression argument, e.g.
 * HASH(a | b), hashes the value of the whole expression instead of being
 * torn apart by operator precedence inside the macro.
 */
#define HASH(addr) (((__force u32)(addr)^((__force u32)(addr)>>4))&0xF)
123
/* Key passed to net_generic() to fetch this module's per-namespace state. */
124 static int ipip_net_id;
/*
 * Per-network-namespace IPIP state: the tunnel hash tables and the
 * fallback tunnel device.
 */
125 struct ipip_net {
126         struct ip_tunnel *tunnels_r_l[HASH_SIZE];   /* searched on remote AND local address */
127         struct ip_tunnel *tunnels_r[HASH_SIZE];     /* searched on remote address only */
128         struct ip_tunnel *tunnels_l[HASH_SIZE];     /* searched on local address only */
129         struct ip_tunnel *tunnels_wc[1];            /* single catch-all slot, used when nothing else matches */
130         struct ip_tunnel **tunnels[4];              /* indexed by the prio computed in __ipip_bucket(); filled in outside this chunk - presumably with the four tables above, TODO confirm */
131
132         struct net_device *fb_tunnel_dev;           /* fallback device; its uninit clears tunnels_wc[0] */
133 };
134
/* Forward declarations of device callbacks defined further down the file. */
135 static int ipip_fb_tunnel_init(struct net_device *dev);
136 static int ipip_tunnel_init(struct net_device *dev);
137 static void ipip_tunnel_setup(struct net_device *dev);
138
/*
 * Guards the tunnel hash chains: taken for reading around lookups
 * (ipip_err, ipip_rcv) and for writing whenever a chain pointer is changed
 * (link, unlink, uninit of the fallback device).
 */
139 static DEFINE_RWLOCK(ipip_lock);
140
141 static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
142                 __be32 remote, __be32 local)
143 {
144         unsigned h0 = HASH(remote);
145         unsigned h1 = HASH(local);
146         struct ip_tunnel *t;
147         struct ipip_net *ipn = net_generic(net, ipip_net_id);
148
149         for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) {
150                 if (local == t->parms.iph.saddr &&
151                     remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
152                         return t;
153         }
154         for (t = ipn->tunnels_r[h0]; t; t = t->next) {
155                 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
156                         return t;
157         }
158         for (t = ipn->tunnels_l[h1]; t; t = t->next) {
159                 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
160                         return t;
161         }
162         if ((t = ipn->tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP))
163                 return t;
164         return NULL;
165 }
166
167 static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
168                 struct ip_tunnel_parm *parms)
169 {
170         __be32 remote = parms->iph.daddr;
171         __be32 local = parms->iph.saddr;
172         unsigned h = 0;
173         int prio = 0;
174
175         if (remote) {
176                 prio |= 2;
177                 h ^= HASH(remote);
178         }
179         if (local) {
180                 prio |= 1;
181                 h ^= HASH(local);
182         }
183         return &ipn->tunnels[prio][h];
184 }
185
186 static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
187                 struct ip_tunnel *t)
188 {
189         return __ipip_bucket(ipn, &t->parms);
190 }
191
192 static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
193 {
194         struct ip_tunnel **tp;
195
196         for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) {
197                 if (t == *tp) {
198                         write_lock_bh(&ipip_lock);
199                         *tp = t->next;
200                         write_unlock_bh(&ipip_lock);
201                         break;
202                 }
203         }
204 }
205
206 static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
207 {
208         struct ip_tunnel **tp = ipip_bucket(ipn, t);
209
210         t->next = *tp;
211         write_lock_bh(&ipip_lock);
212         *tp = t;
213         write_unlock_bh(&ipip_lock);
214 }
215
216 static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
217                 struct ip_tunnel_parm *parms, int create)
218 {
219         __be32 remote = parms->iph.daddr;
220         __be32 local = parms->iph.saddr;
221         struct ip_tunnel *t, **tp, *nt;
222         struct net_device *dev;
223         char name[IFNAMSIZ];
224         struct ipip_net *ipn = net_generic(net, ipip_net_id);
225
226         for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) {
227                 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
228                         return t;
229         }
230         if (!create)
231                 return NULL;
232
233         if (parms->name[0])
234                 strlcpy(name, parms->name, IFNAMSIZ);
235         else
236                 sprintf(name, "tunl%%d");
237
238         dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
239         if (dev == NULL)
240                 return NULL;
241
242         dev_net_set(dev, net);
243
244         if (strchr(name, '%')) {
245                 if (dev_alloc_name(dev, name) < 0)
246                         goto failed_free;
247         }
248
249         nt = netdev_priv(dev);
250         dev->init = ipip_tunnel_init;
251         nt->parms = *parms;
252
253         if (register_netdevice(dev) < 0)
254                 goto failed_free;
255
256         dev_hold(dev);
257         ipip_tunnel_link(ipn, nt);
258         return nt;
259
260 failed_free:
261         free_netdev(dev);
262         return NULL;
263 }
264
265 static void ipip_tunnel_uninit(struct net_device *dev)
266 {
267         struct net *net = dev_net(dev);
268         struct ipip_net *ipn = net_generic(net, ipip_net_id);
269
270         if (dev == ipn->fb_tunnel_dev) {
271                 write_lock_bh(&ipip_lock);
272                 ipn->tunnels_wc[0] = NULL;
273                 write_unlock_bh(&ipip_lock);
274         } else
275                 ipip_tunnel_unlink(ipn, netdev_priv(dev));
276         dev_put(dev);
277 }
278
/*
 * ipip_err - ICMP error handler for IPIP.
 *
 * skb->data is read as an IP header (the header of the packet the ICMP
 * message refers to); the ICMP type and code come from icmp_hdr(skb).
 *
 * Variant that is built when I_WISH_WORLD_WERE_PERFECT is NOT defined:
 * nothing is relayed to the original sender.  ICMP types/codes that are not
 * filtered out below cause a lookup of the tunnel and an update of its
 * err_count / err_time; ipip_tunnel_xmit() later consumes err_count.
 * @info is not used by this variant.
 *
 * Returns 0, or -ENOENT when no tunnel with a configured remote address
 * matches.
 *
 * NOTE(review): the #else variant references dp, len and key, none of which
 * is declared in this function, so it cannot compile as written.
 */
279 static int ipip_err(struct sk_buff *skb, u32 info)
280 {
281 #ifndef I_WISH_WORLD_WERE_PERFECT
282
283 /* It is not :-( All the routers (except for Linux) return only
284    8 bytes of packet payload. It means, that precise relaying of
285    ICMP in the real Internet is absolutely infeasible.
286  */
287         struct iphdr *iph = (struct iphdr*)skb->data;
288         const int type = icmp_hdr(skb)->type;
289         const int code = icmp_hdr(skb)->code;
290         struct ip_tunnel *t;
291         int err;
292
293         switch (type) {
294         default:
295         case ICMP_PARAMETERPROB:
            /* Unknown types and parameter problems are ignored. */
296                 return 0;
297
298         case ICMP_DEST_UNREACH:
299                 switch (code) {
300                 case ICMP_SR_FAILED:
301                 case ICMP_PORT_UNREACH:
302                         /* Impossible event. */
303                         return 0;
304                 case ICMP_FRAG_NEEDED:
305                         /* Soft state for pmtu is maintained by IP core. */
306                         return 0;
307                 default:
308                         /* All others are translated to HOST_UNREACH.
309                            rfc2003 contains "deep thoughts" about NET_UNREACH,
310                            I believe they are just ether pollution. --ANK
311                          */
312                         break;
313                 }
314                 break;
315         case ICMP_TIME_EXCEEDED:
            /* Only "TTL exceeded in transit" is of interest. */
316                 if (code != ICMP_EXC_TTL)
317                         return 0;
318                 break;
319         }
320
321         err = -ENOENT;
322
323         read_lock(&ipip_lock);
            /* Arguments are (remote, local): the header's daddr is matched
               against the tunnel's remote end, its saddr against the local end. */
324         t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
            /* No tunnel, or a tunnel without a configured remote address. */
325         if (t == NULL || t->parms.iph.daddr == 0)
326                 goto out;
327
328         err = 0;
            /* With no TTL configured on the tunnel, TIME_EXCEEDED is not counted. */
329         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
330                 goto out;
331
            /* Count errors arriving within IPTUNNEL_ERR_TIMEO of the previous
               one; otherwise restart the count at 1. */
332         if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
333                 t->err_count++;
334         else
335                 t->err_count = 1;
336         t->err_time = jiffies;
337 out:
338         read_unlock(&ipip_lock);
339         return err;
340 #else
        /* Relaying variant: rebuilds an ICMP error for the inner packet.
           Not built unless I_WISH_WORLD_WERE_PERFECT is defined. */
341         struct iphdr *iph = (struct iphdr*)dp;
342         int hlen = iph->ihl<<2;
343         struct iphdr *eiph;
344         const int type = icmp_hdr(skb)->type;
345         const int code = icmp_hdr(skb)->code;
346         int rel_type = 0;
347         int rel_code = 0;
348         __be32 rel_info = 0;
349         __u32 n = 0;
350         struct sk_buff *skb2;
351         struct flowi fl;
352         struct rtable *rt;
353
            /* Need the outer header plus a full inner IP header. */
354         if (len < hlen + sizeof(struct iphdr))
355                 return 0;
            /* eiph: the encapsulated (inner) IP header. */
356         eiph = (struct iphdr*)(dp + hlen);
357
358         switch (type) {
359         default:
360                 return 0;
361         case ICMP_PARAMETERPROB:
                /* Top byte of the field is taken as the offending offset. */
362                 n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
                /* Offset inside the outer header: nothing to relay. */
363                 if (n < hlen)
364                         return 0;
365
366                 /* So... This guy found something strange INSIDE encapsulated
367                    packet. Well, he is fool, but what can we do ?
368                  */
369                 rel_type = ICMP_PARAMETERPROB;
                /* Re-base the offset onto the inner packet. */
370                 rel_info = htonl((n - hlen) << 24);
371                 break;
372
373         case ICMP_DEST_UNREACH:
374                 switch (code) {
375                 case ICMP_SR_FAILED:
376                 case ICMP_PORT_UNREACH:
377                         /* Impossible event. */
378                         return 0;
379                 case ICMP_FRAG_NEEDED:
380                         /* And it is the only really necessary thing :-) */
381                         n = ntohs(icmp_hdr(skb)->un.frag.mtu);
382                         if (n < hlen+68)
383                                 return 0;
                        /* MTU as seen by the inner packet. */
384                         n -= hlen;
385                         /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
386                         if (n > ntohs(eiph->tot_len))
387                                 return 0;
388                         rel_info = htonl(n);
389                         break;
390                 default:
391                         /* All others are translated to HOST_UNREACH.
392                            rfc2003 contains "deep thoughts" about NET_UNREACH,
393                            I believe, it is just ether pollution. --ANK
394                          */
395                         rel_type = ICMP_DEST_UNREACH;
396                         rel_code = ICMP_HOST_UNREACH;
397                         break;
398                 }
399                 break;
400         case ICMP_TIME_EXCEEDED:
401                 if (code != ICMP_EXC_TTL)
402                         return 0;
403                 break;
404         }
405
406         /* Prepare fake skb to feed it to icmp_send */
407         skb2 = skb_clone(skb, GFP_ATOMIC);
408         if (skb2 == NULL)
409                 return 0;
410         dst_release(skb2->dst);
411         skb2->dst = NULL;
            /* NOTE(review): the length is skb->data - eiph; confirm the sign
               is what is intended if this branch is ever revived. */
412         skb_pull(skb2, skb->data - (u8*)eiph);
413         skb_reset_network_header(skb2);
414
415         /* Try to guess incoming interface */
416         memset(&fl, 0, sizeof(fl));
417         fl.fl4_daddr = eiph->saddr;
418         fl.fl4_tos = RT_TOS(eiph->tos);
419         fl.proto = IPPROTO_IPIP;
420         if (ip_route_output_key(dev_net(skb->dev), &rt, &key)) {
421                 kfree_skb(skb2);
422                 return 0;
423         }
424         skb2->dev = rt->u.dst.dev;
425
426         /* route "incoming" packet */
427         if (rt->rt_flags&RTCF_LOCAL) {
428                 ip_rt_put(rt);
429                 rt = NULL;
430                 fl.fl4_daddr = eiph->daddr;
431                 fl.fl4_src = eiph->saddr;
432                 fl.fl4_tos = eiph->tos;
                /* Give up unless the route leads out through a tunnel device. */
433                 if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) ||
434                     rt->u.dst.dev->type != ARPHRD_TUNNEL) {
435                         ip_rt_put(rt);
436                         kfree_skb(skb2);
437                         return 0;
438                 }
439         } else {
440                 ip_rt_put(rt);
441                 if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
442                     skb2->dst->dev->type != ARPHRD_TUNNEL) {
443                         kfree_skb(skb2);
444                         return 0;
445                 }
446         }
447
448         /* change mtu on this route */
449         if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
450                 if (n > dst_mtu(skb2->dst)) {
451                         kfree_skb(skb2);
452                         return 0;
453                 }
454                 skb2->dst->ops->update_pmtu(skb2->dst, n);
455         } else if (type == ICMP_TIME_EXCEEDED) {
456                 struct ip_tunnel *t = netdev_priv(skb2->dev);
                /* A tunnel with a fixed TTL reports HOST_UNREACH instead. */
457                 if (t->parms.iph.ttl) {
458                         rel_type = ICMP_DEST_UNREACH;
459                         rel_code = ICMP_HOST_UNREACH;
460                 }
461         }
462
463         icmp_send(skb2, rel_type, rel_code, rel_info);
464         kfree_skb(skb2);
465         return 0;
466 #endif
467 }
468
469 static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
470                                         struct sk_buff *skb)
471 {
472         struct iphdr *inner_iph = ip_hdr(skb);
473
474         if (INET_ECN_is_ce(outer_iph->tos))
475                 IP_ECN_set_ce(inner_iph);
476 }
477
478 static int ipip_rcv(struct sk_buff *skb)
479 {
480         struct ip_tunnel *tunnel;
481         const struct iphdr *iph = ip_hdr(skb);
482
483         read_lock(&ipip_lock);
484         if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev),
485                                         iph->saddr, iph->daddr)) != NULL) {
486                 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
487                         read_unlock(&ipip_lock);
488                         kfree_skb(skb);
489                         return 0;
490                 }
491
492                 secpath_reset(skb);
493
494                 skb->mac_header = skb->network_header;
495                 skb_reset_network_header(skb);
496                 skb->protocol = htons(ETH_P_IP);
497                 skb->pkt_type = PACKET_HOST;
498
499                 tunnel->stat.rx_packets++;
500                 tunnel->stat.rx_bytes += skb->len;
501                 skb->dev = tunnel->dev;
502                 dst_release(skb->dst);
503                 skb->dst = NULL;
504                 nf_reset(skb);
505                 ipip_ecn_decapsulate(iph, skb);
506                 netif_rx(skb);
507                 read_unlock(&ipip_lock);
508                 return 0;
509         }
510         read_unlock(&ipip_lock);
511
512         return -1;
513 }
514
515 /*
516  *      This function assumes it is being called from dev_queue_xmit()
517  *      and that skb is filled properly by that function.
518  */
519
/*
 * ipip_tunnel_xmit - hard_start_xmit handler: wrap an IPv4 packet in an
 * outer IPv4 header and send it towards the tunnel's remote end.
 *
 * @skb: packet to encapsulate; consumed on every path
 * @dev: the tunnel device
 *
 * Always returns 0.  Failures are reported through the tunnel's statistics
 * (and, on some paths, dst_link_failure() / icmp_send()) and the skb is
 * freed.
 *
 * NOTE(review): IPTUNNEL_XMIT() is invoked without arguments, so it
 * presumably picks up locals of this function by name - check its
 * definition before renaming any local here.
 */
520 static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
521 {
522         struct ip_tunnel *tunnel = netdev_priv(dev);
523         struct net_device_stats *stats = &tunnel->stat;
524         struct iphdr  *tiph = &tunnel->parms.iph;        /* configured outer-header template */
525         u8     tos = tunnel->parms.iph.tos;
526         __be16 df = tiph->frag_off;
527         struct rtable *rt;                      /* Route to the other host */
528         struct net_device *tdev;                        /* Device to other host */
529         struct iphdr  *old_iph = ip_hdr(skb);   /* inner (original) IP header */
530         struct iphdr  *iph;                     /* Our new IP header */
531         unsigned int max_headroom;              /* The extra header space needed */
532         __be32 dst = tiph->daddr;
533         int    mtu;
534
            /* A non-zero counter means this tunnel is already inside xmit:
               count a collision and drop.  Every exit path decrements it. */
535         if (tunnel->recursion++) {
536                 tunnel->stat.collisions++;
537                 goto tx_error;
538         }
539
            /* Only IPv4 payloads are accepted. */
540         if (skb->protocol != htons(ETH_P_IP))
541                 goto tx_error;
542
            /* Bit 0 of the configured TOS set: use the inner packet's TOS. */
543         if (tos&1)
544                 tos = old_iph->tos;
545
            /* No remote address configured: use the gateway of the route the
               packet already carries. */
546         if (!dst) {
547                 /* NBMA tunnel */
548                 if ((rt = skb->rtable) == NULL) {
549                         tunnel->stat.tx_fifo_errors++;
550                         goto tx_error;
551                 }
552                 if ((dst = rt->rt_gateway) == 0)
553                         goto tx_error_icmp;
554         }
555
            /* Route the outer packet. */
556         {
557                 struct flowi fl = { .oif = tunnel->parms.link,
558                                     .nl_u = { .ip4_u =
559                                               { .daddr = dst,
560                                                 .saddr = tiph->saddr,
561                                                 .tos = RT_TOS(tos) } },
562                                     .proto = IPPROTO_IPIP };
563                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
564                         tunnel->stat.tx_carrier_errors++;
565                         goto tx_error_icmp;
566                 }
567         }
568         tdev = rt->u.dst.dev;
569
            /* The route points back at this tunnel device: drop. */
570         if (tdev == dev) {
571                 ip_rt_put(rt);
572                 tunnel->stat.collisions++;
573                 goto tx_error;
574         }
575
            /* With a non-zero frag_off configured, the usable MTU is the
               outer route's MTU minus the added header; otherwise take the
               inner route's MTU, or the device MTU if there is no route. */
576         if (tiph->frag_off)
577                 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
578         else
579                 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
580
581         if (mtu < 68) {
582                 tunnel->stat.collisions++;
583                 ip_rt_put(rt);
584                 goto tx_error;
585         }
586         if (skb->dst)
587                 skb->dst->ops->update_pmtu(skb->dst, mtu);
588
            /* Copy the inner packet's DF bit into the outer header. */
589         df |= (old_iph->frag_off&htons(IP_DF));
590
            /* Inner packet has DF set and does not fit: tell the sender. */
591         if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) {
592                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
593                 ip_rt_put(rt);
594                 goto tx_error;
595         }
596
            /* Errors recorded by ipip_err(): while they are recent, each
               transmit consumes one and signals link failure; stale counts
               are reset. */
597         if (tunnel->err_count > 0) {
598                 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
599                         tunnel->err_count--;
600                         dst_link_failure(skb);
601                 } else
602                         tunnel->err_count = 0;
603         }
604
605         /*
606          * Okay, now see if we can stuff it in the buffer as-is.
607          */
608         max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
609
            /* Reallocate when there is too little headroom or the buffer is
               shared / cloned and not writable. */
610         if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
611             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
612                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
613                 if (!new_skb) {
614                         ip_rt_put(rt);
615                         stats->tx_dropped++;
616                         dev_kfree_skb(skb);
617                         tunnel->recursion--;
618                         return 0;
619                 }
620                 if (skb->sk)
621                         skb_set_owner_w(new_skb, skb->sk);
622                 dev_kfree_skb(skb);
623                 skb = new_skb;
                /* Re-read: the inner header lives in the new buffer now. */
624                 old_iph = ip_hdr(skb);
625         }
626
            /* The inner IP header becomes the transport header; make room
               for the outer header in front of it. */
627         skb->transport_header = skb->network_header;
628         skb_push(skb, sizeof(struct iphdr));
629         skb_reset_network_header(skb);
630         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
631         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
632                               IPSKB_REROUTED);
            /* Swap the inner route for the outer one; the skb takes over the
               reference obtained by the route lookup above. */
633         dst_release(skb->dst);
634         skb->dst = &rt->u.dst;
635
636         /*
637          *      Push down and install the IPIP header.
638          */
639
640         iph                     =       ip_hdr(skb);
641         iph->version            =       4;
642         iph->ihl                =       sizeof(struct iphdr)>>2;
643         iph->frag_off           =       df;
644         iph->protocol           =       IPPROTO_IPIP;
645         iph->tos                =       INET_ECN_encapsulate(tos, old_iph->tos);
646         iph->daddr              =       rt->rt_dst;
647         iph->saddr              =       rt->rt_src;
648
            /* A configured TTL of 0 means: copy the inner packet's TTL. */
649         if ((iph->ttl = tiph->ttl) == 0)
650                 iph->ttl        =       old_iph->ttl;
651
652         nf_reset(skb);
653
654         IPTUNNEL_XMIT();
655         tunnel->recursion--;
656         return 0;
657
658 tx_error_icmp:
659         dst_link_failure(skb);
660 tx_error:
661         stats->tx_errors++;
662         dev_kfree_skb(skb);
663         tunnel->recursion--;
664         return 0;
665 }
666
667 static void ipip_tunnel_bind_dev(struct net_device *dev)
668 {
669         struct net_device *tdev = NULL;
670         struct ip_tunnel *tunnel;
671         struct iphdr *iph;
672
673         tunnel = netdev_priv(dev);
674         iph = &tunnel->parms.iph;
675
676         if (iph->daddr) {
677                 struct flowi fl = { .oif = tunnel->parms.link,
678                                     .nl_u = { .ip4_u =
679                                               { .daddr = iph->daddr,
680                                                 .saddr = iph->saddr,
681                                                 .tos = RT_TOS(iph->tos) } },
682                                     .proto = IPPROTO_IPIP };
683                 struct rtable *rt;
684                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
685                         tdev = rt->u.dst.dev;
686                         ip_rt_put(rt);
687                 }
688                 dev->flags |= IFF_POINTOPOINT;
689         }
690
691         if (!tdev && tunnel->parms.link)
692                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
693
694         if (tdev) {
695                 dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
696                 dev->mtu = tdev->mtu - sizeof(struct iphdr);
697         }
698         dev->iflink = tunnel->parms.link;
699 }
700
/*
 * ipip_tunnel_ioctl - do_ioctl handler implementing the tunnel get / add /
 * change / delete commands.
 *
 * @dev: device the ioctl was issued on
 * @ifr: request; ifr->ifr_ifru.ifru_data is a user pointer to a
 *       struct ip_tunnel_parm
 * @cmd: SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL or SIOCDELTUNNEL
 *
 * Returns 0 on success or a negative errno: -EFAULT (user copy failed),
 * -EPERM (missing CAP_NET_ADMIN, or attempt to delete the fallback device),
 * -EINVAL (bad parameters or unknown command), -EEXIST, -ENOBUFS, -ENOENT.
 */
701 static int
702 ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
703 {
704         int err = 0;
705         struct ip_tunnel_parm p;
706         struct ip_tunnel *t;
707         struct net *net = dev_net(dev);
708         struct ipip_net *ipn = net_generic(net, ipip_net_id);
709
710         switch (cmd) {
711         case SIOCGETTUNNEL:
712                 t = NULL;
                /* On the fallback device the caller's parameters select
                   which tunnel to report. */
713                 if (dev == ipn->fb_tunnel_dev) {
714                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
715                                 err = -EFAULT;
716                                 break;
717                         }
718                         t = ipip_tunnel_locate(net, &p, 0);
719                 }
                /* Otherwise (or if nothing matched) report this device. */
720                 if (t == NULL)
721                         t = netdev_priv(dev);
722                 memcpy(&p, &t->parms, sizeof(p));
723                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
724                         err = -EFAULT;
725                 break;
726
727         case SIOCADDTUNNEL:
728         case SIOCCHGTUNNEL:
729                 err = -EPERM;
730                 if (!capable(CAP_NET_ADMIN))
731                         goto done;
732
733                 err = -EFAULT;
734                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
735                         goto done;
736
                /* Accept only an option-less IPv4/IPIP header template in
                   which frag_off carries nothing besides (optionally) DF. */
737                 err = -EINVAL;
738                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
739                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
740                         goto done;
                /* A fixed TTL forces DF on. */
741                 if (p.iph.ttl)
742                         p.iph.frag_off |= htons(IP_DF);
743
                /* Creates the tunnel only for SIOCADDTUNNEL. */
744                 t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
745
                /* Changing a specific (non-fallback) tunnel device. */
746                 if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
747                         if (t != NULL) {
                                /* The requested endpoints already belong
                                   to a different tunnel. */
748                                 if (t->dev != dev) {
749                                         err = -EEXIST;
750                                         break;
751                                 }
752                         } else {
                                /* A point-to-point tunnel must keep a remote
                                   address; any other must not gain one. */
753                                 if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
754                                     (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
755                                         err = -EINVAL;
756                                         break;
757                                 }
                                /* Re-hash: unlink, update the endpoints,
                                   link again under the new addresses. */
758                                 t = netdev_priv(dev);
759                                 ipip_tunnel_unlink(ipn, t);
760                                 t->parms.iph.saddr = p.iph.saddr;
761                                 t->parms.iph.daddr = p.iph.daddr;
762                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
763                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
764                                 ipip_tunnel_link(ipn, t);
765                                 netdev_state_change(dev);
766                         }
767                 }
768
769                 if (t) {
770                         err = 0;
771                         if (cmd == SIOCCHGTUNNEL) {
772                                 t->parms.iph.ttl = p.iph.ttl;
773                                 t->parms.iph.tos = p.iph.tos;
774                                 t->parms.iph.frag_off = p.iph.frag_off;
                                /* A new underlying link needs the header
                                   length / MTU recomputed. */
775                                 if (t->parms.link != p.link) {
776                                         t->parms.link = p.link;
777                                         ipip_tunnel_bind_dev(dev);
778                                         netdev_state_change(dev);
779                                 }
780                         }
                        /* Hand the resulting parameters back to the caller. */
781                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
782                                 err = -EFAULT;
783                 } else
784                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
785                 break;
786
787         case SIOCDELTUNNEL:
788                 err = -EPERM;
789                 if (!capable(CAP_NET_ADMIN))
790                         goto done;
791
                /* Issued on the fallback device: the caller's parameters
                   name the tunnel to delete; the fallback device itself may
                   not be deleted. */
792                 if (dev == ipn->fb_tunnel_dev) {
793                         err = -EFAULT;
794                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
795                                 goto done;
796                         err = -ENOENT;
797                         if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL)
798                                 goto done;
799                         err = -EPERM;
800                         if (t->dev == ipn->fb_tunnel_dev)
801                                 goto done;
802                         dev = t->dev;
803                 }
804                 unregister_netdevice(dev);
805                 err = 0;
806                 break;
807
808         default:
809                 err = -EINVAL;
810         }
811
812 done:
813         return err;
814 }
815
816 static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev)
817 {
818         return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
819 }
820
821 static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
822 {
823         if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
824                 return -EINVAL;
825         dev->mtu = new_mtu;
826         return 0;
827 }
828
829 static void ipip_tunnel_setup(struct net_device *dev)
830 {
831         dev->uninit             = ipip_tunnel_uninit;
832         dev->hard_start_xmit    = ipip_tunnel_xmit;
833         dev->get_stats          = ipip_tunnel_get_stats;
834         dev->do_ioctl           = ipip_tunnel_ioctl;
835         dev->change_mtu         = ipip_tunnel_change_mtu;
836         dev->destructor         = free_netdev;
837
838         dev->type               = ARPHRD_TUNNEL;
839         dev->hard_header_len    = LL_MAX_HEADER + sizeof(struct iphdr);
840         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr);
841         dev->flags              = IFF_NOARP;
842         dev->iflink             = 0;
843         dev->addr_len           = 4;
844         dev->features           |= NETIF_F_NETNS_LOCAL;
845 }
846
847 static int ipip_tunnel_init(struct net_device *dev)
848 {
849         struct ip_tunnel *tunnel;
850
851         tunnel = netdev_priv(dev);
852
853         tunnel->dev = dev;
854         strcpy(tunnel->parms.name, dev->name);
855
856         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
857         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
858
859         ipip_tunnel_bind_dev(dev);
860
861         return 0;
862 }
863
864 static int ipip_fb_tunnel_init(struct net_device *dev)
865 {
866         struct ip_tunnel *tunnel = netdev_priv(dev);
867         struct iphdr *iph = &tunnel->parms.iph;
868         struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id);
869
870         tunnel->dev = dev;
871         strcpy(tunnel->parms.name, dev->name);
872
873         iph->version            = 4;
874         iph->protocol           = IPPROTO_IPIP;
875         iph->ihl                = 5;
876
877         dev_hold(dev);
878         ipn->tunnels_wc[0]      = tunnel;
879         return 0;
880 }
881
882 static struct xfrm_tunnel ipip_handler = {
883         .handler        =       ipip_rcv,
884         .err_handler    =       ipip_err,
885         .priority       =       1,
886 };
887
888 static char banner[] __initdata =
889         KERN_INFO "IPv4 over IPv4 tunneling driver\n";
890
891 static void ipip_destroy_tunnels(struct ipip_net *ipn)
892 {
893         int prio;
894
895         for (prio = 1; prio < 4; prio++) {
896                 int h;
897                 for (h = 0; h < HASH_SIZE; h++) {
898                         struct ip_tunnel *t;
899                         while ((t = ipn->tunnels[prio][h]) != NULL)
900                                 unregister_netdevice(t->dev);
901                 }
902         }
903 }
904
905 static int ipip_init_net(struct net *net)
906 {
907         int err;
908         struct ipip_net *ipn;
909
910         err = -ENOMEM;
911         ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL);
912         if (ipn == NULL)
913                 goto err_alloc;
914
915         err = net_assign_generic(net, ipip_net_id, ipn);
916         if (err < 0)
917                 goto err_assign;
918
919         ipn->tunnels[0] = ipn->tunnels_wc;
920         ipn->tunnels[1] = ipn->tunnels_l;
921         ipn->tunnels[2] = ipn->tunnels_r;
922         ipn->tunnels[3] = ipn->tunnels_r_l;
923
924         ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
925                                            "tunl0",
926                                            ipip_tunnel_setup);
927         if (!ipn->fb_tunnel_dev) {
928                 err = -ENOMEM;
929                 goto err_alloc_dev;
930         }
931
932         ipn->fb_tunnel_dev->init = ipip_fb_tunnel_init;
933         dev_net_set(ipn->fb_tunnel_dev, net);
934
935         if ((err = register_netdev(ipn->fb_tunnel_dev)))
936                 goto err_reg_dev;
937
938         return 0;
939
940 err_reg_dev:
941         free_netdev(ipn->fb_tunnel_dev);
942 err_alloc_dev:
943         /* nothing */
944 err_assign:
945         kfree(ipn);
946 err_alloc:
947         return err;
948 }
949
950 static void ipip_exit_net(struct net *net)
951 {
952         struct ipip_net *ipn;
953
954         ipn = net_generic(net, ipip_net_id);
955         rtnl_lock();
956         ipip_destroy_tunnels(ipn);
957         unregister_netdevice(ipn->fb_tunnel_dev);
958         rtnl_unlock();
959         kfree(ipn);
960 }
961
962 static struct pernet_operations ipip_net_ops = {
963         .init = ipip_init_net,
964         .exit = ipip_exit_net,
965 };
966
967 static int __init ipip_init(void)
968 {
969         int err;
970
971         printk(banner);
972
973         if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) {
974                 printk(KERN_INFO "ipip init: can't register tunnel\n");
975                 return -EAGAIN;
976         }
977
978         err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops);
979         if (err)
980                 xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
981
982         return err;
983 }
984
985 static void __exit ipip_fini(void)
986 {
987         if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
988                 printk(KERN_INFO "ipip close: can't deregister tunnel\n");
989
990         unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops);
991 }
992
993 module_init(ipip_init);
994 module_exit(ipip_fini);
995 MODULE_LICENSE("GPL");