Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-2.6] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/if_ether.h>
31
32 #include <net/sock.h>
33 #include <net/ip.h>
34 #include <net/icmp.h>
35 #include <net/protocol.h>
36 #include <net/ipip.h>
37 #include <net/arp.h>
38 #include <net/checksum.h>
39 #include <net/dsfield.h>
40 #include <net/inet_ecn.h>
41 #include <net/xfrm.h>
42 #include <net/net_namespace.h>
43 #include <net/netns/generic.h>
44
45 #ifdef CONFIG_IPV6
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #endif
50
51 /*
52    Problems & solutions
53    --------------------
54
55    1. The most important issue is detecting local dead loops.
56    They would cause complete host lockup in transmit, which
57    would be "resolved" by stack overflow or, if queueing is enabled,
58    with infinite looping in net_bh.
59
60    We cannot track such dead loops during route installation,
61    it is infeasible task. The most general solutions would be
62    to keep skb->encapsulation counter (sort of local ttl),
63    and silently drop packet when it expires. It is the best
64    solution, but it supposes maintaining new variable in ALL
65    skb, even if no tunneling is used.
66
67    Current solution: t->recursion lock breaks dead loops. It looks
68    like dev->tbusy flag, but I preferred new variable, because
69    the semantics is different. One day, when hard_start_xmit
70    will be multithreaded we will have to use skb->encapsulation.
71
72
73
74    2. Networking dead loops would not kill routers, but would really
75    kill network. IP hop limit plays role of "t->recursion" in this case,
76    if we copy it from packet being encapsulated to upper header.
77    It is very good solution, but it introduces two problems:
78
79    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
80      do not work over tunnels.
81    - traceroute does not work. I planned to relay ICMP from tunnel,
82      so that this problem would be solved and traceroute output
83      would even more informative. This idea appeared to be wrong:
84      only Linux complies to rfc1812 now (yes, guys, Linux is the only
85      true router now :-)), all routers (at least, in neighbourhood of mine)
86      return only 8 bytes of payload. It is the end.
87
88    Hence, if we want that OSPF worked or traceroute said something reasonable,
89    we should search for another solution.
90
91    One of them is to parse packet trying to detect inner encapsulation
92    made by our node. It is difficult or even impossible, especially,
93    taking into account fragmentation. To be short, it is not a solution at all.
94
95    Current solution: The solution was UNEXPECTEDLY SIMPLE.
96    We force DF flag on tunnels with preconfigured hop limit,
97    that is ALL. :-) Well, it does not remove the problem completely,
98    but exponential growth of network traffic is changed to linear
99    (branches, that exceed pmtu are pruned) and tunnel mtu
100    fastly degrades to value <68, where looping stops.
101    Yes, it is not good if there exists a router in the loop,
102    which does not force DF, even when encapsulating packets have DF set.
103    But it is not our problem! Nobody could accuse us, we made
104    all that we could make. Even if it is your gated who injected
105    fatal route to network, even if it were you who configured
106    fatal static route: you are innocent. :-)
107
108
109
110    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
111    practically identical code. It would be good to glue them
112    together, but it is not very evident, how to make them modular.
113    sit is integral part of IPv6, ipip and gre are naturally modular.
114    We could extract common parts (hash table, ioctl etc)
115    to a separate module (ip_tunnel.c).
116
117    Alexey Kuznetsov.
118  */
119
120 static int ipgre_tunnel_init(struct net_device *dev);
121 static void ipgre_tunnel_setup(struct net_device *dev);
122
123 /* Fallback tunnel: no source, no destination, no key, no options */
124
125 static int ipgre_fb_tunnel_init(struct net_device *dev);
126
/* Number of buckets in each tunnel hash table (HASH() yields a 4-bit index). */
127 #define HASH_SIZE  16
128
/* Id passed to net_generic() to fetch this module's struct ipgre_net. */
129 static int ipgre_net_id;
/*
 * State kept for each struct net:
 *  - tunnels: four hash tables of configured tunnels; the first index is
 *    the "prio" computed in __ipgre_bucket() (bit 0: local address set,
 *    bit 1: unicast remote address set), the second is the hash bucket.
 *  - fb_tunnel_dev: fallback device; ipgre_tunnel_lookup() returns its
 *    private data when no configured tunnel matches and the device is up.
 */
130 struct ipgre_net {
131         struct ip_tunnel *tunnels[4][HASH_SIZE];
132
133         struct net_device *fb_tunnel_dev;
134 };
135
136 /* Tunnel hash table */
137
138 /*
139    4 hash tables:
140
141    3: (remote,local)
142    2: (remote,*)
143    1: (*,local)
144    0: (*,*)
145
146    We require exact key match i.e. if a key is present in packet
147    it will match only tunnel with the same key; if it is not present,
148    it will match only keyless tunnel.
149
150    All keyless packets, if not matched by configured keyless tunnels,
151    will match fallback tunnel.
152  */
153
/*
 * Fold a 32-bit address or key into a 4-bit bucket index by XOR-ing the
 * value with itself shifted right by four and keeping the low nibble.
 *
 * The argument is fully parenthesized so that compound expressions such
 * as HASH(a | b) hash the value of the whole expression.  Note that the
 * argument is still evaluated twice: do not pass expressions with side
 * effects.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)

/*
 * Shorthands for the four per-namespace hash tables, named after the
 * addresses a tunnel in that table has configured.
 */
#define tunnels_r_l     tunnels[3]	/* (remote,local) */
#define tunnels_r       tunnels[2]	/* (remote,*)     */
#define tunnels_l       tunnels[1]	/* (*,local)      */
#define tunnels_wc      tunnels[0]	/* (*,*)          */
160
/*
 * Guards the tunnel hash chains: taken for writing (BH-disabled) while a
 * bucket link is updated in ipgre_tunnel_link()/ipgre_tunnel_unlink(), and
 * for reading around ipgre_tunnel_lookup() in ipgre_rcv() and ipgre_err().
 */
161 static DEFINE_RWLOCK(ipgre_lock);
162
163 /* Given src, dst and key, find appropriate for input tunnel. */
164
165 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
166                 __be32 remote, __be32 local, __be32 key)
167 {
168         unsigned h0 = HASH(remote);
169         unsigned h1 = HASH(key);
170         struct ip_tunnel *t;
171         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
172
173         for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
174                 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
175                         if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
176                                 return t;
177                 }
178         }
179         for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
180                 if (remote == t->parms.iph.daddr) {
181                         if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
182                                 return t;
183                 }
184         }
185         for (t = ign->tunnels_l[h1]; t; t = t->next) {
186                 if (local == t->parms.iph.saddr ||
187                      (local == t->parms.iph.daddr &&
188                       ipv4_is_multicast(local))) {
189                         if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
190                                 return t;
191                 }
192         }
193         for (t = ign->tunnels_wc[h1]; t; t = t->next) {
194                 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
195                         return t;
196         }
197
198         if (ign->fb_tunnel_dev->flags&IFF_UP)
199                 return netdev_priv(ign->fb_tunnel_dev);
200         return NULL;
201 }
202
203 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
204                 struct ip_tunnel_parm *parms)
205 {
206         __be32 remote = parms->iph.daddr;
207         __be32 local = parms->iph.saddr;
208         __be32 key = parms->i_key;
209         unsigned h = HASH(key);
210         int prio = 0;
211
212         if (local)
213                 prio |= 1;
214         if (remote && !ipv4_is_multicast(remote)) {
215                 prio |= 2;
216                 h ^= HASH(remote);
217         }
218
219         return &ign->tunnels[prio][h];
220 }
221
222 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
223                 struct ip_tunnel *t)
224 {
225         return __ipgre_bucket(ign, &t->parms);
226 }
227
228 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
229 {
230         struct ip_tunnel **tp = ipgre_bucket(ign, t);
231
232         t->next = *tp;
233         write_lock_bh(&ipgre_lock);
234         *tp = t;
235         write_unlock_bh(&ipgre_lock);
236 }
237
238 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
239 {
240         struct ip_tunnel **tp;
241
242         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
243                 if (t == *tp) {
244                         write_lock_bh(&ipgre_lock);
245                         *tp = t->next;
246                         write_unlock_bh(&ipgre_lock);
247                         break;
248                 }
249         }
250 }
251
252 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
253                 struct ip_tunnel_parm *parms, int create)
254 {
255         __be32 remote = parms->iph.daddr;
256         __be32 local = parms->iph.saddr;
257         __be32 key = parms->i_key;
258         struct ip_tunnel *t, **tp, *nt;
259         struct net_device *dev;
260         char name[IFNAMSIZ];
261         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
262
263         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) {
264                 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
265                         if (key == t->parms.i_key)
266                                 return t;
267                 }
268         }
269         if (!create)
270                 return NULL;
271
272         if (parms->name[0])
273                 strlcpy(name, parms->name, IFNAMSIZ);
274         else
275                 sprintf(name, "gre%%d");
276
277         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
278         if (!dev)
279           return NULL;
280
281         dev_net_set(dev, net);
282
283         if (strchr(name, '%')) {
284                 if (dev_alloc_name(dev, name) < 0)
285                         goto failed_free;
286         }
287
288         dev->init = ipgre_tunnel_init;
289         nt = netdev_priv(dev);
290         nt->parms = *parms;
291
292         if (register_netdevice(dev) < 0)
293                 goto failed_free;
294
295         dev_hold(dev);
296         ipgre_tunnel_link(ign, nt);
297         return nt;
298
299 failed_free:
300         free_netdev(dev);
301         return NULL;
302 }
303
304 static void ipgre_tunnel_uninit(struct net_device *dev)
305 {
306         struct net *net = dev_net(dev);
307         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
308
309         ipgre_tunnel_unlink(ign, netdev_priv(dev));
310         dev_put(dev);
311 }
312
313
314 static void ipgre_err(struct sk_buff *skb, u32 info)
315 {
316
317 /* All the routers (except for Linux) return only
318    8 bytes of packet payload. It means, that precise relaying of
319    ICMP in the real Internet is absolutely infeasible.
320
321    Moreover, Cisco "wise men" put GRE key to the third word
322    in GRE header. It makes impossible maintaining even soft state for keyed
323    GRE tunnels with enabled checksum. Tell them "thank you".
324
325    Well, I wonder, rfc1812 was written by Cisco employee,
326    what the hell these idiots break standrads established
327    by themself???
328  */
329
330         struct iphdr *iph = (struct iphdr*)skb->data;
331         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
332         int grehlen = (iph->ihl<<2) + 4;
333         const int type = icmp_hdr(skb)->type;
334         const int code = icmp_hdr(skb)->code;
335         struct ip_tunnel *t;
336         __be16 flags;
337
338         flags = p[0];
339         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
340                 if (flags&(GRE_VERSION|GRE_ROUTING))
341                         return;
342                 if (flags&GRE_KEY) {
343                         grehlen += 4;
344                         if (flags&GRE_CSUM)
345                                 grehlen += 4;
346                 }
347         }
348
349         /* If only 8 bytes returned, keyed message will be dropped here */
350         if (skb_headlen(skb) < grehlen)
351                 return;
352
353         switch (type) {
354         default:
355         case ICMP_PARAMETERPROB:
356                 return;
357
358         case ICMP_DEST_UNREACH:
359                 switch (code) {
360                 case ICMP_SR_FAILED:
361                 case ICMP_PORT_UNREACH:
362                         /* Impossible event. */
363                         return;
364                 case ICMP_FRAG_NEEDED:
365                         /* Soft state for pmtu is maintained by IP core. */
366                         return;
367                 default:
368                         /* All others are translated to HOST_UNREACH.
369                            rfc2003 contains "deep thoughts" about NET_UNREACH,
370                            I believe they are just ether pollution. --ANK
371                          */
372                         break;
373                 }
374                 break;
375         case ICMP_TIME_EXCEEDED:
376                 if (code != ICMP_EXC_TTL)
377                         return;
378                 break;
379         }
380
381         read_lock(&ipgre_lock);
382         t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
383                         (flags&GRE_KEY) ?
384                         *(((__be32*)p) + (grehlen>>2) - 1) : 0);
385         if (t == NULL || t->parms.iph.daddr == 0 ||
386             ipv4_is_multicast(t->parms.iph.daddr))
387                 goto out;
388
389         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
390                 goto out;
391
392         if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
393                 t->err_count++;
394         else
395                 t->err_count = 1;
396         t->err_time = jiffies;
397 out:
398         read_unlock(&ipgre_lock);
399         return;
400 }
401
402 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
403 {
404         if (INET_ECN_is_ce(iph->tos)) {
405                 if (skb->protocol == htons(ETH_P_IP)) {
406                         IP_ECN_set_ce(ip_hdr(skb));
407                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
408                         IP6_ECN_set_ce(ipv6_hdr(skb));
409                 }
410         }
411 }
412
413 static inline u8
414 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
415 {
416         u8 inner = 0;
417         if (skb->protocol == htons(ETH_P_IP))
418                 inner = old_iph->tos;
419         else if (skb->protocol == htons(ETH_P_IPV6))
420                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
421         return INET_ECN_encapsulate(tos, inner);
422 }
423
424 static int ipgre_rcv(struct sk_buff *skb)
425 {
426         struct iphdr *iph;
427         u8     *h;
428         __be16    flags;
429         __sum16   csum = 0;
430         __be32 key = 0;
431         u32    seqno = 0;
432         struct ip_tunnel *tunnel;
433         int    offset = 4;
434
435         if (!pskb_may_pull(skb, 16))
436                 goto drop_nolock;
437
438         iph = ip_hdr(skb);
439         h = skb->data;
440         flags = *(__be16*)h;
441
442         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
443                 /* - Version must be 0.
444                    - We do not support routing headers.
445                  */
446                 if (flags&(GRE_VERSION|GRE_ROUTING))
447                         goto drop_nolock;
448
449                 if (flags&GRE_CSUM) {
450                         switch (skb->ip_summed) {
451                         case CHECKSUM_COMPLETE:
452                                 csum = csum_fold(skb->csum);
453                                 if (!csum)
454                                         break;
455                                 /* fall through */
456                         case CHECKSUM_NONE:
457                                 skb->csum = 0;
458                                 csum = __skb_checksum_complete(skb);
459                                 skb->ip_summed = CHECKSUM_COMPLETE;
460                         }
461                         offset += 4;
462                 }
463                 if (flags&GRE_KEY) {
464                         key = *(__be32*)(h + offset);
465                         offset += 4;
466                 }
467                 if (flags&GRE_SEQ) {
468                         seqno = ntohl(*(__be32*)(h + offset));
469                         offset += 4;
470                 }
471         }
472
473         read_lock(&ipgre_lock);
474         if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
475                                         iph->saddr, iph->daddr, key)) != NULL) {
476                 struct net_device_stats *stats = &tunnel->dev->stats;
477
478                 secpath_reset(skb);
479
480                 skb->protocol = *(__be16*)(h + 2);
481                 /* WCCP version 1 and 2 protocol decoding.
482                  * - Change protocol to IP
483                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
484                  */
485                 if (flags == 0 &&
486                     skb->protocol == htons(ETH_P_WCCP)) {
487                         skb->protocol = htons(ETH_P_IP);
488                         if ((*(h + offset) & 0xF0) != 0x40)
489                                 offset += 4;
490                 }
491
492                 skb->mac_header = skb->network_header;
493                 __pskb_pull(skb, offset);
494                 skb_reset_network_header(skb);
495                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
496                 skb->pkt_type = PACKET_HOST;
497 #ifdef CONFIG_NET_IPGRE_BROADCAST
498                 if (ipv4_is_multicast(iph->daddr)) {
499                         /* Looped back packet, drop it! */
500                         if (skb->rtable->fl.iif == 0)
501                                 goto drop;
502                         stats->multicast++;
503                         skb->pkt_type = PACKET_BROADCAST;
504                 }
505 #endif
506
507                 if (((flags&GRE_CSUM) && csum) ||
508                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
509                         stats->rx_crc_errors++;
510                         stats->rx_errors++;
511                         goto drop;
512                 }
513                 if (tunnel->parms.i_flags&GRE_SEQ) {
514                         if (!(flags&GRE_SEQ) ||
515                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
516                                 stats->rx_fifo_errors++;
517                                 stats->rx_errors++;
518                                 goto drop;
519                         }
520                         tunnel->i_seqno = seqno + 1;
521                 }
522                 stats->rx_packets++;
523                 stats->rx_bytes += skb->len;
524                 skb->dev = tunnel->dev;
525                 dst_release(skb->dst);
526                 skb->dst = NULL;
527                 nf_reset(skb);
528                 ipgre_ecn_decapsulate(iph, skb);
529                 netif_rx(skb);
530                 read_unlock(&ipgre_lock);
531                 return(0);
532         }
533         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
534
535 drop:
536         read_unlock(&ipgre_lock);
537 drop_nolock:
538         kfree_skb(skb);
539         return(0);
540 }
541
542 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
543 {
544         struct ip_tunnel *tunnel = netdev_priv(dev);
545         struct net_device_stats *stats = &tunnel->dev->stats;
546         struct iphdr  *old_iph = ip_hdr(skb);
547         struct iphdr  *tiph;
548         u8     tos;
549         __be16 df;
550         struct rtable *rt;                      /* Route to the other host */
551         struct net_device *tdev;                        /* Device to other host */
552         struct iphdr  *iph;                     /* Our new IP header */
553         unsigned int max_headroom;              /* The extra header space needed */
554         int    gre_hlen;
555         __be32 dst;
556         int    mtu;
557
558         if (tunnel->recursion++) {
559                 stats->collisions++;
560                 goto tx_error;
561         }
562
563         if (dev->header_ops) {
564                 gre_hlen = 0;
565                 tiph = (struct iphdr*)skb->data;
566         } else {
567                 gre_hlen = tunnel->hlen;
568                 tiph = &tunnel->parms.iph;
569         }
570
571         if ((dst = tiph->daddr) == 0) {
572                 /* NBMA tunnel */
573
574                 if (skb->dst == NULL) {
575                         stats->tx_fifo_errors++;
576                         goto tx_error;
577                 }
578
579                 if (skb->protocol == htons(ETH_P_IP)) {
580                         rt = skb->rtable;
581                         if ((dst = rt->rt_gateway) == 0)
582                                 goto tx_error_icmp;
583                 }
584 #ifdef CONFIG_IPV6
585                 else if (skb->protocol == htons(ETH_P_IPV6)) {
586                         struct in6_addr *addr6;
587                         int addr_type;
588                         struct neighbour *neigh = skb->dst->neighbour;
589
590                         if (neigh == NULL)
591                                 goto tx_error;
592
593                         addr6 = (struct in6_addr*)&neigh->primary_key;
594                         addr_type = ipv6_addr_type(addr6);
595
596                         if (addr_type == IPV6_ADDR_ANY) {
597                                 addr6 = &ipv6_hdr(skb)->daddr;
598                                 addr_type = ipv6_addr_type(addr6);
599                         }
600
601                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
602                                 goto tx_error_icmp;
603
604                         dst = addr6->s6_addr32[3];
605                 }
606 #endif
607                 else
608                         goto tx_error;
609         }
610
611         tos = tiph->tos;
612         if (tos&1) {
613                 if (skb->protocol == htons(ETH_P_IP))
614                         tos = old_iph->tos;
615                 tos &= ~1;
616         }
617
618         {
619                 struct flowi fl = { .oif = tunnel->parms.link,
620                                     .nl_u = { .ip4_u =
621                                               { .daddr = dst,
622                                                 .saddr = tiph->saddr,
623                                                 .tos = RT_TOS(tos) } },
624                                     .proto = IPPROTO_GRE };
625                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
626                         stats->tx_carrier_errors++;
627                         goto tx_error;
628                 }
629         }
630         tdev = rt->u.dst.dev;
631
632         if (tdev == dev) {
633                 ip_rt_put(rt);
634                 stats->collisions++;
635                 goto tx_error;
636         }
637
638         df = tiph->frag_off;
639         if (df)
640                 mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
641         else
642                 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
643
644         if (skb->dst)
645                 skb->dst->ops->update_pmtu(skb->dst, mtu);
646
647         if (skb->protocol == htons(ETH_P_IP)) {
648                 df |= (old_iph->frag_off&htons(IP_DF));
649
650                 if ((old_iph->frag_off&htons(IP_DF)) &&
651                     mtu < ntohs(old_iph->tot_len)) {
652                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
653                         ip_rt_put(rt);
654                         goto tx_error;
655                 }
656         }
657 #ifdef CONFIG_IPV6
658         else if (skb->protocol == htons(ETH_P_IPV6)) {
659                 struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
660
661                 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
662                         if ((tunnel->parms.iph.daddr &&
663                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
664                             rt6->rt6i_dst.plen == 128) {
665                                 rt6->rt6i_flags |= RTF_MODIFIED;
666                                 skb->dst->metrics[RTAX_MTU-1] = mtu;
667                         }
668                 }
669
670                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
671                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
672                         ip_rt_put(rt);
673                         goto tx_error;
674                 }
675         }
676 #endif
677
678         if (tunnel->err_count > 0) {
679                 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
680                         tunnel->err_count--;
681
682                         dst_link_failure(skb);
683                 } else
684                         tunnel->err_count = 0;
685         }
686
687         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
688
689         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
690             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
691                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
692                 if (!new_skb) {
693                         ip_rt_put(rt);
694                         stats->tx_dropped++;
695                         dev_kfree_skb(skb);
696                         tunnel->recursion--;
697                         return 0;
698                 }
699                 if (skb->sk)
700                         skb_set_owner_w(new_skb, skb->sk);
701                 dev_kfree_skb(skb);
702                 skb = new_skb;
703                 old_iph = ip_hdr(skb);
704         }
705
706         skb->transport_header = skb->network_header;
707         skb_push(skb, gre_hlen);
708         skb_reset_network_header(skb);
709         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
710         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
711                               IPSKB_REROUTED);
712         dst_release(skb->dst);
713         skb->dst = &rt->u.dst;
714
715         /*
716          *      Push down and install the IPIP header.
717          */
718
719         iph                     =       ip_hdr(skb);
720         iph->version            =       4;
721         iph->ihl                =       sizeof(struct iphdr) >> 2;
722         iph->frag_off           =       df;
723         iph->protocol           =       IPPROTO_GRE;
724         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
725         iph->daddr              =       rt->rt_dst;
726         iph->saddr              =       rt->rt_src;
727
728         if ((iph->ttl = tiph->ttl) == 0) {
729                 if (skb->protocol == htons(ETH_P_IP))
730                         iph->ttl = old_iph->ttl;
731 #ifdef CONFIG_IPV6
732                 else if (skb->protocol == htons(ETH_P_IPV6))
733                         iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
734 #endif
735                 else
736                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
737         }
738
739         ((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
740         ((__be16*)(iph+1))[1] = skb->protocol;
741
742         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
743                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
744
745                 if (tunnel->parms.o_flags&GRE_SEQ) {
746                         ++tunnel->o_seqno;
747                         *ptr = htonl(tunnel->o_seqno);
748                         ptr--;
749                 }
750                 if (tunnel->parms.o_flags&GRE_KEY) {
751                         *ptr = tunnel->parms.o_key;
752                         ptr--;
753                 }
754                 if (tunnel->parms.o_flags&GRE_CSUM) {
755                         *ptr = 0;
756                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
757                 }
758         }
759
760         nf_reset(skb);
761
762         IPTUNNEL_XMIT();
763         tunnel->recursion--;
764         return 0;
765
766 tx_error_icmp:
767         dst_link_failure(skb);
768
769 tx_error:
770         stats->tx_errors++;
771         dev_kfree_skb(skb);
772         tunnel->recursion--;
773         return 0;
774 }
775
776 static void ipgre_tunnel_bind_dev(struct net_device *dev)
777 {
778         struct net_device *tdev = NULL;
779         struct ip_tunnel *tunnel;
780         struct iphdr *iph;
781         int hlen = LL_MAX_HEADER;
782         int mtu = ETH_DATA_LEN;
783         int addend = sizeof(struct iphdr) + 4;
784
785         tunnel = netdev_priv(dev);
786         iph = &tunnel->parms.iph;
787
788         /* Guess output device to choose reasonable mtu and hard_header_len */
789
790         if (iph->daddr) {
791                 struct flowi fl = { .oif = tunnel->parms.link,
792                                     .nl_u = { .ip4_u =
793                                               { .daddr = iph->daddr,
794                                                 .saddr = iph->saddr,
795                                                 .tos = RT_TOS(iph->tos) } },
796                                     .proto = IPPROTO_GRE };
797                 struct rtable *rt;
798                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
799                         tdev = rt->u.dst.dev;
800                         ip_rt_put(rt);
801                 }
802                 dev->flags |= IFF_POINTOPOINT;
803         }
804
805         if (!tdev && tunnel->parms.link)
806                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
807
808         if (tdev) {
809                 hlen = tdev->hard_header_len;
810                 mtu = tdev->mtu;
811         }
812         dev->iflink = tunnel->parms.link;
813
814         /* Precalculate GRE options length */
815         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
816                 if (tunnel->parms.o_flags&GRE_CSUM)
817                         addend += 4;
818                 if (tunnel->parms.o_flags&GRE_KEY)
819                         addend += 4;
820                 if (tunnel->parms.o_flags&GRE_SEQ)
821                         addend += 4;
822         }
823         dev->hard_header_len = hlen + addend;
824         dev->mtu = mtu - addend;
825         tunnel->hlen = addend;
826
827 }
828
829 static int
830 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
831 {
832         int err = 0;
833         struct ip_tunnel_parm p;
834         struct ip_tunnel *t;
835         struct net *net = dev_net(dev);
836         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
837
838         switch (cmd) {
839         case SIOCGETTUNNEL:
840                 t = NULL;
841                 if (dev == ign->fb_tunnel_dev) {
842                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
843                                 err = -EFAULT;
844                                 break;
845                         }
846                         t = ipgre_tunnel_locate(net, &p, 0);
847                 }
848                 if (t == NULL)
849                         t = netdev_priv(dev);
850                 memcpy(&p, &t->parms, sizeof(p));
851                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
852                         err = -EFAULT;
853                 break;
854
855         case SIOCADDTUNNEL:
856         case SIOCCHGTUNNEL:
857                 err = -EPERM;
858                 if (!capable(CAP_NET_ADMIN))
859                         goto done;
860
861                 err = -EFAULT;
862                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
863                         goto done;
864
865                 err = -EINVAL;
866                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
867                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
868                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
869                         goto done;
870                 if (p.iph.ttl)
871                         p.iph.frag_off |= htons(IP_DF);
872
873                 if (!(p.i_flags&GRE_KEY))
874                         p.i_key = 0;
875                 if (!(p.o_flags&GRE_KEY))
876                         p.o_key = 0;
877
878                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
879
880                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
881                         if (t != NULL) {
882                                 if (t->dev != dev) {
883                                         err = -EEXIST;
884                                         break;
885                                 }
886                         } else {
887                                 unsigned nflags=0;
888
889                                 t = netdev_priv(dev);
890
891                                 if (ipv4_is_multicast(p.iph.daddr))
892                                         nflags = IFF_BROADCAST;
893                                 else if (p.iph.daddr)
894                                         nflags = IFF_POINTOPOINT;
895
896                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
897                                         err = -EINVAL;
898                                         break;
899                                 }
900                                 ipgre_tunnel_unlink(ign, t);
901                                 t->parms.iph.saddr = p.iph.saddr;
902                                 t->parms.iph.daddr = p.iph.daddr;
903                                 t->parms.i_key = p.i_key;
904                                 t->parms.o_key = p.o_key;
905                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
906                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
907                                 ipgre_tunnel_link(ign, t);
908                                 netdev_state_change(dev);
909                         }
910                 }
911
912                 if (t) {
913                         err = 0;
914                         if (cmd == SIOCCHGTUNNEL) {
915                                 t->parms.iph.ttl = p.iph.ttl;
916                                 t->parms.iph.tos = p.iph.tos;
917                                 t->parms.iph.frag_off = p.iph.frag_off;
918                                 if (t->parms.link != p.link) {
919                                         t->parms.link = p.link;
920                                         ipgre_tunnel_bind_dev(dev);
921                                         netdev_state_change(dev);
922                                 }
923                         }
924                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
925                                 err = -EFAULT;
926                 } else
927                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
928                 break;
929
930         case SIOCDELTUNNEL:
931                 err = -EPERM;
932                 if (!capable(CAP_NET_ADMIN))
933                         goto done;
934
935                 if (dev == ign->fb_tunnel_dev) {
936                         err = -EFAULT;
937                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
938                                 goto done;
939                         err = -ENOENT;
940                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
941                                 goto done;
942                         err = -EPERM;
943                         if (t == netdev_priv(ign->fb_tunnel_dev))
944                                 goto done;
945                         dev = t->dev;
946                 }
947                 unregister_netdevice(dev);
948                 err = 0;
949                 break;
950
951         default:
952                 err = -EINVAL;
953         }
954
955 done:
956         return err;
957 }
958
959 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
960 {
961         struct ip_tunnel *tunnel = netdev_priv(dev);
962         if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
963                 return -EINVAL;
964         dev->mtu = new_mtu;
965         return 0;
966 }
967
968 /* Nice toy. Unfortunately, useless in real life :-)
969    It allows to construct virtual multiprotocol broadcast "LAN"
970    over the Internet, provided multicast routing is tuned.
971
972
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
975    I have an impression, that Cisco could make something similar,
976    but this feature is apparently missing in IOS<=11.2(8).
977
978    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
979    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
980
981    ping -t 255 224.66.66.66
982
983    If nobody answers, mbone does not work.
984
985    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
986    ip addr add 10.66.66.<somewhat>/24 dev Universe
987    ifconfig Universe up
988    ifconfig Universe add fe80::<Your_real_addr>/10
989    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
990    ftp 10.66.66.66
991    ...
992    ftp fec0:6666:6666::193.233.7.65
993    ...
994
995  */
996
997 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
998                         unsigned short type,
999                         const void *daddr, const void *saddr, unsigned len)
1000 {
1001         struct ip_tunnel *t = netdev_priv(dev);
1002         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1003         __be16 *p = (__be16*)(iph+1);
1004
1005         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1006         p[0]            = t->parms.o_flags;
1007         p[1]            = htons(type);
1008
1009         /*
1010          *      Set the source hardware address.
1011          */
1012
1013         if (saddr)
1014                 memcpy(&iph->saddr, saddr, 4);
1015
1016         if (daddr) {
1017                 memcpy(&iph->daddr, daddr, 4);
1018                 return t->hlen;
1019         }
1020         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1021                 return t->hlen;
1022
1023         return -t->hlen;
1024 }
1025
1026 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1027 {
1028         struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
1029         memcpy(haddr, &iph->saddr, 4);
1030         return 4;
1031 }
1032
1033 static const struct header_ops ipgre_header_ops = {
1034         .create = ipgre_header,
1035         .parse  = ipgre_header_parse,
1036 };
1037
1038 #ifdef CONFIG_NET_IPGRE_BROADCAST
1039 static int ipgre_open(struct net_device *dev)
1040 {
1041         struct ip_tunnel *t = netdev_priv(dev);
1042
1043         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1044                 struct flowi fl = { .oif = t->parms.link,
1045                                     .nl_u = { .ip4_u =
1046                                               { .daddr = t->parms.iph.daddr,
1047                                                 .saddr = t->parms.iph.saddr,
1048                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1049                                     .proto = IPPROTO_GRE };
1050                 struct rtable *rt;
1051                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1052                         return -EADDRNOTAVAIL;
1053                 dev = rt->u.dst.dev;
1054                 ip_rt_put(rt);
1055                 if (__in_dev_get_rtnl(dev) == NULL)
1056                         return -EADDRNOTAVAIL;
1057                 t->mlink = dev->ifindex;
1058                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1059         }
1060         return 0;
1061 }
1062
1063 static int ipgre_close(struct net_device *dev)
1064 {
1065         struct ip_tunnel *t = netdev_priv(dev);
1066         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1067                 struct in_device *in_dev;
1068                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1069                 if (in_dev) {
1070                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1071                         in_dev_put(in_dev);
1072                 }
1073         }
1074         return 0;
1075 }
1076
1077 #endif
1078
1079 static void ipgre_tunnel_setup(struct net_device *dev)
1080 {
1081         dev->uninit             = ipgre_tunnel_uninit;
1082         dev->destructor         = free_netdev;
1083         dev->hard_start_xmit    = ipgre_tunnel_xmit;
1084         dev->do_ioctl           = ipgre_tunnel_ioctl;
1085         dev->change_mtu         = ipgre_tunnel_change_mtu;
1086
1087         dev->type               = ARPHRD_IPGRE;
1088         dev->hard_header_len    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1089         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1090         dev->flags              = IFF_NOARP;
1091         dev->iflink             = 0;
1092         dev->addr_len           = 4;
1093         dev->features           |= NETIF_F_NETNS_LOCAL;
1094 }
1095
1096 static int ipgre_tunnel_init(struct net_device *dev)
1097 {
1098         struct ip_tunnel *tunnel;
1099         struct iphdr *iph;
1100
1101         tunnel = netdev_priv(dev);
1102         iph = &tunnel->parms.iph;
1103
1104         tunnel->dev = dev;
1105         strcpy(tunnel->parms.name, dev->name);
1106
1107         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1108         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1109
1110         ipgre_tunnel_bind_dev(dev);
1111
1112         if (iph->daddr) {
1113 #ifdef CONFIG_NET_IPGRE_BROADCAST
1114                 if (ipv4_is_multicast(iph->daddr)) {
1115                         if (!iph->saddr)
1116                                 return -EINVAL;
1117                         dev->flags = IFF_BROADCAST;
1118                         dev->header_ops = &ipgre_header_ops;
1119                         dev->open = ipgre_open;
1120                         dev->stop = ipgre_close;
1121                 }
1122 #endif
1123         } else
1124                 dev->header_ops = &ipgre_header_ops;
1125
1126         return 0;
1127 }
1128
1129 static int ipgre_fb_tunnel_init(struct net_device *dev)
1130 {
1131         struct ip_tunnel *tunnel = netdev_priv(dev);
1132         struct iphdr *iph = &tunnel->parms.iph;
1133         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1134
1135         tunnel->dev = dev;
1136         strcpy(tunnel->parms.name, dev->name);
1137
1138         iph->version            = 4;
1139         iph->protocol           = IPPROTO_GRE;
1140         iph->ihl                = 5;
1141         tunnel->hlen            = sizeof(struct iphdr) + 4;
1142
1143         dev_hold(dev);
1144         ign->tunnels_wc[0]      = tunnel;
1145         return 0;
1146 }
1147
1148
1149 static struct net_protocol ipgre_protocol = {
1150         .handler        =       ipgre_rcv,
1151         .err_handler    =       ipgre_err,
1152         .netns_ok       =       1,
1153 };
1154
1155 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1156 {
1157         int prio;
1158
1159         for (prio = 0; prio < 4; prio++) {
1160                 int h;
1161                 for (h = 0; h < HASH_SIZE; h++) {
1162                         struct ip_tunnel *t;
1163                         while ((t = ign->tunnels[prio][h]) != NULL)
1164                                 unregister_netdevice(t->dev);
1165                 }
1166         }
1167 }
1168
1169 static int ipgre_init_net(struct net *net)
1170 {
1171         int err;
1172         struct ipgre_net *ign;
1173
1174         err = -ENOMEM;
1175         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1176         if (ign == NULL)
1177                 goto err_alloc;
1178
1179         err = net_assign_generic(net, ipgre_net_id, ign);
1180         if (err < 0)
1181                 goto err_assign;
1182
1183         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1184                                            ipgre_tunnel_setup);
1185         if (!ign->fb_tunnel_dev) {
1186                 err = -ENOMEM;
1187                 goto err_alloc_dev;
1188         }
1189
1190         ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1191         dev_net_set(ign->fb_tunnel_dev, net);
1192
1193         if ((err = register_netdev(ign->fb_tunnel_dev)))
1194                 goto err_reg_dev;
1195
1196         return 0;
1197
1198 err_reg_dev:
1199         free_netdev(ign->fb_tunnel_dev);
1200 err_alloc_dev:
1201         /* nothing */
1202 err_assign:
1203         kfree(ign);
1204 err_alloc:
1205         return err;
1206 }
1207
1208 static void ipgre_exit_net(struct net *net)
1209 {
1210         struct ipgre_net *ign;
1211
1212         ign = net_generic(net, ipgre_net_id);
1213         rtnl_lock();
1214         ipgre_destroy_tunnels(ign);
1215         rtnl_unlock();
1216         kfree(ign);
1217 }
1218
1219 static struct pernet_operations ipgre_net_ops = {
1220         .init = ipgre_init_net,
1221         .exit = ipgre_exit_net,
1222 };
1223
1224 /*
1225  *      And now the modules code and kernel interface.
1226  */
1227
1228 static int __init ipgre_init(void)
1229 {
1230         int err;
1231
1232         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1233
1234         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1235                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1236                 return -EAGAIN;
1237         }
1238
1239         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1240         if (err < 0)
1241                 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1242
1243         return err;
1244 }
1245
1246 static void __exit ipgre_fini(void)
1247 {
1248         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1249                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1250
1251         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1252 }
1253
1254 module_init(ipgre_init);
1255 module_exit(ipgre_fini);
1256 MODULE_LICENSE("GPL");