 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
#include <net/protocol.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local TTL)
   and silently drop the packet when it expires. It is the best
   solution, but it requires maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: the t->recursion lock breaks dead loops. It looks
   like the dev->tbusy flag, but I preferred a new variable, because
   the semantics are different. One day, when hard_start_xmit
   becomes multithreaded, we will have to use skb->encapsulation.
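
   A rough sketch of that guard as it is used in ipgre_tunnel_xmit()
   below (error handling abbreviated):

	if (tunnel->recursion++) {
		tunnel->stat.collisions++;
		goto tx_error;		/* drop instead of looping */
	}
	/* ... build and transmit the encapsulated packet ... */
	tunnel->recursion--;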
   2. Networking dead loops would not kill routers, but they would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to the
   outer header. It is a very good solution, but it introduces two
   problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output would
     even be more informative. This idea turned out to be wrong: only
     Linux complies with RFC 1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least in my
     neighbourhood) return only 8 bytes of payload. That is the end.

   Hence, if we want OSPF to work or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect an inner
   encapsulation made by our node. This is difficult or even impossible,
   especially taking fragmentation into account. In short, it is not a
   solution at all.
   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   and that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the PMTU are pruned) and the tunnel MTU
   rapidly degrades to a value < 68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when the encapsulating packets have
   DF set. But it is not our problem! Nobody could accuse us; we did
   all that we could. Even if it was your gated that injected the
   fatal route into the network, even if it was you who configured the
   fatal static route: you are innocent. :-)
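
   A minimal illustration of that rule, as enforced in the tunnel ioctl
   below (the surrounding TTL check is paraphrased from elided context,
   so treat this as a sketch):

	if (p.iph.ttl)
		p.iph.frag_off |= htons(IP_DF);	/* preconfigured hop limit => force DF */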
   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not obvious how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctl, etc.)
   into a separate module (ip_tunnel.c).
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

static int ipgre_fb_tunnel_init(struct net_device *dev);

static struct net_device *ipgre_fb_tunnel_dev;

/* Tunnel hash table */
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

static struct ip_tunnel *tunnels[4][HASH_SIZE];

#define tunnels_r_l	(tunnels[3])
#define tunnels_r	(tunnels[2])
#define tunnels_l	(tunnels[1])
#define tunnels_wc	(tunnels[0])
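
/*
 * A worked example of the layout above, derived from the lookup code
 * below: a tunnel configured with remote R, local L and input key K is
 * linked into tunnels_r_l[HASH(R)^HASH(K)]; with only a remote, into
 * tunnels_r[HASH(R)^HASH(K)]; with only a local, into tunnels_l[HASH(K)];
 * with neither, into tunnels_wc[HASH(K)]. HASH() folds a value into one
 * of 16 buckets per class.
 */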
static DEFINE_RWLOCK(ipgre_lock);
/* Given src, dst and key, find the appropriate tunnel for input. */

static struct ip_tunnel * ipgre_tunnel_lookup(__be32 remote, __be32 local, __be32 key)
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);

	for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))

	for (t = tunnels_r[h0^h1]; t; t = t->next) {
		if (remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))

	for (t = tunnels_l[h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr ||
		    (local == t->parms.iph.daddr &&
		     ipv4_is_multicast(local))) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))

	for (t = tunnels_wc[h1]; t; t = t->next) {
		if (t->parms.i_key == key && (t->dev->flags&IFF_UP))

	if (ipgre_fb_tunnel_dev->flags&IFF_UP)
		return netdev_priv(ipgre_fb_tunnel_dev);
static struct ip_tunnel **__ipgre_bucket(struct ip_tunnel_parm *parms)
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned h = HASH(key);

	if (remote && !ipv4_is_multicast(remote)) {

	return &tunnels[prio][h];
static inline struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
	return __ipgre_bucket(&t->parms);

static void ipgre_tunnel_link(struct ip_tunnel *t)
	struct ip_tunnel **tp = ipgre_bucket(t);

	write_lock_bh(&ipgre_lock);
	write_unlock_bh(&ipgre_lock);

static void ipgre_tunnel_unlink(struct ip_tunnel *t)
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
			write_lock_bh(&ipgre_lock);
			write_unlock_bh(&ipgre_lock);
static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	struct ip_tunnel *t, **tp, *nt;
	struct net_device *dev;

	for (tp = __ipgre_bucket(parms); (t = *tp) != NULL; tp = &t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (key == t->parms.i_key)

		strlcpy(name, parms->name, IFNAMSIZ);
		sprintf(name, "gre%%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);

	dev->init = ipgre_tunnel_init;
	nt = netdev_priv(dev);

	if (register_netdevice(dev) < 0) {

	ipgre_tunnel_link(nt);
static void ipgre_tunnel_uninit(struct net_device *dev)
	ipgre_tunnel_unlink(netdev_priv(dev));
static void ipgre_err(struct sk_buff *skb, u32 info)
#ifndef I_WISH_WORLD_WERE_PERFECT

/* It is not :-( All the routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key in the third word
   of the GRE header. That makes it impossible to maintain even
   soft state for keyed GRE tunnels with checksums enabled.
   Tell them "thank you".

   Well, I wonder: RFC 1812 was written by a Cisco employee,
   so why the hell do these idiots break the standards established
   by themselves???
	struct iphdr *iph = (struct iphdr*)skb->data;
	__be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))

	/* If only 8 bytes are returned, keyed messages will be dropped here */
	if (skb_headlen(skb) < grehlen)
	case ICMP_PARAMETERPROB:

	case ICMP_DEST_UNREACH:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
		case ICMP_FRAG_NEEDED:
			/* Soft state for PMTU is maintained by the IP core. */

			/* All others are translated to HOST_UNREACH.
			   RFC 2003 contains "deep thoughts" about NET_UNREACH;
			   I believe they are just ether pollution. --ANK
			 */
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
	read_lock(&ipgre_lock);
	t = ipgre_tunnel_lookup(iph->daddr, iph->saddr,
				(flags&GRE_KEY) ? *(((__be32*)p) + (grehlen>>2) - 1) : 0);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)

	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)

	t->err_time = jiffies;

	read_unlock(&ipgre_lock);
	struct iphdr *iph = (struct iphdr*)dp;
	__be16 *p = (__be16*)(dp+(iph->ihl<<2));
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	int grehlen = (iph->ihl<<2) + 4;
	struct sk_buff *skb2;
	if (p[1] != htons(ETH_P_IP))

	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))

	if (len < grehlen + sizeof(struct iphdr))
	eiph = (struct iphdr*)(dp + grehlen);
	case ICMP_PARAMETERPROB:
		n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
		if (n < (iph->ihl<<2))

		/* So... This guy found something strange INSIDE the
		   encapsulated packet. Well, he is a fool, but what can
		   we do? */
		rel_type = ICMP_PARAMETERPROB;
		rel_info = htonl(n << 24);

	case ICMP_DEST_UNREACH:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
		case ICMP_FRAG_NEEDED:
			/* And it is the only really necessary thing :-) */
			n = ntohs(icmp_hdr(skb)->un.frag.mtu);
			/* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
			if (n > ntohs(eiph->tot_len))

			/* All others are translated to HOST_UNREACH.
			   RFC 2003 contains "deep thoughts" about NET_UNREACH;
			   I believe it is just ether pollution. --ANK
			 */
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;

	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
	/* Prepare fake skb to feed it to icmp_send */
	skb2 = skb_clone(skb, GFP_ATOMIC);
	dst_release(skb2->dst);
	skb_pull(skb2, skb->data - (u8*)eiph);
	skb_reset_network_header(skb2);

	/* Try to guess incoming interface */
	memset(&fl, 0, sizeof(fl));
	fl.fl4_dst = eiph->saddr;
	fl.fl4_tos = RT_TOS(eiph->tos);
	fl.proto = IPPROTO_GRE;
	if (ip_route_output_key(&init_net, &rt, &fl)) {
	skb2->dev = rt->u.dst.dev;

	/* route "incoming" packet */
	if (rt->rt_flags&RTCF_LOCAL) {
		fl.fl4_dst = eiph->daddr;
		fl.fl4_src = eiph->saddr;
		fl.fl4_tos = eiph->tos;
		if (ip_route_output_key(&init_net, &rt, &fl) ||
		    rt->u.dst.dev->type != ARPHRD_IPGRE) {
		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
		    skb2->dst->dev->type != ARPHRD_IPGRE) {

	/* change mtu on this route */
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		if (n > dst_mtu(skb2->dst)) {
		skb2->dst->ops->update_pmtu(skb2->dst, n);
	} else if (type == ICMP_TIME_EXCEEDED) {
		struct ip_tunnel *t = netdev_priv(skb2->dev);
		if (t->parms.iph.ttl) {
			rel_type = ICMP_DEST_UNREACH;
			rel_code = ICMP_HOST_UNREACH;

	icmp_send(skb2, rel_type, rel_code, rel_info);
static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));
ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
static int ipgre_rcv(struct sk_buff *skb)
	struct ip_tunnel *tunnel;

	if (!pskb_may_pull(skb, 16))

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;

			key = *(__be32*)(h + offset);

			seqno = ntohl(*(__be32*)(h + offset));
	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
		skb->protocol = *(__be16*)(h + 2);

		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in
		 *   the GRE header
		 */
		    skb->protocol == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_reset_network_header(skb);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (((struct rtable*)skb->dst)->fl.iif == 0)
			tunnel->stat.multicast++;
			skb->pkt_type = PACKET_BROADCAST;

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->stat.rx_crc_errors++;
			tunnel->stat.rx_errors++;

		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->stat.rx_fifo_errors++;
				tunnel->stat.rx_errors++;
			tunnel->i_seqno = seqno + 1;
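			/*
			 * Note: the (s32) cast in the check above gives
			 * serial-number comparison, so an in-order stream
			 * survives 32-bit wraparound of the counter.
			 */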
		tunnel->stat.rx_packets++;
		tunnel->stat.rx_bytes += skb->len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		ipgre_ecn_decapsulate(iph, skb);

		read_unlock(&ipgre_lock);

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	read_unlock(&ipgre_lock);
static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->stat;
	struct iphdr *old_iph = ip_hdr(skb);
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	if (tunnel->recursion++) {
		tunnel->stat.collisions++;

	if (dev->header_ops) {
		tiph = (struct iphdr*)skb->data;
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;

	if ((dst = tiph->daddr) == 0) {
		if (skb->dst == NULL) {
			tunnel->stat.tx_fifo_errors++;

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = (struct rtable*)skb->dst;
			if ((dst = rt->rt_gateway) == 0)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			struct neighbour *neigh = skb->dst->neighbour;

			addr6 = (struct in6_addr*)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)

			dst = addr6->s6_addr32[3];
	if (skb->protocol == htons(ETH_P_IP))

		struct flowi fl = { .oif = tunnel->parms.link,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };

		if (ip_route_output_key(&init_net, &rt, &fl)) {
			tunnel->stat.tx_carrier_errors++;

	tdev = rt->u.dst.dev;

		tunnel->stat.collisions++;

	mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
	mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;

	skb->dst->ops->update_pmtu(skb->dst, mtu);
	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->metrics[RTAX_MTU-1] = mtu;

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
	if (tunnel->err_count > 0) {
		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
			dst_link_failure(skb);
			tunnel->err_count = 0;

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
			skb_set_owner_w(new_skb, skb->sk);
		old_iph = ip_hdr(skb);

	skb->transport_header = skb->network_header;
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;
	/*
	 *	Push down and install the IP header.
	 */

	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->protocol = IPPROTO_GRE;
	iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
	((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
	((__be16*)(iph+1))[1] = skb->protocol;
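
	/*
	 * Wire format being assembled here (RFC 2890-style GRE): the four
	 * fixed bytes after the outer IP header carry flags (2) and the
	 * protocol type (2); the optional fields then follow in the order
	 * checksum+reserved (4), key (4), sequence number (4). The block
	 * below fills them in backwards, from the end of the header.
	 */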
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			*ptr = htonl(tunnel->o_seqno);
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1),
							 skb->len - sizeof(struct iphdr));

	dst_link_failure(skb);
static void ipgre_tunnel_bind_dev(struct net_device *dev)
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;
	/* Guess output device to choose reasonable mtu and hard_header_len */

		struct flowi fl = { .oif = tunnel->parms.link,
				    { .daddr = iph->daddr,
				      .tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };

		if (!ip_route_output_key(&init_net, &rt, &fl)) {
			tdev = rt->u.dst.dev;
		dev->flags |= IFF_POINTOPOINT;

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(&init_net, tunnel->parms.link);

		hlen = tdev->hard_header_len;

	dev->iflink = tunnel->parms.link;
	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
		if (tunnel->parms.o_flags&GRE_KEY)
		if (tunnel->parms.o_flags&GRE_SEQ)

	dev->hard_header_len = hlen + addend;
	dev->mtu = mtu - addend;
	tunnel->hlen = addend;
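
	/*
	 * Worked example, assuming each enabled option above contributes
	 * 4 bytes: with key and checksum on, addend = 20 (outer IP) +
	 * 4 (GRE base) + 4 (csum) + 4 (key) = 32, so dev->mtu sits 32
	 * bytes below the underlying device's MTU.
	 */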
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
	struct ip_tunnel_parm p;

		if (dev == ipgre_fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
			t = ipgre_tunnel_locate(&p, 0);
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		if (!capable(CAP_NET_ADMIN))

		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))

		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))

			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
		if (!(p.o_flags&GRE_KEY))

		t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
		if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {

				ipgre_tunnel_unlink(t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(t);
				netdev_state_change(dev);
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);

			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))

			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		if (!capable(CAP_NET_ADMIN))

		if (dev == ipgre_fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
			if (t == netdev_priv(ipgre_fb_tunnel_dev))

		unregister_netdevice(dev);
static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
	return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
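	/*
	 * Bounds (an interpretation, not from the original comments):
	 * 68 is the minimum MTU an IPv4 link must support (RFC 791), and
	 * 0xFFF8 is 64K rounded down to the 8-byte fragment-offset unit,
	 * keeping mtu + tunnel->hlen inside the 16-bit IP total length.
	 */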
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96

   ftp fec0:6666:6666::193.233.7.65
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned len)
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0] = t->parms.o_flags;

	/*
	 *	Set the source hardware address.
	 */

		memcpy(&iph->saddr, saddr, 4);

		memcpy(&iph->daddr, daddr, 4);

	if (iph->daddr && !ipv4_is_multicast(iph->daddr))
static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
	struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    { .daddr = t->parms.iph.daddr,
				      .saddr = t->parms.iph.saddr,
				      .tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };

		if (ip_route_output_key(&init_net, &rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->u.dst.dev;
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
static int ipgre_close(struct net_device *dev)
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev->nd_net, t->mlink);
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
static void ipgre_tunnel_setup(struct net_device *dev)
	dev->uninit		= ipgre_tunnel_uninit;
	dev->destructor		= free_netdev;
	dev->hard_start_xmit	= ipgre_tunnel_xmit;
	dev->get_stats		= ipgre_tunnel_get_stats;
	dev->do_ioctl		= ipgre_tunnel_ioctl;
	dev->change_mtu		= ipgre_tunnel_change_mtu;

	dev->type		= ARPHRD_IPGRE;
	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
static int ipgre_tunnel_init(struct net_device *dev)
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	strcpy(tunnel->parms.name, dev->name);
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	ipgre_tunnel_bind_dev(dev);

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		dev->flags = IFF_BROADCAST;
		dev->header_ops = &ipgre_header_ops;
		dev->open = ipgre_open;
		dev->stop = ipgre_close;

		dev->header_ops = &ipgre_header_ops;
static int __init ipgre_fb_tunnel_init(struct net_device *dev)
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	strcpy(tunnel->parms.name, dev->name);
	iph->protocol = IPPROTO_GRE;
	tunnel->hlen = sizeof(struct iphdr) + 4;

	tunnels_wc[0] = tunnel;
static struct net_protocol ipgre_protocol = {
	.handler	= ipgre_rcv,
	.err_handler	= ipgre_err,
 *	And now the module code and kernel interface.
static int __init ipgre_init(void)
	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");

	ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ipgre_fb_tunnel_dev) {

	ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init;

	if ((err = register_netdev(ipgre_fb_tunnel_dev)))

	free_netdev(ipgre_fb_tunnel_dev);
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
static void __exit ipgre_destroy_tunnels(void)
	for (prio = 0; prio < 4; prio++) {
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			while ((t = tunnels[prio][h]) != NULL)
				unregister_netdevice(t->dev);
static void __exit ipgre_fini(void)
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");

	ipgre_destroy_tunnels();

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");