2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
36 #include <net/protocol.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
57 1. The most important issue is detecting local dead loops.
58 They would cause complete host lockup in transmit, which
59 would be "resolved" by stack overflow or, if queueing is enabled,
60 with infinite looping in net_bh.
62 We cannot track such dead loops during route installation,
63 it is infeasible task. The most general solutions would be
64 to keep skb->encapsulation counter (sort of local ttl),
65 and silently drop packet when it expires. It is the best
66 solution, but it supposes maintaining new variable in ALL
67 skb, even if no tunneling is used.
69 Current solution: t->recursion lock breaks dead loops. It looks
70 like dev->tbusy flag, but I preferred new variable, because
71 the semantics is different. One day, when hard_start_xmit
72 will be multithreaded we will have to use skb->encapsulation.
76 2. Networking dead loops would not kill routers, but would really
77 kill network. IP hop limit plays role of "t->recursion" in this case,
78 if we copy it from packet being encapsulated to upper header.
79 It is very good solution, but it introduces two problems:
81 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82 do not work over tunnels.
83 - traceroute does not work. I planned to relay ICMP from tunnel,
84 so that this problem would be solved and traceroute output
85 would even more informative. This idea appeared to be wrong:
86 only Linux complies to rfc1812 now (yes, guys, Linux is the only
87 true router now :-)), all routers (at least, in neighbourhood of mine)
88 return only 8 bytes of payload. It is the end.
90 Hence, if we want that OSPF worked or traceroute said something reasonable,
91 we should search for another solution.
93 One of them is to parse packet trying to detect inner encapsulation
94 made by our node. It is difficult or even impossible, especially,
95 taking into account fragmentation. To be short, it is not a solution at all.
97 Current solution: The solution was UNEXPECTEDLY SIMPLE.
98 We force DF flag on tunnels with preconfigured hop limit,
99 that is ALL. :-) Well, it does not remove the problem completely,
100 but exponential growth of network traffic is changed to linear
101 (branches, that exceed pmtu are pruned) and tunnel mtu
102 fastly degrades to value <68, where looping stops.
103 Yes, it is not good if there exists a router in the loop,
104 which does not force DF, even when encapsulating packets have DF set.
105 But it is not our problem! Nobody could accuse us, we made
106 all that we could make. Even if it is your gated who injected
107 fatal route to network, even if it were you who configured
108 fatal static route: you are innocent. :-)
112 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113 practically identical code. It would be good to glue them
114 together, but it is not very evident, how to make them modular.
115 sit is integral part of IPv6, ipip and gre are naturally modular.
116 We could extract common parts (hash table, ioctl etc)
117 to a separate module (ip_tunnel.c).
// Forward declarations for objects that are referenced before their definitions further down the file.
122 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
123 static int ipgre_tunnel_init(struct net_device *dev);
124 static void ipgre_tunnel_setup(struct net_device *dev);
125 static int ipgre_tunnel_bind_dev(struct net_device *dev);
127 /* Fallback tunnel: no source, no destination, no key, no options */
// Per-network-namespace state. ipgre_net_id is the net_generic() key used throughout the file to fetch
// struct ipgre_net, whose visible members are the four tunnel hash tables and the fallback device.
131 static int ipgre_net_id;
133 struct ip_tunnel *tunnels[4][HASH_SIZE];
135 struct net_device *fb_tunnel_dev;
138 /* Tunnel hash table */
148 We require exact key match i.e. if a key is present in packet
149 it will match only tunnel with the same key; if it is not present,
150 it will match only keyless tunnel.
152 All keyless packets, if not matched configured keyless tunnels
153 will match fallback tunnel.
// HASH folds a 32-bit address or key into a 4-bit bucket index (0..15).
156 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
// Aliases for the four tables, named after what ipgre_tunnel_lookup() compares in each:
// r_l = remote and local, r = remote only, l = local only, wc = wildcard (key only).
158 #define tunnels_r_l tunnels[3]
159 #define tunnels_r tunnels[2]
160 #define tunnels_l tunnels[1]
161 #define tunnels_wc tunnels[0]
// Guards the hash chains: taken for writing in link/unlink, for reading in the receive and error paths.
163 static DEFINE_RWLOCK(ipgre_lock);
165 /* Given src, dst and key, find appropriate for input tunnel. */
// Find the tunnel that should receive a GRE packet with the given outer addresses, key and GRE protocol.
// The tables are searched from most to least specific; in every pass a candidate must have
// i_key equal to key and its device must be IFF_UP. dev_type is ARPHRD_ETHER for ETH_P_TEB
// payloads and ARPHRD_IPGRE otherwise. t2 is presumably a remembered ARPHRD_IPGRE candidate used
// when no device of the exact type matches (the assignments and returns are not visible here).
// The last visible step falls back to the fallback device if it is up.
167 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
168 __be32 remote, __be32 local,
169 __be32 key, __be16 gre_proto)
171 unsigned h0 = HASH(remote);
172 unsigned h1 = HASH(key);
174 struct ip_tunnel *t2 = NULL;
175 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
176 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
177 ARPHRD_ETHER : ARPHRD_IPGRE;
// Pass 1: both endpoints configured and both must match.
179 for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
180 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
181 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
182 if (t->dev->type == dev_type)
184 if (t->dev->type == ARPHRD_IPGRE && !t2)
// Pass 2: only the remote endpoint must match.
190 for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
191 if (remote == t->parms.iph.daddr) {
192 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
193 if (t->dev->type == dev_type)
195 if (t->dev->type == ARPHRD_IPGRE && !t2)
// Pass 3: local endpoint matches, or local is a multicast group equal to the tunnel destination.
201 for (t = ign->tunnels_l[h1]; t; t = t->next) {
202 if (local == t->parms.iph.saddr ||
203 (local == t->parms.iph.daddr &&
204 ipv4_is_multicast(local))) {
205 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
206 if (t->dev->type == dev_type)
208 if (t->dev->type == ARPHRD_IPGRE && !t2)
// Pass 4: wildcard tunnels, matched on key only.
214 for (t = ign->tunnels_wc[h1]; t; t = t->next) {
215 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
216 if (t->dev->type == dev_type)
218 if (t->dev->type == ARPHRD_IPGRE && !t2)
226 if (ign->fb_tunnel_dev->flags&IFF_UP)
227 return netdev_priv(ign->fb_tunnel_dev);
// Return the address of the hash chain head that a tunnel with these parameters belongs to.
// h starts as HASH(key) and prio selects one of the four tables. The statements that derive prio
// from local/remote (and any extra mixing of h for a unicast remote) are not visible in this listing.
231 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
232 struct ip_tunnel_parm *parms)
234 __be32 remote = parms->iph.daddr;
235 __be32 local = parms->iph.saddr;
236 __be32 key = parms->i_key;
237 unsigned h = HASH(key);
242 if (remote && !ipv4_is_multicast(remote)) {
247 return &ign->tunnels[prio][h];
// Convenience wrapper: hash chain head for an existing tunnel, based on its stored parameters.
250 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
253 return __ipgre_bucket(ign, &t->parms);
// Insert tunnel t into the hash chain chosen by ipgre_bucket(). The chain update happens with
// ipgre_lock write-held and bottom halves disabled (the pointer update itself is not visible here).
256 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
258 struct ip_tunnel **tp = ipgre_bucket(ign, t);
261 write_lock_bh(&ipgre_lock);
263 write_unlock_bh(&ipgre_lock);
// Remove tunnel t from its hash chain. The chain is walked through a pointer-to-pointer starting at
// the bucket head; the removal is bracketed by the ipgre_lock write lock (details not visible here).
266 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
268 struct ip_tunnel **tp;
270 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
272 write_lock_bh(&ipgre_lock);
274 write_unlock_bh(&ipgre_lock);
// Exact-match search: walk the chain selected by __ipgre_bucket() for a tunnel whose source address,
// destination address, input key and device type all equal the request. The type parameter is
// declared on a line not visible here; the return statement is likewise not shown.
280 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
281 struct ip_tunnel_parm *parms,
284 __be32 remote = parms->iph.daddr;
285 __be32 local = parms->iph.saddr;
286 __be32 key = parms->i_key;
287 struct ip_tunnel *t, **tp;
288 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
290 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
291 if (local == t->parms.iph.saddr &&
292 remote == t->parms.iph.daddr &&
293 key == t->parms.i_key &&
294 type == t->dev->type)
// Look up a plain GRE (ARPHRD_IPGRE) tunnel with exactly these parameters and, presumably only when
// create is non-zero, build a new one: the device name comes from parms->name or the gre%d template,
// a template name is resolved with dev_alloc_name(), the MTU comes from ipgre_tunnel_bind_dev(),
// and the device is registered before the tunnel is hashed. Failure paths are not visible here.
300 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
301 struct ip_tunnel_parm *parms, int create)
303 struct ip_tunnel *t, *nt;
304 struct net_device *dev;
306 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
308 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
313 strlcpy(name, parms->name, IFNAMSIZ);
// The doubled percent sign makes sprintf emit the literal template gre%d.
315 sprintf(name, "gre%%d");
317 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
321 dev_net_set(dev, net);
323 if (strchr(name, '%')) {
324 if (dev_alloc_name(dev, name) < 0)
328 nt = netdev_priv(dev);
330 dev->rtnl_link_ops = &ipgre_link_ops;
332 dev->mtu = ipgre_tunnel_bind_dev(dev);
334 if (register_netdevice(dev) < 0)
338 ipgre_tunnel_link(ign, nt);
// ndo_uninit for both op tables: remove the tunnel from the hash of the namespace the device lives in.
346 static void ipgre_tunnel_uninit(struct net_device *dev)
348 struct net *net = dev_net(dev);
349 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
351 ipgre_tunnel_unlink(ign, netdev_priv(dev));
// ICMP error handler for GRE (installed as ipgre_protocol.err_handler).
// skb->data is interpreted as an IPv4 header followed by a GRE header (presumably the packet echoed
// back inside the ICMP error); grehlen starts as that IP header length plus the 4-byte base GRE header.
// The tunnel is looked up with the addresses swapped (daddr as remote, saddr as local) under the
// ipgre_lock read lock. Tunnels without a unicast destination are skipped, as are TIME_EXCEEDED errors
// on tunnels that inherit ttl (ttl == 0). The visible tail compares the time since err_time against
// IPTUNNEL_ERR_TIMEO and then stamps err_time; the err_count update is not visible here.
356 static void ipgre_err(struct sk_buff *skb, u32 info)
359 /* All the routers (except for Linux) return only
360 8 bytes of packet payload. It means, that precise relaying of
361 ICMP in the real Internet is absolutely infeasible.
363 Moreover, Cisco "wise men" put GRE key to the third word
364 in GRE header. It makes impossible maintaining even soft state for keyed
365 GRE tunnels with enabled checksum. Tell them "thank you".
367 Well, I wonder, rfc1812 was written by Cisco employee,
368 what the hell these idiots break standrads established
372 struct iphdr *iph = (struct iphdr *)skb->data;
373 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
374 int grehlen = (iph->ihl<<2) + 4;
375 const int type = icmp_hdr(skb)->type;
376 const int code = icmp_hdr(skb)->code;
381 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
382 if (flags&(GRE_VERSION|GRE_ROUTING))
391 /* If only 8 bytes returned, keyed message will be dropped here */
392 if (skb_headlen(skb) < grehlen)
397 case ICMP_PARAMETERPROB:
400 case ICMP_DEST_UNREACH:
403 case ICMP_PORT_UNREACH:
404 /* Impossible event. */
406 case ICMP_FRAG_NEEDED:
407 /* Soft state for pmtu is maintained by IP core. */
410 /* All others are translated to HOST_UNREACH.
411 rfc2003 contains "deep thoughts" about NET_UNREACH,
412 I believe they are just ether pollution. --ANK
417 case ICMP_TIME_EXCEEDED:
418 if (code != ICMP_EXC_TTL)
423 read_lock(&ipgre_lock);
424 t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
426 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
428 if (t == NULL || t->parms.iph.daddr == 0 ||
429 ipv4_is_multicast(t->parms.iph.daddr))
432 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
435 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
439 t->err_time = jiffies;
441 read_unlock(&ipgre_lock);
// On receive: if the outer IPv4 header carries the ECN congestion-experienced mark, set it on the
// inner header as well, for IPv4 and IPv6 payloads. Other payload protocols are left untouched.
445 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
447 if (INET_ECN_is_ce(iph->tos)) {
448 if (skb->protocol == htons(ETH_P_IP)) {
449 IP_ECN_set_ce(ip_hdr(skb));
450 } else if (skb->protocol == htons(ETH_P_IPV6)) {
451 IP6_ECN_set_ce(ipv6_hdr(skb));
// On transmit: derive the outer tos byte. The inner value is the IPv4 tos or the IPv6 dsfield of the
// payload header, combined with the requested tos by INET_ECN_encapsulate(). The declaration and
// default of inner, and the return type line, are not visible in this listing.
457 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
460 if (skb->protocol == htons(ETH_P_IP))
461 inner = old_iph->tos;
462 else if (skb->protocol == htons(ETH_P_IPV6))
463 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
464 return INET_ECN_encapsulate(tos, inner);
// Receive handler for IPPROTO_GRE (installed as ipgre_protocol.handler).
// Visible stages: require 16 pullable bytes; parse the GRE flags and bail out on version or routing
// bits; verify the GRE checksum when present; read the optional key and sequence number at h + offset;
// look up the tunnel (outer saddr as remote, daddr as local) under the ipgre_lock read lock; strip the
// GRE header; enforce the checksum and sequencing policy configured in i_flags; for Ethernet-type
// devices additionally consume the inner Ethernet header; account the packet and hand it to the tunnel
// device with ECN propagated. When no tunnel matches, an ICMP port-unreachable is sent.
467 static int ipgre_rcv(struct sk_buff *skb)
475 struct ip_tunnel *tunnel;
480 if (!pskb_may_pull(skb, 16))
487 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
488 /* - Version must be 0.
489 - We do not support routing headers.
491 if (flags&(GRE_VERSION|GRE_ROUTING))
494 if (flags&GRE_CSUM) {
495 switch (skb->ip_summed) {
496 case CHECKSUM_COMPLETE:
497 csum = csum_fold(skb->csum);
503 csum = __skb_checksum_complete(skb);
504 skb->ip_summed = CHECKSUM_COMPLETE;
509 key = *(__be32*)(h + offset);
513 seqno = ntohl(*(__be32*)(h + offset));
518 gre_proto = *(__be16 *)(h + 2);
520 read_lock(&ipgre_lock);
521 if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
522 iph->saddr, iph->daddr, key,
524 struct net_device_stats *stats = &tunnel->dev->stats;
528 skb->protocol = gre_proto;
529 /* WCCP version 1 and 2 protocol decoding.
530 * - Change protocol to IP
531 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
533 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
534 skb->protocol = htons(ETH_P_IP);
535 if ((*(h + offset) & 0xF0) != 0x40)
539 skb->mac_header = skb->network_header;
540 __pskb_pull(skb, offset);
541 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
542 skb->pkt_type = PACKET_HOST;
543 #ifdef CONFIG_NET_IPGRE_BROADCAST
544 if (ipv4_is_multicast(iph->daddr)) {
545 /* Looped back packet, drop it! */
546 if (skb->rtable->fl.iif == 0)
549 skb->pkt_type = PACKET_BROADCAST;
// A bad checksum, or a missing checksum on a tunnel that requires one, counts as a CRC error.
553 if (((flags&GRE_CSUM) && csum) ||
554 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
555 stats->rx_crc_errors++;
// With sequencing required, a missing or older-than-expected sequence number counts as a fifo error;
// the signed difference keeps the comparison valid across 32-bit wrap-around.
559 if (tunnel->parms.i_flags&GRE_SEQ) {
560 if (!(flags&GRE_SEQ) ||
561 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
562 stats->rx_fifo_errors++;
566 tunnel->i_seqno = seqno + 1;
571 /* Warning: All skb pointers will be invalidated! */
572 if (tunnel->dev->type == ARPHRD_ETHER) {
573 if (!pskb_may_pull(skb, ETH_HLEN)) {
574 stats->rx_length_errors++;
580 skb->protocol = eth_type_trans(skb, tunnel->dev);
581 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
585 stats->rx_bytes += len;
586 skb->dev = tunnel->dev;
587 dst_release(skb->dst);
591 skb_reset_network_header(skb);
592 ipgre_ecn_decapsulate(iph, skb);
595 read_unlock(&ipgre_lock);
598 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
601 read_unlock(&ipgre_lock);
// Transmit handler (ndo_start_xmit in both op tables): wraps the packet in an outer IPv4 + GRE header.
// Visible stages: recursion guard via tunnel->recursion; choice of header template (a header already
// present in skb->data for ARPHRD_IPGRE devices with header_ops, otherwise tunnel->parms.iph); for
// tunnels without a fixed destination, derivation of dst from the IPv4 route gateway or from a
// v4-compatible IPv6 neighbour/destination address; route lookup; path-MTU bookkeeping with
// ICMP frag-needed / ICMPv6 packet-too-big replies; headroom reallocation; and finally filling in the
// outer IP header plus the GRE flags, protocol and optional fields. Error exits are not visible here.
607 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
609 struct ip_tunnel *tunnel = netdev_priv(dev);
610 struct net_device_stats *stats = &tunnel->dev->stats;
611 struct iphdr *old_iph = ip_hdr(skb);
615 struct rtable *rt; /* Route to the other host */
616 struct net_device *tdev; /* Device to other host */
617 struct iphdr *iph; /* Our new IP header */
618 unsigned int max_headroom; /* The extra header space needed */
623 if (tunnel->recursion++) {
628 if (dev->type == ARPHRD_ETHER)
629 IPCB(skb)->flags = 0;
631 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
633 tiph = (struct iphdr *)skb->data;
635 gre_hlen = tunnel->hlen;
636 tiph = &tunnel->parms.iph;
// No destination in the template: work one out from the routing state of the inner packet.
639 if ((dst = tiph->daddr) == 0) {
642 if (skb->dst == NULL) {
643 stats->tx_fifo_errors++;
647 if (skb->protocol == htons(ETH_P_IP)) {
649 if ((dst = rt->rt_gateway) == 0)
653 else if (skb->protocol == htons(ETH_P_IPV6)) {
654 struct in6_addr *addr6;
656 struct neighbour *neigh = skb->dst->neighbour;
661 addr6 = (struct in6_addr *)&neigh->primary_key;
662 addr_type = ipv6_addr_type(addr6);
664 if (addr_type == IPV6_ADDR_ANY) {
665 addr6 = &ipv6_hdr(skb)->daddr;
666 addr_type = ipv6_addr_type(addr6);
669 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
672 dst = addr6->s6_addr32[3];
681 if (skb->protocol == htons(ETH_P_IP))
687 struct flowi fl = { .oif = tunnel->parms.link,
690 .saddr = tiph->saddr,
691 .tos = RT_TOS(tos) } },
692 .proto = IPPROTO_GRE };
693 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
694 stats->tx_carrier_errors++;
698 tdev = rt->u.dst.dev;
// Payload MTU through the tunnel: route MTU minus the link header and the full tunnel header.
708 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
710 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
713 skb->dst->ops->update_pmtu(skb->dst, mtu);
715 if (skb->protocol == htons(ETH_P_IP)) {
716 df |= (old_iph->frag_off&htons(IP_DF));
718 if ((old_iph->frag_off&htons(IP_DF)) &&
719 mtu < ntohs(old_iph->tot_len)) {
720 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
726 else if (skb->protocol == htons(ETH_P_IPV6)) {
727 struct rt6_info *rt6 = (struct rt6_info *)skb->dst;
729 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
730 if ((tunnel->parms.iph.daddr &&
731 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
732 rt6->rt6i_dst.plen == 128) {
733 rt6->rt6i_flags |= RTF_MODIFIED;
734 skb->dst->metrics[RTAX_MTU-1] = mtu;
738 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
739 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
// Recent ICMP errors recorded by ipgre_err are reported to the sender as a link failure.
746 if (tunnel->err_count > 0) {
747 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
750 dst_link_failure(skb);
752 tunnel->err_count = 0;
755 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
757 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
758 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
759 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
768 skb_set_owner_w(new_skb, skb->sk);
// The buffer may have been replaced, so the inner header pointer is re-read.
771 old_iph = ip_hdr(skb);
774 skb_reset_transport_header(skb);
775 skb_push(skb, gre_hlen);
776 skb_reset_network_header(skb);
777 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
778 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
780 dst_release(skb->dst);
781 skb->dst = &rt->u.dst;
784 * Push down and install the IPIP header.
789 iph->ihl = sizeof(struct iphdr) >> 2;
791 iph->protocol = IPPROTO_GRE;
792 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
793 iph->daddr = rt->rt_dst;
794 iph->saddr = rt->rt_src;
// A template ttl of 0 means inherit: from the inner IPv4 ttl or IPv6 hop limit, else the route metric.
796 if ((iph->ttl = tiph->ttl) == 0) {
797 if (skb->protocol == htons(ETH_P_IP))
798 iph->ttl = old_iph->ttl;
800 else if (skb->protocol == htons(ETH_P_IPV6))
801 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
804 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
807 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
808 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
809 htons(ETH_P_TEB) : skb->protocol;
// Optional fields are written starting from the last 32-bit word of the tunnel header, in the order
// sequence number, key, checksum (the pointer adjustments between them are not visible here).
811 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
812 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
814 if (tunnel->parms.o_flags&GRE_SEQ) {
816 *ptr = htonl(tunnel->o_seqno);
819 if (tunnel->parms.o_flags&GRE_KEY) {
820 *ptr = tunnel->parms.o_key;
823 if (tunnel->parms.o_flags&GRE_CSUM) {
825 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
836 dst_link_failure(skb);
// Work out the encapsulation overhead and a suitable MTU for a tunnel device from its parameters.
// Routes towards the configured destination (falling back to the device named by parms.link) to find
// the underlying device, then sets dev->needed_headroom, dev->iflink and tunnel->hlen. Judging by the
// callers, which store the result in dev->mtu, the computed MTU is the return value.
// addend is the outer IPv4 header plus the 4-byte base GRE header; it is extended for each optional
// field enabled in o_flags (the increments are not visible in this listing).
845 static int ipgre_tunnel_bind_dev(struct net_device *dev)
847 struct net_device *tdev = NULL;
848 struct ip_tunnel *tunnel;
850 int hlen = LL_MAX_HEADER;
851 int mtu = ETH_DATA_LEN;
852 int addend = sizeof(struct iphdr) + 4;
854 tunnel = netdev_priv(dev);
855 iph = &tunnel->parms.iph;
857 /* Guess output device to choose reasonable mtu and needed_headroom */
860 struct flowi fl = { .oif = tunnel->parms.link,
862 { .daddr = iph->daddr,
864 .tos = RT_TOS(iph->tos) } },
865 .proto = IPPROTO_GRE };
867 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
868 tdev = rt->u.dst.dev;
872 if (dev->type != ARPHRD_ETHER)
873 dev->flags |= IFF_POINTOPOINT;
876 if (!tdev && tunnel->parms.link)
877 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
880 hlen = tdev->hard_header_len + tdev->needed_headroom;
883 dev->iflink = tunnel->parms.link;
885 /* Precalculate GRE options length */
886 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
887 if (tunnel->parms.o_flags&GRE_CSUM)
889 if (tunnel->parms.o_flags&GRE_KEY)
891 if (tunnel->parms.o_flags&GRE_SEQ)
894 dev->needed_headroom = addend + hlen;
// The tunnel MTU is the lower MTU minus every byte of encapsulation, so addend is subtracted together
// with hard_header_len. Writing this as hard_header_len - addend would add the GRE overhead back in,
// which disagrees with the xmit path (route MTU - hard_header_len - hlen) and with the setup default.
895 mtu -= dev->hard_header_len + addend;
900 tunnel->hlen = addend;
// Legacy tunnel ioctl entry point (ndo_do_ioctl). The case labels are not visible in this listing;
// the visible bodies cover three operations:
//  - read: copy the tunnel parameters to user space; on the fallback device the tunnel is first
//    looked up from user-supplied parameters;
//  - add/change (SIOCADDTUNNEL / SIOCCHGTUNNEL): needs CAP_NET_ADMIN, validates the supplied IPv4/GRE
//    template, then locates or creates the tunnel; a change re-hashes the tunnel when endpoints or
//    keys differ and re-binds the device when the link changes;
//  - delete: needs CAP_NET_ADMIN; on the fallback device the target is looked up from user parameters
//    and compared against the fallback tunnel itself (presumably to refuse deleting it).
906 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
909 struct ip_tunnel_parm p;
911 struct net *net = dev_net(dev);
912 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
917 if (dev == ign->fb_tunnel_dev) {
918 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
922 t = ipgre_tunnel_locate(net, &p, 0);
925 t = netdev_priv(dev);
926 memcpy(&p, &t->parms, sizeof(p));
927 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
934 if (!capable(CAP_NET_ADMIN))
938 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
// Template sanity: IPv4, protocol GRE, no IP options, no fragment bits other than DF, and no
// GRE version or routing flags in either direction.
942 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
943 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
944 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
947 p.iph.frag_off |= htons(IP_DF);
949 if (!(p.i_flags&GRE_KEY))
951 if (!(p.o_flags&GRE_KEY))
954 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
956 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
965 t = netdev_priv(dev);
967 if (ipv4_is_multicast(p.iph.daddr))
968 nflags = IFF_BROADCAST;
969 else if (p.iph.daddr)
970 nflags = IFF_POINTOPOINT;
// The new destination must not change whether the device is point-to-point or broadcast.
972 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
// The hash position depends on the endpoints and key, so the tunnel is unlinked before the update
// and linked again afterwards.
976 ipgre_tunnel_unlink(ign, t);
977 t->parms.iph.saddr = p.iph.saddr;
978 t->parms.iph.daddr = p.iph.daddr;
979 t->parms.i_key = p.i_key;
980 t->parms.o_key = p.o_key;
981 memcpy(dev->dev_addr, &p.iph.saddr, 4);
982 memcpy(dev->broadcast, &p.iph.daddr, 4);
983 ipgre_tunnel_link(ign, t);
984 netdev_state_change(dev);
990 if (cmd == SIOCCHGTUNNEL) {
991 t->parms.iph.ttl = p.iph.ttl;
992 t->parms.iph.tos = p.iph.tos;
993 t->parms.iph.frag_off = p.iph.frag_off;
994 if (t->parms.link != p.link) {
995 t->parms.link = p.link;
996 dev->mtu = ipgre_tunnel_bind_dev(dev);
997 netdev_state_change(dev);
1000 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1003 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1008 if (!capable(CAP_NET_ADMIN))
1011 if (dev == ign->fb_tunnel_dev) {
1013 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1016 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1019 if (t == netdev_priv(ign->fb_tunnel_dev))
1023 unregister_netdevice(dev);
// ndo_change_mtu: the visible bound limits new_mtu to 0xFFF8 minus the link header and tunnel header
// lengths. Any lower bound and the assignment of dev->mtu are not visible in this listing.
1035 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1037 struct ip_tunnel *tunnel = netdev_priv(dev);
1039 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1045 /* Nice toy. Unfortunately, useless in real life :-)
1046 It allows to construct virtual multiprotocol broadcast "LAN"
1047 over the Internet, provided multicast routing is tuned.
1050 I have no idea was this bicycle invented before me,
1051 so that I had to set ARPHRD_IPGRE to a random value.
1052 I have an impression, that Cisco could make something similar,
1053 but this feature is apparently missing in IOS<=11.2(8).
1055 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1056 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1058 ping -t 255 224.66.66.66
1060 If nobody answers, mbone does not work.
1062 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1063 ip addr add 10.66.66.<somewhat>/24 dev Universe
1064 ifconfig Universe up
1065 ifconfig Universe add fe80::<Your_real_addr>/10
1066 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1069 ftp fec0:6666:6666::193.233.7.65
// header_ops.create callback: prepend t->hlen bytes to skb and seed them from the tunnel template IP
// header and the outgoing GRE flags. saddr and daddr, when used, are 4-byte IPv4 addresses copied
// over the template addresses (the guarding conditions and return values are not visible here).
1074 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1075 unsigned short type,
1076 const void *daddr, const void *saddr, unsigned len)
1078 struct ip_tunnel *t = netdev_priv(dev);
1079 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1080 __be16 *p = (__be16*)(iph+1);
1082 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1083 p[0] = t->parms.o_flags;
1087 * Set the source hardware address.
1091 memcpy(&iph->saddr, saddr, 4);
1094 memcpy(&iph->daddr, daddr, 4);
1097 if (iph->daddr && !ipv4_is_multicast(iph->daddr))
// header_ops.parse callback: copy the 4-byte IPv4 source address of the IP header found at the
// skb MAC header position into haddr.
1103 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1105 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1106 memcpy(haddr, &iph->saddr, 4);
// Link-layer header operations for GRE devices; assigned to dev->header_ops in ipgre_tunnel_init.
1110 static const struct header_ops ipgre_header_ops = {
1111 .create = ipgre_header,
1112 .parse = ipgre_header_parse,
1115 #ifdef CONFIG_NET_IPGRE_BROADCAST
// ndo_open (built only with CONFIG_NET_IPGRE_BROADCAST): for a multicast destination, route towards
// the group to find the underlying device, remember its ifindex in t->mlink and join the group on it.
// Note that dev is reassigned to the underlying device part-way through. Returns -EADDRNOTAVAIL when
// the route lookup fails or the underlying device has no in_device attached.
1116 static int ipgre_open(struct net_device *dev)
1118 struct ip_tunnel *t = netdev_priv(dev);
1120 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1121 struct flowi fl = { .oif = t->parms.link,
1123 { .daddr = t->parms.iph.daddr,
1124 .saddr = t->parms.iph.saddr,
1125 .tos = RT_TOS(t->parms.iph.tos) } },
1126 .proto = IPPROTO_GRE };
1128 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1129 return -EADDRNOTAVAIL;
1130 dev = rt->u.dst.dev;
1132 if (__in_dev_get_rtnl(dev) == NULL)
1133 return -EADDRNOTAVAIL;
1134 t->mlink = dev->ifindex;
1135 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
// ndo_stop counterpart of ipgre_open: when the destination is multicast and t->mlink was recorded,
// find the in_device by that saved ifindex and leave the group.
1140 static int ipgre_close(struct net_device *dev)
1142 struct ip_tunnel *t = netdev_priv(dev);
1144 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1145 struct in_device *in_dev;
1146 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1148 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
// Device operations for plain GRE devices. The open/stop hooks exist only when
// CONFIG_NET_IPGRE_BROADCAST is enabled.
1157 static const struct net_device_ops ipgre_netdev_ops = {
1158 .ndo_init = ipgre_tunnel_init,
1159 .ndo_uninit = ipgre_tunnel_uninit,
1160 #ifdef CONFIG_NET_IPGRE_BROADCAST
1161 .ndo_open = ipgre_open,
1162 .ndo_stop = ipgre_close,
1164 .ndo_start_xmit = ipgre_tunnel_xmit,
1165 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1166 .ndo_change_mtu = ipgre_tunnel_change_mtu,
// Setup callback for plain GRE devices (passed to alloc_netdev and used as rtnl setup hook).
// Installs the ops table and destructor and sets defaults: type ARPHRD_IPGRE, IFF_NOARP, headroom and
// MTU sized for an outer IPv4 header plus the 4-byte base GRE header, and NETIF_F_NETNS_LOCAL.
1169 static void ipgre_tunnel_setup(struct net_device *dev)
1171 dev->netdev_ops = &ipgre_netdev_ops;
1172 dev->destructor = free_netdev;
1174 dev->type = ARPHRD_IPGRE;
1175 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1176 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1177 dev->flags = IFF_NOARP;
1180 dev->features |= NETIF_F_NETNS_LOCAL;
// ndo_init for plain GRE devices: copy the device name into parms and expose the tunnel endpoints as
// the device hardware and broadcast addresses (4 bytes each). With CONFIG_NET_IPGRE_BROADCAST a
// multicast destination makes the device IFF_BROADCAST. header_ops is set to ipgre_header_ops on the
// visible paths; the conditions around the second assignment are not visible in this listing.
1183 static int ipgre_tunnel_init(struct net_device *dev)
1185 struct ip_tunnel *tunnel;
1188 tunnel = netdev_priv(dev);
1189 iph = &tunnel->parms.iph;
1192 strcpy(tunnel->parms.name, dev->name);
1194 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1195 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1198 #ifdef CONFIG_NET_IPGRE_BROADCAST
1199 if (ipv4_is_multicast(iph->daddr)) {
1202 dev->flags = IFF_BROADCAST;
1203 dev->header_ops = &ipgre_header_ops;
1207 dev->header_ops = &ipgre_header_ops;
// Initialise the per-namespace fallback device: name copied into parms, template protocol set to GRE,
// header length set to IPv4 plus base GRE, and the tunnel stored directly as head of wildcard bucket 0.
1212 static void ipgre_fb_tunnel_init(struct net_device *dev)
1214 struct ip_tunnel *tunnel = netdev_priv(dev);
1215 struct iphdr *iph = &tunnel->parms.iph;
1216 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1219 strcpy(tunnel->parms.name, dev->name);
1222 iph->protocol = IPPROTO_GRE;
1224 tunnel->hlen = sizeof(struct iphdr) + 4;
1227 ign->tunnels_wc[0] = tunnel;
// Protocol hooks registered for IPPROTO_GRE in ipgre_init: packet receive and ICMP error handling.
1231 static struct net_protocol ipgre_protocol = {
1232 .handler = ipgre_rcv,
1233 .err_handler = ipgre_err,
// Unregister every tunnel device in all four tables of a namespace. The inner loop re-reads the chain
// head each time, so it relies on unregister_netdevice() taking the tunnel off the chain (presumably
// via ndo_uninit, where ipgre_tunnel_uninit unlinks it); otherwise it would not terminate.
1237 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1241 for (prio = 0; prio < 4; prio++) {
1243 for (h = 0; h < HASH_SIZE; h++) {
1244 struct ip_tunnel *t;
1245 while ((t = ign->tunnels[prio][h]) != NULL)
1246 unregister_netdevice(t->dev);
// Per-namespace init: allocate the zeroed namespace state, publish it under ipgre_net_id, then
// create, initialise and register the gre0 fallback device. The error labels are not visible here;
// the one visible cleanup step frees the fallback device.
1251 static int ipgre_init_net(struct net *net)
1254 struct ipgre_net *ign;
1257 ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1261 err = net_assign_generic(net, ipgre_net_id, ign);
1265 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1266 ipgre_tunnel_setup);
1267 if (!ign->fb_tunnel_dev) {
1271 dev_net_set(ign->fb_tunnel_dev, net);
1273 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1274 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1276 if ((err = register_netdev(ign->fb_tunnel_dev)))
1282 free_netdev(ign->fb_tunnel_dev);
// Per-namespace exit: fetch the namespace state and tear down all of its tunnels. Locking and the
// release of the state itself are not visible in this listing.
1291 static void ipgre_exit_net(struct net *net)
1293 struct ipgre_net *ign;
1295 ign = net_generic(net, ipgre_net_id);
1297 ipgre_destroy_tunnels(ign);
// Per-namespace lifecycle hooks, registered together with ipgre_net_id in ipgre_init.
1302 static struct pernet_operations ipgre_net_ops = {
1303 .init = ipgre_init_net,
1304 .exit = ipgre_exit_net,
// rtnl validate hook: OR together the requested input and output GRE flags and test the result for
// GRE_VERSION or GRE_ROUTING bits (the rejecting return is not visible in this listing).
1307 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1315 if (data[IFLA_GRE_IFLAGS])
1316 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1317 if (data[IFLA_GRE_OFLAGS])
1318 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1319 if (flags & (GRE_VERSION|GRE_ROUTING))
// rtnl validate hook for tap devices: a supplied IFLA_ADDRESS must be ETH_ALEN bytes long and pass
// is_valid_ether_addr() (else -EADDRNOTAVAIL); the remote address is copied out for a check that is
// not visible here; finally the common GRE flag validation runs.
1325 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1329 if (tb[IFLA_ADDRESS]) {
1330 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1332 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1333 return -EADDRNOTAVAIL;
1339 if (data[IFLA_GRE_REMOTE]) {
1340 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1346 return ipgre_tunnel_validate(tb, data);
// Translate IFLA_GRE_* netlink attributes into an ip_tunnel_parm. parms is zeroed first, so absent
// attributes leave their field at 0; the exception is DF, which is set in frag_off unless
// IFLA_GRE_PMTUDISC is present and zero.
1349 static void ipgre_netlink_parms(struct nlattr *data[],
1350 struct ip_tunnel_parm *parms)
1352 memset(parms, 0, sizeof(*parms));
1354 parms->iph.protocol = IPPROTO_GRE;
1359 if (data[IFLA_GRE_LINK])
1360 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1362 if (data[IFLA_GRE_IFLAGS])
1363 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1365 if (data[IFLA_GRE_OFLAGS])
1366 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1368 if (data[IFLA_GRE_IKEY])
1369 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1371 if (data[IFLA_GRE_OKEY])
1372 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1374 if (data[IFLA_GRE_LOCAL])
1375 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1377 if (data[IFLA_GRE_REMOTE])
1378 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1380 if (data[IFLA_GRE_TTL])
1381 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1383 if (data[IFLA_GRE_TOS])
1384 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1386 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1387 parms->iph.frag_off = htons(IP_DF);
// ndo_init for tap devices: copy the device name into parms and bind to the underlying device.
// The MTU value returned by ipgre_tunnel_bind_dev() is not used here.
1390 static int ipgre_tap_init(struct net_device *dev)
1392 struct ip_tunnel *tunnel;
1394 tunnel = netdev_priv(dev);
1397 strcpy(tunnel->parms.name, dev->name);
1399 ipgre_tunnel_bind_dev(dev);
// Device operations table for Ethernet-over-GRE (tap) devices: uses ipgre_tap_init and the generic
// Ethernet MAC address handlers, and shares uninit, xmit and change_mtu with the plain GRE table.
1404 static const struct net_device_ops ipgre_tap_netdev_ops = {
1405 .ndo_init = ipgre_tap_init,
1406 .ndo_uninit = ipgre_tunnel_uninit,
1407 .ndo_start_xmit = ipgre_tunnel_xmit,
1408 .ndo_set_mac_address = eth_mac_addr,
1409 .ndo_validate_addr = eth_validate_addr,
1410 .ndo_change_mtu = ipgre_tunnel_change_mtu,
// Setup callback for the tap (gretap) link type. Installs ipgre_tap_netdev_ops, the table defined for
// tap devices (ipgre_tap_init plus the generic Ethernet MAC handlers). The plain GRE table must not be
// used here: its ndo_init, ipgre_tunnel_init, overwrites dev_addr with the IPv4 tunnel source and
// installs the GRE header_ops, clobbering the Ethernet address chosen in ipgre_newlink.
1413 static void ipgre_tap_setup(struct net_device *dev)
1418 dev->netdev_ops = &ipgre_tap_netdev_ops;
1419 dev->destructor = free_netdev;
1422 dev->features |= NETIF_F_NETNS_LOCAL;
// rtnl newlink: fill the tunnel parameters from netlink, check for an existing tunnel with identical
// parameters and device type (presumably to reject duplicates), give Ethernet-type devices a random
// MAC when no IFLA_ADDRESS was supplied, compute the MTU, register the device and hash the tunnel.
1425 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1426 struct nlattr *data[])
1428 struct ip_tunnel *nt;
1429 struct net *net = dev_net(dev);
1430 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1434 nt = netdev_priv(dev);
1435 ipgre_netlink_parms(data, &nt->parms);
1437 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1440 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1441 random_ether_addr(dev->dev_addr);
1443 mtu = ipgre_tunnel_bind_dev(dev);
1447 err = register_netdevice(dev);
1452 ipgre_tunnel_link(ign, nt);
// rtnl changelink. The fallback device is checked for first (the early return is not visible here).
// The new parameters are parsed and an existing tunnel using them is looked up. Endpoint or input-key
// changes re-hash the tunnel (unlink, update, link) after checking that the point-to-point/broadcast
// nature of the device would not change. Output key, ttl, tos and frag_off are updated in place, and a
// changed link re-binds the device to recompute the MTU.
1458 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1459 struct nlattr *data[])
1461 struct ip_tunnel *t, *nt;
1462 struct net *net = dev_net(dev);
1463 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1464 struct ip_tunnel_parm p;
1467 if (dev == ign->fb_tunnel_dev)
1470 nt = netdev_priv(dev);
1471 ipgre_netlink_parms(data, &p);
1473 t = ipgre_tunnel_locate(net, &p, 0);
1479 unsigned nflags = 0;
1483 if (ipv4_is_multicast(p.iph.daddr))
1484 nflags = IFF_BROADCAST;
1485 else if (p.iph.daddr)
1486 nflags = IFF_POINTOPOINT;
1488 if ((dev->flags ^ nflags) &
1489 (IFF_POINTOPOINT | IFF_BROADCAST))
1492 ipgre_tunnel_unlink(ign, t);
1493 t->parms.iph.saddr = p.iph.saddr;
1494 t->parms.iph.daddr = p.iph.daddr;
1495 t->parms.i_key = p.i_key;
1496 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1497 memcpy(dev->broadcast, &p.iph.daddr, 4);
1498 ipgre_tunnel_link(ign, t);
1499 netdev_state_change(dev);
1502 t->parms.o_key = p.o_key;
1503 t->parms.iph.ttl = p.iph.ttl;
1504 t->parms.iph.tos = p.iph.tos;
1505 t->parms.iph.frag_off = p.iph.frag_off;
1507 if (t->parms.link != p.link) {
1508 t->parms.link = p.link;
1509 mtu = ipgre_tunnel_bind_dev(dev);
1512 netdev_state_change(dev);
// rtnl get_size hook: reports the netlink space needed for the IFLA_GRE_* attributes named in the
// comments below. The size terms themselves are not visible in this listing.
1518 static size_t ipgre_get_size(const struct net_device *dev)
1523 /* IFLA_GRE_IFLAGS */
1525 /* IFLA_GRE_OFLAGS */
1531 /* IFLA_GRE_LOCAL */
1533 /* IFLA_GRE_REMOTE */
1539 /* IFLA_GRE_PMTUDISC */
// rtnl fill_info hook: dump the tunnel parameters as IFLA_GRE_* attributes. PMTUDISC is reported as 1
// when DF is set in the template frag_off and 0 otherwise.
1544 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1546 struct ip_tunnel *t = netdev_priv(dev);
1547 struct ip_tunnel_parm *p = &t->parms;
1549 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1550 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1551 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1552 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1553 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1554 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1555 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1556 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1557 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1558 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
// Netlink attribute policy for IFLA_GRE_*. LOCAL and REMOTE carry no type; they specify a length
// equal to the size of the IPv4 address fields of struct iphdr.
1566 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1567 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1568 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1569 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1570 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1571 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
1572 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1573 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1574 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1575 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1576 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
// rtnl link type for plain GRE devices (the kind string line is not visible in this listing).
1579 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1581 .maxtype = IFLA_GRE_MAX,
1582 .policy = ipgre_policy,
1583 .priv_size = sizeof(struct ip_tunnel),
1584 .setup = ipgre_tunnel_setup,
1585 .validate = ipgre_tunnel_validate,
1586 .newlink = ipgre_newlink,
1587 .changelink = ipgre_changelink,
1588 .get_size = ipgre_get_size,
1589 .fill_info = ipgre_fill_info,
// rtnl link type for tap devices (the kind string line is not visible in this listing). It differs
// from ipgre_link_ops only in the setup and validate hooks.
1592 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1594 .maxtype = IFLA_GRE_MAX,
1595 .policy = ipgre_policy,
1596 .priv_size = sizeof(struct ip_tunnel),
1597 .setup = ipgre_tap_setup,
1598 .validate = ipgre_tap_validate,
1599 .newlink = ipgre_newlink,
1600 .changelink = ipgre_changelink,
1601 .get_size = ipgre_get_size,
1602 .fill_info = ipgre_fill_info,
1606 * And now the modules code and kernel interface.
// Module init: register the IPPROTO_GRE handler, the per-namespace operations and the two rtnl link
// types, in that order. The visible unwind steps undo the earlier registrations in reverse order; the
// labels they sit under are not visible in this listing.
1609 static int __init ipgre_init(void)
1613 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1615 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1616 printk(KERN_INFO "ipgre init: can't add protocol\n");
1620 err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1622 goto gen_device_failed;
1624 err = rtnl_link_register(&ipgre_link_ops);
1626 goto rtnl_link_failed;
1628 err = rtnl_link_register(&ipgre_tap_ops);
1630 goto tap_ops_failed;
1636 rtnl_link_unregister(&ipgre_link_ops);
1638 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1640 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
// Module exit: undo ipgre_init in reverse order - both rtnl link types, the per-namespace operations,
// then the protocol handler. A failure to remove the handler is only logged.
1644 static void __exit ipgre_fini(void)
1646 rtnl_link_unregister(&ipgre_tap_ops);
1647 rtnl_link_unregister(&ipgre_link_ops);
1648 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1649 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1650 printk(KERN_INFO "ipgre close: can't remove protocol\n");
// Module entry points, licence, and the rtnl link aliases gre and gretap.
1653 module_init(ipgre_init);
1654 module_exit(ipgre_fini);
1655 MODULE_LICENSE("GPL");
1656 MODULE_ALIAS_RTNL_LINK("gre");
1657 MODULE_ALIAS_RTNL_LINK("gretap");