2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
36 #include <net/protocol.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
57 1. The most important issue is detecting local dead loops.
58 They would cause complete host lockup in transmit, which
59 would be "resolved" by stack overflow or, if queueing is enabled,
60 with infinite looping in net_bh.
62 We cannot track such dead loops during route installation,
63 it is an infeasible task. The most general solution would be
64 to keep skb->encapsulation counter (sort of local ttl),
65 and silently drop packet when it expires. It is the best
66 solution, but it requires maintaining a new variable in ALL
67 skb, even if no tunneling is used.
69 Current solution: t->recursion lock breaks dead loops. It looks
70 like dev->tbusy flag, but I preferred new variable, because
71 the semantics is different. One day, when hard_start_xmit
72 will be multithreaded we will have to use skb->encapsulation.
76 2. Networking dead loops would not kill routers, but would really
77 kill network. IP hop limit plays role of "t->recursion" in this case,
78 if we copy it from packet being encapsulated to upper header.
79 It is very good solution, but it introduces two problems:
81 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82 do not work over tunnels.
83 - traceroute does not work. I planned to relay ICMP from tunnel,
84 so that this problem would be solved and traceroute output
85 would be even more informative. This idea turned out to be wrong:
86 only Linux complies to rfc1812 now (yes, guys, Linux is the only
87 true router now :-)), all routers (at least, in neighbourhood of mine)
88 return only 8 bytes of payload. It is the end.
90 Hence, if we want OSPF to work or traceroute to say something reasonable,
91 we should search for another solution.
93 One of them is to parse packet trying to detect inner encapsulation
94 made by our node. It is difficult or even impossible, especially,
95 taking into account fragmentation. To be short, it is not a solution at all.
97 Current solution: The solution was UNEXPECTEDLY SIMPLE.
98 We force DF flag on tunnels with preconfigured hop limit,
99 that is ALL. :-) Well, it does not remove the problem completely,
100 but exponential growth of network traffic is changed to linear
101 (branches, that exceed pmtu are pruned) and tunnel mtu
102 quickly degrades to a value <68, where looping stops.
103 Yes, it is not good if there exists a router in the loop,
104 which does not force DF, even when encapsulating packets have DF set.
105 But it is not our problem! Nobody could accuse us, we made
106 all that we could make. Even if it is your gated who injected
107 fatal route to network, even if it were you who configured
108 fatal static route: you are innocent. :-)
112 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113 practically identical code. It would be good to glue them
114 together, but it is not very evident, how to make them modular.
115 sit is integral part of IPv6, ipip and gre are naturally modular.
116 We could extract common parts (hash table, ioctl etc)
117 to a separate module (ip_tunnel.c).
122 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
123 static int ipgre_tunnel_init(struct net_device *dev);
124 static void ipgre_tunnel_setup(struct net_device *dev);
125 static int ipgre_tunnel_bind_dev(struct net_device *dev);
127 /* Fallback tunnel: no source, no destination, no key, no options */
129 static int ipgre_fb_tunnel_init(struct net_device *dev);
133 static int ipgre_net_id;
135 struct ip_tunnel *tunnels[4][HASH_SIZE];
137 struct net_device *fb_tunnel_dev;
140 /* Tunnel hash table */
150 We require exact key match i.e. if a key is present in packet
151 it will match only tunnel with the same key; if it is not present,
152 it will match only keyless tunnel.
154 All keyless packets, if not matched by configured keyless tunnels,
155 will match fallback tunnel.
/*
 * HASH() reduces an address or key to a 4-bit bucket index: the value is
 * XOR-ed with itself shifted right by four bits and masked with 0xF.
 * NOTE(review): the macro parameter is used without parentheses, so
 * callers should pass a plain identifier, as the uses in this file do.
 */
158 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
/*
 * Row aliases for the tunnels[4][HASH_SIZE] table.  As used by the lookup
 * code in this file: tunnels_r_l chains are matched on both remote and
 * local address, tunnels_r on remote only, tunnels_l on local only, and
 * tunnels_wc without any address comparison.
 */
160 #define tunnels_r_l tunnels[3]
161 #define tunnels_r tunnels[2]
162 #define tunnels_l tunnels[1]
163 #define tunnels_wc tunnels[0]
/*
 * Guards the hash chains: taken for writing (with bottom halves disabled)
 * when a tunnel is linked or unlinked, and for reading in the receive and
 * ICMP error handlers.
 */
165 static DEFINE_RWLOCK(ipgre_lock);
167 /* Given src, dst and key, find appropriate for input tunnel. */
/*
 * Find the tunnel that should receive a packet with the given outer
 * addresses, GRE key and GRE protocol.
 *
 * The chains are walked in this order: tunnels_r_l (local and remote must
 * both match), tunnels_r (remote only), tunnels_l (local address, or a
 * multicast tunnel destination equal to local) and tunnels_wc (no address
 * check).  The first two are indexed by HASH(remote) ^ HASH(key), the last
 * two by HASH(key).  In every chain a tunnel is considered only if its
 * i_key equals key and its device is IFF_UP.  A device whose type equals
 * dev_type (ARPHRD_ETHER for ETH_P_TEB, ARPHRD_IPGRE otherwise) is tested
 * first; an ARPHRD_IPGRE device is tested next while t2 is still unset.
 * Last, the per-namespace fallback device is used if it is up.
 *
 * NOTE(review): the statements executed on each match and the remaining
 * returns are not visible in this excerpt; presumably t2 records a
 * second-choice match - confirm against the full source.
 */
169 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
170 __be32 remote, __be32 local,
171 __be32 key, __be16 gre_proto)
173 unsigned h0 = HASH(remote);
174 unsigned h1 = HASH(key);
176 struct ip_tunnel *t2 = NULL;
177 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
178 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
179 ARPHRD_ETHER : ARPHRD_IPGRE;
181 for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
182 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
183 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
184 if (t->dev->type == dev_type)
186 if (t->dev->type == ARPHRD_IPGRE && !t2)
192 for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
193 if (remote == t->parms.iph.daddr) {
194 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
195 if (t->dev->type == dev_type)
197 if (t->dev->type == ARPHRD_IPGRE && !t2)
203 for (t = ign->tunnels_l[h1]; t; t = t->next) {
204 if (local == t->parms.iph.saddr ||
205 (local == t->parms.iph.daddr &&
206 ipv4_is_multicast(local))) {
207 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
208 if (t->dev->type == dev_type)
210 if (t->dev->type == ARPHRD_IPGRE && !t2)
216 for (t = ign->tunnels_wc[h1]; t; t = t->next) {
217 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
218 if (t->dev->type == dev_type)
220 if (t->dev->type == ARPHRD_IPGRE && !t2)
228 if (ign->fb_tunnel_dev->flags&IFF_UP)
229 return netdev_priv(ign->fb_tunnel_dev);
/*
 * Return the address of the hash-chain head, &ign->tunnels[prio][h], that
 * tunnels with the given parameters belong to.  h starts out as
 * HASH(parms->i_key); a non-zero, non-multicast remote address is treated
 * as a separate case.
 * NOTE(review): the statements that compute prio and further adjust h are
 * not visible in this excerpt.
 */
233 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
234 struct ip_tunnel_parm *parms)
236 __be32 remote = parms->iph.daddr;
237 __be32 local = parms->iph.saddr;
238 __be32 key = parms->i_key;
239 unsigned h = HASH(key);
244 if (remote && !ipv4_is_multicast(remote)) {
249 return &ign->tunnels[prio][h];
/* Convenience wrapper: bucket lookup using the parameters stored in t. */
252 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
255 return __ipgre_bucket(ign, &t->parms);
/*
 * Insert tunnel t into the hash chain selected by ipgre_bucket().  The
 * chain update is bracketed by write_lock_bh / write_unlock_bh on
 * ipgre_lock.
 * NOTE(review): the pointer assignments performed around and under the
 * lock are not visible in this excerpt.
 */
258 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
260 struct ip_tunnel **tp = ipgre_bucket(ign, t);
263 write_lock_bh(&ipgre_lock);
265 write_unlock_bh(&ipgre_lock);
/*
 * Remove tunnel t from its hash chain.  The chain returned by
 * ipgre_bucket() is walked through the address of each next pointer, and
 * the write lock on ipgre_lock is held inside the loop body.
 * NOTE(review): the match test and the unlinking assignment are not
 * visible in this excerpt.
 */
268 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
270 struct ip_tunnel **tp;
272 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
274 write_lock_bh(&ipgre_lock);
276 write_unlock_bh(&ipgre_lock);
/*
 * Search the hash chain selected by parms for a tunnel whose local
 * address, remote address and input key all equal those in parms and
 * whose device type equals type.
 * NOTE(review): the last parameter of the signature and the return
 * statements are not visible in this excerpt; callers in this file test
 * the result for NULL.
 */
282 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
283 struct ip_tunnel_parm *parms,
286 __be32 remote = parms->iph.daddr;
287 __be32 local = parms->iph.saddr;
288 __be32 key = parms->i_key;
289 struct ip_tunnel *t, **tp;
290 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
292 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
293 if (local == t->parms.iph.saddr &&
294 remote == t->parms.iph.daddr &&
295 key == t->parms.i_key &&
296 type == t->dev->type)
/*
 * Look up an ARPHRD_IPGRE tunnel matching parms and, on the creation
 * path, allocate and register a new device for it.
 *
 * Visible creation steps: the device name is copied from parms->name
 * (bounded by IFNAMSIZ) or set to the template gre%d; the netdev is
 * allocated with ipgre_tunnel_setup as its setup routine and bound to
 * net; a name that contains a percent sign is resolved through
 * dev_alloc_name(); rtnl_link_ops is set; the mtu comes from
 * ipgre_tunnel_bind_dev(); the device is registered and finally linked
 * into the hash table.
 * NOTE(review): how the create argument gates this path, the error exits
 * and the return values are not visible in this excerpt.
 */
302 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
303 struct ip_tunnel_parm *parms, int create)
305 struct ip_tunnel *t, *nt;
306 struct net_device *dev;
308 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
310 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
315 strlcpy(name, parms->name, IFNAMSIZ);
317 sprintf(name, "gre%%d");
319 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
323 dev_net_set(dev, net);
325 if (strchr(name, '%')) {
326 if (dev_alloc_name(dev, name) < 0)
330 nt = netdev_priv(dev);
332 dev->rtnl_link_ops = &ipgre_link_ops;
334 dev->mtu = ipgre_tunnel_bind_dev(dev);
336 if (register_netdevice(dev) < 0)
340 ipgre_tunnel_link(ign, nt);
/*
 * Device uninit hook: remove the tunnel from the hash table of the
 * network namespace the device belongs to.
 */
348 static void ipgre_tunnel_uninit(struct net_device *dev)
350 struct net *net = dev_net(dev);
351 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
353 ipgre_tunnel_unlink(ign, netdev_priv(dev));
358 static void ipgre_err(struct sk_buff *skb, u32 info)
361 /* All the routers (except for Linux) return only
362 8 bytes of packet payload. It means, that precise relaying of
363 ICMP in the real Internet is absolutely infeasible.
365 Moreover, Cisco "wise men" put GRE key to the third word
366 in GRE header. It makes impossible maintaining even soft state for keyed
367 GRE tunnels with enabled checksum. Tell them "thank you".
369 Well, I wonder, rfc1812 was written by Cisco employee,
370 what the hell these idiots break standards established
374 struct iphdr *iph = (struct iphdr*)skb->data;
375 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
376 int grehlen = (iph->ihl<<2) + 4;
377 const int type = icmp_hdr(skb)->type;
378 const int code = icmp_hdr(skb)->code;
383 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
384 if (flags&(GRE_VERSION|GRE_ROUTING))
393 /* If only 8 bytes returned, keyed message will be dropped here */
394 if (skb_headlen(skb) < grehlen)
399 case ICMP_PARAMETERPROB:
402 case ICMP_DEST_UNREACH:
405 case ICMP_PORT_UNREACH:
406 /* Impossible event. */
408 case ICMP_FRAG_NEEDED:
409 /* Soft state for pmtu is maintained by IP core. */
412 /* All others are translated to HOST_UNREACH.
413 rfc2003 contains "deep thoughts" about NET_UNREACH,
414 I believe they are just ether pollution. --ANK
419 case ICMP_TIME_EXCEEDED:
420 if (code != ICMP_EXC_TTL)
425 read_lock(&ipgre_lock);
426 t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
428 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
430 if (t == NULL || t->parms.iph.daddr == 0 ||
431 ipv4_is_multicast(t->parms.iph.daddr))
434 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
437 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
441 t->err_time = jiffies;
443 read_unlock(&ipgre_lock);
447 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
449 if (INET_ECN_is_ce(iph->tos)) {
450 if (skb->protocol == htons(ETH_P_IP)) {
451 IP_ECN_set_ce(ip_hdr(skb));
452 } else if (skb->protocol == htons(ETH_P_IPV6)) {
453 IP6_ECN_set_ce(ipv6_hdr(skb));
459 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
462 if (skb->protocol == htons(ETH_P_IP))
463 inner = old_iph->tos;
464 else if (skb->protocol == htons(ETH_P_IPV6))
465 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
466 return INET_ECN_encapsulate(tos, inner);
469 static int ipgre_rcv(struct sk_buff *skb)
477 struct ip_tunnel *tunnel;
482 if (!pskb_may_pull(skb, 16))
489 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
490 /* - Version must be 0.
491 - We do not support routing headers.
493 if (flags&(GRE_VERSION|GRE_ROUTING))
496 if (flags&GRE_CSUM) {
497 switch (skb->ip_summed) {
498 case CHECKSUM_COMPLETE:
499 csum = csum_fold(skb->csum);
505 csum = __skb_checksum_complete(skb);
506 skb->ip_summed = CHECKSUM_COMPLETE;
511 key = *(__be32*)(h + offset);
515 seqno = ntohl(*(__be32*)(h + offset));
520 gre_proto = *(__be16 *)(h + 2);
522 read_lock(&ipgre_lock);
523 if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
524 iph->saddr, iph->daddr, key,
526 struct net_device_stats *stats = &tunnel->dev->stats;
530 skb->protocol = gre_proto;
531 /* WCCP version 1 and 2 protocol decoding.
532 * - Change protocol to IP
533 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
535 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
536 skb->protocol = htons(ETH_P_IP);
537 if ((*(h + offset) & 0xF0) != 0x40)
541 skb->mac_header = skb->network_header;
542 __pskb_pull(skb, offset);
543 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
544 skb->pkt_type = PACKET_HOST;
545 #ifdef CONFIG_NET_IPGRE_BROADCAST
546 if (ipv4_is_multicast(iph->daddr)) {
547 /* Looped back packet, drop it! */
548 if (skb->rtable->fl.iif == 0)
551 skb->pkt_type = PACKET_BROADCAST;
555 if (((flags&GRE_CSUM) && csum) ||
556 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
557 stats->rx_crc_errors++;
561 if (tunnel->parms.i_flags&GRE_SEQ) {
562 if (!(flags&GRE_SEQ) ||
563 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
564 stats->rx_fifo_errors++;
568 tunnel->i_seqno = seqno + 1;
573 /* Warning: All skb pointers will be invalidated! */
574 if (tunnel->dev->type == ARPHRD_ETHER) {
575 if (!pskb_may_pull(skb, ETH_HLEN)) {
576 stats->rx_length_errors++;
582 skb->protocol = eth_type_trans(skb, tunnel->dev);
583 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
587 stats->rx_bytes += len;
588 skb->dev = tunnel->dev;
589 dst_release(skb->dst);
593 skb_reset_network_header(skb);
594 ipgre_ecn_decapsulate(iph, skb);
597 read_unlock(&ipgre_lock);
600 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
603 read_unlock(&ipgre_lock);
/*
 * Transmit handler: wrap the packet in an outer IPv4 + GRE header.
 *
 * Steps visible in this excerpt (several statements, including the error
 * exits and the final send, are missing from the listing):
 *  - tunnel->recursion is incremented and tested to detect re-entry.
 *  - For ARPHRD_ETHER devices IPCB(skb)->flags is cleared.
 *  - The outer header template (tiph) is either the header already at
 *    skb->data (device has header_ops and type ARPHRD_IPGRE) or
 *    tunnel->parms.iph, in which case gre_hlen is tunnel->hlen.
 *  - When the template destination is zero, a destination is derived from
 *    the inner packet: rt_gateway for IPv4, or the low 32 bits of an
 *    IPv4-compatible IPv6 address taken from the neighbour entry or the
 *    IPv6 destination.
 *  - A route is looked up with proto IPPROTO_GRE; failure is counted in
 *    tx_carrier_errors.
 *  - An mtu is derived from the route (or from skb->dst / dev->mtu) and
 *    passed to update_pmtu; an IPv4 packet with DF whose tot_len exceeds
 *    the mtu draws ICMP_FRAG_NEEDED, an oversized IPv6 packet draws
 *    ICMPV6_PKT_TOOBIG.
 *  - While err_count is positive and the last error is more recent than
 *    IPTUNNEL_ERR_TIMEO, dst_link_failure() is signalled.
 *  - Headroom is reallocated if the skb is too short, shared, or a
 *    non-writable clone; old_iph is re-read afterwards.
 *  - The outer header is filled in: protocol GRE, tos through
 *    ipgre_ecn_encapsulate(), addresses from the route, and a ttl that is
 *    the configured value or, when that is zero, the inner ttl or hop
 *    limit, else the route hop-limit metric.
 *  - GRE flags and protocol follow (ETH_P_TEB for Ethernet devices).  For
 *    the optional sequence, key and checksum fields ptr starts at the last
 *    32-bit word of the header (tunnel->hlen - 4).
 *
 * NOTE(review): the IPIP header remark further down looks inherited from
 * another tunnel driver; the header built here carries IPPROTO_GRE.
 */
609 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
611 struct ip_tunnel *tunnel = netdev_priv(dev);
612 struct net_device_stats *stats = &tunnel->dev->stats;
613 struct iphdr *old_iph = ip_hdr(skb);
617 struct rtable *rt; /* Route to the other host */
618 struct net_device *tdev; /* Device to other host */
619 struct iphdr *iph; /* Our new IP header */
620 unsigned int max_headroom; /* The extra header space needed */
625 if (tunnel->recursion++) {
630 if (dev->type == ARPHRD_ETHER)
631 IPCB(skb)->flags = 0;
633 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
635 tiph = (struct iphdr*)skb->data;
637 gre_hlen = tunnel->hlen;
638 tiph = &tunnel->parms.iph;
641 if ((dst = tiph->daddr) == 0) {
644 if (skb->dst == NULL) {
645 stats->tx_fifo_errors++;
649 if (skb->protocol == htons(ETH_P_IP)) {
651 if ((dst = rt->rt_gateway) == 0)
655 else if (skb->protocol == htons(ETH_P_IPV6)) {
656 struct in6_addr *addr6;
658 struct neighbour *neigh = skb->dst->neighbour;
663 addr6 = (struct in6_addr*)&neigh->primary_key;
664 addr_type = ipv6_addr_type(addr6);
666 if (addr_type == IPV6_ADDR_ANY) {
667 addr6 = &ipv6_hdr(skb)->daddr;
668 addr_type = ipv6_addr_type(addr6);
671 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
674 dst = addr6->s6_addr32[3];
683 if (skb->protocol == htons(ETH_P_IP))
689 struct flowi fl = { .oif = tunnel->parms.link,
692 .saddr = tiph->saddr,
693 .tos = RT_TOS(tos) } },
694 .proto = IPPROTO_GRE };
695 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
696 stats->tx_carrier_errors++;
700 tdev = rt->u.dst.dev;
710 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
712 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
715 skb->dst->ops->update_pmtu(skb->dst, mtu);
717 if (skb->protocol == htons(ETH_P_IP)) {
718 df |= (old_iph->frag_off&htons(IP_DF));
720 if ((old_iph->frag_off&htons(IP_DF)) &&
721 mtu < ntohs(old_iph->tot_len)) {
722 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
728 else if (skb->protocol == htons(ETH_P_IPV6)) {
729 struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
731 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
732 if ((tunnel->parms.iph.daddr &&
733 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
734 rt6->rt6i_dst.plen == 128) {
735 rt6->rt6i_flags |= RTF_MODIFIED;
736 skb->dst->metrics[RTAX_MTU-1] = mtu;
740 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
741 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
748 if (tunnel->err_count > 0) {
749 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
752 dst_link_failure(skb);
754 tunnel->err_count = 0;
757 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
759 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
760 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
761 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
770 skb_set_owner_w(new_skb, skb->sk);
773 old_iph = ip_hdr(skb);
776 skb_reset_transport_header(skb);
777 skb_push(skb, gre_hlen);
778 skb_reset_network_header(skb);
779 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
780 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
782 dst_release(skb->dst);
783 skb->dst = &rt->u.dst;
786 * Push down and install the IPIP header.
791 iph->ihl = sizeof(struct iphdr) >> 2;
793 iph->protocol = IPPROTO_GRE;
794 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
795 iph->daddr = rt->rt_dst;
796 iph->saddr = rt->rt_src;
798 if ((iph->ttl = tiph->ttl) == 0) {
799 if (skb->protocol == htons(ETH_P_IP))
800 iph->ttl = old_iph->ttl;
802 else if (skb->protocol == htons(ETH_P_IPV6))
803 iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
806 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
809 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
810 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
811 htons(ETH_P_TEB) : skb->protocol;
813 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
814 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
816 if (tunnel->parms.o_flags&GRE_SEQ) {
818 *ptr = htonl(tunnel->o_seqno);
821 if (tunnel->parms.o_flags&GRE_KEY) {
822 *ptr = tunnel->parms.o_key;
825 if (tunnel->parms.o_flags&GRE_CSUM) {
827 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
838 dst_link_failure(skb);
/*
 * Work out header sizes and a suitable mtu for a tunnel device.
 *
 * The likely underlying device is guessed from a route lookup towards the
 * configured destination or, failing that, from parms.link.  From it the
 * link-layer headroom (hlen) is taken.  addend starts as the IP header
 * plus the 4-byte base GRE header and is adjusted under the GRE_CSUM,
 * GRE_KEY and GRE_SEQ output flags (the adjustments themselves are not
 * visible in this excerpt).  The results are stored in
 * dev->needed_headroom and tunnel->hlen.  Callers in this file assign the
 * return value to dev->mtu or to a local mtu variable.
 */
847 static int ipgre_tunnel_bind_dev(struct net_device *dev)
849 struct net_device *tdev = NULL;
850 struct ip_tunnel *tunnel;
852 int hlen = LL_MAX_HEADER;
853 int mtu = ETH_DATA_LEN;
854 int addend = sizeof(struct iphdr) + 4;
856 tunnel = netdev_priv(dev);
857 iph = &tunnel->parms.iph;
859 /* Guess output device to choose reasonable mtu and needed_headroom */
862 struct flowi fl = { .oif = tunnel->parms.link,
864 { .daddr = iph->daddr,
866 .tos = RT_TOS(iph->tos) } },
867 .proto = IPPROTO_GRE };
869 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
870 tdev = rt->u.dst.dev;
874 if (dev->type != ARPHRD_ETHER)
875 dev->flags |= IFF_POINTOPOINT;
878 if (!tdev && tunnel->parms.link)
879 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
882 hlen = tdev->hard_header_len + tdev->needed_headroom;
885 dev->iflink = tunnel->parms.link;
887 /* Precalculate GRE options length */
888 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
889 if (tunnel->parms.o_flags&GRE_CSUM)
891 if (tunnel->parms.o_flags&GRE_KEY)
893 if (tunnel->parms.o_flags&GRE_SEQ)
896 dev->needed_headroom = addend + hlen;
/*
 * NOTE(review): this subtracts (hard_header_len - addend), which raises
 * mtu by addend instead of lowering it.  The encapsulation overhead would
 * be hard_header_len + addend - confirm whether the sign is intended.
 */
897 mtu -= dev->hard_header_len - addend;
902 tunnel->hlen = addend;
/*
 * Tunnel ioctl handler.  The case labels of the command switch are not
 * visible in this excerpt; the grouping below follows the visible code.
 *
 *  - Query path: when called on the fallback device the parameters are
 *    read from user space and used to locate a tunnel; the parameters of
 *    the selected tunnel are then copied back to user space.
 *  - SIOCADDTUNNEL / SIOCCHGTUNNEL: requires CAP_NET_ADMIN.  The request
 *    is checked for version 4, protocol IPPROTO_GRE, ihl 5, no frag_off
 *    bits other than IP_DF, and no GRE_VERSION or GRE_ROUTING flag.  On a
 *    change of a non-fallback device the tunnel is unlinked, its addresses
 *    and keys are updated, and it is linked again (the hash bucket depends
 *    on the addresses and the input key).  For SIOCCHGTUNNEL ttl, tos and
 *    frag_off are then updated, and a changed link triggers
 *    ipgre_tunnel_bind_dev().  An error of -ENOBUFS (add) or -ENOENT
 *    (otherwise) is set on a path whose condition is not visible here.
 *  - Delete path: requires CAP_NET_ADMIN; on the fallback device the
 *    target is located from user-supplied parameters and compared against
 *    the fallback tunnel itself before unregister_netdevice() is called.
 */
908 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
911 struct ip_tunnel_parm p;
913 struct net *net = dev_net(dev);
914 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
919 if (dev == ign->fb_tunnel_dev) {
920 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
924 t = ipgre_tunnel_locate(net, &p, 0);
927 t = netdev_priv(dev);
928 memcpy(&p, &t->parms, sizeof(p));
929 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
936 if (!capable(CAP_NET_ADMIN))
940 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
944 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
945 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
946 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
949 p.iph.frag_off |= htons(IP_DF);
951 if (!(p.i_flags&GRE_KEY))
953 if (!(p.o_flags&GRE_KEY))
956 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
958 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
967 t = netdev_priv(dev);
969 if (ipv4_is_multicast(p.iph.daddr))
970 nflags = IFF_BROADCAST;
971 else if (p.iph.daddr)
972 nflags = IFF_POINTOPOINT;
974 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
978 ipgre_tunnel_unlink(ign, t);
979 t->parms.iph.saddr = p.iph.saddr;
980 t->parms.iph.daddr = p.iph.daddr;
981 t->parms.i_key = p.i_key;
982 t->parms.o_key = p.o_key;
983 memcpy(dev->dev_addr, &p.iph.saddr, 4);
984 memcpy(dev->broadcast, &p.iph.daddr, 4);
985 ipgre_tunnel_link(ign, t);
986 netdev_state_change(dev);
992 if (cmd == SIOCCHGTUNNEL) {
993 t->parms.iph.ttl = p.iph.ttl;
994 t->parms.iph.tos = p.iph.tos;
995 t->parms.iph.frag_off = p.iph.frag_off;
996 if (t->parms.link != p.link) {
997 t->parms.link = p.link;
998 dev->mtu = ipgre_tunnel_bind_dev(dev);
999 netdev_state_change(dev);
1002 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1005 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1010 if (!capable(CAP_NET_ADMIN))
1013 if (dev == ign->fb_tunnel_dev) {
1015 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1018 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1021 if (t == netdev_priv(ign->fb_tunnel_dev))
1025 unregister_netdevice(dev);
/*
 * MTU change hook.  The visible part of the range check bounds new_mtu
 * from above by 0xFFF8 minus the device hard_header_len and the tunnel
 * header length.
 * NOTE(review): the first half of the condition and the statements that
 * follow it are not visible in this excerpt.
 */
1037 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1039 struct ip_tunnel *tunnel = netdev_priv(dev);
1041 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1047 /* Nice toy. Unfortunately, useless in real life :-)
1048 It allows to construct virtual multiprotocol broadcast "LAN"
1049 over the Internet, provided multicast routing is tuned.
1052 I have no idea whether this bicycle was invented before me,
1053 so that I had to set ARPHRD_IPGRE to a random value.
1054 I have an impression, that Cisco could make something similar,
1055 but this feature is apparently missing in IOS<=11.2(8).
1057 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1058 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1060 ping -t 255 224.66.66.66
1062 If nobody answers, mbone does not work.
1064 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1065 ip addr add 10.66.66.<somewhat>/24 dev Universe
1066 ifconfig Universe up
1067 ifconfig Universe add fe80::<Your_real_addr>/10
1068 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1071 ftp fec0:6666:6666::193.233.7.65
1076 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1077 unsigned short type,
1078 const void *daddr, const void *saddr, unsigned len)
1080 struct ip_tunnel *t = netdev_priv(dev);
1081 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1082 __be16 *p = (__be16*)(iph+1);
1084 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1085 p[0] = t->parms.o_flags;
1089 * Set the source hardware address.
1093 memcpy(&iph->saddr, saddr, 4);
1096 memcpy(&iph->daddr, daddr, 4);
1099 if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1105 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1107 struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
1108 memcpy(haddr, &iph->saddr, 4);
1112 static const struct header_ops ipgre_header_ops = {
1113 .create = ipgre_header,
1114 .parse = ipgre_header_parse,
1117 #ifdef CONFIG_NET_IPGRE_BROADCAST
1118 static int ipgre_open(struct net_device *dev)
1120 struct ip_tunnel *t = netdev_priv(dev);
1122 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1123 struct flowi fl = { .oif = t->parms.link,
1125 { .daddr = t->parms.iph.daddr,
1126 .saddr = t->parms.iph.saddr,
1127 .tos = RT_TOS(t->parms.iph.tos) } },
1128 .proto = IPPROTO_GRE };
1130 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1131 return -EADDRNOTAVAIL;
1132 dev = rt->u.dst.dev;
1134 if (__in_dev_get_rtnl(dev) == NULL)
1135 return -EADDRNOTAVAIL;
1136 t->mlink = dev->ifindex;
1137 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1142 static int ipgre_close(struct net_device *dev)
1144 struct ip_tunnel *t = netdev_priv(dev);
1145 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1146 struct in_device *in_dev;
1147 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1149 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1158 static void ipgre_tunnel_setup(struct net_device *dev)
1160 dev->init = ipgre_tunnel_init;
1161 dev->uninit = ipgre_tunnel_uninit;
1162 dev->destructor = free_netdev;
1163 dev->hard_start_xmit = ipgre_tunnel_xmit;
1164 dev->do_ioctl = ipgre_tunnel_ioctl;
1165 dev->change_mtu = ipgre_tunnel_change_mtu;
1167 dev->type = ARPHRD_IPGRE;
1168 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1169 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1170 dev->flags = IFF_NOARP;
1173 dev->features |= NETIF_F_NETNS_LOCAL;
1176 static int ipgre_tunnel_init(struct net_device *dev)
1178 struct ip_tunnel *tunnel;
1181 tunnel = netdev_priv(dev);
1182 iph = &tunnel->parms.iph;
1185 strcpy(tunnel->parms.name, dev->name);
1187 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1188 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1191 #ifdef CONFIG_NET_IPGRE_BROADCAST
1192 if (ipv4_is_multicast(iph->daddr)) {
1195 dev->flags = IFF_BROADCAST;
1196 dev->header_ops = &ipgre_header_ops;
1197 dev->open = ipgre_open;
1198 dev->stop = ipgre_close;
1202 dev->header_ops = &ipgre_header_ops;
1207 static int ipgre_fb_tunnel_init(struct net_device *dev)
1209 struct ip_tunnel *tunnel = netdev_priv(dev);
1210 struct iphdr *iph = &tunnel->parms.iph;
1211 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1214 strcpy(tunnel->parms.name, dev->name);
1217 iph->protocol = IPPROTO_GRE;
1219 tunnel->hlen = sizeof(struct iphdr) + 4;
1222 ign->tunnels_wc[0] = tunnel;
1227 static struct net_protocol ipgre_protocol = {
1228 .handler = ipgre_rcv,
1229 .err_handler = ipgre_err,
1233 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1237 for (prio = 0; prio < 4; prio++) {
1239 for (h = 0; h < HASH_SIZE; h++) {
1240 struct ip_tunnel *t;
1241 while ((t = ign->tunnels[prio][h]) != NULL)
1242 unregister_netdevice(t->dev);
1247 static int ipgre_init_net(struct net *net)
1250 struct ipgre_net *ign;
1253 ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1257 err = net_assign_generic(net, ipgre_net_id, ign);
1261 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1262 ipgre_tunnel_setup);
1263 if (!ign->fb_tunnel_dev) {
1268 ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1269 dev_net_set(ign->fb_tunnel_dev, net);
1270 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1272 if ((err = register_netdev(ign->fb_tunnel_dev)))
1278 free_netdev(ign->fb_tunnel_dev);
1287 static void ipgre_exit_net(struct net *net)
1289 struct ipgre_net *ign;
1291 ign = net_generic(net, ipgre_net_id);
1293 ipgre_destroy_tunnels(ign);
1298 static struct pernet_operations ipgre_net_ops = {
1299 .init = ipgre_init_net,
1300 .exit = ipgre_exit_net,
1303 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1311 if (data[IFLA_GRE_IFLAGS])
1312 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1313 if (data[IFLA_GRE_OFLAGS])
1314 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1315 if (flags & (GRE_VERSION|GRE_ROUTING))
1321 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1325 if (tb[IFLA_ADDRESS]) {
1326 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1328 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1329 return -EADDRNOTAVAIL;
1335 if (data[IFLA_GRE_REMOTE]) {
1336 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1342 return ipgre_tunnel_validate(tb, data);
1345 static void ipgre_netlink_parms(struct nlattr *data[],
1346 struct ip_tunnel_parm *parms)
1348 memset(parms, 0, sizeof(*parms));
1350 parms->iph.protocol = IPPROTO_GRE;
1355 if (data[IFLA_GRE_LINK])
1356 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1358 if (data[IFLA_GRE_IFLAGS])
1359 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1361 if (data[IFLA_GRE_OFLAGS])
1362 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1364 if (data[IFLA_GRE_IKEY])
1365 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1367 if (data[IFLA_GRE_OKEY])
1368 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1370 if (data[IFLA_GRE_LOCAL])
1371 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1373 if (data[IFLA_GRE_REMOTE])
1374 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1376 if (data[IFLA_GRE_TTL])
1377 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1379 if (data[IFLA_GRE_TOS])
1380 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1382 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1383 parms->iph.frag_off = htons(IP_DF);
1386 static int ipgre_tap_init(struct net_device *dev)
1388 struct ip_tunnel *tunnel;
1390 tunnel = netdev_priv(dev);
1393 strcpy(tunnel->parms.name, dev->name);
1395 ipgre_tunnel_bind_dev(dev);
1400 static void ipgre_tap_setup(struct net_device *dev)
1405 dev->init = ipgre_tap_init;
1406 dev->uninit = ipgre_tunnel_uninit;
1407 dev->destructor = free_netdev;
1408 dev->hard_start_xmit = ipgre_tunnel_xmit;
1409 dev->change_mtu = ipgre_tunnel_change_mtu;
1412 dev->features |= NETIF_F_NETNS_LOCAL;
1415 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1416 struct nlattr *data[])
1418 struct ip_tunnel *nt;
1419 struct net *net = dev_net(dev);
1420 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1424 nt = netdev_priv(dev);
1425 ipgre_netlink_parms(data, &nt->parms);
1427 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1430 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1431 random_ether_addr(dev->dev_addr);
1433 mtu = ipgre_tunnel_bind_dev(dev);
1437 err = register_netdevice(dev);
1442 ipgre_tunnel_link(ign, nt);
1448 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1449 struct nlattr *data[])
1451 struct ip_tunnel *t, *nt;
1452 struct net *net = dev_net(dev);
1453 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1454 struct ip_tunnel_parm p;
1457 if (dev == ign->fb_tunnel_dev)
1460 nt = netdev_priv(dev);
1461 ipgre_netlink_parms(data, &p);
1463 t = ipgre_tunnel_locate(net, &p, 0);
1469 unsigned nflags = 0;
1473 if (ipv4_is_multicast(p.iph.daddr))
1474 nflags = IFF_BROADCAST;
1475 else if (p.iph.daddr)
1476 nflags = IFF_POINTOPOINT;
1478 if ((dev->flags ^ nflags) &
1479 (IFF_POINTOPOINT | IFF_BROADCAST))
1482 ipgre_tunnel_unlink(ign, t);
1483 t->parms.iph.saddr = p.iph.saddr;
1484 t->parms.iph.daddr = p.iph.daddr;
1485 t->parms.i_key = p.i_key;
1486 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1487 memcpy(dev->broadcast, &p.iph.daddr, 4);
1488 ipgre_tunnel_link(ign, t);
1489 netdev_state_change(dev);
1492 t->parms.o_key = p.o_key;
1493 t->parms.iph.ttl = p.iph.ttl;
1494 t->parms.iph.tos = p.iph.tos;
1495 t->parms.iph.frag_off = p.iph.frag_off;
1497 if (t->parms.link != p.link) {
1498 t->parms.link = p.link;
1499 mtu = ipgre_tunnel_bind_dev(dev);
1502 netdev_state_change(dev);
1508 static size_t ipgre_get_size(const struct net_device *dev)
1513 /* IFLA_GRE_IFLAGS */
1515 /* IFLA_GRE_OFLAGS */
1521 /* IFLA_GRE_LOCAL */
1523 /* IFLA_GRE_REMOTE */
1529 /* IFLA_GRE_PMTUDISC */
/*
 * Dump the tunnel parameters of dev into skb as netlink attributes: link,
 * input and output flags, input and output keys, local and remote
 * address, ttl, tos, and a PMTUDISC flag that is 1 when IP_DF is set in
 * the configured frag_off and 0 otherwise.
 * NOTE(review): the return statements and the failure label used by the
 * NLA_PUT macros are not visible in this excerpt.
 */
1534 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1536 struct ip_tunnel *t = netdev_priv(dev);
1537 struct ip_tunnel_parm *p = &t->parms;
1539 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1540 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1541 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1542 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1543 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1544 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1545 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1546 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1547 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1548 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
/*
 * Netlink attribute policy shared by both link types below.  Link and
 * keys are declared as 32-bit values, the flag words as 16-bit values,
 * ttl, tos and pmtudisc as 8-bit values; the local and remote entries
 * carry a length equal to the size of the iphdr address fields.
 */
1556 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1557 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1558 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1559 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1560 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1561 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
1562 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1563 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1564 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1565 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1566 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
/*
 * rtnetlink operations for the tunnel device set up by
 * ipgre_tunnel_setup().  It shares policy, newlink, changelink, get_size
 * and fill_info with ipgre_tap_ops and differs in its setup and validate
 * callbacks.
 * NOTE(review): the kind string is not visible in this excerpt.
 */
1569 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1571 .maxtype = IFLA_GRE_MAX,
1572 .policy = ipgre_policy,
1573 .priv_size = sizeof(struct ip_tunnel),
1574 .setup = ipgre_tunnel_setup,
1575 .validate = ipgre_tunnel_validate,
1576 .newlink = ipgre_newlink,
1577 .changelink = ipgre_changelink,
1578 .get_size = ipgre_get_size,
1579 .fill_info = ipgre_fill_info,
/*
 * rtnetlink operations for the tap variant: same callbacks as
 * ipgre_link_ops except that devices are prepared by ipgre_tap_setup()
 * and requests are checked by ipgre_tap_validate().
 * NOTE(review): the kind string is not visible in this excerpt.
 */
1582 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1584 .maxtype = IFLA_GRE_MAX,
1585 .policy = ipgre_policy,
1586 .priv_size = sizeof(struct ip_tunnel),
1587 .setup = ipgre_tap_setup,
1588 .validate = ipgre_tap_validate,
1589 .newlink = ipgre_newlink,
1590 .changelink = ipgre_changelink,
1591 .get_size = ipgre_get_size,
1592 .fill_info = ipgre_fill_info,
1596 * And now the modules code and kernel interface.
/*
 * Module initialisation.  Registration order: the IPPROTO_GRE protocol
 * handler, the per-namespace device operations, then the two rtnetlink
 * link types (ipgre_link_ops, ipgre_tap_ops).  The failure labels named
 * in the goto statements lead to the cleanup calls at the end, which
 * release the earlier registrations in reverse order.
 * NOTE(review): the error tests, labels and return statements are not
 * visible in this excerpt.
 */
1599 static int __init ipgre_init(void)
1603 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1605 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1606 printk(KERN_INFO "ipgre init: can't add protocol\n");
1610 err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1612 goto gen_device_failed;
1614 err = rtnl_link_register(&ipgre_link_ops);
1616 goto rtnl_link_failed;
1618 err = rtnl_link_register(&ipgre_tap_ops);
1620 goto tap_ops_failed;
1626 rtnl_link_unregister(&ipgre_link_ops);
1628 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1630 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
/*
 * Module exit: undo ipgre_init() in reverse order - both rtnetlink link
 * types, the per-namespace device operations, then the protocol handler.
 * A failure to remove the protocol handler is only logged.
 */
1634 static void __exit ipgre_fini(void)
1636 rtnl_link_unregister(&ipgre_tap_ops);
1637 rtnl_link_unregister(&ipgre_link_ops);
1638 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1639 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1640 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1643 module_init(ipgre_init);
1644 module_exit(ipgre_fini);
1645 MODULE_LICENSE("GPL");
1646 MODULE_ALIAS_RTNL_LINK("gre");
1647 MODULE_ALIAS_RTNL_LINK("gretap");