cipso: Fix documentation comment
[linux-2.6] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 /*
54    Problems & solutions
55    --------------------
56
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
66    solution, but it supposes maintaining new variable in ALL
67    skb, even if no tunneling is used.
68
69    Current solution: t->recursion lock breaks dead loops. It looks
70    like dev->tbusy flag, but I preferred new variable, because
71    the semantics is different. One day, when hard_start_xmit
72    will be multithreaded we will have to use skb->encapsulation.
73
74
75
76    2. Networking dead loops would not kill routers, but would really
77    kill network. IP hop limit plays role of "t->recursion" in this case,
78    if we copy it from packet being encapsulated to upper header.
79    It is very good solution, but it introduces two problems:
80
81    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82      do not work over tunnels.
83    - traceroute does not work. I planned to relay ICMP from tunnel,
84      so that this problem would be solved and traceroute output
85      would be even more informative. This idea appeared to be wrong:
86      only Linux complies to rfc1812 now (yes, guys, Linux is the only
87      true router now :-)), all routers (at least, in neighbourhood of mine)
88      return only 8 bytes of payload. It is the end.
89
90    Hence, if we want that OSPF worked or traceroute said something reasonable,
91    we should search for another solution.
92
93    One of them is to parse packet trying to detect inner encapsulation
94    made by our node. It is difficult or even impossible, especially,
95    taking into account fragmentation. To be short, it is not a solution at all.
96
97    Current solution: The solution was UNEXPECTEDLY SIMPLE.
98    We force DF flag on tunnels with preconfigured hop limit,
99    that is ALL. :-) Well, it does not remove the problem completely,
100    but exponential growth of network traffic is changed to linear
101    (branches, that exceed pmtu are pruned) and tunnel mtu
102    fastly degrades to value <68, where looping stops.
103    Yes, it is not good if there exists a router in the loop,
104    which does not force DF, even when encapsulating packets have DF set.
105    But it is not our problem! Nobody could accuse us, we made
106    all that we could make. Even if it is your gated who injected
107    fatal route to network, even if it were you who configured
108    fatal static route: you are innocent. :-)
109
110
111
112    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113    practically identical code. It would be good to glue them
114    together, but it is not very evident, how to make them modular.
115    sit is integral part of IPv6, ipip and gre are naturally modular.
116    We could extract common parts (hash table, ioctl etc)
117    to a separate module (ip_tunnel.c).
118
119    Alexey Kuznetsov.
120  */
121
122 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
123 static int ipgre_tunnel_init(struct net_device *dev);
124 static void ipgre_tunnel_setup(struct net_device *dev);
125 static int ipgre_tunnel_bind_dev(struct net_device *dev);
126
127 /* Fallback tunnel: no source, no destination, no key, no options */
128
/* Number of buckets in each of the tunnel hash tables. */
#define HASH_SIZE  16

/* Id passed to net_generic() to fetch the struct ipgre_net of a namespace. */
static int ipgre_net_id;
/* Per-network-namespace GRE state. */
struct ipgre_net {
        /*
         * Tunnel hash tables.  The first index is the "prio" computed in
         * __ipgre_bucket() (bit 0: local address set, bit 1: unicast remote
         * address set); the second index is the bucket number.
         */
        struct ip_tunnel *tunnels[4][HASH_SIZE];

        /*
         * Fallback device: ipgre_tunnel_lookup() returns its private data
         * when no configured tunnel matches and this device is up.
         */
        struct net_device *fb_tunnel_dev;
};
137
138 /* Tunnel hash table */
139
140 /*
141    4 hash tables:
142
143    3: (remote,local)
144    2: (remote,*)
145    1: (*,local)
146    0: (*,*)
147
148    We require exact key match i.e. if a key is present in packet
149    it will match only tunnel with the same key; if it is not present,
150    it will match only keyless tunnel.
151
152    All keyless packets, if not matched configured keyless tunnels
153    will match fallback tunnel.
154  */
155
/*
 * Fold a 32-bit address or key into a bucket index in the range 0..15.
 *
 * Each use of the argument is parenthesized so that passing an expression,
 * e.g. HASH(a ^ b), hashes the value of the whole expression rather than
 * letting the cast and the shift bind to only part of it.  Note that the
 * argument is evaluated twice, so it must not have side effects.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)

/* Names for the four tables in struct ipgre_net, by what each one matches. */
#define tunnels_r_l     tunnels[3]      /* (remote, local) */
#define tunnels_r       tunnels[2]      /* (remote, *)     */
#define tunnels_l       tunnels[1]      /* (*, local)      */
#define tunnels_wc      tunnels[0]      /* (*, *)          */
162
/*
 * Guards the tunnel hash chains: taken for writing (with BHs disabled) in
 * ipgre_tunnel_link()/ipgre_tunnel_unlink() and for reading in
 * ipgre_rcv()/ipgre_err() around ipgre_tunnel_lookup().
 */
static DEFINE_RWLOCK(ipgre_lock);
164
165 /* Given src, dst and key, find appropriate for input tunnel. */
166
167 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
168                                               __be32 remote, __be32 local,
169                                               __be32 key, __be16 gre_proto)
170 {
171         unsigned h0 = HASH(remote);
172         unsigned h1 = HASH(key);
173         struct ip_tunnel *t;
174         struct ip_tunnel *t2 = NULL;
175         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
176         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
177                        ARPHRD_ETHER : ARPHRD_IPGRE;
178
179         for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
180                 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
181                         if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
182                                 if (t->dev->type == dev_type)
183                                         return t;
184                                 if (t->dev->type == ARPHRD_IPGRE && !t2)
185                                         t2 = t;
186                         }
187                 }
188         }
189
190         for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
191                 if (remote == t->parms.iph.daddr) {
192                         if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
193                                 if (t->dev->type == dev_type)
194                                         return t;
195                                 if (t->dev->type == ARPHRD_IPGRE && !t2)
196                                         t2 = t;
197                         }
198                 }
199         }
200
201         for (t = ign->tunnels_l[h1]; t; t = t->next) {
202                 if (local == t->parms.iph.saddr ||
203                      (local == t->parms.iph.daddr &&
204                       ipv4_is_multicast(local))) {
205                         if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
206                                 if (t->dev->type == dev_type)
207                                         return t;
208                                 if (t->dev->type == ARPHRD_IPGRE && !t2)
209                                         t2 = t;
210                         }
211                 }
212         }
213
214         for (t = ign->tunnels_wc[h1]; t; t = t->next) {
215                 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
216                         if (t->dev->type == dev_type)
217                                 return t;
218                         if (t->dev->type == ARPHRD_IPGRE && !t2)
219                                 t2 = t;
220                 }
221         }
222
223         if (t2)
224                 return t2;
225
226         if (ign->fb_tunnel_dev->flags&IFF_UP)
227                 return netdev_priv(ign->fb_tunnel_dev);
228         return NULL;
229 }
230
231 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
232                 struct ip_tunnel_parm *parms)
233 {
234         __be32 remote = parms->iph.daddr;
235         __be32 local = parms->iph.saddr;
236         __be32 key = parms->i_key;
237         unsigned h = HASH(key);
238         int prio = 0;
239
240         if (local)
241                 prio |= 1;
242         if (remote && !ipv4_is_multicast(remote)) {
243                 prio |= 2;
244                 h ^= HASH(remote);
245         }
246
247         return &ign->tunnels[prio][h];
248 }
249
250 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
251                 struct ip_tunnel *t)
252 {
253         return __ipgre_bucket(ign, &t->parms);
254 }
255
256 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
257 {
258         struct ip_tunnel **tp = ipgre_bucket(ign, t);
259
260         t->next = *tp;
261         write_lock_bh(&ipgre_lock);
262         *tp = t;
263         write_unlock_bh(&ipgre_lock);
264 }
265
266 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
267 {
268         struct ip_tunnel **tp;
269
270         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
271                 if (t == *tp) {
272                         write_lock_bh(&ipgre_lock);
273                         *tp = t->next;
274                         write_unlock_bh(&ipgre_lock);
275                         break;
276                 }
277         }
278 }
279
280 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
281                                            struct ip_tunnel_parm *parms,
282                                            int type)
283 {
284         __be32 remote = parms->iph.daddr;
285         __be32 local = parms->iph.saddr;
286         __be32 key = parms->i_key;
287         struct ip_tunnel *t, **tp;
288         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
289
290         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
291                 if (local == t->parms.iph.saddr &&
292                     remote == t->parms.iph.daddr &&
293                     key == t->parms.i_key &&
294                     type == t->dev->type)
295                         break;
296
297         return t;
298 }
299
300 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
301                 struct ip_tunnel_parm *parms, int create)
302 {
303         struct ip_tunnel *t, *nt;
304         struct net_device *dev;
305         char name[IFNAMSIZ];
306         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
307
308         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
309         if (t || !create)
310                 return t;
311
312         if (parms->name[0])
313                 strlcpy(name, parms->name, IFNAMSIZ);
314         else
315                 sprintf(name, "gre%%d");
316
317         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
318         if (!dev)
319           return NULL;
320
321         dev_net_set(dev, net);
322
323         if (strchr(name, '%')) {
324                 if (dev_alloc_name(dev, name) < 0)
325                         goto failed_free;
326         }
327
328         nt = netdev_priv(dev);
329         nt->parms = *parms;
330         dev->rtnl_link_ops = &ipgre_link_ops;
331
332         dev->mtu = ipgre_tunnel_bind_dev(dev);
333
334         if (register_netdevice(dev) < 0)
335                 goto failed_free;
336
337         dev_hold(dev);
338         ipgre_tunnel_link(ign, nt);
339         return nt;
340
341 failed_free:
342         free_netdev(dev);
343         return NULL;
344 }
345
346 static void ipgre_tunnel_uninit(struct net_device *dev)
347 {
348         struct net *net = dev_net(dev);
349         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
350
351         ipgre_tunnel_unlink(ign, netdev_priv(dev));
352         dev_put(dev);
353 }
354
355
356 static void ipgre_err(struct sk_buff *skb, u32 info)
357 {
358
359 /* All the routers (except for Linux) return only
360    8 bytes of packet payload. It means, that precise relaying of
361    ICMP in the real Internet is absolutely infeasible.
362
363    Moreover, Cisco "wise men" put GRE key to the third word
364    in GRE header. It makes impossible maintaining even soft state for keyed
365    GRE tunnels with enabled checksum. Tell them "thank you".
366
367    Well, I wonder, rfc1812 was written by Cisco employee,
368    what the hell these idiots break standrads established
369    by themself???
370  */
371
372         struct iphdr *iph = (struct iphdr *)skb->data;
373         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
374         int grehlen = (iph->ihl<<2) + 4;
375         const int type = icmp_hdr(skb)->type;
376         const int code = icmp_hdr(skb)->code;
377         struct ip_tunnel *t;
378         __be16 flags;
379
380         flags = p[0];
381         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
382                 if (flags&(GRE_VERSION|GRE_ROUTING))
383                         return;
384                 if (flags&GRE_KEY) {
385                         grehlen += 4;
386                         if (flags&GRE_CSUM)
387                                 grehlen += 4;
388                 }
389         }
390
391         /* If only 8 bytes returned, keyed message will be dropped here */
392         if (skb_headlen(skb) < grehlen)
393                 return;
394
395         switch (type) {
396         default:
397         case ICMP_PARAMETERPROB:
398                 return;
399
400         case ICMP_DEST_UNREACH:
401                 switch (code) {
402                 case ICMP_SR_FAILED:
403                 case ICMP_PORT_UNREACH:
404                         /* Impossible event. */
405                         return;
406                 case ICMP_FRAG_NEEDED:
407                         /* Soft state for pmtu is maintained by IP core. */
408                         return;
409                 default:
410                         /* All others are translated to HOST_UNREACH.
411                            rfc2003 contains "deep thoughts" about NET_UNREACH,
412                            I believe they are just ether pollution. --ANK
413                          */
414                         break;
415                 }
416                 break;
417         case ICMP_TIME_EXCEEDED:
418                 if (code != ICMP_EXC_TTL)
419                         return;
420                 break;
421         }
422
423         read_lock(&ipgre_lock);
424         t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
425                                 flags & GRE_KEY ?
426                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
427                                 p[1]);
428         if (t == NULL || t->parms.iph.daddr == 0 ||
429             ipv4_is_multicast(t->parms.iph.daddr))
430                 goto out;
431
432         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
433                 goto out;
434
435         if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
436                 t->err_count++;
437         else
438                 t->err_count = 1;
439         t->err_time = jiffies;
440 out:
441         read_unlock(&ipgre_lock);
442         return;
443 }
444
445 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
446 {
447         if (INET_ECN_is_ce(iph->tos)) {
448                 if (skb->protocol == htons(ETH_P_IP)) {
449                         IP_ECN_set_ce(ip_hdr(skb));
450                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
451                         IP6_ECN_set_ce(ipv6_hdr(skb));
452                 }
453         }
454 }
455
456 static inline u8
457 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
458 {
459         u8 inner = 0;
460         if (skb->protocol == htons(ETH_P_IP))
461                 inner = old_iph->tos;
462         else if (skb->protocol == htons(ETH_P_IPV6))
463                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
464         return INET_ECN_encapsulate(tos, inner);
465 }
466
467 static int ipgre_rcv(struct sk_buff *skb)
468 {
469         struct iphdr *iph;
470         u8     *h;
471         __be16    flags;
472         __sum16   csum = 0;
473         __be32 key = 0;
474         u32    seqno = 0;
475         struct ip_tunnel *tunnel;
476         int    offset = 4;
477         __be16 gre_proto;
478         unsigned int len;
479
480         if (!pskb_may_pull(skb, 16))
481                 goto drop_nolock;
482
483         iph = ip_hdr(skb);
484         h = skb->data;
485         flags = *(__be16*)h;
486
487         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
488                 /* - Version must be 0.
489                    - We do not support routing headers.
490                  */
491                 if (flags&(GRE_VERSION|GRE_ROUTING))
492                         goto drop_nolock;
493
494                 if (flags&GRE_CSUM) {
495                         switch (skb->ip_summed) {
496                         case CHECKSUM_COMPLETE:
497                                 csum = csum_fold(skb->csum);
498                                 if (!csum)
499                                         break;
500                                 /* fall through */
501                         case CHECKSUM_NONE:
502                                 skb->csum = 0;
503                                 csum = __skb_checksum_complete(skb);
504                                 skb->ip_summed = CHECKSUM_COMPLETE;
505                         }
506                         offset += 4;
507                 }
508                 if (flags&GRE_KEY) {
509                         key = *(__be32*)(h + offset);
510                         offset += 4;
511                 }
512                 if (flags&GRE_SEQ) {
513                         seqno = ntohl(*(__be32*)(h + offset));
514                         offset += 4;
515                 }
516         }
517
518         gre_proto = *(__be16 *)(h + 2);
519
520         read_lock(&ipgre_lock);
521         if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
522                                           iph->saddr, iph->daddr, key,
523                                           gre_proto))) {
524                 struct net_device_stats *stats = &tunnel->dev->stats;
525
526                 secpath_reset(skb);
527
528                 skb->protocol = gre_proto;
529                 /* WCCP version 1 and 2 protocol decoding.
530                  * - Change protocol to IP
531                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
532                  */
533                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
534                         skb->protocol = htons(ETH_P_IP);
535                         if ((*(h + offset) & 0xF0) != 0x40)
536                                 offset += 4;
537                 }
538
539                 skb->mac_header = skb->network_header;
540                 __pskb_pull(skb, offset);
541                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
542                 skb->pkt_type = PACKET_HOST;
543 #ifdef CONFIG_NET_IPGRE_BROADCAST
544                 if (ipv4_is_multicast(iph->daddr)) {
545                         /* Looped back packet, drop it! */
546                         if (skb->rtable->fl.iif == 0)
547                                 goto drop;
548                         stats->multicast++;
549                         skb->pkt_type = PACKET_BROADCAST;
550                 }
551 #endif
552
553                 if (((flags&GRE_CSUM) && csum) ||
554                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
555                         stats->rx_crc_errors++;
556                         stats->rx_errors++;
557                         goto drop;
558                 }
559                 if (tunnel->parms.i_flags&GRE_SEQ) {
560                         if (!(flags&GRE_SEQ) ||
561                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
562                                 stats->rx_fifo_errors++;
563                                 stats->rx_errors++;
564                                 goto drop;
565                         }
566                         tunnel->i_seqno = seqno + 1;
567                 }
568
569                 len = skb->len;
570
571                 /* Warning: All skb pointers will be invalidated! */
572                 if (tunnel->dev->type == ARPHRD_ETHER) {
573                         if (!pskb_may_pull(skb, ETH_HLEN)) {
574                                 stats->rx_length_errors++;
575                                 stats->rx_errors++;
576                                 goto drop;
577                         }
578
579                         iph = ip_hdr(skb);
580                         skb->protocol = eth_type_trans(skb, tunnel->dev);
581                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
582                 }
583
584                 stats->rx_packets++;
585                 stats->rx_bytes += len;
586                 skb->dev = tunnel->dev;
587                 dst_release(skb->dst);
588                 skb->dst = NULL;
589                 nf_reset(skb);
590
591                 skb_reset_network_header(skb);
592                 ipgre_ecn_decapsulate(iph, skb);
593
594                 netif_rx(skb);
595                 read_unlock(&ipgre_lock);
596                 return(0);
597         }
598         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
599
600 drop:
601         read_unlock(&ipgre_lock);
602 drop_nolock:
603         kfree_skb(skb);
604         return(0);
605 }
606
607 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
608 {
609         struct ip_tunnel *tunnel = netdev_priv(dev);
610         struct net_device_stats *stats = &tunnel->dev->stats;
611         struct iphdr  *old_iph = ip_hdr(skb);
612         struct iphdr  *tiph;
613         u8     tos;
614         __be16 df;
615         struct rtable *rt;                      /* Route to the other host */
616         struct net_device *tdev;                        /* Device to other host */
617         struct iphdr  *iph;                     /* Our new IP header */
618         unsigned int max_headroom;              /* The extra header space needed */
619         int    gre_hlen;
620         __be32 dst;
621         int    mtu;
622
623         if (tunnel->recursion++) {
624                 stats->collisions++;
625                 goto tx_error;
626         }
627
628         if (dev->type == ARPHRD_ETHER)
629                 IPCB(skb)->flags = 0;
630
631         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
632                 gre_hlen = 0;
633                 tiph = (struct iphdr *)skb->data;
634         } else {
635                 gre_hlen = tunnel->hlen;
636                 tiph = &tunnel->parms.iph;
637         }
638
639         if ((dst = tiph->daddr) == 0) {
640                 /* NBMA tunnel */
641
642                 if (skb->dst == NULL) {
643                         stats->tx_fifo_errors++;
644                         goto tx_error;
645                 }
646
647                 if (skb->protocol == htons(ETH_P_IP)) {
648                         rt = skb->rtable;
649                         if ((dst = rt->rt_gateway) == 0)
650                                 goto tx_error_icmp;
651                 }
652 #ifdef CONFIG_IPV6
653                 else if (skb->protocol == htons(ETH_P_IPV6)) {
654                         struct in6_addr *addr6;
655                         int addr_type;
656                         struct neighbour *neigh = skb->dst->neighbour;
657
658                         if (neigh == NULL)
659                                 goto tx_error;
660
661                         addr6 = (struct in6_addr *)&neigh->primary_key;
662                         addr_type = ipv6_addr_type(addr6);
663
664                         if (addr_type == IPV6_ADDR_ANY) {
665                                 addr6 = &ipv6_hdr(skb)->daddr;
666                                 addr_type = ipv6_addr_type(addr6);
667                         }
668
669                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
670                                 goto tx_error_icmp;
671
672                         dst = addr6->s6_addr32[3];
673                 }
674 #endif
675                 else
676                         goto tx_error;
677         }
678
679         tos = tiph->tos;
680         if (tos&1) {
681                 if (skb->protocol == htons(ETH_P_IP))
682                         tos = old_iph->tos;
683                 tos &= ~1;
684         }
685
686         {
687                 struct flowi fl = { .oif = tunnel->parms.link,
688                                     .nl_u = { .ip4_u =
689                                               { .daddr = dst,
690                                                 .saddr = tiph->saddr,
691                                                 .tos = RT_TOS(tos) } },
692                                     .proto = IPPROTO_GRE };
693                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
694                         stats->tx_carrier_errors++;
695                         goto tx_error;
696                 }
697         }
698         tdev = rt->u.dst.dev;
699
700         if (tdev == dev) {
701                 ip_rt_put(rt);
702                 stats->collisions++;
703                 goto tx_error;
704         }
705
706         df = tiph->frag_off;
707         if (df)
708                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
709         else
710                 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
711
712         if (skb->dst)
713                 skb->dst->ops->update_pmtu(skb->dst, mtu);
714
715         if (skb->protocol == htons(ETH_P_IP)) {
716                 df |= (old_iph->frag_off&htons(IP_DF));
717
718                 if ((old_iph->frag_off&htons(IP_DF)) &&
719                     mtu < ntohs(old_iph->tot_len)) {
720                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
721                         ip_rt_put(rt);
722                         goto tx_error;
723                 }
724         }
725 #ifdef CONFIG_IPV6
726         else if (skb->protocol == htons(ETH_P_IPV6)) {
727                 struct rt6_info *rt6 = (struct rt6_info *)skb->dst;
728
729                 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
730                         if ((tunnel->parms.iph.daddr &&
731                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
732                             rt6->rt6i_dst.plen == 128) {
733                                 rt6->rt6i_flags |= RTF_MODIFIED;
734                                 skb->dst->metrics[RTAX_MTU-1] = mtu;
735                         }
736                 }
737
738                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
739                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
740                         ip_rt_put(rt);
741                         goto tx_error;
742                 }
743         }
744 #endif
745
746         if (tunnel->err_count > 0) {
747                 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
748                         tunnel->err_count--;
749
750                         dst_link_failure(skb);
751                 } else
752                         tunnel->err_count = 0;
753         }
754
755         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
756
757         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
758             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
759                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
760                 if (!new_skb) {
761                         ip_rt_put(rt);
762                         stats->tx_dropped++;
763                         dev_kfree_skb(skb);
764                         tunnel->recursion--;
765                         return 0;
766                 }
767                 if (skb->sk)
768                         skb_set_owner_w(new_skb, skb->sk);
769                 dev_kfree_skb(skb);
770                 skb = new_skb;
771                 old_iph = ip_hdr(skb);
772         }
773
774         skb_reset_transport_header(skb);
775         skb_push(skb, gre_hlen);
776         skb_reset_network_header(skb);
777         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
778         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
779                               IPSKB_REROUTED);
780         dst_release(skb->dst);
781         skb->dst = &rt->u.dst;
782
783         /*
784          *      Push down and install the IPIP header.
785          */
786
787         iph                     =       ip_hdr(skb);
788         iph->version            =       4;
789         iph->ihl                =       sizeof(struct iphdr) >> 2;
790         iph->frag_off           =       df;
791         iph->protocol           =       IPPROTO_GRE;
792         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
793         iph->daddr              =       rt->rt_dst;
794         iph->saddr              =       rt->rt_src;
795
796         if ((iph->ttl = tiph->ttl) == 0) {
797                 if (skb->protocol == htons(ETH_P_IP))
798                         iph->ttl = old_iph->ttl;
799 #ifdef CONFIG_IPV6
800                 else if (skb->protocol == htons(ETH_P_IPV6))
801                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
802 #endif
803                 else
804                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
805         }
806
807         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
808         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
809                                    htons(ETH_P_TEB) : skb->protocol;
810
811         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
812                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
813
814                 if (tunnel->parms.o_flags&GRE_SEQ) {
815                         ++tunnel->o_seqno;
816                         *ptr = htonl(tunnel->o_seqno);
817                         ptr--;
818                 }
819                 if (tunnel->parms.o_flags&GRE_KEY) {
820                         *ptr = tunnel->parms.o_key;
821                         ptr--;
822                 }
823                 if (tunnel->parms.o_flags&GRE_CSUM) {
824                         *ptr = 0;
825                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
826                 }
827         }
828
829         nf_reset(skb);
830
831         IPTUNNEL_XMIT();
832         tunnel->recursion--;
833         return 0;
834
835 tx_error_icmp:
836         dst_link_failure(skb);
837
838 tx_error:
839         stats->tx_errors++;
840         dev_kfree_skb(skb);
841         tunnel->recursion--;
842         return 0;
843 }
844
845 static int ipgre_tunnel_bind_dev(struct net_device *dev)
846 {
847         struct net_device *tdev = NULL;
848         struct ip_tunnel *tunnel;
849         struct iphdr *iph;
850         int hlen = LL_MAX_HEADER;
851         int mtu = ETH_DATA_LEN;
852         int addend = sizeof(struct iphdr) + 4;
853
854         tunnel = netdev_priv(dev);
855         iph = &tunnel->parms.iph;
856
857         /* Guess output device to choose reasonable mtu and needed_headroom */
858
859         if (iph->daddr) {
860                 struct flowi fl = { .oif = tunnel->parms.link,
861                                     .nl_u = { .ip4_u =
862                                               { .daddr = iph->daddr,
863                                                 .saddr = iph->saddr,
864                                                 .tos = RT_TOS(iph->tos) } },
865                                     .proto = IPPROTO_GRE };
866                 struct rtable *rt;
867                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
868                         tdev = rt->u.dst.dev;
869                         ip_rt_put(rt);
870                 }
871
872                 if (dev->type != ARPHRD_ETHER)
873                         dev->flags |= IFF_POINTOPOINT;
874         }
875
876         if (!tdev && tunnel->parms.link)
877                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
878
879         if (tdev) {
880                 hlen = tdev->hard_header_len + tdev->needed_headroom;
881                 mtu = tdev->mtu;
882         }
883         dev->iflink = tunnel->parms.link;
884
885         /* Precalculate GRE options length */
886         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
887                 if (tunnel->parms.o_flags&GRE_CSUM)
888                         addend += 4;
889                 if (tunnel->parms.o_flags&GRE_KEY)
890                         addend += 4;
891                 if (tunnel->parms.o_flags&GRE_SEQ)
892                         addend += 4;
893         }
894         dev->needed_headroom = addend + hlen;
895         mtu -= dev->hard_header_len - addend;
896
897         if (mtu < 68)
898                 mtu = 68;
899
900         tunnel->hlen = addend;
901
902         return mtu;
903 }
904
/*
 * ioctl handler installed as .ndo_do_ioctl in ipgre_netdev_ops: get, add,
 * change and delete tunnels through the SIOC*TUNNEL commands.
 *
 * Tunnel parameters are exchanged as a struct ip_tunnel_parm that user
 * space points to through ifr->ifr_ifru.ifru_data.  When the ioctl is
 * issued on the per-namespace fallback device (ign->fb_tunnel_dev), the
 * parameters select the tunnel to act on; otherwise the device the ioctl
 * was issued on is the target.
 *
 * Every command except SIOCGETTUNNEL requires CAP_NET_ADMIN.
 *
 * Returns 0 on success or a negative errno (-EPERM, -EFAULT, -EINVAL,
 * -EEXIST, -ENOBUFS, -ENOENT).
 */
905 static int
906 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
907 {
908         int err = 0;
909         struct ip_tunnel_parm p;
910         struct ip_tunnel *t;
911         struct net *net = dev_net(dev);
912         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
913
914         switch (cmd) {
915         case SIOCGETTUNNEL:
                    /*
                     * On the fallback device, look the tunnel up from the
                     * user-supplied parameters; if nothing matches (or on
                     * any other device) report the device's own tunnel.
                     */
916                 t = NULL;
917                 if (dev == ign->fb_tunnel_dev) {
918                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
919                                 err = -EFAULT;
920                                 break;
921                         }
922                         t = ipgre_tunnel_locate(net, &p, 0);
923                 }
924                 if (t == NULL)
925                         t = netdev_priv(dev);
926                 memcpy(&p, &t->parms, sizeof(p));
927                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
928                         err = -EFAULT;
929                 break;
930
931         case SIOCADDTUNNEL:
932         case SIOCCHGTUNNEL:
933                 err = -EPERM;
934                 if (!capable(CAP_NET_ADMIN))
935                         goto done;
936
937                 err = -EFAULT;
938                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
939                         goto done;
940
                    /*
                     * Accept only an IPv4/GRE template with ihl == 5, no
                     * frag_off bits other than DF, and neither the GRE
                     * version bits nor GRE routing in either flag set.
                     */
941                 err = -EINVAL;
942                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
943                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
944                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
945                         goto done;
                    /* A non-zero (fixed) TTL forces DF on */
946                 if (p.iph.ttl)
947                         p.iph.frag_off |= htons(IP_DF);
948
                    /* Discard key values whose GRE_KEY flag is not set */
949                 if (!(p.i_flags&GRE_KEY))
950                         p.i_key = 0;
951                 if (!(p.o_flags&GRE_KEY))
952                         p.o_key = 0;
953
                    /* The third argument asks for creation on SIOCADDTUNNEL only */
954                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
955
956                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
957                         if (t != NULL) {
                                    /* The new parameters already belong to another device */
958                                 if (t->dev != dev) {
959                                         err = -EEXIST;
960                                         break;
961                                 }
962                         } else {
963                                 unsigned nflags = 0;
964
965                                 t = netdev_priv(dev);
966
967                                 if (ipv4_is_multicast(p.iph.daddr))
968                                         nflags = IFF_BROADCAST;
969                                 else if (p.iph.daddr)
970                                         nflags = IFF_POINTOPOINT;
971
                                    /*
                                     * The new destination must not switch the
                                     * device between point-to-point and
                                     * broadcast mode.
                                     */
972                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
973                                         err = -EINVAL;
974                                         break;
975                                 }
                                    /*
                                     * Unlink, update endpoints and keys, then
                                     * link again.
                                     * NOTE(review): presumably these fields
                                     * determine where the tunnel is hashed,
                                     * hence the unlink/link pair - confirm
                                     * against ipgre_tunnel_link().
                                     */
976                                 ipgre_tunnel_unlink(ign, t);
977                                 t->parms.iph.saddr = p.iph.saddr;
978                                 t->parms.iph.daddr = p.iph.daddr;
979                                 t->parms.i_key = p.i_key;
980                                 t->parms.o_key = p.o_key;
981                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
982                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
983                                 ipgre_tunnel_link(ign, t);
984                                 netdev_state_change(dev);
985                         }
986                 }
987
988                 if (t) {
989                         err = 0;
990                         if (cmd == SIOCCHGTUNNEL) {
991                                 t->parms.iph.ttl = p.iph.ttl;
992                                 t->parms.iph.tos = p.iph.tos;
993                                 t->parms.iph.frag_off = p.iph.frag_off;
                                    /*
                                     * NOTE(review): the rebind below acts on
                                     * `dev`, which is the fallback device when
                                     * the ioctl was issued on it, rather than
                                     * on t->dev - confirm this is intended.
                                     */
994                                 if (t->parms.link != p.link) {
995                                         t->parms.link = p.link;
996                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
997                                         netdev_state_change(dev);
998                                 }
999                         }
                             /* Hand the resulting parameters back to user space */
1000                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1001                                 err = -EFAULT;
1002                 } else
1003                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1004                 break;
1005
1006         case SIOCDELTUNNEL:
1007                 err = -EPERM;
1008                 if (!capable(CAP_NET_ADMIN))
1009                         goto done;
1010
                     /*
                      * Via the fallback device: delete the tunnel matching the
                      * user parameters, but never the fallback tunnel itself.
                      */
1011                 if (dev == ign->fb_tunnel_dev) {
1012                         err = -EFAULT;
1013                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1014                                 goto done;
1015                         err = -ENOENT;
1016                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1017                                 goto done;
1018                         err = -EPERM;
1019                         if (t == netdev_priv(ign->fb_tunnel_dev))
1020                                 goto done;
1021                         dev = t->dev;
1022                 }
1023                 unregister_netdevice(dev);
1024                 err = 0;
1025                 break;
1026
1027         default:
1028                 err = -EINVAL;
1029         }
1030
1031 done:
1032         return err;
1033 }
1034
1035 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1036 {
1037         struct ip_tunnel *tunnel = netdev_priv(dev);
1038         if (new_mtu < 68 ||
1039             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1040                 return -EINVAL;
1041         dev->mtu = new_mtu;
1042         return 0;
1043 }
1044
1045 /* Nice toy. Unfortunately, useless in real life :-)
1046    It allows to construct virtual multiprotocol broadcast "LAN"
1047    over the Internet, provided multicast routing is tuned.
1048
1049
1050    I have no idea whether this bicycle was invented before me,
1051    so I had to set ARPHRD_IPGRE to a random value.
1052    I have the impression that Cisco could make something similar,
1053    but this feature is apparently missing in IOS<=11.2(8).
1054
1055    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1056    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1057
1058    ping -t 255 224.66.66.66
1059
1060    If nobody answers, mbone does not work.
1061
1062    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1063    ip addr add 10.66.66.<somewhat>/24 dev Universe
1064    ifconfig Universe up
1065    ifconfig Universe add fe80::<Your_real_addr>/10
1066    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1067    ftp 10.66.66.66
1068    ...
1069    ftp fec0:6666:6666::193.233.7.65
1070    ...
1071
1072  */
1073
1074 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1075                         unsigned short type,
1076                         const void *daddr, const void *saddr, unsigned len)
1077 {
1078         struct ip_tunnel *t = netdev_priv(dev);
1079         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1080         __be16 *p = (__be16*)(iph+1);
1081
1082         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1083         p[0]            = t->parms.o_flags;
1084         p[1]            = htons(type);
1085
1086         /*
1087          *      Set the source hardware address.
1088          */
1089
1090         if (saddr)
1091                 memcpy(&iph->saddr, saddr, 4);
1092
1093         if (daddr) {
1094                 memcpy(&iph->daddr, daddr, 4);
1095                 return t->hlen;
1096         }
1097         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1098                 return t->hlen;
1099
1100         return -t->hlen;
1101 }
1102
1103 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1104 {
1105         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1106         memcpy(haddr, &iph->saddr, 4);
1107         return 4;
1108 }
1109
/*
 * Link-layer header operations for GRE devices.  ipgre_tunnel_init()
 * installs them when the tunnel has no fixed remote address or (with
 * CONFIG_NET_IPGRE_BROADCAST) a multicast one.
 */
1110 static const struct header_ops ipgre_header_ops = {
1111         .create = ipgre_header,
1112         .parse  = ipgre_header_parse,
1113 };
1114
1115 #ifdef CONFIG_NET_IPGRE_BROADCAST
1116 static int ipgre_open(struct net_device *dev)
1117 {
1118         struct ip_tunnel *t = netdev_priv(dev);
1119
1120         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1121                 struct flowi fl = { .oif = t->parms.link,
1122                                     .nl_u = { .ip4_u =
1123                                               { .daddr = t->parms.iph.daddr,
1124                                                 .saddr = t->parms.iph.saddr,
1125                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1126                                     .proto = IPPROTO_GRE };
1127                 struct rtable *rt;
1128                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1129                         return -EADDRNOTAVAIL;
1130                 dev = rt->u.dst.dev;
1131                 ip_rt_put(rt);
1132                 if (__in_dev_get_rtnl(dev) == NULL)
1133                         return -EADDRNOTAVAIL;
1134                 t->mlink = dev->ifindex;
1135                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1136         }
1137         return 0;
1138 }
1139
1140 static int ipgre_close(struct net_device *dev)
1141 {
1142         struct ip_tunnel *t = netdev_priv(dev);
1143
1144         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1145                 struct in_device *in_dev;
1146                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1147                 if (in_dev) {
1148                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1149                         in_dev_put(in_dev);
1150                 }
1151         }
1152         return 0;
1153 }
1154
1155 #endif
1156
/*
 * Device operations installed by ipgre_tunnel_setup().  The open/stop hooks
 * exist only with CONFIG_NET_IPGRE_BROADCAST, where they join and leave the
 * multicast group of a tunnel with a multicast remote address.
 */
1157 static const struct net_device_ops ipgre_netdev_ops = {
1158         .ndo_init               = ipgre_tunnel_init,
1159         .ndo_uninit             = ipgre_tunnel_uninit,
1160 #ifdef CONFIG_NET_IPGRE_BROADCAST
1161         .ndo_open               = ipgre_open,
1162         .ndo_stop               = ipgre_close,
1163 #endif
1164         .ndo_start_xmit         = ipgre_tunnel_xmit,
1165         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1166         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1167 };
1168
1169 static void ipgre_tunnel_setup(struct net_device *dev)
1170 {
1171         dev->netdev_ops         = &ipgre_netdev_ops;
1172         dev->destructor         = free_netdev;
1173
1174         dev->type               = ARPHRD_IPGRE;
1175         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1176         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1177         dev->flags              = IFF_NOARP;
1178         dev->iflink             = 0;
1179         dev->addr_len           = 4;
1180         dev->features           |= NETIF_F_NETNS_LOCAL;
1181 }
1182
1183 static int ipgre_tunnel_init(struct net_device *dev)
1184 {
1185         struct ip_tunnel *tunnel;
1186         struct iphdr *iph;
1187
1188         tunnel = netdev_priv(dev);
1189         iph = &tunnel->parms.iph;
1190
1191         tunnel->dev = dev;
1192         strcpy(tunnel->parms.name, dev->name);
1193
1194         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1195         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1196
1197         if (iph->daddr) {
1198 #ifdef CONFIG_NET_IPGRE_BROADCAST
1199                 if (ipv4_is_multicast(iph->daddr)) {
1200                         if (!iph->saddr)
1201                                 return -EINVAL;
1202                         dev->flags = IFF_BROADCAST;
1203                         dev->header_ops = &ipgre_header_ops;
1204                 }
1205 #endif
1206         } else
1207                 dev->header_ops = &ipgre_header_ops;
1208
1209         return 0;
1210 }
1211
1212 static void ipgre_fb_tunnel_init(struct net_device *dev)
1213 {
1214         struct ip_tunnel *tunnel = netdev_priv(dev);
1215         struct iphdr *iph = &tunnel->parms.iph;
1216         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1217
1218         tunnel->dev = dev;
1219         strcpy(tunnel->parms.name, dev->name);
1220
1221         iph->version            = 4;
1222         iph->protocol           = IPPROTO_GRE;
1223         iph->ihl                = 5;
1224         tunnel->hlen            = sizeof(struct iphdr) + 4;
1225
1226         dev_hold(dev);
1227         ign->tunnels_wc[0]      = tunnel;
1228 }
1229
1230
/*
 * Receive and error handlers that ipgre_init() registers for IPPROTO_GRE
 * through inet_add_protocol().
 */
1231 static struct net_protocol ipgre_protocol = {
1232         .handler        =       ipgre_rcv,
1233         .err_handler    =       ipgre_err,
1234         .netns_ok       =       1,
1235 };
1236
1237 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1238 {
1239         int prio;
1240
1241         for (prio = 0; prio < 4; prio++) {
1242                 int h;
1243                 for (h = 0; h < HASH_SIZE; h++) {
1244                         struct ip_tunnel *t;
1245                         while ((t = ign->tunnels[prio][h]) != NULL)
1246                                 unregister_netdevice(t->dev);
1247                 }
1248         }
1249 }
1250
1251 static int ipgre_init_net(struct net *net)
1252 {
1253         int err;
1254         struct ipgre_net *ign;
1255
1256         err = -ENOMEM;
1257         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1258         if (ign == NULL)
1259                 goto err_alloc;
1260
1261         err = net_assign_generic(net, ipgre_net_id, ign);
1262         if (err < 0)
1263                 goto err_assign;
1264
1265         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1266                                            ipgre_tunnel_setup);
1267         if (!ign->fb_tunnel_dev) {
1268                 err = -ENOMEM;
1269                 goto err_alloc_dev;
1270         }
1271         dev_net_set(ign->fb_tunnel_dev, net);
1272
1273         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1274         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1275
1276         if ((err = register_netdev(ign->fb_tunnel_dev)))
1277                 goto err_reg_dev;
1278
1279         return 0;
1280
1281 err_reg_dev:
1282         free_netdev(ign->fb_tunnel_dev);
1283 err_alloc_dev:
1284         /* nothing */
1285 err_assign:
1286         kfree(ign);
1287 err_alloc:
1288         return err;
1289 }
1290
1291 static void ipgre_exit_net(struct net *net)
1292 {
1293         struct ipgre_net *ign;
1294
1295         ign = net_generic(net, ipgre_net_id);
1296         rtnl_lock();
1297         ipgre_destroy_tunnels(ign);
1298         rtnl_unlock();
1299         kfree(ign);
1300 }
1301
/*
 * Per-network-namespace init/exit hooks, registered in ipgre_init() with
 * register_pernet_gen_device().
 */
1302 static struct pernet_operations ipgre_net_ops = {
1303         .init = ipgre_init_net,
1304         .exit = ipgre_exit_net,
1305 };
1306
1307 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1308 {
1309         __be16 flags;
1310
1311         if (!data)
1312                 return 0;
1313
1314         flags = 0;
1315         if (data[IFLA_GRE_IFLAGS])
1316                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1317         if (data[IFLA_GRE_OFLAGS])
1318                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1319         if (flags & (GRE_VERSION|GRE_ROUTING))
1320                 return -EINVAL;
1321
1322         return 0;
1323 }
1324
1325 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1326 {
1327         __be32 daddr;
1328
1329         if (tb[IFLA_ADDRESS]) {
1330                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1331                         return -EINVAL;
1332                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1333                         return -EADDRNOTAVAIL;
1334         }
1335
1336         if (!data)
1337                 goto out;
1338
1339         if (data[IFLA_GRE_REMOTE]) {
1340                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1341                 if (!daddr)
1342                         return -EINVAL;
1343         }
1344
1345 out:
1346         return ipgre_tunnel_validate(tb, data);
1347 }
1348
1349 static void ipgre_netlink_parms(struct nlattr *data[],
1350                                 struct ip_tunnel_parm *parms)
1351 {
1352         memset(parms, 0, sizeof(*parms));
1353
1354         parms->iph.protocol = IPPROTO_GRE;
1355
1356         if (!data)
1357                 return;
1358
1359         if (data[IFLA_GRE_LINK])
1360                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1361
1362         if (data[IFLA_GRE_IFLAGS])
1363                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1364
1365         if (data[IFLA_GRE_OFLAGS])
1366                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1367
1368         if (data[IFLA_GRE_IKEY])
1369                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1370
1371         if (data[IFLA_GRE_OKEY])
1372                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1373
1374         if (data[IFLA_GRE_LOCAL])
1375                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1376
1377         if (data[IFLA_GRE_REMOTE])
1378                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1379
1380         if (data[IFLA_GRE_TTL])
1381                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1382
1383         if (data[IFLA_GRE_TOS])
1384                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1385
1386         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1387                 parms->iph.frag_off = htons(IP_DF);
1388 }
1389
1390 static int ipgre_tap_init(struct net_device *dev)
1391 {
1392         struct ip_tunnel *tunnel;
1393
1394         tunnel = netdev_priv(dev);
1395
1396         tunnel->dev = dev;
1397         strcpy(tunnel->parms.name, dev->name);
1398
1399         ipgre_tunnel_bind_dev(dev);
1400
1401         return 0;
1402 }
1403
/*
 * Device operations for Ethernet-over-GRE ("gretap") devices.  Transmit,
 * uninit and MTU handling are shared with plain GRE tunnels; MAC address
 * handling uses the generic Ethernet helpers.
 */
1404 static const struct net_device_ops ipgre_tap_netdev_ops = {
1405         .ndo_init               = ipgre_tap_init,
1406         .ndo_uninit             = ipgre_tunnel_uninit,
1407         .ndo_start_xmit         = ipgre_tunnel_xmit,
1408         .ndo_set_mac_address    = eth_mac_addr,
1409         .ndo_validate_addr      = eth_validate_addr,
1410         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1411 };
1412
1413 static void ipgre_tap_setup(struct net_device *dev)
1414 {
1415
1416         ether_setup(dev);
1417
1418         dev->netdev_ops         = &ipgre_netdev_ops;
1419         dev->destructor         = free_netdev;
1420
1421         dev->iflink             = 0;
1422         dev->features           |= NETIF_F_NETNS_LOCAL;
1423 }
1424
1425 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1426                          struct nlattr *data[])
1427 {
1428         struct ip_tunnel *nt;
1429         struct net *net = dev_net(dev);
1430         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1431         int mtu;
1432         int err;
1433
1434         nt = netdev_priv(dev);
1435         ipgre_netlink_parms(data, &nt->parms);
1436
1437         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1438                 return -EEXIST;
1439
1440         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1441                 random_ether_addr(dev->dev_addr);
1442
1443         mtu = ipgre_tunnel_bind_dev(dev);
1444         if (!tb[IFLA_MTU])
1445                 dev->mtu = mtu;
1446
1447         err = register_netdevice(dev);
1448         if (err)
1449                 goto out;
1450
1451         dev_hold(dev);
1452         ipgre_tunnel_link(ign, nt);
1453
1454 out:
1455         return err;
1456 }
1457
1458 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1459                             struct nlattr *data[])
1460 {
1461         struct ip_tunnel *t, *nt;
1462         struct net *net = dev_net(dev);
1463         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1464         struct ip_tunnel_parm p;
1465         int mtu;
1466
1467         if (dev == ign->fb_tunnel_dev)
1468                 return -EINVAL;
1469
1470         nt = netdev_priv(dev);
1471         ipgre_netlink_parms(data, &p);
1472
1473         t = ipgre_tunnel_locate(net, &p, 0);
1474
1475         if (t) {
1476                 if (t->dev != dev)
1477                         return -EEXIST;
1478         } else {
1479                 unsigned nflags = 0;
1480
1481                 t = nt;
1482
1483                 if (ipv4_is_multicast(p.iph.daddr))
1484                         nflags = IFF_BROADCAST;
1485                 else if (p.iph.daddr)
1486                         nflags = IFF_POINTOPOINT;
1487
1488                 if ((dev->flags ^ nflags) &
1489                     (IFF_POINTOPOINT | IFF_BROADCAST))
1490                         return -EINVAL;
1491
1492                 ipgre_tunnel_unlink(ign, t);
1493                 t->parms.iph.saddr = p.iph.saddr;
1494                 t->parms.iph.daddr = p.iph.daddr;
1495                 t->parms.i_key = p.i_key;
1496                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1497                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1498                 ipgre_tunnel_link(ign, t);
1499                 netdev_state_change(dev);
1500         }
1501
1502         t->parms.o_key = p.o_key;
1503         t->parms.iph.ttl = p.iph.ttl;
1504         t->parms.iph.tos = p.iph.tos;
1505         t->parms.iph.frag_off = p.iph.frag_off;
1506
1507         if (t->parms.link != p.link) {
1508                 t->parms.link = p.link;
1509                 mtu = ipgre_tunnel_bind_dev(dev);
1510                 if (!tb[IFLA_MTU])
1511                         dev->mtu = mtu;
1512                 netdev_state_change(dev);
1513         }
1514
1515         return 0;
1516 }
1517
1518 static size_t ipgre_get_size(const struct net_device *dev)
1519 {
1520         return
1521                 /* IFLA_GRE_LINK */
1522                 nla_total_size(4) +
1523                 /* IFLA_GRE_IFLAGS */
1524                 nla_total_size(2) +
1525                 /* IFLA_GRE_OFLAGS */
1526                 nla_total_size(2) +
1527                 /* IFLA_GRE_IKEY */
1528                 nla_total_size(4) +
1529                 /* IFLA_GRE_OKEY */
1530                 nla_total_size(4) +
1531                 /* IFLA_GRE_LOCAL */
1532                 nla_total_size(4) +
1533                 /* IFLA_GRE_REMOTE */
1534                 nla_total_size(4) +
1535                 /* IFLA_GRE_TTL */
1536                 nla_total_size(1) +
1537                 /* IFLA_GRE_TOS */
1538                 nla_total_size(1) +
1539                 /* IFLA_GRE_PMTUDISC */
1540                 nla_total_size(1) +
1541                 0;
1542 }
1543
1544 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1545 {
1546         struct ip_tunnel *t = netdev_priv(dev);
1547         struct ip_tunnel_parm *p = &t->parms;
1548
1549         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1550         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1551         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1552         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1553         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1554         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1555         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1556         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1557         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1558         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1559
1560         return 0;
1561
1562 nla_put_failure:
1563         return -EMSGSIZE;
1564 }
1565
/*
 * Netlink attribute policy for IFLA_GRE_* attributes, shared by the "gre"
 * and "gretap" link ops.  The local and remote addresses are constrained
 * by length (the size of an IPv4 address field) rather than by type.
 */
1566 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1567         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1568         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1569         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1570         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1571         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1572         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1573         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1574         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1575         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1576         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1577 };
1578
/*
 * rtnetlink link operations for link kind "gre"; also assigned to the
 * fallback device in ipgre_init_net().
 */
1579 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1580         .kind           = "gre",
1581         .maxtype        = IFLA_GRE_MAX,
1582         .policy         = ipgre_policy,
1583         .priv_size      = sizeof(struct ip_tunnel),
1584         .setup          = ipgre_tunnel_setup,
1585         .validate       = ipgre_tunnel_validate,
1586         .newlink        = ipgre_newlink,
1587         .changelink     = ipgre_changelink,
1588         .get_size       = ipgre_get_size,
1589         .fill_info      = ipgre_fill_info,
1590 };
1591
/*
 * rtnetlink link operations for link kind "gretap".  Only the setup and
 * validate hooks differ from the "gre" link ops.
 */
1592 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1593         .kind           = "gretap",
1594         .maxtype        = IFLA_GRE_MAX,
1595         .policy         = ipgre_policy,
1596         .priv_size      = sizeof(struct ip_tunnel),
1597         .setup          = ipgre_tap_setup,
1598         .validate       = ipgre_tap_validate,
1599         .newlink        = ipgre_newlink,
1600         .changelink     = ipgre_changelink,
1601         .get_size       = ipgre_get_size,
1602         .fill_info      = ipgre_fill_info,
1603 };
1604
1605 /*
1606  *      And now the modules code and kernel interface.
1607  */
1608
1609 static int __init ipgre_init(void)
1610 {
1611         int err;
1612
1613         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1614
1615         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1616                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1617                 return -EAGAIN;
1618         }
1619
1620         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1621         if (err < 0)
1622                 goto gen_device_failed;
1623
1624         err = rtnl_link_register(&ipgre_link_ops);
1625         if (err < 0)
1626                 goto rtnl_link_failed;
1627
1628         err = rtnl_link_register(&ipgre_tap_ops);
1629         if (err < 0)
1630                 goto tap_ops_failed;
1631
1632 out:
1633         return err;
1634
1635 tap_ops_failed:
1636         rtnl_link_unregister(&ipgre_link_ops);
1637 rtnl_link_failed:
1638         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1639 gen_device_failed:
1640         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1641         goto out;
1642 }
1643
1644 static void __exit ipgre_fini(void)
1645 {
1646         rtnl_link_unregister(&ipgre_tap_ops);
1647         rtnl_link_unregister(&ipgre_link_ops);
1648         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1649         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1650                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1651 }
1652
/*
 * Module entry/exit points, license, and aliases for the "gre" and
 * "gretap" rtnetlink link kinds.
 */
1653 module_init(ipgre_init);
1654 module_exit(ipgre_fini);
1655 MODULE_LICENSE("GPL");
1656 MODULE_ALIAS_RTNL_LINK("gre");
1657 MODULE_ALIAS_RTNL_LINK("gretap");