Merge git://git.kernel.org/pub/scm/linux/kernel/git/bart/ide-2.6
[linux-2.6] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 /*
54    Problems & solutions
55    --------------------
56
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
66    solution, but it supposes maintaining a new variable in ALL
67    skb, even if no tunneling is used.
68
69    Current solution: t->recursion lock breaks dead loops. It looks
70    like dev->tbusy flag, but I preferred new variable, because
71    the semantics is different. One day, when hard_start_xmit
72    will be multithreaded we will have to use skb->encapsulation.
73
74
75
76    2. Networking dead loops would not kill routers, but would really
77    kill network. IP hop limit plays role of "t->recursion" in this case,
78    if we copy it from packet being encapsulated to upper header.
79    It is very good solution, but it introduces two problems:
80
81    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82      do not work over tunnels.
83    - traceroute does not work. I planned to relay ICMP from tunnel,
84      so that this problem would be solved and traceroute output
85      would even more informative. This idea appeared to be wrong:
86      only Linux complies to rfc1812 now (yes, guys, Linux is the only
87      true router now :-)), all routers (at least, in neighbourhood of mine)
88      return only 8 bytes of payload. It is the end.
89
90    Hence, if we want that OSPF worked or traceroute said something reasonable,
91    we should search for another solution.
92
93    One of them is to parse packet trying to detect inner encapsulation
94    made by our node. It is difficult or even impossible, especially,
95    taking into account fragmentation. To be short, it is not a solution at all.
96
97    Current solution: The solution was UNEXPECTEDLY SIMPLE.
98    We force DF flag on tunnels with preconfigured hop limit,
99    that is ALL. :-) Well, it does not remove the problem completely,
100    but exponential growth of network traffic is changed to linear
101    (branches, that exceed pmtu are pruned) and tunnel mtu
102    fastly degrades to value <68, where looping stops.
103    Yes, it is not good if there exists a router in the loop,
104    which does not force DF, even when encapsulating packets have DF set.
105    But it is not our problem! Nobody could accuse us, we made
106    all that we could make. Even if it is your gated who injected
107    fatal route to network, even if it were you who configured
108    fatal static route: you are innocent. :-)
109
110
111
112    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113    practically identical code. It would be good to glue them
114    together, but it is not very evident, how to make them modular.
115    sit is integral part of IPv6, ipip and gre are naturally modular.
116    We could extract common parts (hash table, ioctl etc)
117    to a separate module (ip_tunnel.c).
118
119    Alexey Kuznetsov.
120  */
121
/*
 * Forward declarations. ipgre_link_ops, ipgre_tunnel_setup() and
 * ipgre_tunnel_bind_dev() are used by ipgre_tunnel_locate() below; none of
 * the objects declared here is defined in the part of the file shown before
 * that first use.
 */
122 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
123 static int ipgre_tunnel_init(struct net_device *dev);
124 static void ipgre_tunnel_setup(struct net_device *dev);
125 static int ipgre_tunnel_bind_dev(struct net_device *dev);
126
127 /* Fallback tunnel: no source, no destination, no key, no options */
128
129 static int ipgre_fb_tunnel_init(struct net_device *dev);
130
/* Number of buckets in each of the tunnel hash tables. */
131 #define HASH_SIZE  16
132
/* Id handed to net_generic() to fetch this module's struct ipgre_net. */
133 static int ipgre_net_id;
/*
 * Per-network-namespace GRE state.
 *
 * tunnels:       four hash tables of HASH_SIZE chains each; the first index
 *                is the priority computed by __ipgre_bucket() (bit 0 set when
 *                a local address is configured, bit 1 set when a
 *                non-multicast remote address is configured).
 * fb_tunnel_dev: fallback device; ipgre_tunnel_lookup() returns its tunnel
 *                when no configured tunnel matches and the device is up.
 */
134 struct ipgre_net {
135         struct ip_tunnel *tunnels[4][HASH_SIZE];
136
137         struct net_device *fb_tunnel_dev;
138 };
139
140 /* Tunnel hash table */
141
142 /*
143    4 hash tables:
144
145    3: (remote,local)
146    2: (remote,*)
147    1: (*,local)
148    0: (*,*)
149
150    We require exact key match i.e. if a key is present in packet
151    it will match only tunnel with the same key; if it is not present,
152    it will match only keyless tunnel.
153
154    All keyless packets, if they do not match a configured keyless tunnel,
155    will match the fallback tunnel.
156  */
157
/*
 * Fold a 32-bit address or key into a bucket index in [0, HASH_SIZE).
 * The argument is parenthesised so that expression arguments are hashed as
 * a whole, and the mask is derived from HASH_SIZE (which must stay a power
 * of two) instead of being a separate magic constant.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & (HASH_SIZE - 1))
159
/*
 * Shorthand for the four hash tables inside struct ipgre_net, named after
 * which endpoint addresses the tunnels stored there have configured:
 * remote+local, remote only, local only, wildcard.
 */
160 #define tunnels_r_l     tunnels[3]
161 #define tunnels_r       tunnels[2]
162 #define tunnels_l       tunnels[1]
163 #define tunnels_wc      tunnels[0]
164
/*
 * Reader/writer lock for the hash chains: taken for writing (BH-disabled)
 * when a chain head/link is updated in ipgre_tunnel_link()/_unlink(), and
 * for reading around ipgre_tunnel_lookup() in ipgre_rcv() and ipgre_err().
 */
165 static DEFINE_RWLOCK(ipgre_lock);
166
167 /* Given src, dst and key, find appropriate for input tunnel. */
168
169 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
170                                               __be32 remote, __be32 local,
171                                               __be32 key, __be16 gre_proto)
172 {
173         unsigned h0 = HASH(remote);
174         unsigned h1 = HASH(key);
175         struct ip_tunnel *t;
176         struct ip_tunnel *t2 = NULL;
177         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
178         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
179                        ARPHRD_ETHER : ARPHRD_IPGRE;
180
181         for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
182                 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
183                         if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
184                                 if (t->dev->type == dev_type)
185                                         return t;
186                                 if (t->dev->type == ARPHRD_IPGRE && !t2)
187                                         t2 = t;
188                         }
189                 }
190         }
191
192         for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
193                 if (remote == t->parms.iph.daddr) {
194                         if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
195                                 if (t->dev->type == dev_type)
196                                         return t;
197                                 if (t->dev->type == ARPHRD_IPGRE && !t2)
198                                         t2 = t;
199                         }
200                 }
201         }
202
203         for (t = ign->tunnels_l[h1]; t; t = t->next) {
204                 if (local == t->parms.iph.saddr ||
205                      (local == t->parms.iph.daddr &&
206                       ipv4_is_multicast(local))) {
207                         if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
208                                 if (t->dev->type == dev_type)
209                                         return t;
210                                 if (t->dev->type == ARPHRD_IPGRE && !t2)
211                                         t2 = t;
212                         }
213                 }
214         }
215
216         for (t = ign->tunnels_wc[h1]; t; t = t->next) {
217                 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
218                         if (t->dev->type == dev_type)
219                                 return t;
220                         if (t->dev->type == ARPHRD_IPGRE && !t2)
221                                 t2 = t;
222                 }
223         }
224
225         if (t2)
226                 return t2;
227
228         if (ign->fb_tunnel_dev->flags&IFF_UP)
229                 return netdev_priv(ign->fb_tunnel_dev);
230         return NULL;
231 }
232
233 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
234                 struct ip_tunnel_parm *parms)
235 {
236         __be32 remote = parms->iph.daddr;
237         __be32 local = parms->iph.saddr;
238         __be32 key = parms->i_key;
239         unsigned h = HASH(key);
240         int prio = 0;
241
242         if (local)
243                 prio |= 1;
244         if (remote && !ipv4_is_multicast(remote)) {
245                 prio |= 2;
246                 h ^= HASH(remote);
247         }
248
249         return &ign->tunnels[prio][h];
250 }
251
252 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
253                 struct ip_tunnel *t)
254 {
255         return __ipgre_bucket(ign, &t->parms);
256 }
257
258 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
259 {
260         struct ip_tunnel **tp = ipgre_bucket(ign, t);
261
262         t->next = *tp;
263         write_lock_bh(&ipgre_lock);
264         *tp = t;
265         write_unlock_bh(&ipgre_lock);
266 }
267
268 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
269 {
270         struct ip_tunnel **tp;
271
272         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
273                 if (t == *tp) {
274                         write_lock_bh(&ipgre_lock);
275                         *tp = t->next;
276                         write_unlock_bh(&ipgre_lock);
277                         break;
278                 }
279         }
280 }
281
282 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
283                                            struct ip_tunnel_parm *parms,
284                                            int type)
285 {
286         __be32 remote = parms->iph.daddr;
287         __be32 local = parms->iph.saddr;
288         __be32 key = parms->i_key;
289         struct ip_tunnel *t, **tp;
290         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
291
292         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
293                 if (local == t->parms.iph.saddr &&
294                     remote == t->parms.iph.daddr &&
295                     key == t->parms.i_key &&
296                     type == t->dev->type)
297                         break;
298
299         return t;
300 }
301
302 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
303                 struct ip_tunnel_parm *parms, int create)
304 {
305         struct ip_tunnel *t, *nt;
306         struct net_device *dev;
307         char name[IFNAMSIZ];
308         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
309
310         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
311         if (t || !create)
312                 return t;
313
314         if (parms->name[0])
315                 strlcpy(name, parms->name, IFNAMSIZ);
316         else
317                 sprintf(name, "gre%%d");
318
319         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
320         if (!dev)
321           return NULL;
322
323         dev_net_set(dev, net);
324
325         if (strchr(name, '%')) {
326                 if (dev_alloc_name(dev, name) < 0)
327                         goto failed_free;
328         }
329
330         nt = netdev_priv(dev);
331         nt->parms = *parms;
332         dev->rtnl_link_ops = &ipgre_link_ops;
333
334         dev->mtu = ipgre_tunnel_bind_dev(dev);
335
336         if (register_netdevice(dev) < 0)
337                 goto failed_free;
338
339         dev_hold(dev);
340         ipgre_tunnel_link(ign, nt);
341         return nt;
342
343 failed_free:
344         free_netdev(dev);
345         return NULL;
346 }
347
348 static void ipgre_tunnel_uninit(struct net_device *dev)
349 {
350         struct net *net = dev_net(dev);
351         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
352
353         ipgre_tunnel_unlink(ign, netdev_priv(dev));
354         dev_put(dev);
355 }
356
357
358 static void ipgre_err(struct sk_buff *skb, u32 info)
359 {
360
361 /* All the routers (except for Linux) return only
362    8 bytes of packet payload. It means, that precise relaying of
363    ICMP in the real Internet is absolutely infeasible.
364
365    Moreover, Cisco "wise men" put GRE key to the third word
366    in GRE header. It makes impossible maintaining even soft state for keyed
367    GRE tunnels with enabled checksum. Tell them "thank you".
368
369    Well, I wonder, rfc1812 was written by Cisco employee,
370    what the hell these idiots break standrads established
371    by themself???
372  */
373
374         struct iphdr *iph = (struct iphdr*)skb->data;
375         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
376         int grehlen = (iph->ihl<<2) + 4;
377         const int type = icmp_hdr(skb)->type;
378         const int code = icmp_hdr(skb)->code;
379         struct ip_tunnel *t;
380         __be16 flags;
381
382         flags = p[0];
383         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
384                 if (flags&(GRE_VERSION|GRE_ROUTING))
385                         return;
386                 if (flags&GRE_KEY) {
387                         grehlen += 4;
388                         if (flags&GRE_CSUM)
389                                 grehlen += 4;
390                 }
391         }
392
393         /* If only 8 bytes returned, keyed message will be dropped here */
394         if (skb_headlen(skb) < grehlen)
395                 return;
396
397         switch (type) {
398         default:
399         case ICMP_PARAMETERPROB:
400                 return;
401
402         case ICMP_DEST_UNREACH:
403                 switch (code) {
404                 case ICMP_SR_FAILED:
405                 case ICMP_PORT_UNREACH:
406                         /* Impossible event. */
407                         return;
408                 case ICMP_FRAG_NEEDED:
409                         /* Soft state for pmtu is maintained by IP core. */
410                         return;
411                 default:
412                         /* All others are translated to HOST_UNREACH.
413                            rfc2003 contains "deep thoughts" about NET_UNREACH,
414                            I believe they are just ether pollution. --ANK
415                          */
416                         break;
417                 }
418                 break;
419         case ICMP_TIME_EXCEEDED:
420                 if (code != ICMP_EXC_TTL)
421                         return;
422                 break;
423         }
424
425         read_lock(&ipgre_lock);
426         t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
427                                 flags & GRE_KEY ?
428                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
429                                 p[1]);
430         if (t == NULL || t->parms.iph.daddr == 0 ||
431             ipv4_is_multicast(t->parms.iph.daddr))
432                 goto out;
433
434         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
435                 goto out;
436
437         if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
438                 t->err_count++;
439         else
440                 t->err_count = 1;
441         t->err_time = jiffies;
442 out:
443         read_unlock(&ipgre_lock);
444         return;
445 }
446
447 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
448 {
449         if (INET_ECN_is_ce(iph->tos)) {
450                 if (skb->protocol == htons(ETH_P_IP)) {
451                         IP_ECN_set_ce(ip_hdr(skb));
452                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
453                         IP6_ECN_set_ce(ipv6_hdr(skb));
454                 }
455         }
456 }
457
458 static inline u8
459 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
460 {
461         u8 inner = 0;
462         if (skb->protocol == htons(ETH_P_IP))
463                 inner = old_iph->tos;
464         else if (skb->protocol == htons(ETH_P_IPV6))
465                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
466         return INET_ECN_encapsulate(tos, inner);
467 }
468
/*
 * ipgre_rcv - receive handler for GRE packets.
 *
 * Parses the GRE header found at skb->data (flags, optional checksum, key
 * and sequence number), looks up the tunnel for (outer source, outer
 * destination, key, GRE protocol), validates checksum and sequence number
 * against the tunnel's input flags, strips the GRE header and hands the
 * inner packet to netif_rx() on the tunnel device.
 *
 * When no tunnel matches, an ICMP port-unreachable is sent and the packet
 * is freed.  Every failure path frees the skb.  Always returns 0.
 */
469 static int ipgre_rcv(struct sk_buff *skb)
470 {
471         struct iphdr *iph;
472         u8     *h;
473         __be16    flags;
474         __sum16   csum = 0;
475         __be32 key = 0;
476         u32    seqno = 0;
477         struct ip_tunnel *tunnel;
            /* Bytes of GRE header consumed so far; the base header is 4. */
478         int    offset = 4;
479         __be16 gre_proto;
480         unsigned int len;
481
            /* 16 = base header (4) + checksum (4) + key (4) + sequence (4),
             * the most this function reads from the GRE header. */
482         if (!pskb_may_pull(skb, 16))
483                 goto drop_nolock;
484
485         iph = ip_hdr(skb);
486         h = skb->data;
487         flags = *(__be16*)h;
488
489         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
490                 /* - Version must be 0.
491                    - We do not support routing headers.
492                  */
493                 if (flags&(GRE_VERSION|GRE_ROUTING))
494                         goto drop_nolock;
495
496                 if (flags&GRE_CSUM) {
                            /* csum ends up non-zero when verification fails;
                             * it is acted upon only after the tunnel lookup. */
497                         switch (skb->ip_summed) {
498                         case CHECKSUM_COMPLETE:
499                                 csum = csum_fold(skb->csum);
500                                 if (!csum)
501                                         break;
502                                 /* fall through */
503                         case CHECKSUM_NONE:
504                                 skb->csum = 0;
505                                 csum = __skb_checksum_complete(skb);
506                                 skb->ip_summed = CHECKSUM_COMPLETE;
507                         }
508                         offset += 4;
509                 }
510                 if (flags&GRE_KEY) {
511                         key = *(__be32*)(h + offset);
512                         offset += 4;
513                 }
514                 if (flags&GRE_SEQ) {
515                         seqno = ntohl(*(__be32*)(h + offset));
516                         offset += 4;
517                 }
518         }
519
            /* Protocol type is the second 16-bit word of the base header. */
520         gre_proto = *(__be16 *)(h + 2);
521
            /* The read lock is held from the lookup until the packet has
             * been queued with netif_rx() or dropped. */
522         read_lock(&ipgre_lock);
523         if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
524                                           iph->saddr, iph->daddr, key,
525                                           gre_proto))) {
526                 struct net_device_stats *stats = &tunnel->dev->stats;
527
528                 secpath_reset(skb);
529
530                 skb->protocol = gre_proto;
531                 /* WCCP version 1 and 2 protocol decoding.
532                  * - Change protocol to IP
533                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
534                  */
535                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
536                         skb->protocol = htons(ETH_P_IP);
                            /* If the byte after the GRE header is not the
                             * start of an IPv4 header (version nibble 4),
                             * skip four more bytes. */
537                         if ((*(h + offset) & 0xF0) != 0x40)
538                                 offset += 4;
539                 }
540
541                 skb->mac_header = skb->network_header;
                    /* Strip the GRE header and fix up the running checksum
                     * for the bytes removed. */
542                 __pskb_pull(skb, offset);
543                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
544                 skb->pkt_type = PACKET_HOST;
545 #ifdef CONFIG_NET_IPGRE_BROADCAST
546                 if (ipv4_is_multicast(iph->daddr)) {
547                         /* Looped back packet, drop it! */
548                         if (skb->rtable->fl.iif == 0)
549                                 goto drop;
550                         stats->multicast++;
551                         skb->pkt_type = PACKET_BROADCAST;
552                 }
553 #endif
554
                    /* Drop on a bad checksum, or when the tunnel requires a
                     * checksum and the packet has none. */
555                 if (((flags&GRE_CSUM) && csum) ||
556                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
557                         stats->rx_crc_errors++;
558                         stats->rx_errors++;
559                         goto drop;
560                 }
                    /* With sequencing enabled on input, drop packets that
                     * lack a sequence number or that are older than the next
                     * expected one (signed difference handles wraparound). */
561                 if (tunnel->parms.i_flags&GRE_SEQ) {
562                         if (!(flags&GRE_SEQ) ||
563                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
564                                 stats->rx_fifo_errors++;
565                                 stats->rx_errors++;
566                                 goto drop;
567                         }
568                         tunnel->i_seqno = seqno + 1;
569                 }
570
                    /* Remember the length before the Ethernet header (if
                     * any) is pulled; this is what rx_bytes accounts. */
571                 len = skb->len;
572
573                 /* Warning: All skb pointers will be invalidated! */
574                 if (tunnel->dev->type == ARPHRD_ETHER) {
575                         if (!pskb_may_pull(skb, ETH_HLEN)) {
576                                 stats->rx_length_errors++;
577                                 stats->rx_errors++;
578                                 goto drop;
579                         }
580
                            /* Re-read the outer header pointer after the
                             * pull above; it is used for ECN below. */
581                         iph = ip_hdr(skb);
582                         skb->protocol = eth_type_trans(skb, tunnel->dev);
583                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
584                 }
585
586                 stats->rx_packets++;
587                 stats->rx_bytes += len;
                    /* Re-home the packet on the tunnel device and clear the
                     * route and netfilter state attached to the skb. */
588                 skb->dev = tunnel->dev;
589                 dst_release(skb->dst);
590                 skb->dst = NULL;
591                 nf_reset(skb);
592
593                 skb_reset_network_header(skb);
                    /* iph still refers to the outer IP header. */
594                 ipgre_ecn_decapsulate(iph, skb);
595
596                 netif_rx(skb);
597                 read_unlock(&ipgre_lock);
598                 return(0);
599         }
            /* No tunnel (not even the fallback) accepted the packet. */
600         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
601
602 drop:
603         read_unlock(&ipgre_lock);
604 drop_nolock:
605         kfree_skb(skb);
606         return(0);
607 }
608
/*
 * ipgre_tunnel_xmit - transmit routine of a GRE tunnel device.
 *
 * Picks the outer destination (configured, or derived from the inner
 * packet's route/neighbour for NBMA tunnels), routes the outer packet,
 * performs path-MTU handling for IPv4/IPv6 payloads, makes sure there is
 * enough writable headroom, then builds the outer IPv4 header and the GRE
 * header (flags, protocol, optional checksum/key/sequence) in front of the
 * payload and sends it through IPTUNNEL_XMIT().
 *
 * The skb is consumed on every path and the function always returns 0.
 * Failures are accounted in the device statistics.
 */
609 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
610 {
611         struct ip_tunnel *tunnel = netdev_priv(dev);
612         struct net_device_stats *stats = &tunnel->dev->stats;
613         struct iphdr  *old_iph = ip_hdr(skb);
614         struct iphdr  *tiph;
615         u8     tos;
616         __be16 df;
617         struct rtable *rt;                      /* Route to the other host */
618         struct net_device *tdev;                        /* Device to other host */
619         struct iphdr  *iph;                     /* Our new IP header */
620         unsigned int max_headroom;              /* The extra header space needed */
621         int    gre_hlen;
622         __be32 dst;
623         int    mtu;
624
            /* Recursion guard (see "Problems & solutions" at the top of the
             * file): a nested transmit on the same tunnel is dropped.  Every
             * exit path below decrements the counter again. */
625         if (tunnel->recursion++) {
626                 stats->collisions++;
627                 goto tx_error;
628         }
629
630         if (dev->type == ARPHRD_ETHER)
631                 IPCB(skb)->flags = 0;
632
            /* tiph is the template for the outer IP header.  With header_ops
             * on an IPGRE device it is read from the front of the packet and
             * nothing more is pushed (gre_hlen = 0) -- presumably the
             * header_ops callback already built it; confirm against the
             * header_ops definition.  Otherwise the tunnel's configured
             * header is used and tunnel->hlen bytes are pushed. */
633         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
634                 gre_hlen = 0;
635                 tiph = (struct iphdr*)skb->data;
636         } else {
637                 gre_hlen = tunnel->hlen;
638                 tiph = &tunnel->parms.iph;
639         }
640
641         if ((dst = tiph->daddr) == 0) {
642                 /* NBMA tunnel */
643
644                 if (skb->dst == NULL) {
645                         stats->tx_fifo_errors++;
646                         goto tx_error;
647                 }
648
                    /* IPv4 payload: the outer destination is the gateway of
                     * the inner packet's route. */
649                 if (skb->protocol == htons(ETH_P_IP)) {
650                         rt = skb->rtable;
651                         if ((dst = rt->rt_gateway) == 0)
652                                 goto tx_error_icmp;
653                 }
654 #ifdef CONFIG_IPV6
                    /* IPv6 payload: take the neighbour's address (or, if that
                     * is unspecified, the packet's destination); it must be
                     * an IPv4-compatible address, whose last 32 bits become
                     * the outer destination. */
655                 else if (skb->protocol == htons(ETH_P_IPV6)) {
656                         struct in6_addr *addr6;
657                         int addr_type;
658                         struct neighbour *neigh = skb->dst->neighbour;
659
660                         if (neigh == NULL)
661                                 goto tx_error;
662
663                         addr6 = (struct in6_addr*)&neigh->primary_key;
664                         addr_type = ipv6_addr_type(addr6);
665
666                         if (addr_type == IPV6_ADDR_ANY) {
667                                 addr6 = &ipv6_hdr(skb)->daddr;
668                                 addr_type = ipv6_addr_type(addr6);
669                         }
670
671                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
672                                 goto tx_error_icmp;
673
674                         dst = addr6->s6_addr32[3];
675                 }
676 #endif
677                 else
678                         goto tx_error;
679         }
680
            /* Bit 0 of the template TOS means "inherit": for an IPv4 payload
             * the inner TOS is used; the marker bit itself is cleared. */
681         tos = tiph->tos;
682         if (tos&1) {
683                 if (skb->protocol == htons(ETH_P_IP))
684                         tos = old_iph->tos;
685                 tos &= ~1;
686         }
687
            /* Route the outer (encapsulating) packet. */
688         {
689                 struct flowi fl = { .oif = tunnel->parms.link,
690                                     .nl_u = { .ip4_u =
691                                               { .daddr = dst,
692                                                 .saddr = tiph->saddr,
693                                                 .tos = RT_TOS(tos) } },
694                                     .proto = IPPROTO_GRE };
695                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
696                         stats->tx_carrier_errors++;
697                         goto tx_error;
698                 }
699         }
700         tdev = rt->u.dst.dev;
701
            /* The outer route points back at this tunnel: local loop. */
702         if (tdev == dev) {
703                 ip_rt_put(rt);
704                 stats->collisions++;
705                 goto tx_error;
706         }
707
            /* With DF set in the template, the usable MTU is the outer path
             * MTU minus our headers; otherwise the inner route's MTU (or the
             * device MTU) is used as is. */
708         df = tiph->frag_off;
709         if (df)
710                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
711         else
712                 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
713
714         if (skb->dst)
715                 skb->dst->ops->update_pmtu(skb->dst, mtu);
716
717         if (skb->protocol == htons(ETH_P_IP)) {
                    /* Propagate the inner DF bit to the outer header. */
718                 df |= (old_iph->frag_off&htons(IP_DF));
719
720                 if ((old_iph->frag_off&htons(IP_DF)) &&
721                     mtu < ntohs(old_iph->tot_len)) {
722                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
723                         ip_rt_put(rt);
724                         goto tx_error;
725                 }
726         }
727 #ifdef CONFIG_IPV6
728         else if (skb->protocol == htons(ETH_P_IPV6)) {
729                 struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
730
                    /* Lower the MTU metric on the inner IPv6 route, but only
                     * for point-to-point tunnels or host (/128) routes. */
731                 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
732                         if ((tunnel->parms.iph.daddr &&
733                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
734                             rt6->rt6i_dst.plen == 128) {
735                                 rt6->rt6i_flags |= RTF_MODIFIED;
736                                 skb->dst->metrics[RTAX_MTU-1] = mtu;
737                         }
738                 }
739
740                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
741                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
742                         ip_rt_put(rt);
743                         goto tx_error;
744                 }
745         }
746 #endif
747
            /* Consume soft error state recorded by ipgre_err(): while recent
             * errors are pending, report a link failure for this packet's
             * destination (transmission still continues below). */
748         if (tunnel->err_count > 0) {
749                 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
750                         tunnel->err_count--;
751
752                         dst_link_failure(skb);
753                 } else
754                         tunnel->err_count = 0;
755         }
756
757         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
758
            /* Get a private skb with enough headroom if the current one is
             * too small, shared, or a clone whose header is not writable. */
759         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
760             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
761                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
762                 if (!new_skb) {
763                         ip_rt_put(rt);
764                         stats->tx_dropped++;
765                         dev_kfree_skb(skb);
766                         tunnel->recursion--;
767                         return 0;
768                 }
769                 if (skb->sk)
770                         skb_set_owner_w(new_skb, skb->sk);
771                 dev_kfree_skb(skb);
772                 skb = new_skb;
                    /* The data moved: refresh the inner header pointer. */
773                 old_iph = ip_hdr(skb);
774         }
775
            /* The old data start becomes the transport header; the pushed
             * gre_hlen bytes in front of it become the new network header. */
776         skb_reset_transport_header(skb);
777         skb_push(skb, gre_hlen);
778         skb_reset_network_header(skb);
779         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
780         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
781                               IPSKB_REROUTED);
            /* Replace the inner route with the outer one; the skb now owns
             * the reference obtained by ip_route_output_key(). */
782         dst_release(skb->dst);
783         skb->dst = &rt->u.dst;
784
785         /*
786          *      Push down and install the IPIP header.
787          */
788
789         iph                     =       ip_hdr(skb);
790         iph->version            =       4;
791         iph->ihl                =       sizeof(struct iphdr) >> 2;
792         iph->frag_off           =       df;
793         iph->protocol           =       IPPROTO_GRE;
794         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
795         iph->daddr              =       rt->rt_dst;
796         iph->saddr              =       rt->rt_src;
797
            /* A template TTL of 0 means "inherit": copy the inner TTL / hop
             * limit, or fall back to the route's hop-limit metric. */
798         if ((iph->ttl = tiph->ttl) == 0) {
799                 if (skb->protocol == htons(ETH_P_IP))
800                         iph->ttl = old_iph->ttl;
801 #ifdef CONFIG_IPV6
802                 else if (skb->protocol == htons(ETH_P_IPV6))
803                         iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
804 #endif
805                 else
806                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
807         }
808
            /* GRE base header directly after the IP header: flags word, then
             * protocol type (ETH_P_TEB for Ethernet-type tunnel devices). */
809         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
810         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
811                                    htons(ETH_P_TEB) : skb->protocol;
812
            /* Optional fields are filled in back to front, starting at the
             * last 32-bit word of the header: sequence, key, checksum. */
813         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
814                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
815
816                 if (tunnel->parms.o_flags&GRE_SEQ) {
817                         ++tunnel->o_seqno;
818                         *ptr = htonl(tunnel->o_seqno);
819                         ptr--;
820                 }
821                 if (tunnel->parms.o_flags&GRE_KEY) {
822                         *ptr = tunnel->parms.o_key;
823                         ptr--;
824                 }
825                 if (tunnel->parms.o_flags&GRE_CSUM) {
                            /* Zero the field first, then checksum everything
                             * after the outer IP header. */
826                         *ptr = 0;
827                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
828                 }
829         }
830
831         nf_reset(skb);
832
            /* NOTE(review): IPTUNNEL_XMIT() is a macro defined outside this
             * file and takes no arguments, so it presumably refers to local
             * variables of this function by name -- check its definition
             * before renaming any of them. */
833         IPTUNNEL_XMIT();
834         tunnel->recursion--;
835         return 0;
836
837 tx_error_icmp:
838         dst_link_failure(skb);
839
840 tx_error:
841         stats->tx_errors++;
842         dev_kfree_skb(skb);
843         tunnel->recursion--;
844         return 0;
845 }
846
847 static int ipgre_tunnel_bind_dev(struct net_device *dev)
848 {
849         struct net_device *tdev = NULL;
850         struct ip_tunnel *tunnel;
851         struct iphdr *iph;
852         int hlen = LL_MAX_HEADER;
853         int mtu = ETH_DATA_LEN;
854         int addend = sizeof(struct iphdr) + 4;
855
856         tunnel = netdev_priv(dev);
857         iph = &tunnel->parms.iph;
858
859         /* Guess output device to choose reasonable mtu and needed_headroom */
860
861         if (iph->daddr) {
862                 struct flowi fl = { .oif = tunnel->parms.link,
863                                     .nl_u = { .ip4_u =
864                                               { .daddr = iph->daddr,
865                                                 .saddr = iph->saddr,
866                                                 .tos = RT_TOS(iph->tos) } },
867                                     .proto = IPPROTO_GRE };
868                 struct rtable *rt;
869                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
870                         tdev = rt->u.dst.dev;
871                         ip_rt_put(rt);
872                 }
873
874                 if (dev->type != ARPHRD_ETHER)
875                         dev->flags |= IFF_POINTOPOINT;
876         }
877
878         if (!tdev && tunnel->parms.link)
879                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
880
881         if (tdev) {
882                 hlen = tdev->hard_header_len + tdev->needed_headroom;
883                 mtu = tdev->mtu;
884         }
885         dev->iflink = tunnel->parms.link;
886
887         /* Precalculate GRE options length */
888         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
889                 if (tunnel->parms.o_flags&GRE_CSUM)
890                         addend += 4;
891                 if (tunnel->parms.o_flags&GRE_KEY)
892                         addend += 4;
893                 if (tunnel->parms.o_flags&GRE_SEQ)
894                         addend += 4;
895         }
896         dev->needed_headroom = addend + hlen;
897         mtu -= dev->hard_header_len - addend;
898
899         if (mtu < 68)
900                 mtu = 68;
901
902         tunnel->hlen = addend;
903
904         return mtu;
905 }
906
907 static int
908 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
909 {
910         int err = 0;
911         struct ip_tunnel_parm p;
912         struct ip_tunnel *t;
913         struct net *net = dev_net(dev);
914         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
915
916         switch (cmd) {
917         case SIOCGETTUNNEL:
918                 t = NULL;
919                 if (dev == ign->fb_tunnel_dev) {
920                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
921                                 err = -EFAULT;
922                                 break;
923                         }
924                         t = ipgre_tunnel_locate(net, &p, 0);
925                 }
926                 if (t == NULL)
927                         t = netdev_priv(dev);
928                 memcpy(&p, &t->parms, sizeof(p));
929                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
930                         err = -EFAULT;
931                 break;
932
933         case SIOCADDTUNNEL:
934         case SIOCCHGTUNNEL:
935                 err = -EPERM;
936                 if (!capable(CAP_NET_ADMIN))
937                         goto done;
938
939                 err = -EFAULT;
940                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
941                         goto done;
942
943                 err = -EINVAL;
944                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
945                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
946                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
947                         goto done;
948                 if (p.iph.ttl)
949                         p.iph.frag_off |= htons(IP_DF);
950
951                 if (!(p.i_flags&GRE_KEY))
952                         p.i_key = 0;
953                 if (!(p.o_flags&GRE_KEY))
954                         p.o_key = 0;
955
956                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
957
958                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
959                         if (t != NULL) {
960                                 if (t->dev != dev) {
961                                         err = -EEXIST;
962                                         break;
963                                 }
964                         } else {
965                                 unsigned nflags=0;
966
967                                 t = netdev_priv(dev);
968
969                                 if (ipv4_is_multicast(p.iph.daddr))
970                                         nflags = IFF_BROADCAST;
971                                 else if (p.iph.daddr)
972                                         nflags = IFF_POINTOPOINT;
973
974                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
975                                         err = -EINVAL;
976                                         break;
977                                 }
978                                 ipgre_tunnel_unlink(ign, t);
979                                 t->parms.iph.saddr = p.iph.saddr;
980                                 t->parms.iph.daddr = p.iph.daddr;
981                                 t->parms.i_key = p.i_key;
982                                 t->parms.o_key = p.o_key;
983                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
984                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
985                                 ipgre_tunnel_link(ign, t);
986                                 netdev_state_change(dev);
987                         }
988                 }
989
990                 if (t) {
991                         err = 0;
992                         if (cmd == SIOCCHGTUNNEL) {
993                                 t->parms.iph.ttl = p.iph.ttl;
994                                 t->parms.iph.tos = p.iph.tos;
995                                 t->parms.iph.frag_off = p.iph.frag_off;
996                                 if (t->parms.link != p.link) {
997                                         t->parms.link = p.link;
998                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
999                                         netdev_state_change(dev);
1000                                 }
1001                         }
1002                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1003                                 err = -EFAULT;
1004                 } else
1005                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1006                 break;
1007
1008         case SIOCDELTUNNEL:
1009                 err = -EPERM;
1010                 if (!capable(CAP_NET_ADMIN))
1011                         goto done;
1012
1013                 if (dev == ign->fb_tunnel_dev) {
1014                         err = -EFAULT;
1015                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1016                                 goto done;
1017                         err = -ENOENT;
1018                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1019                                 goto done;
1020                         err = -EPERM;
1021                         if (t == netdev_priv(ign->fb_tunnel_dev))
1022                                 goto done;
1023                         dev = t->dev;
1024                 }
1025                 unregister_netdevice(dev);
1026                 err = 0;
1027                 break;
1028
1029         default:
1030                 err = -EINVAL;
1031         }
1032
1033 done:
1034         return err;
1035 }
1036
1037 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1038 {
1039         struct ip_tunnel *tunnel = netdev_priv(dev);
1040         if (new_mtu < 68 ||
1041             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1042                 return -EINVAL;
1043         dev->mtu = new_mtu;
1044         return 0;
1045 }
1046
1047 /* Nice toy. Unfortunately, useless in real life :-)
1048    It allows to construct virtual multiprotocol broadcast "LAN"
1049    over the Internet, provided multicast routing is tuned.
1050
1051
1052    I have no idea was this bicycle invented before me,
1053    so that I had to set ARPHRD_IPGRE to a random value.
1054    I have an impression, that Cisco could make something similar,
1055    but this feature is apparently missing in IOS<=11.2(8).
1056
1057    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1058    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1059
1060    ping -t 255 224.66.66.66
1061
1062    If nobody answers, mbone does not work.
1063
1064    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1065    ip addr add 10.66.66.<somewhat>/24 dev Universe
1066    ifconfig Universe up
1067    ifconfig Universe add fe80::<Your_real_addr>/10
1068    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1069    ftp 10.66.66.66
1070    ...
1071    ftp fec0:6666:6666::193.233.7.65
1072    ...
1073
1074  */
1075
1076 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1077                         unsigned short type,
1078                         const void *daddr, const void *saddr, unsigned len)
1079 {
1080         struct ip_tunnel *t = netdev_priv(dev);
1081         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1082         __be16 *p = (__be16*)(iph+1);
1083
1084         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1085         p[0]            = t->parms.o_flags;
1086         p[1]            = htons(type);
1087
1088         /*
1089          *      Set the source hardware address.
1090          */
1091
1092         if (saddr)
1093                 memcpy(&iph->saddr, saddr, 4);
1094
1095         if (daddr) {
1096                 memcpy(&iph->daddr, daddr, 4);
1097                 return t->hlen;
1098         }
1099         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1100                 return t->hlen;
1101
1102         return -t->hlen;
1103 }
1104
1105 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1106 {
1107         struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
1108         memcpy(haddr, &iph->saddr, 4);
1109         return 4;
1110 }
1111
1112 static const struct header_ops ipgre_header_ops = {
1113         .create = ipgre_header,
1114         .parse  = ipgre_header_parse,
1115 };
1116
1117 #ifdef CONFIG_NET_IPGRE_BROADCAST
1118 static int ipgre_open(struct net_device *dev)
1119 {
1120         struct ip_tunnel *t = netdev_priv(dev);
1121
1122         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1123                 struct flowi fl = { .oif = t->parms.link,
1124                                     .nl_u = { .ip4_u =
1125                                               { .daddr = t->parms.iph.daddr,
1126                                                 .saddr = t->parms.iph.saddr,
1127                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1128                                     .proto = IPPROTO_GRE };
1129                 struct rtable *rt;
1130                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1131                         return -EADDRNOTAVAIL;
1132                 dev = rt->u.dst.dev;
1133                 ip_rt_put(rt);
1134                 if (__in_dev_get_rtnl(dev) == NULL)
1135                         return -EADDRNOTAVAIL;
1136                 t->mlink = dev->ifindex;
1137                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1138         }
1139         return 0;
1140 }
1141
1142 static int ipgre_close(struct net_device *dev)
1143 {
1144         struct ip_tunnel *t = netdev_priv(dev);
1145         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1146                 struct in_device *in_dev;
1147                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1148                 if (in_dev) {
1149                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1150                         in_dev_put(in_dev);
1151                 }
1152         }
1153         return 0;
1154 }
1155
1156 #endif
1157
1158 static void ipgre_tunnel_setup(struct net_device *dev)
1159 {
1160         dev->init               = ipgre_tunnel_init;
1161         dev->uninit             = ipgre_tunnel_uninit;
1162         dev->destructor         = free_netdev;
1163         dev->hard_start_xmit    = ipgre_tunnel_xmit;
1164         dev->do_ioctl           = ipgre_tunnel_ioctl;
1165         dev->change_mtu         = ipgre_tunnel_change_mtu;
1166
1167         dev->type               = ARPHRD_IPGRE;
1168         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1169         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1170         dev->flags              = IFF_NOARP;
1171         dev->iflink             = 0;
1172         dev->addr_len           = 4;
1173         dev->features           |= NETIF_F_NETNS_LOCAL;
1174 }
1175
1176 static int ipgre_tunnel_init(struct net_device *dev)
1177 {
1178         struct ip_tunnel *tunnel;
1179         struct iphdr *iph;
1180
1181         tunnel = netdev_priv(dev);
1182         iph = &tunnel->parms.iph;
1183
1184         tunnel->dev = dev;
1185         strcpy(tunnel->parms.name, dev->name);
1186
1187         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1188         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1189
1190         if (iph->daddr) {
1191 #ifdef CONFIG_NET_IPGRE_BROADCAST
1192                 if (ipv4_is_multicast(iph->daddr)) {
1193                         if (!iph->saddr)
1194                                 return -EINVAL;
1195                         dev->flags = IFF_BROADCAST;
1196                         dev->header_ops = &ipgre_header_ops;
1197                         dev->open = ipgre_open;
1198                         dev->stop = ipgre_close;
1199                 }
1200 #endif
1201         } else
1202                 dev->header_ops = &ipgre_header_ops;
1203
1204         return 0;
1205 }
1206
1207 static int ipgre_fb_tunnel_init(struct net_device *dev)
1208 {
1209         struct ip_tunnel *tunnel = netdev_priv(dev);
1210         struct iphdr *iph = &tunnel->parms.iph;
1211         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1212
1213         tunnel->dev = dev;
1214         strcpy(tunnel->parms.name, dev->name);
1215
1216         iph->version            = 4;
1217         iph->protocol           = IPPROTO_GRE;
1218         iph->ihl                = 5;
1219         tunnel->hlen            = sizeof(struct iphdr) + 4;
1220
1221         dev_hold(dev);
1222         ign->tunnels_wc[0]      = tunnel;
1223         return 0;
1224 }
1225
1226
1227 static struct net_protocol ipgre_protocol = {
1228         .handler        =       ipgre_rcv,
1229         .err_handler    =       ipgre_err,
1230         .netns_ok       =       1,
1231 };
1232
1233 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1234 {
1235         int prio;
1236
1237         for (prio = 0; prio < 4; prio++) {
1238                 int h;
1239                 for (h = 0; h < HASH_SIZE; h++) {
1240                         struct ip_tunnel *t;
1241                         while ((t = ign->tunnels[prio][h]) != NULL)
1242                                 unregister_netdevice(t->dev);
1243                 }
1244         }
1245 }
1246
1247 static int ipgre_init_net(struct net *net)
1248 {
1249         int err;
1250         struct ipgre_net *ign;
1251
1252         err = -ENOMEM;
1253         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1254         if (ign == NULL)
1255                 goto err_alloc;
1256
1257         err = net_assign_generic(net, ipgre_net_id, ign);
1258         if (err < 0)
1259                 goto err_assign;
1260
1261         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1262                                            ipgre_tunnel_setup);
1263         if (!ign->fb_tunnel_dev) {
1264                 err = -ENOMEM;
1265                 goto err_alloc_dev;
1266         }
1267
1268         ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1269         dev_net_set(ign->fb_tunnel_dev, net);
1270         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1271
1272         if ((err = register_netdev(ign->fb_tunnel_dev)))
1273                 goto err_reg_dev;
1274
1275         return 0;
1276
1277 err_reg_dev:
1278         free_netdev(ign->fb_tunnel_dev);
1279 err_alloc_dev:
1280         /* nothing */
1281 err_assign:
1282         kfree(ign);
1283 err_alloc:
1284         return err;
1285 }
1286
1287 static void ipgre_exit_net(struct net *net)
1288 {
1289         struct ipgre_net *ign;
1290
1291         ign = net_generic(net, ipgre_net_id);
1292         rtnl_lock();
1293         ipgre_destroy_tunnels(ign);
1294         rtnl_unlock();
1295         kfree(ign);
1296 }
1297
1298 static struct pernet_operations ipgre_net_ops = {
1299         .init = ipgre_init_net,
1300         .exit = ipgre_exit_net,
1301 };
1302
1303 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1304 {
1305         __be16 flags;
1306
1307         if (!data)
1308                 return 0;
1309
1310         flags = 0;
1311         if (data[IFLA_GRE_IFLAGS])
1312                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1313         if (data[IFLA_GRE_OFLAGS])
1314                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1315         if (flags & (GRE_VERSION|GRE_ROUTING))
1316                 return -EINVAL;
1317
1318         return 0;
1319 }
1320
1321 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1322 {
1323         __be32 daddr;
1324
1325         if (tb[IFLA_ADDRESS]) {
1326                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1327                         return -EINVAL;
1328                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1329                         return -EADDRNOTAVAIL;
1330         }
1331
1332         if (!data)
1333                 goto out;
1334
1335         if (data[IFLA_GRE_REMOTE]) {
1336                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1337                 if (!daddr)
1338                         return -EINVAL;
1339         }
1340
1341 out:
1342         return ipgre_tunnel_validate(tb, data);
1343 }
1344
1345 static void ipgre_netlink_parms(struct nlattr *data[],
1346                                 struct ip_tunnel_parm *parms)
1347 {
1348         memset(parms, 0, sizeof(*parms));
1349
1350         parms->iph.protocol = IPPROTO_GRE;
1351
1352         if (!data)
1353                 return;
1354
1355         if (data[IFLA_GRE_LINK])
1356                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1357
1358         if (data[IFLA_GRE_IFLAGS])
1359                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1360
1361         if (data[IFLA_GRE_OFLAGS])
1362                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1363
1364         if (data[IFLA_GRE_IKEY])
1365                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1366
1367         if (data[IFLA_GRE_OKEY])
1368                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1369
1370         if (data[IFLA_GRE_LOCAL])
1371                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1372
1373         if (data[IFLA_GRE_REMOTE])
1374                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1375
1376         if (data[IFLA_GRE_TTL])
1377                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1378
1379         if (data[IFLA_GRE_TOS])
1380                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1381
1382         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1383                 parms->iph.frag_off = htons(IP_DF);
1384 }
1385
1386 static int ipgre_tap_init(struct net_device *dev)
1387 {
1388         struct ip_tunnel *tunnel;
1389
1390         tunnel = netdev_priv(dev);
1391
1392         tunnel->dev = dev;
1393         strcpy(tunnel->parms.name, dev->name);
1394
1395         ipgre_tunnel_bind_dev(dev);
1396
1397         return 0;
1398 }
1399
1400 static void ipgre_tap_setup(struct net_device *dev)
1401 {
1402
1403         ether_setup(dev);
1404
1405         dev->init               = ipgre_tap_init;
1406         dev->uninit             = ipgre_tunnel_uninit;
1407         dev->destructor         = free_netdev;
1408         dev->hard_start_xmit    = ipgre_tunnel_xmit;
1409         dev->change_mtu         = ipgre_tunnel_change_mtu;
1410
1411         dev->iflink             = 0;
1412         dev->features           |= NETIF_F_NETNS_LOCAL;
1413 }
1414
1415 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1416                          struct nlattr *data[])
1417 {
1418         struct ip_tunnel *nt;
1419         struct net *net = dev_net(dev);
1420         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1421         int mtu;
1422         int err;
1423
1424         nt = netdev_priv(dev);
1425         ipgre_netlink_parms(data, &nt->parms);
1426
1427         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1428                 return -EEXIST;
1429
1430         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1431                 random_ether_addr(dev->dev_addr);
1432
1433         mtu = ipgre_tunnel_bind_dev(dev);
1434         if (!tb[IFLA_MTU])
1435                 dev->mtu = mtu;
1436
1437         err = register_netdevice(dev);
1438         if (err)
1439                 goto out;
1440
1441         dev_hold(dev);
1442         ipgre_tunnel_link(ign, nt);
1443
1444 out:
1445         return err;
1446 }
1447
1448 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1449                             struct nlattr *data[])
1450 {
1451         struct ip_tunnel *t, *nt;
1452         struct net *net = dev_net(dev);
1453         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1454         struct ip_tunnel_parm p;
1455         int mtu;
1456
1457         if (dev == ign->fb_tunnel_dev)
1458                 return -EINVAL;
1459
1460         nt = netdev_priv(dev);
1461         ipgre_netlink_parms(data, &p);
1462
1463         t = ipgre_tunnel_locate(net, &p, 0);
1464
1465         if (t) {
1466                 if (t->dev != dev)
1467                         return -EEXIST;
1468         } else {
1469                 unsigned nflags = 0;
1470
1471                 t = nt;
1472
1473                 if (ipv4_is_multicast(p.iph.daddr))
1474                         nflags = IFF_BROADCAST;
1475                 else if (p.iph.daddr)
1476                         nflags = IFF_POINTOPOINT;
1477
1478                 if ((dev->flags ^ nflags) &
1479                     (IFF_POINTOPOINT | IFF_BROADCAST))
1480                         return -EINVAL;
1481
1482                 ipgre_tunnel_unlink(ign, t);
1483                 t->parms.iph.saddr = p.iph.saddr;
1484                 t->parms.iph.daddr = p.iph.daddr;
1485                 t->parms.i_key = p.i_key;
1486                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1487                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1488                 ipgre_tunnel_link(ign, t);
1489                 netdev_state_change(dev);
1490         }
1491
1492         t->parms.o_key = p.o_key;
1493         t->parms.iph.ttl = p.iph.ttl;
1494         t->parms.iph.tos = p.iph.tos;
1495         t->parms.iph.frag_off = p.iph.frag_off;
1496
1497         if (t->parms.link != p.link) {
1498                 t->parms.link = p.link;
1499                 mtu = ipgre_tunnel_bind_dev(dev);
1500                 if (!tb[IFLA_MTU])
1501                         dev->mtu = mtu;
1502                 netdev_state_change(dev);
1503         }
1504
1505         return 0;
1506 }
1507
1508 static size_t ipgre_get_size(const struct net_device *dev)
1509 {
1510         return
1511                 /* IFLA_GRE_LINK */
1512                 nla_total_size(4) +
1513                 /* IFLA_GRE_IFLAGS */
1514                 nla_total_size(2) +
1515                 /* IFLA_GRE_OFLAGS */
1516                 nla_total_size(2) +
1517                 /* IFLA_GRE_IKEY */
1518                 nla_total_size(4) +
1519                 /* IFLA_GRE_OKEY */
1520                 nla_total_size(4) +
1521                 /* IFLA_GRE_LOCAL */
1522                 nla_total_size(4) +
1523                 /* IFLA_GRE_REMOTE */
1524                 nla_total_size(4) +
1525                 /* IFLA_GRE_TTL */
1526                 nla_total_size(1) +
1527                 /* IFLA_GRE_TOS */
1528                 nla_total_size(1) +
1529                 /* IFLA_GRE_PMTUDISC */
1530                 nla_total_size(1) +
1531                 0;
1532 }
1533
1534 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1535 {
1536         struct ip_tunnel *t = netdev_priv(dev);
1537         struct ip_tunnel_parm *p = &t->parms;
1538
1539         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1540         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1541         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1542         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1543         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1544         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1545         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1546         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1547         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1548         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1549
1550         return 0;
1551
1552 nla_put_failure:
1553         return -EMSGSIZE;
1554 }
1555
1556 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1557         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1558         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1559         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1560         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1561         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1562         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1563         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1564         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1565         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1566         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1567 };
1568
1569 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1570         .kind           = "gre",
1571         .maxtype        = IFLA_GRE_MAX,
1572         .policy         = ipgre_policy,
1573         .priv_size      = sizeof(struct ip_tunnel),
1574         .setup          = ipgre_tunnel_setup,
1575         .validate       = ipgre_tunnel_validate,
1576         .newlink        = ipgre_newlink,
1577         .changelink     = ipgre_changelink,
1578         .get_size       = ipgre_get_size,
1579         .fill_info      = ipgre_fill_info,
1580 };
1581
1582 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1583         .kind           = "gretap",
1584         .maxtype        = IFLA_GRE_MAX,
1585         .policy         = ipgre_policy,
1586         .priv_size      = sizeof(struct ip_tunnel),
1587         .setup          = ipgre_tap_setup,
1588         .validate       = ipgre_tap_validate,
1589         .newlink        = ipgre_newlink,
1590         .changelink     = ipgre_changelink,
1591         .get_size       = ipgre_get_size,
1592         .fill_info      = ipgre_fill_info,
1593 };
1594
1595 /*
1596  *      And now the modules code and kernel interface.
1597  */
1598
1599 static int __init ipgre_init(void)
1600 {
1601         int err;
1602
1603         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1604
1605         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1606                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1607                 return -EAGAIN;
1608         }
1609
1610         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1611         if (err < 0)
1612                 goto gen_device_failed;
1613
1614         err = rtnl_link_register(&ipgre_link_ops);
1615         if (err < 0)
1616                 goto rtnl_link_failed;
1617
1618         err = rtnl_link_register(&ipgre_tap_ops);
1619         if (err < 0)
1620                 goto tap_ops_failed;
1621
1622 out:
1623         return err;
1624
1625 tap_ops_failed:
1626         rtnl_link_unregister(&ipgre_link_ops);
1627 rtnl_link_failed:
1628         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1629 gen_device_failed:
1630         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1631         goto out;
1632 }
1633
1634 static void __exit ipgre_fini(void)
1635 {
1636         rtnl_link_unregister(&ipgre_tap_ops);
1637         rtnl_link_unregister(&ipgre_link_ops);
1638         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1639         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1640                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1641 }
1642
1643 module_init(ipgre_init);
1644 module_exit(ipgre_fini);
1645 MODULE_LICENSE("GPL");
1646 MODULE_ALIAS_RTNL_LINK("gre");
1647 MODULE_ALIAS_RTNL_LINK("gretap");