Merge branch 'master' of /home/davem/src/GIT/linux-2.6/
[linux-2.6] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 /*
54    Problems & solutions
55    --------------------
56
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
66    solution, but it supposes maintaining new variable in ALL
67    skb, even if no tunneling is used.
68
69    Current solution: t->recursion lock breaks dead loops. It looks
70    like dev->tbusy flag, but I preferred new variable, because
71    the semantics is different. One day, when hard_start_xmit
72    will be multithreaded we will have to use skb->encapsulation.
73
74
75
76    2. Networking dead loops would not kill routers, but would really
77    kill network. IP hop limit plays role of "t->recursion" in this case,
78    if we copy it from packet being encapsulated to upper header.
79    It is very good solution, but it introduces two problems:
80
81    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82      do not work over tunnels.
83    - traceroute does not work. I planned to relay ICMP from tunnel,
84      so that this problem would be solved and traceroute output
85      would be even more informative. This idea appeared to be wrong:
86      only Linux complies to rfc1812 now (yes, guys, Linux is the only
87      true router now :-)), all routers (at least, in neighbourhood of mine)
88      return only 8 bytes of payload. It is the end.
89
90    Hence, if we want that OSPF worked or traceroute said something reasonable,
91    we should search for another solution.
92
93    One of them is to parse packet trying to detect inner encapsulation
94    made by our node. It is difficult or even impossible, especially,
95    taking into account fragmentation. To be short, it is not a solution at all.
96
97    Current solution: The solution was UNEXPECTEDLY SIMPLE.
98    We force DF flag on tunnels with preconfigured hop limit,
99    that is ALL. :-) Well, it does not remove the problem completely,
100    but exponential growth of network traffic is changed to linear
101    (branches, that exceed pmtu are pruned) and tunnel mtu
102    fastly degrades to value <68, where looping stops.
103    Yes, it is not good if there exists a router in the loop,
104    which does not force DF, even when encapsulating packets have DF set.
105    But it is not our problem! Nobody could accuse us, we made
106    all that we could make. Even if it is your gated who injected
107    fatal route to network, even if it were you who configured
108    fatal static route: you are innocent. :-)
109
110
111
112    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113    practically identical code. It would be good to glue them
114    together, but it is not very evident, how to make them modular.
115    sit is integral part of IPv6, ipip and gre are naturally modular.
116    We could extract common parts (hash table, ioctl etc)
117    to a separate module (ip_tunnel.c).
118
119    Alexey Kuznetsov.
120  */
121
122 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
123 static int ipgre_tunnel_init(struct net_device *dev);
124 static void ipgre_tunnel_setup(struct net_device *dev);
125 static int ipgre_tunnel_bind_dev(struct net_device *dev);
126
127 /* Fallback tunnel: no source, no destination, no key, no options */
128
/* Number of chains in each tunnel hash table. */
#define HASH_SIZE  16

/* Id passed to net_generic() to fetch the per-namespace GRE state. */
static int ipgre_net_id;

/* GRE state kept per network namespace. */
struct ipgre_net {
        /* Four hash tables of HASH_SIZE chains each: tunnels[table][hash]. */
        struct ip_tunnel *tunnels[4][HASH_SIZE];

        /* Device whose tunnel the input lookup falls back to. */
        struct net_device *fb_tunnel_dev;
};
137
138 /* Tunnel hash table */
139
140 /*
141    4 hash tables:
142
143    3: (remote,local)
144    2: (remote,*)
145    1: (*,local)
146    0: (*,*)
147
148    We require exact key match i.e. if a key is present in packet
149    it will match only tunnel with the same key; if it is not present,
150    it will match only keyless tunnel.
151
152    All keyless packets, if not matched configured keyless tunnels
153    will match fallback tunnel.
154  */
155
/*
 * Fold a 32-bit address or key into a chain index in the range 0..15.
 *
 * The argument is fully parenthesized so that an expression argument is
 * cast and shifted as a whole. Note that it is evaluated twice, so it
 * must not have side effects.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)
157
/* Shorthand for the four tables, from least to most specific. */
#define tunnels_wc      tunnels[0]      /* (*,*)          */
#define tunnels_l       tunnels[1]      /* (*,local)      */
#define tunnels_r       tunnels[2]      /* (remote,*)     */
#define tunnels_r_l     tunnels[3]      /* (remote,local) */

/*
 * Write-locked while a hash chain is being modified, read-locked by the
 * receive and ICMP error paths around tunnel lookup.
 */
static DEFINE_RWLOCK(ipgre_lock);
164
165 /* Given src, dst and key, find appropriate for input tunnel. */
166
167 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
168                                               __be32 remote, __be32 local,
169                                               __be32 key, __be16 gre_proto)
170 {
171         struct net *net = dev_net(dev);
172         int link = dev->ifindex;
173         unsigned h0 = HASH(remote);
174         unsigned h1 = HASH(key);
175         struct ip_tunnel *t, *cand = NULL;
176         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
177         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
178                        ARPHRD_ETHER : ARPHRD_IPGRE;
179         int score, cand_score = 4;
180
181         for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
182                 if (local != t->parms.iph.saddr ||
183                     remote != t->parms.iph.daddr ||
184                     key != t->parms.i_key ||
185                     !(t->dev->flags & IFF_UP))
186                         continue;
187
188                 if (t->dev->type != ARPHRD_IPGRE &&
189                     t->dev->type != dev_type)
190                         continue;
191
192                 score = 0;
193                 if (t->parms.link != link)
194                         score |= 1;
195                 if (t->dev->type != dev_type)
196                         score |= 2;
197                 if (score == 0)
198                         return t;
199
200                 if (score < cand_score) {
201                         cand = t;
202                         cand_score = score;
203                 }
204         }
205
206         for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
207                 if (remote != t->parms.iph.daddr ||
208                     key != t->parms.i_key ||
209                     !(t->dev->flags & IFF_UP))
210                         continue;
211
212                 if (t->dev->type != ARPHRD_IPGRE &&
213                     t->dev->type != dev_type)
214                         continue;
215
216                 score = 0;
217                 if (t->parms.link != link)
218                         score |= 1;
219                 if (t->dev->type != dev_type)
220                         score |= 2;
221                 if (score == 0)
222                         return t;
223
224                 if (score < cand_score) {
225                         cand = t;
226                         cand_score = score;
227                 }
228         }
229
230         for (t = ign->tunnels_l[h1]; t; t = t->next) {
231                 if ((local != t->parms.iph.saddr &&
232                      (local != t->parms.iph.daddr ||
233                       !ipv4_is_multicast(local))) ||
234                     key != t->parms.i_key ||
235                     !(t->dev->flags & IFF_UP))
236                         continue;
237
238                 if (t->dev->type != ARPHRD_IPGRE &&
239                     t->dev->type != dev_type)
240                         continue;
241
242                 score = 0;
243                 if (t->parms.link != link)
244                         score |= 1;
245                 if (t->dev->type != dev_type)
246                         score |= 2;
247                 if (score == 0)
248                         return t;
249
250                 if (score < cand_score) {
251                         cand = t;
252                         cand_score = score;
253                 }
254         }
255
256         for (t = ign->tunnels_wc[h1]; t; t = t->next) {
257                 if (t->parms.i_key != key ||
258                     !(t->dev->flags & IFF_UP))
259                         continue;
260
261                 if (t->dev->type != ARPHRD_IPGRE &&
262                     t->dev->type != dev_type)
263                         continue;
264
265                 score = 0;
266                 if (t->parms.link != link)
267                         score |= 1;
268                 if (t->dev->type != dev_type)
269                         score |= 2;
270                 if (score == 0)
271                         return t;
272
273                 if (score < cand_score) {
274                         cand = t;
275                         cand_score = score;
276                 }
277         }
278
279         if (cand != NULL)
280                 return cand;
281
282         if (ign->fb_tunnel_dev->flags & IFF_UP)
283                 return netdev_priv(ign->fb_tunnel_dev);
284
285         return NULL;
286 }
287
288 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
289                 struct ip_tunnel_parm *parms)
290 {
291         __be32 remote = parms->iph.daddr;
292         __be32 local = parms->iph.saddr;
293         __be32 key = parms->i_key;
294         unsigned h = HASH(key);
295         int prio = 0;
296
297         if (local)
298                 prio |= 1;
299         if (remote && !ipv4_is_multicast(remote)) {
300                 prio |= 2;
301                 h ^= HASH(remote);
302         }
303
304         return &ign->tunnels[prio][h];
305 }
306
307 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
308                 struct ip_tunnel *t)
309 {
310         return __ipgre_bucket(ign, &t->parms);
311 }
312
313 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
314 {
315         struct ip_tunnel **tp = ipgre_bucket(ign, t);
316
317         t->next = *tp;
318         write_lock_bh(&ipgre_lock);
319         *tp = t;
320         write_unlock_bh(&ipgre_lock);
321 }
322
323 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
324 {
325         struct ip_tunnel **tp;
326
327         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
328                 if (t == *tp) {
329                         write_lock_bh(&ipgre_lock);
330                         *tp = t->next;
331                         write_unlock_bh(&ipgre_lock);
332                         break;
333                 }
334         }
335 }
336
337 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
338                                            struct ip_tunnel_parm *parms,
339                                            int type)
340 {
341         __be32 remote = parms->iph.daddr;
342         __be32 local = parms->iph.saddr;
343         __be32 key = parms->i_key;
344         int link = parms->link;
345         struct ip_tunnel *t, **tp;
346         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
347
348         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
349                 if (local == t->parms.iph.saddr &&
350                     remote == t->parms.iph.daddr &&
351                     key == t->parms.i_key &&
352                     link == t->parms.link &&
353                     type == t->dev->type)
354                         break;
355
356         return t;
357 }
358
359 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
360                 struct ip_tunnel_parm *parms, int create)
361 {
362         struct ip_tunnel *t, *nt;
363         struct net_device *dev;
364         char name[IFNAMSIZ];
365         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
366
367         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
368         if (t || !create)
369                 return t;
370
371         if (parms->name[0])
372                 strlcpy(name, parms->name, IFNAMSIZ);
373         else
374                 sprintf(name, "gre%%d");
375
376         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
377         if (!dev)
378           return NULL;
379
380         dev_net_set(dev, net);
381
382         if (strchr(name, '%')) {
383                 if (dev_alloc_name(dev, name) < 0)
384                         goto failed_free;
385         }
386
387         nt = netdev_priv(dev);
388         nt->parms = *parms;
389         dev->rtnl_link_ops = &ipgre_link_ops;
390
391         dev->mtu = ipgre_tunnel_bind_dev(dev);
392
393         if (register_netdevice(dev) < 0)
394                 goto failed_free;
395
396         dev_hold(dev);
397         ipgre_tunnel_link(ign, nt);
398         return nt;
399
400 failed_free:
401         free_netdev(dev);
402         return NULL;
403 }
404
405 static void ipgre_tunnel_uninit(struct net_device *dev)
406 {
407         struct net *net = dev_net(dev);
408         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
409
410         ipgre_tunnel_unlink(ign, netdev_priv(dev));
411         dev_put(dev);
412 }
413
414
415 static void ipgre_err(struct sk_buff *skb, u32 info)
416 {
417
418 /* All the routers (except for Linux) return only
419    8 bytes of packet payload. It means, that precise relaying of
420    ICMP in the real Internet is absolutely infeasible.
421
422    Moreover, Cisco "wise men" put GRE key to the third word
423    in GRE header. It makes impossible maintaining even soft state for keyed
424    GRE tunnels with enabled checksum. Tell them "thank you".
425
426    Well, I wonder, rfc1812 was written by Cisco employee,
427    what the hell these idiots break standrads established
428    by themself???
429  */
430
431         struct iphdr *iph = (struct iphdr *)skb->data;
432         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
433         int grehlen = (iph->ihl<<2) + 4;
434         const int type = icmp_hdr(skb)->type;
435         const int code = icmp_hdr(skb)->code;
436         struct ip_tunnel *t;
437         __be16 flags;
438
439         flags = p[0];
440         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
441                 if (flags&(GRE_VERSION|GRE_ROUTING))
442                         return;
443                 if (flags&GRE_KEY) {
444                         grehlen += 4;
445                         if (flags&GRE_CSUM)
446                                 grehlen += 4;
447                 }
448         }
449
450         /* If only 8 bytes returned, keyed message will be dropped here */
451         if (skb_headlen(skb) < grehlen)
452                 return;
453
454         switch (type) {
455         default:
456         case ICMP_PARAMETERPROB:
457                 return;
458
459         case ICMP_DEST_UNREACH:
460                 switch (code) {
461                 case ICMP_SR_FAILED:
462                 case ICMP_PORT_UNREACH:
463                         /* Impossible event. */
464                         return;
465                 case ICMP_FRAG_NEEDED:
466                         /* Soft state for pmtu is maintained by IP core. */
467                         return;
468                 default:
469                         /* All others are translated to HOST_UNREACH.
470                            rfc2003 contains "deep thoughts" about NET_UNREACH,
471                            I believe they are just ether pollution. --ANK
472                          */
473                         break;
474                 }
475                 break;
476         case ICMP_TIME_EXCEEDED:
477                 if (code != ICMP_EXC_TTL)
478                         return;
479                 break;
480         }
481
482         read_lock(&ipgre_lock);
483         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
484                                 flags & GRE_KEY ?
485                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
486                                 p[1]);
487         if (t == NULL || t->parms.iph.daddr == 0 ||
488             ipv4_is_multicast(t->parms.iph.daddr))
489                 goto out;
490
491         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
492                 goto out;
493
494         if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
495                 t->err_count++;
496         else
497                 t->err_count = 1;
498         t->err_time = jiffies;
499 out:
500         read_unlock(&ipgre_lock);
501         return;
502 }
503
504 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
505 {
506         if (INET_ECN_is_ce(iph->tos)) {
507                 if (skb->protocol == htons(ETH_P_IP)) {
508                         IP_ECN_set_ce(ip_hdr(skb));
509                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
510                         IP6_ECN_set_ce(ipv6_hdr(skb));
511                 }
512         }
513 }
514
515 static inline u8
516 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
517 {
518         u8 inner = 0;
519         if (skb->protocol == htons(ETH_P_IP))
520                 inner = old_iph->tos;
521         else if (skb->protocol == htons(ETH_P_IPV6))
522                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
523         return INET_ECN_encapsulate(tos, inner);
524 }
525
526 static int ipgre_rcv(struct sk_buff *skb)
527 {
528         struct iphdr *iph;
529         u8     *h;
530         __be16    flags;
531         __sum16   csum = 0;
532         __be32 key = 0;
533         u32    seqno = 0;
534         struct ip_tunnel *tunnel;
535         int    offset = 4;
536         __be16 gre_proto;
537         unsigned int len;
538
539         if (!pskb_may_pull(skb, 16))
540                 goto drop_nolock;
541
542         iph = ip_hdr(skb);
543         h = skb->data;
544         flags = *(__be16*)h;
545
546         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
547                 /* - Version must be 0.
548                    - We do not support routing headers.
549                  */
550                 if (flags&(GRE_VERSION|GRE_ROUTING))
551                         goto drop_nolock;
552
553                 if (flags&GRE_CSUM) {
554                         switch (skb->ip_summed) {
555                         case CHECKSUM_COMPLETE:
556                                 csum = csum_fold(skb->csum);
557                                 if (!csum)
558                                         break;
559                                 /* fall through */
560                         case CHECKSUM_NONE:
561                                 skb->csum = 0;
562                                 csum = __skb_checksum_complete(skb);
563                                 skb->ip_summed = CHECKSUM_COMPLETE;
564                         }
565                         offset += 4;
566                 }
567                 if (flags&GRE_KEY) {
568                         key = *(__be32*)(h + offset);
569                         offset += 4;
570                 }
571                 if (flags&GRE_SEQ) {
572                         seqno = ntohl(*(__be32*)(h + offset));
573                         offset += 4;
574                 }
575         }
576
577         gre_proto = *(__be16 *)(h + 2);
578
579         read_lock(&ipgre_lock);
580         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
581                                           iph->saddr, iph->daddr, key,
582                                           gre_proto))) {
583                 struct net_device_stats *stats = &tunnel->dev->stats;
584
585                 secpath_reset(skb);
586
587                 skb->protocol = gre_proto;
588                 /* WCCP version 1 and 2 protocol decoding.
589                  * - Change protocol to IP
590                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
591                  */
592                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
593                         skb->protocol = htons(ETH_P_IP);
594                         if ((*(h + offset) & 0xF0) != 0x40)
595                                 offset += 4;
596                 }
597
598                 skb->mac_header = skb->network_header;
599                 __pskb_pull(skb, offset);
600                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
601                 skb->pkt_type = PACKET_HOST;
602 #ifdef CONFIG_NET_IPGRE_BROADCAST
603                 if (ipv4_is_multicast(iph->daddr)) {
604                         /* Looped back packet, drop it! */
605                         if (skb->rtable->fl.iif == 0)
606                                 goto drop;
607                         stats->multicast++;
608                         skb->pkt_type = PACKET_BROADCAST;
609                 }
610 #endif
611
612                 if (((flags&GRE_CSUM) && csum) ||
613                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
614                         stats->rx_crc_errors++;
615                         stats->rx_errors++;
616                         goto drop;
617                 }
618                 if (tunnel->parms.i_flags&GRE_SEQ) {
619                         if (!(flags&GRE_SEQ) ||
620                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
621                                 stats->rx_fifo_errors++;
622                                 stats->rx_errors++;
623                                 goto drop;
624                         }
625                         tunnel->i_seqno = seqno + 1;
626                 }
627
628                 len = skb->len;
629
630                 /* Warning: All skb pointers will be invalidated! */
631                 if (tunnel->dev->type == ARPHRD_ETHER) {
632                         if (!pskb_may_pull(skb, ETH_HLEN)) {
633                                 stats->rx_length_errors++;
634                                 stats->rx_errors++;
635                                 goto drop;
636                         }
637
638                         iph = ip_hdr(skb);
639                         skb->protocol = eth_type_trans(skb, tunnel->dev);
640                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
641                 }
642
643                 stats->rx_packets++;
644                 stats->rx_bytes += len;
645                 skb->dev = tunnel->dev;
646                 dst_release(skb->dst);
647                 skb->dst = NULL;
648                 nf_reset(skb);
649
650                 skb_reset_network_header(skb);
651                 ipgre_ecn_decapsulate(iph, skb);
652
653                 netif_rx(skb);
654                 read_unlock(&ipgre_lock);
655                 return(0);
656         }
657         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
658
659 drop:
660         read_unlock(&ipgre_lock);
661 drop_nolock:
662         kfree_skb(skb);
663         return(0);
664 }
665
/*
 * ipgre_tunnel_xmit - transmit handler for GRE tunnel devices.
 * @skb: packet to encapsulate
 * @dev: tunnel device the packet is being sent on
 *
 * Chooses the outer destination, routes the encapsulated packet, applies
 * the path MTU rules, makes room for and fills in the outer IPv4 header
 * followed by the GRE header, and finally hands the packet over through
 * IPTUNNEL_XMIT().
 *
 * Always returns 0. On every failure path the skb is freed here and a
 * counter in the device statistics is incremented.
 *
 * NOTE(review): IPTUNNEL_XMIT() is a macro defined outside this file; it
 * presumably refers to locals of this function by name (skb, iph, rt,
 * stats) - confirm before renaming any of them.
 */
666 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
667 {
668         struct ip_tunnel *tunnel = netdev_priv(dev);
669         struct net_device_stats *stats = &tunnel->dev->stats;
670         struct iphdr  *old_iph = ip_hdr(skb);
671         struct iphdr  *tiph;
672         u8     tos;
673         __be16 df;
674         struct rtable *rt;                      /* Route to the other host */
675         struct net_device *tdev;                        /* Device to other host */
676         struct iphdr  *iph;                     /* Our new IP header */
677         unsigned int max_headroom;              /* The extra header space needed */
678         int    gre_hlen;
679         __be32 dst;
680         int    mtu;
681
        /* Already transmitting on this tunnel: a local dead loop, drop. */
682         if (tunnel->recursion++) {
683                 stats->collisions++;
684                 goto tx_error;
685         }
686
687         if (dev->type == ARPHRD_ETHER)
688                 IPCB(skb)->flags = 0;
689
        /*
         * Pick the outer header template. With header_ops on a plain GRE
         * device it is read from the front of the packet and no further
         * header bytes are pushed below (gre_hlen = 0) - presumably the
         * header was already built there by header_ops. Otherwise the
         * template configured on the tunnel is used.
         */
690         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
691                 gre_hlen = 0;
692                 tiph = (struct iphdr *)skb->data;
693         } else {
694                 gre_hlen = tunnel->hlen;
695                 tiph = &tunnel->parms.iph;
696         }
697
698         if ((dst = tiph->daddr) == 0) {
699                 /* NBMA tunnel */
700
701                 if (skb->dst == NULL) {
702                         stats->tx_fifo_errors++;
703                         goto tx_error;
704                 }
705
                /* IPv4 payload: send to the gateway of the inner route. */
706                 if (skb->protocol == htons(ETH_P_IP)) {
707                         rt = skb->rtable;
708                         if ((dst = rt->rt_gateway) == 0)
709                                 goto tx_error_icmp;
710                 }
711 #ifdef CONFIG_IPV6
                /*
                 * IPv6 payload: the outer destination is the last 32 bits
                 * of an IPv4-compatible address, taken from the neighbour
                 * entry or, if that is unspecified, from the packet's own
                 * destination address.
                 */
712                 else if (skb->protocol == htons(ETH_P_IPV6)) {
713                         struct in6_addr *addr6;
714                         int addr_type;
715                         struct neighbour *neigh = skb->dst->neighbour;
716
717                         if (neigh == NULL)
718                                 goto tx_error;
719
720                         addr6 = (struct in6_addr *)&neigh->primary_key;
721                         addr_type = ipv6_addr_type(addr6);
722
723                         if (addr_type == IPV6_ADDR_ANY) {
724                                 addr6 = &ipv6_hdr(skb)->daddr;
725                                 addr_type = ipv6_addr_type(addr6);
726                         }
727
728                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
729                                 goto tx_error_icmp;
730
731                         dst = addr6->s6_addr32[3];
732                 }
733 #endif
734                 else
735                         goto tx_error;
736         }
737
        /*
         * Low bit set in the template TOS: take the TOS of an inner IPv4
         * packet instead; the bit itself is always cleared.
         */
738         tos = tiph->tos;
739         if (tos&1) {
740                 if (skb->protocol == htons(ETH_P_IP))
741                         tos = old_iph->tos;
742                 tos &= ~1;
743         }
744
        /* Route the outer (GRE) packet. */
745         {
746                 struct flowi fl = { .oif = tunnel->parms.link,
747                                     .nl_u = { .ip4_u =
748                                               { .daddr = dst,
749                                                 .saddr = tiph->saddr,
750                                                 .tos = RT_TOS(tos) } },
751                                     .proto = IPPROTO_GRE };
752                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
753                         stats->tx_carrier_errors++;
754                         goto tx_error;
755                 }
756         }
757         tdev = rt->u.dst.dev;
758
        /* The outer route leads back into this very tunnel device. */
759         if (tdev == dev) {
760                 ip_rt_put(rt);
761                 stats->collisions++;
762                 goto tx_error;
763         }
764
        /*
         * With a non-zero frag_off in the template the usable MTU is the
         * outer route's MTU minus the header overhead; otherwise it is the
         * MTU of the route the packet came with, or the device MTU.
         */
765         df = tiph->frag_off;
766         if (df)
767                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
768         else
769                 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
770
771         if (skb->dst)
772                 skb->dst->ops->update_pmtu(skb->dst, mtu);
773
        /*
         * The inner DF bit is copied to the outer header; an inner IPv4
         * packet with DF set that exceeds the MTU is answered with
         * ICMP "fragmentation needed" and dropped.
         */
774         if (skb->protocol == htons(ETH_P_IP)) {
775                 df |= (old_iph->frag_off&htons(IP_DF));
776
777                 if ((old_iph->frag_off&htons(IP_DF)) &&
778                     mtu < ntohs(old_iph->tot_len)) {
779                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
780                         ip_rt_put(rt);
781                         goto tx_error;
782                 }
783         }
784 #ifdef CONFIG_IPV6
785         else if (skb->protocol == htons(ETH_P_IPV6)) {
786                 struct rt6_info *rt6 = (struct rt6_info *)skb->dst;
787
788                 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
789                         if ((tunnel->parms.iph.daddr &&
790                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
791                             rt6->rt6i_dst.plen == 128) {
792                                 rt6->rt6i_flags |= RTF_MODIFIED;
793                                 skb->dst->metrics[RTAX_MTU-1] = mtu;
794                         }
795                 }
796
797                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
798                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
799                         ip_rt_put(rt);
800                         goto tx_error;
801                 }
802         }
803 #endif
804
        /*
         * ICMP errors were recorded for this tunnel recently (err_count is
         * maintained by ipgre_err): signal a link failure for this packet
         * and use up one count; stale counts are discarded.
         */
805         if (tunnel->err_count > 0) {
806                 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
807                         tunnel->err_count--;
808
809                         dst_link_failure(skb);
810                 } else
811                         tunnel->err_count = 0;
812         }
813
814         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
815
        /*
         * Too little headroom, or the buffer is shared / not writable:
         * continue with a reallocated copy and free the original.
         */
816         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
817             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
818                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
819                 if (!new_skb) {
820                         ip_rt_put(rt);
821                         stats->tx_dropped++;
822                         dev_kfree_skb(skb);
823                         tunnel->recursion--;
824                         return 0;
825                 }
826                 if (skb->sk)
827                         skb_set_owner_w(new_skb, skb->sk);
828                 dev_kfree_skb(skb);
829                 skb = new_skb;
                /* skb was replaced: refresh the inner header pointer. */
830                 old_iph = ip_hdr(skb);
831         }
832
        /*
         * The old start of data becomes the transport header; the new
         * network header starts gre_hlen bytes in front of it.
         */
833         skb_reset_transport_header(skb);
834         skb_push(skb, gre_hlen);
835         skb_reset_network_header(skb);
836         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
837         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
838                               IPSKB_REROUTED);
        /* The packet now follows the outer route. */
839         dst_release(skb->dst);
840         skb->dst = &rt->u.dst;
841
842         /*
843          *      Push down and install the IPIP header.
844          */
845
846         iph                     =       ip_hdr(skb);
847         iph->version            =       4;
848         iph->ihl                =       sizeof(struct iphdr) >> 2;
849         iph->frag_off           =       df;
850         iph->protocol           =       IPPROTO_GRE;
851         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
852         iph->daddr              =       rt->rt_dst;
853         iph->saddr              =       rt->rt_src;
854
        /*
         * Template TTL of 0: inherit the TTL / hop limit from the inner
         * header, or use the route's hop limit for other payloads.
         */
855         if ((iph->ttl = tiph->ttl) == 0) {
856                 if (skb->protocol == htons(ETH_P_IP))
857                         iph->ttl = old_iph->ttl;
858 #ifdef CONFIG_IPV6
859                 else if (skb->protocol == htons(ETH_P_IPV6))
860                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
861 #endif
862                 else
863                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
864         }
865
        /* Basic GRE header right behind the IP header: flags, protocol. */
866         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
867         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
868                                    htons(ETH_P_TEB) : skb->protocol;
869
        /*
         * Optional GRE fields are written from the end of the header
         * backwards: sequence number, then key, then checksum.
         */
870         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
871                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
872
873                 if (tunnel->parms.o_flags&GRE_SEQ) {
874                         ++tunnel->o_seqno;
875                         *ptr = htonl(tunnel->o_seqno);
876                         ptr--;
877                 }
878                 if (tunnel->parms.o_flags&GRE_KEY) {
879                         *ptr = tunnel->parms.o_key;
880                         ptr--;
881                 }
                /* Checksum covers everything after the outer IP header. */
882                 if (tunnel->parms.o_flags&GRE_CSUM) {
883                         *ptr = 0;
884                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
885                 }
886         }
887
888         nf_reset(skb);
889
890         IPTUNNEL_XMIT();
891         tunnel->recursion--;
892         return 0;
893
        /* Error exits: optionally report link failure, then count and free. */
894 tx_error_icmp:
895         dst_link_failure(skb);
896
897 tx_error:
898         stats->tx_errors++;
899         dev_kfree_skb(skb);
900         tunnel->recursion--;
901         return 0;
902 }
903
904 static int ipgre_tunnel_bind_dev(struct net_device *dev)
905 {
906         struct net_device *tdev = NULL;
907         struct ip_tunnel *tunnel;
908         struct iphdr *iph;
909         int hlen = LL_MAX_HEADER;
910         int mtu = ETH_DATA_LEN;
911         int addend = sizeof(struct iphdr) + 4;
912
913         tunnel = netdev_priv(dev);
914         iph = &tunnel->parms.iph;
915
916         /* Guess output device to choose reasonable mtu and needed_headroom */
917
918         if (iph->daddr) {
919                 struct flowi fl = { .oif = tunnel->parms.link,
920                                     .nl_u = { .ip4_u =
921                                               { .daddr = iph->daddr,
922                                                 .saddr = iph->saddr,
923                                                 .tos = RT_TOS(iph->tos) } },
924                                     .proto = IPPROTO_GRE };
925                 struct rtable *rt;
926                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
927                         tdev = rt->u.dst.dev;
928                         ip_rt_put(rt);
929                 }
930
931                 if (dev->type != ARPHRD_ETHER)
932                         dev->flags |= IFF_POINTOPOINT;
933         }
934
935         if (!tdev && tunnel->parms.link)
936                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
937
938         if (tdev) {
939                 hlen = tdev->hard_header_len + tdev->needed_headroom;
940                 mtu = tdev->mtu;
941         }
942         dev->iflink = tunnel->parms.link;
943
944         /* Precalculate GRE options length */
945         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
946                 if (tunnel->parms.o_flags&GRE_CSUM)
947                         addend += 4;
948                 if (tunnel->parms.o_flags&GRE_KEY)
949                         addend += 4;
950                 if (tunnel->parms.o_flags&GRE_SEQ)
951                         addend += 4;
952         }
953         dev->needed_headroom = addend + hlen;
954         mtu -= dev->hard_header_len - addend;
955
956         if (mtu < 68)
957                 mtu = 68;
958
959         tunnel->hlen = addend;
960
961         return mtu;
962 }
963
/*
 * Legacy ioctl interface for querying, creating, changing and deleting
 * GRE tunnels.
 *
 * dev: device the ioctl was issued on; when it is the per-namespace
 *      fallback device, the tunnel to act on is looked up from the
 *      user-supplied parameters instead.
 * ifr: request whose ifr_ifru.ifru_data points at a user-space
 *      struct ip_tunnel_parm (read and/or written depending on cmd).
 * cmd: SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL or SIOCDELTUNNEL.
 *
 * Returns 0 on success or a negative errno: -EPERM (missing CAP_NET_ADMIN,
 * or attempt to delete the fallback tunnel), -EFAULT (user copy failed),
 * -EINVAL (bad parameters or unknown cmd), -EEXIST (parameters already
 * belong to another device), -ENOBUFS (add failed), -ENOENT (not found).
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
        int err = 0;
        struct ip_tunnel_parm p;
        struct ip_tunnel *t;
        struct net *net = dev_net(dev);
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);

        switch (cmd) {
        case SIOCGETTUNNEL:
                t = NULL;
                /* On the fallback device the caller names the tunnel via p. */
                if (dev == ign->fb_tunnel_dev) {
                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
                                err = -EFAULT;
                                break;
                        }
                        t = ipgre_tunnel_locate(net, &p, 0);
                }
                /* Otherwise, or if nothing matched, report dev's own parameters. */
                if (t == NULL)
                        t = netdev_priv(dev);
                memcpy(&p, &t->parms, sizeof(p));
                if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                        err = -EFAULT;
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!capable(CAP_NET_ADMIN))
                        goto done;

                err = -EFAULT;
                if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                        goto done;

                /*
                 * Only a plain 20-byte IPv4/GRE outer header is accepted; no
                 * fragment bits other than DF, and no GRE version or routing
                 * flags in either direction.
                 */
                err = -EINVAL;
                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
                    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
                    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
                        goto done;
                /* A fixed (non-inherited) TTL forces DF on the outer header. */
                if (p.iph.ttl)
                        p.iph.frag_off |= htons(IP_DF);

                /* Keys are meaningful only when the matching KEY flag is set. */
                if (!(p.i_flags&GRE_KEY))
                        p.i_key = 0;
                if (!(p.o_flags&GRE_KEY))
                        p.o_key = 0;

                /* The last argument asks locate to create the tunnel for ADD. */
                t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

                if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t != NULL) {
                                /* The new parameters already describe another tunnel. */
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned nflags = 0;

                                t = netdev_priv(dev);

                                if (ipv4_is_multicast(p.iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p.iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                /* The change must not alter the device's p2p/broadcast kind. */
                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }
                                /*
                                 * Addresses and input key are changed between unlink
                                 * and link so the tunnel is re-inserted with its new
                                 * parameters.
                                 */
                                ipgre_tunnel_unlink(ign, t);
                                t->parms.iph.saddr = p.iph.saddr;
                                t->parms.iph.daddr = p.iph.daddr;
                                t->parms.i_key = p.i_key;
                                t->parms.o_key = p.o_key;
                                memcpy(dev->dev_addr, &p.iph.saddr, 4);
                                memcpy(dev->broadcast, &p.iph.daddr, 4);
                                ipgre_tunnel_link(ign, t);
                                netdev_state_change(dev);
                        }
                }

                if (t) {
                        err = 0;
                        if (cmd == SIOCCHGTUNNEL) {
                                t->parms.iph.ttl = p.iph.ttl;
                                t->parms.iph.tos = p.iph.tos;
                                t->parms.iph.frag_off = p.iph.frag_off;
                                /*
                                 * NOTE(review): when issued on the fallback device,
                                 * t may belong to a different device than dev, yet
                                 * dev is the one rebound below - confirm intent.
                                 */
                                if (t->parms.link != p.link) {
                                        t->parms.link = p.link;
                                        dev->mtu = ipgre_tunnel_bind_dev(dev);
                                        netdev_state_change(dev);
                                }
                        }
                        /* Hand the resulting parameters back to user space. */
                        if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
                                err = -EFAULT;
                } else
                        err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!capable(CAP_NET_ADMIN))
                        goto done;

                /*
                 * On the fallback device the victim is looked up from the user
                 * parameters; the fallback tunnel itself cannot be deleted.
                 */
                if (dev == ign->fb_tunnel_dev) {
                        err = -EFAULT;
                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                                goto done;
                        err = -ENOENT;
                        if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(ign->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
1093
1094 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1095 {
1096         struct ip_tunnel *tunnel = netdev_priv(dev);
1097         if (new_mtu < 68 ||
1098             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1099                 return -EINVAL;
1100         dev->mtu = new_mtu;
1101         return 0;
1102 }
1103
1104 /* Nice toy. Unfortunately, useless in real life :-)
1105    It allows to construct virtual multiprotocol broadcast "LAN"
1106    over the Internet, provided multicast routing is tuned.
1107
1108
1109    I have no idea was this bicycle invented before me,
1110    so that I had to set ARPHRD_IPGRE to a random value.
1111    I have an impression, that Cisco could make something similar,
1112    but this feature is apparently missing in IOS<=11.2(8).
1113
1114    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1115    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1116
1117    ping -t 255 224.66.66.66
1118
1119    If nobody answers, mbone does not work.
1120
1121    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1122    ip addr add 10.66.66.<somewhat>/24 dev Universe
1123    ifconfig Universe up
1124    ifconfig Universe add fe80::<Your_real_addr>/10
1125    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1126    ftp 10.66.66.66
1127    ...
1128    ftp fec0:6666:6666::193.233.7.65
1129    ...
1130
1131  */
1132
1133 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1134                         unsigned short type,
1135                         const void *daddr, const void *saddr, unsigned len)
1136 {
1137         struct ip_tunnel *t = netdev_priv(dev);
1138         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1139         __be16 *p = (__be16*)(iph+1);
1140
1141         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1142         p[0]            = t->parms.o_flags;
1143         p[1]            = htons(type);
1144
1145         /*
1146          *      Set the source hardware address.
1147          */
1148
1149         if (saddr)
1150                 memcpy(&iph->saddr, saddr, 4);
1151
1152         if (daddr) {
1153                 memcpy(&iph->daddr, daddr, 4);
1154                 return t->hlen;
1155         }
1156         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1157                 return t->hlen;
1158
1159         return -t->hlen;
1160 }
1161
1162 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1163 {
1164         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1165         memcpy(haddr, &iph->saddr, 4);
1166         return 4;
1167 }
1168
/*
 * Link-layer header operations for GRE devices: build and parse the outer
 * IP header as if it were the hardware header.  Attached to a device by
 * ipgre_tunnel_init() when it has no remote address or a multicast one.
 */
static const struct header_ops ipgre_header_ops = {
        .create = ipgre_header,
        .parse  = ipgre_header_parse,
};
1173
1174 #ifdef CONFIG_NET_IPGRE_BROADCAST
1175 static int ipgre_open(struct net_device *dev)
1176 {
1177         struct ip_tunnel *t = netdev_priv(dev);
1178
1179         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1180                 struct flowi fl = { .oif = t->parms.link,
1181                                     .nl_u = { .ip4_u =
1182                                               { .daddr = t->parms.iph.daddr,
1183                                                 .saddr = t->parms.iph.saddr,
1184                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1185                                     .proto = IPPROTO_GRE };
1186                 struct rtable *rt;
1187                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1188                         return -EADDRNOTAVAIL;
1189                 dev = rt->u.dst.dev;
1190                 ip_rt_put(rt);
1191                 if (__in_dev_get_rtnl(dev) == NULL)
1192                         return -EADDRNOTAVAIL;
1193                 t->mlink = dev->ifindex;
1194                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1195         }
1196         return 0;
1197 }
1198
1199 static int ipgre_close(struct net_device *dev)
1200 {
1201         struct ip_tunnel *t = netdev_priv(dev);
1202
1203         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1204                 struct in_device *in_dev;
1205                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1206                 if (in_dev) {
1207                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1208                         in_dev_put(in_dev);
1209                 }
1210         }
1211         return 0;
1212 }
1213
1214 #endif
1215
/*
 * Device operations for GRE tunnel devices, installed by
 * ipgre_tunnel_setup().  The open/stop hooks exist only when multicast
 * ("broadcast") GRE support is configured.
 */
static const struct net_device_ops ipgre_netdev_ops = {
        .ndo_init               = ipgre_tunnel_init,
        .ndo_uninit             = ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
        .ndo_open               = ipgre_open,
        .ndo_stop               = ipgre_close,
#endif
        .ndo_start_xmit         = ipgre_tunnel_xmit,
        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
};
1227
1228 static void ipgre_tunnel_setup(struct net_device *dev)
1229 {
1230         dev->netdev_ops         = &ipgre_netdev_ops;
1231         dev->destructor         = free_netdev;
1232
1233         dev->type               = ARPHRD_IPGRE;
1234         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1235         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1236         dev->flags              = IFF_NOARP;
1237         dev->iflink             = 0;
1238         dev->addr_len           = 4;
1239         dev->features           |= NETIF_F_NETNS_LOCAL;
1240 }
1241
1242 static int ipgre_tunnel_init(struct net_device *dev)
1243 {
1244         struct ip_tunnel *tunnel;
1245         struct iphdr *iph;
1246
1247         tunnel = netdev_priv(dev);
1248         iph = &tunnel->parms.iph;
1249
1250         tunnel->dev = dev;
1251         strcpy(tunnel->parms.name, dev->name);
1252
1253         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1254         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1255
1256         if (iph->daddr) {
1257 #ifdef CONFIG_NET_IPGRE_BROADCAST
1258                 if (ipv4_is_multicast(iph->daddr)) {
1259                         if (!iph->saddr)
1260                                 return -EINVAL;
1261                         dev->flags = IFF_BROADCAST;
1262                         dev->header_ops = &ipgre_header_ops;
1263                 }
1264 #endif
1265         } else
1266                 dev->header_ops = &ipgre_header_ops;
1267
1268         return 0;
1269 }
1270
1271 static void ipgre_fb_tunnel_init(struct net_device *dev)
1272 {
1273         struct ip_tunnel *tunnel = netdev_priv(dev);
1274         struct iphdr *iph = &tunnel->parms.iph;
1275         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1276
1277         tunnel->dev = dev;
1278         strcpy(tunnel->parms.name, dev->name);
1279
1280         iph->version            = 4;
1281         iph->protocol           = IPPROTO_GRE;
1282         iph->ihl                = 5;
1283         tunnel->hlen            = sizeof(struct iphdr) + 4;
1284
1285         dev_hold(dev);
1286         ign->tunnels_wc[0]      = tunnel;
1287 }
1288
1289
/*
 * IPv4 protocol hooks for GRE; registered for IPPROTO_GRE in ipgre_init()
 * and removed in ipgre_fini().
 */
static struct net_protocol ipgre_protocol = {
        .handler        =       ipgre_rcv,
        .err_handler    =       ipgre_err,
        .netns_ok       =       1,
};
1295
1296 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1297 {
1298         int prio;
1299
1300         for (prio = 0; prio < 4; prio++) {
1301                 int h;
1302                 for (h = 0; h < HASH_SIZE; h++) {
1303                         struct ip_tunnel *t;
1304                         while ((t = ign->tunnels[prio][h]) != NULL)
1305                                 unregister_netdevice(t->dev);
1306                 }
1307         }
1308 }
1309
1310 static int ipgre_init_net(struct net *net)
1311 {
1312         int err;
1313         struct ipgre_net *ign;
1314
1315         err = -ENOMEM;
1316         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1317         if (ign == NULL)
1318                 goto err_alloc;
1319
1320         err = net_assign_generic(net, ipgre_net_id, ign);
1321         if (err < 0)
1322                 goto err_assign;
1323
1324         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1325                                            ipgre_tunnel_setup);
1326         if (!ign->fb_tunnel_dev) {
1327                 err = -ENOMEM;
1328                 goto err_alloc_dev;
1329         }
1330         dev_net_set(ign->fb_tunnel_dev, net);
1331
1332         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1333         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1334
1335         if ((err = register_netdev(ign->fb_tunnel_dev)))
1336                 goto err_reg_dev;
1337
1338         return 0;
1339
1340 err_reg_dev:
1341         free_netdev(ign->fb_tunnel_dev);
1342 err_alloc_dev:
1343         /* nothing */
1344 err_assign:
1345         kfree(ign);
1346 err_alloc:
1347         return err;
1348 }
1349
1350 static void ipgre_exit_net(struct net *net)
1351 {
1352         struct ipgre_net *ign;
1353
1354         ign = net_generic(net, ipgre_net_id);
1355         rtnl_lock();
1356         ipgre_destroy_tunnels(ign);
1357         rtnl_unlock();
1358         kfree(ign);
1359 }
1360
/*
 * Per-network-namespace setup/teardown hooks; registered as a pernet
 * generic device in ipgre_init().
 */
static struct pernet_operations ipgre_net_ops = {
        .init = ipgre_init_net,
        .exit = ipgre_exit_net,
};
1365
1366 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1367 {
1368         __be16 flags;
1369
1370         if (!data)
1371                 return 0;
1372
1373         flags = 0;
1374         if (data[IFLA_GRE_IFLAGS])
1375                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1376         if (data[IFLA_GRE_OFLAGS])
1377                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1378         if (flags & (GRE_VERSION|GRE_ROUTING))
1379                 return -EINVAL;
1380
1381         return 0;
1382 }
1383
1384 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1385 {
1386         __be32 daddr;
1387
1388         if (tb[IFLA_ADDRESS]) {
1389                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1390                         return -EINVAL;
1391                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1392                         return -EADDRNOTAVAIL;
1393         }
1394
1395         if (!data)
1396                 goto out;
1397
1398         if (data[IFLA_GRE_REMOTE]) {
1399                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1400                 if (!daddr)
1401                         return -EINVAL;
1402         }
1403
1404 out:
1405         return ipgre_tunnel_validate(tb, data);
1406 }
1407
1408 static void ipgre_netlink_parms(struct nlattr *data[],
1409                                 struct ip_tunnel_parm *parms)
1410 {
1411         memset(parms, 0, sizeof(*parms));
1412
1413         parms->iph.protocol = IPPROTO_GRE;
1414
1415         if (!data)
1416                 return;
1417
1418         if (data[IFLA_GRE_LINK])
1419                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1420
1421         if (data[IFLA_GRE_IFLAGS])
1422                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1423
1424         if (data[IFLA_GRE_OFLAGS])
1425                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1426
1427         if (data[IFLA_GRE_IKEY])
1428                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1429
1430         if (data[IFLA_GRE_OKEY])
1431                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1432
1433         if (data[IFLA_GRE_LOCAL])
1434                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1435
1436         if (data[IFLA_GRE_REMOTE])
1437                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1438
1439         if (data[IFLA_GRE_TTL])
1440                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1441
1442         if (data[IFLA_GRE_TOS])
1443                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1444
1445         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1446                 parms->iph.frag_off = htons(IP_DF);
1447 }
1448
1449 static int ipgre_tap_init(struct net_device *dev)
1450 {
1451         struct ip_tunnel *tunnel;
1452
1453         tunnel = netdev_priv(dev);
1454
1455         tunnel->dev = dev;
1456         strcpy(tunnel->parms.name, dev->name);
1457
1458         ipgre_tunnel_bind_dev(dev);
1459
1460         return 0;
1461 }
1462
/*
 * Device operations intended for Ethernet-over-GRE ("gretap") devices:
 * tap-specific init, the shared GRE uninit/transmit/MTU handlers, and the
 * generic Ethernet MAC address helpers.  No ioctl or open/stop hooks.
 */
static const struct net_device_ops ipgre_tap_netdev_ops = {
        .ndo_init               = ipgre_tap_init,
        .ndo_uninit             = ipgre_tunnel_uninit,
        .ndo_start_xmit         = ipgre_tunnel_xmit,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
};
1471
1472 static void ipgre_tap_setup(struct net_device *dev)
1473 {
1474
1475         ether_setup(dev);
1476
1477         dev->netdev_ops         = &ipgre_netdev_ops;
1478         dev->destructor         = free_netdev;
1479
1480         dev->iflink             = 0;
1481         dev->features           |= NETIF_F_NETNS_LOCAL;
1482 }
1483
1484 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1485                          struct nlattr *data[])
1486 {
1487         struct ip_tunnel *nt;
1488         struct net *net = dev_net(dev);
1489         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1490         int mtu;
1491         int err;
1492
1493         nt = netdev_priv(dev);
1494         ipgre_netlink_parms(data, &nt->parms);
1495
1496         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1497                 return -EEXIST;
1498
1499         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1500                 random_ether_addr(dev->dev_addr);
1501
1502         mtu = ipgre_tunnel_bind_dev(dev);
1503         if (!tb[IFLA_MTU])
1504                 dev->mtu = mtu;
1505
1506         err = register_netdevice(dev);
1507         if (err)
1508                 goto out;
1509
1510         dev_hold(dev);
1511         ipgre_tunnel_link(ign, nt);
1512
1513 out:
1514         return err;
1515 }
1516
1517 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1518                             struct nlattr *data[])
1519 {
1520         struct ip_tunnel *t, *nt;
1521         struct net *net = dev_net(dev);
1522         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1523         struct ip_tunnel_parm p;
1524         int mtu;
1525
1526         if (dev == ign->fb_tunnel_dev)
1527                 return -EINVAL;
1528
1529         nt = netdev_priv(dev);
1530         ipgre_netlink_parms(data, &p);
1531
1532         t = ipgre_tunnel_locate(net, &p, 0);
1533
1534         if (t) {
1535                 if (t->dev != dev)
1536                         return -EEXIST;
1537         } else {
1538                 unsigned nflags = 0;
1539
1540                 t = nt;
1541
1542                 if (ipv4_is_multicast(p.iph.daddr))
1543                         nflags = IFF_BROADCAST;
1544                 else if (p.iph.daddr)
1545                         nflags = IFF_POINTOPOINT;
1546
1547                 if ((dev->flags ^ nflags) &
1548                     (IFF_POINTOPOINT | IFF_BROADCAST))
1549                         return -EINVAL;
1550
1551                 ipgre_tunnel_unlink(ign, t);
1552                 t->parms.iph.saddr = p.iph.saddr;
1553                 t->parms.iph.daddr = p.iph.daddr;
1554                 t->parms.i_key = p.i_key;
1555                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1556                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1557                 ipgre_tunnel_link(ign, t);
1558                 netdev_state_change(dev);
1559         }
1560
1561         t->parms.o_key = p.o_key;
1562         t->parms.iph.ttl = p.iph.ttl;
1563         t->parms.iph.tos = p.iph.tos;
1564         t->parms.iph.frag_off = p.iph.frag_off;
1565
1566         if (t->parms.link != p.link) {
1567                 t->parms.link = p.link;
1568                 mtu = ipgre_tunnel_bind_dev(dev);
1569                 if (!tb[IFLA_MTU])
1570                         dev->mtu = mtu;
1571                 netdev_state_change(dev);
1572         }
1573
1574         return 0;
1575 }
1576
1577 static size_t ipgre_get_size(const struct net_device *dev)
1578 {
1579         return
1580                 /* IFLA_GRE_LINK */
1581                 nla_total_size(4) +
1582                 /* IFLA_GRE_IFLAGS */
1583                 nla_total_size(2) +
1584                 /* IFLA_GRE_OFLAGS */
1585                 nla_total_size(2) +
1586                 /* IFLA_GRE_IKEY */
1587                 nla_total_size(4) +
1588                 /* IFLA_GRE_OKEY */
1589                 nla_total_size(4) +
1590                 /* IFLA_GRE_LOCAL */
1591                 nla_total_size(4) +
1592                 /* IFLA_GRE_REMOTE */
1593                 nla_total_size(4) +
1594                 /* IFLA_GRE_TTL */
1595                 nla_total_size(1) +
1596                 /* IFLA_GRE_TOS */
1597                 nla_total_size(1) +
1598                 /* IFLA_GRE_PMTUDISC */
1599                 nla_total_size(1) +
1600                 0;
1601 }
1602
1603 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1604 {
1605         struct ip_tunnel *t = netdev_priv(dev);
1606         struct ip_tunnel_parm *p = &t->parms;
1607
1608         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1609         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1610         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1611         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1612         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1613         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1614         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1615         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1616         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1617         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1618
1619         return 0;
1620
1621 nla_put_failure:
1622         return -EMSGSIZE;
1623 }
1624
/*
 * Netlink attribute policy for IFLA_GRE_* attributes, shared by the "gre"
 * and "gretap" link ops.  Local/remote addresses are described by length
 * (the size of an IPv4 address field) rather than by integer type.
 */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
        [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
};
1637
/*
 * rtnetlink link operations for link kind "gre"; registered in
 * ipgre_init() and also attached to the fallback device.
 */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
        .kind           = "gre",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tunnel_setup,
        .validate       = ipgre_tunnel_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
};
1650
/*
 * rtnetlink link operations for link kind "gretap".  Differs from the
 * "gre" ops only in its setup and validate callbacks; the create, change
 * and dump handlers are shared.
 */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
        .kind           = "gretap",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tap_setup,
        .validate       = ipgre_tap_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
};
1663
1664 /*
1665  *      And now the modules code and kernel interface.
1666  */
1667
1668 static int __init ipgre_init(void)
1669 {
1670         int err;
1671
1672         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1673
1674         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1675                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1676                 return -EAGAIN;
1677         }
1678
1679         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1680         if (err < 0)
1681                 goto gen_device_failed;
1682
1683         err = rtnl_link_register(&ipgre_link_ops);
1684         if (err < 0)
1685                 goto rtnl_link_failed;
1686
1687         err = rtnl_link_register(&ipgre_tap_ops);
1688         if (err < 0)
1689                 goto tap_ops_failed;
1690
1691 out:
1692         return err;
1693
1694 tap_ops_failed:
1695         rtnl_link_unregister(&ipgre_link_ops);
1696 rtnl_link_failed:
1697         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1698 gen_device_failed:
1699         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1700         goto out;
1701 }
1702
1703 static void __exit ipgre_fini(void)
1704 {
1705         rtnl_link_unregister(&ipgre_tap_ops);
1706         rtnl_link_unregister(&ipgre_link_ops);
1707         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1708         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1709                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1710 }
1711
/* Module entry and exit points. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Module aliases for the two rtnetlink link kinds registered above. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");