Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[linux-2.6] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 /*
54    Problems & solutions
55    --------------------
56
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
66    solution, but it supposes maintaining a new variable in ALL
67    skb, even if no tunneling is used.
68
69    Current solution: t->recursion lock breaks dead loops. It looks
70    like dev->tbusy flag, but I preferred new variable, because
71    the semantics is different. One day, when hard_start_xmit
72    will be multithreaded we will have to use skb->encapsulation.
73
74
75
76    2. Networking dead loops would not kill routers, but would really
77    kill network. IP hop limit plays role of "t->recursion" in this case,
78    if we copy it from packet being encapsulated to upper header.
79    It is very good solution, but it introduces two problems:
80
81    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82      do not work over tunnels.
83    - traceroute does not work. I planned to relay ICMP from tunnel,
84      so that this problem would be solved and traceroute output
85      would even more informative. This idea appeared to be wrong:
86      only Linux complies to rfc1812 now (yes, guys, Linux is the only
87      true router now :-)), all routers (at least, in neighbourhood of mine)
88      return only 8 bytes of payload. It is the end.
89
90    Hence, if we want that OSPF worked or traceroute said something reasonable,
91    we should search for another solution.
92
93    One of them is to parse packet trying to detect inner encapsulation
94    made by our node. It is difficult or even impossible, especially,
95    taking into account fragmentation. To be short, it is not a solution at all.
96
97    Current solution: The solution was UNEXPECTEDLY SIMPLE.
98    We force DF flag on tunnels with preconfigured hop limit,
99    that is ALL. :-) Well, it does not remove the problem completely,
100    but exponential growth of network traffic is changed to linear
101    (branches, that exceed pmtu are pruned) and tunnel mtu
102    fastly degrades to value <68, where looping stops.
103    Yes, it is not good if there exists a router in the loop,
104    which does not force DF, even when encapsulating packets have DF set.
105    But it is not our problem! Nobody could accuse us, we made
106    all that we could make. Even if it is your gated who injected
107    fatal route to network, even if it were you who configured
108    fatal static route: you are innocent. :-)
109
110
111
112    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113    practically identical code. It would be good to glue them
114    together, but it is not very evident, how to make them modular.
115    sit is integral part of IPv6, ipip and gre are naturally modular.
116    We could extract common parts (hash table, ioctl etc)
117    to a separate module (ip_tunnel.c).
118
119    Alexey Kuznetsov.
120  */
121
/*
 * Forward declarations.  ipgre_tunnel_locate() below references
 * ipgre_link_ops, ipgre_tunnel_setup() and ipgre_tunnel_bind_dev() before
 * their definitions appear (presumably later in this file).
 */
122 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
123 static int ipgre_tunnel_init(struct net_device *dev);
124 static void ipgre_tunnel_setup(struct net_device *dev);
125 static int ipgre_tunnel_bind_dev(struct net_device *dev);
126
127 /* Fallback tunnel: no source, no destination, no key, no options */
128
/* Number of chains in each of the tunnel hash tables. */
129 #define HASH_SIZE  16
130
/* Id handed to net_generic() to fetch this module's struct ipgre_net. */
131 static int ipgre_net_id;
/* GRE state attached to a struct net, fetched with net_generic(). */
132 struct ipgre_net {
            /* Four tables of HASH_SIZE chains; see the tunnels_* aliases. */
133         struct ip_tunnel *tunnels[4][HASH_SIZE];
134
            /* Device ipgre_tunnel_lookup() falls back to when no tunnel matches. */
135         struct net_device *fb_tunnel_dev;
136 };
137
138 /* Tunnel hash table */
139
140 /*
141    4 hash tables:
142
143    3: (remote,local)
144    2: (remote,*)
145    1: (*,local)
146    0: (*,*)
147
148    We require exact key match i.e. if a key is present in packet
149    it will match only tunnel with the same key; if it is not present,
150    it will match only keyless tunnel.
151
152    All keyless packets, if not matched by configured keyless tunnels,
153    will match fallback tunnel.
154  */
155
/*
 * Fold a 32-bit address or key into a chain index in the range 0..15
 * (the mask 0xF matches HASH_SIZE == 16).
 *
 * The argument is parenthesized so that an expression argument such as
 * HASH(a | b) is hashed as a whole; without the parentheses the cast and
 * the shift would bind to only part of the expression.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)
157
/*
 * Named views of ipgre_net.tunnels[]; the index matches the "prio" value
 * computed in __ipgre_bucket() (bit 0: local set, bit 1: unicast remote set).
 */
158 #define tunnels_r_l     tunnels[3]
159 #define tunnels_r       tunnels[2]
160 #define tunnels_l       tunnels[1]
161 #define tunnels_wc      tunnels[0]
162
/*
 * Taken for writing while a chain pointer is updated in
 * ipgre_tunnel_link()/ipgre_tunnel_unlink(), and for reading around the
 * lookups in ipgre_err() and ipgre_rcv().
 */
163 static DEFINE_RWLOCK(ipgre_lock);
164
165 /* Given src, dst and key, find appropriate for input tunnel. */
166
167 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
168                                               __be32 remote, __be32 local,
169                                               __be32 key, __be16 gre_proto)
170 {
171         struct net *net = dev_net(dev);
172         int link = dev->ifindex;
173         unsigned h0 = HASH(remote);
174         unsigned h1 = HASH(key);
175         struct ip_tunnel *t, *cand = NULL;
176         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
177         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
178                        ARPHRD_ETHER : ARPHRD_IPGRE;
179         int score, cand_score = 4;
180
181         for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
182                 if (local != t->parms.iph.saddr ||
183                     remote != t->parms.iph.daddr ||
184                     key != t->parms.i_key ||
185                     !(t->dev->flags & IFF_UP))
186                         continue;
187
188                 if (t->dev->type != ARPHRD_IPGRE &&
189                     t->dev->type != dev_type)
190                         continue;
191
192                 score = 0;
193                 if (t->parms.link != link)
194                         score |= 1;
195                 if (t->dev->type != dev_type)
196                         score |= 2;
197                 if (score == 0)
198                         return t;
199
200                 if (score < cand_score) {
201                         cand = t;
202                         cand_score = score;
203                 }
204         }
205
206         for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
207                 if (remote != t->parms.iph.daddr ||
208                     key != t->parms.i_key ||
209                     !(t->dev->flags & IFF_UP))
210                         continue;
211
212                 if (t->dev->type != ARPHRD_IPGRE &&
213                     t->dev->type != dev_type)
214                         continue;
215
216                 score = 0;
217                 if (t->parms.link != link)
218                         score |= 1;
219                 if (t->dev->type != dev_type)
220                         score |= 2;
221                 if (score == 0)
222                         return t;
223
224                 if (score < cand_score) {
225                         cand = t;
226                         cand_score = score;
227                 }
228         }
229
230         for (t = ign->tunnels_l[h1]; t; t = t->next) {
231                 if ((local != t->parms.iph.saddr &&
232                      (local != t->parms.iph.daddr ||
233                       !ipv4_is_multicast(local))) ||
234                     key != t->parms.i_key ||
235                     !(t->dev->flags & IFF_UP))
236                         continue;
237
238                 if (t->dev->type != ARPHRD_IPGRE &&
239                     t->dev->type != dev_type)
240                         continue;
241
242                 score = 0;
243                 if (t->parms.link != link)
244                         score |= 1;
245                 if (t->dev->type != dev_type)
246                         score |= 2;
247                 if (score == 0)
248                         return t;
249
250                 if (score < cand_score) {
251                         cand = t;
252                         cand_score = score;
253                 }
254         }
255
256         for (t = ign->tunnels_wc[h1]; t; t = t->next) {
257                 if (t->parms.i_key != key ||
258                     !(t->dev->flags & IFF_UP))
259                         continue;
260
261                 if (t->dev->type != ARPHRD_IPGRE &&
262                     t->dev->type != dev_type)
263                         continue;
264
265                 score = 0;
266                 if (t->parms.link != link)
267                         score |= 1;
268                 if (t->dev->type != dev_type)
269                         score |= 2;
270                 if (score == 0)
271                         return t;
272
273                 if (score < cand_score) {
274                         cand = t;
275                         cand_score = score;
276                 }
277         }
278
279         if (cand != NULL)
280                 return cand;
281
282         if (ign->fb_tunnel_dev->flags & IFF_UP)
283                 return netdev_priv(ign->fb_tunnel_dev);
284
285         return NULL;
286 }
287
288 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
289                 struct ip_tunnel_parm *parms)
290 {
291         __be32 remote = parms->iph.daddr;
292         __be32 local = parms->iph.saddr;
293         __be32 key = parms->i_key;
294         unsigned h = HASH(key);
295         int prio = 0;
296
297         if (local)
298                 prio |= 1;
299         if (remote && !ipv4_is_multicast(remote)) {
300                 prio |= 2;
301                 h ^= HASH(remote);
302         }
303
304         return &ign->tunnels[prio][h];
305 }
306
307 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
308                 struct ip_tunnel *t)
309 {
310         return __ipgre_bucket(ign, &t->parms);
311 }
312
313 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
314 {
315         struct ip_tunnel **tp = ipgre_bucket(ign, t);
316
317         t->next = *tp;
318         write_lock_bh(&ipgre_lock);
319         *tp = t;
320         write_unlock_bh(&ipgre_lock);
321 }
322
323 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
324 {
325         struct ip_tunnel **tp;
326
327         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
328                 if (t == *tp) {
329                         write_lock_bh(&ipgre_lock);
330                         *tp = t->next;
331                         write_unlock_bh(&ipgre_lock);
332                         break;
333                 }
334         }
335 }
336
337 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
338                                            struct ip_tunnel_parm *parms,
339                                            int type)
340 {
341         __be32 remote = parms->iph.daddr;
342         __be32 local = parms->iph.saddr;
343         __be32 key = parms->i_key;
344         int link = parms->link;
345         struct ip_tunnel *t, **tp;
346         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
347
348         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
349                 if (local == t->parms.iph.saddr &&
350                     remote == t->parms.iph.daddr &&
351                     key == t->parms.i_key &&
352                     link == t->parms.link &&
353                     type == t->dev->type)
354                         break;
355
356         return t;
357 }
358
359 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
360                 struct ip_tunnel_parm *parms, int create)
361 {
362         struct ip_tunnel *t, *nt;
363         struct net_device *dev;
364         char name[IFNAMSIZ];
365         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
366
367         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
368         if (t || !create)
369                 return t;
370
371         if (parms->name[0])
372                 strlcpy(name, parms->name, IFNAMSIZ);
373         else
374                 sprintf(name, "gre%%d");
375
376         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
377         if (!dev)
378           return NULL;
379
380         dev_net_set(dev, net);
381
382         if (strchr(name, '%')) {
383                 if (dev_alloc_name(dev, name) < 0)
384                         goto failed_free;
385         }
386
387         nt = netdev_priv(dev);
388         nt->parms = *parms;
389         dev->rtnl_link_ops = &ipgre_link_ops;
390
391         dev->mtu = ipgre_tunnel_bind_dev(dev);
392
393         if (register_netdevice(dev) < 0)
394                 goto failed_free;
395
396         dev_hold(dev);
397         ipgre_tunnel_link(ign, nt);
398         return nt;
399
400 failed_free:
401         free_netdev(dev);
402         return NULL;
403 }
404
405 static void ipgre_tunnel_uninit(struct net_device *dev)
406 {
407         struct net *net = dev_net(dev);
408         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
409
410         ipgre_tunnel_unlink(ign, netdev_priv(dev));
411         dev_put(dev);
412 }
413
414
415 static void ipgre_err(struct sk_buff *skb, u32 info)
416 {
417
418 /* All the routers (except for Linux) return only
419    8 bytes of packet payload. It means, that precise relaying of
420    ICMP in the real Internet is absolutely infeasible.
421
422    Moreover, Cisco "wise men" put GRE key to the third word
423    in GRE header. It makes impossible maintaining even soft state for keyed
424    GRE tunnels with enabled checksum. Tell them "thank you".
425
426    Well, I wonder, rfc1812 was written by Cisco employee,
427    what the hell these idiots break standrads established
428    by themself???
429  */
430
431         struct iphdr *iph = (struct iphdr *)skb->data;
432         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
433         int grehlen = (iph->ihl<<2) + 4;
434         const int type = icmp_hdr(skb)->type;
435         const int code = icmp_hdr(skb)->code;
436         struct ip_tunnel *t;
437         __be16 flags;
438
439         flags = p[0];
440         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
441                 if (flags&(GRE_VERSION|GRE_ROUTING))
442                         return;
443                 if (flags&GRE_KEY) {
444                         grehlen += 4;
445                         if (flags&GRE_CSUM)
446                                 grehlen += 4;
447                 }
448         }
449
450         /* If only 8 bytes returned, keyed message will be dropped here */
451         if (skb_headlen(skb) < grehlen)
452                 return;
453
454         switch (type) {
455         default:
456         case ICMP_PARAMETERPROB:
457                 return;
458
459         case ICMP_DEST_UNREACH:
460                 switch (code) {
461                 case ICMP_SR_FAILED:
462                 case ICMP_PORT_UNREACH:
463                         /* Impossible event. */
464                         return;
465                 case ICMP_FRAG_NEEDED:
466                         /* Soft state for pmtu is maintained by IP core. */
467                         return;
468                 default:
469                         /* All others are translated to HOST_UNREACH.
470                            rfc2003 contains "deep thoughts" about NET_UNREACH,
471                            I believe they are just ether pollution. --ANK
472                          */
473                         break;
474                 }
475                 break;
476         case ICMP_TIME_EXCEEDED:
477                 if (code != ICMP_EXC_TTL)
478                         return;
479                 break;
480         }
481
482         read_lock(&ipgre_lock);
483         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
484                                 flags & GRE_KEY ?
485                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
486                                 p[1]);
487         if (t == NULL || t->parms.iph.daddr == 0 ||
488             ipv4_is_multicast(t->parms.iph.daddr))
489                 goto out;
490
491         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
492                 goto out;
493
494         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
495                 t->err_count++;
496         else
497                 t->err_count = 1;
498         t->err_time = jiffies;
499 out:
500         read_unlock(&ipgre_lock);
501         return;
502 }
503
504 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
505 {
506         if (INET_ECN_is_ce(iph->tos)) {
507                 if (skb->protocol == htons(ETH_P_IP)) {
508                         IP_ECN_set_ce(ip_hdr(skb));
509                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
510                         IP6_ECN_set_ce(ipv6_hdr(skb));
511                 }
512         }
513 }
514
515 static inline u8
516 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
517 {
518         u8 inner = 0;
519         if (skb->protocol == htons(ETH_P_IP))
520                 inner = old_iph->tos;
521         else if (skb->protocol == htons(ETH_P_IPV6))
522                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
523         return INET_ECN_encapsulate(tos, inner);
524 }
525
/*
 * Receive handler for GRE packets.
 *
 * Parses the GRE header found at skb->data, looks up the matching tunnel,
 * strips the GRE header and passes the inner packet to the tunnel device
 * with netif_rx().  When no tunnel matches, an ICMP port-unreachable is
 * sent.  Always returns 0; on every drop path the skb is freed here.
 */
526 static int ipgre_rcv(struct sk_buff *skb)
527 {
528         struct iphdr *iph;
529         u8     *h;
530         __be16    flags;
531         __sum16   csum = 0;
532         __be32 key = 0;
533         u32    seqno = 0;
534         struct ip_tunnel *tunnel;
535         int    offset = 4;
536         __be16 gre_proto;
537         unsigned int len;
538
            /*
             * 16 bytes cover the 4-byte base header plus the optional
             * checksum, key and sequence words (4 bytes each) read below.
             */
539         if (!pskb_may_pull(skb, 16))
540                 goto drop_nolock;
541
542         iph = ip_hdr(skb);
543         h = skb->data;
544         flags = *(__be16*)h;
545
546         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
547                 /* - Version must be 0.
548                    - We do not support routing headers.
549                  */
550                 if (flags&(GRE_VERSION|GRE_ROUTING))
551                         goto drop_nolock;
552
                    /*
                     * A non-zero csum left by this switch is counted as a
                     * CRC error further down.
                     */
553                 if (flags&GRE_CSUM) {
554                         switch (skb->ip_summed) {
555                         case CHECKSUM_COMPLETE:
556                                 csum = csum_fold(skb->csum);
557                                 if (!csum)
558                                         break;
559                                 /* fall through */
560                         case CHECKSUM_NONE:
561                                 skb->csum = 0;
562                                 csum = __skb_checksum_complete(skb);
563                                 skb->ip_summed = CHECKSUM_COMPLETE;
564                         }
565                         offset += 4;
566                 }
567                 if (flags&GRE_KEY) {
568                         key = *(__be32*)(h + offset);
569                         offset += 4;
570                 }
571                 if (flags&GRE_SEQ) {
572                         seqno = ntohl(*(__be32*)(h + offset));
573                         offset += 4;
574                 }
575         }
576
            /* Protocol type is the second 16-bit word of the GRE header. */
577         gre_proto = *(__be16 *)(h + 2);
578
            /* The read lock is held from the lookup until the skb is handed off. */
579         read_lock(&ipgre_lock);
580         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
581                                           iph->saddr, iph->daddr, key,
582                                           gre_proto))) {
583                 struct net_device_stats *stats = &tunnel->dev->stats;
584
585                 secpath_reset(skb);
586
587                 skb->protocol = gre_proto;
588                 /* WCCP version 1 and 2 protocol decoding.
589                  * - Change protocol to IP
590                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
591                  */
592                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
593                         skb->protocol = htons(ETH_P_IP);
594                         if ((*(h + offset) & 0xF0) != 0x40)
595                                 offset += 4;
596                 }
597
                    /* Strip the GRE header; offset is its total length. */
598                 skb->mac_header = skb->network_header;
599                 __pskb_pull(skb, offset);
600                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
601                 skb->pkt_type = PACKET_HOST;
602 #ifdef CONFIG_NET_IPGRE_BROADCAST
603                 if (ipv4_is_multicast(iph->daddr)) {
604                         /* Looped back packet, drop it! */
605                         if (skb->rtable->fl.iif == 0)
606                                 goto drop;
607                         stats->multicast++;
608                         skb->pkt_type = PACKET_BROADCAST;
609                 }
610 #endif
611
                    /*
                     * Drop when the packet's checksum did not verify, or when
                     * the tunnel is configured with GRE_CSUM on input but the
                     * packet carries no checksum.
                     */
612                 if (((flags&GRE_CSUM) && csum) ||
613                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
614                         stats->rx_crc_errors++;
615                         stats->rx_errors++;
616                         goto drop;
617                 }
                    /*
                     * With GRE_SEQ configured on input, drop packets that
                     * lack a sequence number or whose number is behind the
                     * next expected one (compared as a signed difference).
                     */
618                 if (tunnel->parms.i_flags&GRE_SEQ) {
619                         if (!(flags&GRE_SEQ) ||
620                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
621                                 stats->rx_fifo_errors++;
622                                 stats->rx_errors++;
623                                 goto drop;
624                         }
625                         tunnel->i_seqno = seqno + 1;
626                 }
627
                    /* Length for rx_bytes, taken before the Ethernet handling below. */
628                 len = skb->len;
629
630                 /* Warning: All skb pointers will be invalidated! */
631                 if (tunnel->dev->type == ARPHRD_ETHER) {
632                         if (!pskb_may_pull(skb, ETH_HLEN)) {
633                                 stats->rx_length_errors++;
634                                 stats->rx_errors++;
635                                 goto drop;
636                         }
637
                            /* Re-read the outer header pointer after the pull. */
638                         iph = ip_hdr(skb);
639                         skb->protocol = eth_type_trans(skb, tunnel->dev);
640                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
641                 }
642
643                 stats->rx_packets++;
644                 stats->rx_bytes += len;
645                 skb->dev = tunnel->dev;
646                 dst_release(skb->dst);
647                 skb->dst = NULL;
648                 nf_reset(skb);
649
                    /* iph still refers to the outer header; propagate its ECN mark. */
650                 skb_reset_network_header(skb);
651                 ipgre_ecn_decapsulate(iph, skb);
652
653                 netif_rx(skb);
654                 read_unlock(&ipgre_lock);
655                 return(0);
656         }
            /* No tunnel matched this packet. */
657         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
658
659 drop:
660         read_unlock(&ipgre_lock);
661 drop_nolock:
662         kfree_skb(skb);
663         return(0);
664 }
665
666 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
667 {
668         struct ip_tunnel *tunnel = netdev_priv(dev);
669         struct net_device_stats *stats = &tunnel->dev->stats;
670         struct iphdr  *old_iph = ip_hdr(skb);
671         struct iphdr  *tiph;
672         u8     tos;
673         __be16 df;
674         struct rtable *rt;                      /* Route to the other host */
675         struct net_device *tdev;                        /* Device to other host */
676         struct iphdr  *iph;                     /* Our new IP header */
677         unsigned int max_headroom;              /* The extra header space needed */
678         int    gre_hlen;
679         __be32 dst;
680         int    mtu;
681
682         if (tunnel->recursion++) {
683                 stats->collisions++;
684                 goto tx_error;
685         }
686
687         if (dev->type == ARPHRD_ETHER)
688                 IPCB(skb)->flags = 0;
689
690         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
691                 gre_hlen = 0;
692                 tiph = (struct iphdr *)skb->data;
693         } else {
694                 gre_hlen = tunnel->hlen;
695                 tiph = &tunnel->parms.iph;
696         }
697
698         if ((dst = tiph->daddr) == 0) {
699                 /* NBMA tunnel */
700
701                 if (skb->dst == NULL) {
702                         stats->tx_fifo_errors++;
703                         goto tx_error;
704                 }
705
706                 if (skb->protocol == htons(ETH_P_IP)) {
707                         rt = skb->rtable;
708                         if ((dst = rt->rt_gateway) == 0)
709                                 goto tx_error_icmp;
710                 }
711 #ifdef CONFIG_IPV6
712                 else if (skb->protocol == htons(ETH_P_IPV6)) {
713                         struct in6_addr *addr6;
714                         int addr_type;
715                         struct neighbour *neigh = skb->dst->neighbour;
716
717                         if (neigh == NULL)
718                                 goto tx_error;
719
720                         addr6 = (struct in6_addr *)&neigh->primary_key;
721                         addr_type = ipv6_addr_type(addr6);
722
723                         if (addr_type == IPV6_ADDR_ANY) {
724                                 addr6 = &ipv6_hdr(skb)->daddr;
725                                 addr_type = ipv6_addr_type(addr6);
726                         }
727
728                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
729                                 goto tx_error_icmp;
730
731                         dst = addr6->s6_addr32[3];
732                 }
733 #endif
734                 else
735                         goto tx_error;
736         }
737
738         tos = tiph->tos;
739         if (tos&1) {
740                 if (skb->protocol == htons(ETH_P_IP))
741                         tos = old_iph->tos;
742                 tos &= ~1;
743         }
744
745         {
746                 struct flowi fl = { .oif = tunnel->parms.link,
747                                     .nl_u = { .ip4_u =
748                                               { .daddr = dst,
749                                                 .saddr = tiph->saddr,
750                                                 .tos = RT_TOS(tos) } },
751                                     .proto = IPPROTO_GRE };
752                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
753                         stats->tx_carrier_errors++;
754                         goto tx_error;
755                 }
756         }
757         tdev = rt->u.dst.dev;
758
759         if (tdev == dev) {
760                 ip_rt_put(rt);
761                 stats->collisions++;
762                 goto tx_error;
763         }
764
765         df = tiph->frag_off;
766         if (df)
767                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
768         else
769                 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
770
771         if (skb->dst)
772                 skb->dst->ops->update_pmtu(skb->dst, mtu);
773
774         if (skb->protocol == htons(ETH_P_IP)) {
775                 df |= (old_iph->frag_off&htons(IP_DF));
776
777                 if ((old_iph->frag_off&htons(IP_DF)) &&
778                     mtu < ntohs(old_iph->tot_len)) {
779                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
780                         ip_rt_put(rt);
781                         goto tx_error;
782                 }
783         }
784 #ifdef CONFIG_IPV6
785         else if (skb->protocol == htons(ETH_P_IPV6)) {
786                 struct rt6_info *rt6 = (struct rt6_info *)skb->dst;
787
788                 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
789                         if ((tunnel->parms.iph.daddr &&
790                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
791                             rt6->rt6i_dst.plen == 128) {
792                                 rt6->rt6i_flags |= RTF_MODIFIED;
793                                 skb->dst->metrics[RTAX_MTU-1] = mtu;
794                         }
795                 }
796
797                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
798                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
799                         ip_rt_put(rt);
800                         goto tx_error;
801                 }
802         }
803 #endif
804
805         if (tunnel->err_count > 0) {
806                 if (time_before(jiffies,
807                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
808                         tunnel->err_count--;
809
810                         dst_link_failure(skb);
811                 } else
812                         tunnel->err_count = 0;
813         }
814
815         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
816
817         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
818             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
819                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
820                 if (!new_skb) {
821                         ip_rt_put(rt);
822                         stats->tx_dropped++;
823                         dev_kfree_skb(skb);
824                         tunnel->recursion--;
825                         return 0;
826                 }
827                 if (skb->sk)
828                         skb_set_owner_w(new_skb, skb->sk);
829                 dev_kfree_skb(skb);
830                 skb = new_skb;
831                 old_iph = ip_hdr(skb);
832         }
833
834         skb_reset_transport_header(skb);
835         skb_push(skb, gre_hlen);
836         skb_reset_network_header(skb);
837         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
838         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
839                               IPSKB_REROUTED);
840         dst_release(skb->dst);
841         skb->dst = &rt->u.dst;
842
843         /*
844          *      Push down and install the IPIP header.
845          */
846
847         iph                     =       ip_hdr(skb);
848         iph->version            =       4;
849         iph->ihl                =       sizeof(struct iphdr) >> 2;
850         iph->frag_off           =       df;
851         iph->protocol           =       IPPROTO_GRE;
852         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
853         iph->daddr              =       rt->rt_dst;
854         iph->saddr              =       rt->rt_src;
855
856         if ((iph->ttl = tiph->ttl) == 0) {
857                 if (skb->protocol == htons(ETH_P_IP))
858                         iph->ttl = old_iph->ttl;
859 #ifdef CONFIG_IPV6
860                 else if (skb->protocol == htons(ETH_P_IPV6))
861                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
862 #endif
863                 else
864                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
865         }
866
867         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
868         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
869                                    htons(ETH_P_TEB) : skb->protocol;
870
871         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
872                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
873
874                 if (tunnel->parms.o_flags&GRE_SEQ) {
875                         ++tunnel->o_seqno;
876                         *ptr = htonl(tunnel->o_seqno);
877                         ptr--;
878                 }
879                 if (tunnel->parms.o_flags&GRE_KEY) {
880                         *ptr = tunnel->parms.o_key;
881                         ptr--;
882                 }
883                 if (tunnel->parms.o_flags&GRE_CSUM) {
884                         *ptr = 0;
885                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
886                 }
887         }
888
889         nf_reset(skb);
890
891         IPTUNNEL_XMIT();
892         tunnel->recursion--;
893         return 0;
894
895 tx_error_icmp:
896         dst_link_failure(skb);
897
898 tx_error:
899         stats->tx_errors++;
900         dev_kfree_skb(skb);
901         tunnel->recursion--;
902         return 0;
903 }
904
905 static int ipgre_tunnel_bind_dev(struct net_device *dev)
906 {
907         struct net_device *tdev = NULL;
908         struct ip_tunnel *tunnel;
909         struct iphdr *iph;
910         int hlen = LL_MAX_HEADER;
911         int mtu = ETH_DATA_LEN;
912         int addend = sizeof(struct iphdr) + 4;
913
914         tunnel = netdev_priv(dev);
915         iph = &tunnel->parms.iph;
916
917         /* Guess output device to choose reasonable mtu and needed_headroom */
918
919         if (iph->daddr) {
920                 struct flowi fl = { .oif = tunnel->parms.link,
921                                     .nl_u = { .ip4_u =
922                                               { .daddr = iph->daddr,
923                                                 .saddr = iph->saddr,
924                                                 .tos = RT_TOS(iph->tos) } },
925                                     .proto = IPPROTO_GRE };
926                 struct rtable *rt;
927                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
928                         tdev = rt->u.dst.dev;
929                         ip_rt_put(rt);
930                 }
931
932                 if (dev->type != ARPHRD_ETHER)
933                         dev->flags |= IFF_POINTOPOINT;
934         }
935
936         if (!tdev && tunnel->parms.link)
937                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
938
939         if (tdev) {
940                 hlen = tdev->hard_header_len + tdev->needed_headroom;
941                 mtu = tdev->mtu;
942         }
943         dev->iflink = tunnel->parms.link;
944
945         /* Precalculate GRE options length */
946         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
947                 if (tunnel->parms.o_flags&GRE_CSUM)
948                         addend += 4;
949                 if (tunnel->parms.o_flags&GRE_KEY)
950                         addend += 4;
951                 if (tunnel->parms.o_flags&GRE_SEQ)
952                         addend += 4;
953         }
954         dev->needed_headroom = addend + hlen;
955         mtu -= dev->hard_header_len - addend;
956
957         if (mtu < 68)
958                 mtu = 68;
959
960         tunnel->hlen = addend;
961
962         return mtu;
963 }
964
965 static int
966 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
967 {
968         int err = 0;
969         struct ip_tunnel_parm p;
970         struct ip_tunnel *t;
971         struct net *net = dev_net(dev);
972         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
973
974         switch (cmd) {
975         case SIOCGETTUNNEL:
976                 t = NULL;
977                 if (dev == ign->fb_tunnel_dev) {
978                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
979                                 err = -EFAULT;
980                                 break;
981                         }
982                         t = ipgre_tunnel_locate(net, &p, 0);
983                 }
984                 if (t == NULL)
985                         t = netdev_priv(dev);
986                 memcpy(&p, &t->parms, sizeof(p));
987                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
988                         err = -EFAULT;
989                 break;
990
991         case SIOCADDTUNNEL:
992         case SIOCCHGTUNNEL:
993                 err = -EPERM;
994                 if (!capable(CAP_NET_ADMIN))
995                         goto done;
996
997                 err = -EFAULT;
998                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
999                         goto done;
1000
1001                 err = -EINVAL;
1002                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1003                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1004                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1005                         goto done;
1006                 if (p.iph.ttl)
1007                         p.iph.frag_off |= htons(IP_DF);
1008
1009                 if (!(p.i_flags&GRE_KEY))
1010                         p.i_key = 0;
1011                 if (!(p.o_flags&GRE_KEY))
1012                         p.o_key = 0;
1013
1014                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1015
1016                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1017                         if (t != NULL) {
1018                                 if (t->dev != dev) {
1019                                         err = -EEXIST;
1020                                         break;
1021                                 }
1022                         } else {
1023                                 unsigned nflags = 0;
1024
1025                                 t = netdev_priv(dev);
1026
1027                                 if (ipv4_is_multicast(p.iph.daddr))
1028                                         nflags = IFF_BROADCAST;
1029                                 else if (p.iph.daddr)
1030                                         nflags = IFF_POINTOPOINT;
1031
1032                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1033                                         err = -EINVAL;
1034                                         break;
1035                                 }
1036                                 ipgre_tunnel_unlink(ign, t);
1037                                 t->parms.iph.saddr = p.iph.saddr;
1038                                 t->parms.iph.daddr = p.iph.daddr;
1039                                 t->parms.i_key = p.i_key;
1040                                 t->parms.o_key = p.o_key;
1041                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1042                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1043                                 ipgre_tunnel_link(ign, t);
1044                                 netdev_state_change(dev);
1045                         }
1046                 }
1047
1048                 if (t) {
1049                         err = 0;
1050                         if (cmd == SIOCCHGTUNNEL) {
1051                                 t->parms.iph.ttl = p.iph.ttl;
1052                                 t->parms.iph.tos = p.iph.tos;
1053                                 t->parms.iph.frag_off = p.iph.frag_off;
1054                                 if (t->parms.link != p.link) {
1055                                         t->parms.link = p.link;
1056                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1057                                         netdev_state_change(dev);
1058                                 }
1059                         }
1060                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1061                                 err = -EFAULT;
1062                 } else
1063                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1064                 break;
1065
1066         case SIOCDELTUNNEL:
1067                 err = -EPERM;
1068                 if (!capable(CAP_NET_ADMIN))
1069                         goto done;
1070
1071                 if (dev == ign->fb_tunnel_dev) {
1072                         err = -EFAULT;
1073                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1074                                 goto done;
1075                         err = -ENOENT;
1076                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1077                                 goto done;
1078                         err = -EPERM;
1079                         if (t == netdev_priv(ign->fb_tunnel_dev))
1080                                 goto done;
1081                         dev = t->dev;
1082                 }
1083                 unregister_netdevice(dev);
1084                 err = 0;
1085                 break;
1086
1087         default:
1088                 err = -EINVAL;
1089         }
1090
1091 done:
1092         return err;
1093 }
1094
1095 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1096 {
1097         struct ip_tunnel *tunnel = netdev_priv(dev);
1098         if (new_mtu < 68 ||
1099             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1100                 return -EINVAL;
1101         dev->mtu = new_mtu;
1102         return 0;
1103 }
1104
1105 /* Nice toy. Unfortunately, useless in real life :-)
1106    It allows to construct virtual multiprotocol broadcast "LAN"
1107    over the Internet, provided multicast routing is tuned.
1108
1109
1110    I have no idea was this bicycle invented before me,
1111    so that I had to set ARPHRD_IPGRE to a random value.
1112    I have an impression, that Cisco could make something similar,
1113    but this feature is apparently missing in IOS<=11.2(8).
1114
1115    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1116    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1117
1118    ping -t 255 224.66.66.66
1119
1120    If nobody answers, mbone does not work.
1121
1122    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1123    ip addr add 10.66.66.<somewhat>/24 dev Universe
1124    ifconfig Universe up
1125    ifconfig Universe add fe80::<Your_real_addr>/10
1126    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1127    ftp 10.66.66.66
1128    ...
1129    ftp fec0:6666:6666::193.233.7.65
1130    ...
1131
1132  */
1133
1134 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1135                         unsigned short type,
1136                         const void *daddr, const void *saddr, unsigned len)
1137 {
1138         struct ip_tunnel *t = netdev_priv(dev);
1139         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1140         __be16 *p = (__be16*)(iph+1);
1141
1142         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1143         p[0]            = t->parms.o_flags;
1144         p[1]            = htons(type);
1145
1146         /*
1147          *      Set the source hardware address.
1148          */
1149
1150         if (saddr)
1151                 memcpy(&iph->saddr, saddr, 4);
1152
1153         if (daddr) {
1154                 memcpy(&iph->daddr, daddr, 4);
1155                 return t->hlen;
1156         }
1157         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1158                 return t->hlen;
1159
1160         return -t->hlen;
1161 }
1162
1163 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1164 {
1165         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1166         memcpy(haddr, &iph->saddr, 4);
1167         return 4;
1168 }
1169
1170 static const struct header_ops ipgre_header_ops = {
1171         .create = ipgre_header,
1172         .parse  = ipgre_header_parse,
1173 };
1174
1175 #ifdef CONFIG_NET_IPGRE_BROADCAST
1176 static int ipgre_open(struct net_device *dev)
1177 {
1178         struct ip_tunnel *t = netdev_priv(dev);
1179
1180         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1181                 struct flowi fl = { .oif = t->parms.link,
1182                                     .nl_u = { .ip4_u =
1183                                               { .daddr = t->parms.iph.daddr,
1184                                                 .saddr = t->parms.iph.saddr,
1185                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1186                                     .proto = IPPROTO_GRE };
1187                 struct rtable *rt;
1188                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1189                         return -EADDRNOTAVAIL;
1190                 dev = rt->u.dst.dev;
1191                 ip_rt_put(rt);
1192                 if (__in_dev_get_rtnl(dev) == NULL)
1193                         return -EADDRNOTAVAIL;
1194                 t->mlink = dev->ifindex;
1195                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1196         }
1197         return 0;
1198 }
1199
1200 static int ipgre_close(struct net_device *dev)
1201 {
1202         struct ip_tunnel *t = netdev_priv(dev);
1203
1204         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1205                 struct in_device *in_dev;
1206                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1207                 if (in_dev) {
1208                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1209                         in_dev_put(in_dev);
1210                 }
1211         }
1212         return 0;
1213 }
1214
1215 #endif
1216
1217 static const struct net_device_ops ipgre_netdev_ops = {
1218         .ndo_init               = ipgre_tunnel_init,
1219         .ndo_uninit             = ipgre_tunnel_uninit,
1220 #ifdef CONFIG_NET_IPGRE_BROADCAST
1221         .ndo_open               = ipgre_open,
1222         .ndo_stop               = ipgre_close,
1223 #endif
1224         .ndo_start_xmit         = ipgre_tunnel_xmit,
1225         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1226         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1227 };
1228
1229 static void ipgre_tunnel_setup(struct net_device *dev)
1230 {
1231         dev->netdev_ops         = &ipgre_netdev_ops;
1232         dev->destructor         = free_netdev;
1233
1234         dev->type               = ARPHRD_IPGRE;
1235         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1236         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1237         dev->flags              = IFF_NOARP;
1238         dev->iflink             = 0;
1239         dev->addr_len           = 4;
1240         dev->features           |= NETIF_F_NETNS_LOCAL;
1241 }
1242
1243 static int ipgre_tunnel_init(struct net_device *dev)
1244 {
1245         struct ip_tunnel *tunnel;
1246         struct iphdr *iph;
1247
1248         tunnel = netdev_priv(dev);
1249         iph = &tunnel->parms.iph;
1250
1251         tunnel->dev = dev;
1252         strcpy(tunnel->parms.name, dev->name);
1253
1254         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1255         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1256
1257         if (iph->daddr) {
1258 #ifdef CONFIG_NET_IPGRE_BROADCAST
1259                 if (ipv4_is_multicast(iph->daddr)) {
1260                         if (!iph->saddr)
1261                                 return -EINVAL;
1262                         dev->flags = IFF_BROADCAST;
1263                         dev->header_ops = &ipgre_header_ops;
1264                 }
1265 #endif
1266         } else
1267                 dev->header_ops = &ipgre_header_ops;
1268
1269         return 0;
1270 }
1271
1272 static void ipgre_fb_tunnel_init(struct net_device *dev)
1273 {
1274         struct ip_tunnel *tunnel = netdev_priv(dev);
1275         struct iphdr *iph = &tunnel->parms.iph;
1276         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1277
1278         tunnel->dev = dev;
1279         strcpy(tunnel->parms.name, dev->name);
1280
1281         iph->version            = 4;
1282         iph->protocol           = IPPROTO_GRE;
1283         iph->ihl                = 5;
1284         tunnel->hlen            = sizeof(struct iphdr) + 4;
1285
1286         dev_hold(dev);
1287         ign->tunnels_wc[0]      = tunnel;
1288 }
1289
1290
1291 static struct net_protocol ipgre_protocol = {
1292         .handler        =       ipgre_rcv,
1293         .err_handler    =       ipgre_err,
1294         .netns_ok       =       1,
1295 };
1296
1297 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1298 {
1299         int prio;
1300
1301         for (prio = 0; prio < 4; prio++) {
1302                 int h;
1303                 for (h = 0; h < HASH_SIZE; h++) {
1304                         struct ip_tunnel *t;
1305                         while ((t = ign->tunnels[prio][h]) != NULL)
1306                                 unregister_netdevice(t->dev);
1307                 }
1308         }
1309 }
1310
1311 static int ipgre_init_net(struct net *net)
1312 {
1313         int err;
1314         struct ipgre_net *ign;
1315
1316         err = -ENOMEM;
1317         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1318         if (ign == NULL)
1319                 goto err_alloc;
1320
1321         err = net_assign_generic(net, ipgre_net_id, ign);
1322         if (err < 0)
1323                 goto err_assign;
1324
1325         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1326                                            ipgre_tunnel_setup);
1327         if (!ign->fb_tunnel_dev) {
1328                 err = -ENOMEM;
1329                 goto err_alloc_dev;
1330         }
1331         dev_net_set(ign->fb_tunnel_dev, net);
1332
1333         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1334         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1335
1336         if ((err = register_netdev(ign->fb_tunnel_dev)))
1337                 goto err_reg_dev;
1338
1339         return 0;
1340
1341 err_reg_dev:
1342         free_netdev(ign->fb_tunnel_dev);
1343 err_alloc_dev:
1344         /* nothing */
1345 err_assign:
1346         kfree(ign);
1347 err_alloc:
1348         return err;
1349 }
1350
1351 static void ipgre_exit_net(struct net *net)
1352 {
1353         struct ipgre_net *ign;
1354
1355         ign = net_generic(net, ipgre_net_id);
1356         rtnl_lock();
1357         ipgre_destroy_tunnels(ign);
1358         rtnl_unlock();
1359         kfree(ign);
1360 }
1361
1362 static struct pernet_operations ipgre_net_ops = {
1363         .init = ipgre_init_net,
1364         .exit = ipgre_exit_net,
1365 };
1366
1367 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1368 {
1369         __be16 flags;
1370
1371         if (!data)
1372                 return 0;
1373
1374         flags = 0;
1375         if (data[IFLA_GRE_IFLAGS])
1376                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1377         if (data[IFLA_GRE_OFLAGS])
1378                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1379         if (flags & (GRE_VERSION|GRE_ROUTING))
1380                 return -EINVAL;
1381
1382         return 0;
1383 }
1384
1385 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1386 {
1387         __be32 daddr;
1388
1389         if (tb[IFLA_ADDRESS]) {
1390                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1391                         return -EINVAL;
1392                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1393                         return -EADDRNOTAVAIL;
1394         }
1395
1396         if (!data)
1397                 goto out;
1398
1399         if (data[IFLA_GRE_REMOTE]) {
1400                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1401                 if (!daddr)
1402                         return -EINVAL;
1403         }
1404
1405 out:
1406         return ipgre_tunnel_validate(tb, data);
1407 }
1408
1409 static void ipgre_netlink_parms(struct nlattr *data[],
1410                                 struct ip_tunnel_parm *parms)
1411 {
1412         memset(parms, 0, sizeof(*parms));
1413
1414         parms->iph.protocol = IPPROTO_GRE;
1415
1416         if (!data)
1417                 return;
1418
1419         if (data[IFLA_GRE_LINK])
1420                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1421
1422         if (data[IFLA_GRE_IFLAGS])
1423                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1424
1425         if (data[IFLA_GRE_OFLAGS])
1426                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1427
1428         if (data[IFLA_GRE_IKEY])
1429                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1430
1431         if (data[IFLA_GRE_OKEY])
1432                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1433
1434         if (data[IFLA_GRE_LOCAL])
1435                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1436
1437         if (data[IFLA_GRE_REMOTE])
1438                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1439
1440         if (data[IFLA_GRE_TTL])
1441                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1442
1443         if (data[IFLA_GRE_TOS])
1444                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1445
1446         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1447                 parms->iph.frag_off = htons(IP_DF);
1448 }
1449
1450 static int ipgre_tap_init(struct net_device *dev)
1451 {
1452         struct ip_tunnel *tunnel;
1453
1454         tunnel = netdev_priv(dev);
1455
1456         tunnel->dev = dev;
1457         strcpy(tunnel->parms.name, dev->name);
1458
1459         ipgre_tunnel_bind_dev(dev);
1460
1461         return 0;
1462 }
1463
1464 static const struct net_device_ops ipgre_tap_netdev_ops = {
1465         .ndo_init               = ipgre_tap_init,
1466         .ndo_uninit             = ipgre_tunnel_uninit,
1467         .ndo_start_xmit         = ipgre_tunnel_xmit,
1468         .ndo_set_mac_address    = eth_mac_addr,
1469         .ndo_validate_addr      = eth_validate_addr,
1470         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1471 };
1472
1473 static void ipgre_tap_setup(struct net_device *dev)
1474 {
1475
1476         ether_setup(dev);
1477
1478         dev->netdev_ops         = &ipgre_netdev_ops;
1479         dev->destructor         = free_netdev;
1480
1481         dev->iflink             = 0;
1482         dev->features           |= NETIF_F_NETNS_LOCAL;
1483 }
1484
1485 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1486                          struct nlattr *data[])
1487 {
1488         struct ip_tunnel *nt;
1489         struct net *net = dev_net(dev);
1490         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1491         int mtu;
1492         int err;
1493
1494         nt = netdev_priv(dev);
1495         ipgre_netlink_parms(data, &nt->parms);
1496
1497         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1498                 return -EEXIST;
1499
1500         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1501                 random_ether_addr(dev->dev_addr);
1502
1503         mtu = ipgre_tunnel_bind_dev(dev);
1504         if (!tb[IFLA_MTU])
1505                 dev->mtu = mtu;
1506
1507         err = register_netdevice(dev);
1508         if (err)
1509                 goto out;
1510
1511         dev_hold(dev);
1512         ipgre_tunnel_link(ign, nt);
1513
1514 out:
1515         return err;
1516 }
1517
1518 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1519                             struct nlattr *data[])
1520 {
1521         struct ip_tunnel *t, *nt;
1522         struct net *net = dev_net(dev);
1523         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1524         struct ip_tunnel_parm p;
1525         int mtu;
1526
1527         if (dev == ign->fb_tunnel_dev)
1528                 return -EINVAL;
1529
1530         nt = netdev_priv(dev);
1531         ipgre_netlink_parms(data, &p);
1532
1533         t = ipgre_tunnel_locate(net, &p, 0);
1534
1535         if (t) {
1536                 if (t->dev != dev)
1537                         return -EEXIST;
1538         } else {
1539                 unsigned nflags = 0;
1540
1541                 t = nt;
1542
1543                 if (ipv4_is_multicast(p.iph.daddr))
1544                         nflags = IFF_BROADCAST;
1545                 else if (p.iph.daddr)
1546                         nflags = IFF_POINTOPOINT;
1547
1548                 if ((dev->flags ^ nflags) &
1549                     (IFF_POINTOPOINT | IFF_BROADCAST))
1550                         return -EINVAL;
1551
1552                 ipgre_tunnel_unlink(ign, t);
1553                 t->parms.iph.saddr = p.iph.saddr;
1554                 t->parms.iph.daddr = p.iph.daddr;
1555                 t->parms.i_key = p.i_key;
1556                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1557                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1558                 ipgre_tunnel_link(ign, t);
1559                 netdev_state_change(dev);
1560         }
1561
1562         t->parms.o_key = p.o_key;
1563         t->parms.iph.ttl = p.iph.ttl;
1564         t->parms.iph.tos = p.iph.tos;
1565         t->parms.iph.frag_off = p.iph.frag_off;
1566
1567         if (t->parms.link != p.link) {
1568                 t->parms.link = p.link;
1569                 mtu = ipgre_tunnel_bind_dev(dev);
1570                 if (!tb[IFLA_MTU])
1571                         dev->mtu = mtu;
1572                 netdev_state_change(dev);
1573         }
1574
1575         return 0;
1576 }
1577
1578 static size_t ipgre_get_size(const struct net_device *dev)
1579 {
1580         return
1581                 /* IFLA_GRE_LINK */
1582                 nla_total_size(4) +
1583                 /* IFLA_GRE_IFLAGS */
1584                 nla_total_size(2) +
1585                 /* IFLA_GRE_OFLAGS */
1586                 nla_total_size(2) +
1587                 /* IFLA_GRE_IKEY */
1588                 nla_total_size(4) +
1589                 /* IFLA_GRE_OKEY */
1590                 nla_total_size(4) +
1591                 /* IFLA_GRE_LOCAL */
1592                 nla_total_size(4) +
1593                 /* IFLA_GRE_REMOTE */
1594                 nla_total_size(4) +
1595                 /* IFLA_GRE_TTL */
1596                 nla_total_size(1) +
1597                 /* IFLA_GRE_TOS */
1598                 nla_total_size(1) +
1599                 /* IFLA_GRE_PMTUDISC */
1600                 nla_total_size(1) +
1601                 0;
1602 }
1603
1604 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1605 {
1606         struct ip_tunnel *t = netdev_priv(dev);
1607         struct ip_tunnel_parm *p = &t->parms;
1608
1609         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1610         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1611         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1612         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1613         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1614         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1615         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1616         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1617         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1618         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1619
1620         return 0;
1621
1622 nla_put_failure:
1623         return -EMSGSIZE;
1624 }
1625
1626 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1627         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1628         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1629         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1630         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1631         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1632         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1633         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1634         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1635         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1636         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1637 };
1638
1639 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1640         .kind           = "gre",
1641         .maxtype        = IFLA_GRE_MAX,
1642         .policy         = ipgre_policy,
1643         .priv_size      = sizeof(struct ip_tunnel),
1644         .setup          = ipgre_tunnel_setup,
1645         .validate       = ipgre_tunnel_validate,
1646         .newlink        = ipgre_newlink,
1647         .changelink     = ipgre_changelink,
1648         .get_size       = ipgre_get_size,
1649         .fill_info      = ipgre_fill_info,
1650 };
1651
1652 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1653         .kind           = "gretap",
1654         .maxtype        = IFLA_GRE_MAX,
1655         .policy         = ipgre_policy,
1656         .priv_size      = sizeof(struct ip_tunnel),
1657         .setup          = ipgre_tap_setup,
1658         .validate       = ipgre_tap_validate,
1659         .newlink        = ipgre_newlink,
1660         .changelink     = ipgre_changelink,
1661         .get_size       = ipgre_get_size,
1662         .fill_info      = ipgre_fill_info,
1663 };
1664
1665 /*
1666  *      And now the modules code and kernel interface.
1667  */
1668
1669 static int __init ipgre_init(void)
1670 {
1671         int err;
1672
1673         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1674
1675         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1676                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1677                 return -EAGAIN;
1678         }
1679
1680         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1681         if (err < 0)
1682                 goto gen_device_failed;
1683
1684         err = rtnl_link_register(&ipgre_link_ops);
1685         if (err < 0)
1686                 goto rtnl_link_failed;
1687
1688         err = rtnl_link_register(&ipgre_tap_ops);
1689         if (err < 0)
1690                 goto tap_ops_failed;
1691
1692 out:
1693         return err;
1694
1695 tap_ops_failed:
1696         rtnl_link_unregister(&ipgre_link_ops);
1697 rtnl_link_failed:
1698         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1699 gen_device_failed:
1700         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1701         goto out;
1702 }
1703
1704 static void __exit ipgre_fini(void)
1705 {
1706         rtnl_link_unregister(&ipgre_tap_ops);
1707         rtnl_link_unregister(&ipgre_link_ops);
1708         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1709         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1710                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1711 }
1712
1713 module_init(ipgre_init);
1714 module_exit(ipgre_fini);
1715 MODULE_LICENSE("GPL");
1716 MODULE_ALIAS_RTNL_LINK("gre");
1717 MODULE_ALIAS_RTNL_LINK("gretap");