Merge branch 'for-rmk' of git://git.kernel.org/pub/scm/linux/kernel/git/ycmiao/pxa...
[linux-2.6] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 /*
54    Problems & solutions
55    --------------------
56
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
66    solution, but it supposes maintaing new variable in ALL
67    skb, even if no tunneling is used.
68
69    Current solution: t->recursion lock breaks dead loops. It looks
70    like dev->tbusy flag, but I preferred new variable, because
71    the semantics is different. One day, when hard_start_xmit
72    will be multithreaded we will have to use skb->encapsulation.
73
74
75
76    2. Networking dead loops would not kill routers, but would really
77    kill network. IP hop limit plays role of "t->recursion" in this case,
78    if we copy it from packet being encapsulated to upper header.
79    It is very good solution, but it introduces two problems:
80
81    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82      do not work over tunnels.
83    - traceroute does not work. I planned to relay ICMP from tunnel,
84      so that this problem would be solved and traceroute output
85      would even more informative. This idea appeared to be wrong:
86      only Linux complies to rfc1812 now (yes, guys, Linux is the only
87      true router now :-)), all routers (at least, in neighbourhood of mine)
88      return only 8 bytes of payload. It is the end.
89
90    Hence, if we want that OSPF worked or traceroute said something reasonable,
91    we should search for another solution.
92
93    One of them is to parse packet trying to detect inner encapsulation
94    made by our node. It is difficult or even impossible, especially,
95    taking into account fragmentation. TO be short, tt is not solution at all.
96
97    Current solution: The solution was UNEXPECTEDLY SIMPLE.
98    We force DF flag on tunnels with preconfigured hop limit,
99    that is ALL. :-) Well, it does not remove the problem completely,
100    but exponential growth of network traffic is changed to linear
101    (branches, that exceed pmtu are pruned) and tunnel mtu
102    fastly degrades to value <68, where looping stops.
103    Yes, it is not good if there exists a router in the loop,
104    which does not force DF, even when encapsulating packets have DF set.
105    But it is not our problem! Nobody could accuse us, we made
106    all that we could make. Even if it is your gated who injected
107    fatal route to network, even if it were you who configured
108    fatal static route: you are innocent. :-)
109
110
111
112    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113    practically identical code. It would be good to glue them
114    together, but it is not very evident, how to make them modular.
115    sit is integral part of IPv6, ipip and gre are naturally modular.
116    We could extract common parts (hash table, ioctl etc)
117    to a separate module (ip_tunnel.c).
118
119    Alexey Kuznetsov.
120  */
121
122 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
123 static int ipgre_tunnel_init(struct net_device *dev);
124 static void ipgre_tunnel_setup(struct net_device *dev);
125 static int ipgre_tunnel_bind_dev(struct net_device *dev);
126
127 /* Fallback tunnel: no source, no destination, no key, no options */
128
129 #define HASH_SIZE  16
130
131 static int ipgre_net_id;
132 struct ipgre_net {
133         struct ip_tunnel *tunnels[4][HASH_SIZE];
134
135         struct net_device *fb_tunnel_dev;
136 };
137
138 /* Tunnel hash table */
139
140 /*
141    4 hash tables:
142
143    3: (remote,local)
144    2: (remote,*)
145    1: (*,local)
146    0: (*,*)
147
148    We require exact key match i.e. if a key is present in packet
149    it will match only tunnel with the same key; if it is not present,
150    it will match only keyless tunnel.
151
152    All keysless packets, if not matched configured keyless tunnels
153    will match fallback tunnel.
154  */
155
156 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
157
158 #define tunnels_r_l     tunnels[3]
159 #define tunnels_r       tunnels[2]
160 #define tunnels_l       tunnels[1]
161 #define tunnels_wc      tunnels[0]
162
163 static DEFINE_RWLOCK(ipgre_lock);
164
165 /* Given src, dst and key, find appropriate for input tunnel. */
166
167 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
168                                               __be32 remote, __be32 local,
169                                               __be32 key, __be16 gre_proto)
170 {
171         struct net *net = dev_net(dev);
172         int link = dev->ifindex;
173         unsigned h0 = HASH(remote);
174         unsigned h1 = HASH(key);
175         struct ip_tunnel *t, *cand = NULL;
176         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
177         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
178                        ARPHRD_ETHER : ARPHRD_IPGRE;
179         int score, cand_score = 4;
180
181         for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
182                 if (local != t->parms.iph.saddr ||
183                     remote != t->parms.iph.daddr ||
184                     key != t->parms.i_key ||
185                     !(t->dev->flags & IFF_UP))
186                         continue;
187
188                 if (t->dev->type != ARPHRD_IPGRE &&
189                     t->dev->type != dev_type)
190                         continue;
191
192                 score = 0;
193                 if (t->parms.link != link)
194                         score |= 1;
195                 if (t->dev->type != dev_type)
196                         score |= 2;
197                 if (score == 0)
198                         return t;
199
200                 if (score < cand_score) {
201                         cand = t;
202                         cand_score = score;
203                 }
204         }
205
206         for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
207                 if (remote != t->parms.iph.daddr ||
208                     key != t->parms.i_key ||
209                     !(t->dev->flags & IFF_UP))
210                         continue;
211
212                 if (t->dev->type != ARPHRD_IPGRE &&
213                     t->dev->type != dev_type)
214                         continue;
215
216                 score = 0;
217                 if (t->parms.link != link)
218                         score |= 1;
219                 if (t->dev->type != dev_type)
220                         score |= 2;
221                 if (score == 0)
222                         return t;
223
224                 if (score < cand_score) {
225                         cand = t;
226                         cand_score = score;
227                 }
228         }
229
230         for (t = ign->tunnels_l[h1]; t; t = t->next) {
231                 if ((local != t->parms.iph.saddr &&
232                      (local != t->parms.iph.daddr ||
233                       !ipv4_is_multicast(local))) ||
234                     key != t->parms.i_key ||
235                     !(t->dev->flags & IFF_UP))
236                         continue;
237
238                 if (t->dev->type != ARPHRD_IPGRE &&
239                     t->dev->type != dev_type)
240                         continue;
241
242                 score = 0;
243                 if (t->parms.link != link)
244                         score |= 1;
245                 if (t->dev->type != dev_type)
246                         score |= 2;
247                 if (score == 0)
248                         return t;
249
250                 if (score < cand_score) {
251                         cand = t;
252                         cand_score = score;
253                 }
254         }
255
256         for (t = ign->tunnels_wc[h1]; t; t = t->next) {
257                 if (t->parms.i_key != key ||
258                     !(t->dev->flags & IFF_UP))
259                         continue;
260
261                 if (t->dev->type != ARPHRD_IPGRE &&
262                     t->dev->type != dev_type)
263                         continue;
264
265                 score = 0;
266                 if (t->parms.link != link)
267                         score |= 1;
268                 if (t->dev->type != dev_type)
269                         score |= 2;
270                 if (score == 0)
271                         return t;
272
273                 if (score < cand_score) {
274                         cand = t;
275                         cand_score = score;
276                 }
277         }
278
279         if (cand != NULL)
280                 return cand;
281
282         if (ign->fb_tunnel_dev->flags & IFF_UP)
283                 return netdev_priv(ign->fb_tunnel_dev);
284
285         return NULL;
286 }
287
288 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
289                 struct ip_tunnel_parm *parms)
290 {
291         __be32 remote = parms->iph.daddr;
292         __be32 local = parms->iph.saddr;
293         __be32 key = parms->i_key;
294         unsigned h = HASH(key);
295         int prio = 0;
296
297         if (local)
298                 prio |= 1;
299         if (remote && !ipv4_is_multicast(remote)) {
300                 prio |= 2;
301                 h ^= HASH(remote);
302         }
303
304         return &ign->tunnels[prio][h];
305 }
306
307 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
308                 struct ip_tunnel *t)
309 {
310         return __ipgre_bucket(ign, &t->parms);
311 }
312
313 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
314 {
315         struct ip_tunnel **tp = ipgre_bucket(ign, t);
316
317         t->next = *tp;
318         write_lock_bh(&ipgre_lock);
319         *tp = t;
320         write_unlock_bh(&ipgre_lock);
321 }
322
323 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
324 {
325         struct ip_tunnel **tp;
326
327         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
328                 if (t == *tp) {
329                         write_lock_bh(&ipgre_lock);
330                         *tp = t->next;
331                         write_unlock_bh(&ipgre_lock);
332                         break;
333                 }
334         }
335 }
336
337 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
338                                            struct ip_tunnel_parm *parms,
339                                            int type)
340 {
341         __be32 remote = parms->iph.daddr;
342         __be32 local = parms->iph.saddr;
343         __be32 key = parms->i_key;
344         int link = parms->link;
345         struct ip_tunnel *t, **tp;
346         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
347
348         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
349                 if (local == t->parms.iph.saddr &&
350                     remote == t->parms.iph.daddr &&
351                     key == t->parms.i_key &&
352                     link == t->parms.link &&
353                     type == t->dev->type)
354                         break;
355
356         return t;
357 }
358
359 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
360                 struct ip_tunnel_parm *parms, int create)
361 {
362         struct ip_tunnel *t, *nt;
363         struct net_device *dev;
364         char name[IFNAMSIZ];
365         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
366
367         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
368         if (t || !create)
369                 return t;
370
371         if (parms->name[0])
372                 strlcpy(name, parms->name, IFNAMSIZ);
373         else
374                 sprintf(name, "gre%%d");
375
376         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
377         if (!dev)
378           return NULL;
379
380         dev_net_set(dev, net);
381
382         if (strchr(name, '%')) {
383                 if (dev_alloc_name(dev, name) < 0)
384                         goto failed_free;
385         }
386
387         nt = netdev_priv(dev);
388         nt->parms = *parms;
389         dev->rtnl_link_ops = &ipgre_link_ops;
390
391         dev->mtu = ipgre_tunnel_bind_dev(dev);
392
393         if (register_netdevice(dev) < 0)
394                 goto failed_free;
395
396         dev_hold(dev);
397         ipgre_tunnel_link(ign, nt);
398         return nt;
399
400 failed_free:
401         free_netdev(dev);
402         return NULL;
403 }
404
405 static void ipgre_tunnel_uninit(struct net_device *dev)
406 {
407         struct net *net = dev_net(dev);
408         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
409
410         ipgre_tunnel_unlink(ign, netdev_priv(dev));
411         dev_put(dev);
412 }
413
414
415 static void ipgre_err(struct sk_buff *skb, u32 info)
416 {
417
418 /* All the routers (except for Linux) return only
419    8 bytes of packet payload. It means, that precise relaying of
420    ICMP in the real Internet is absolutely infeasible.
421
422    Moreover, Cisco "wise men" put GRE key to the third word
423    in GRE header. It makes impossible maintaining even soft state for keyed
424    GRE tunnels with enabled checksum. Tell them "thank you".
425
426    Well, I wonder, rfc1812 was written by Cisco employee,
427    what the hell these idiots break standrads established
428    by themself???
429  */
430
431         struct iphdr *iph = (struct iphdr *)skb->data;
432         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
433         int grehlen = (iph->ihl<<2) + 4;
434         const int type = icmp_hdr(skb)->type;
435         const int code = icmp_hdr(skb)->code;
436         struct ip_tunnel *t;
437         __be16 flags;
438
439         flags = p[0];
440         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
441                 if (flags&(GRE_VERSION|GRE_ROUTING))
442                         return;
443                 if (flags&GRE_KEY) {
444                         grehlen += 4;
445                         if (flags&GRE_CSUM)
446                                 grehlen += 4;
447                 }
448         }
449
450         /* If only 8 bytes returned, keyed message will be dropped here */
451         if (skb_headlen(skb) < grehlen)
452                 return;
453
454         switch (type) {
455         default:
456         case ICMP_PARAMETERPROB:
457                 return;
458
459         case ICMP_DEST_UNREACH:
460                 switch (code) {
461                 case ICMP_SR_FAILED:
462                 case ICMP_PORT_UNREACH:
463                         /* Impossible event. */
464                         return;
465                 case ICMP_FRAG_NEEDED:
466                         /* Soft state for pmtu is maintained by IP core. */
467                         return;
468                 default:
469                         /* All others are translated to HOST_UNREACH.
470                            rfc2003 contains "deep thoughts" about NET_UNREACH,
471                            I believe they are just ether pollution. --ANK
472                          */
473                         break;
474                 }
475                 break;
476         case ICMP_TIME_EXCEEDED:
477                 if (code != ICMP_EXC_TTL)
478                         return;
479                 break;
480         }
481
482         read_lock(&ipgre_lock);
483         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
484                                 flags & GRE_KEY ?
485                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
486                                 p[1]);
487         if (t == NULL || t->parms.iph.daddr == 0 ||
488             ipv4_is_multicast(t->parms.iph.daddr))
489                 goto out;
490
491         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
492                 goto out;
493
494         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
495                 t->err_count++;
496         else
497                 t->err_count = 1;
498         t->err_time = jiffies;
499 out:
500         read_unlock(&ipgre_lock);
501         return;
502 }
503
504 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
505 {
506         if (INET_ECN_is_ce(iph->tos)) {
507                 if (skb->protocol == htons(ETH_P_IP)) {
508                         IP_ECN_set_ce(ip_hdr(skb));
509                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
510                         IP6_ECN_set_ce(ipv6_hdr(skb));
511                 }
512         }
513 }
514
515 static inline u8
516 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
517 {
518         u8 inner = 0;
519         if (skb->protocol == htons(ETH_P_IP))
520                 inner = old_iph->tos;
521         else if (skb->protocol == htons(ETH_P_IPV6))
522                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
523         return INET_ECN_encapsulate(tos, inner);
524 }
525
526 static int ipgre_rcv(struct sk_buff *skb)
527 {
528         struct iphdr *iph;
529         u8     *h;
530         __be16    flags;
531         __sum16   csum = 0;
532         __be32 key = 0;
533         u32    seqno = 0;
534         struct ip_tunnel *tunnel;
535         int    offset = 4;
536         __be16 gre_proto;
537         unsigned int len;
538
539         if (!pskb_may_pull(skb, 16))
540                 goto drop_nolock;
541
542         iph = ip_hdr(skb);
543         h = skb->data;
544         flags = *(__be16*)h;
545
546         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
547                 /* - Version must be 0.
548                    - We do not support routing headers.
549                  */
550                 if (flags&(GRE_VERSION|GRE_ROUTING))
551                         goto drop_nolock;
552
553                 if (flags&GRE_CSUM) {
554                         switch (skb->ip_summed) {
555                         case CHECKSUM_COMPLETE:
556                                 csum = csum_fold(skb->csum);
557                                 if (!csum)
558                                         break;
559                                 /* fall through */
560                         case CHECKSUM_NONE:
561                                 skb->csum = 0;
562                                 csum = __skb_checksum_complete(skb);
563                                 skb->ip_summed = CHECKSUM_COMPLETE;
564                         }
565                         offset += 4;
566                 }
567                 if (flags&GRE_KEY) {
568                         key = *(__be32*)(h + offset);
569                         offset += 4;
570                 }
571                 if (flags&GRE_SEQ) {
572                         seqno = ntohl(*(__be32*)(h + offset));
573                         offset += 4;
574                 }
575         }
576
577         gre_proto = *(__be16 *)(h + 2);
578
579         read_lock(&ipgre_lock);
580         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
581                                           iph->saddr, iph->daddr, key,
582                                           gre_proto))) {
583                 struct net_device_stats *stats = &tunnel->dev->stats;
584
585                 secpath_reset(skb);
586
587                 skb->protocol = gre_proto;
588                 /* WCCP version 1 and 2 protocol decoding.
589                  * - Change protocol to IP
590                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
591                  */
592                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
593                         skb->protocol = htons(ETH_P_IP);
594                         if ((*(h + offset) & 0xF0) != 0x40)
595                                 offset += 4;
596                 }
597
598                 skb->mac_header = skb->network_header;
599                 __pskb_pull(skb, offset);
600                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
601                 skb->pkt_type = PACKET_HOST;
602 #ifdef CONFIG_NET_IPGRE_BROADCAST
603                 if (ipv4_is_multicast(iph->daddr)) {
604                         /* Looped back packet, drop it! */
605                         if (skb_rtable(skb)->fl.iif == 0)
606                                 goto drop;
607                         stats->multicast++;
608                         skb->pkt_type = PACKET_BROADCAST;
609                 }
610 #endif
611
612                 if (((flags&GRE_CSUM) && csum) ||
613                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
614                         stats->rx_crc_errors++;
615                         stats->rx_errors++;
616                         goto drop;
617                 }
618                 if (tunnel->parms.i_flags&GRE_SEQ) {
619                         if (!(flags&GRE_SEQ) ||
620                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
621                                 stats->rx_fifo_errors++;
622                                 stats->rx_errors++;
623                                 goto drop;
624                         }
625                         tunnel->i_seqno = seqno + 1;
626                 }
627
628                 len = skb->len;
629
630                 /* Warning: All skb pointers will be invalidated! */
631                 if (tunnel->dev->type == ARPHRD_ETHER) {
632                         if (!pskb_may_pull(skb, ETH_HLEN)) {
633                                 stats->rx_length_errors++;
634                                 stats->rx_errors++;
635                                 goto drop;
636                         }
637
638                         iph = ip_hdr(skb);
639                         skb->protocol = eth_type_trans(skb, tunnel->dev);
640                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
641                 }
642
643                 stats->rx_packets++;
644                 stats->rx_bytes += len;
645                 skb->dev = tunnel->dev;
646                 skb_dst_drop(skb);
647                 nf_reset(skb);
648
649                 skb_reset_network_header(skb);
650                 ipgre_ecn_decapsulate(iph, skb);
651
652                 netif_rx(skb);
653                 read_unlock(&ipgre_lock);
654                 return(0);
655         }
656         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
657
658 drop:
659         read_unlock(&ipgre_lock);
660 drop_nolock:
661         kfree_skb(skb);
662         return(0);
663 }
664
665 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
666 {
667         struct ip_tunnel *tunnel = netdev_priv(dev);
668         struct net_device_stats *stats = &tunnel->dev->stats;
669         struct iphdr  *old_iph = ip_hdr(skb);
670         struct iphdr  *tiph;
671         u8     tos;
672         __be16 df;
673         struct rtable *rt;                      /* Route to the other host */
674         struct net_device *tdev;                        /* Device to other host */
675         struct iphdr  *iph;                     /* Our new IP header */
676         unsigned int max_headroom;              /* The extra header space needed */
677         int    gre_hlen;
678         __be32 dst;
679         int    mtu;
680
681         if (tunnel->recursion++) {
682                 stats->collisions++;
683                 goto tx_error;
684         }
685
686         if (dev->type == ARPHRD_ETHER)
687                 IPCB(skb)->flags = 0;
688
689         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
690                 gre_hlen = 0;
691                 tiph = (struct iphdr *)skb->data;
692         } else {
693                 gre_hlen = tunnel->hlen;
694                 tiph = &tunnel->parms.iph;
695         }
696
697         if ((dst = tiph->daddr) == 0) {
698                 /* NBMA tunnel */
699
700                 if (skb_dst(skb) == NULL) {
701                         stats->tx_fifo_errors++;
702                         goto tx_error;
703                 }
704
705                 if (skb->protocol == htons(ETH_P_IP)) {
706                         rt = skb_rtable(skb);
707                         if ((dst = rt->rt_gateway) == 0)
708                                 goto tx_error_icmp;
709                 }
710 #ifdef CONFIG_IPV6
711                 else if (skb->protocol == htons(ETH_P_IPV6)) {
712                         struct in6_addr *addr6;
713                         int addr_type;
714                         struct neighbour *neigh = skb_dst(skb)->neighbour;
715
716                         if (neigh == NULL)
717                                 goto tx_error;
718
719                         addr6 = (struct in6_addr *)&neigh->primary_key;
720                         addr_type = ipv6_addr_type(addr6);
721
722                         if (addr_type == IPV6_ADDR_ANY) {
723                                 addr6 = &ipv6_hdr(skb)->daddr;
724                                 addr_type = ipv6_addr_type(addr6);
725                         }
726
727                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
728                                 goto tx_error_icmp;
729
730                         dst = addr6->s6_addr32[3];
731                 }
732 #endif
733                 else
734                         goto tx_error;
735         }
736
737         tos = tiph->tos;
738         if (tos&1) {
739                 if (skb->protocol == htons(ETH_P_IP))
740                         tos = old_iph->tos;
741                 tos &= ~1;
742         }
743
744         {
745                 struct flowi fl = { .oif = tunnel->parms.link,
746                                     .nl_u = { .ip4_u =
747                                               { .daddr = dst,
748                                                 .saddr = tiph->saddr,
749                                                 .tos = RT_TOS(tos) } },
750                                     .proto = IPPROTO_GRE };
751                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
752                         stats->tx_carrier_errors++;
753                         goto tx_error;
754                 }
755         }
756         tdev = rt->u.dst.dev;
757
758         if (tdev == dev) {
759                 ip_rt_put(rt);
760                 stats->collisions++;
761                 goto tx_error;
762         }
763
764         df = tiph->frag_off;
765         if (df)
766                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
767         else
768                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
769
770         if (skb_dst(skb))
771                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
772
773         if (skb->protocol == htons(ETH_P_IP)) {
774                 df |= (old_iph->frag_off&htons(IP_DF));
775
776                 if ((old_iph->frag_off&htons(IP_DF)) &&
777                     mtu < ntohs(old_iph->tot_len)) {
778                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
779                         ip_rt_put(rt);
780                         goto tx_error;
781                 }
782         }
783 #ifdef CONFIG_IPV6
784         else if (skb->protocol == htons(ETH_P_IPV6)) {
785                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
786
787                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
788                         if ((tunnel->parms.iph.daddr &&
789                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
790                             rt6->rt6i_dst.plen == 128) {
791                                 rt6->rt6i_flags |= RTF_MODIFIED;
792                                 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
793                         }
794                 }
795
796                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
797                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
798                         ip_rt_put(rt);
799                         goto tx_error;
800                 }
801         }
802 #endif
803
804         if (tunnel->err_count > 0) {
805                 if (time_before(jiffies,
806                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
807                         tunnel->err_count--;
808
809                         dst_link_failure(skb);
810                 } else
811                         tunnel->err_count = 0;
812         }
813
814         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
815
816         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
817             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
818                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
819                 if (!new_skb) {
820                         ip_rt_put(rt);
821                         stats->tx_dropped++;
822                         dev_kfree_skb(skb);
823                         tunnel->recursion--;
824                         return 0;
825                 }
826                 if (skb->sk)
827                         skb_set_owner_w(new_skb, skb->sk);
828                 dev_kfree_skb(skb);
829                 skb = new_skb;
830                 old_iph = ip_hdr(skb);
831         }
832
833         skb_reset_transport_header(skb);
834         skb_push(skb, gre_hlen);
835         skb_reset_network_header(skb);
836         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
837         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
838                               IPSKB_REROUTED);
839         skb_dst_drop(skb);
840         skb_dst_set(skb, &rt->u.dst);
841
842         /*
843          *      Push down and install the IPIP header.
844          */
845
846         iph                     =       ip_hdr(skb);
847         iph->version            =       4;
848         iph->ihl                =       sizeof(struct iphdr) >> 2;
849         iph->frag_off           =       df;
850         iph->protocol           =       IPPROTO_GRE;
851         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
852         iph->daddr              =       rt->rt_dst;
853         iph->saddr              =       rt->rt_src;
854
855         if ((iph->ttl = tiph->ttl) == 0) {
856                 if (skb->protocol == htons(ETH_P_IP))
857                         iph->ttl = old_iph->ttl;
858 #ifdef CONFIG_IPV6
859                 else if (skb->protocol == htons(ETH_P_IPV6))
860                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
861 #endif
862                 else
863                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
864         }
865
866         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
867         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
868                                    htons(ETH_P_TEB) : skb->protocol;
869
870         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
871                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
872
873                 if (tunnel->parms.o_flags&GRE_SEQ) {
874                         ++tunnel->o_seqno;
875                         *ptr = htonl(tunnel->o_seqno);
876                         ptr--;
877                 }
878                 if (tunnel->parms.o_flags&GRE_KEY) {
879                         *ptr = tunnel->parms.o_key;
880                         ptr--;
881                 }
882                 if (tunnel->parms.o_flags&GRE_CSUM) {
883                         *ptr = 0;
884                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
885                 }
886         }
887
888         nf_reset(skb);
889
890         IPTUNNEL_XMIT();
891         tunnel->recursion--;
892         return 0;
893
894 tx_error_icmp:
895         dst_link_failure(skb);
896
897 tx_error:
898         stats->tx_errors++;
899         dev_kfree_skb(skb);
900         tunnel->recursion--;
901         return 0;
902 }
903
904 static int ipgre_tunnel_bind_dev(struct net_device *dev)
905 {
906         struct net_device *tdev = NULL;
907         struct ip_tunnel *tunnel;
908         struct iphdr *iph;
909         int hlen = LL_MAX_HEADER;
910         int mtu = ETH_DATA_LEN;
911         int addend = sizeof(struct iphdr) + 4;
912
913         tunnel = netdev_priv(dev);
914         iph = &tunnel->parms.iph;
915
916         /* Guess output device to choose reasonable mtu and needed_headroom */
917
918         if (iph->daddr) {
919                 struct flowi fl = { .oif = tunnel->parms.link,
920                                     .nl_u = { .ip4_u =
921                                               { .daddr = iph->daddr,
922                                                 .saddr = iph->saddr,
923                                                 .tos = RT_TOS(iph->tos) } },
924                                     .proto = IPPROTO_GRE };
925                 struct rtable *rt;
926                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
927                         tdev = rt->u.dst.dev;
928                         ip_rt_put(rt);
929                 }
930
931                 if (dev->type != ARPHRD_ETHER)
932                         dev->flags |= IFF_POINTOPOINT;
933         }
934
935         if (!tdev && tunnel->parms.link)
936                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
937
938         if (tdev) {
939                 hlen = tdev->hard_header_len + tdev->needed_headroom;
940                 mtu = tdev->mtu;
941         }
942         dev->iflink = tunnel->parms.link;
943
944         /* Precalculate GRE options length */
945         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
946                 if (tunnel->parms.o_flags&GRE_CSUM)
947                         addend += 4;
948                 if (tunnel->parms.o_flags&GRE_KEY)
949                         addend += 4;
950                 if (tunnel->parms.o_flags&GRE_SEQ)
951                         addend += 4;
952         }
953         dev->needed_headroom = addend + hlen;
954         mtu -= dev->hard_header_len - addend;
955
956         if (mtu < 68)
957                 mtu = 68;
958
959         tunnel->hlen = addend;
960
961         return mtu;
962 }
963
964 static int
965 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
966 {
967         int err = 0;
968         struct ip_tunnel_parm p;
969         struct ip_tunnel *t;
970         struct net *net = dev_net(dev);
971         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
972
973         switch (cmd) {
974         case SIOCGETTUNNEL:
975                 t = NULL;
976                 if (dev == ign->fb_tunnel_dev) {
977                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
978                                 err = -EFAULT;
979                                 break;
980                         }
981                         t = ipgre_tunnel_locate(net, &p, 0);
982                 }
983                 if (t == NULL)
984                         t = netdev_priv(dev);
985                 memcpy(&p, &t->parms, sizeof(p));
986                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
987                         err = -EFAULT;
988                 break;
989
990         case SIOCADDTUNNEL:
991         case SIOCCHGTUNNEL:
992                 err = -EPERM;
993                 if (!capable(CAP_NET_ADMIN))
994                         goto done;
995
996                 err = -EFAULT;
997                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
998                         goto done;
999
1000                 err = -EINVAL;
1001                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1002                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1003                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1004                         goto done;
1005                 if (p.iph.ttl)
1006                         p.iph.frag_off |= htons(IP_DF);
1007
1008                 if (!(p.i_flags&GRE_KEY))
1009                         p.i_key = 0;
1010                 if (!(p.o_flags&GRE_KEY))
1011                         p.o_key = 0;
1012
1013                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1014
1015                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1016                         if (t != NULL) {
1017                                 if (t->dev != dev) {
1018                                         err = -EEXIST;
1019                                         break;
1020                                 }
1021                         } else {
1022                                 unsigned nflags = 0;
1023
1024                                 t = netdev_priv(dev);
1025
1026                                 if (ipv4_is_multicast(p.iph.daddr))
1027                                         nflags = IFF_BROADCAST;
1028                                 else if (p.iph.daddr)
1029                                         nflags = IFF_POINTOPOINT;
1030
1031                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1032                                         err = -EINVAL;
1033                                         break;
1034                                 }
1035                                 ipgre_tunnel_unlink(ign, t);
1036                                 t->parms.iph.saddr = p.iph.saddr;
1037                                 t->parms.iph.daddr = p.iph.daddr;
1038                                 t->parms.i_key = p.i_key;
1039                                 t->parms.o_key = p.o_key;
1040                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1041                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1042                                 ipgre_tunnel_link(ign, t);
1043                                 netdev_state_change(dev);
1044                         }
1045                 }
1046
1047                 if (t) {
1048                         err = 0;
1049                         if (cmd == SIOCCHGTUNNEL) {
1050                                 t->parms.iph.ttl = p.iph.ttl;
1051                                 t->parms.iph.tos = p.iph.tos;
1052                                 t->parms.iph.frag_off = p.iph.frag_off;
1053                                 if (t->parms.link != p.link) {
1054                                         t->parms.link = p.link;
1055                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1056                                         netdev_state_change(dev);
1057                                 }
1058                         }
1059                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1060                                 err = -EFAULT;
1061                 } else
1062                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1063                 break;
1064
1065         case SIOCDELTUNNEL:
1066                 err = -EPERM;
1067                 if (!capable(CAP_NET_ADMIN))
1068                         goto done;
1069
1070                 if (dev == ign->fb_tunnel_dev) {
1071                         err = -EFAULT;
1072                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1073                                 goto done;
1074                         err = -ENOENT;
1075                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1076                                 goto done;
1077                         err = -EPERM;
1078                         if (t == netdev_priv(ign->fb_tunnel_dev))
1079                                 goto done;
1080                         dev = t->dev;
1081                 }
1082                 unregister_netdevice(dev);
1083                 err = 0;
1084                 break;
1085
1086         default:
1087                 err = -EINVAL;
1088         }
1089
1090 done:
1091         return err;
1092 }
1093
1094 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1095 {
1096         struct ip_tunnel *tunnel = netdev_priv(dev);
1097         if (new_mtu < 68 ||
1098             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1099                 return -EINVAL;
1100         dev->mtu = new_mtu;
1101         return 0;
1102 }
1103
1104 /* Nice toy. Unfortunately, useless in real life :-)
1105    It allows to construct virtual multiprotocol broadcast "LAN"
1106    over the Internet, provided multicast routing is tuned.
1107
1108
1109    I have no idea was this bicycle invented before me,
1110    so that I had to set ARPHRD_IPGRE to a random value.
1111    I have an impression, that Cisco could make something similar,
1112    but this feature is apparently missing in IOS<=11.2(8).
1113
1114    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1115    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1116
1117    ping -t 255 224.66.66.66
1118
1119    If nobody answers, mbone does not work.
1120
1121    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1122    ip addr add 10.66.66.<somewhat>/24 dev Universe
1123    ifconfig Universe up
1124    ifconfig Universe add fe80::<Your_real_addr>/10
1125    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1126    ftp 10.66.66.66
1127    ...
1128    ftp fec0:6666:6666::193.233.7.65
1129    ...
1130
1131  */
1132
1133 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1134                         unsigned short type,
1135                         const void *daddr, const void *saddr, unsigned len)
1136 {
1137         struct ip_tunnel *t = netdev_priv(dev);
1138         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1139         __be16 *p = (__be16*)(iph+1);
1140
1141         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1142         p[0]            = t->parms.o_flags;
1143         p[1]            = htons(type);
1144
1145         /*
1146          *      Set the source hardware address.
1147          */
1148
1149         if (saddr)
1150                 memcpy(&iph->saddr, saddr, 4);
1151
1152         if (daddr) {
1153                 memcpy(&iph->daddr, daddr, 4);
1154                 return t->hlen;
1155         }
1156         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1157                 return t->hlen;
1158
1159         return -t->hlen;
1160 }
1161
1162 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1163 {
1164         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1165         memcpy(haddr, &iph->saddr, 4);
1166         return 4;
1167 }
1168
1169 static const struct header_ops ipgre_header_ops = {
1170         .create = ipgre_header,
1171         .parse  = ipgre_header_parse,
1172 };
1173
1174 #ifdef CONFIG_NET_IPGRE_BROADCAST
1175 static int ipgre_open(struct net_device *dev)
1176 {
1177         struct ip_tunnel *t = netdev_priv(dev);
1178
1179         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1180                 struct flowi fl = { .oif = t->parms.link,
1181                                     .nl_u = { .ip4_u =
1182                                               { .daddr = t->parms.iph.daddr,
1183                                                 .saddr = t->parms.iph.saddr,
1184                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1185                                     .proto = IPPROTO_GRE };
1186                 struct rtable *rt;
1187                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1188                         return -EADDRNOTAVAIL;
1189                 dev = rt->u.dst.dev;
1190                 ip_rt_put(rt);
1191                 if (__in_dev_get_rtnl(dev) == NULL)
1192                         return -EADDRNOTAVAIL;
1193                 t->mlink = dev->ifindex;
1194                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1195         }
1196         return 0;
1197 }
1198
1199 static int ipgre_close(struct net_device *dev)
1200 {
1201         struct ip_tunnel *t = netdev_priv(dev);
1202
1203         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1204                 struct in_device *in_dev;
1205                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1206                 if (in_dev) {
1207                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1208                         in_dev_put(in_dev);
1209                 }
1210         }
1211         return 0;
1212 }
1213
1214 #endif
1215
1216 static const struct net_device_ops ipgre_netdev_ops = {
1217         .ndo_init               = ipgre_tunnel_init,
1218         .ndo_uninit             = ipgre_tunnel_uninit,
1219 #ifdef CONFIG_NET_IPGRE_BROADCAST
1220         .ndo_open               = ipgre_open,
1221         .ndo_stop               = ipgre_close,
1222 #endif
1223         .ndo_start_xmit         = ipgre_tunnel_xmit,
1224         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1225         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1226 };
1227
1228 static void ipgre_tunnel_setup(struct net_device *dev)
1229 {
1230         dev->netdev_ops         = &ipgre_netdev_ops;
1231         dev->destructor         = free_netdev;
1232
1233         dev->type               = ARPHRD_IPGRE;
1234         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1235         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1236         dev->flags              = IFF_NOARP;
1237         dev->iflink             = 0;
1238         dev->addr_len           = 4;
1239         dev->features           |= NETIF_F_NETNS_LOCAL;
1240         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1241 }
1242
1243 static int ipgre_tunnel_init(struct net_device *dev)
1244 {
1245         struct ip_tunnel *tunnel;
1246         struct iphdr *iph;
1247
1248         tunnel = netdev_priv(dev);
1249         iph = &tunnel->parms.iph;
1250
1251         tunnel->dev = dev;
1252         strcpy(tunnel->parms.name, dev->name);
1253
1254         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1255         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1256
1257         if (iph->daddr) {
1258 #ifdef CONFIG_NET_IPGRE_BROADCAST
1259                 if (ipv4_is_multicast(iph->daddr)) {
1260                         if (!iph->saddr)
1261                                 return -EINVAL;
1262                         dev->flags = IFF_BROADCAST;
1263                         dev->header_ops = &ipgre_header_ops;
1264                 }
1265 #endif
1266         } else
1267                 dev->header_ops = &ipgre_header_ops;
1268
1269         return 0;
1270 }
1271
1272 static void ipgre_fb_tunnel_init(struct net_device *dev)
1273 {
1274         struct ip_tunnel *tunnel = netdev_priv(dev);
1275         struct iphdr *iph = &tunnel->parms.iph;
1276         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1277
1278         tunnel->dev = dev;
1279         strcpy(tunnel->parms.name, dev->name);
1280
1281         iph->version            = 4;
1282         iph->protocol           = IPPROTO_GRE;
1283         iph->ihl                = 5;
1284         tunnel->hlen            = sizeof(struct iphdr) + 4;
1285
1286         dev_hold(dev);
1287         ign->tunnels_wc[0]      = tunnel;
1288 }
1289
1290
1291 static struct net_protocol ipgre_protocol = {
1292         .handler        =       ipgre_rcv,
1293         .err_handler    =       ipgre_err,
1294         .netns_ok       =       1,
1295 };
1296
1297 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1298 {
1299         int prio;
1300
1301         for (prio = 0; prio < 4; prio++) {
1302                 int h;
1303                 for (h = 0; h < HASH_SIZE; h++) {
1304                         struct ip_tunnel *t;
1305                         while ((t = ign->tunnels[prio][h]) != NULL)
1306                                 unregister_netdevice(t->dev);
1307                 }
1308         }
1309 }
1310
1311 static int ipgre_init_net(struct net *net)
1312 {
1313         int err;
1314         struct ipgre_net *ign;
1315
1316         err = -ENOMEM;
1317         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1318         if (ign == NULL)
1319                 goto err_alloc;
1320
1321         err = net_assign_generic(net, ipgre_net_id, ign);
1322         if (err < 0)
1323                 goto err_assign;
1324
1325         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1326                                            ipgre_tunnel_setup);
1327         if (!ign->fb_tunnel_dev) {
1328                 err = -ENOMEM;
1329                 goto err_alloc_dev;
1330         }
1331         dev_net_set(ign->fb_tunnel_dev, net);
1332
1333         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1334         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1335
1336         if ((err = register_netdev(ign->fb_tunnel_dev)))
1337                 goto err_reg_dev;
1338
1339         return 0;
1340
1341 err_reg_dev:
1342         free_netdev(ign->fb_tunnel_dev);
1343 err_alloc_dev:
1344         /* nothing */
1345 err_assign:
1346         kfree(ign);
1347 err_alloc:
1348         return err;
1349 }
1350
1351 static void ipgre_exit_net(struct net *net)
1352 {
1353         struct ipgre_net *ign;
1354
1355         ign = net_generic(net, ipgre_net_id);
1356         rtnl_lock();
1357         ipgre_destroy_tunnels(ign);
1358         rtnl_unlock();
1359         kfree(ign);
1360 }
1361
1362 static struct pernet_operations ipgre_net_ops = {
1363         .init = ipgre_init_net,
1364         .exit = ipgre_exit_net,
1365 };
1366
1367 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1368 {
1369         __be16 flags;
1370
1371         if (!data)
1372                 return 0;
1373
1374         flags = 0;
1375         if (data[IFLA_GRE_IFLAGS])
1376                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1377         if (data[IFLA_GRE_OFLAGS])
1378                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1379         if (flags & (GRE_VERSION|GRE_ROUTING))
1380                 return -EINVAL;
1381
1382         return 0;
1383 }
1384
1385 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1386 {
1387         __be32 daddr;
1388
1389         if (tb[IFLA_ADDRESS]) {
1390                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1391                         return -EINVAL;
1392                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1393                         return -EADDRNOTAVAIL;
1394         }
1395
1396         if (!data)
1397                 goto out;
1398
1399         if (data[IFLA_GRE_REMOTE]) {
1400                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1401                 if (!daddr)
1402                         return -EINVAL;
1403         }
1404
1405 out:
1406         return ipgre_tunnel_validate(tb, data);
1407 }
1408
1409 static void ipgre_netlink_parms(struct nlattr *data[],
1410                                 struct ip_tunnel_parm *parms)
1411 {
1412         memset(parms, 0, sizeof(*parms));
1413
1414         parms->iph.protocol = IPPROTO_GRE;
1415
1416         if (!data)
1417                 return;
1418
1419         if (data[IFLA_GRE_LINK])
1420                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1421
1422         if (data[IFLA_GRE_IFLAGS])
1423                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1424
1425         if (data[IFLA_GRE_OFLAGS])
1426                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1427
1428         if (data[IFLA_GRE_IKEY])
1429                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1430
1431         if (data[IFLA_GRE_OKEY])
1432                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1433
1434         if (data[IFLA_GRE_LOCAL])
1435                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1436
1437         if (data[IFLA_GRE_REMOTE])
1438                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1439
1440         if (data[IFLA_GRE_TTL])
1441                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1442
1443         if (data[IFLA_GRE_TOS])
1444                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1445
1446         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1447                 parms->iph.frag_off = htons(IP_DF);
1448 }
1449
1450 static int ipgre_tap_init(struct net_device *dev)
1451 {
1452         struct ip_tunnel *tunnel;
1453
1454         tunnel = netdev_priv(dev);
1455
1456         tunnel->dev = dev;
1457         strcpy(tunnel->parms.name, dev->name);
1458
1459         ipgre_tunnel_bind_dev(dev);
1460
1461         return 0;
1462 }
1463
1464 static const struct net_device_ops ipgre_tap_netdev_ops = {
1465         .ndo_init               = ipgre_tap_init,
1466         .ndo_uninit             = ipgre_tunnel_uninit,
1467         .ndo_start_xmit         = ipgre_tunnel_xmit,
1468         .ndo_set_mac_address    = eth_mac_addr,
1469         .ndo_validate_addr      = eth_validate_addr,
1470         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1471 };
1472
1473 static void ipgre_tap_setup(struct net_device *dev)
1474 {
1475
1476         ether_setup(dev);
1477
1478         dev->netdev_ops         = &ipgre_netdev_ops;
1479         dev->destructor         = free_netdev;
1480
1481         dev->iflink             = 0;
1482         dev->features           |= NETIF_F_NETNS_LOCAL;
1483 }
1484
1485 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1486                          struct nlattr *data[])
1487 {
1488         struct ip_tunnel *nt;
1489         struct net *net = dev_net(dev);
1490         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1491         int mtu;
1492         int err;
1493
1494         nt = netdev_priv(dev);
1495         ipgre_netlink_parms(data, &nt->parms);
1496
1497         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1498                 return -EEXIST;
1499
1500         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1501                 random_ether_addr(dev->dev_addr);
1502
1503         mtu = ipgre_tunnel_bind_dev(dev);
1504         if (!tb[IFLA_MTU])
1505                 dev->mtu = mtu;
1506
1507         err = register_netdevice(dev);
1508         if (err)
1509                 goto out;
1510
1511         dev_hold(dev);
1512         ipgre_tunnel_link(ign, nt);
1513
1514 out:
1515         return err;
1516 }
1517
1518 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1519                             struct nlattr *data[])
1520 {
1521         struct ip_tunnel *t, *nt;
1522         struct net *net = dev_net(dev);
1523         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1524         struct ip_tunnel_parm p;
1525         int mtu;
1526
1527         if (dev == ign->fb_tunnel_dev)
1528                 return -EINVAL;
1529
1530         nt = netdev_priv(dev);
1531         ipgre_netlink_parms(data, &p);
1532
1533         t = ipgre_tunnel_locate(net, &p, 0);
1534
1535         if (t) {
1536                 if (t->dev != dev)
1537                         return -EEXIST;
1538         } else {
1539                 unsigned nflags = 0;
1540
1541                 t = nt;
1542
1543                 if (ipv4_is_multicast(p.iph.daddr))
1544                         nflags = IFF_BROADCAST;
1545                 else if (p.iph.daddr)
1546                         nflags = IFF_POINTOPOINT;
1547
1548                 if ((dev->flags ^ nflags) &
1549                     (IFF_POINTOPOINT | IFF_BROADCAST))
1550                         return -EINVAL;
1551
1552                 ipgre_tunnel_unlink(ign, t);
1553                 t->parms.iph.saddr = p.iph.saddr;
1554                 t->parms.iph.daddr = p.iph.daddr;
1555                 t->parms.i_key = p.i_key;
1556                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1557                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1558                 ipgre_tunnel_link(ign, t);
1559                 netdev_state_change(dev);
1560         }
1561
1562         t->parms.o_key = p.o_key;
1563         t->parms.iph.ttl = p.iph.ttl;
1564         t->parms.iph.tos = p.iph.tos;
1565         t->parms.iph.frag_off = p.iph.frag_off;
1566
1567         if (t->parms.link != p.link) {
1568                 t->parms.link = p.link;
1569                 mtu = ipgre_tunnel_bind_dev(dev);
1570                 if (!tb[IFLA_MTU])
1571                         dev->mtu = mtu;
1572                 netdev_state_change(dev);
1573         }
1574
1575         return 0;
1576 }
1577
1578 static size_t ipgre_get_size(const struct net_device *dev)
1579 {
1580         return
1581                 /* IFLA_GRE_LINK */
1582                 nla_total_size(4) +
1583                 /* IFLA_GRE_IFLAGS */
1584                 nla_total_size(2) +
1585                 /* IFLA_GRE_OFLAGS */
1586                 nla_total_size(2) +
1587                 /* IFLA_GRE_IKEY */
1588                 nla_total_size(4) +
1589                 /* IFLA_GRE_OKEY */
1590                 nla_total_size(4) +
1591                 /* IFLA_GRE_LOCAL */
1592                 nla_total_size(4) +
1593                 /* IFLA_GRE_REMOTE */
1594                 nla_total_size(4) +
1595                 /* IFLA_GRE_TTL */
1596                 nla_total_size(1) +
1597                 /* IFLA_GRE_TOS */
1598                 nla_total_size(1) +
1599                 /* IFLA_GRE_PMTUDISC */
1600                 nla_total_size(1) +
1601                 0;
1602 }
1603
1604 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1605 {
1606         struct ip_tunnel *t = netdev_priv(dev);
1607         struct ip_tunnel_parm *p = &t->parms;
1608
1609         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1610         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1611         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1612         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1613         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1614         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1615         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1616         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1617         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1618         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1619
1620         return 0;
1621
1622 nla_put_failure:
1623         return -EMSGSIZE;
1624 }
1625
1626 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1627         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1628         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1629         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1630         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1631         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1632         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1633         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1634         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1635         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1636         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1637 };
1638
1639 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1640         .kind           = "gre",
1641         .maxtype        = IFLA_GRE_MAX,
1642         .policy         = ipgre_policy,
1643         .priv_size      = sizeof(struct ip_tunnel),
1644         .setup          = ipgre_tunnel_setup,
1645         .validate       = ipgre_tunnel_validate,
1646         .newlink        = ipgre_newlink,
1647         .changelink     = ipgre_changelink,
1648         .get_size       = ipgre_get_size,
1649         .fill_info      = ipgre_fill_info,
1650 };
1651
1652 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1653         .kind           = "gretap",
1654         .maxtype        = IFLA_GRE_MAX,
1655         .policy         = ipgre_policy,
1656         .priv_size      = sizeof(struct ip_tunnel),
1657         .setup          = ipgre_tap_setup,
1658         .validate       = ipgre_tap_validate,
1659         .newlink        = ipgre_newlink,
1660         .changelink     = ipgre_changelink,
1661         .get_size       = ipgre_get_size,
1662         .fill_info      = ipgre_fill_info,
1663 };
1664
1665 /*
1666  *      And now the modules code and kernel interface.
1667  */
1668
1669 static int __init ipgre_init(void)
1670 {
1671         int err;
1672
1673         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1674
1675         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1676                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1677                 return -EAGAIN;
1678         }
1679
1680         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1681         if (err < 0)
1682                 goto gen_device_failed;
1683
1684         err = rtnl_link_register(&ipgre_link_ops);
1685         if (err < 0)
1686                 goto rtnl_link_failed;
1687
1688         err = rtnl_link_register(&ipgre_tap_ops);
1689         if (err < 0)
1690                 goto tap_ops_failed;
1691
1692 out:
1693         return err;
1694
1695 tap_ops_failed:
1696         rtnl_link_unregister(&ipgre_link_ops);
1697 rtnl_link_failed:
1698         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1699 gen_device_failed:
1700         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1701         goto out;
1702 }
1703
1704 static void __exit ipgre_fini(void)
1705 {
1706         rtnl_link_unregister(&ipgre_tap_ops);
1707         rtnl_link_unregister(&ipgre_link_ops);
1708         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1709         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1710                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1711 }
1712
1713 module_init(ipgre_init);
1714 module_exit(ipgre_fini);
1715 MODULE_LICENSE("GPL");
1716 MODULE_ALIAS_RTNL_LINK("gre");
1717 MODULE_ALIAS_RTNL_LINK("gretap");