tcp: fix loop in ofo handling code and reduce its complexity
[linux-2.6] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 /*
54    Problems & solutions
55    --------------------
56
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
66    solution, but it supposes maintaing new variable in ALL
67    skb, even if no tunneling is used.
68
69    Current solution: t->recursion lock breaks dead loops. It looks
70    like dev->tbusy flag, but I preferred new variable, because
71    the semantics is different. One day, when hard_start_xmit
72    will be multithreaded we will have to use skb->encapsulation.
73
74
75
76    2. Networking dead loops would not kill routers, but would really
77    kill network. IP hop limit plays role of "t->recursion" in this case,
78    if we copy it from packet being encapsulated to upper header.
79    It is very good solution, but it introduces two problems:
80
81    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82      do not work over tunnels.
83    - traceroute does not work. I planned to relay ICMP from tunnel,
84      so that this problem would be solved and traceroute output
85      would even more informative. This idea appeared to be wrong:
86      only Linux complies to rfc1812 now (yes, guys, Linux is the only
87      true router now :-)), all routers (at least, in neighbourhood of mine)
88      return only 8 bytes of payload. It is the end.
89
90    Hence, if we want that OSPF worked or traceroute said something reasonable,
91    we should search for another solution.
92
93    One of them is to parse packet trying to detect inner encapsulation
94    made by our node. It is difficult or even impossible, especially,
95    taking into account fragmentation. TO be short, tt is not solution at all.
96
97    Current solution: The solution was UNEXPECTEDLY SIMPLE.
98    We force DF flag on tunnels with preconfigured hop limit,
99    that is ALL. :-) Well, it does not remove the problem completely,
100    but exponential growth of network traffic is changed to linear
101    (branches, that exceed pmtu are pruned) and tunnel mtu
102    fastly degrades to value <68, where looping stops.
103    Yes, it is not good if there exists a router in the loop,
104    which does not force DF, even when encapsulating packets have DF set.
105    But it is not our problem! Nobody could accuse us, we made
106    all that we could make. Even if it is your gated who injected
107    fatal route to network, even if it were you who configured
108    fatal static route: you are innocent. :-)
109
110
111
112    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113    practically identical code. It would be good to glue them
114    together, but it is not very evident, how to make them modular.
115    sit is integral part of IPv6, ipip and gre are naturally modular.
116    We could extract common parts (hash table, ioctl etc)
117    to a separate module (ip_tunnel.c).
118
119    Alexey Kuznetsov.
120  */
121
122 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
123 static int ipgre_tunnel_init(struct net_device *dev);
124 static void ipgre_tunnel_setup(struct net_device *dev);
125 static int ipgre_tunnel_bind_dev(struct net_device *dev);
126
127 /* Fallback tunnel: no source, no destination, no key, no options */
128
129 #define HASH_SIZE  16
130
131 static int ipgre_net_id;
132 struct ipgre_net {
133         struct ip_tunnel *tunnels[4][HASH_SIZE];
134
135         struct net_device *fb_tunnel_dev;
136 };
137
138 /* Tunnel hash table */
139
140 /*
141    4 hash tables:
142
143    3: (remote,local)
144    2: (remote,*)
145    1: (*,local)
146    0: (*,*)
147
148    We require exact key match i.e. if a key is present in packet
149    it will match only tunnel with the same key; if it is not present,
150    it will match only keyless tunnel.
151
152    All keysless packets, if not matched configured keyless tunnels
153    will match fallback tunnel.
154  */
155
156 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
157
158 #define tunnels_r_l     tunnels[3]
159 #define tunnels_r       tunnels[2]
160 #define tunnels_l       tunnels[1]
161 #define tunnels_wc      tunnels[0]
162
163 static DEFINE_RWLOCK(ipgre_lock);
164
165 /* Given src, dst and key, find appropriate for input tunnel. */
166
167 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
168                                               __be32 remote, __be32 local,
169                                               __be32 key, __be16 gre_proto)
170 {
171         struct net *net = dev_net(dev);
172         int link = dev->ifindex;
173         unsigned h0 = HASH(remote);
174         unsigned h1 = HASH(key);
175         struct ip_tunnel *t, *cand = NULL;
176         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
177         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
178                        ARPHRD_ETHER : ARPHRD_IPGRE;
179         int score, cand_score = 4;
180
181         for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
182                 if (local != t->parms.iph.saddr ||
183                     remote != t->parms.iph.daddr ||
184                     key != t->parms.i_key ||
185                     !(t->dev->flags & IFF_UP))
186                         continue;
187
188                 if (t->dev->type != ARPHRD_IPGRE &&
189                     t->dev->type != dev_type)
190                         continue;
191
192                 score = 0;
193                 if (t->parms.link != link)
194                         score |= 1;
195                 if (t->dev->type != dev_type)
196                         score |= 2;
197                 if (score == 0)
198                         return t;
199
200                 if (score < cand_score) {
201                         cand = t;
202                         cand_score = score;
203                 }
204         }
205
206         for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
207                 if (remote != t->parms.iph.daddr ||
208                     key != t->parms.i_key ||
209                     !(t->dev->flags & IFF_UP))
210                         continue;
211
212                 if (t->dev->type != ARPHRD_IPGRE &&
213                     t->dev->type != dev_type)
214                         continue;
215
216                 score = 0;
217                 if (t->parms.link != link)
218                         score |= 1;
219                 if (t->dev->type != dev_type)
220                         score |= 2;
221                 if (score == 0)
222                         return t;
223
224                 if (score < cand_score) {
225                         cand = t;
226                         cand_score = score;
227                 }
228         }
229
230         for (t = ign->tunnels_l[h1]; t; t = t->next) {
231                 if ((local != t->parms.iph.saddr &&
232                      (local != t->parms.iph.daddr ||
233                       !ipv4_is_multicast(local))) ||
234                     key != t->parms.i_key ||
235                     !(t->dev->flags & IFF_UP))
236                         continue;
237
238                 if (t->dev->type != ARPHRD_IPGRE &&
239                     t->dev->type != dev_type)
240                         continue;
241
242                 score = 0;
243                 if (t->parms.link != link)
244                         score |= 1;
245                 if (t->dev->type != dev_type)
246                         score |= 2;
247                 if (score == 0)
248                         return t;
249
250                 if (score < cand_score) {
251                         cand = t;
252                         cand_score = score;
253                 }
254         }
255
256         for (t = ign->tunnels_wc[h1]; t; t = t->next) {
257                 if (t->parms.i_key != key ||
258                     !(t->dev->flags & IFF_UP))
259                         continue;
260
261                 if (t->dev->type != ARPHRD_IPGRE &&
262                     t->dev->type != dev_type)
263                         continue;
264
265                 score = 0;
266                 if (t->parms.link != link)
267                         score |= 1;
268                 if (t->dev->type != dev_type)
269                         score |= 2;
270                 if (score == 0)
271                         return t;
272
273                 if (score < cand_score) {
274                         cand = t;
275                         cand_score = score;
276                 }
277         }
278
279         if (cand != NULL)
280                 return cand;
281
282         if (ign->fb_tunnel_dev->flags & IFF_UP)
283                 return netdev_priv(ign->fb_tunnel_dev);
284
285         return NULL;
286 }
287
288 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
289                 struct ip_tunnel_parm *parms)
290 {
291         __be32 remote = parms->iph.daddr;
292         __be32 local = parms->iph.saddr;
293         __be32 key = parms->i_key;
294         unsigned h = HASH(key);
295         int prio = 0;
296
297         if (local)
298                 prio |= 1;
299         if (remote && !ipv4_is_multicast(remote)) {
300                 prio |= 2;
301                 h ^= HASH(remote);
302         }
303
304         return &ign->tunnels[prio][h];
305 }
306
307 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
308                 struct ip_tunnel *t)
309 {
310         return __ipgre_bucket(ign, &t->parms);
311 }
312
313 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
314 {
315         struct ip_tunnel **tp = ipgre_bucket(ign, t);
316
317         t->next = *tp;
318         write_lock_bh(&ipgre_lock);
319         *tp = t;
320         write_unlock_bh(&ipgre_lock);
321 }
322
323 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
324 {
325         struct ip_tunnel **tp;
326
327         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
328                 if (t == *tp) {
329                         write_lock_bh(&ipgre_lock);
330                         *tp = t->next;
331                         write_unlock_bh(&ipgre_lock);
332                         break;
333                 }
334         }
335 }
336
337 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
338                                            struct ip_tunnel_parm *parms,
339                                            int type)
340 {
341         __be32 remote = parms->iph.daddr;
342         __be32 local = parms->iph.saddr;
343         __be32 key = parms->i_key;
344         int link = parms->link;
345         struct ip_tunnel *t, **tp;
346         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
347
348         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
349                 if (local == t->parms.iph.saddr &&
350                     remote == t->parms.iph.daddr &&
351                     key == t->parms.i_key &&
352                     link == t->parms.link &&
353                     type == t->dev->type)
354                         break;
355
356         return t;
357 }
358
359 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
360                 struct ip_tunnel_parm *parms, int create)
361 {
362         struct ip_tunnel *t, *nt;
363         struct net_device *dev;
364         char name[IFNAMSIZ];
365         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
366
367         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
368         if (t || !create)
369                 return t;
370
371         if (parms->name[0])
372                 strlcpy(name, parms->name, IFNAMSIZ);
373         else
374                 sprintf(name, "gre%%d");
375
376         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
377         if (!dev)
378           return NULL;
379
380         dev_net_set(dev, net);
381
382         if (strchr(name, '%')) {
383                 if (dev_alloc_name(dev, name) < 0)
384                         goto failed_free;
385         }
386
387         nt = netdev_priv(dev);
388         nt->parms = *parms;
389         dev->rtnl_link_ops = &ipgre_link_ops;
390
391         dev->mtu = ipgre_tunnel_bind_dev(dev);
392
393         if (register_netdevice(dev) < 0)
394                 goto failed_free;
395
396         dev_hold(dev);
397         ipgre_tunnel_link(ign, nt);
398         return nt;
399
400 failed_free:
401         free_netdev(dev);
402         return NULL;
403 }
404
405 static void ipgre_tunnel_uninit(struct net_device *dev)
406 {
407         struct net *net = dev_net(dev);
408         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
409
410         ipgre_tunnel_unlink(ign, netdev_priv(dev));
411         dev_put(dev);
412 }
413
414
415 static void ipgre_err(struct sk_buff *skb, u32 info)
416 {
417
418 /* All the routers (except for Linux) return only
419    8 bytes of packet payload. It means, that precise relaying of
420    ICMP in the real Internet is absolutely infeasible.
421
422    Moreover, Cisco "wise men" put GRE key to the third word
423    in GRE header. It makes impossible maintaining even soft state for keyed
424    GRE tunnels with enabled checksum. Tell them "thank you".
425
426    Well, I wonder, rfc1812 was written by Cisco employee,
427    what the hell these idiots break standrads established
428    by themself???
429  */
430
431         struct iphdr *iph = (struct iphdr *)skb->data;
432         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
433         int grehlen = (iph->ihl<<2) + 4;
434         const int type = icmp_hdr(skb)->type;
435         const int code = icmp_hdr(skb)->code;
436         struct ip_tunnel *t;
437         __be16 flags;
438
439         flags = p[0];
440         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
441                 if (flags&(GRE_VERSION|GRE_ROUTING))
442                         return;
443                 if (flags&GRE_KEY) {
444                         grehlen += 4;
445                         if (flags&GRE_CSUM)
446                                 grehlen += 4;
447                 }
448         }
449
450         /* If only 8 bytes returned, keyed message will be dropped here */
451         if (skb_headlen(skb) < grehlen)
452                 return;
453
454         switch (type) {
455         default:
456         case ICMP_PARAMETERPROB:
457                 return;
458
459         case ICMP_DEST_UNREACH:
460                 switch (code) {
461                 case ICMP_SR_FAILED:
462                 case ICMP_PORT_UNREACH:
463                         /* Impossible event. */
464                         return;
465                 case ICMP_FRAG_NEEDED:
466                         /* Soft state for pmtu is maintained by IP core. */
467                         return;
468                 default:
469                         /* All others are translated to HOST_UNREACH.
470                            rfc2003 contains "deep thoughts" about NET_UNREACH,
471                            I believe they are just ether pollution. --ANK
472                          */
473                         break;
474                 }
475                 break;
476         case ICMP_TIME_EXCEEDED:
477                 if (code != ICMP_EXC_TTL)
478                         return;
479                 break;
480         }
481
482         read_lock(&ipgre_lock);
483         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
484                                 flags & GRE_KEY ?
485                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
486                                 p[1]);
487         if (t == NULL || t->parms.iph.daddr == 0 ||
488             ipv4_is_multicast(t->parms.iph.daddr))
489                 goto out;
490
491         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
492                 goto out;
493
494         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
495                 t->err_count++;
496         else
497                 t->err_count = 1;
498         t->err_time = jiffies;
499 out:
500         read_unlock(&ipgre_lock);
501         return;
502 }
503
504 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
505 {
506         if (INET_ECN_is_ce(iph->tos)) {
507                 if (skb->protocol == htons(ETH_P_IP)) {
508                         IP_ECN_set_ce(ip_hdr(skb));
509                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
510                         IP6_ECN_set_ce(ipv6_hdr(skb));
511                 }
512         }
513 }
514
515 static inline u8
516 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
517 {
518         u8 inner = 0;
519         if (skb->protocol == htons(ETH_P_IP))
520                 inner = old_iph->tos;
521         else if (skb->protocol == htons(ETH_P_IPV6))
522                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
523         return INET_ECN_encapsulate(tos, inner);
524 }
525
526 static int ipgre_rcv(struct sk_buff *skb)
527 {
528         struct iphdr *iph;
529         u8     *h;
530         __be16    flags;
531         __sum16   csum = 0;
532         __be32 key = 0;
533         u32    seqno = 0;
534         struct ip_tunnel *tunnel;
535         int    offset = 4;
536         __be16 gre_proto;
537         unsigned int len;
538
539         if (!pskb_may_pull(skb, 16))
540                 goto drop_nolock;
541
542         iph = ip_hdr(skb);
543         h = skb->data;
544         flags = *(__be16*)h;
545
546         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
547                 /* - Version must be 0.
548                    - We do not support routing headers.
549                  */
550                 if (flags&(GRE_VERSION|GRE_ROUTING))
551                         goto drop_nolock;
552
553                 if (flags&GRE_CSUM) {
554                         switch (skb->ip_summed) {
555                         case CHECKSUM_COMPLETE:
556                                 csum = csum_fold(skb->csum);
557                                 if (!csum)
558                                         break;
559                                 /* fall through */
560                         case CHECKSUM_NONE:
561                                 skb->csum = 0;
562                                 csum = __skb_checksum_complete(skb);
563                                 skb->ip_summed = CHECKSUM_COMPLETE;
564                         }
565                         offset += 4;
566                 }
567                 if (flags&GRE_KEY) {
568                         key = *(__be32*)(h + offset);
569                         offset += 4;
570                 }
571                 if (flags&GRE_SEQ) {
572                         seqno = ntohl(*(__be32*)(h + offset));
573                         offset += 4;
574                 }
575         }
576
577         gre_proto = *(__be16 *)(h + 2);
578
579         read_lock(&ipgre_lock);
580         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
581                                           iph->saddr, iph->daddr, key,
582                                           gre_proto))) {
583                 struct net_device_stats *stats = &tunnel->dev->stats;
584
585                 secpath_reset(skb);
586
587                 skb->protocol = gre_proto;
588                 /* WCCP version 1 and 2 protocol decoding.
589                  * - Change protocol to IP
590                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
591                  */
592                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
593                         skb->protocol = htons(ETH_P_IP);
594                         if ((*(h + offset) & 0xF0) != 0x40)
595                                 offset += 4;
596                 }
597
598                 skb->mac_header = skb->network_header;
599                 __pskb_pull(skb, offset);
600                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
601                 skb->pkt_type = PACKET_HOST;
602 #ifdef CONFIG_NET_IPGRE_BROADCAST
603                 if (ipv4_is_multicast(iph->daddr)) {
604                         /* Looped back packet, drop it! */
605                         if (skb->rtable->fl.iif == 0)
606                                 goto drop;
607                         stats->multicast++;
608                         skb->pkt_type = PACKET_BROADCAST;
609                 }
610 #endif
611
612                 if (((flags&GRE_CSUM) && csum) ||
613                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
614                         stats->rx_crc_errors++;
615                         stats->rx_errors++;
616                         goto drop;
617                 }
618                 if (tunnel->parms.i_flags&GRE_SEQ) {
619                         if (!(flags&GRE_SEQ) ||
620                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
621                                 stats->rx_fifo_errors++;
622                                 stats->rx_errors++;
623                                 goto drop;
624                         }
625                         tunnel->i_seqno = seqno + 1;
626                 }
627
628                 len = skb->len;
629
630                 /* Warning: All skb pointers will be invalidated! */
631                 if (tunnel->dev->type == ARPHRD_ETHER) {
632                         if (!pskb_may_pull(skb, ETH_HLEN)) {
633                                 stats->rx_length_errors++;
634                                 stats->rx_errors++;
635                                 goto drop;
636                         }
637
638                         iph = ip_hdr(skb);
639                         skb->protocol = eth_type_trans(skb, tunnel->dev);
640                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
641                 }
642
643                 stats->rx_packets++;
644                 stats->rx_bytes += len;
645                 skb->dev = tunnel->dev;
646                 dst_release(skb->dst);
647                 skb->dst = NULL;
648                 nf_reset(skb);
649
650                 skb_reset_network_header(skb);
651                 ipgre_ecn_decapsulate(iph, skb);
652
653                 netif_rx(skb);
654                 read_unlock(&ipgre_lock);
655                 return(0);
656         }
657         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
658
659 drop:
660         read_unlock(&ipgre_lock);
661 drop_nolock:
662         kfree_skb(skb);
663         return(0);
664 }
665
666 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
667 {
668         struct ip_tunnel *tunnel = netdev_priv(dev);
669         struct net_device_stats *stats = &tunnel->dev->stats;
670         struct iphdr  *old_iph = ip_hdr(skb);
671         struct iphdr  *tiph;
672         u8     tos;
673         __be16 df;
674         struct rtable *rt;                      /* Route to the other host */
675         struct net_device *tdev;                        /* Device to other host */
676         struct iphdr  *iph;                     /* Our new IP header */
677         unsigned int max_headroom;              /* The extra header space needed */
678         int    gre_hlen;
679         __be32 dst;
680         int    mtu;
681
682         if (tunnel->recursion++) {
683                 stats->collisions++;
684                 goto tx_error;
685         }
686
687         if (dev->type == ARPHRD_ETHER)
688                 IPCB(skb)->flags = 0;
689
690         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
691                 gre_hlen = 0;
692                 tiph = (struct iphdr *)skb->data;
693         } else {
694                 gre_hlen = tunnel->hlen;
695                 tiph = &tunnel->parms.iph;
696         }
697
698         if ((dst = tiph->daddr) == 0) {
699                 /* NBMA tunnel */
700
701                 if (skb->dst == NULL) {
702                         stats->tx_fifo_errors++;
703                         goto tx_error;
704                 }
705
706                 if (skb->protocol == htons(ETH_P_IP)) {
707                         rt = skb->rtable;
708                         if ((dst = rt->rt_gateway) == 0)
709                                 goto tx_error_icmp;
710                 }
711 #ifdef CONFIG_IPV6
712                 else if (skb->protocol == htons(ETH_P_IPV6)) {
713                         struct in6_addr *addr6;
714                         int addr_type;
715                         struct neighbour *neigh = skb->dst->neighbour;
716
717                         if (neigh == NULL)
718                                 goto tx_error;
719
720                         addr6 = (struct in6_addr *)&neigh->primary_key;
721                         addr_type = ipv6_addr_type(addr6);
722
723                         if (addr_type == IPV6_ADDR_ANY) {
724                                 addr6 = &ipv6_hdr(skb)->daddr;
725                                 addr_type = ipv6_addr_type(addr6);
726                         }
727
728                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
729                                 goto tx_error_icmp;
730
731                         dst = addr6->s6_addr32[3];
732                 }
733 #endif
734                 else
735                         goto tx_error;
736         }
737
738         tos = tiph->tos;
739         if (tos&1) {
740                 if (skb->protocol == htons(ETH_P_IP))
741                         tos = old_iph->tos;
742                 tos &= ~1;
743         }
744
745         {
746                 struct flowi fl = { .oif = tunnel->parms.link,
747                                     .nl_u = { .ip4_u =
748                                               { .daddr = dst,
749                                                 .saddr = tiph->saddr,
750                                                 .tos = RT_TOS(tos) } },
751                                     .proto = IPPROTO_GRE };
752                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
753                         stats->tx_carrier_errors++;
754                         goto tx_error;
755                 }
756         }
757         tdev = rt->u.dst.dev;
758
759         if (tdev == dev) {
760                 ip_rt_put(rt);
761                 stats->collisions++;
762                 goto tx_error;
763         }
764
765         df = tiph->frag_off;
766         if (df)
767                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
768         else
769                 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
770
771         if (skb->dst)
772                 skb->dst->ops->update_pmtu(skb->dst, mtu);
773
774         if (skb->protocol == htons(ETH_P_IP)) {
775                 df |= (old_iph->frag_off&htons(IP_DF));
776
777                 if ((old_iph->frag_off&htons(IP_DF)) &&
778                     mtu < ntohs(old_iph->tot_len)) {
779                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
780                         ip_rt_put(rt);
781                         goto tx_error;
782                 }
783         }
784 #ifdef CONFIG_IPV6
785         else if (skb->protocol == htons(ETH_P_IPV6)) {
786                 struct rt6_info *rt6 = (struct rt6_info *)skb->dst;
787
788                 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
789                         if ((tunnel->parms.iph.daddr &&
790                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
791                             rt6->rt6i_dst.plen == 128) {
792                                 rt6->rt6i_flags |= RTF_MODIFIED;
793                                 skb->dst->metrics[RTAX_MTU-1] = mtu;
794                         }
795                 }
796
797                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
798                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
799                         ip_rt_put(rt);
800                         goto tx_error;
801                 }
802         }
803 #endif
804
805         if (tunnel->err_count > 0) {
806                 if (time_before(jiffies,
807                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
808                         tunnel->err_count--;
809
810                         dst_link_failure(skb);
811                 } else
812                         tunnel->err_count = 0;
813         }
814
815         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
816
817         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
818             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
819                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
820                 if (!new_skb) {
821                         ip_rt_put(rt);
822                         stats->tx_dropped++;
823                         dev_kfree_skb(skb);
824                         tunnel->recursion--;
825                         return 0;
826                 }
827                 if (skb->sk)
828                         skb_set_owner_w(new_skb, skb->sk);
829                 dev_kfree_skb(skb);
830                 skb = new_skb;
831                 old_iph = ip_hdr(skb);
832         }
833
834         skb_reset_transport_header(skb);
835         skb_push(skb, gre_hlen);
836         skb_reset_network_header(skb);
837         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
838         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
839                               IPSKB_REROUTED);
840         dst_release(skb->dst);
841         skb->dst = &rt->u.dst;
842
843         /*
844          *      Push down and install the IPIP header.
845          */
846
847         iph                     =       ip_hdr(skb);
848         iph->version            =       4;
849         iph->ihl                =       sizeof(struct iphdr) >> 2;
850         iph->frag_off           =       df;
851         iph->protocol           =       IPPROTO_GRE;
852         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
853         iph->daddr              =       rt->rt_dst;
854         iph->saddr              =       rt->rt_src;
855
856         if ((iph->ttl = tiph->ttl) == 0) {
857                 if (skb->protocol == htons(ETH_P_IP))
858                         iph->ttl = old_iph->ttl;
859 #ifdef CONFIG_IPV6
860                 else if (skb->protocol == htons(ETH_P_IPV6))
861                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
862 #endif
863                 else
864                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
865         }
866
867         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
868         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
869                                    htons(ETH_P_TEB) : skb->protocol;
870
871         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
872                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
873
874                 if (tunnel->parms.o_flags&GRE_SEQ) {
875                         ++tunnel->o_seqno;
876                         *ptr = htonl(tunnel->o_seqno);
877                         ptr--;
878                 }
879                 if (tunnel->parms.o_flags&GRE_KEY) {
880                         *ptr = tunnel->parms.o_key;
881                         ptr--;
882                 }
883                 if (tunnel->parms.o_flags&GRE_CSUM) {
884                         *ptr = 0;
885                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
886                 }
887         }
888
889         nf_reset(skb);
890
891         IPTUNNEL_XMIT();
892         tunnel->recursion--;
893         return 0;
894
895 tx_error_icmp:
896         dst_link_failure(skb);
897
898 tx_error:
899         stats->tx_errors++;
900         dev_kfree_skb(skb);
901         tunnel->recursion--;
902         return 0;
903 }
904
905 static int ipgre_tunnel_bind_dev(struct net_device *dev)
906 {
907         struct net_device *tdev = NULL;
908         struct ip_tunnel *tunnel;
909         struct iphdr *iph;
910         int hlen = LL_MAX_HEADER;
911         int mtu = ETH_DATA_LEN;
912         int addend = sizeof(struct iphdr) + 4;
913
914         tunnel = netdev_priv(dev);
915         iph = &tunnel->parms.iph;
916
917         /* Guess output device to choose reasonable mtu and needed_headroom */
918
919         if (iph->daddr) {
920                 struct flowi fl = { .oif = tunnel->parms.link,
921                                     .nl_u = { .ip4_u =
922                                               { .daddr = iph->daddr,
923                                                 .saddr = iph->saddr,
924                                                 .tos = RT_TOS(iph->tos) } },
925                                     .proto = IPPROTO_GRE };
926                 struct rtable *rt;
927                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
928                         tdev = rt->u.dst.dev;
929                         ip_rt_put(rt);
930                 }
931
932                 if (dev->type != ARPHRD_ETHER)
933                         dev->flags |= IFF_POINTOPOINT;
934         }
935
936         if (!tdev && tunnel->parms.link)
937                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
938
939         if (tdev) {
940                 hlen = tdev->hard_header_len + tdev->needed_headroom;
941                 mtu = tdev->mtu;
942         }
943         dev->iflink = tunnel->parms.link;
944
945         /* Precalculate GRE options length */
946         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
947                 if (tunnel->parms.o_flags&GRE_CSUM)
948                         addend += 4;
949                 if (tunnel->parms.o_flags&GRE_KEY)
950                         addend += 4;
951                 if (tunnel->parms.o_flags&GRE_SEQ)
952                         addend += 4;
953         }
954         dev->needed_headroom = addend + hlen;
955         mtu -= dev->hard_header_len - addend;
956
957         if (mtu < 68)
958                 mtu = 68;
959
960         tunnel->hlen = addend;
961
962         return mtu;
963 }
964
965 static int
966 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
967 {
968         int err = 0;
969         struct ip_tunnel_parm p;
970         struct ip_tunnel *t;
971         struct net *net = dev_net(dev);
972         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
973
974         switch (cmd) {
975         case SIOCGETTUNNEL:
976                 t = NULL;
977                 if (dev == ign->fb_tunnel_dev) {
978                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
979                                 err = -EFAULT;
980                                 break;
981                         }
982                         t = ipgre_tunnel_locate(net, &p, 0);
983                 }
984                 if (t == NULL)
985                         t = netdev_priv(dev);
986                 memcpy(&p, &t->parms, sizeof(p));
987                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
988                         err = -EFAULT;
989                 break;
990
991         case SIOCADDTUNNEL:
992         case SIOCCHGTUNNEL:
993                 err = -EPERM;
994                 if (!capable(CAP_NET_ADMIN))
995                         goto done;
996
997                 err = -EFAULT;
998                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
999                         goto done;
1000
1001                 err = -EINVAL;
1002                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1003                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1004                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1005                         goto done;
1006                 if (p.iph.ttl)
1007                         p.iph.frag_off |= htons(IP_DF);
1008
1009                 if (!(p.i_flags&GRE_KEY))
1010                         p.i_key = 0;
1011                 if (!(p.o_flags&GRE_KEY))
1012                         p.o_key = 0;
1013
1014                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1015
1016                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1017                         if (t != NULL) {
1018                                 if (t->dev != dev) {
1019                                         err = -EEXIST;
1020                                         break;
1021                                 }
1022                         } else {
1023                                 unsigned nflags = 0;
1024
1025                                 t = netdev_priv(dev);
1026
1027                                 if (ipv4_is_multicast(p.iph.daddr))
1028                                         nflags = IFF_BROADCAST;
1029                                 else if (p.iph.daddr)
1030                                         nflags = IFF_POINTOPOINT;
1031
1032                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1033                                         err = -EINVAL;
1034                                         break;
1035                                 }
1036                                 ipgre_tunnel_unlink(ign, t);
1037                                 t->parms.iph.saddr = p.iph.saddr;
1038                                 t->parms.iph.daddr = p.iph.daddr;
1039                                 t->parms.i_key = p.i_key;
1040                                 t->parms.o_key = p.o_key;
1041                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1042                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1043                                 ipgre_tunnel_link(ign, t);
1044                                 netdev_state_change(dev);
1045                         }
1046                 }
1047
1048                 if (t) {
1049                         err = 0;
1050                         if (cmd == SIOCCHGTUNNEL) {
1051                                 t->parms.iph.ttl = p.iph.ttl;
1052                                 t->parms.iph.tos = p.iph.tos;
1053                                 t->parms.iph.frag_off = p.iph.frag_off;
1054                                 if (t->parms.link != p.link) {
1055                                         t->parms.link = p.link;
1056                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1057                                         netdev_state_change(dev);
1058                                 }
1059                         }
1060                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1061                                 err = -EFAULT;
1062                 } else
1063                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1064                 break;
1065
1066         case SIOCDELTUNNEL:
1067                 err = -EPERM;
1068                 if (!capable(CAP_NET_ADMIN))
1069                         goto done;
1070
1071                 if (dev == ign->fb_tunnel_dev) {
1072                         err = -EFAULT;
1073                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1074                                 goto done;
1075                         err = -ENOENT;
1076                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1077                                 goto done;
1078                         err = -EPERM;
1079                         if (t == netdev_priv(ign->fb_tunnel_dev))
1080                                 goto done;
1081                         dev = t->dev;
1082                 }
1083                 unregister_netdevice(dev);
1084                 err = 0;
1085                 break;
1086
1087         default:
1088                 err = -EINVAL;
1089         }
1090
1091 done:
1092         return err;
1093 }
1094
1095 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1096 {
1097         struct ip_tunnel *tunnel = netdev_priv(dev);
1098         if (new_mtu < 68 ||
1099             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1100                 return -EINVAL;
1101         dev->mtu = new_mtu;
1102         return 0;
1103 }
1104
1105 /* Nice toy. Unfortunately, useless in real life :-)
1106    It allows to construct virtual multiprotocol broadcast "LAN"
1107    over the Internet, provided multicast routing is tuned.
1108
1109
1110    I have no idea was this bicycle invented before me,
1111    so that I had to set ARPHRD_IPGRE to a random value.
1112    I have an impression, that Cisco could make something similar,
1113    but this feature is apparently missing in IOS<=11.2(8).
1114
1115    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1116    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1117
1118    ping -t 255 224.66.66.66
1119
1120    If nobody answers, mbone does not work.
1121
1122    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1123    ip addr add 10.66.66.<somewhat>/24 dev Universe
1124    ifconfig Universe up
1125    ifconfig Universe add fe80::<Your_real_addr>/10
1126    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1127    ftp 10.66.66.66
1128    ...
1129    ftp fec0:6666:6666::193.233.7.65
1130    ...
1131
1132  */
1133
1134 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1135                         unsigned short type,
1136                         const void *daddr, const void *saddr, unsigned len)
1137 {
1138         struct ip_tunnel *t = netdev_priv(dev);
1139         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1140         __be16 *p = (__be16*)(iph+1);
1141
1142         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1143         p[0]            = t->parms.o_flags;
1144         p[1]            = htons(type);
1145
1146         /*
1147          *      Set the source hardware address.
1148          */
1149
1150         if (saddr)
1151                 memcpy(&iph->saddr, saddr, 4);
1152
1153         if (daddr) {
1154                 memcpy(&iph->daddr, daddr, 4);
1155                 return t->hlen;
1156         }
1157         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1158                 return t->hlen;
1159
1160         return -t->hlen;
1161 }
1162
1163 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1164 {
1165         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1166         memcpy(haddr, &iph->saddr, 4);
1167         return 4;
1168 }
1169
1170 static const struct header_ops ipgre_header_ops = {
1171         .create = ipgre_header,
1172         .parse  = ipgre_header_parse,
1173 };
1174
1175 #ifdef CONFIG_NET_IPGRE_BROADCAST
1176 static int ipgre_open(struct net_device *dev)
1177 {
1178         struct ip_tunnel *t = netdev_priv(dev);
1179
1180         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1181                 struct flowi fl = { .oif = t->parms.link,
1182                                     .nl_u = { .ip4_u =
1183                                               { .daddr = t->parms.iph.daddr,
1184                                                 .saddr = t->parms.iph.saddr,
1185                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1186                                     .proto = IPPROTO_GRE };
1187                 struct rtable *rt;
1188                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1189                         return -EADDRNOTAVAIL;
1190                 dev = rt->u.dst.dev;
1191                 ip_rt_put(rt);
1192                 if (__in_dev_get_rtnl(dev) == NULL)
1193                         return -EADDRNOTAVAIL;
1194                 t->mlink = dev->ifindex;
1195                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1196         }
1197         return 0;
1198 }
1199
1200 static int ipgre_close(struct net_device *dev)
1201 {
1202         struct ip_tunnel *t = netdev_priv(dev);
1203
1204         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1205                 struct in_device *in_dev;
1206                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1207                 if (in_dev) {
1208                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1209                         in_dev_put(in_dev);
1210                 }
1211         }
1212         return 0;
1213 }
1214
1215 #endif
1216
1217 static const struct net_device_ops ipgre_netdev_ops = {
1218         .ndo_init               = ipgre_tunnel_init,
1219         .ndo_uninit             = ipgre_tunnel_uninit,
1220 #ifdef CONFIG_NET_IPGRE_BROADCAST
1221         .ndo_open               = ipgre_open,
1222         .ndo_stop               = ipgre_close,
1223 #endif
1224         .ndo_start_xmit         = ipgre_tunnel_xmit,
1225         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1226         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1227 };
1228
1229 static void ipgre_tunnel_setup(struct net_device *dev)
1230 {
1231         dev->netdev_ops         = &ipgre_netdev_ops;
1232         dev->destructor         = free_netdev;
1233
1234         dev->type               = ARPHRD_IPGRE;
1235         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1236         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1237         dev->flags              = IFF_NOARP;
1238         dev->iflink             = 0;
1239         dev->addr_len           = 4;
1240         dev->features           |= NETIF_F_NETNS_LOCAL;
1241         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1242 }
1243
1244 static int ipgre_tunnel_init(struct net_device *dev)
1245 {
1246         struct ip_tunnel *tunnel;
1247         struct iphdr *iph;
1248
1249         tunnel = netdev_priv(dev);
1250         iph = &tunnel->parms.iph;
1251
1252         tunnel->dev = dev;
1253         strcpy(tunnel->parms.name, dev->name);
1254
1255         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1256         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1257
1258         if (iph->daddr) {
1259 #ifdef CONFIG_NET_IPGRE_BROADCAST
1260                 if (ipv4_is_multicast(iph->daddr)) {
1261                         if (!iph->saddr)
1262                                 return -EINVAL;
1263                         dev->flags = IFF_BROADCAST;
1264                         dev->header_ops = &ipgre_header_ops;
1265                 }
1266 #endif
1267         } else
1268                 dev->header_ops = &ipgre_header_ops;
1269
1270         return 0;
1271 }
1272
1273 static void ipgre_fb_tunnel_init(struct net_device *dev)
1274 {
1275         struct ip_tunnel *tunnel = netdev_priv(dev);
1276         struct iphdr *iph = &tunnel->parms.iph;
1277         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1278
1279         tunnel->dev = dev;
1280         strcpy(tunnel->parms.name, dev->name);
1281
1282         iph->version            = 4;
1283         iph->protocol           = IPPROTO_GRE;
1284         iph->ihl                = 5;
1285         tunnel->hlen            = sizeof(struct iphdr) + 4;
1286
1287         dev_hold(dev);
1288         ign->tunnels_wc[0]      = tunnel;
1289 }
1290
1291
1292 static struct net_protocol ipgre_protocol = {
1293         .handler        =       ipgre_rcv,
1294         .err_handler    =       ipgre_err,
1295         .netns_ok       =       1,
1296 };
1297
1298 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1299 {
1300         int prio;
1301
1302         for (prio = 0; prio < 4; prio++) {
1303                 int h;
1304                 for (h = 0; h < HASH_SIZE; h++) {
1305                         struct ip_tunnel *t;
1306                         while ((t = ign->tunnels[prio][h]) != NULL)
1307                                 unregister_netdevice(t->dev);
1308                 }
1309         }
1310 }
1311
1312 static int ipgre_init_net(struct net *net)
1313 {
1314         int err;
1315         struct ipgre_net *ign;
1316
1317         err = -ENOMEM;
1318         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1319         if (ign == NULL)
1320                 goto err_alloc;
1321
1322         err = net_assign_generic(net, ipgre_net_id, ign);
1323         if (err < 0)
1324                 goto err_assign;
1325
1326         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1327                                            ipgre_tunnel_setup);
1328         if (!ign->fb_tunnel_dev) {
1329                 err = -ENOMEM;
1330                 goto err_alloc_dev;
1331         }
1332         dev_net_set(ign->fb_tunnel_dev, net);
1333
1334         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1335         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1336
1337         if ((err = register_netdev(ign->fb_tunnel_dev)))
1338                 goto err_reg_dev;
1339
1340         return 0;
1341
1342 err_reg_dev:
1343         free_netdev(ign->fb_tunnel_dev);
1344 err_alloc_dev:
1345         /* nothing */
1346 err_assign:
1347         kfree(ign);
1348 err_alloc:
1349         return err;
1350 }
1351
1352 static void ipgre_exit_net(struct net *net)
1353 {
1354         struct ipgre_net *ign;
1355
1356         ign = net_generic(net, ipgre_net_id);
1357         rtnl_lock();
1358         ipgre_destroy_tunnels(ign);
1359         rtnl_unlock();
1360         kfree(ign);
1361 }
1362
1363 static struct pernet_operations ipgre_net_ops = {
1364         .init = ipgre_init_net,
1365         .exit = ipgre_exit_net,
1366 };
1367
1368 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1369 {
1370         __be16 flags;
1371
1372         if (!data)
1373                 return 0;
1374
1375         flags = 0;
1376         if (data[IFLA_GRE_IFLAGS])
1377                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1378         if (data[IFLA_GRE_OFLAGS])
1379                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1380         if (flags & (GRE_VERSION|GRE_ROUTING))
1381                 return -EINVAL;
1382
1383         return 0;
1384 }
1385
1386 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1387 {
1388         __be32 daddr;
1389
1390         if (tb[IFLA_ADDRESS]) {
1391                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1392                         return -EINVAL;
1393                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1394                         return -EADDRNOTAVAIL;
1395         }
1396
1397         if (!data)
1398                 goto out;
1399
1400         if (data[IFLA_GRE_REMOTE]) {
1401                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1402                 if (!daddr)
1403                         return -EINVAL;
1404         }
1405
1406 out:
1407         return ipgre_tunnel_validate(tb, data);
1408 }
1409
1410 static void ipgre_netlink_parms(struct nlattr *data[],
1411                                 struct ip_tunnel_parm *parms)
1412 {
1413         memset(parms, 0, sizeof(*parms));
1414
1415         parms->iph.protocol = IPPROTO_GRE;
1416
1417         if (!data)
1418                 return;
1419
1420         if (data[IFLA_GRE_LINK])
1421                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1422
1423         if (data[IFLA_GRE_IFLAGS])
1424                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1425
1426         if (data[IFLA_GRE_OFLAGS])
1427                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1428
1429         if (data[IFLA_GRE_IKEY])
1430                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1431
1432         if (data[IFLA_GRE_OKEY])
1433                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1434
1435         if (data[IFLA_GRE_LOCAL])
1436                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1437
1438         if (data[IFLA_GRE_REMOTE])
1439                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1440
1441         if (data[IFLA_GRE_TTL])
1442                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1443
1444         if (data[IFLA_GRE_TOS])
1445                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1446
1447         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1448                 parms->iph.frag_off = htons(IP_DF);
1449 }
1450
1451 static int ipgre_tap_init(struct net_device *dev)
1452 {
1453         struct ip_tunnel *tunnel;
1454
1455         tunnel = netdev_priv(dev);
1456
1457         tunnel->dev = dev;
1458         strcpy(tunnel->parms.name, dev->name);
1459
1460         ipgre_tunnel_bind_dev(dev);
1461
1462         return 0;
1463 }
1464
1465 static const struct net_device_ops ipgre_tap_netdev_ops = {
1466         .ndo_init               = ipgre_tap_init,
1467         .ndo_uninit             = ipgre_tunnel_uninit,
1468         .ndo_start_xmit         = ipgre_tunnel_xmit,
1469         .ndo_set_mac_address    = eth_mac_addr,
1470         .ndo_validate_addr      = eth_validate_addr,
1471         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1472 };
1473
1474 static void ipgre_tap_setup(struct net_device *dev)
1475 {
1476
1477         ether_setup(dev);
1478
1479         dev->netdev_ops         = &ipgre_netdev_ops;
1480         dev->destructor         = free_netdev;
1481
1482         dev->iflink             = 0;
1483         dev->features           |= NETIF_F_NETNS_LOCAL;
1484 }
1485
1486 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1487                          struct nlattr *data[])
1488 {
1489         struct ip_tunnel *nt;
1490         struct net *net = dev_net(dev);
1491         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1492         int mtu;
1493         int err;
1494
1495         nt = netdev_priv(dev);
1496         ipgre_netlink_parms(data, &nt->parms);
1497
1498         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1499                 return -EEXIST;
1500
1501         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1502                 random_ether_addr(dev->dev_addr);
1503
1504         mtu = ipgre_tunnel_bind_dev(dev);
1505         if (!tb[IFLA_MTU])
1506                 dev->mtu = mtu;
1507
1508         err = register_netdevice(dev);
1509         if (err)
1510                 goto out;
1511
1512         dev_hold(dev);
1513         ipgre_tunnel_link(ign, nt);
1514
1515 out:
1516         return err;
1517 }
1518
1519 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1520                             struct nlattr *data[])
1521 {
1522         struct ip_tunnel *t, *nt;
1523         struct net *net = dev_net(dev);
1524         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1525         struct ip_tunnel_parm p;
1526         int mtu;
1527
1528         if (dev == ign->fb_tunnel_dev)
1529                 return -EINVAL;
1530
1531         nt = netdev_priv(dev);
1532         ipgre_netlink_parms(data, &p);
1533
1534         t = ipgre_tunnel_locate(net, &p, 0);
1535
1536         if (t) {
1537                 if (t->dev != dev)
1538                         return -EEXIST;
1539         } else {
1540                 unsigned nflags = 0;
1541
1542                 t = nt;
1543
1544                 if (ipv4_is_multicast(p.iph.daddr))
1545                         nflags = IFF_BROADCAST;
1546                 else if (p.iph.daddr)
1547                         nflags = IFF_POINTOPOINT;
1548
1549                 if ((dev->flags ^ nflags) &
1550                     (IFF_POINTOPOINT | IFF_BROADCAST))
1551                         return -EINVAL;
1552
1553                 ipgre_tunnel_unlink(ign, t);
1554                 t->parms.iph.saddr = p.iph.saddr;
1555                 t->parms.iph.daddr = p.iph.daddr;
1556                 t->parms.i_key = p.i_key;
1557                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1558                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1559                 ipgre_tunnel_link(ign, t);
1560                 netdev_state_change(dev);
1561         }
1562
1563         t->parms.o_key = p.o_key;
1564         t->parms.iph.ttl = p.iph.ttl;
1565         t->parms.iph.tos = p.iph.tos;
1566         t->parms.iph.frag_off = p.iph.frag_off;
1567
1568         if (t->parms.link != p.link) {
1569                 t->parms.link = p.link;
1570                 mtu = ipgre_tunnel_bind_dev(dev);
1571                 if (!tb[IFLA_MTU])
1572                         dev->mtu = mtu;
1573                 netdev_state_change(dev);
1574         }
1575
1576         return 0;
1577 }
1578
1579 static size_t ipgre_get_size(const struct net_device *dev)
1580 {
1581         return
1582                 /* IFLA_GRE_LINK */
1583                 nla_total_size(4) +
1584                 /* IFLA_GRE_IFLAGS */
1585                 nla_total_size(2) +
1586                 /* IFLA_GRE_OFLAGS */
1587                 nla_total_size(2) +
1588                 /* IFLA_GRE_IKEY */
1589                 nla_total_size(4) +
1590                 /* IFLA_GRE_OKEY */
1591                 nla_total_size(4) +
1592                 /* IFLA_GRE_LOCAL */
1593                 nla_total_size(4) +
1594                 /* IFLA_GRE_REMOTE */
1595                 nla_total_size(4) +
1596                 /* IFLA_GRE_TTL */
1597                 nla_total_size(1) +
1598                 /* IFLA_GRE_TOS */
1599                 nla_total_size(1) +
1600                 /* IFLA_GRE_PMTUDISC */
1601                 nla_total_size(1) +
1602                 0;
1603 }
1604
1605 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1606 {
1607         struct ip_tunnel *t = netdev_priv(dev);
1608         struct ip_tunnel_parm *p = &t->parms;
1609
1610         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1611         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1612         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1613         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1614         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1615         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1616         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1617         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1618         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1619         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1620
1621         return 0;
1622
1623 nla_put_failure:
1624         return -EMSGSIZE;
1625 }
1626
1627 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1628         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1629         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1630         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1631         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1632         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1633         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1634         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1635         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1636         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1637         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1638 };
1639
1640 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1641         .kind           = "gre",
1642         .maxtype        = IFLA_GRE_MAX,
1643         .policy         = ipgre_policy,
1644         .priv_size      = sizeof(struct ip_tunnel),
1645         .setup          = ipgre_tunnel_setup,
1646         .validate       = ipgre_tunnel_validate,
1647         .newlink        = ipgre_newlink,
1648         .changelink     = ipgre_changelink,
1649         .get_size       = ipgre_get_size,
1650         .fill_info      = ipgre_fill_info,
1651 };
1652
1653 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1654         .kind           = "gretap",
1655         .maxtype        = IFLA_GRE_MAX,
1656         .policy         = ipgre_policy,
1657         .priv_size      = sizeof(struct ip_tunnel),
1658         .setup          = ipgre_tap_setup,
1659         .validate       = ipgre_tap_validate,
1660         .newlink        = ipgre_newlink,
1661         .changelink     = ipgre_changelink,
1662         .get_size       = ipgre_get_size,
1663         .fill_info      = ipgre_fill_info,
1664 };
1665
1666 /*
1667  *      And now the modules code and kernel interface.
1668  */
1669
1670 static int __init ipgre_init(void)
1671 {
1672         int err;
1673
1674         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1675
1676         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1677                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1678                 return -EAGAIN;
1679         }
1680
1681         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1682         if (err < 0)
1683                 goto gen_device_failed;
1684
1685         err = rtnl_link_register(&ipgre_link_ops);
1686         if (err < 0)
1687                 goto rtnl_link_failed;
1688
1689         err = rtnl_link_register(&ipgre_tap_ops);
1690         if (err < 0)
1691                 goto tap_ops_failed;
1692
1693 out:
1694         return err;
1695
1696 tap_ops_failed:
1697         rtnl_link_unregister(&ipgre_link_ops);
1698 rtnl_link_failed:
1699         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1700 gen_device_failed:
1701         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1702         goto out;
1703 }
1704
1705 static void __exit ipgre_fini(void)
1706 {
1707         rtnl_link_unregister(&ipgre_tap_ops);
1708         rtnl_link_unregister(&ipgre_link_ops);
1709         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1710         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1711                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1712 }
1713
1714 module_init(ipgre_init);
1715 module_exit(ipgre_fini);
1716 MODULE_LICENSE("GPL");
1717 MODULE_ALIAS_RTNL_LINK("gre");
1718 MODULE_ALIAS_RTNL_LINK("gretap");