tun: Introduce tun_file
[linux-2.6] / drivers / net / tun.c
1 /*
2  *  TUN - Universal TUN/TAP device driver.
3  *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
4  *
5  *  This program is free software; you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation; either version 2 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  *  GNU General Public License for more details.
14  *
15  *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
16  */
17
18 /*
19  *  Changes:
20  *
21  *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
22  *    Add TUNSETLINK ioctl to set the link encapsulation
23  *
24  *  Mark Smith <markzzzsmith@yahoo.com.au>
25  *    Use random_ether_addr() for tap MAC address.
26  *
27  *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
28  *    Fixes in packet dropping, queue length setting and queue wakeup.
29  *    Increased default tx queue length.
30  *    Added ethtool API.
31  *    Minor cleanups
32  *
33  *  Daniel Podlejski <underley@underley.eu.org>
34  *    Modifications for 2.3.99-pre5 kernel.
35  */
36
37 #define DRV_NAME        "tun"
38 #define DRV_VERSION     "1.6"
39 #define DRV_DESCRIPTION "Universal TUN/TAP device driver"
40 #define DRV_COPYRIGHT   "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"
41
42 #include <linux/module.h>
43 #include <linux/errno.h>
44 #include <linux/kernel.h>
45 #include <linux/major.h>
46 #include <linux/slab.h>
47 #include <linux/smp_lock.h>
48 #include <linux/poll.h>
49 #include <linux/fcntl.h>
50 #include <linux/init.h>
51 #include <linux/skbuff.h>
52 #include <linux/netdevice.h>
53 #include <linux/etherdevice.h>
54 #include <linux/miscdevice.h>
55 #include <linux/ethtool.h>
56 #include <linux/rtnetlink.h>
57 #include <linux/if.h>
58 #include <linux/if_arp.h>
59 #include <linux/if_ether.h>
60 #include <linux/if_tun.h>
61 #include <linux/crc32.h>
62 #include <linux/nsproxy.h>
63 #include <linux/virtio_net.h>
64 #include <net/net_namespace.h>
65 #include <net/netns/generic.h>
66
67 #include <asm/system.h>
68 #include <asm/uaccess.h>
69
70 /* Uncomment to enable debugging */
71 /* #define TUN_DEBUG 1 */
72
73 #ifdef TUN_DEBUG
74 static int debug;
75
76 #define DBG  if(tun->debug)printk
77 #define DBG1 if(debug==2)printk
78 #else
79 #define DBG( a... )
80 #define DBG1( a... )
81 #endif
82
/* TX-path MAC filter for TAP devices (TUNSETTXFILTER).  The first
 * FLT_EXACT_COUNT addresses are matched exactly; any further addresses
 * are folded into the 64-bit hash mask below. */
83 #define FLT_EXACT_COUNT 8
84 struct tap_filter {
85         unsigned int    count;    /* Number of addrs. Zero means disabled */
86         u32             mask[2];  /* Mask of the hashed addrs */
87         unsigned char   addr[FLT_EXACT_COUNT][ETH_ALEN];  /* exact-match list */
88 };
89
/* Per-open-file state, stored in file->private_data by the chardev
 * open path (outside this view).  tun is the attached device, or NULL
 * while the descriptor is not bound to any interface. */
90 struct tun_file {
91         struct tun_struct *tun;
92 };
93
/* Per-device state; lives in netdev_priv() of the tun/tap net_device. */
94 struct tun_struct {
95         struct tun_file         *tfile;   /* attached chardev file, NULL if detached */
96         unsigned int            flags;    /* TUN_* flags (type, NO_PI, VNET_HDR, ...) */
97         uid_t                   owner;    /* uid allowed to attach; -1 = unrestricted */
98         gid_t                   group;    /* gid allowed to attach; -1 = unrestricted */
99
100         wait_queue_head_t       read_wait;  /* readers sleep here for packets */
101         struct sk_buff_head     readq;      /* packets queued from the stack */
102
103         struct net_device       *dev;       /* back-pointer to our net device */
104         struct fasync_struct    *fasync;    /* SIGIO notification list */
105
106         struct tap_filter       txflt;      /* TAP-only TX MAC filter */
107
108 #ifdef TUN_DEBUG
109         int debug;
110 #endif
111 };
112
/* Bind an open chardev file to a device.  Caller must hold the RTNL.
 * Returns 0 on success, -EINVAL if the file is already attached,
 * -EBUSY if the device already has an attached file, or -EPERM if the
 * caller fails the owner/group check and lacks CAP_NET_ADMIN.
 * On success takes a reference on the device's net namespace, dropped
 * in __tun_detach(). */
113 static int tun_attach(struct tun_struct *tun, struct file *file)
114 {
115         struct tun_file *tfile = file->private_data;
116         const struct cred *cred = current_cred();
117
118         ASSERT_RTNL();
119
120         if (tfile->tun)
121                 return -EINVAL;
122
123         if (tun->tfile)
124                 return -EBUSY;
125
126         /* Check permissions: owner/group of -1 means "not restricted";
127          * CAP_NET_ADMIN overrides a mismatch. */
127         if (((tun->owner != -1 && cred->euid != tun->owner) ||
128              (tun->group != -1 && cred->egid != tun->group)) &&
129                 !capable(CAP_NET_ADMIN))
130                 return -EPERM;
131
132         tfile->tun = tun;
133         tun->tfile = tfile;
134         get_net(dev_net(tun->dev));
135
136         return 0;
137 }
138
/* Undo tun_attach(): break both link pointers, drop the namespace
 * reference taken at attach time, and discard any queued packets. */
139 static void __tun_detach(struct tun_struct *tun)
140 {
141         struct tun_file *tfile = tun->tfile;
142
143         /* Detach from net device */
144         tfile->tun = NULL;
145         tun->tfile = NULL;
146         put_net(dev_net(tun->dev));
147
148         /* Drop read queue */
149         skb_queue_purge(&tun->readq);
150 }
151
/* Device currently attached to @tfile, or NULL if detached.
 * No reference counting yet (see tun_put()). */
152 static struct tun_struct *__tun_get(struct tun_file *tfile)
153 {
154         return tfile->tun;
155 }
156
157 static struct tun_struct *tun_get(struct file *file)
158 {
159         return __tun_get(file->private_data);
160 }
161
/* Release a reference obtained via tun_get()/__tun_get().
 * Currently a no-op placeholder; kept so callers are already
 * balanced when real refcounting is introduced. */
162 static void tun_put(struct tun_struct *tun)
163 {
164         /* Noop for now */
165 }
166
167 /* TAP filterting */
/* Record @addr in the 64-bit hash mask: the top 6 bits of the
 * Ethernet CRC select one of 64 bucket bits across mask[0..1]. */
168 static void addr_hash_set(u32 *mask, const u8 *addr)
169 {
170         int n = ether_crc(ETH_ALEN, addr) >> 26;
171         mask[n >> 5] |= (1 << (n & 31));
172 }
173
/* Test whether @addr's hash bucket bit is set in @mask.  Non-zero
 * means "possibly in the set" (hash filter, false positives allowed). */
174 static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
175 {
176         int n = ether_crc(ETH_ALEN, addr) >> 26;
177         return mask[n >> 5] & (1 << (n & 31));
178 }
179
/* Replace the TX filter from a userspace struct tun_filter followed by
 * uf.count MAC addresses (TUNSETTXFILTER payload).  The first
 * FLT_EXACT_COUNT addresses become exact matches, the rest go into the
 * hash mask.  Returns the number of exact entries installed, or a
 * negative errno.  Lockless by design: count is zeroed while the
 * tables are rewritten, with wmb()s ordering the updates. */
180 static int update_filter(struct tap_filter *filter, void __user *arg)
181 {
182         struct { u8 u[ETH_ALEN]; } *addr;
183         struct tun_filter uf;
184         int err, alen, n, nexact;
185
186         if (copy_from_user(&uf, arg, sizeof(uf)))
187                 return -EFAULT;
188
189         if (!uf.count) {
190                 /* Disabled */
191                 filter->count = 0;
192                 return 0;
193         }
194
195         alen = ETH_ALEN * uf.count;
196         addr = kmalloc(alen, GFP_KERNEL);
197         if (!addr)
198                 return -ENOMEM;
199
        /* Address list follows the header in the user buffer. */
200         if (copy_from_user(addr, arg + sizeof(uf), alen)) {
201                 err = -EFAULT;
202                 goto done;
203         }
204
205         /* The filter is updated without holding any locks. Which is
206          * perfectly safe. We disable it first and in the worst
207          * case we'll accept a few undesired packets. */
208         filter->count = 0;
209         wmb();
210
211         /* Use first set of addresses as an exact filter */
212         for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
213                 memcpy(filter->addr[n], addr[n].u, ETH_ALEN);
214
215         nexact = n;
216
217         /* The rest is hashed */
218         memset(filter->mask, 0, sizeof(filter->mask));
219         for (; n < uf.count; n++)
220                 addr_hash_set(filter->mask, addr[n].u);
221
222         /* For ALLMULTI just set the mask to all ones.
223          * This overrides the mask populated above. */
224         if ((uf.flags & TUN_FLT_ALLMULTI))
225                 memset(filter->mask, ~0, sizeof(filter->mask));
226
227         /* Now enable the filter */
228         wmb();
229         filter->count = nexact;
230
231         /* Return the number of exact filters */
232         err = nexact;
233
234 done:
235         kfree(addr);
236         return err;
237 }
238
/* Match @skb's destination MAC against the filter.
 * Returns: 0 - drop, !=0 - accept.  Unicast must match an exact
 * entry; multicast may also pass via the hash mask. */
240 static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
241 {
242         /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect
243          * at this point. */
244         struct ethhdr *eh = (struct ethhdr *) skb->data;
245         int i;
246
247         /* Exact match */
248         for (i = 0; i < filter->count; i++)
249                 if (!compare_ether_addr(eh->h_dest, filter->addr[i]))
250                         return 1;
251
252         /* Inexact match (multicast only) */
253         if (is_multicast_ether_addr(eh->h_dest))
254                 return addr_hash_test(filter->mask, eh->h_dest);
255
256         return 0;
257 }
258
259 /*
260  * Checks whether the packet is accepted or not.
261  * Returns: 0 - drop, !=0 - accept
262  */
263 static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
264 {
265         if (!filter->count)
266                 return 1;
267
268         return run_filter(filter, skb);
269 }
270
271 /* Network device part of the driver */
272
273 static const struct ethtool_ops tun_ethtool_ops;
274
/* net_device open hook: nothing to initialize, just enable transmit. */
static int tun_net_open(struct net_device *dev)
{
	netif_start_queue(dev);

	return 0;
}
281
/* net_device stop hook: disable transmit; queued skbs stay in readq. */
static int tun_net_close(struct net_device *dev)
{
	netif_stop_queue(dev);

	return 0;
}
288
/* Net device start xmit: the stack hands us a packet to deliver to the
 * userspace reader.  The skb is queued on tun->readq, the fasync list
 * gets SIGIO, and any sleeping reader is woken.  Packets are dropped
 * (stats.tx_dropped) when no file is attached or the TX filter rejects
 * them; queue-full behaviour depends on TUN_ONE_QUEUE. */
290 static int tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
291 {
292         struct tun_struct *tun = netdev_priv(dev);
293
294         DBG(KERN_INFO "%s: tun_net_xmit %d\n", tun->dev->name, skb->len);
295
296         /* Drop packet if interface is not attached */
297         if (!tun->tfile)
298                 goto drop;
299
300         /* Drop if the filter does not like it.
301          * This is a noop if the filter is disabled.
302          * Filter can be enabled only for the TAP devices. */
303         if (!check_filter(&tun->txflt, skb))
304                 goto drop;
305
306         if (skb_queue_len(&tun->readq) >= dev->tx_queue_len) {
307                 if (!(tun->flags & TUN_ONE_QUEUE)) {
308                         /* Normal queueing mode. */
309                         /* Packet scheduler handles dropping of further packets. */
310                         netif_stop_queue(dev);
311
312                         /* We won't see all dropped packets individually, so overrun
313                          * error is more appropriate. */
314                         dev->stats.tx_fifo_errors++;
315                 } else {
316                         /* Single queue mode.
317                          * Driver handles dropping of all packets itself. */
318                         goto drop;
319                 }
320         }
321
322         /* Enqueue packet */
323         skb_queue_tail(&tun->readq, skb);
324         dev->trans_start = jiffies;
325
326         /* Notify and wake up reader process */
327         if (tun->flags & TUN_FASYNC)
328                 kill_fasync(&tun->fasync, SIGIO, POLL_IN);
329         wake_up_interruptible(&tun->read_wait);
330         return 0;
331
332 drop:
333         dev->stats.tx_dropped++;
334         kfree_skb(skb);
335         return 0;
336 }
337
/* ndo_set_multicast_list hook: intentionally empty. */
338 static void tun_net_mclist(struct net_device *dev)
339 {
340         /*
341          * This callback is supposed to deal with mc filter in
342          * _rx_ path and has nothing to do with the _tx_ path.
343          * In rx path we always accept everything userspace gives us.
344          */
345         return;
346 }
347
348 #define MIN_MTU 68
349 #define MAX_MTU 65535
350
351 static int
352 tun_net_change_mtu(struct net_device *dev, int new_mtu)
353 {
354         if (new_mtu < MIN_MTU || new_mtu + dev->hard_header_len > MAX_MTU)
355                 return -EINVAL;
356         dev->mtu = new_mtu;
357         return 0;
358 }
359
/* Ops for point-to-point TUN devices (no L2, so no MAC handling). */
360 static const struct net_device_ops tun_netdev_ops = {
361         .ndo_open               = tun_net_open,
362         .ndo_stop               = tun_net_close,
363         .ndo_start_xmit         = tun_net_xmit,
364         .ndo_change_mtu         = tun_net_change_mtu,
365 };
366
/* Ops for Ethernet TAP devices; adds the L2 hooks (MAC address,
 * multicast list) on top of the shared TUN handlers. */
367 static const struct net_device_ops tap_netdev_ops = {
368         .ndo_open               = tun_net_open,
369         .ndo_stop               = tun_net_close,
370         .ndo_start_xmit         = tun_net_xmit,
371         .ndo_change_mtu         = tun_net_change_mtu,
372         .ndo_set_multicast_list = tun_net_mclist,
373         .ndo_set_mac_address    = eth_mac_addr,
374         .ndo_validate_addr      = eth_validate_addr,
375 };
376
/* Initialize net device according to the TUN/TAP type already stored
 * in tun->flags: a TUN device is a headerless point-to-point link,
 * a TAP device is a full Ethernet device with a random MAC. */
378 static void tun_net_init(struct net_device *dev)
379 {
380         struct tun_struct *tun = netdev_priv(dev);
381
382         switch (tun->flags & TUN_TYPE_MASK) {
383         case TUN_TUN_DEV:
384                 dev->netdev_ops = &tun_netdev_ops;
385
386                 /* Point-to-Point TUN Device */
387                 dev->hard_header_len = 0;
388                 dev->addr_len = 0;
389                 dev->mtu = 1500;
390
391                 /* Zero header length */
392                 dev->type = ARPHRD_NONE;
393                 dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
394                 dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
395                 break;
396
397         case TUN_TAP_DEV:
398                 dev->netdev_ops = &tap_netdev_ops;
399                 /* Ethernet TAP Device */
400                 ether_setup(dev);
401
402                 random_ether_addr(dev->dev_addr);
403
404                 dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
405                 break;
406         }
407 }
408
409 /* Character device part */
410
/* Chardev poll: writing is always possible; reading becomes possible
 * once packets sit in readq.  POLLERR if the file is not attached. */
412 static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
413 {
414         struct tun_struct *tun = tun_get(file);
415         unsigned int mask = POLLOUT | POLLWRNORM;
416
417         if (!tun)
418                 return POLLERR;
419
420         DBG(KERN_INFO "%s: tun_chr_poll\n", tun->dev->name);
421
422         poll_wait(file, &tun->read_wait, wait);
423
424         if (!skb_queue_empty(&tun->readq))
425                 mask |= POLLIN | POLLRDNORM;
426
427         tun_put(tun);
428         return mask;
429 }
430
/* prepad is the amount to reserve at front.  len is length after that.
 * linear is a hint as to how much to copy (usually headers).
 * First tries a single linear skb of the full size (quietly, via
 * __GFP_NOWARN); on failure falls back to a paged skb: a linear head
 * of 'linear' bytes plus whole zeroed pages as fragments.  Returns
 * NULL if the request cannot be satisfied. */
433 static struct sk_buff *tun_alloc_skb(size_t prepad, size_t len, size_t linear,
434                                      gfp_t gfp)
435 {
436         struct sk_buff *skb;
437         unsigned int i;
438
439         skb = alloc_skb(prepad + len, gfp|__GFP_NOWARN);
440         if (skb) {
441                 skb_reserve(skb, prepad);
442                 skb_put(skb, len);
443                 return skb;
444         }
445
446         /* Under a page?  Don't bother with paged skb. */
447         if (prepad + len < PAGE_SIZE)
448                 return NULL;
449
450         /* Start with a normal skb, and add pages. */
451         skb = alloc_skb(prepad + linear, gfp);
452         if (!skb)
453                 return NULL;
454
455         skb_reserve(skb, prepad);
456         skb_put(skb, linear);
457
458         len -= linear;
459
460         for (i = 0; i < MAX_SKB_FRAGS; i++) {
461                 skb_frag_t *f = &skb_shinfo(skb)->frags[i];
462
463                 f->page = alloc_page(gfp|__GFP_ZERO);
464                 if (!f->page)
465                         break;
466
467                 f->page_offset = 0;
468                 f->size = PAGE_SIZE;
469
                /* Keep skb length/truesize accounting in step with
                 * every fragment we attach. */
470                 skb->data_len += PAGE_SIZE;
471                 skb->len += PAGE_SIZE;
472                 skb->truesize += PAGE_SIZE;
473                 skb_shinfo(skb)->nr_frags++;
474
475                 if (len < PAGE_SIZE) {
476                         len = 0;
477                         break;
478                 }
479                 len -= PAGE_SIZE;
480         }
481
482         /* Too large, or alloc fail? */
483         if (unlikely(len)) {
484                 kfree_skb(skb);
485                 skb = NULL;
486         }
487
488         return skb;
489 }
490
/* Get packet from user space buffer and inject it into the network
 * stack via netif_rx_ni().  Strips the optional struct tun_pi and
 * struct virtio_net_hdr prefixes according to tun->flags, sets the
 * protocol (from the PI header, or sniffed from the IP version nibble
 * for TUN_NO_PI, or eth_type_trans() for TAP) and applies any GSO/csum
 * metadata.  Returns the number of bytes consumed or a negative errno. */
492 static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
493 {
494         struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
495         struct sk_buff *skb;
496         size_t len = count, align = 0;
497         struct virtio_net_hdr gso = { 0 };
498
499         if (!(tun->flags & TUN_NO_PI)) {
                /* len is size_t: if count < sizeof(pi) the subtraction
                 * wraps and len > count catches the underflow. */
500                 if ((len -= sizeof(pi)) > count)
501                         return -EINVAL;
502
503                 if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
504                         return -EFAULT;
505         }
506
507         if (tun->flags & TUN_VNET_HDR) {
                /* Same unsigned-wrap underflow check as above. */
508                 if ((len -= sizeof(gso)) > count)
509                         return -EINVAL;
510
511                 if (memcpy_fromiovec((void *)&gso, iv, sizeof(gso)))
512                         return -EFAULT;
513
514                 if (gso.hdr_len > len)
515                         return -EINVAL;
516         }
517
518         if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
519                 align = NET_IP_ALIGN;
520                 if (unlikely(len < ETH_HLEN))
521                         return -EINVAL;
522         }
523
524         if (!(skb = tun_alloc_skb(align, len, gso.hdr_len, GFP_KERNEL))) {
525                 tun->dev->stats.rx_dropped++;
526                 return -ENOMEM;
527         }
528
529         if (skb_copy_datagram_from_iovec(skb, 0, iv, len)) {
530                 tun->dev->stats.rx_dropped++;
531                 kfree_skb(skb);
532                 return -EFAULT;
533         }
534
535         if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
536                 if (!skb_partial_csum_set(skb, gso.csum_start,
537                                           gso.csum_offset)) {
538                         tun->dev->stats.rx_frame_errors++;
539                         kfree_skb(skb);
540                         return -EINVAL;
541                 }
542         } else if (tun->flags & TUN_NOCHECKSUM)
543                 skb->ip_summed = CHECKSUM_UNNECESSARY;
544
545         switch (tun->flags & TUN_TYPE_MASK) {
546         case TUN_TUN_DEV:
547                 if (tun->flags & TUN_NO_PI) {
                        /* No PI header: infer v4/v6 from the version nibble. */
548                         switch (skb->data[0] & 0xf0) {
549                         case 0x40:
550                                 pi.proto = htons(ETH_P_IP);
551                                 break;
552                         case 0x60:
553                                 pi.proto = htons(ETH_P_IPV6);
554                                 break;
555                         default:
556                                 tun->dev->stats.rx_dropped++;
557                                 kfree_skb(skb);
558                                 return -EINVAL;
559                         }
560                 }
561
562                 skb_reset_mac_header(skb);
563                 skb->protocol = pi.proto;
564                 skb->dev = tun->dev;
565                 break;
566         case TUN_TAP_DEV:
567                 skb->protocol = eth_type_trans(skb, tun->dev);
568                 break;
569         };
570
571         if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
572                 pr_debug("GSO!\n");
573                 switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
574                 case VIRTIO_NET_HDR_GSO_TCPV4:
575                         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
576                         break;
577                 case VIRTIO_NET_HDR_GSO_TCPV6:
578                         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
579                         break;
580                 default:
581                         tun->dev->stats.rx_frame_errors++;
582                         kfree_skb(skb);
583                         return -EINVAL;
584                 }
585
586                 if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN)
587                         skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
588
589                 skb_shinfo(skb)->gso_size = gso.gso_size;
590                 if (skb_shinfo(skb)->gso_size == 0) {
591                         tun->dev->stats.rx_frame_errors++;
592                         kfree_skb(skb);
593                         return -EINVAL;
594                 }
595
596                 /* Header must be checked, and gso_segs computed. */
597                 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
598                 skb_shinfo(skb)->gso_segs = 0;
599         }
600
601         netif_rx_ni(skb);
602
603         tun->dev->stats.rx_packets++;
604         tun->dev->stats.rx_bytes += len;
605
606         return count;
607 }
608
/* Chardev write path: userspace hands us a packet; thin wrapper that
 * resolves the attached device and delegates to tun_get_user().
 * -EBADFD when the file is not attached to an interface. */
609 static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
610                               unsigned long count, loff_t pos)
611 {
612         struct tun_struct *tun = tun_get(iocb->ki_filp);
613         ssize_t result;
614
615         if (!tun)
616                 return -EBADFD;
617
618         DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
619
620         result = tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count));
621
622         tun_put(tun);
623         return result;
624 }
625
/* Put packet to the user space buffer: the mirror of tun_get_user().
 * Prepends struct tun_pi (unless TUN_NO_PI) and struct virtio_net_hdr
 * (if TUN_VNET_HDR, filled from the skb's GSO/csum state), then copies
 * as much payload as fits in @len, flagging truncation via
 * TUN_PKT_STRIP.  Returns bytes written to the iovec or -errno. */
627 static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
628                                        struct sk_buff *skb,
629                                        struct iovec *iv, int len)
630 {
631         struct tun_pi pi = { 0, skb->protocol };
632         ssize_t total = 0;
633
634         if (!(tun->flags & TUN_NO_PI)) {
635                 if ((len -= sizeof(pi)) < 0)
636                         return -EINVAL;
637
638                 if (len < skb->len) {
639                         /* Packet will be striped */
640                         pi.flags |= TUN_PKT_STRIP;
641                 }
642
643                 if (memcpy_toiovec(iv, (void *) &pi, sizeof(pi)))
644                         return -EFAULT;
645                 total += sizeof(pi);
646         }
647
648         if (tun->flags & TUN_VNET_HDR) {
649                 struct virtio_net_hdr gso = { 0 }; /* no info leak */
650                 if ((len -= sizeof(gso)) < 0)
651                         return -EINVAL;
652
653                 if (skb_is_gso(skb)) {
654                         struct skb_shared_info *sinfo = skb_shinfo(skb);
655
656                         /* This is a hint as to how much should be linear. */
657                         gso.hdr_len = skb_headlen(skb);
658                         gso.gso_size = sinfo->gso_size;
659                         if (sinfo->gso_type & SKB_GSO_TCPV4)
660                                 gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
661                         else if (sinfo->gso_type & SKB_GSO_TCPV6)
662                                 gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
663                         else
664                                 BUG();
665                         if (sinfo->gso_type & SKB_GSO_TCP_ECN)
666                                 gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
667                 } else
668                         gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
669
670                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
671                         gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
672                         gso.csum_start = skb->csum_start - skb_headroom(skb);
673                         gso.csum_offset = skb->csum_offset;
674                 } /* else everything is zero */
675
676                 if (unlikely(memcpy_toiovec(iv, (void *)&gso, sizeof(gso))))
677                         return -EFAULT;
678                 total += sizeof(gso);
679         }
680
681         len = min_t(int, skb->len, len);
682
683         skb_copy_datagram_iovec(skb, 0, iv, len);
684         total += len;
685
686         tun->dev->stats.tx_packets++;
687         tun->dev->stats.tx_bytes += len;
688
689         return total;
690 }
691
/* Chardev read path: block (unless O_NONBLOCK) until a packet is
 * queued on readq, then copy one packet out via tun_put_user().
 * Uses the classic add_wait_queue/schedule loop; also re-wakes the
 * device TX queue, since dequeuing makes room for the stack again. */
692 static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
693                             unsigned long count, loff_t pos)
694 {
695         struct file *file = iocb->ki_filp;
696         struct tun_struct *tun = tun_get(file);
697         DECLARE_WAITQUEUE(wait, current);
698         struct sk_buff *skb;
699         ssize_t len, ret = 0;
700
701         if (!tun)
702                 return -EBADFD;
703
704         DBG(KERN_INFO "%s: tun_chr_read\n", tun->dev->name);
705
706         len = iov_length(iv, count);
707         if (len < 0) {
708                 ret = -EINVAL;
709                 goto out;
710         }
711
712         add_wait_queue(&tun->read_wait, &wait);
713         while (len) {
                /* Must set the state before the queue check so a wakeup
                 * between the check and schedule() is not lost. */
714                 current->state = TASK_INTERRUPTIBLE;
715
716                 /* Read frames from the queue */
717                 if (!(skb=skb_dequeue(&tun->readq))) {
718                         if (file->f_flags & O_NONBLOCK) {
719                                 ret = -EAGAIN;
720                                 break;
721                         }
722                         if (signal_pending(current)) {
723                                 ret = -ERESTARTSYS;
724                                 break;
725                         }
726
727                         /* Nothing to read, let's sleep */
728                         schedule();
729                         continue;
730                 }
731                 netif_wake_queue(tun->dev);
732
733                 ret = tun_put_user(tun, skb, (struct iovec *) iv, len);
734                 kfree_skb(skb);
735                 break;
736         }
737
738         current->state = TASK_RUNNING;
739         remove_wait_queue(&tun->read_wait, &wait);
740
741 out:
742         tun_put(tun);
743         return ret;
744 }
745
/* alloc_netdev() setup callback: initialize per-device state that is
 * common to TUN and TAP (type-specific setup happens in tun_net_init).
 * Note dev->destructor = free_netdev, so once the device has been
 * registered it must be released via unregister_netdevice(), never by
 * calling free_netdev() directly. */
746 static void tun_setup(struct net_device *dev)
747 {
748         struct tun_struct *tun = netdev_priv(dev);
749
750         skb_queue_head_init(&tun->readq);
751         init_waitqueue_head(&tun->read_wait);
752
        /* -1 == no owner/group restriction (see tun_attach()). */
753         tun->owner = -1;
754         tun->group = -1;
755
756         dev->ethtool_ops = &tun_ethtool_ops;
757         dev->destructor = free_netdev;
758         dev->features |= NETIF_F_NETNS_LOCAL;
759 }
760
761 static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
762 {
763         struct tun_struct *tun;
764         struct net_device *dev;
765         int err;
766
767         dev = __dev_get_by_name(net, ifr->ifr_name);
768         if (dev) {
769                 if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
770                         tun = netdev_priv(dev);
771                 else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
772                         tun = netdev_priv(dev);
773                 else
774                         return -EINVAL;
775
776                 err = tun_attach(tun, file);
777                 if (err < 0)
778                         return err;
779         }
780         else {
781                 char *name;
782                 unsigned long flags = 0;
783
784                 err = -EINVAL;
785
786                 if (!capable(CAP_NET_ADMIN))
787                         return -EPERM;
788
789                 /* Set dev type */
790                 if (ifr->ifr_flags & IFF_TUN) {
791                         /* TUN device */
792                         flags |= TUN_TUN_DEV;
793                         name = "tun%d";
794                 } else if (ifr->ifr_flags & IFF_TAP) {
795                         /* TAP device */
796                         flags |= TUN_TAP_DEV;
797                         name = "tap%d";
798                 } else
799                         goto failed;
800
801                 if (*ifr->ifr_name)
802                         name = ifr->ifr_name;
803
804                 dev = alloc_netdev(sizeof(struct tun_struct), name,
805                                    tun_setup);
806                 if (!dev)
807                         return -ENOMEM;
808
809                 dev_net_set(dev, net);
810
811                 tun = netdev_priv(dev);
812                 tun->dev = dev;
813                 tun->flags = flags;
814                 tun->txflt.count = 0;
815
816                 tun_net_init(dev);
817
818                 if (strchr(dev->name, '%')) {
819                         err = dev_alloc_name(dev, dev->name);
820                         if (err < 0)
821                                 goto err_free_dev;
822                 }
823
824                 err = register_netdevice(tun->dev);
825                 if (err < 0)
826                         goto err_free_dev;
827
828                 err = tun_attach(tun, file);
829                 if (err < 0)
830                         goto err_free_dev;
831         }
832
833         DBG(KERN_INFO "%s: tun_set_iff\n", tun->dev->name);
834
835         if (ifr->ifr_flags & IFF_NO_PI)
836                 tun->flags |= TUN_NO_PI;
837         else
838                 tun->flags &= ~TUN_NO_PI;
839
840         if (ifr->ifr_flags & IFF_ONE_QUEUE)
841                 tun->flags |= TUN_ONE_QUEUE;
842         else
843                 tun->flags &= ~TUN_ONE_QUEUE;
844
845         if (ifr->ifr_flags & IFF_VNET_HDR)
846                 tun->flags |= TUN_VNET_HDR;
847         else
848                 tun->flags &= ~TUN_VNET_HDR;
849
850         /* Make sure persistent devices do not get stuck in
851          * xoff state.
852          */
853         if (netif_running(tun->dev))
854                 netif_wake_queue(tun->dev);
855
856         strcpy(ifr->ifr_name, tun->dev->name);
857         return 0;
858
859  err_free_dev:
860         free_netdev(dev);
861  failed:
862         return err;
863 }
864
865 static int tun_get_iff(struct net *net, struct file *file, struct ifreq *ifr)
866 {
867         struct tun_struct *tun = tun_get(file);
868
869         if (!tun)
870                 return -EBADFD;
871
872         DBG(KERN_INFO "%s: tun_get_iff\n", tun->dev->name);
873
874         strcpy(ifr->ifr_name, tun->dev->name);
875
876         ifr->ifr_flags = 0;
877
878         if (ifr->ifr_flags & TUN_TUN_DEV)
879                 ifr->ifr_flags |= IFF_TUN;
880         else
881                 ifr->ifr_flags |= IFF_TAP;
882
883         if (tun->flags & TUN_NO_PI)
884                 ifr->ifr_flags |= IFF_NO_PI;
885
886         if (tun->flags & TUN_ONE_QUEUE)
887                 ifr->ifr_flags |= IFF_ONE_QUEUE;
888
889         if (tun->flags & TUN_VNET_HDR)
890                 ifr->ifr_flags |= IFF_VNET_HDR;
891
892         tun_put(tun);
893         return 0;
894 }
895
/* This is like a cut-down ethtool ops, except done via tun fd so no
 * privs required.  Translates TUN_F_* offload requests (TUNSETOFFLOAD)
 * into dev->features bits; TSO bits are only honoured when TUN_F_CSUM
 * is also requested.  Any bit left over in @arg is unrecognized and
 * yields -EINVAL (lets userspace probe for future features). */
898 static int set_offload(struct net_device *dev, unsigned long arg)
899 {
900         unsigned int old_features, features;
901
902         old_features = dev->features;
903         /* Unset features, set them as we chew on the arg. */
904         features = (old_features & ~(NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST
905                                     |NETIF_F_TSO_ECN|NETIF_F_TSO|NETIF_F_TSO6));
906
907         if (arg & TUN_F_CSUM) {
908                 features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
909                 arg &= ~TUN_F_CSUM;
910
911                 if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
912                         if (arg & TUN_F_TSO_ECN) {
913                                 features |= NETIF_F_TSO_ECN;
914                                 arg &= ~TUN_F_TSO_ECN;
915                         }
916                         if (arg & TUN_F_TSO4)
917                                 features |= NETIF_F_TSO;
918                         if (arg & TUN_F_TSO6)
919                                 features |= NETIF_F_TSO6;
920                         arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
921                 }
922         }
923
924         /* This gives the user a way to test for new features in future by
925          * trying to set them. */
926         if (arg)
927                 return -EINVAL;
928
929         dev->features = features;
930         if (old_features != dev->features)
931                 netdev_features_change(dev);
932
933         return 0;
934 }
935
/* Character-device ioctl handler.  Handles device creation/attachment
 * (TUNSETIFF), the tun-specific configuration ioctls, and a small subset
 * of the classic SIOC* interface ioctls done through the tun fd. */
static int tun_chr_ioctl(struct inode *inode, struct file *file,
                         unsigned int cmd, unsigned long arg)
{
        struct tun_struct *tun;
        void __user* argp = (void __user*)arg;
        struct ifreq ifr;
        int ret;

        /* Only TUNSETIFF and the socket-style 0x89xx ioctls pass a
         * struct ifreq; don't touch user memory for the others. */
        if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89)
                if (copy_from_user(&ifr, argp, sizeof ifr))
                        return -EFAULT;

        if (cmd == TUNGETFEATURES) {
                /* Currently this just means: "what IFF flags are valid?".
                 * This is needed because we never checked for invalid flags on
                 * TUNSETIFF. */
                return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE |
                                IFF_VNET_HDR,
                                (unsigned int __user*)argp);
        }

        /* Takes a reference on the attached tun device (if any); every
         * path below that has a non-NULL tun must drop it via tun_put(). */
        tun = tun_get(file);
        if (cmd == TUNSETIFF && !tun) {
                int err;

                /* Guarantee NUL termination of the user-supplied name. */
                ifr.ifr_name[IFNAMSIZ-1] = '\0';

                rtnl_lock();
                err = tun_set_iff(current->nsproxy->net_ns, file, &ifr);
                rtnl_unlock();

                if (err)
                        return err;

                /* Hand the (possibly kernel-chosen) interface name back. */
                if (copy_to_user(argp, &ifr, sizeof(ifr)))
                        return -EFAULT;
                return 0;
        }


        /* Every other ioctl requires an attached device. */
        if (!tun)
                return -EBADFD;

        DBG(KERN_INFO "%s: tun_chr_ioctl cmd %d\n", tun->dev->name, cmd);

        ret = 0;
        switch (cmd) {
        case TUNGETIFF:
                ret = tun_get_iff(current->nsproxy->net_ns, file, &ifr);
                if (ret)
                        break;

                if (copy_to_user(argp, &ifr, sizeof(ifr)))
                        ret = -EFAULT;
                break;

        case TUNSETNOCSUM:
                /* Disable/Enable checksum */
                if (arg)
                        tun->flags |= TUN_NOCHECKSUM;
                else
                        tun->flags &= ~TUN_NOCHECKSUM;

                DBG(KERN_INFO "%s: checksum %s\n",
                    tun->dev->name, arg ? "disabled" : "enabled");
                break;

        case TUNSETPERSIST:
                /* Disable/Enable persist mode */
                if (arg)
                        tun->flags |= TUN_PERSIST;
                else
                        tun->flags &= ~TUN_PERSIST;

                DBG(KERN_INFO "%s: persist %s\n",
                    tun->dev->name, arg ? "enabled" : "disabled");
                break;

        case TUNSETOWNER:
                /* Set owner of the device */
                tun->owner = (uid_t) arg;

                DBG(KERN_INFO "%s: owner set to %d\n", tun->dev->name, tun->owner);
                break;

        case TUNSETGROUP:
                /* Set group of the device */
                tun->group= (gid_t) arg;

                DBG(KERN_INFO "%s: group set to %d\n", tun->dev->name, tun->group);
                break;

        case TUNSETLINK:
                /* Only allow setting the type when the interface is down */
                rtnl_lock();
                if (tun->dev->flags & IFF_UP) {
                        DBG(KERN_INFO "%s: Linktype set failed because interface is up\n",
                                tun->dev->name);
                        ret = -EBUSY;
                } else {
                        tun->dev->type = (int) arg;
                        DBG(KERN_INFO "%s: linktype set to %d\n", tun->dev->name, tun->dev->type);
                        ret = 0;
                }
                rtnl_unlock();
                break;

#ifdef TUN_DEBUG
        case TUNSETDEBUG:
                tun->debug = arg;
                break;
#endif
        case TUNSETOFFLOAD:
                /* set_offload() touches dev->features, hence the RTNL lock. */
                rtnl_lock();
                ret = set_offload(tun->dev, arg);
                rtnl_unlock();
                break;

        case TUNSETTXFILTER:
                /* Can be set only for TAPs */
                ret = -EINVAL;
                if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
                        break;
                rtnl_lock();
                ret = update_filter(&tun->txflt, (void __user *)arg);
                rtnl_unlock();
                break;

        case SIOCGIFHWADDR:
                /* Get hw addres */
                memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
                ifr.ifr_hwaddr.sa_family = tun->dev->type;
                if (copy_to_user(argp, &ifr, sizeof ifr))
                        ret = -EFAULT;
                break;

        case SIOCSIFHWADDR:
                /* Set hw address */
                DBG(KERN_DEBUG "%s: set hw address: %pM\n",
                        tun->dev->name, ifr.ifr_hwaddr.sa_data);

                rtnl_lock();
                ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
                rtnl_unlock();
                break;
        default:
                ret = -EINVAL;
                break;
        };

        tun_put(tun);
        return ret;
}
1089
1090 static int tun_chr_fasync(int fd, struct file *file, int on)
1091 {
1092         struct tun_struct *tun = tun_get(file);
1093         int ret;
1094
1095         if (!tun)
1096                 return -EBADFD;
1097
1098         DBG(KERN_INFO "%s: tun_chr_fasync %d\n", tun->dev->name, on);
1099
1100         lock_kernel();
1101         if ((ret = fasync_helper(fd, file, on, &tun->fasync)) < 0)
1102                 goto out;
1103
1104         if (on) {
1105                 ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0);
1106                 if (ret)
1107                         goto out;
1108                 tun->flags |= TUN_FASYNC;
1109         } else
1110                 tun->flags &= ~TUN_FASYNC;
1111         ret = 0;
1112 out:
1113         unlock_kernel();
1114         tun_put(tun);
1115         return ret;
1116 }
1117
1118 static int tun_chr_open(struct inode *inode, struct file * file)
1119 {
1120         struct tun_file *tfile;
1121         cycle_kernel_lock();
1122         DBG1(KERN_INFO "tunX: tun_chr_open\n");
1123
1124         tfile = kmalloc(sizeof(*tfile), GFP_KERNEL);
1125         if (!tfile)
1126                 return -ENOMEM;
1127         tfile->tun = NULL;
1128         file->private_data = tfile;
1129         return 0;
1130 }
1131
1132 static int tun_chr_close(struct inode *inode, struct file *file)
1133 {
1134         struct tun_file *tfile = file->private_data;
1135         struct tun_struct *tun = __tun_get(tfile);
1136
1137
1138         if (tun) {
1139                 DBG(KERN_INFO "%s: tun_chr_close\n", tun->dev->name);
1140
1141                 rtnl_lock();
1142                 __tun_detach(tun);
1143
1144                 /* If desireable, unregister the netdevice. */
1145                 if (!(tun->flags & TUN_PERSIST))
1146                         unregister_netdevice(tun->dev);
1147
1148                 rtnl_unlock();
1149         }
1150
1151         kfree(tfile);
1152
1153         return 0;
1154 }
1155
/* File operations for /dev/net/tun.  read/write are routed through the
 * generic do_sync_* wrappers onto the aio handlers. */
static const struct file_operations tun_fops = {
        .owner  = THIS_MODULE,
        .llseek = no_llseek,
        .read  = do_sync_read,
        .aio_read  = tun_chr_aio_read,
        .write = do_sync_write,
        .aio_write = tun_chr_aio_write,
        .poll   = tun_chr_poll,
        .ioctl  = tun_chr_ioctl,
        .open   = tun_chr_open,
        .release = tun_chr_close,
        .fasync = tun_chr_fasync
};
1169
/* Misc character device registration: /dev/net/tun with the fixed
 * TUN_MINOR minor number. */
static struct miscdevice tun_miscdev = {
        .minor = TUN_MINOR,
        .name = "tun",
        .fops = &tun_fops,
};
1175
1176 /* ethtool interface */
1177
1178 static int tun_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
1179 {
1180         cmd->supported          = 0;
1181         cmd->advertising        = 0;
1182         cmd->speed              = SPEED_10;
1183         cmd->duplex             = DUPLEX_FULL;
1184         cmd->port               = PORT_TP;
1185         cmd->phy_address        = 0;
1186         cmd->transceiver        = XCVR_INTERNAL;
1187         cmd->autoneg            = AUTONEG_DISABLE;
1188         cmd->maxtxpkt           = 0;
1189         cmd->maxrxpkt           = 0;
1190         return 0;
1191 }
1192
1193 static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
1194 {
1195         struct tun_struct *tun = netdev_priv(dev);
1196
1197         strcpy(info->driver, DRV_NAME);
1198         strcpy(info->version, DRV_VERSION);
1199         strcpy(info->fw_version, "N/A");
1200
1201         switch (tun->flags & TUN_TYPE_MASK) {
1202         case TUN_TUN_DEV:
1203                 strcpy(info->bus_info, "tun");
1204                 break;
1205         case TUN_TAP_DEV:
1206                 strcpy(info->bus_info, "tap");
1207                 break;
1208         }
1209 }
1210
/* ethtool get_msglevel: returns the per-device debug mask when built
 * with TUN_DEBUG.  NOTE(review): without TUN_DEBUG this returns
 * -EOPNOTSUPP through an unsigned u32, so callers see a large positive
 * value rather than an error — pre-existing wart, kept as-is. */
static u32 tun_get_msglevel(struct net_device *dev)
{
#ifdef TUN_DEBUG
        struct tun_struct *tun = netdev_priv(dev);
        return tun->debug;
#else
        return -EOPNOTSUPP;
#endif
}
1220
/* ethtool set_msglevel: store the debug mask; silently a no-op when the
 * driver is built without TUN_DEBUG. */
static void tun_set_msglevel(struct net_device *dev, u32 value)
{
#ifdef TUN_DEBUG
        struct tun_struct *tun = netdev_priv(dev);
        tun->debug = value;
#endif
}
1228
1229 static u32 tun_get_link(struct net_device *dev)
1230 {
1231         struct tun_struct *tun = netdev_priv(dev);
1232         return !!tun->tfile;
1233 }
1234
1235 static u32 tun_get_rx_csum(struct net_device *dev)
1236 {
1237         struct tun_struct *tun = netdev_priv(dev);
1238         return (tun->flags & TUN_NOCHECKSUM) == 0;
1239 }
1240
1241 static int tun_set_rx_csum(struct net_device *dev, u32 data)
1242 {
1243         struct tun_struct *tun = netdev_priv(dev);
1244         if (data)
1245                 tun->flags &= ~TUN_NOCHECKSUM;
1246         else
1247                 tun->flags |= TUN_NOCHECKSUM;
1248         return 0;
1249 }
1250
/* Minimal ethtool support; also serves as the marker tun_exit_net()
 * uses to recognize tun devices in a dying namespace. */
static const struct ethtool_ops tun_ethtool_ops = {
        .get_settings   = tun_get_settings,
        .get_drvinfo    = tun_get_drvinfo,
        .get_msglevel   = tun_get_msglevel,
        .set_msglevel   = tun_set_msglevel,
        .get_link       = tun_get_link,
        .get_rx_csum    = tun_get_rx_csum,
        .set_rx_csum    = tun_set_rx_csum
};
1260
/* Per-network-namespace init: nothing to set up. */
static int tun_init_net(struct net *net)
{
        return 0;
}
1265
1266 static void tun_exit_net(struct net *net)
1267 {
1268         struct net_device *dev, *next;
1269
1270         rtnl_lock();
1271         for_each_netdev_safe(net, dev, next) {
1272                 if (dev->ethtool_ops != &tun_ethtool_ops)
1273                         continue;
1274                 DBG(KERN_INFO "%s cleaned up\n", dev->name);
1275                 unregister_netdevice(dev);
1276         }
1277         rtnl_unlock();
1278 }
1279
/* Hooks for network-namespace creation/destruction. */
static struct pernet_operations tun_net_ops = {
        .init = tun_init_net,
        .exit = tun_exit_net,
};
1284
1285 static int __init tun_init(void)
1286 {
1287         int ret = 0;
1288
1289         printk(KERN_INFO "tun: %s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
1290         printk(KERN_INFO "tun: %s\n", DRV_COPYRIGHT);
1291
1292         ret = register_pernet_device(&tun_net_ops);
1293         if (ret) {
1294                 printk(KERN_ERR "tun: Can't register pernet ops\n");
1295                 goto err_pernet;
1296         }
1297
1298         ret = misc_register(&tun_miscdev);
1299         if (ret) {
1300                 printk(KERN_ERR "tun: Can't register misc device %d\n", TUN_MINOR);
1301                 goto err_misc;
1302         }
1303         return 0;
1304
1305 err_misc:
1306         unregister_pernet_device(&tun_net_ops);
1307 err_pernet:
1308         return ret;
1309 }
1310
/* Module exit: undo tun_init() registrations in reverse order. */
static void tun_cleanup(void)
{
        misc_deregister(&tun_miscdev);
        unregister_pernet_device(&tun_net_ops);
}
1316
/* Module entry/exit points and metadata. */
module_init(tun_init);
module_exit(tun_cleanup);
MODULE_DESCRIPTION(DRV_DESCRIPTION);
MODULE_AUTHOR(DRV_COPYRIGHT);
MODULE_LICENSE("GPL");
/* Auto-load this module when /dev/net/tun (misc minor TUN_MINOR) is opened. */
MODULE_ALIAS_MISCDEV(TUN_MINOR);