git.oblomov.eu Git - linux-2.6/blob - net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/mutex.h>
  84 #include <linux/string.h>
  85 #include <linux/mm.h>
  86 #include <linux/socket.h>
  87 #include <linux/sockios.h>
  88 #include <linux/errno.h>
  89 #include <linux/interrupt.h>
  90 #include <linux/if_ether.h>
  91 #include <linux/netdevice.h>
  92 #include <linux/etherdevice.h>
  93 #include <linux/ethtool.h>
  94 #include <linux/notifier.h>
  95 #include <linux/skbuff.h>
  96 #include <net/net_namespace.h>
  97 #include <net/sock.h>
  98 #include <linux/rtnetlink.h>
  99 #include <linux/proc_fs.h>
 100 #include <linux/seq_file.h>
 101 #include <linux/stat.h>
 102 #include <linux/if_bridge.h>
 103 #include <linux/if_macvlan.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129
 130 #include "net-sysfs.h"
 131
 132 /* Instead of increasing this, you should create a hash table. */
 133 #define MAX_GRO_SKBS 8
 134
 135 /* This should be increased if a protocol with a bigger head is added. */
 136 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 137
 138 /*
 139  *      The list of packet types we will receive (as opposed to discard)
 140  *      and the routines to invoke.
 141  *
 142  *      Why 16. Because with 16 the only overlap we get on a hash of the
 143  *      low nibble of the protocol value is RARP/SNAP/X.25.
 144  *
 145  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 146  *             sure which should go first, but I bet it won't make much
 147  *             difference if we are running VLANs.  The good news is that
 148  *             this protocol won't be in the list unless compiled in, so
 149  *             the average user (w/out VLANs) will not be adversely affected.
 150  *             --BLG
 151  *
 152  *              0800    IP
 153  *              8100    802.1Q VLAN
 154  *              0001    802.3
 155  *              0002    AX.25
 156  *              0004    802.2
 157  *              8035    RARP
 158  *              0005    SNAP
 159  *              0805    X.25
 160  *              0806    ARP
 161  *              8137    IPX
 162  *              0009    Localtalk
 163  *              86DD    IPv6
 164  */
 165
 166 #define PTYPE_HASH_SIZE (16)
 167 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 168
 169 static DEFINE_SPINLOCK(ptype_lock);
 170 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 171 static struct list_head ptype_all __read_mostly;        /* Taps */
 172
 173 /*
 174  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 175  * semaphore.
 176  *
 177  * Pure readers hold dev_base_lock for reading.
 178  *
 179  * Writers must hold the rtnl semaphore while they loop through the
 180  * dev_base_head list, and hold dev_base_lock for writing when they do the
 181  * actual updates.  This allows pure readers to access the list even
 182  * while a writer is preparing to update it.
 183  *
 184  * To put it another way, dev_base_lock is held for writing only to
 185  * protect against pure readers; the rtnl semaphore provides the
 186  * protection against other writers.
 187  *
 188  * See, for example usages, register_netdevice() and
 189  * unregister_netdevice(), which must be called with the rtnl
 190  * semaphore held.
 191  */
 192 DEFINE_RWLOCK(dev_base_lock);
 193
 194 EXPORT_SYMBOL(dev_base_lock);
 195
 196 #define NETDEV_HASHBITS 8
 197 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 198
 199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 200 {
 201         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 202         return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 203 }
 204
 205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206 {
 207         return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 208 }
 209
 210 /* Device list insertion */
 211 static int list_netdevice(struct net_device *dev)
 212 {
 213         struct net *net = dev_net(dev);
 214
 215         ASSERT_RTNL();
 216
 217         write_lock_bh(&dev_base_lock);
 218         list_add_tail(&dev->dev_list, &net->dev_base_head);
 219         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 220         hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 221         write_unlock_bh(&dev_base_lock);
 222         return 0;
 223 }
 224
 225 /* Device list removal */
 226 static void unlist_netdevice(struct net_device *dev)
 227 {
 228         ASSERT_RTNL();
 229
 230         /* Unlink dev from the device chain */
 231         write_lock_bh(&dev_base_lock);
 232         list_del(&dev->dev_list);
 233         hlist_del(&dev->name_hlist);
 234         hlist_del(&dev->index_hlist);
 235         write_unlock_bh(&dev_base_lock);
 236 }
 237
 238 /*
 239  *      Our notifier list
 240  */
 241
 242 static RAW_NOTIFIER_HEAD(netdev_chain);
 243
 244 /*
 245  *      Device drivers call our routines to queue packets here. We empty the
 246  *      queue in the local softnet handler.
 247  */
 248
 249 DEFINE_PER_CPU(struct softnet_data, softnet_data);
 250
 251 #ifdef CONFIG_LOCKDEP
 252 /*
 253  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 254  * according to dev->type
 255  */
 256 static const unsigned short netdev_lock_type[] =
 257         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 258          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 259          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 260          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 261          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 262          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 263          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 264          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 265          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 266          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 267          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 268          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 269          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 270          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 271          ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
 272
 273 static const char *netdev_lock_name[] =
 274         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 275          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 276          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 277          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 278          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 279          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 280          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 281          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 282          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 283          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 284          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 285          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 286          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 287          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 288          "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
 289
 290 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 291 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 292
 293 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 294 {
 295         int i;
 296
 297         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 298                 if (netdev_lock_type[i] == dev_type)
 299                         return i;
 300         /* the last key is used by default */
 301         return ARRAY_SIZE(netdev_lock_type) - 1;
 302 }
 303
 304 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 305                                                  unsigned short dev_type)
 306 {
 307         int i;
 308
 309         i = netdev_lock_pos(dev_type);
 310         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 311                                    netdev_lock_name[i]);
 312 }
 313
 314 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 315 {
 316         int i;
 317
 318         i = netdev_lock_pos(dev->type);
 319         lockdep_set_class_and_name(&dev->addr_list_lock,
 320                                    &netdev_addr_lock_key[i],
 321                                    netdev_lock_name[i]);
 322 }
 323 #else
 324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 325                                                  unsigned short dev_type)
 326 {
 327 }
 328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 329 {
 330 }
 331 #endif
 332
 333 /*******************************************************************************
 334
 335                 Protocol management and registration routines
 336
 337 *******************************************************************************/
 338
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the check of protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles packets is
 *	first on the list, it cannot tell that the packet is cloned
 *	and should be copied-on-write, so it will change the packet
 *	and subsequent readers will get a broken packet.
 *							--ANK (980803)
 */
 354
 355 /**
 356  *      dev_add_pack - add packet handler
 357  *      @pt: packet type declaration
 358  *
 359  *      Add a protocol handler to the networking stack. The passed &packet_type
 360  *      is linked into kernel lists and may not be freed until it has been
 361  *      removed from the kernel lists.
 362  *
 363  *      This call does not sleep therefore it can not
 364  *      guarantee all CPU's that are in middle of receiving packets
 365  *      will see the new packet type (until the next received packet).
 366  */
 367
 368 void dev_add_pack(struct packet_type *pt)
 369 {
 370         int hash;
 371
 372         spin_lock_bh(&ptype_lock);
 373         if (pt->type == htons(ETH_P_ALL))
 374                 list_add_rcu(&pt->list, &ptype_all);
 375         else {
 376                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 377                 list_add_rcu(&pt->list, &ptype_base[hash]);
 378         }
 379         spin_unlock_bh(&ptype_lock);
 380 }
 381
 382 /**
 383  *      __dev_remove_pack        - remove packet handler
 384  *      @pt: packet type declaration
 385  *
 386  *      Remove a protocol handler that was previously added to the kernel
 387  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 388  *      from the kernel lists and can be freed or reused once this function
 389  *      returns.
 390  *
 391  *      The packet type might still be in use by receivers
 392  *      and must not be freed until after all the CPU's have gone
 393  *      through a quiescent state.
 394  */
 395 void __dev_remove_pack(struct packet_type *pt)
 396 {
 397         struct list_head *head;
 398         struct packet_type *pt1;
 399
 400         spin_lock_bh(&ptype_lock);
 401
 402         if (pt->type == htons(ETH_P_ALL))
 403                 head = &ptype_all;
 404         else
 405                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 406
 407         list_for_each_entry(pt1, head, list) {
 408                 if (pt == pt1) {
 409                         list_del_rcu(&pt->list);
 410                         goto out;
 411                 }
 412         }
 413
 414         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 415 out:
 416         spin_unlock_bh(&ptype_lock);
 417 }
 418 /**
 419  *      dev_remove_pack  - remove packet handler
 420  *      @pt: packet type declaration
 421  *
 422  *      Remove a protocol handler that was previously added to the kernel
 423  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 424  *      from the kernel lists and can be freed or reused once this function
 425  *      returns.
 426  *
 427  *      This call sleeps to guarantee that no CPU is looking at the packet
 428  *      type after return.
 429  */
 430 void dev_remove_pack(struct packet_type *pt)
 431 {
 432         __dev_remove_pack(pt);
 433
 434         synchronize_net();
 435 }
 436
 437 /******************************************************************************
 438
 439                       Device Boot-time Settings Routines
 440
 441 *******************************************************************************/
 442
 443 /* Boot time configuration table */
 444 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 445
 446 /**
 447  *      netdev_boot_setup_add   - add new setup entry
 448  *      @name: name of the device
 449  *      @map: configured settings for the device
 450  *
 451  *      Adds new setup entry to the dev_boot_setup list.  The function
 452  *      returns 0 on error and 1 on success.  This is a generic routine to
 453  *      all netdevices.
 454  */
 455 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 456 {
 457         struct netdev_boot_setup *s;
 458         int i;
 459
 460         s = dev_boot_setup;
 461         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 462                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 463                         memset(s[i].name, 0, sizeof(s[i].name));
 464                         strlcpy(s[i].name, name, IFNAMSIZ);
 465                         memcpy(&s[i].map, map, sizeof(s[i].map));
 466                         break;
 467                 }
 468         }
 469
 470         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 471 }
 472
 473 /**
 474  *      netdev_boot_setup_check - check boot time settings
 475  *      @dev: the netdevice
 476  *
 477  *      Check boot time settings for the device.
 478  *      The found settings are set for the device to be used
 479  *      later in the device probing.
 480  *      Returns 0 if no settings found, 1 if they are.
 481  */
 482 int netdev_boot_setup_check(struct net_device *dev)
 483 {
 484         struct netdev_boot_setup *s = dev_boot_setup;
 485         int i;
 486
 487         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 488                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 489                     !strcmp(dev->name, s[i].name)) {
 490                         dev->irq        = s[i].map.irq;
 491                         dev->base_addr  = s[i].map.base_addr;
 492                         dev->mem_start  = s[i].map.mem_start;
 493                         dev->mem_end    = s[i].map.mem_end;
 494                         return 1;
 495                 }
 496         }
 497         return 0;
 498 }
 499
 500
 501 /**
 502  *      netdev_boot_base        - get address from boot time settings
 503  *      @prefix: prefix for network device
 504  *      @unit: id for network device
 505  *
 506  *      Check boot time settings for the base address of device.
 507  *      The found settings are set for the device to be used
 508  *      later in the device probing.
 509  *      Returns 0 if no settings found.
 510  */
 511 unsigned long netdev_boot_base(const char *prefix, int unit)
 512 {
 513         const struct netdev_boot_setup *s = dev_boot_setup;
 514         char name[IFNAMSIZ];
 515         int i;
 516
 517         sprintf(name, "%s%d", prefix, unit);
 518
 519         /*
 520          * If device already registered then return base of 1
 521          * to indicate not to probe for this interface
 522          */
 523         if (__dev_get_by_name(&init_net, name))
 524                 return 1;
 525
 526         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 527                 if (!strcmp(name, s[i].name))
 528                         return s[i].map.base_addr;
 529         return 0;
 530 }
 531
 532 /*
 533  * Saves at boot time configured settings for any netdevice.
 534  */
 535 int __init netdev_boot_setup(char *str)
 536 {
 537         int ints[5];
 538         struct ifmap map;
 539
 540         str = get_options(str, ARRAY_SIZE(ints), ints);
 541         if (!str || !*str)
 542                 return 0;
 543
 544         /* Save settings */
 545         memset(&map, 0, sizeof(map));
 546         if (ints[0] > 0)
 547                 map.irq = ints[1];
 548         if (ints[0] > 1)
 549                 map.base_addr = ints[2];
 550         if (ints[0] > 2)
 551                 map.mem_start = ints[3];
 552         if (ints[0] > 3)
 553                 map.mem_end = ints[4];
 554
 555         /* Add new entry to the list */
 556         return netdev_boot_setup_add(str, &map);
 557 }
 558
 559 __setup("netdev=", netdev_boot_setup);
 560
 561 /*******************************************************************************
 562
 563                             Device Interface Subroutines
 564
 565 *******************************************************************************/
 566
 567 /**
 568  *      __dev_get_by_name       - find a device by its name
 569  *      @net: the applicable net namespace
 570  *      @name: name to find
 571  *
 572  *      Find an interface by name. Must be called under RTNL semaphore
 573  *      or @dev_base_lock. If the name is found a pointer to the device
 574  *      is returned. If the name is not found then %NULL is returned. The
 575  *      reference counters are not incremented so the caller must be
 576  *      careful with locks.
 577  */
 578
 579 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 580 {
 581         struct hlist_node *p;
 582
 583         hlist_for_each(p, dev_name_hash(net, name)) {
 584                 struct net_device *dev
 585                         = hlist_entry(p, struct net_device, name_hlist);
 586                 if (!strncmp(dev->name, name, IFNAMSIZ))
 587                         return dev;
 588         }
 589         return NULL;
 590 }
 591
 592 /**
 593  *      dev_get_by_name         - find a device by its name
 594  *      @net: the applicable net namespace
 595  *      @name: name to find
 596  *
 597  *      Find an interface by name. This can be called from any
 598  *      context and does its own locking. The returned handle has
 599  *      the usage count incremented and the caller must use dev_put() to
 600  *      release it when it is no longer needed. %NULL is returned if no
 601  *      matching device is found.
 602  */
 603
 604 struct net_device *dev_get_by_name(struct net *net, const char *name)
 605 {
 606         struct net_device *dev;
 607
 608         read_lock(&dev_base_lock);
 609         dev = __dev_get_by_name(net, name);
 610         if (dev)
 611                 dev_hold(dev);
 612         read_unlock(&dev_base_lock);
 613         return dev;
 614 }
 615
 616 /**
 617  *      __dev_get_by_index - find a device by its ifindex
 618  *      @net: the applicable net namespace
 619  *      @ifindex: index of device
 620  *
 621  *      Search for an interface by index. Returns %NULL if the device
 622  *      is not found or a pointer to the device. The device has not
 623  *      had its reference counter increased so the caller must be careful
 624  *      about locking. The caller must hold either the RTNL semaphore
 625  *      or @dev_base_lock.
 626  */
 627
 628 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 629 {
 630         struct hlist_node *p;
 631
 632         hlist_for_each(p, dev_index_hash(net, ifindex)) {
 633                 struct net_device *dev
 634                         = hlist_entry(p, struct net_device, index_hlist);
 635                 if (dev->ifindex == ifindex)
 636                         return dev;
 637         }
 638         return NULL;
 639 }
 640
 641
 642 /**
 643  *      dev_get_by_index - find a device by its ifindex
 644  *      @net: the applicable net namespace
 645  *      @ifindex: index of device
 646  *
 647  *      Search for an interface by index. Returns NULL if the device
 648  *      is not found or a pointer to the device. The device returned has
 649  *      had a reference added and the pointer is safe until the user calls
 650  *      dev_put to indicate they have finished with it.
 651  */
 652
 653 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 654 {
 655         struct net_device *dev;
 656
 657         read_lock(&dev_base_lock);
 658         dev = __dev_get_by_index(net, ifindex);
 659         if (dev)
 660                 dev_hold(dev);
 661         read_unlock(&dev_base_lock);
 662         return dev;
 663 }
 664
 665 /**
 666  *      dev_getbyhwaddr - find a device by its hardware address
 667  *      @net: the applicable net namespace
 668  *      @type: media type of device
 669  *      @ha: hardware address
 670  *
 671  *      Search for an interface by MAC address. Returns NULL if the device
 672  *      is not found or a pointer to the device. The caller must hold the
 673  *      rtnl semaphore. The returned device has not had its ref count increased
 674  *      and the caller must therefore be careful about locking
 675  *
 676  *      BUGS:
 677  *      If the API was consistent this would be __dev_get_by_hwaddr
 678  */
 679
 680 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 681 {
 682         struct net_device *dev;
 683
 684         ASSERT_RTNL();
 685
 686         for_each_netdev(net, dev)
 687                 if (dev->type == type &&
 688                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 689                         return dev;
 690
 691         return NULL;
 692 }
 693
 694 EXPORT_SYMBOL(dev_getbyhwaddr);
 695
 696 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 697 {
 698         struct net_device *dev;
 699
 700         ASSERT_RTNL();
 701         for_each_netdev(net, dev)
 702                 if (dev->type == type)
 703                         return dev;
 704
 705         return NULL;
 706 }
 707
 708 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 709
 710 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 711 {
 712         struct net_device *dev;
 713
 714         rtnl_lock();
 715         dev = __dev_getfirstbyhwtype(net, type);
 716         if (dev)
 717                 dev_hold(dev);
 718         rtnl_unlock();
 719         return dev;
 720 }
 721
 722 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 723
 724 /**
 725  *      dev_get_by_flags - find any device with given flags
 726  *      @net: the applicable net namespace
 727  *      @if_flags: IFF_* values
 728  *      @mask: bitmask of bits in if_flags to check
 729  *
 730  *      Search for any interface with the given flags. Returns NULL if a device
 731  *      is not found or a pointer to the device. The device returned has
 732  *      had a reference added and the pointer is safe until the user calls
 733  *      dev_put to indicate they have finished with it.
 734  */
 735
 736 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
 737 {
 738         struct net_device *dev, *ret;
 739
 740         ret = NULL;
 741         read_lock(&dev_base_lock);
 742         for_each_netdev(net, dev) {
 743                 if (((dev->flags ^ if_flags) & mask) == 0) {
 744                         dev_hold(dev);
 745                         ret = dev;
 746                         break;
 747                 }
 748         }
 749         read_unlock(&dev_base_lock);
 750         return ret;
 751 }
 752
 753 /**
 754  *      dev_valid_name - check if name is okay for network device
 755  *      @name: name string
 756  *
 757  *      Network device names need to be valid file names to
 758  *      to allow sysfs to work.  We also disallow any kind of
 759  *      whitespace.
 760  */
 761 int dev_valid_name(const char *name)
 762 {
 763         if (*name == '\0')
 764                 return 0;
 765         if (strlen(name) >= IFNAMSIZ)
 766                 return 0;
 767         if (!strcmp(name, ".") || !strcmp(name, ".."))
 768                 return 0;
 769
 770         while (*name) {
 771                 if (*name == '/' || isspace(*name))
 772                         return 0;
 773                 name++;
 774         }
 775         return 1;
 776 }
 777
 778 /**
 779  *      __dev_alloc_name - allocate a name for a device
 780  *      @net: network namespace to allocate the device name in
 781  *      @name: name format string
 782  *      @buf:  scratch buffer and result name string
 783  *
 784  *      Passed a format string - eg "lt%d" it will try and find a suitable
 785  *      id. It scans list of devices to build up a free map, then chooses
 786  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 787  *      while allocating the name and adding the device in order to avoid
 788  *      duplicates.
 789  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 790  *      Returns the number of the unit assigned or a negative errno code.
 791  */
 792
 793 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 794 {
 795         int i = 0;
 796         const char *p;
 797         const int max_netdevices = 8*PAGE_SIZE;
 798         unsigned long *inuse;
 799         struct net_device *d;
 800
 801         p = strnchr(name, IFNAMSIZ-1, '%');
 802         if (p) {
 803                 /*
 804                  * Verify the string as this thing may have come from
 805                  * the user.  There must be either one "%d" and no other "%"
 806                  * characters.
 807                  */
 808                 if (p[1] != 'd' || strchr(p + 2, '%'))
 809                         return -EINVAL;
 810
 811                 /* Use one page as a bit array of possible slots */
 812                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 813                 if (!inuse)
 814                         return -ENOMEM;
 815
 816                 for_each_netdev(net, d) {
 817                         if (!sscanf(d->name, name, &i))
 818                                 continue;
 819                         if (i < 0 || i >= max_netdevices)
 820                                 continue;
 821
 822                         /*  avoid cases where sscanf is not exact inverse of printf */
 823                         snprintf(buf, IFNAMSIZ, name, i);
 824                         if (!strncmp(buf, d->name, IFNAMSIZ))
 825                                 set_bit(i, inuse);
 826                 }
 827
 828                 i = find_first_zero_bit(inuse, max_netdevices);
 829                 free_page((unsigned long) inuse);
 830         }
 831
 832         snprintf(buf, IFNAMSIZ, name, i);
 833         if (!__dev_get_by_name(net, buf))
 834                 return i;
 835
 836         /* It is possible to run out of possible slots
 837          * when the name is long and there isn't enough space left
 838          * for the digits, or if all bits are used.
 839          */
 840         return -ENFILE;
 841 }
 842
 843 /**
 844  *      dev_alloc_name - allocate a name for a device
 845  *      @dev: device
 846  *      @name: name format string
 847  *
 848  *      Passed a format string - eg "lt%d" it will try and find a suitable
 849  *      id. It scans list of devices to build up a free map, then chooses
 850  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 851  *      while allocating the name and adding the device in order to avoid
 852  *      duplicates.
 853  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 854  *      Returns the number of the unit assigned or a negative errno code.
 855  */
 856
 857 int dev_alloc_name(struct net_device *dev, const char *name)
 858 {
 859         char buf[IFNAMSIZ];
 860         struct net *net;
 861         int ret;
 862
 863         BUG_ON(!dev_net(dev));
 864         net = dev_net(dev);
 865         ret = __dev_alloc_name(net, name, buf);
 866         if (ret >= 0)
 867                 strlcpy(dev->name, buf, IFNAMSIZ);
 868         return ret;
 869 }
 870
 871
/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string) must be at least IFNAMSIZ
 *
 *      Change name of a device; a format string such as "eth%d" may be
 *      passed for wildcarding.  Must be called with the RTNL held.
 *
 *      Returns 0 on success or when the name is unchanged, -EBUSY if the
 *      device is up, -EINVAL if @newname is not a valid device name,
 *      -EEXIST if the name is already taken, or the error reported by
 *      dev_alloc_name(), device_rename() or the NETDEV_CHANGENAME
 *      notifiers.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        /* A device that is up is not renamed. */
        if (dev->flags & IFF_UP)
                return -EBUSY;

        if (!dev_valid_name(newname))
                return -EINVAL;

        /* Nothing to do when the name does not change. */
        if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
                return 0;

        /* Keep the old name so that a failed rename can be undone. */
        memcpy(oldname, dev->name, IFNAMSIZ);

        if (strchr(newname, '%')) {
                /* Format string: dev_alloc_name() writes dev->name itself. */
                err = dev_alloc_name(dev, newname);
                if (err < 0)
                        return err;
        }
        else if (__dev_get_by_name(net, newname))
                return -EEXIST;
        else
                strlcpy(dev->name, newname, IFNAMSIZ);

        /*
         * This label is entered a second time, with dev->name restored to
         * oldname and err set, when the notifier chain rejects the change.
         */
rollback:
        /* For now only devices in the initial network namespace
         * are in sysfs.
         */
        if (net == &init_net) {
                ret = device_rename(&dev->dev, dev->name);
                if (ret) {
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        return ret;
                }
        }

        /* Re-hash the device under its current name. */
        write_lock_bh(&dev_base_lock);
        hlist_del(&dev->name_hlist);
        hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                if (err) {
                        /* err is already set, so only log the new failure. */
                        printk(KERN_ERR
                               "%s: name change rollback failed: %d.\n",
                               dev->name, ret);
                } else {
                        /* Record the error, restore the old name, redo. */
                        err = ret;
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        goto rollback;
                }
        }

        return err;
}
 946
 947 /**
 948  *      dev_set_alias - change ifalias of a device
 949  *      @dev: device
 950  *      @alias: name up to IFALIASZ
 951  *      @len: limit of bytes to copy from info
 952  *
 953  *      Set ifalias for a device,
 954  */
 955 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 956 {
 957         ASSERT_RTNL();
 958
 959         if (len >= IFALIASZ)
 960                 return -EINVAL;
 961
 962         if (!len) {
 963                 if (dev->ifalias) {
 964                         kfree(dev->ifalias);
 965                         dev->ifalias = NULL;
 966                 }
 967                 return 0;
 968         }
 969
 970         dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
 971         if (!dev->ifalias)
 972                 return -ENOMEM;
 973
 974         strlcpy(dev->ifalias, alias, len+1);
 975         return len;
 976 }
 977
 978
 979 /**
 980  *      netdev_features_change - device changes features
 981  *      @dev: device to cause notification
 982  *
 983  *      Called to indicate a device has changed features.
 984  */
 985 void netdev_features_change(struct net_device *dev)
 986 {
 987         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 988 }
 989 EXPORT_SYMBOL(netdev_features_change);
 990
 991 /**
 992  *      netdev_state_change - device changes state
 993  *      @dev: device to cause notification
 994  *
 995  *      Called to indicate a device has changed state. This function calls
 996  *      the notifier chains for netdev_chain and sends a NEWLINK message
 997  *      to the routing socket.
 998  */
 999 void netdev_state_change(struct net_device *dev)
1000 {
1001         if (dev->flags & IFF_UP) {
1002                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1003                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1004         }
1005 }
1006
1007 void netdev_bonding_change(struct net_device *dev)
1008 {
1009         call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
1010 }
1011 EXPORT_SYMBOL(netdev_bonding_change);
1012
1013 /**
1014  *      dev_load        - load a network module
1015  *      @net: the applicable net namespace
1016  *      @name: name of interface
1017  *
1018  *      If a network interface is not present and the process has suitable
1019  *      privileges this function loads the module. If module loading is not
1020  *      available in this kernel then it becomes a nop.
1021  */
1022
1023 void dev_load(struct net *net, const char *name)
1024 {
1025         struct net_device *dev;
1026
1027         read_lock(&dev_base_lock);
1028         dev = __dev_get_by_name(net, name);
1029         read_unlock(&dev_base_lock);
1030
1031         if (!dev && capable(CAP_SYS_MODULE))
1032                 request_module("%s", name);
1033 }
1034
1035 /**
1036  *      dev_open        - prepare an interface for use.
1037  *      @dev:   device to open
1038  *
1039  *      Takes a device from down to up state. The device's private open
1040  *      function is invoked and then the multicast lists are loaded. Finally
1041  *      the device is moved into the up state and a %NETDEV_UP message is
1042  *      sent to the netdev notifier chain.
1043  *
1044  *      Calling this function on an active interface is a nop. On a failure
1045  *      a negative errno code is returned.
1046  */
1047 int dev_open(struct net_device *dev)
1048 {
1049         const struct net_device_ops *ops = dev->netdev_ops;
1050         int ret = 0;
1051
1052         ASSERT_RTNL();
1053
1054         /*
1055          *      Is it already up?
1056          */
1057
1058         if (dev->flags & IFF_UP)
1059                 return 0;
1060
1061         /*
1062          *      Is it even present?
1063          */
1064         if (!netif_device_present(dev))
1065                 return -ENODEV;
1066
1067         /*
1068          *      Call device private open method
1069          */
1070         set_bit(__LINK_STATE_START, &dev->state);
1071
1072         if (ops->ndo_validate_addr)
1073                 ret = ops->ndo_validate_addr(dev);
1074
1075         if (!ret && ops->ndo_open)
1076                 ret = ops->ndo_open(dev);
1077
1078         /*
1079          *      If it went open OK then:
1080          */
1081
1082         if (ret)
1083                 clear_bit(__LINK_STATE_START, &dev->state);
1084         else {
1085                 /*
1086                  *      Set the flags.
1087                  */
1088                 dev->flags |= IFF_UP;
1089
1090                 /*
1091                  *      Initialize multicasting status
1092                  */
1093                 dev_set_rx_mode(dev);
1094
1095                 /*
1096                  *      Wakeup transmit queue engine
1097                  */
1098                 dev_activate(dev);
1099
1100                 /*
1101                  *      ... and announce new interface.
1102                  */
1103                 call_netdevice_notifiers(NETDEV_UP, dev);
1104         }
1105
1106         return ret;
1107 }
1108
1109 /**
1110  *      dev_close - shutdown an interface.
1111  *      @dev: device to shutdown
1112  *
1113  *      This function moves an active device into down state. A
1114  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1115  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1116  *      chain.
1117  */
1118 int dev_close(struct net_device *dev)
1119 {
1120         const struct net_device_ops *ops = dev->netdev_ops;
1121         ASSERT_RTNL();
1122
1123         might_sleep();
1124
1125         if (!(dev->flags & IFF_UP))
1126                 return 0;
1127
1128         /*
1129          *      Tell people we are going down, so that they can
1130          *      prepare to death, when device is still operating.
1131          */
1132         call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1133
1134         clear_bit(__LINK_STATE_START, &dev->state);
1135
1136         /* Synchronize to scheduled poll. We cannot touch poll list,
1137          * it can be even on different cpu. So just clear netif_running().
1138          *
1139          * dev->stop() will invoke napi_disable() on all of it's
1140          * napi_struct instances on this device.
1141          */
1142         smp_mb__after_clear_bit(); /* Commit netif_running(). */
1143
1144         dev_deactivate(dev);
1145
1146         /*
1147          *      Call the device specific close. This cannot fail.
1148          *      Only if device is UP
1149          *
1150          *      We allow it to be called even after a DETACH hot-plug
1151          *      event.
1152          */
1153         if (ops->ndo_stop)
1154                 ops->ndo_stop(dev);
1155
1156         /*
1157          *      Device is now down.
1158          */
1159
1160         dev->flags &= ~IFF_UP;
1161
1162         /*
1163          * Tell people we are down
1164          */
1165         call_netdevice_notifiers(NETDEV_DOWN, dev);
1166
1167         return 0;
1168 }
1169
1170
1171 /**
1172  *      dev_disable_lro - disable Large Receive Offload on a device
1173  *      @dev: device
1174  *
1175  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1176  *      called under RTNL.  This is needed if received packets may be
1177  *      forwarded to another interface.
1178  */
1179 void dev_disable_lro(struct net_device *dev)
1180 {
1181         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1182             dev->ethtool_ops->set_flags) {
1183                 u32 flags = dev->ethtool_ops->get_flags(dev);
1184                 if (flags & ETH_FLAG_LRO) {
1185                         flags &= ~ETH_FLAG_LRO;
1186                         dev->ethtool_ops->set_flags(dev, flags);
1187                 }
1188         }
1189         WARN_ON(dev->features & NETIF_F_LRO);
1190 }
1191 EXPORT_SYMBOL(dev_disable_lro);
1192
1193
/*
 *      Starts out as 1.  While it is non-zero,
 *      register_netdevice_notifier() links the notifier into the chain
 *      but does not replay REGISTER/UP events to it.
 */
static int dev_boot_phase = 1;
1195
1196 /*
1197  *      Device change register/unregister. These are not inline or static
1198  *      as we export them to the world.
1199  */
1200
1201 /**
1202  *      register_netdevice_notifier - register a network notifier block
1203  *      @nb: notifier
1204  *
1205  *      Register a notifier to be called when network device events occur.
1206  *      The notifier passed is linked into the kernel structures and must
1207  *      not be reused until it has been unregistered. A negative errno code
1208  *      is returned on a failure.
1209  *
1210  *      When registered all registration and up events are replayed
1211  *      to the new notifier to allow device to have a race free
1212  *      view of the network device list.
1213  */
1214
1215 int register_netdevice_notifier(struct notifier_block *nb)
1216 {
1217         struct net_device *dev;
1218         struct net_device *last;
1219         struct net *net;
1220         int err;
1221
1222         rtnl_lock();
1223         err = raw_notifier_chain_register(&netdev_chain, nb);
1224         if (err)
1225                 goto unlock;
1226         if (dev_boot_phase)
1227                 goto unlock;
1228         for_each_net(net) {
1229                 for_each_netdev(net, dev) {
1230                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1231                         err = notifier_to_errno(err);
1232                         if (err)
1233                                 goto rollback;
1234
1235                         if (!(dev->flags & IFF_UP))
1236                                 continue;
1237
1238                         nb->notifier_call(nb, NETDEV_UP, dev);
1239                 }
1240         }
1241
1242 unlock:
1243         rtnl_unlock();
1244         return err;
1245
1246 rollback:
1247         last = dev;
1248         for_each_net(net) {
1249                 for_each_netdev(net, dev) {
1250                         if (dev == last)
1251                                 break;
1252
1253                         if (dev->flags & IFF_UP) {
1254                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1255                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1256                         }
1257                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1258                 }
1259         }
1260
1261         raw_notifier_chain_unregister(&netdev_chain, nb);
1262         goto unlock;
1263 }
1264
1265 /**
1266  *      unregister_netdevice_notifier - unregister a network notifier block
1267  *      @nb: notifier
1268  *
1269  *      Unregister a notifier previously registered by
1270  *      register_netdevice_notifier(). The notifier is unlinked into the
1271  *      kernel structures and may then be reused. A negative errno code
1272  *      is returned on a failure.
1273  */
1274
1275 int unregister_netdevice_notifier(struct notifier_block *nb)
1276 {
1277         int err;
1278
1279         rtnl_lock();
1280         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1281         rtnl_unlock();
1282         return err;
1283 }
1284
1285 /**
1286  *      call_netdevice_notifiers - call all network notifier blocks
1287  *      @val: value passed unmodified to notifier function
1288  *      @dev: net_device pointer passed unmodified to notifier function
1289  *
1290  *      Call all network notifier blocks.  Parameters and return value
1291  *      are as for raw_notifier_call_chain().
1292  */
1293
1294 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1295 {
1296         return raw_notifier_call_chain(&netdev_chain, val, dev);
1297 }
1298
/*
 * When > 0 there are consumers of rx skb time stamps.  Incremented by
 * net_enable_timestamp(), decremented by net_disable_timestamp() and
 * tested by net_timestamp().
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1301
1302 void net_enable_timestamp(void)
1303 {
1304         atomic_inc(&netstamp_needed);
1305 }
1306
1307 void net_disable_timestamp(void)
1308 {
1309         atomic_dec(&netstamp_needed);
1310 }
1311
1312 static inline void net_timestamp(struct sk_buff *skb)
1313 {
1314         if (atomic_read(&netstamp_needed))
1315                 __net_timestamp(skb);
1316         else
1317                 skb->tstamp.tv64 = 0;
1318 }
1319
1320 /*
1321  *      Support routine. Sends outgoing frames to any network
1322  *      taps currently in use.
1323  */
1324
1325 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1326 {
1327         struct packet_type *ptype;
1328
1329         net_timestamp(skb);
1330
1331         rcu_read_lock();
1332         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1333                 /* Never send packets back to the socket
1334                  * they originated from - MvS (miquels@drinkel.ow.org)
1335                  */
1336                 if ((ptype->dev == dev || !ptype->dev) &&
1337                     (ptype->af_packet_priv == NULL ||
1338                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1339                         struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1340                         if (!skb2)
1341                                 break;
1342
1343                         /* skb->nh should be correctly
1344                            set by sender, so that the second statement is
1345                            just protection against buggy protocols.
1346                          */
1347                         skb_reset_mac_header(skb2);
1348
1349                         if (skb_network_header(skb2) < skb2->data ||
1350                             skb2->network_header > skb2->tail) {
1351                                 if (net_ratelimit())
1352                                         printk(KERN_CRIT "protocol %04x is "
1353                                                "buggy, dev %s\n",
1354                                                skb2->protocol, dev->name);
1355                                 skb_reset_network_header(skb2);
1356                         }
1357
1358                         skb2->transport_header = skb2->network_header;
1359                         skb2->pkt_type = PACKET_OUTGOING;
1360                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1361                 }
1362         }
1363         rcu_read_unlock();
1364 }
1365
1366
1367 static inline void __netif_reschedule(struct Qdisc *q)
1368 {
1369         struct softnet_data *sd;
1370         unsigned long flags;
1371
1372         local_irq_save(flags);
1373         sd = &__get_cpu_var(softnet_data);
1374         q->next_sched = sd->output_queue;
1375         sd->output_queue = q;
1376         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1377         local_irq_restore(flags);
1378 }
1379
1380 void __netif_schedule(struct Qdisc *q)
1381 {
1382         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1383                 __netif_reschedule(q);
1384 }
1385 EXPORT_SYMBOL(__netif_schedule);
1386
1387 void dev_kfree_skb_irq(struct sk_buff *skb)
1388 {
1389         if (atomic_dec_and_test(&skb->users)) {
1390                 struct softnet_data *sd;
1391                 unsigned long flags;
1392
1393                 local_irq_save(flags);
1394                 sd = &__get_cpu_var(softnet_data);
1395                 skb->next = sd->completion_queue;
1396                 sd->completion_queue = skb;
1397                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1398                 local_irq_restore(flags);
1399         }
1400 }
1401 EXPORT_SYMBOL(dev_kfree_skb_irq);
1402
/*
 * Free @skb from any context: uses dev_kfree_skb_irq() in hard irq
 * context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1411
1412
1413 /**
1414  * netif_device_detach - mark device as removed
1415  * @dev: network device
1416  *
1417  * Mark device as removed from system and therefore no longer available.
1418  */
1419 void netif_device_detach(struct net_device *dev)
1420 {
1421         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1422             netif_running(dev)) {
1423                 netif_stop_queue(dev);
1424         }
1425 }
1426 EXPORT_SYMBOL(netif_device_detach);
1427
1428 /**
1429  * netif_device_attach - mark device as attached
1430  * @dev: network device
1431  *
1432  * Mark device as attached from system and restart if needed.
1433  */
1434 void netif_device_attach(struct net_device *dev)
1435 {
1436         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1437             netif_running(dev)) {
1438                 netif_wake_queue(dev);
1439                 __netdev_watchdog_up(dev);
1440         }
1441 }
1442 EXPORT_SYMBOL(netif_device_attach);
1443
1444 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1445 {
1446         return ((features & NETIF_F_GEN_CSUM) ||
1447                 ((features & NETIF_F_IP_CSUM) &&
1448                  protocol == htons(ETH_P_IP)) ||
1449                 ((features & NETIF_F_IPV6_CSUM) &&
1450                  protocol == htons(ETH_P_IPV6)));
1451 }
1452
1453 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1454 {
1455         if (can_checksum_protocol(dev->features, skb->protocol))
1456                 return true;
1457
1458         if (skb->protocol == htons(ETH_P_8021Q)) {
1459                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1460                 if (can_checksum_protocol(dev->features & dev->vlan_features,
1461                                           veh->h_vlan_encapsulated_proto))
1462                         return true;
1463         }
1464
1465         return false;
1466 }
1467
1468 /*
1469  * Invalidate hardware checksum when packet is to be mangled, and
1470  * complete checksum manually on outgoing path.
1471  */
1472 int skb_checksum_help(struct sk_buff *skb)
1473 {
1474         __wsum csum;
1475         int ret = 0, offset;
1476
1477         if (skb->ip_summed == CHECKSUM_COMPLETE)
1478                 goto out_set_summed;
1479
1480         if (unlikely(skb_shinfo(skb)->gso_size)) {
1481                 /* Let GSO fix up the checksum. */
1482                 goto out_set_summed;
1483         }
1484
1485         offset = skb->csum_start - skb_headroom(skb);
1486         BUG_ON(offset >= skb_headlen(skb));
1487         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1488
1489         offset += skb->csum_offset;
1490         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1491
1492         if (skb_cloned(skb) &&
1493             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1494                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1495                 if (ret)
1496                         goto out;
1497         }
1498
1499         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1500 out_set_summed:
1501         skb->ip_summed = CHECKSUM_NONE;
1502 out:
1503         return ret;
1504 }
1505
/**
 *      skb_gso_segment - Perform segmentation on skb.
 *      @skb: buffer to segment
 *      @features: features for the output path (see dev->features)
 *
 *      This function segments the given skb and returns a list of segments.
 *
 *      It may return NULL if the skb requires no segmentation.  This is
 *      only possible when GSO is used for verifying header integrity.
 *
 *      An ERR_PTR() is returned on failure; it is -EPROTONOSUPPORT when
 *      no registered packet type with a gso_segment handler matches
 *      skb->protocol.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        int err;

        /* Record the mac header length and pull it off the skb. */
        skb_reset_mac_header(skb);
        skb->mac_len = skb->network_header - skb->mac_header;
        __skb_pull(skb, skb->mac_len);

        /*
         * Anything other than CHECKSUM_PARTIAL triggers a warning; a
         * cloned header is then replaced by a private copy.
         */
        if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
                if (skb_header_cloned(skb) &&
                    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
                        return ERR_PTR(err);
        }

        rcu_read_lock();
        /* Only the first matching handler in the hash bucket is used. */
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                                err = ptype->gso_send_check(skb);
                                /*
                                 * On error this is the result.  With
                                 * err == 0 it is NULL, which is what is
                                 * returned when the output path can take
                                 * the skb unsegmented.
                                 */
                                segs = ERR_PTR(err);
                                if (err || skb_gso_ok(skb, features))
                                        break;
                                /* Move skb->data back to the network header. */
                                __skb_push(skb, (skb->data -
                                                 skb_network_header(skb)));
                        }
                        segs = ptype->gso_segment(skb, features);
                        break;
                }
        }
        rcu_read_unlock();

        /* Restore skb->data to the mac header. */
        __skb_push(skb, skb->data - skb_mac_header(skb));

        return segs;
}

EXPORT_SYMBOL(skb_gso_segment);
1557
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        if (!net_ratelimit())
                return;

        printk(KERN_ERR "%s: hw csum failure.\n",
                dev ? dev->name : "<unknown>");
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1570
/*
 * Returns 1 when @skb has a page fragment in high memory and @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise.  Without CONFIG_HIGHMEM
 * the answer is always 0.
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        int i;

        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                        if (PageHighMem(skb_shinfo(skb)->frags[i].page))
                                return 1;
                }
        }

#endif
        return 0;
}
1591
/*
 * Private data kept in skb->cb while an skb carries a list of GSO
 * segments: the destructor the skb had before dev_gso_segment()
 * installed dev_gso_skb_destructor().
 */
struct dev_gso_cb {
        void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1597
1598 static void dev_gso_skb_destructor(struct sk_buff *skb)
1599 {
1600         struct dev_gso_cb *cb;
1601
1602         do {
1603                 struct sk_buff *nskb = skb->next;
1604
1605                 skb->next = nskb->next;
1606                 nskb->next = NULL;
1607                 kfree_skb(nskb);
1608         } while (skb->next);
1609
1610         cb = DEV_GSO_CB(skb);
1611         if (cb->destructor)
1612                 cb->destructor(skb);
1613 }
1614
1615 /**
1616  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1617  *      @skb: buffer to segment
1618  *
1619  *      This function segments the given skb and stores the list of segments
1620  *      in skb->next.
1621  */
1622 static int dev_gso_segment(struct sk_buff *skb)
1623 {
1624         struct net_device *dev = skb->dev;
1625         struct sk_buff *segs;
1626         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1627                                          NETIF_F_SG : 0);
1628
1629         segs = skb_gso_segment(skb, features);
1630
1631         /* Verifying header integrity only. */
1632         if (!segs)
1633                 return 0;
1634
1635         if (IS_ERR(segs))
1636                 return PTR_ERR(segs);
1637
1638         skb->next = segs;
1639         DEV_GSO_CB(skb)->destructor = skb->destructor;
1640         skb->destructor = dev_gso_skb_destructor;
1641
1642         return 0;
1643 }
1644
/*
 * Hand @skb to the driver's ndo_start_xmit method.
 *
 * An skb without a segment list is first copied to any registered taps
 * and, if the device needs software GSO for it, segmented.  An skb with
 * a segment list on skb->next has its segments sent one at a time.
 *
 * Returns the driver's return code, NETDEV_TX_BUSY when @txq stops
 * while segments remain, or 0 once the skb has been freed here (all
 * segments sent, or segmentation failed).
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                        struct netdev_queue *txq)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        prefetch(&dev->netdev_ops->ndo_start_xmit);
        if (likely(!skb->next)) {
                /* Feed the taps only when at least one is registered. */
                if (!list_empty(&ptype_all))
                        dev_queue_xmit_nit(skb, dev);

                if (netif_needs_gso(dev, skb)) {
                        /* A segmentation error drops the packet. */
                        if (unlikely(dev_gso_segment(skb)))
                                goto out_kfree_skb;
                        /* Segments were produced: send them one by one. */
                        if (skb->next)
                                goto gso;
                }

                return ops->ndo_start_xmit(skb, dev);
        }

gso:
        do {
                struct sk_buff *nskb = skb->next;
                int rc;

                /* Detach the first segment from the list and send it. */
                skb->next = nskb->next;
                nskb->next = NULL;
                rc = ops->ndo_start_xmit(nskb, dev);
                if (unlikely(rc)) {
                        /* Not sent: put it back at the head of the list. */
                        nskb->next = skb->next;
                        skb->next = nskb;
                        return rc;
                }
                if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
                        return NETDEV_TX_BUSY;
        } while (skb->next);

        /* All segments are gone: restore the destructor saved by GSO. */
        skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
        kfree_skb(skb);
        return 0;
}
1688
1689 static u32 simple_tx_hashrnd;
1690 static int simple_tx_hashrnd_initialized = 0;
1691
1692 static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
1693 {
1694         u32 addr1, addr2, ports;
1695         u32 hash, ihl;
1696         u8 ip_proto = 0;
1697
1698         if (unlikely(!simple_tx_hashrnd_initialized)) {
1699                 get_random_bytes(&simple_tx_hashrnd, 4);
1700                 simple_tx_hashrnd_initialized = 1;
1701         }
1702
1703         switch (skb->protocol) {
1704         case htons(ETH_P_IP):
1705                 if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
1706                         ip_proto = ip_hdr(skb)->protocol;
1707                 addr1 = ip_hdr(skb)->saddr;
1708                 addr2 = ip_hdr(skb)->daddr;
1709                 ihl = ip_hdr(skb)->ihl;
1710                 break;
1711         case htons(ETH_P_IPV6):
1712                 ip_proto = ipv6_hdr(skb)->nexthdr;
1713                 addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
1714                 addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
1715                 ihl = (40 >> 2);
1716                 break;
1717         default:
1718                 return 0;
1719         }
1720
1721
1722         switch (ip_proto) {
1723         case IPPROTO_TCP:
1724         case IPPROTO_UDP:
1725         case IPPROTO_DCCP:
1726         case IPPROTO_ESP:
1727         case IPPROTO_AH:
1728         case IPPROTO_SCTP:
1729         case IPPROTO_UDPLITE:
1730                 ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
1731                 break;
1732
1733         default:
1734                 ports = 0;
1735                 break;
1736         }
1737
1738         hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);
1739
1740         return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1741 }
1742
1743 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1744                                         struct sk_buff *skb)
1745 {
1746         const struct net_device_ops *ops = dev->netdev_ops;
1747         u16 queue_index = 0;
1748
1749         if (ops->ndo_select_queue)
1750                 queue_index = ops->ndo_select_queue(dev, skb);
1751         else if (dev->real_num_tx_queues > 1)
1752                 queue_index = simple_tx_hash(dev, skb);
1753
1754         skb_set_queue_mapping(skb, queue_index);
1755         return netdev_get_tx_queue(dev, queue_index);
1756 }
1757
/**
 *      dev_queue_xmit - transmit a buffer
 *      @skb: buffer to transmit
 *
 *      Queue a buffer for transmission to a network device. The caller must
 *      have set the device and priority and built the buffer before calling
 *      this function. The function can be called from an interrupt.
 *
 *      Return value: 0 on success, a negative errno (-ENOMEM when the buffer
 *      could not be linearized or checksummed, -ENETDOWN when a queueless
 *      device could not take the packet), or the value produced by the queue
 *      discipline, which includes positive codes such as NET_XMIT_DROP.
 *      Callers must therefore treat any non-zero value as a failure.
 *      A success does not guarantee the frame will be transmitted as it may
 *      be dropped due to congestion or traffic shaping.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        struct Qdisc *q;
        int rc = -ENOMEM;       /* reported if linearize/checksum fails */

        /* GSO will handle the following emulations directly. */
        if (netif_needs_gso(dev, skb))
                goto gso;

        /* Flatten a frag_list the device cannot handle itself. */
        if (skb_shinfo(skb)->frag_list &&
            !(dev->features & NETIF_F_FRAGLIST) &&
            __skb_linearize(skb))
                goto out_kfree_skb;

        /* Fragmented skb is linearized if device does not support SG,
         * or if at least one of fragments is in highmem and device
         * does not support DMA from it.
         */
        if (skb_shinfo(skb)->nr_frags &&
            (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
            __skb_linearize(skb))
                goto out_kfree_skb;

        /* If packet is not checksummed and device does not support
         * checksumming for this protocol, complete checksumming here.
         */
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                skb_set_transport_header(skb, skb->csum_start -
                                              skb_headroom(skb));
                if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
                        goto out_kfree_skb;
        }

gso:
        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        txq = dev_pick_tx(dev, skb);
        q = rcu_dereference(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
        /* Device with a real queue: enqueue under the qdisc root lock. */
        if (q->enqueue) {
                spinlock_t *root_lock = qdisc_lock(q);

                spin_lock(root_lock);

                if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
                        /* Qdisc is deactivated: drop instead of queueing. */
                        kfree_skb(skb);
                        rc = NET_XMIT_DROP;
                } else {
                        rc = qdisc_enqueue_root(skb, q);
                        qdisc_run(q);
                }
                spin_unlock(root_lock);

                goto out;
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           Really, it is unlikely that netif_tx_lock protection is necessary
           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible that they rely on the protection
           provided by us here.

           So check for recursion on this CPU and take the TX lock anyway;
           with that check in place the lock cannot deadlock.
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                if (txq->xmit_lock_owner != cpu) {

                        HARD_TX_LOCK(dev, txq, cpu);

                        if (!netif_tx_queue_stopped(txq)) {
                                rc = 0;
                                /* Zero from the driver means it took the skb. */
                                if (!dev_hard_start_xmit(skb, dev, txq)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev, txq);
                        if (net_ratelimit())
                                printk(KERN_CRIT "Virtual device %s asks to "
                                       "queue packet!\n", dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately */
                        if (net_ratelimit())
                                printk(KERN_CRIT "Dead loop on virtual device "
                                       "%s, fix it urgently!\n", dev->name);
                }
        }

        /* Queueless device that is down, busy or recursing: drop. */
        rc = -ENETDOWN;
        rcu_read_unlock_bh();

out_kfree_skb:
        kfree_skb(skb);
        return rc;
out:
        rcu_read_unlock_bh();
        return rc;
}
1896
1897
1898 /*=======================================================================
1899                         Receiver routines
1900   =======================================================================*/
1901
/* Upper bound on the per-CPU input_pkt_queue length checked by netif_rx(). */
int netdev_max_backlog __read_mostly = 1000;
/* Initial packet budget for one run of net_rx_action(). */
int netdev_budget __read_mostly = 300;
/* Weight assigned to the backlog NAPI instance by process_backlog(). */
int weight_p __read_mostly = 64;            /* old backlog weight */

/* Per-CPU receive counters (total, dropped, ...) updated on the RX paths. */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1907
1908
1909 /**
1910  *      netif_rx        -       post buffer to the network code
1911  *      @skb: buffer to post
1912  *
1913  *      This function receives a packet from a device driver and queues it for
1914  *      the upper (protocol) levels to process.  It always succeeds. The buffer
1915  *      may be dropped during processing for congestion control or by the
1916  *      protocol layers.
1917  *
1918  *      return values:
1919  *      NET_RX_SUCCESS  (no congestion)
1920  *      NET_RX_DROP     (packet was dropped)
1921  *
1922  */
1923
1924 int netif_rx(struct sk_buff *skb)
1925 {
1926         struct softnet_data *queue;
1927         unsigned long flags;
1928
1929         /* if netpoll wants it, pretend we never saw it */
1930         if (netpoll_rx(skb))
1931                 return NET_RX_DROP;
1932
1933         if (!skb->tstamp.tv64)
1934                 net_timestamp(skb);
1935
1936         /*
1937          * The code is rearranged so that the path is the most
1938          * short when CPU is congested, but is still operating.
1939          */
1940         local_irq_save(flags);
1941         queue = &__get_cpu_var(softnet_data);
1942
1943         __get_cpu_var(netdev_rx_stat).total++;
1944         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1945                 if (queue->input_pkt_queue.qlen) {
1946 enqueue:
1947                         __skb_queue_tail(&queue->input_pkt_queue, skb);
1948                         local_irq_restore(flags);
1949                         return NET_RX_SUCCESS;
1950                 }
1951
1952                 napi_schedule(&queue->backlog);
1953                 goto enqueue;
1954         }
1955
1956         __get_cpu_var(netdev_rx_stat).dropped++;
1957         local_irq_restore(flags);
1958
1959         kfree_skb(skb);
1960         return NET_RX_DROP;
1961 }
1962
/*
 * netif_rx_ni - netif_rx() wrapper that also runs pending softirqs
 * @skb: buffer to post
 *
 * Posts the buffer with preemption disabled and, if that left a softirq
 * pending on this CPU, runs softirqs before preemption is re-enabled.
 * Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int ret;

        preempt_disable();

        ret = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();

        return ret;
}

EXPORT_SYMBOL(netif_rx_ni);
1977
1978 static void net_tx_action(struct softirq_action *h)
1979 {
1980         struct softnet_data *sd = &__get_cpu_var(softnet_data);
1981
1982         if (sd->completion_queue) {
1983                 struct sk_buff *clist;
1984
1985                 local_irq_disable();
1986                 clist = sd->completion_queue;
1987                 sd->completion_queue = NULL;
1988                 local_irq_enable();
1989
1990                 while (clist) {
1991                         struct sk_buff *skb = clist;
1992                         clist = clist->next;
1993
1994                         WARN_ON(atomic_read(&skb->users));
1995                         __kfree_skb(skb);
1996                 }
1997         }
1998
1999         if (sd->output_queue) {
2000                 struct Qdisc *head;
2001
2002                 local_irq_disable();
2003                 head = sd->output_queue;
2004                 sd->output_queue = NULL;
2005                 local_irq_enable();
2006
2007                 while (head) {
2008                         struct Qdisc *q = head;
2009                         spinlock_t *root_lock;
2010
2011                         head = head->next_sched;
2012
2013                         root_lock = qdisc_lock(q);
2014                         if (spin_trylock(root_lock)) {
2015                                 smp_mb__before_clear_bit();
2016                                 clear_bit(__QDISC_STATE_SCHED,
2017                                           &q->state);
2018                                 qdisc_run(q);
2019                                 spin_unlock(root_lock);
2020                         } else {
2021                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2022                                               &q->state)) {
2023                                         __netif_reschedule(q);
2024                                 } else {
2025                                         smp_mb__before_clear_bit();
2026                                         clear_bit(__QDISC_STATE_SCHED,
2027                                                   &q->state);
2028                                 }
2029                         }
2030                 }
2031         }
2032 }
2033
2034 static inline int deliver_skb(struct sk_buff *skb,
2035                               struct packet_type *pt_prev,
2036                               struct net_device *orig_dev)
2037 {
2038         atomic_inc(&skb->users);
2039         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2040 }
2041
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks defined here for ATM */
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
                                                unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;

/*
 * If bridge module is loaded call bridging hook.
 *  returns NULL if packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
                                        struct sk_buff *skb) __read_mostly;

/*
 * handle_bridge - give the bridge a chance to take a received packet
 *
 * Loopback packets and packets whose device is not a bridge port are
 * returned untouched.  Otherwise any handler still pending in *pt_prev
 * is run first (its result stored in *ret) and the packet is passed to
 * the bridge hook, whose return value is propagated.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
                                            struct packet_type **pt_prev, int *ret,
                                            struct net_device *orig_dev)
{
        struct net_bridge_port *port;

        if (skb->pkt_type == PACKET_LOOPBACK)
                return skb;

        port = rcu_dereference(skb->dev->br_port);
        if (port == NULL)
                return skb;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)      (skb)
#endif
2075
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
/* Receive hook installed by the macvlan module. */
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * handle_macvlan - give macvlan a chance to take a received packet
 *
 * Packets whose device has no macvlan port are returned untouched.
 * Otherwise any handler still pending in *pt_prev is run first (its
 * result stored in *ret) and the packet is passed to the macvlan hook,
 * whose return value is propagated.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
                                             struct packet_type **pt_prev,
                                             int *ret,
                                             struct net_device *orig_dev)
{
        if (skb->dev->macvlan_port != NULL) {
                if (*pt_prev) {
                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
                        *pt_prev = NULL;
                }
                return macvlan_handle_frame_hook(skb);
        }

        return skb;
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)     (skb)
#endif
2097
2098 #ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? Otherwise we execute some useless
 * instructions (a compare and 2 extra stores) right now if we don't
 * have it on but do have CONFIG_NET_CLS_ACT.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 */
2107 static int ing_filter(struct sk_buff *skb)
2108 {
2109         struct net_device *dev = skb->dev;
2110         u32 ttl = G_TC_RTTL(skb->tc_verd);
2111         struct netdev_queue *rxq;
2112         int result = TC_ACT_OK;
2113         struct Qdisc *q;
2114
2115         if (MAX_RED_LOOP < ttl++) {
2116                 printk(KERN_WARNING
2117                        "Redir loop detected Dropping packet (%d->%d)\n",
2118                        skb->iif, dev->ifindex);
2119                 return TC_ACT_SHOT;
2120         }
2121
2122         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2123         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2124
2125         rxq = &dev->rx_queue;
2126
2127         q = rxq->qdisc;
2128         if (q != &noop_qdisc) {
2129                 spin_lock(qdisc_lock(q));
2130                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2131                         result = qdisc_enqueue_root(skb, q);
2132                 spin_unlock(qdisc_lock(q));
2133         }
2134
2135         return result;
2136 }
2137
2138 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2139                                          struct packet_type **pt_prev,
2140                                          int *ret, struct net_device *orig_dev)
2141 {
2142         if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2143                 goto out;
2144
2145         if (*pt_prev) {
2146                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2147                 *pt_prev = NULL;
2148         } else {
2149                 /* Huh? Why does turning on AF_PACKET affect this? */
2150                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2151         }
2152
2153         switch (ing_filter(skb)) {
2154         case TC_ACT_SHOT:
2155         case TC_ACT_STOLEN:
2156                 kfree_skb(skb);
2157                 return NULL;
2158         }
2159
2160 out:
2161         skb->tc_verd = 0;
2162         return skb;
2163 }
2164 #endif
2165
2166 /*
2167  *      netif_nit_deliver - deliver received packets to network taps
2168  *      @skb: buffer
2169  *
2170  *      This function is used to deliver incoming packets to network
2171  *      taps. It should be used when the normal netif_receive_skb path
2172  *      is bypassed, for example because of VLAN acceleration.
2173  */
2174 void netif_nit_deliver(struct sk_buff *skb)
2175 {
2176         struct packet_type *ptype;
2177
2178         if (list_empty(&ptype_all))
2179                 return;
2180
2181         skb_reset_network_header(skb);
2182         skb_reset_transport_header(skb);
2183         skb->mac_len = skb->network_header - skb->mac_header;
2184
2185         rcu_read_lock();
2186         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2187                 if (!ptype->dev || ptype->dev == skb->dev)
2188                         deliver_skb(skb, ptype, skb->dev);
2189         }
2190         rcu_read_unlock();
2191 }
2192
/**
 *      netif_receive_skb - process receive buffer from network
 *      @skb: buffer to process
 *
 *      netif_receive_skb() is the main receive data processing function.
 *      The buffer is always consumed: it is delivered to the matching
 *      packet handlers, taken by one of the hooks (VLAN acceleration,
 *      netpoll, ingress classification, bridge, macvlan), or freed.
 *      The buffer may be dropped during processing for congestion
 *      control or by the protocol layers.
 *
 *      This function may only be called from softirq context and interrupts
 *      should be enabled.
 *
 *      Return values (usually ignored):
 *      NET_RX_SUCCESS: no congestion
 *      NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
        /*
         * pt_prev holds the handler found most recently; it is only
         * invoked once the next match is found (via deliver_skb(), which
         * takes an extra reference) or at the very end, where it is
         * called directly on the original reference.
         */
        struct packet_type *ptype, *pt_prev;
        struct net_device *orig_dev;
        struct net_device *null_or_orig;
        int ret = NET_RX_DROP;
        __be16 type;

        /* Tagged packet fully handled by the VLAN acceleration path. */
        if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
                return NET_RX_SUCCESS;

        /* if we've gotten here through NAPI, check netpoll */
        if (netpoll_receive_skb(skb))
                return NET_RX_DROP;

        if (!skb->tstamp.tv64)
                net_timestamp(skb);

        if (!skb->iif)
                skb->iif = skb->dev->ifindex;

        /*
         * null_or_orig selects which handlers match below: NULL lets
         * handlers with no device binding match, orig_dev restricts
         * delivery to handlers bound to a specific device.
         */
        null_or_orig = NULL;
        orig_dev = skb->dev;
        if (orig_dev->master) {
                if (skb_bond_should_drop(skb))
                        null_or_orig = orig_dev; /* deliver only exact match */
                else
                        skb->dev = orig_dev->master;
        }

        __get_cpu_var(netdev_rx_stat).total++;

        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb->mac_len = skb->network_header - skb->mac_header;

        pt_prev = NULL;

        rcu_read_lock();

        /* Don't receive packets in an exiting network namespace */
        if (!net_alive(dev_net(skb->dev))) {
                kfree_skb(skb);
                goto out;
        }

#ifdef CONFIG_NET_CLS_ACT
        /* Packet flagged TC_NCLS: clear the flag and skip taps and ingress. */
        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                goto ncls;
        }
#endif

        /* First pass: handlers registered for every protocol (ptype_all). */
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
                    ptype->dev == orig_dev) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

#ifdef CONFIG_NET_CLS_ACT
        /* A NULL result means the ingress step consumed the packet. */
        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
                goto out;
ncls:
#endif

        /* Each hook returns NULL when it has taken the packet. */
        skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
                goto out;
        skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
                goto out;

        /* Second pass: handlers registered for this packet's protocol. */
        type = skb->protocol;
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type &&
                    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
                     ptype->dev == orig_dev)) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

        if (pt_prev) {
                /* Last handler gets the original reference (no atomic_inc). */
                ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
        } else {
                kfree_skb(skb);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        rcu_read_unlock();
        return ret;
}
2310
2311 /* Network device is going away, flush any packets still pending  */
2312 static void flush_backlog(void *arg)
2313 {
2314         struct net_device *dev = arg;
2315         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2316         struct sk_buff *skb, *tmp;
2317
2318         skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2319                 if (skb->dev == dev) {
2320                         __skb_unlink(skb, &queue->input_pkt_queue);
2321                         kfree_skb(skb);
2322                 }
2323 }
2324
2325 static int napi_gro_complete(struct sk_buff *skb)
2326 {
2327         struct packet_type *ptype;
2328         __be16 type = skb->protocol;
2329         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2330         int err = -ENOENT;
2331
2332         if (NAPI_GRO_CB(skb)->count == 1)
2333                 goto out;
2334
2335         rcu_read_lock();
2336         list_for_each_entry_rcu(ptype, head, list) {
2337                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2338                         continue;
2339
2340                 err = ptype->gro_complete(skb);
2341                 break;
2342         }
2343         rcu_read_unlock();
2344
2345         if (err) {
2346                 WARN_ON(&ptype->list == head);
2347                 kfree_skb(skb);
2348                 return NET_RX_SUCCESS;
2349         }
2350
2351 out:
2352         skb_shinfo(skb)->gso_size = 0;
2353         __skb_push(skb, -skb_network_offset(skb));
2354         return netif_receive_skb(skb);
2355 }
2356
2357 void napi_gro_flush(struct napi_struct *napi)
2358 {
2359         struct sk_buff *skb, *next;
2360
2361         for (skb = napi->gro_list; skb; skb = next) {
2362                 next = skb->next;
2363                 skb->next = NULL;
2364                 napi_gro_complete(skb);
2365         }
2366
2367         napi->gro_list = NULL;
2368 }
2369 EXPORT_SYMBOL(napi_gro_flush);
2370
/*
 * dev_gro_receive - try to merge or hold a packet for GRO
 * @napi: NAPI instance whose gro_list holds the candidate packets
 * @skb: newly received packet
 *
 * Returns -1 when the packet must take the normal receive path (GRO is
 * not enabled on the device, no protocol handler offers gro_receive,
 * the handler asked for a flush, or too many packets are already held).
 * Otherwise returns the "free" value the protocol handler left in the
 * packet's GRO control block; napi_gro_receive() frees the skb when
 * that value is 1.
 */
int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        struct sk_buff **pp = NULL;
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
        int count = 0;
        int same_flow;
        int mac_len;
        int free;

        if (!(skb->dev->features & NETIF_F_GRO))
                goto normal;

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
                struct sk_buff *p;

                /* Only unbound handlers that implement gro_receive qualify. */
                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
                        continue;

                skb_reset_network_header(skb);
                mac_len = skb->network_header - skb->mac_header;
                skb->mac_len = mac_len;
                NAPI_GRO_CB(skb)->same_flow = 0;
                NAPI_GRO_CB(skb)->flush = 0;
                NAPI_GRO_CB(skb)->free = 0;

                /*
                 * Count the held packets and rule out, as same-flow
                 * candidates, those whose MAC header differs.
                 */
                for (p = napi->gro_list; p; p = p->next) {
                        count++;

                        if (!NAPI_GRO_CB(p)->same_flow)
                                continue;

                        if (p->mac_len != mac_len ||
                            memcmp(skb_mac_header(p), skb_mac_header(skb),
                                   mac_len))
                                NAPI_GRO_CB(p)->same_flow = 0;
                }

                pp = ptype->gro_receive(&napi->gro_list, skb);
                break;
        }
        rcu_read_unlock();

        /* The loop ran off the end of the list: no handler was found. */
        if (&ptype->list == head)
                goto normal;

        same_flow = NAPI_GRO_CB(skb)->same_flow;
        free = NAPI_GRO_CB(skb)->free;

        /* The handler pointed at a held packet to complete: unlink and finish it. */
        if (pp) {
                struct sk_buff *nskb = *pp;

                *pp = nskb->next;
                nskb->next = NULL;
                napi_gro_complete(nskb);
                count--;
        }

        if (same_flow)
                goto ok;

        if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) {
                __skb_push(skb, -skb_network_offset(skb));
                goto normal;
        }

        /* Hold this packet at the head of the list as a new entry. */
        NAPI_GRO_CB(skb)->count = 1;
        skb_shinfo(skb)->gso_size = skb->len;
        skb->next = napi->gro_list;
        napi->gro_list = skb;

ok:
        return free;

normal:
        return -1;
}
EXPORT_SYMBOL(dev_gro_receive);
2451
2452 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2453 {
2454         struct sk_buff *p;
2455
2456         for (p = napi->gro_list; p; p = p->next) {
2457                 NAPI_GRO_CB(p)->same_flow = 1;
2458                 NAPI_GRO_CB(p)->flush = 0;
2459         }
2460
2461         return dev_gro_receive(napi, skb);
2462 }
2463
2464 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2465 {
2466         switch (__napi_gro_receive(napi, skb)) {
2467         case -1:
2468                 return netif_receive_skb(skb);
2469
2470         case 1:
2471                 kfree_skb(skb);
2472                 break;
2473         }
2474
2475         return NET_RX_SUCCESS;
2476 }
2477 EXPORT_SYMBOL(napi_gro_receive);
2478
2479 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2480 {
2481         skb_shinfo(skb)->nr_frags = 0;
2482
2483         skb->len -= skb->data_len;
2484         skb->truesize -= skb->data_len;
2485         skb->data_len = 0;
2486
2487         __skb_pull(skb, skb_headlen(skb));
2488         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2489
2490         napi->skb = skb;
2491 }
2492 EXPORT_SYMBOL(napi_reuse_skb);
2493
2494 struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
2495                                   struct napi_gro_fraginfo *info)
2496 {
2497         struct net_device *dev = napi->dev;
2498         struct sk_buff *skb = napi->skb;
2499
2500         napi->skb = NULL;
2501
2502         if (!skb) {
2503                 skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2504                 if (!skb)
2505                         goto out;
2506
2507                 skb_reserve(skb, NET_IP_ALIGN);
2508         }
2509
2510         BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
2511         skb_shinfo(skb)->nr_frags = info->nr_frags;
2512         memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags));
2513
2514         skb->data_len = info->len;
2515         skb->len += info->len;
2516         skb->truesize += info->len;
2517
2518         if (!pskb_may_pull(skb, ETH_HLEN)) {
2519                 napi_reuse_skb(napi, skb);
2520                 goto out;
2521         }
2522
2523         skb->protocol = eth_type_trans(skb, dev);
2524
2525         skb->ip_summed = info->ip_summed;
2526         skb->csum = info->csum;
2527
2528 out:
2529         return skb;
2530 }
2531 EXPORT_SYMBOL(napi_fraginfo_skb);
2532
2533 int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
2534 {
2535         struct sk_buff *skb = napi_fraginfo_skb(napi, info);
2536         int err = NET_RX_DROP;
2537
2538         if (!skb)
2539                 goto out;
2540
2541         err = NET_RX_SUCCESS;
2542
2543         switch (__napi_gro_receive(napi, skb)) {
2544         case -1:
2545                 return netif_receive_skb(skb);
2546
2547         case 0:
2548                 goto out;
2549         }
2550
2551         napi_reuse_skb(napi, skb);
2552
2553 out:
2554         return err;
2555 }
2556 EXPORT_SYMBOL(napi_gro_frags);
2557
2558 static int process_backlog(struct napi_struct *napi, int quota)
2559 {
2560         int work = 0;
2561         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2562         unsigned long start_time = jiffies;
2563
2564         napi->weight = weight_p;
2565         do {
2566                 struct sk_buff *skb;
2567
2568                 local_irq_disable();
2569                 skb = __skb_dequeue(&queue->input_pkt_queue);
2570                 if (!skb) {
2571                         __napi_complete(napi);
2572                         local_irq_enable();
2573                         break;
2574                 }
2575                 local_irq_enable();
2576
2577                 napi_gro_receive(napi, skb);
2578         } while (++work < quota && jiffies == start_time);
2579
2580         napi_gro_flush(napi);
2581
2582         return work;
2583 }
2584
2585 /**
2586  * __napi_schedule - schedule for receive
2587  * @n: entry to schedule
2588  *
2589  * The entry's receive function will be scheduled to run
2590  */
2591 void __napi_schedule(struct napi_struct *n)
2592 {
2593         unsigned long flags;
2594
2595         local_irq_save(flags);
2596         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2597         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2598         local_irq_restore(flags);
2599 }
2600 EXPORT_SYMBOL(__napi_schedule);
2601
2602 void __napi_complete(struct napi_struct *n)
2603 {
2604         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2605         BUG_ON(n->gro_list);
2606
2607         list_del(&n->poll_list);
2608         smp_mb__before_clear_bit();
2609         clear_bit(NAPI_STATE_SCHED, &n->state);
2610 }
2611 EXPORT_SYMBOL(__napi_complete);
2612
2613 void napi_complete(struct napi_struct *n)
2614 {
2615         unsigned long flags;
2616
2617         /*
2618          * don't let napi dequeue from the cpu poll list
2619          * just in case its running on a different cpu
2620          */
2621         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2622                 return;
2623
2624         napi_gro_flush(n);
2625         local_irq_save(flags);
2626         __napi_complete(n);
2627         local_irq_restore(flags);
2628 }
2629 EXPORT_SYMBOL(napi_complete);
2630
2631 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2632                     int (*poll)(struct napi_struct *, int), int weight)
2633 {
2634         INIT_LIST_HEAD(&napi->poll_list);
2635         napi->gro_list = NULL;
2636         napi->skb = NULL;
2637         napi->poll = poll;
2638         napi->weight = weight;
2639         list_add(&napi->dev_list, &dev->napi_list);
2640         napi->dev = dev;
2641 #ifdef CONFIG_NETPOLL
2642         spin_lock_init(&napi->poll_lock);
2643         napi->poll_owner = -1;
2644 #endif
2645         set_bit(NAPI_STATE_SCHED, &napi->state);
2646 }
2647 EXPORT_SYMBOL(netif_napi_add);
2648
2649 void netif_napi_del(struct napi_struct *napi)
2650 {
2651         struct sk_buff *skb, *next;
2652
2653         list_del_init(&napi->dev_list);
2654         kfree(napi->skb);
2655
2656         for (skb = napi->gro_list; skb; skb = next) {
2657                 next = skb->next;
2658                 skb->next = NULL;
2659                 kfree_skb(skb);
2660         }
2661
2662         napi->gro_list = NULL;
2663 }
2664 EXPORT_SYMBOL(netif_napi_del);
2665
2666
/*
 * NET_RX softirq handler: walk this cpu's poll list and call ->poll() on
 * each scheduled NAPI instance until the list is empty, the budget
 * (netdev_budget) is used up, or the time limit has passed.
 */
static void net_rx_action(struct softirq_action *h)
{
        struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
        unsigned long time_limit = jiffies + 2;
        int budget = netdev_budget;
        void *have;

        local_irq_disable();

        while (!list_empty(list)) {
                struct napi_struct *n;
                int work, weight;

                /* If the softirq window is exhausted then punt.
                 * Allow this to run for 2 jiffies, which will allow
                 * an average latency of 1.5/HZ.
                 */
                if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
                        goto softnet_break;

                local_irq_enable();

                /* Even though interrupts have been re-enabled, this
                 * access is safe because interrupts can only add new
                 * entries to the tail of this list, and only ->poll()
                 * calls can remove this head entry from the list.
                 */
                n = list_entry(list->next, struct napi_struct, poll_list);

                have = netpoll_poll_lock(n);

                weight = n->weight;

                /* This NAPI_STATE_SCHED test is for avoiding a race
                 * with netpoll's poll_napi().  Only the entity which
                 * obtains the lock and sees NAPI_STATE_SCHED set will
                 * actually make the ->poll() call.  Therefore we avoid
                 * accidentally calling ->poll() when NAPI is not scheduled.
                 */
                work = 0;
                if (test_bit(NAPI_STATE_SCHED, &n->state))
                        work = n->poll(n, weight);

                /* A ->poll() must never report more work than its weight. */
                WARN_ON_ONCE(work > weight);

                budget -= work;

                local_irq_disable();

                /* Drivers must not modify the NAPI state if they
                 * consume the entire weight.  In such cases this code
                 * still "owns" the NAPI instance and therefore can
                 * move the instance around on the list at-will.
                 */
                if (unlikely(work == weight)) {
                        if (unlikely(napi_disable_pending(n)))
                                __napi_complete(n);
                        else
                                list_move_tail(&n->poll_list, list);
                }

                netpoll_poll_unlock(have);
        }
out:
        /* Both the normal exit and softnet_break arrive here with irqs off. */
        local_irq_enable();

#ifdef CONFIG_NET_DMA
        /*
         * There may not be any more sk_buffs coming right now, so push
         * any pending DMA copies to hardware
         */
        dma_issue_pending_all();
#endif

        return;

softnet_break:
        /* Out of budget or time: count the squeeze and re-raise the softirq. */
        __get_cpu_var(netdev_rx_stat).time_squeeze++;
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        goto out;
}
2748
/*
 * Handlers indexed by address family; slots are filled by
 * register_gifconf() and each non-NULL one is called by dev_ifconf().
 */
static gifconf_func_t * gifconf_list [NPROTO];
2750
2751 /**
2752  *      register_gifconf        -       register a SIOCGIF handler
2753  *      @family: Address family
2754  *      @gifconf: Function handler
2755  *
2756  *      Register protocol dependent address dumping routines. The handler
2757  *      that is passed must not be freed or reused until it has been replaced
2758  *      by another handler.
2759  */
2760 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2761 {
2762         if (family >= NPROTO)
2763                 return -EINVAL;
2764         gifconf_list[family] = gifconf;
2765         return 0;
2766 }
2767
2768
2769 /*
2770  *      Map an interface index to its name (SIOCGIFNAME)
2771  */
2772
2773 /*
2774  *      We need this ioctl for efficient implementation of the
2775  *      if_indextoname() function required by the IPv6 API.  Without
2776  *      it, we would have to search all the interfaces to find a
2777  *      match.  --pb
2778  */
2779
2780 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2781 {
2782         struct net_device *dev;
2783         struct ifreq ifr;
2784
2785         /*
2786          *      Fetch the caller's info block.
2787          */
2788
2789         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2790                 return -EFAULT;
2791
2792         read_lock(&dev_base_lock);
2793         dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2794         if (!dev) {
2795                 read_unlock(&dev_base_lock);
2796                 return -ENODEV;
2797         }
2798
2799         strcpy(ifr.ifr_name, dev->name);
2800         read_unlock(&dev_base_lock);
2801
2802         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2803                 return -EFAULT;
2804         return 0;
2805 }
2806
2807 /*
2808  *      Perform a SIOCGIFCONF call. This structure will change
2809  *      size eventually, and there is nothing I can do about it.
2810  *      Thus we will need a 'compatibility mode'.
2811  */
2812
2813 static int dev_ifconf(struct net *net, char __user *arg)
2814 {
2815         struct ifconf ifc;
2816         struct net_device *dev;
2817         char __user *pos;
2818         int len;
2819         int total;
2820         int i;
2821
2822         /*
2823          *      Fetch the caller's info block.
2824          */
2825
2826         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2827                 return -EFAULT;
2828
2829         pos = ifc.ifc_buf;
2830         len = ifc.ifc_len;
2831
2832         /*
2833          *      Loop over the interfaces, and write an info block for each.
2834          */
2835
2836         total = 0;
2837         for_each_netdev(net, dev) {
2838                 for (i = 0; i < NPROTO; i++) {
2839                         if (gifconf_list[i]) {
2840                                 int done;
2841                                 if (!pos)
2842                                         done = gifconf_list[i](dev, NULL, 0);
2843                                 else
2844                                         done = gifconf_list[i](dev, pos + total,
2845                                                                len - total);
2846                                 if (done < 0)
2847                                         return -EFAULT;
2848                                 total += done;
2849                         }
2850                 }
2851         }
2852
2853         /*
2854          *      All done.  Write the updated control block back to the caller.
2855          */
2856         ifc.ifc_len = total;
2857
2858         /*
2859          *      Both BSD and Solaris return 0 here, so we do too.
2860          */
2861         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2862 }
2863
2864 #ifdef CONFIG_PROC_FS
2865 /*
2866  *      This is invoked by the /proc filesystem handler to display a device
2867  *      in detail.
2868  */
2869 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2870         __acquires(dev_base_lock)
2871 {
2872         struct net *net = seq_file_net(seq);
2873         loff_t off;
2874         struct net_device *dev;
2875
2876         read_lock(&dev_base_lock);
2877         if (!*pos)
2878                 return SEQ_START_TOKEN;
2879
2880         off = 1;
2881         for_each_netdev(net, dev)
2882                 if (off++ == *pos)
2883                         return dev;
2884
2885         return NULL;
2886 }
2887
2888 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2889 {
2890         struct net *net = seq_file_net(seq);
2891         ++*pos;
2892         return v == SEQ_START_TOKEN ?
2893                 first_net_device(net) : next_net_device((struct net_device *)v);
2894 }
2895
/* Drop the dev_base_lock read lock taken by dev_seq_start(). */
void dev_seq_stop(struct seq_file *seq, void *v)
        __releases(dev_base_lock)
{
        read_unlock(&dev_base_lock);
}
2901
2902 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2903 {
2904         const struct net_device_stats *stats = dev_get_stats(dev);
2905
2906         seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2907                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2908                    dev->name, stats->rx_bytes, stats->rx_packets,
2909                    stats->rx_errors,
2910                    stats->rx_dropped + stats->rx_missed_errors,
2911                    stats->rx_fifo_errors,
2912                    stats->rx_length_errors + stats->rx_over_errors +
2913                     stats->rx_crc_errors + stats->rx_frame_errors,
2914                    stats->rx_compressed, stats->multicast,
2915                    stats->tx_bytes, stats->tx_packets,
2916                    stats->tx_errors, stats->tx_dropped,
2917                    stats->tx_fifo_errors, stats->collisions,
2918                    stats->tx_carrier_errors +
2919                     stats->tx_aborted_errors +
2920                     stats->tx_window_errors +
2921                     stats->tx_heartbeat_errors,
2922                    stats->tx_compressed);
2923 }
2924
2925 /*
2926  *      Called from the PROCfs module. This now uses the new arbitrary sized
2927  *      /proc/net interface to create /proc/net/dev
2928  */
2929 static int dev_seq_show(struct seq_file *seq, void *v)
2930 {
2931         if (v == SEQ_START_TOKEN)
2932                 seq_puts(seq, "Inter-|   Receive                            "
2933                               "                    |  Transmit\n"
2934                               " face |bytes    packets errs drop fifo frame "
2935                               "compressed multicast|bytes    packets errs "
2936                               "drop fifo colls carrier compressed\n");
2937         else
2938                 dev_seq_printf_stats(seq, v);
2939         return 0;
2940 }
2941
2942 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2943 {
2944         struct netif_rx_stats *rc = NULL;
2945
2946         while (*pos < nr_cpu_ids)
2947                 if (cpu_online(*pos)) {
2948                         rc = &per_cpu(netdev_rx_stat, *pos);
2949                         break;
2950                 } else
2951                         ++*pos;
2952         return rc;
2953 }
2954
/* Begin iteration at the first online cpu at or after *pos. */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
        return softnet_get_online(pos);
}
2959
/* Step past the current cpu and return the next online cpu's stats. */
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return softnet_get_online(pos);
}
2965
/* Nothing to undo: softnet_seq_start() takes no lock. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
2969
2970 static int softnet_seq_show(struct seq_file *seq, void *v)
2971 {
2972         struct netif_rx_stats *s = v;
2973
2974         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2975                    s->total, s->dropped, s->time_squeeze, 0,
2976                    0, 0, 0, 0, /* was fastroute */
2977                    s->cpu_collision );
2978         return 0;
2979 }
2980
/* seq_file iterator over the network devices (header line plus one per device). */
static const struct seq_operations dev_seq_ops = {
        .start = dev_seq_start,
        .next  = dev_seq_next,
        .stop  = dev_seq_stop,
        .show  = dev_seq_show,
};
2987
/* Open handler: set up a seq_file using dev_seq_ops via seq_open_net(). */
static int dev_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &dev_seq_ops,
                            sizeof(struct seq_net_private));
}
2993
/* File operations for the "dev" proc entry created in dev_proc_net_init(). */
static const struct file_operations dev_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = dev_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
3001
/* seq_file iterator over the per-cpu netdev_rx_stat of online cpus. */
static const struct seq_operations softnet_seq_ops = {
        .start = softnet_seq_start,
        .next  = softnet_seq_next,
        .stop  = softnet_seq_stop,
        .show  = softnet_seq_show,
};
3008
/* Open handler: plain seq_open() with softnet_seq_ops. */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &softnet_seq_ops);
}
3013
/* File operations for the "softnet_stat" proc entry. */
static const struct file_operations softnet_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = softnet_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
3021
3022 static void *ptype_get_idx(loff_t pos)
3023 {
3024         struct packet_type *pt = NULL;
3025         loff_t i = 0;
3026         int t;
3027
3028         list_for_each_entry_rcu(pt, &ptype_all, list) {
3029                 if (i == pos)
3030                         return pt;
3031                 ++i;
3032         }
3033
3034         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3035                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3036                         if (i == pos)
3037                                 return pt;
3038                         ++i;
3039                 }
3040         }
3041         return NULL;
3042 }
3043
3044 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3045         __acquires(RCU)
3046 {
3047         rcu_read_lock();
3048         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3049 }
3050
3051 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3052 {
3053         struct packet_type *pt;
3054         struct list_head *nxt;
3055         int hash;
3056
3057         ++*pos;
3058         if (v == SEQ_START_TOKEN)
3059                 return ptype_get_idx(0);
3060
3061         pt = v;
3062         nxt = pt->list.next;
3063         if (pt->type == htons(ETH_P_ALL)) {
3064                 if (nxt != &ptype_all)
3065                         goto found;
3066                 hash = 0;
3067                 nxt = ptype_base[0].next;
3068         } else
3069                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3070
3071         while (nxt == &ptype_base[hash]) {
3072                 if (++hash >= PTYPE_HASH_SIZE)
3073                         return NULL;
3074                 nxt = ptype_base[hash].next;
3075         }
3076 found:
3077         return list_entry(nxt, struct packet_type, list);
3078 }
3079
/* Leave the RCU read-side section entered by ptype_seq_start(). */
static void ptype_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}
3085
3086 static int ptype_seq_show(struct seq_file *seq, void *v)
3087 {
3088         struct packet_type *pt = v;
3089
3090         if (v == SEQ_START_TOKEN)
3091                 seq_puts(seq, "Type Device      Function\n");
3092         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3093                 if (pt->type == htons(ETH_P_ALL))
3094                         seq_puts(seq, "ALL ");
3095                 else
3096                         seq_printf(seq, "%04x", ntohs(pt->type));
3097
3098                 seq_printf(seq, " %-8s %pF\n",
3099                            pt->dev ? pt->dev->name : "", pt->func);
3100         }
3101
3102         return 0;
3103 }
3104
/* seq_file iterator over the registered packet types. */
static const struct seq_operations ptype_seq_ops = {
        .start = ptype_seq_start,
        .next  = ptype_seq_next,
        .stop  = ptype_seq_stop,
        .show  = ptype_seq_show,
};
3111
/* Open handler: set up a seq_file using ptype_seq_ops via seq_open_net(). */
static int ptype_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &ptype_seq_ops,
                        sizeof(struct seq_net_private));
}
3117
/* File operations for the "ptype" proc entry. */
static const struct file_operations ptype_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = ptype_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
3125
3126
3127 static int __net_init dev_proc_net_init(struct net *net)
3128 {
3129         int rc = -ENOMEM;
3130
3131         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3132                 goto out;
3133         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3134                 goto out_dev;
3135         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3136                 goto out_softnet;
3137
3138         if (wext_proc_init(net))
3139                 goto out_ptype;
3140         rc = 0;
3141 out:
3142         return rc;
3143 out_ptype:
3144         proc_net_remove(net, "ptype");
3145 out_softnet:
3146         proc_net_remove(net, "softnet_stat");
3147 out_dev:
3148         proc_net_remove(net, "dev");
3149         goto out;
3150 }
3151
/*
 * Undo dev_proc_net_init() for @net: wext teardown first, then the three
 * proc entries in the reverse order of their creation.
 */
static void __net_exit dev_proc_net_exit(struct net *net)
{
        wext_proc_exit(net);

        proc_net_remove(net, "ptype");
        proc_net_remove(net, "softnet_stat");
        proc_net_remove(net, "dev");
}
3160
/* Init/exit hooks registered by dev_proc_init() via register_pernet_subsys(). */
static struct pernet_operations __net_initdata dev_proc_ops = {
        .init = dev_proc_net_init,
        .exit = dev_proc_net_exit,
};
3165
/* Register dev_proc_ops; returns the result of register_pernet_subsys(). */
static int __init dev_proc_init(void)
{
        return register_pernet_subsys(&dev_proc_ops);
}
3170 #else
3171 #define dev_proc_init() 0
3172 #endif  /* CONFIG_PROC_FS */
3173
3174
3175 /**
3176  *      netdev_set_master       -       set up master/slave pair
3177  *      @slave: slave device
3178  *      @master: new master device
3179  *
3180  *      Changes the master device of the slave. Pass %NULL to break the
3181  *      bonding. The caller must hold the RTNL semaphore. On a failure
3182  *      a negative errno code is returned. On success the reference counts
3183  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3184  *      function returns zero.
3185  */
3186 int netdev_set_master(struct net_device *slave, struct net_device *master)
3187 {
3188         struct net_device *old = slave->master;
3189
3190         ASSERT_RTNL();
3191
3192         if (master) {
3193                 if (old)
3194                         return -EBUSY;
3195                 dev_hold(master);
3196         }
3197
3198         slave->master = master;
3199
3200         synchronize_net();
3201
3202         if (old)
3203                 dev_put(old);
3204
3205         if (master)
3206                 slave->flags |= IFF_SLAVE;
3207         else
3208                 slave->flags &= ~IFF_SLAVE;
3209
3210         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3211         return 0;
3212 }
3213
3214 static void dev_change_rx_flags(struct net_device *dev, int flags)
3215 {
3216         const struct net_device_ops *ops = dev->netdev_ops;
3217
3218         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3219                 ops->ndo_change_rx_flags(dev, flags);
3220 }
3221
3222 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3223 {
3224         unsigned short old_flags = dev->flags;
3225         uid_t uid;
3226         gid_t gid;
3227
3228         ASSERT_RTNL();
3229
3230         dev->flags |= IFF_PROMISC;
3231         dev->promiscuity += inc;
3232         if (dev->promiscuity == 0) {
3233                 /*
3234                  * Avoid overflow.
3235                  * If inc causes overflow, untouch promisc and return error.
3236                  */
3237                 if (inc < 0)
3238                         dev->flags &= ~IFF_PROMISC;
3239                 else {
3240                         dev->promiscuity -= inc;
3241                         printk(KERN_WARNING "%s: promiscuity touches roof, "
3242                                 "set promiscuity failed, promiscuity feature "
3243                                 "of device might be broken.\n", dev->name);
3244                         return -EOVERFLOW;
3245                 }
3246         }
3247         if (dev->flags != old_flags) {
3248                 printk(KERN_INFO "device %s %s promiscuous mode\n",
3249                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3250                                                                "left");
3251                 if (audit_enabled) {
3252                         current_uid_gid(&uid, &gid);
3253                         audit_log(current->audit_context, GFP_ATOMIC,
3254                                 AUDIT_ANOM_PROMISCUOUS,
3255                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3256                                 dev->name, (dev->flags & IFF_PROMISC),
3257                                 (old_flags & IFF_PROMISC),
3258                                 audit_get_loginuid(current),
3259                                 uid, gid,
3260                                 audit_get_sessionid(current));
3261                 }
3262
3263                 dev_change_rx_flags(dev, IFF_PROMISC);
3264         }
3265         return 0;
3266 }
3267
3268 /**
3269  *      dev_set_promiscuity     - update promiscuity count on a device
3270  *      @dev: device
3271  *      @inc: modifier
3272  *
3273  *      Add or remove promiscuity from a device. While the count in the device
3274  *      remains above zero the interface remains promiscuous. Once it hits zero
3275  *      the device reverts back to normal filtering operation. A negative inc
3276  *      value is used to drop promiscuity on the device.
3277  *      Return 0 if successful or a negative errno code on error.
3278  */
3279 int dev_set_promiscuity(struct net_device *dev, int inc)
3280 {
3281         unsigned short old_flags = dev->flags;
3282         int err;
3283
3284         err = __dev_set_promiscuity(dev, inc);
3285         if (err < 0)
3286                 return err;
3287         if (dev->flags != old_flags)
3288                 dev_set_rx_mode(dev);
3289         return err;
3290 }
3291
3292 /**
3293  *      dev_set_allmulti        - update allmulti count on a device
3294  *      @dev: device
3295  *      @inc: modifier
3296  *
3297  *      Add or remove reception of all multicast frames to a device. While the
3298  *      count in the device remains above zero the interface remains listening
3299  *      to all interfaces. Once it hits zero the device reverts back to normal
3300  *      filtering operation. A negative @inc value is used to drop the counter
3301  *      when releasing a resource needing all multicasts.
3302  *      Return 0 if successful or a negative errno code on error.
3303  */
3304
3305 int dev_set_allmulti(struct net_device *dev, int inc)
3306 {
3307         unsigned short old_flags = dev->flags;
3308
3309         ASSERT_RTNL();
3310
3311         dev->flags |= IFF_ALLMULTI;
3312         dev->allmulti += inc;
3313         if (dev->allmulti == 0) {
3314                 /*
3315                  * Avoid overflow.
3316                  * If inc causes overflow, untouch allmulti and return error.
3317                  */
3318                 if (inc < 0)
3319                         dev->flags &= ~IFF_ALLMULTI;
3320                 else {
3321                         dev->allmulti -= inc;
3322                         printk(KERN_WARNING "%s: allmulti touches roof, "
3323                                 "set allmulti failed, allmulti feature of "
3324                                 "device might be broken.\n", dev->name);
3325                         return -EOVERFLOW;
3326                 }
3327         }
3328         if (dev->flags ^ old_flags) {
3329                 dev_change_rx_flags(dev, IFF_ALLMULTI);
3330                 dev_set_rx_mode(dev);
3331         }
3332         return 0;
3333 }
3334
3335 /*
3336  *      Upload unicast and multicast address lists to device and
3337  *      configure RX filtering. When the device doesn't support unicast
3338  *      filtering it is put in promiscuous mode while unicast addresses
3339  *      are present.
3340  */
3341 void __dev_set_rx_mode(struct net_device *dev)
3342 {
3343         const struct net_device_ops *ops = dev->netdev_ops;
3344
3345         /* dev_open will call this function so the list will stay sane. */
3346         if (!(dev->flags&IFF_UP))
3347                 return;
3348
3349         if (!netif_device_present(dev))
3350                 return;
3351
3352         if (ops->ndo_set_rx_mode)
3353                 ops->ndo_set_rx_mode(dev);
3354         else {
3355                 /* Unicast addresses changes may only happen under the rtnl,
3356                  * therefore calling __dev_set_promiscuity here is safe.
3357                  */
3358                 if (dev->uc_count > 0 && !dev->uc_promisc) {
3359                         __dev_set_promiscuity(dev, 1);
3360                         dev->uc_promisc = 1;
3361                 } else if (dev->uc_count == 0 && dev->uc_promisc) {
3362                         __dev_set_promiscuity(dev, -1);
3363                         dev->uc_promisc = 0;
3364                 }
3365
3366                 if (ops->ndo_set_multicast_list)
3367                         ops->ndo_set_multicast_list(dev);
3368         }
3369 }
3370
/* Locked wrapper: run __dev_set_rx_mode() under netif_addr_lock_bh(). */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}
3377
3378 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3379                       void *addr, int alen, int glbl)
3380 {
3381         struct dev_addr_list *da;
3382
3383         for (; (da = *list) != NULL; list = &da->next) {
3384                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3385                     alen == da->da_addrlen) {
3386                         if (glbl) {
3387                                 int old_glbl = da->da_gusers;
3388                                 da->da_gusers = 0;
3389                                 if (old_glbl == 0)
3390                                         break;
3391                         }
3392                         if (--da->da_users)
3393                                 return 0;
3394
3395                         *list = da->next;
3396                         kfree(da);
3397                         (*count)--;
3398                         return 0;
3399                 }
3400         }
3401         return -ENOENT;
3402 }
3403
3404 int __dev_addr_add(struct dev_addr_list **list, int *count,
3405                    void *addr, int alen, int glbl)
3406 {
3407         struct dev_addr_list *da;
3408
3409         for (da = *list; da != NULL; da = da->next) {
3410                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3411                     da->da_addrlen == alen) {
3412                         if (glbl) {
3413                                 int old_glbl = da->da_gusers;
3414                                 da->da_gusers = 1;
3415                                 if (old_glbl)
3416                                         return 0;
3417                         }
3418                         da->da_users++;
3419                         return 0;
3420                 }
3421         }
3422
3423         da = kzalloc(sizeof(*da), GFP_ATOMIC);
3424         if (da == NULL)
3425                 return -ENOMEM;
3426         memcpy(da->da_addr, addr, alen);
3427         da->da_addrlen = alen;
3428         da->da_users = 1;
3429         da->da_gusers = glbl ? 1 : 0;
3430         da->next = *list;
3431         *list = da;
3432         (*count)++;
3433         return 0;
3434 }
3435
3436 /**
3437  *      dev_unicast_delete      - Release secondary unicast address.
3438  *      @dev: device
3439  *      @addr: address to delete
3440  *      @alen: length of @addr
3441  *
3442  *      Release reference to a secondary unicast address and remove it
3443  *      from the device if the reference count drops to zero.
3444  *
3445  *      The caller must hold the rtnl_mutex.
3446  */
3447 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3448 {
3449         int err;
3450
3451         ASSERT_RTNL();
3452
3453         netif_addr_lock_bh(dev);
3454         err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3455         if (!err)
3456                 __dev_set_rx_mode(dev);
3457         netif_addr_unlock_bh(dev);
3458         return err;
3459 }
3460 EXPORT_SYMBOL(dev_unicast_delete);
3461
3462 /**
3463  *      dev_unicast_add         - add a secondary unicast address
3464  *      @dev: device
3465  *      @addr: address to add
3466  *      @alen: length of @addr
3467  *
3468  *      Add a secondary unicast address to the device or increase
3469  *      the reference count if it already exists.
3470  *
3471  *      The caller must hold the rtnl_mutex.
3472  */
3473 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3474 {
3475         int err;
3476
3477         ASSERT_RTNL();
3478
3479         netif_addr_lock_bh(dev);
3480         err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3481         if (!err)
3482                 __dev_set_rx_mode(dev);
3483         netif_addr_unlock_bh(dev);
3484         return err;
3485 }
3486 EXPORT_SYMBOL(dev_unicast_add);
3487
3488 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3489                     struct dev_addr_list **from, int *from_count)
3490 {
3491         struct dev_addr_list *da, *next;
3492         int err = 0;
3493
3494         da = *from;
3495         while (da != NULL) {
3496                 next = da->next;
3497                 if (!da->da_synced) {
3498                         err = __dev_addr_add(to, to_count,
3499                                              da->da_addr, da->da_addrlen, 0);
3500                         if (err < 0)
3501                                 break;
3502                         da->da_synced = 1;
3503                         da->da_users++;
3504                 } else if (da->da_users == 1) {
3505                         __dev_addr_delete(to, to_count,
3506                                           da->da_addr, da->da_addrlen, 0);
3507                         __dev_addr_delete(from, from_count,
3508                                           da->da_addr, da->da_addrlen, 0);
3509                 }
3510                 da = next;
3511         }
3512         return err;
3513 }
3514
3515 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3516                        struct dev_addr_list **from, int *from_count)
3517 {
3518         struct dev_addr_list *da, *next;
3519
3520         da = *from;
3521         while (da != NULL) {
3522                 next = da->next;
3523                 if (da->da_synced) {
3524                         __dev_addr_delete(to, to_count,
3525                                           da->da_addr, da->da_addrlen, 0);
3526                         da->da_synced = 0;
3527                         __dev_addr_delete(from, from_count,
3528                                           da->da_addr, da->da_addrlen, 0);
3529                 }
3530                 da = next;
3531         }
3532 }
3533
3534 /**
3535  *      dev_unicast_sync - Synchronize device's unicast list to another device
3536  *      @to: destination device
3537  *      @from: source device
3538  *
3539  *      Add newly added addresses to the destination device and release
3540  *      addresses that have no users left. The source device must be
3541  *      locked by netif_tx_lock_bh.
3542  *
3543  *      This function is intended to be called from the dev->set_rx_mode
3544  *      function of layered software devices.
3545  */
3546 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3547 {
3548         int err = 0;
3549
3550         netif_addr_lock_bh(to);
3551         err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3552                               &from->uc_list, &from->uc_count);
3553         if (!err)
3554                 __dev_set_rx_mode(to);
3555         netif_addr_unlock_bh(to);
3556         return err;
3557 }
3558 EXPORT_SYMBOL(dev_unicast_sync);
3559
/**
 *      dev_unicast_unsync - Remove synchronized addresses from the destination device
 *      @to: destination device
 *      @from: source device
 *
 *      Remove all addresses that were added to the destination device by
 *      dev_unicast_sync(). This function is intended to be called from the
 *      dev->stop function of layered software devices.
 *
 *      Unlike dev_unicast_sync(), this function takes the address list
 *      locks of both devices itself (@from first, then @to) and refreshes
 *      the rx mode of @to unconditionally before dropping them.
 */
void dev_unicast_unsync(struct net_device *to, struct net_device *from)
{
        /* Lock order: source first (the _bh variant), destination nested inside. */
        netif_addr_lock_bh(from);
        netif_addr_lock(to);

        __dev_addr_unsync(&to->uc_list, &to->uc_count,
                          &from->uc_list, &from->uc_count);
        /* Refresh the destination's rx mode while both locks are still held. */
        __dev_set_rx_mode(to);

        /* Release in the reverse order of acquisition. */
        netif_addr_unlock(to);
        netif_addr_unlock_bh(from);
}
EXPORT_SYMBOL(dev_unicast_unsync);
3582
3583 static void __dev_addr_discard(struct dev_addr_list **list)
3584 {
3585         struct dev_addr_list *tmp;
3586
3587         while (*list != NULL) {
3588                 tmp = *list;
3589                 *list = tmp->next;
3590                 if (tmp->da_users > tmp->da_gusers)
3591                         printk("__dev_addr_discard: address leakage! "
3592                                "da_users=%d\n", tmp->da_users);
3593                 kfree(tmp);
3594         }
3595 }
3596
3597 static void dev_addr_discard(struct net_device *dev)
3598 {
3599         netif_addr_lock_bh(dev);
3600
3601         __dev_addr_discard(&dev->uc_list);
3602         dev->uc_count = 0;
3603
3604         __dev_addr_discard(&dev->mc_list);
3605         dev->mc_count = 0;
3606
3607         netif_addr_unlock_bh(dev);
3608 }
3609
3610 /**
3611  *      dev_get_flags - get flags reported to userspace
3612  *      @dev: device
3613  *
3614  *      Get the combination of flag bits exported through APIs to userspace.
3615  */
3616 unsigned dev_get_flags(const struct net_device *dev)
3617 {
3618         unsigned flags;
3619
3620         flags = (dev->flags & ~(IFF_PROMISC |
3621                                 IFF_ALLMULTI |
3622                                 IFF_RUNNING |
3623                                 IFF_LOWER_UP |
3624                                 IFF_DORMANT)) |
3625                 (dev->gflags & (IFF_PROMISC |
3626                                 IFF_ALLMULTI));
3627
3628         if (netif_running(dev)) {
3629                 if (netif_oper_up(dev))
3630                         flags |= IFF_RUNNING;
3631                 if (netif_carrier_ok(dev))
3632                         flags |= IFF_LOWER_UP;
3633                 if (netif_dormant(dev))
3634                         flags |= IFF_DORMANT;
3635         }
3636
3637         return flags;
3638 }
3639
/**
 *      dev_change_flags - change device settings
 *      @dev: device
 *      @flags: device state flags
 *
 *      Change settings on device based state flags. The flags are
 *      in the userspace exported format.
 *
 *      Must be called with the RTNL held (asserted below). Returns 0, or,
 *      when the IFF_UP bit differs from the current state, whatever
 *      dev_open()/dev_close() returned.
 */
int dev_change_flags(struct net_device *dev, unsigned flags)
{
        int ret, changes;
        /* Snapshot taken before any modification; all change tests use it. */
        int old_flags = dev->flags;

        ASSERT_RTNL();

        /*
         *      Set the flags on our device.
         *
         *      Only the bits listed in the first mask are taken from @flags;
         *      IFF_UP, IFF_VOLATILE, IFF_PROMISC and IFF_ALLMULTI keep the
         *      value the device currently has and are handled separately
         *      further down.
         */

        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
                               IFF_AUTOMEDIA)) |
                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
                                    IFF_ALLMULTI));

        /*
         *      Load in the correct multicast list now the flags have changed.
         */

        if ((old_flags ^ flags) & IFF_MULTICAST)
                dev_change_rx_flags(dev, IFF_MULTICAST);

        dev_set_rx_mode(dev);

        /*
         *      Have we downed the interface. We handle IFF_UP ourselves
         *      according to user attempts to set it, rather than blindly
         *      setting it.
         */

        ret = 0;
        if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
                /* Was up: close the device; was down: open it. */
                ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);

                if (!ret)
                        dev_set_rx_mode(dev);
        }

        /*
         * Send NETDEV_CHANGE only if the device is up at this point and a
         * bit outside IFF_UP/IFF_PROMISC/IFF_ALLMULTI/IFF_VOLATILE differs
         * from the snapshot.
         */
        if (dev->flags & IFF_UP &&
            ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
                                          IFF_VOLATILE)))
                call_netdevice_notifiers(NETDEV_CHANGE, dev);

        /*
         * The requested IFF_PROMISC state is compared against dev->gflags
         * (not dev->flags); on a difference the gflags bit is toggled and
         * +1 or -1 is passed to dev_set_promiscuity().
         */
        if ((flags ^ dev->gflags) & IFF_PROMISC) {
                int inc = (flags & IFF_PROMISC) ? +1 : -1;
                dev->gflags ^= IFF_PROMISC;
                dev_set_promiscuity(dev, inc);
        }

        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
           is important. Some (broken) drivers set IFF_PROMISC, when
           IFF_ALLMULTI is requested not asking us and not reporting.
         */
        /* Same scheme as above, for IFF_ALLMULTI and dev_set_allmulti(). */
        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
                int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
                dev->gflags ^= IFF_ALLMULTI;
                dev_set_allmulti(dev, inc);
        }

        /* Exclude state transition flags, already notified */
        changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
        if (changes)
                rtmsg_ifinfo(RTM_NEWLINK, dev, changes);

        /* Note: ret only reflects the open/close step, if there was one. */
        return ret;
}
3716
3717 /**
3718  *      dev_set_mtu - Change maximum transfer unit
3719  *      @dev: device
3720  *      @new_mtu: new transfer unit
3721  *
3722  *      Change the maximum transfer size of the network device.
3723  */
3724 int dev_set_mtu(struct net_device *dev, int new_mtu)
3725 {
3726         const struct net_device_ops *ops = dev->netdev_ops;
3727         int err;
3728
3729         if (new_mtu == dev->mtu)
3730                 return 0;
3731
3732         /*      MTU must be positive.    */
3733         if (new_mtu < 0)
3734                 return -EINVAL;
3735
3736         if (!netif_device_present(dev))
3737                 return -ENODEV;
3738
3739         err = 0;
3740         if (ops->ndo_change_mtu)
3741                 err = ops->ndo_change_mtu(dev, new_mtu);
3742         else
3743                 dev->mtu = new_mtu;
3744
3745         if (!err && dev->flags & IFF_UP)
3746                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3747         return err;
3748 }
3749
3750 /**
3751  *      dev_set_mac_address - Change Media Access Control Address
3752  *      @dev: device
3753  *      @sa: new address
3754  *
3755  *      Change the hardware (MAC) address of the device
3756  */
3757 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3758 {
3759         const struct net_device_ops *ops = dev->netdev_ops;
3760         int err;
3761
3762         if (!ops->ndo_set_mac_address)
3763                 return -EOPNOTSUPP;
3764         if (sa->sa_family != dev->type)
3765                 return -EINVAL;
3766         if (!netif_device_present(dev))
3767                 return -ENODEV;
3768         err = ops->ndo_set_mac_address(dev, sa);
3769         if (!err)
3770                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3771         return err;
3772 }
3773
3774 /*
3775  *      Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3776  */
3777 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3778 {
3779         int err;
3780         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3781
3782         if (!dev)
3783                 return -ENODEV;
3784
3785         switch (cmd) {
3786                 case SIOCGIFFLAGS:      /* Get interface flags */
3787                         ifr->ifr_flags = dev_get_flags(dev);
3788                         return 0;
3789
3790                 case SIOCGIFMETRIC:     /* Get the metric on the interface
3791                                            (currently unused) */
3792                         ifr->ifr_metric = 0;
3793                         return 0;
3794
3795                 case SIOCGIFMTU:        /* Get the MTU of a device */
3796                         ifr->ifr_mtu = dev->mtu;
3797                         return 0;
3798
3799                 case SIOCGIFHWADDR:
3800                         if (!dev->addr_len)
3801                                 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3802                         else
3803                                 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3804                                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3805                         ifr->ifr_hwaddr.sa_family = dev->type;
3806                         return 0;
3807
3808                 case SIOCGIFSLAVE:
3809                         err = -EINVAL;
3810                         break;
3811
3812                 case SIOCGIFMAP:
3813                         ifr->ifr_map.mem_start = dev->mem_start;
3814                         ifr->ifr_map.mem_end   = dev->mem_end;
3815                         ifr->ifr_map.base_addr = dev->base_addr;
3816                         ifr->ifr_map.irq       = dev->irq;
3817                         ifr->ifr_map.dma       = dev->dma;
3818                         ifr->ifr_map.port      = dev->if_port;
3819                         return 0;
3820
3821                 case SIOCGIFINDEX:
3822                         ifr->ifr_ifindex = dev->ifindex;
3823                         return 0;
3824
3825                 case SIOCGIFTXQLEN:
3826                         ifr->ifr_qlen = dev->tx_queue_len;
3827                         return 0;
3828
3829                 default:
3830                         /* dev_ioctl() should ensure this case
3831                          * is never reached
3832                          */
3833                         WARN_ON(1);
3834                         err = -EINVAL;
3835                         break;
3836
3837         }
3838         return err;
3839 }
3840
3841 /*
3842  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
3843  */
3844 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3845 {
3846         int err;
3847         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3848         const struct net_device_ops *ops;
3849
3850         if (!dev)
3851                 return -ENODEV;
3852
3853         ops = dev->netdev_ops;
3854
3855         switch (cmd) {
3856                 case SIOCSIFFLAGS:      /* Set interface flags */
3857                         return dev_change_flags(dev, ifr->ifr_flags);
3858
3859                 case SIOCSIFMETRIC:     /* Set the metric on the interface
3860                                            (currently unused) */
3861                         return -EOPNOTSUPP;
3862
3863                 case SIOCSIFMTU:        /* Set the MTU of a device */
3864                         return dev_set_mtu(dev, ifr->ifr_mtu);
3865
3866                 case SIOCSIFHWADDR:
3867                         return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3868
3869                 case SIOCSIFHWBROADCAST:
3870                         if (ifr->ifr_hwaddr.sa_family != dev->type)
3871                                 return -EINVAL;
3872                         memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3873                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3874                         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3875                         return 0;
3876
3877                 case SIOCSIFMAP:
3878                         if (ops->ndo_set_config) {
3879                                 if (!netif_device_present(dev))
3880                                         return -ENODEV;
3881                                 return ops->ndo_set_config(dev, &ifr->ifr_map);
3882                         }
3883                         return -EOPNOTSUPP;
3884
3885                 case SIOCADDMULTI:
3886                         if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3887                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3888                                 return -EINVAL;
3889                         if (!netif_device_present(dev))
3890                                 return -ENODEV;
3891                         return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3892                                           dev->addr_len, 1);
3893
3894                 case SIOCDELMULTI:
3895                         if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3896                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3897                                 return -EINVAL;
3898                         if (!netif_device_present(dev))
3899                                 return -ENODEV;
3900                         return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3901                                              dev->addr_len, 1);
3902
3903                 case SIOCSIFTXQLEN:
3904                         if (ifr->ifr_qlen < 0)
3905                                 return -EINVAL;
3906                         dev->tx_queue_len = ifr->ifr_qlen;
3907                         return 0;
3908
3909                 case SIOCSIFNAME:
3910                         ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3911                         return dev_change_name(dev, ifr->ifr_newname);
3912
3913                 /*
3914                  *      Unknown or private ioctl
3915                  */
3916
3917                 default:
3918                         if ((cmd >= SIOCDEVPRIVATE &&
3919                             cmd <= SIOCDEVPRIVATE + 15) ||
3920                             cmd == SIOCBONDENSLAVE ||
3921                             cmd == SIOCBONDRELEASE ||
3922                             cmd == SIOCBONDSETHWADDR ||
3923                             cmd == SIOCBONDSLAVEINFOQUERY ||
3924                             cmd == SIOCBONDINFOQUERY ||
3925                             cmd == SIOCBONDCHANGEACTIVE ||
3926                             cmd == SIOCGMIIPHY ||
3927                             cmd == SIOCGMIIREG ||
3928                             cmd == SIOCSMIIREG ||
3929                             cmd == SIOCBRADDIF ||
3930                             cmd == SIOCBRDELIF ||
3931                             cmd == SIOCWANDEV) {
3932                                 err = -EOPNOTSUPP;
3933                                 if (ops->ndo_do_ioctl) {
3934                                         if (netif_device_present(dev))
3935                                                 err = ops->ndo_do_ioctl(dev, ifr, cmd);
3936                                         else
3937                                                 err = -ENODEV;
3938                                 }
3939                         } else
3940                                 err = -EINVAL;
3941
3942         }
3943         return err;
3944 }
3945
/*
 *      This function handles all "interface"-type I/O control requests. The actual
 *      'doing' part of this is dev_ifsioc above.
 */

/**
 *      dev_ioctl       -       network device ioctl
 *      @net: the applicable net namespace
 *      @cmd: command to issue
 *      @arg: pointer to a struct ifreq in user space
 *
 *      Issue ioctl functions to devices. This is normally called by the
 *      user space syscall interfaces but can sometimes be useful for
 *      other purposes. The return value is the return from the syscall if
 *      positive or a negative errno code on error.
 *
 *      Errors produced directly by this function: -EFAULT when the request
 *      cannot be copied from or back to user space, -EPERM when a
 *      privileged command is issued without CAP_NET_ADMIN, and -EINVAL for
 *      unsupported or unknown commands. Everything else comes from the
 *      helper the command is dispatched to.
 */

int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
        struct ifreq ifr;
        int ret;
        char *colon;

        /* One special case: SIOCGIFCONF takes ifconf argument
           and is run under the rtnl lock (taken right here), because
           it sleeps writing to user space.
         */

        if (cmd == SIOCGIFCONF) {
                rtnl_lock();
                ret = dev_ifconf(net, (char __user *) arg);
                rtnl_unlock();
                return ret;
        }
        /* SIOCGIFNAME is handed the user pointer directly as well. */
        if (cmd == SIOCGIFNAME)
                return dev_ifname(net, (struct ifreq __user *)arg);

        /* Every other command works on a kernel copy of the request. */
        if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
                return -EFAULT;

        /* Force NUL termination of the user-supplied interface name. */
        ifr.ifr_name[IFNAMSIZ-1] = 0;

        /*
         * Cut the name at the first ':' for the device lookup. The
         * value-returning branches below put the ':' back before copying
         * the request to user space; the private-ioctl branch in the
         * default case copies back without restoring it.
         */
        colon = strchr(ifr.ifr_name, ':');
        if (colon)
                *colon = 0;

        /*
         *      See which interface the caller is talking about.
         */

        switch (cmd) {
                /*
                 *      These ioctl calls:
                 *      - can be done by all.
                 *      - are served under read_lock(&dev_base_lock),
                 *        without taking the rtnl lock.
                 *      - return a value
                 */
                case SIOCGIFFLAGS:
                case SIOCGIFMETRIC:
                case SIOCGIFMTU:
                case SIOCGIFHWADDR:
                case SIOCGIFSLAVE:
                case SIOCGIFMAP:
                case SIOCGIFINDEX:
                case SIOCGIFTXQLEN:
                        dev_load(net, ifr.ifr_name);
                        read_lock(&dev_base_lock);
                        ret = dev_ifsioc_locked(net, &ifr, cmd);
                        read_unlock(&dev_base_lock);
                        if (!ret) {
                                if (colon)
                                        *colon = ':';
                                if (copy_to_user(arg, &ifr,
                                                 sizeof(struct ifreq)))
                                        ret = -EFAULT;
                        }
                        return ret;

                /*
                 *      ethtool requests are dispatched to dev_ethtool()
                 *      under the rtnl lock; no capability check is made
                 *      at this level.
                 */
                case SIOCETHTOOL:
                        dev_load(net, ifr.ifr_name);
                        rtnl_lock();
                        ret = dev_ethtool(net, &ifr);
                        rtnl_unlock();
                        if (!ret) {
                                if (colon)
                                        *colon = ':';
                                if (copy_to_user(arg, &ifr,
                                                 sizeof(struct ifreq)))
                                        ret = -EFAULT;
                        }
                        return ret;

                /*
                 *      These ioctl calls:
                 *      - require superuser power.
                 *      - require strict serialization.
                 *      - return a value
                 */
                case SIOCGMIIPHY:
                case SIOCGMIIREG:
                case SIOCSIFNAME:
                        if (!capable(CAP_NET_ADMIN))
                                return -EPERM;
                        dev_load(net, ifr.ifr_name);
                        rtnl_lock();
                        ret = dev_ifsioc(net, &ifr, cmd);
                        rtnl_unlock();
                        if (!ret) {
                                if (colon)
                                        *colon = ':';
                                if (copy_to_user(arg, &ifr,
                                                 sizeof(struct ifreq)))
                                        ret = -EFAULT;
                        }
                        return ret;

                /*
                 *      These ioctl calls:
                 *      - require superuser power.
                 *      - require strict serialization.
                 *      - do not return a value
                 */
                case SIOCSIFFLAGS:
                case SIOCSIFMETRIC:
                case SIOCSIFMTU:
                case SIOCSIFMAP:
                case SIOCSIFHWADDR:
                case SIOCSIFSLAVE:
                case SIOCADDMULTI:
                case SIOCDELMULTI:
                case SIOCSIFHWBROADCAST:
                case SIOCSIFTXQLEN:
                case SIOCSMIIREG:
                case SIOCBONDENSLAVE:
                case SIOCBONDRELEASE:
                case SIOCBONDSETHWADDR:
                case SIOCBONDCHANGEACTIVE:
                case SIOCBRADDIF:
                case SIOCBRDELIF:
                        if (!capable(CAP_NET_ADMIN))
                                return -EPERM;
                        /* fall through */
                /*
                 *      The two bonding queries enter here directly and so
                 *      skip the capability check above. Nothing is copied
                 *      back to user space on this path.
                 */
                case SIOCBONDSLAVEINFOQUERY:
                case SIOCBONDINFOQUERY:
                        dev_load(net, ifr.ifr_name);
                        rtnl_lock();
                        ret = dev_ifsioc(net, &ifr, cmd);
                        rtnl_unlock();
                        return ret;

                case SIOCGIFMEM:
                        /* Get the per device memory space. We can add this but
                         * currently do not support it */
                case SIOCSIFMEM:
                        /* Set the per device memory buffer space.
                         * Not applicable in our case */
                case SIOCSIFLINK:
                        return -EINVAL;

                /*
                 *      Unknown or private ioctl.
                 */
                default:
                        /*
                         * SIOCWANDEV and the 16 device-private commands go
                         * to dev_ifsioc() under the rtnl lock, without a
                         * capability check here, and the request is copied
                         * back on success.
                         */
                        if (cmd == SIOCWANDEV ||
                            (cmd >= SIOCDEVPRIVATE &&
                             cmd <= SIOCDEVPRIVATE + 15)) {
                                dev_load(net, ifr.ifr_name);
                                rtnl_lock();
                                ret = dev_ifsioc(net, &ifr, cmd);
                                rtnl_unlock();
                                if (!ret && copy_to_user(arg, &ifr,
                                                         sizeof(struct ifreq)))
                                        ret = -EFAULT;
                                return ret;
                        }
                        /* Take care of Wireless Extensions */
                        if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
                                return wext_handle_ioctl(net, &ifr, cmd, arg);
                        return -EINVAL;
        }
}
4127
4128
/**
 *      dev_new_index   -       allocate an ifindex
 *      @net: the applicable net namespace
 *
 *      Returns a positive interface index that is not currently used by
 *      any device in @net. Candidates are drawn from a function-static
 *      counter that restarts at 1 once it is no longer positive. The
 *      caller must hold the rtnl semaphore or the dev_base_lock to be
 *      sure the value remains unique.
 */
static int dev_new_index(struct net *net)
{
        static int ifindex;

        do {
                ifindex++;
                /* Stay within the positive range. */
                if (ifindex <= 0)
                        ifindex = 1;
        } while (__dev_get_by_index(net, ifindex));

        return ifindex;
}
4147
/* Delayed registration/unregistration */
static LIST_HEAD(net_todo_list);

/* Queue @dev at the tail of net_todo_list, linked through dev->todo_list. */
static void net_set_todo(struct net_device *dev)
{
        list_add_tail(&dev->todo_list, &net_todo_list);
}
4155
/*
 * Take a registered device back out of the system: close it, unlink it
 * from the device chain, shut down its queueing discipline, notify the
 * protocols, flush its address lists, call the driver's ndo_uninit()
 * (if any) and remove its kobject entries.
 *
 * Must be called with the RTNL held and never during the boot phase
 * (both are asserted). A device still in NETREG_UNINITIALIZED state is
 * only logged and warned about; any state other than NETREG_REGISTERED
 * beyond that is treated as a bug.
 */
static void rollback_registered(struct net_device *dev)
{
        BUG_ON(dev_boot_phase);
        ASSERT_RTNL();

        /* Some devices call without registering for initialization unwind. */
        if (dev->reg_state == NETREG_UNINITIALIZED) {
                printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
                                  "was registered\n", dev->name, dev);

                WARN_ON(1);
                return;
        }

        BUG_ON(dev->reg_state != NETREG_REGISTERED);

        /* If device is running, close it first. */
        dev_close(dev);

        /* And unlink it from device chain. */
        unlist_netdevice(dev);

        dev->reg_state = NETREG_UNREGISTERING;

        /* First synchronization point: after unlinking, before teardown. */
        synchronize_net();

        /* Shutdown queueing discipline. */
        dev_shutdown(dev);


        /* Notify protocols, that we are about to destroy
           this device. They should clean all the things.
        */
        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

        /*
         *      Flush the unicast and multicast chains
         */
        dev_addr_discard(dev);

        if (dev->netdev_ops->ndo_uninit)
                dev->netdev_ops->ndo_uninit(dev);

        /* Notifier chain MUST detach us from master device. */
        WARN_ON(dev->master);

        /* Remove entries from kobject tree */
        netdev_unregister_kobject(dev);

        /* Second synchronization point, before the reference is dropped. */
        synchronize_net();

        /* Presumably drops the reference taken at registration - confirm. */
        dev_put(dev);
}
4209
4210 static void __netdev_init_queue_locks_one(struct net_device *dev,
4211                                           struct netdev_queue *dev_queue,
4212                                           void *_unused)
4213 {
4214         spin_lock_init(&dev_queue->_xmit_lock);
4215         netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4216         dev_queue->xmit_lock_owner = -1;
4217 }
4218
4219 static void netdev_init_queue_locks(struct net_device *dev)
4220 {
4221         netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4222         __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4223 }
4224
4225 unsigned long netdev_fix_features(unsigned long features, const char *name)
4226 {
4227         /* Fix illegal SG+CSUM combinations. */
4228         if ((features & NETIF_F_SG) &&
4229             !(features & NETIF_F_ALL_CSUM)) {
4230                 if (name)
4231                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4232                                "checksum feature.\n", name);
4233                 features &= ~NETIF_F_SG;
4234         }
4235
4236         /* TSO requires that SG is present as well. */
4237         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4238                 if (name)
4239                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4240                                "SG feature.\n", name);
4241                 features &= ~NETIF_F_TSO;
4242         }
4243
4244         if (features & NETIF_F_UFO) {
4245                 if (!(features & NETIF_F_GEN_CSUM)) {
4246                         if (name)
4247                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4248                                        "since no NETIF_F_HW_CSUM feature.\n",
4249                                        name);
4250                         features &= ~NETIF_F_UFO;
4251                 }
4252
4253                 if (!(features & NETIF_F_SG)) {
4254                         if (name)
4255                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4256                                        "since no NETIF_F_SG feature.\n", name);
4257                         features &= ~NETIF_F_UFO;
4258                 }
4259         }
4260
4261         return features;
4262 }
4263 EXPORT_SYMBOL(netdev_fix_features);
4264
4265 /**
4266  *      register_netdevice      - register a network device
4267  *      @dev: device to register
4268  *
4269  *      Take a completed network device structure and add it to the kernel
4270  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4271  *      chain. 0 is returned on success. A negative errno code is returned
4272  *      on a failure to set up the device, or if the name is a duplicate.
4273  *
4274  *      Callers must hold the rtnl semaphore. You may want
4275  *      register_netdev() instead of this.
4276  *
4277  *      BUGS:
4278  *      The locking appears insufficient to guarantee two parallel registers
4279  *      will not get the same name.
4280  */
4281
4282 int register_netdevice(struct net_device *dev)
4283 {
4284         struct hlist_head *head;
4285         struct hlist_node *p;
4286         int ret;
4287         struct net *net = dev_net(dev);
4288
4289         BUG_ON(dev_boot_phase);
4290         ASSERT_RTNL();
4291
4292         might_sleep();
4293
4294         /* When net_device's are persistent, this will be fatal. */
4295         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4296         BUG_ON(!net);
4297
4298         spin_lock_init(&dev->addr_list_lock);
4299         netdev_set_addr_lockdep_class(dev);
4300         netdev_init_queue_locks(dev);
4301
4302         dev->iflink = -1;
4303
4304 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4305         /* Netdevice_ops API compatiability support.
4306          * This is temporary until all network devices are converted.
4307          */
4308         if (dev->netdev_ops) {
4309                 const struct net_device_ops *ops = dev->netdev_ops;
4310
4311                 dev->init = ops->ndo_init;
4312                 dev->uninit = ops->ndo_uninit;
4313                 dev->open = ops->ndo_open;
4314                 dev->change_rx_flags = ops->ndo_change_rx_flags;
4315                 dev->set_rx_mode = ops->ndo_set_rx_mode;
4316                 dev->set_multicast_list = ops->ndo_set_multicast_list;
4317                 dev->set_mac_address = ops->ndo_set_mac_address;
4318                 dev->validate_addr = ops->ndo_validate_addr;
4319                 dev->do_ioctl = ops->ndo_do_ioctl;
4320                 dev->set_config = ops->ndo_set_config;
4321                 dev->change_mtu = ops->ndo_change_mtu;
4322                 dev->tx_timeout = ops->ndo_tx_timeout;
4323                 dev->get_stats = ops->ndo_get_stats;
4324                 dev->vlan_rx_register = ops->ndo_vlan_rx_register;
4325                 dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
4326                 dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
4327 #ifdef CONFIG_NET_POLL_CONTROLLER
4328                 dev->poll_controller = ops->ndo_poll_controller;
4329 #endif
4330         } else {
4331                 char drivername[64];
4332                 pr_info("%s (%s): not using net_device_ops yet\n",
4333                         dev->name, netdev_drivername(dev, drivername, 64));
4334
4335                 /* This works only because net_device_ops and the
4336                    compatiablity structure are the same. */
4337                 dev->netdev_ops = (void *) &(dev->init);
4338         }
4339 #endif
4340
4341         /* Init, if this function is available */
4342         if (dev->netdev_ops->ndo_init) {
4343                 ret = dev->netdev_ops->ndo_init(dev);
4344                 if (ret) {
4345                         if (ret > 0)
4346                                 ret = -EIO;
4347                         goto out;
4348                 }
4349         }
4350
4351         if (!dev_valid_name(dev->name)) {
4352                 ret = -EINVAL;
4353                 goto err_uninit;
4354         }
4355
4356         dev->ifindex = dev_new_index(net);
4357         if (dev->iflink == -1)
4358                 dev->iflink = dev->ifindex;
4359
4360         /* Check for existence of name */
4361         head = dev_name_hash(net, dev->name);
4362         hlist_for_each(p, head) {
4363                 struct net_device *d
4364                         = hlist_entry(p, struct net_device, name_hlist);
4365                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4366                         ret = -EEXIST;
4367                         goto err_uninit;
4368                 }
4369         }
4370
4371         /* Fix illegal checksum combinations */
4372         if ((dev->features & NETIF_F_HW_CSUM) &&
4373             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4374                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4375                        dev->name);
4376                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4377         }
4378
4379         if ((dev->features & NETIF_F_NO_CSUM) &&
4380             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4381                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4382                        dev->name);
4383                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4384         }
4385
4386         dev->features = netdev_fix_features(dev->features, dev->name);
4387
4388         /* Enable software GSO if SG is supported. */
4389         if (dev->features & NETIF_F_SG)
4390                 dev->features |= NETIF_F_GSO;
4391
4392         netdev_initialize_kobject(dev);
4393         ret = netdev_register_kobject(dev);
4394         if (ret)
4395                 goto err_uninit;
4396         dev->reg_state = NETREG_REGISTERED;
4397
4398         /*
4399          *      Default initial state at registry is that the
4400          *      device is present.
4401          */
4402
4403         set_bit(__LINK_STATE_PRESENT, &dev->state);
4404
4405         dev_init_scheduler(dev);
4406         dev_hold(dev);
4407         list_netdevice(dev);
4408
4409         /* Notify protocols, that a new device appeared. */
4410         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4411         ret = notifier_to_errno(ret);
4412         if (ret) {
4413                 rollback_registered(dev);
4414                 dev->reg_state = NETREG_UNREGISTERED;
4415         }
4416
4417 out:
4418         return ret;
4419
4420 err_uninit:
4421         if (dev->netdev_ops->ndo_uninit)
4422                 dev->netdev_ops->ndo_uninit(dev);
4423         goto out;
4424 }
4425
4426 /**
4427  *      register_netdev - register a network device
4428  *      @dev: device to register
4429  *
4430  *      Take a completed network device structure and add it to the kernel
4431  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4432  *      chain. 0 is returned on success. A negative errno code is returned
4433  *      on a failure to set up the device, or if the name is a duplicate.
4434  *
4435  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
4436  *      and expands the device name if you passed a format string to
4437  *      alloc_netdev.
4438  */
4439 int register_netdev(struct net_device *dev)
4440 {
4441         int err;
4442
4443         rtnl_lock();
4444
4445         /*
4446          * If the name is a format string the caller wants us to do a
4447          * name allocation.
4448          */
4449         if (strchr(dev->name, '%')) {
4450                 err = dev_alloc_name(dev, dev->name);
4451                 if (err < 0)
4452                         goto out;
4453         }
4454
4455         err = register_netdevice(dev);
4456 out:
4457         rtnl_unlock();
4458         return err;
4459 }
4460 EXPORT_SYMBOL(register_netdev);
4461
4462 /*
4463  * netdev_wait_allrefs - wait until all references are gone.
4464  *
4465  * This is called when unregistering network devices.
4466  *
4467  * Any protocol or device that holds a reference should register
4468  * for netdevice notification, and cleanup and put back the
4469  * reference if they receive an UNREGISTER event.
4470  * We can get stuck here if buggy protocols don't correctly
4471  * call dev_put.
4472  */
4473 static void netdev_wait_allrefs(struct net_device *dev)
4474 {
4475         unsigned long rebroadcast_time, warning_time;
4476
4477         rebroadcast_time = warning_time = jiffies;
4478         while (atomic_read(&dev->refcnt) != 0) {
4479                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4480                         rtnl_lock();
4481
4482                         /* Rebroadcast unregister notification */
4483                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4484
4485                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4486                                      &dev->state)) {
4487                                 /* We must not have linkwatch events
4488                                  * pending on unregister. If this
4489                                  * happens, we simply run the queue
4490                                  * unscheduled, resulting in a noop
4491                                  * for this device.
4492                                  */
4493                                 linkwatch_run_queue();
4494                         }
4495
4496                         __rtnl_unlock();
4497
4498                         rebroadcast_time = jiffies;
4499                 }
4500
4501                 msleep(250);
4502
4503                 if (time_after(jiffies, warning_time + 10 * HZ)) {
4504                         printk(KERN_EMERG "unregister_netdevice: "
4505                                "waiting for %s to become free. Usage "
4506                                "count = %d\n",
4507                                dev->name, atomic_read(&dev->refcnt));
4508                         warning_time = jiffies;
4509                 }
4510         }
4511 }
4512
4513 /* The sequence is:
4514  *
4515  *      rtnl_lock();
4516  *      ...
4517  *      register_netdevice(x1);
4518  *      register_netdevice(x2);
4519  *      ...
4520  *      unregister_netdevice(y1);
4521  *      unregister_netdevice(y2);
4522  *      ...
4523  *      rtnl_unlock();
4524  *      free_netdev(y1);
4525  *      free_netdev(y2);
4526  *
4527  * We are invoked by rtnl_unlock().
4528  * This allows us to deal with problems:
4529  * 1) We can delete sysfs objects which invoke hotplug
4530  *    without deadlocking with linkwatch via keventd.
4531  * 2) Since we run with the RTNL semaphore not held, we can sleep
4532  *    safely in order to wait for the netdev refcnt to drop to zero.
4533  *
4534  * We must not return until all unregister events added during
4535  * the interval the lock was held have been completed.
4536  */
4537 void netdev_run_todo(void)
4538 {
4539         struct list_head list;
4540
4541         /* Snapshot list, allow later requests */
4542         list_replace_init(&net_todo_list, &list);
4543
4544         __rtnl_unlock();
4545
4546         while (!list_empty(&list)) {
4547                 struct net_device *dev
4548                         = list_entry(list.next, struct net_device, todo_list);
4549                 list_del(&dev->todo_list);
4550
4551                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4552                         printk(KERN_ERR "network todo '%s' but state %d\n",
4553                                dev->name, dev->reg_state);
4554                         dump_stack();
4555                         continue;
4556                 }
4557
4558                 dev->reg_state = NETREG_UNREGISTERED;
4559
4560                 on_each_cpu(flush_backlog, dev, 1);
4561
4562                 netdev_wait_allrefs(dev);
4563
4564                 /* paranoia */
4565                 BUG_ON(atomic_read(&dev->refcnt));
4566                 WARN_ON(dev->ip_ptr);
4567                 WARN_ON(dev->ip6_ptr);
4568                 WARN_ON(dev->dn_ptr);
4569
4570                 if (dev->destructor)
4571                         dev->destructor(dev);
4572
4573                 /* Free network device */
4574                 kobject_put(&dev->dev.kobj);
4575         }
4576 }
4577
4578 /**
4579  *      dev_get_stats   - get network device statistics
4580  *      @dev: device to get statistics from
4581  *
4582  *      Get network statistics from device. The device driver may provide
4583  *      its own method by setting dev->netdev_ops->get_stats; otherwise
4584  *      the internal statistics structure is used.
4585  */
4586 const struct net_device_stats *dev_get_stats(struct net_device *dev)
4587  {
4588         const struct net_device_ops *ops = dev->netdev_ops;
4589
4590         if (ops->ndo_get_stats)
4591                 return ops->ndo_get_stats(dev);
4592         else
4593                 return &dev->stats;
4594 }
4595 EXPORT_SYMBOL(dev_get_stats);
4596
4597 static void netdev_init_one_queue(struct net_device *dev,
4598                                   struct netdev_queue *queue,
4599                                   void *_unused)
4600 {
4601         queue->dev = dev;
4602 }
4603
4604 static void netdev_init_queues(struct net_device *dev)
4605 {
4606         netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4607         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4608         spin_lock_init(&dev->tx_global_lock);
4609 }
4610
4611 /**
4612  *      alloc_netdev_mq - allocate network device
4613  *      @sizeof_priv:   size of private data to allocate space for
4614  *      @name:          device name format string
4615  *      @setup:         callback to initialize device
4616  *      @queue_count:   the number of subqueues to allocate
4617  *
4618  *      Allocates a struct net_device with private data area for driver use
4619  *      and performs basic initialization.  Also allocates subquue structs
4620  *      for each queue on the device at the end of the netdevice.
4621  */
4622 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4623                 void (*setup)(struct net_device *), unsigned int queue_count)
4624 {
4625         struct netdev_queue *tx;
4626         struct net_device *dev;
4627         size_t alloc_size;
4628         void *p;
4629
4630         BUG_ON(strlen(name) >= sizeof(dev->name));
4631
4632         alloc_size = sizeof(struct net_device);
4633         if (sizeof_priv) {
4634                 /* ensure 32-byte alignment of private area */
4635                 alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4636                 alloc_size += sizeof_priv;
4637         }
4638         /* ensure 32-byte alignment of whole construct */
4639         alloc_size += NETDEV_ALIGN_CONST;
4640
4641         p = kzalloc(alloc_size, GFP_KERNEL);
4642         if (!p) {
4643                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4644                 return NULL;
4645         }
4646
4647         tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4648         if (!tx) {
4649                 printk(KERN_ERR "alloc_netdev: Unable to allocate "
4650                        "tx qdiscs.\n");
4651                 kfree(p);
4652                 return NULL;
4653         }
4654
4655         dev = (struct net_device *)
4656                 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4657         dev->padded = (char *)dev - (char *)p;
4658         dev_net_set(dev, &init_net);
4659
4660         dev->_tx = tx;
4661         dev->num_tx_queues = queue_count;
4662         dev->real_num_tx_queues = queue_count;
4663
4664         dev->gso_max_size = GSO_MAX_SIZE;
4665
4666         netdev_init_queues(dev);
4667
4668         INIT_LIST_HEAD(&dev->napi_list);
4669         setup(dev);
4670         strcpy(dev->name, name);
4671         return dev;
4672 }
4673 EXPORT_SYMBOL(alloc_netdev_mq);
4674
4675 /**
4676  *      free_netdev - free network device
4677  *      @dev: device
4678  *
4679  *      This function does the last stage of destroying an allocated device
4680  *      interface. The reference to the device object is released.
4681  *      If this is the last reference then it will be freed.
4682  */
4683 void free_netdev(struct net_device *dev)
4684 {
4685         struct napi_struct *p, *n;
4686
4687         release_net(dev_net(dev));
4688
4689         kfree(dev->_tx);
4690
4691         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
4692                 netif_napi_del(p);
4693
4694         /*  Compatibility with error handling in drivers */
4695         if (dev->reg_state == NETREG_UNINITIALIZED) {
4696                 kfree((char *)dev - dev->padded);
4697                 return;
4698         }
4699
4700         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4701         dev->reg_state = NETREG_RELEASED;
4702
4703         /* will free via device release */
4704         put_device(&dev->dev);
4705 }
4706
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits, via synchronize_rcu(), until packets currently being
 *	received are done.  Packets that start later are not blocked.
 *	May sleep.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
4718
/**
 *	unregister_netdevice - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.  The remaining teardown is queued on the todo list and
 *	completed after the rtnl semaphore is released.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	rollback_registered(dev);

	/* Finish processing unregister after unlock */
	net_set_todo(dev);
}
4738
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Thin wrapper that calls unregister_netdevice() with the rtnl
 *	semaphore held.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
4758
4759 /**
4760  *      dev_change_net_namespace - move device to different nethost namespace
4761  *      @dev: device
4762  *      @net: network namespace
4763  *      @pat: If not NULL name pattern to try if the current device name
4764  *            is already taken in the destination network namespace.
4765  *
4766  *      This function shuts down a device interface and moves it
4767  *      to a new network namespace. On success 0 is returned, on
4768  *      a failure a netagive errno code is returned.
4769  *
4770  *      Callers must hold the rtnl semaphore.
4771  */
4772
4773 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4774 {
4775         char buf[IFNAMSIZ];
4776         const char *destname;
4777         int err;
4778
4779         ASSERT_RTNL();
4780
4781         /* Don't allow namespace local devices to be moved. */
4782         err = -EINVAL;
4783         if (dev->features & NETIF_F_NETNS_LOCAL)
4784                 goto out;
4785
4786 #ifdef CONFIG_SYSFS
4787         /* Don't allow real devices to be moved when sysfs
4788          * is enabled.
4789          */
4790         err = -EINVAL;
4791         if (dev->dev.parent)
4792                 goto out;
4793 #endif
4794
4795         /* Ensure the device has been registrered */
4796         err = -EINVAL;
4797         if (dev->reg_state != NETREG_REGISTERED)
4798                 goto out;
4799
4800         /* Get out if there is nothing todo */
4801         err = 0;
4802         if (net_eq(dev_net(dev), net))
4803                 goto out;
4804
4805         /* Pick the destination device name, and ensure
4806          * we can use it in the destination network namespace.
4807          */
4808         err = -EEXIST;
4809         destname = dev->name;
4810         if (__dev_get_by_name(net, destname)) {
4811                 /* We get here if we can't use the current device name */
4812                 if (!pat)
4813                         goto out;
4814                 if (!dev_valid_name(pat))
4815                         goto out;
4816                 if (strchr(pat, '%')) {
4817                         if (__dev_alloc_name(net, pat, buf) < 0)
4818                                 goto out;
4819                         destname = buf;
4820                 } else
4821                         destname = pat;
4822                 if (__dev_get_by_name(net, destname))
4823                         goto out;
4824         }
4825
4826         /*
4827          * And now a mini version of register_netdevice unregister_netdevice.
4828          */
4829
4830         /* If device is running close it first. */
4831         dev_close(dev);
4832
4833         /* And unlink it from device chain */
4834         err = -ENODEV;
4835         unlist_netdevice(dev);
4836
4837         synchronize_net();
4838
4839         /* Shutdown queueing discipline. */
4840         dev_shutdown(dev);
4841
4842         /* Notify protocols, that we are about to destroy
4843            this device. They should clean all the things.
4844         */
4845         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4846
4847         /*
4848          *      Flush the unicast and multicast chains
4849          */
4850         dev_addr_discard(dev);
4851
4852         netdev_unregister_kobject(dev);
4853
4854         /* Actually switch the network namespace */
4855         dev_net_set(dev, net);
4856
4857         /* Assign the new device name */
4858         if (destname != dev->name)
4859                 strcpy(dev->name, destname);
4860
4861         /* If there is an ifindex conflict assign a new one */
4862         if (__dev_get_by_index(net, dev->ifindex)) {
4863                 int iflink = (dev->iflink == dev->ifindex);
4864                 dev->ifindex = dev_new_index(net);
4865                 if (iflink)
4866                         dev->iflink = dev->ifindex;
4867         }
4868
4869         /* Fixup kobjects */
4870         err = netdev_register_kobject(dev);
4871         WARN_ON(err);
4872
4873         /* Add the device back in the hashes */
4874         list_netdevice(dev);
4875
4876         /* Notify protocols, that a new device appeared. */
4877         call_netdevice_notifiers(NETDEV_REGISTER, dev);
4878
4879         synchronize_net();
4880         err = 0;
4881 out:
4882         return err;
4883 }
4884
4885 static int dev_cpu_callback(struct notifier_block *nfb,
4886                             unsigned long action,
4887                             void *ocpu)
4888 {
4889         struct sk_buff **list_skb;
4890         struct Qdisc **list_net;
4891         struct sk_buff *skb;
4892         unsigned int cpu, oldcpu = (unsigned long)ocpu;
4893         struct softnet_data *sd, *oldsd;
4894
4895         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4896                 return NOTIFY_OK;
4897
4898         local_irq_disable();
4899         cpu = smp_processor_id();
4900         sd = &per_cpu(softnet_data, cpu);
4901         oldsd = &per_cpu(softnet_data, oldcpu);
4902
4903         /* Find end of our completion_queue. */
4904         list_skb = &sd->completion_queue;
4905         while (*list_skb)
4906                 list_skb = &(*list_skb)->next;
4907         /* Append completion queue from offline CPU. */
4908         *list_skb = oldsd->completion_queue;
4909         oldsd->completion_queue = NULL;
4910
4911         /* Find end of our output_queue. */
4912         list_net = &sd->output_queue;
4913         while (*list_net)
4914                 list_net = &(*list_net)->next_sched;
4915         /* Append output queue from offline CPU. */
4916         *list_net = oldsd->output_queue;
4917         oldsd->output_queue = NULL;
4918
4919         raise_softirq_irqoff(NET_TX_SOFTIRQ);
4920         local_irq_enable();
4921
4922         /* Process offline CPU's input_pkt_queue */
4923         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4924                 netif_rx(skb);
4925
4926         return NOTIFY_OK;
4927 }
4928
4929
4930 /**
4931  *      netdev_increment_features - increment feature set by one
4932  *      @all: current feature set
4933  *      @one: new feature set
4934  *      @mask: mask feature set
4935  *
4936  *      Computes a new feature set after adding a device with feature set
4937  *      @one to the master device with current feature set @all.  Will not
4938  *      enable anything that is off in @mask. Returns the new feature set.
4939  */
4940 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
4941                                         unsigned long mask)
4942 {
4943         /* If device needs checksumming, downgrade to it. */
4944         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4945                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
4946         else if (mask & NETIF_F_ALL_CSUM) {
4947                 /* If one device supports v4/v6 checksumming, set for all. */
4948                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
4949                     !(all & NETIF_F_GEN_CSUM)) {
4950                         all &= ~NETIF_F_ALL_CSUM;
4951                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
4952                 }
4953
4954                 /* If one device supports hw checksumming, set for all. */
4955                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
4956                         all &= ~NETIF_F_ALL_CSUM;
4957                         all |= NETIF_F_HW_CSUM;
4958                 }
4959         }
4960
4961         one |= NETIF_F_ALL_CSUM;
4962
4963         one |= all & NETIF_F_ONE_FOR_ALL;
4964         all &= one | NETIF_F_LLTX | NETIF_F_GSO;
4965         all |= one & mask & NETIF_F_ONE_FOR_ALL;
4966
4967         return all;
4968 }
4969 EXPORT_SYMBOL(netdev_increment_features);
4970
4971 static struct hlist_head *netdev_create_hash(void)
4972 {
4973         int i;
4974         struct hlist_head *hash;
4975
4976         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4977         if (hash != NULL)
4978                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
4979                         INIT_HLIST_HEAD(&hash[i]);
4980
4981         return hash;
4982 }
4983
4984 /* Initialize per network namespace state */
4985 static int __net_init netdev_init(struct net *net)
4986 {
4987         INIT_LIST_HEAD(&net->dev_base_head);
4988
4989         net->dev_name_head = netdev_create_hash();
4990         if (net->dev_name_head == NULL)
4991                 goto err_name;
4992
4993         net->dev_index_head = netdev_create_hash();
4994         if (net->dev_index_head == NULL)
4995                 goto err_idx;
4996
4997         return 0;
4998
4999 err_idx:
5000         kfree(net->dev_name_head);
5001 err_name:
5002         return -ENOMEM;
5003 }
5004
5005 /**
5006  *      netdev_drivername - network driver for the device
5007  *      @dev: network device
5008  *      @buffer: buffer for resulting name
5009  *      @len: size of buffer
5010  *
5011  *      Determine network driver for device.
5012  */
5013 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5014 {
5015         const struct device_driver *driver;
5016         const struct device *parent;
5017
5018         if (len <= 0 || !buffer)
5019                 return buffer;
5020         buffer[0] = 0;
5021
5022         parent = dev->dev.parent;
5023
5024         if (!parent)
5025                 return buffer;
5026
5027         driver = parent->driver;
5028         if (driver && driver->name)
5029                 strlcpy(buffer, driver->name, len);
5030         return buffer;
5031 }
5032
5033 static void __net_exit netdev_exit(struct net *net)
5034 {
5035         kfree(net->dev_name_head);
5036         kfree(net->dev_index_head);
5037 }
5038
5039 static struct pernet_operations __net_initdata netdev_net_ops = {
5040         .init = netdev_init,
5041         .exit = netdev_exit,
5042 };
5043
5044 static void __net_exit default_device_exit(struct net *net)
5045 {
5046         struct net_device *dev;
5047         /*
5048          * Push all migratable of the network devices back to the
5049          * initial network namespace
5050          */
5051         rtnl_lock();
5052 restart:
5053         for_each_netdev(net, dev) {
5054                 int err;
5055                 char fb_name[IFNAMSIZ];
5056
5057                 /* Ignore unmoveable devices (i.e. loopback) */
5058                 if (dev->features & NETIF_F_NETNS_LOCAL)
5059                         continue;
5060
5061                 /* Delete virtual devices */
5062                 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5063                         dev->rtnl_link_ops->dellink(dev);
5064                         goto restart;
5065                 }
5066
5067                 /* Push remaing network devices to init_net */
5068                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5069                 err = dev_change_net_namespace(dev, &init_net, fb_name);
5070                 if (err) {
5071                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5072                                 __func__, dev->name, err);
5073                         BUG();
5074                 }
5075                 goto restart;
5076         }
5077         rtnl_unlock();
5078 }
5079
5080 static struct pernet_operations __net_initdata default_device_ops = {
5081         .exit = default_device_exit,
5082 };
5083
5084 /*
5085  *      Initialize the DEV module. At boot time this walks the device list and
5086  *      unhooks any devices that fail to initialise (normally hardware not
5087  *      present) and leaves us with a valid list of present and active devices.
5088  *
5089  */
5090
5091 /*
5092  *       This is called single threaded during boot, so no need
5093  *       to take the rtnl semaphore.
5094  */
5095 static int __init net_dev_init(void)
5096 {
5097         int i, rc = -ENOMEM;
5098
5099         BUG_ON(!dev_boot_phase);
5100
5101         if (dev_proc_init())
5102                 goto out;
5103
5104         if (netdev_kobject_init())
5105                 goto out;
5106
5107         INIT_LIST_HEAD(&ptype_all);
5108         for (i = 0; i < PTYPE_HASH_SIZE; i++)
5109                 INIT_LIST_HEAD(&ptype_base[i]);
5110
5111         if (register_pernet_subsys(&netdev_net_ops))
5112                 goto out;
5113
5114         /*
5115          *      Initialise the packet receive queues.
5116          */
5117
5118         for_each_possible_cpu(i) {
5119                 struct softnet_data *queue;
5120
5121                 queue = &per_cpu(softnet_data, i);
5122                 skb_queue_head_init(&queue->input_pkt_queue);
5123                 queue->completion_queue = NULL;
5124                 INIT_LIST_HEAD(&queue->poll_list);
5125
5126                 queue->backlog.poll = process_backlog;
5127                 queue->backlog.weight = weight_p;
5128                 queue->backlog.gro_list = NULL;
5129         }
5130
5131         dev_boot_phase = 0;
5132
5133         /* The loopback device is special if any other network devices
5134          * is present in a network namespace the loopback device must
5135          * be present. Since we now dynamically allocate and free the
5136          * loopback device ensure this invariant is maintained by
5137          * keeping the loopback device as the first device on the
5138          * list of network devices.  Ensuring the loopback devices
5139          * is the first device that appears and the last network device
5140          * that disappears.
5141          */
5142         if (register_pernet_device(&loopback_net_ops))
5143                 goto out;
5144
5145         if (register_pernet_device(&default_device_ops))
5146                 goto out;
5147
5148         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5149         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5150
5151         hotcpu_notifier(dev_cpu_callback, 0);
5152         dst_init();
5153         dev_mcast_init();
5154         #ifdef CONFIG_NET_DMA
5155         dmaengine_get();
5156         #endif
5157         rc = 0;
5158 out:
5159         return rc;
5160 }
5161
5162 subsys_initcall(net_dev_init);
5163
/* Symbols exported to modules, in alphabetical order. */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_get_flags);
EXPORT_SYMBOL(dev_load);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_valid_name);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);

/* Bridge hooks are exported only when bridging is built in or modular. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
#endif

EXPORT_PER_CPU_SYMBOL(softnet_data);