net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/mutex.h>
  84 #include <linux/string.h>
  85 #include <linux/mm.h>
  86 #include <linux/socket.h>
  87 #include <linux/sockios.h>
  88 #include <linux/errno.h>
  89 #include <linux/interrupt.h>
  90 #include <linux/if_ether.h>
  91 #include <linux/netdevice.h>
  92 #include <linux/etherdevice.h>
  93 #include <linux/notifier.h>
  94 #include <linux/skbuff.h>
  95 #include <net/net_namespace.h>
  96 #include <net/sock.h>
  97 #include <linux/rtnetlink.h>
  98 #include <linux/proc_fs.h>
  99 #include <linux/seq_file.h>
 100 #include <linux/stat.h>
 101 #include <linux/if_bridge.h>
 102 #include <linux/if_macvlan.h>
 103 #include <net/dst.h>
 104 #include <net/pkt_sched.h>
 105 #include <net/checksum.h>
 106 #include <linux/highmem.h>
 107 #include <linux/init.h>
 108 #include <linux/kmod.h>
 109 #include <linux/module.h>
 110 #include <linux/kallsyms.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122
 123 #include "net-sysfs.h"
 124
 125 /*
 126  *      The list of packet types we will receive (as opposed to discard)
 127  *      and the routines to invoke.
 128  *
 129  *      Why 16. Because with 16 the only overlap we get on a hash of the
 130  *      low nibble of the protocol value is RARP/SNAP/X.25.
 131  *
 132  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 133  *             sure which should go first, but I bet it won't make much
 134  *             difference if we are running VLANs.  The good news is that
 135  *             this protocol won't be in the list unless compiled in, so
 136  *             the average user (w/out VLANs) will not be adversely affected.
 137  *             --BLG
 138  *
 139  *              0800    IP
 140  *              8100    802.1Q VLAN
 141  *              0001    802.3
 142  *              0002    AX.25
 143  *              0004    802.2
 144  *              8035    RARP
 145  *              0005    SNAP
 146  *              0805    X.25
 147  *              0806    ARP
 148  *              8137    IPX
 149  *              0009    Localtalk
 150  *              86DD    IPv6
 151  */
 152
/* Number of buckets in the ptype_base hash table. */
#define PTYPE_HASH_SIZE (16)
/* Mask that reduces a protocol value to a ptype_base bucket index. */
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

/* Held while entries are added to or removed from the lists below. */
static DEFINE_SPINLOCK(ptype_lock);
/* Handlers for specific protocol types, hashed on the protocol value. */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;        /* Taps */
 159
#ifdef CONFIG_NET_DMA
/*
 * State the networking core keeps as a client of the DMA engine.
 * NOTE(review): lock, channel_mask and channels are only used outside
 * this part of the file -- confirm their exact roles there.
 */
struct net_dma {
        struct dma_client client;
        spinlock_t lock;
        cpumask_t channel_mask;
        struct dma_chan **channels;
};

/* Forward declaration, needed by the net_dma initializer below. */
static enum dma_state_client
netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
        enum dma_state state);

/* The single instance; installs netdev_dma_event() as the client callback. */
static struct net_dma net_dma = {
        .client = {
                .event_callback = netdev_dma_event,
        },
};
#endif
 178
 179 /*
 180  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 181  * semaphore.
 182  *
 183  * Pure readers hold dev_base_lock for reading.
 184  *
 185  * Writers must hold the rtnl semaphore while they loop through the
 186  * dev_base_head list, and hold dev_base_lock for writing when they do the
 187  * actual updates.  This allows pure readers to access the list even
 188  * while a writer is preparing to update it.
 189  *
 190  * To put it another way, dev_base_lock is held for writing only to
 191  * protect against pure readers; the rtnl semaphore provides the
 192  * protection against other writers.
 193  *
 194  * See, for example usages, register_netdevice() and
 195  * unregister_netdevice(), which must be called with the rtnl
 196  * semaphore held.
 197  */
DEFINE_RWLOCK(dev_base_lock);

EXPORT_SYMBOL(dev_base_lock);

/*
 * Bucket selection in dev_name_hash() and dev_index_hash() keeps the low
 * NETDEV_HASHBITS bits of the hash value.
 */
#define NETDEV_HASHBITS 8
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 204
 205 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 206 {
 207         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 208         return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 209 }
 210
 211 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 212 {
 213         return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 214 }
 215
 216 /* Device list insertion */
 217 static int list_netdevice(struct net_device *dev)
 218 {
 219         struct net *net = dev_net(dev);
 220
 221         ASSERT_RTNL();
 222
 223         write_lock_bh(&dev_base_lock);
 224         list_add_tail(&dev->dev_list, &net->dev_base_head);
 225         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 226         hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 227         write_unlock_bh(&dev_base_lock);
 228         return 0;
 229 }
 230
 231 /* Device list removal */
 232 static void unlist_netdevice(struct net_device *dev)
 233 {
 234         ASSERT_RTNL();
 235
 236         /* Unlink dev from the device chain */
 237         write_lock_bh(&dev_base_lock);
 238         list_del(&dev->dev_list);
 239         hlist_del(&dev->name_hlist);
 240         hlist_del(&dev->index_hlist);
 241         write_unlock_bh(&dev_base_lock);
 242 }
 243
/*
 *      Our notifier list.
 *
 *      NOTE(review): declared as a raw notifier head, so presumably the
 *      callers provide the serialisation themselves -- confirm against
 *      the register/unregister/call sites.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 *
 *      There is one softnet_data instance per CPU.
 */

DEFINE_PER_CPU(struct softnet_data, softnet_data);
 256
 257 #ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * register_netdevice() inits dev->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: entry i
 * of the name table is the lockdep class name used for the device type in
 * entry i of the type table.  Keep both in the same order and of the same
 * length.  The last entry (ARPHRD_NONE) is also the fallback that
 * netdev_lock_pos() returns for any type not listed here.
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
         ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
         ARPHRD_NONE};

/* Lockdep class names, index-for-index with netdev_lock_type[]. */
static const char *netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
         "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
         "_xmit_NONE"};

/* One lock class key per entry of netdev_lock_type[]. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 297
 298 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 299 {
 300         int i;
 301
 302         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 303                 if (netdev_lock_type[i] == dev_type)
 304                         return i;
 305         /* the last key is used by default */
 306         return ARRAY_SIZE(netdev_lock_type) - 1;
 307 }
 308
 309 static inline void netdev_set_lockdep_class(spinlock_t *lock,
 310                                             unsigned short dev_type)
 311 {
 312         int i;
 313
 314         i = netdev_lock_pos(dev_type);
 315         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 316                                    netdev_lock_name[i]);
 317 }
 318 #else
/* Without CONFIG_DEBUG_LOCK_ALLOC there is no lock class to set: no-op. */
static inline void netdev_set_lockdep_class(spinlock_t *lock,
                                            unsigned short dev_type)
{
}
 323 #endif
 324
 325 /*******************************************************************************
 326
 327                 Protocol management and registration routines
 328
 329 *******************************************************************************/
 330
 331 /*
 332  *      Add a protocol ID to the list. Now that the input handler is
 333  *      smarter we can dispense with all the messy stuff that used to be
 334  *      here.
 335  *
 336  *      BEWARE!!! Protocol handlers, mangling input packets,
 337  *      MUST BE last in hash buckets and checking protocol handlers
 338  *      MUST start from promiscuous ptype_all chain in net_bh.
 339  *      It is true now, do not change it.
 340  *      Explanation follows: if protocol handler, mangling packet, will
 341  *      be the first on list, it is not able to sense, that packet
 342  *      is cloned and should be copied-on-write, so that it will
 343  *      change it and subsequent readers will get broken packet.
 344  *                                                      --ANK (980803)
 345  */
 346
 347 /**
 348  *      dev_add_pack - add packet handler
 349  *      @pt: packet type declaration
 350  *
 351  *      Add a protocol handler to the networking stack. The passed &packet_type
 352  *      is linked into kernel lists and may not be freed until it has been
 353  *      removed from the kernel lists.
 354  *
 355  *      This call does not sleep therefore it can not
 356  *      guarantee all CPU's that are in middle of receiving packets
 357  *      will see the new packet type (until the next received packet).
 358  */
 359
 360 void dev_add_pack(struct packet_type *pt)
 361 {
 362         int hash;
 363
 364         spin_lock_bh(&ptype_lock);
 365         if (pt->type == htons(ETH_P_ALL))
 366                 list_add_rcu(&pt->list, &ptype_all);
 367         else {
 368                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 369                 list_add_rcu(&pt->list, &ptype_base[hash]);
 370         }
 371         spin_unlock_bh(&ptype_lock);
 372 }
 373
 374 /**
 375  *      __dev_remove_pack        - remove packet handler
 376  *      @pt: packet type declaration
 377  *
 378  *      Remove a protocol handler that was previously added to the kernel
 379  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 380  *      from the kernel lists and can be freed or reused once this function
 381  *      returns.
 382  *
 383  *      The packet type might still be in use by receivers
 384  *      and must not be freed until after all the CPU's have gone
 385  *      through a quiescent state.
 386  */
 387 void __dev_remove_pack(struct packet_type *pt)
 388 {
 389         struct list_head *head;
 390         struct packet_type *pt1;
 391
 392         spin_lock_bh(&ptype_lock);
 393
 394         if (pt->type == htons(ETH_P_ALL))
 395                 head = &ptype_all;
 396         else
 397                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 398
 399         list_for_each_entry(pt1, head, list) {
 400                 if (pt == pt1) {
 401                         list_del_rcu(&pt->list);
 402                         goto out;
 403                 }
 404         }
 405
 406         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 407 out:
 408         spin_unlock_bh(&ptype_lock);
 409 }
 410 /**
 411  *      dev_remove_pack  - remove packet handler
 412  *      @pt: packet type declaration
 413  *
 414  *      Remove a protocol handler that was previously added to the kernel
 415  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 416  *      from the kernel lists and can be freed or reused once this function
 417  *      returns.
 418  *
 419  *      This call sleeps to guarantee that no CPU is looking at the packet
 420  *      type after return.
 421  */
 422 void dev_remove_pack(struct packet_type *pt)
 423 {
 424         __dev_remove_pack(pt);
 425
 426         synchronize_net();
 427 }
 428
 429 /******************************************************************************
 430
 431                       Device Boot-time Settings Routines
 432
 433 *******************************************************************************/
 434
/*
 * Boot time configuration table.  A slot whose name starts with '\0' or
 * ' ' is treated as unused by netdev_boot_setup_add() and
 * netdev_boot_setup_check().
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 437
 438 /**
 439  *      netdev_boot_setup_add   - add new setup entry
 440  *      @name: name of the device
 441  *      @map: configured settings for the device
 442  *
 443  *      Adds new setup entry to the dev_boot_setup list.  The function
 444  *      returns 0 on error and 1 on success.  This is a generic routine to
 445  *      all netdevices.
 446  */
 447 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 448 {
 449         struct netdev_boot_setup *s;
 450         int i;
 451
 452         s = dev_boot_setup;
 453         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 454                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 455                         memset(s[i].name, 0, sizeof(s[i].name));
 456                         strcpy(s[i].name, name);
 457                         memcpy(&s[i].map, map, sizeof(s[i].map));
 458                         break;
 459                 }
 460         }
 461
 462         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 463 }
 464
 465 /**
 466  *      netdev_boot_setup_check - check boot time settings
 467  *      @dev: the netdevice
 468  *
 469  *      Check boot time settings for the device.
 470  *      The found settings are set for the device to be used
 471  *      later in the device probing.
 472  *      Returns 0 if no settings found, 1 if they are.
 473  */
 474 int netdev_boot_setup_check(struct net_device *dev)
 475 {
 476         struct netdev_boot_setup *s = dev_boot_setup;
 477         int i;
 478
 479         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 480                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 481                     !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
 482                         dev->irq        = s[i].map.irq;
 483                         dev->base_addr  = s[i].map.base_addr;
 484                         dev->mem_start  = s[i].map.mem_start;
 485                         dev->mem_end    = s[i].map.mem_end;
 486                         return 1;
 487                 }
 488         }
 489         return 0;
 490 }
 491
 492
 493 /**
 494  *      netdev_boot_base        - get address from boot time settings
 495  *      @prefix: prefix for network device
 496  *      @unit: id for network device
 497  *
 498  *      Check boot time settings for the base address of device.
 499  *      The found settings are set for the device to be used
 500  *      later in the device probing.
 501  *      Returns 0 if no settings found.
 502  */
 503 unsigned long netdev_boot_base(const char *prefix, int unit)
 504 {
 505         const struct netdev_boot_setup *s = dev_boot_setup;
 506         char name[IFNAMSIZ];
 507         int i;
 508
 509         sprintf(name, "%s%d", prefix, unit);
 510
 511         /*
 512          * If device already registered then return base of 1
 513          * to indicate not to probe for this interface
 514          */
 515         if (__dev_get_by_name(&init_net, name))
 516                 return 1;
 517
 518         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 519                 if (!strcmp(name, s[i].name))
 520                         return s[i].map.base_addr;
 521         return 0;
 522 }
 523
 524 /*
 525  * Saves at boot time configured settings for any netdevice.
 526  */
 527 int __init netdev_boot_setup(char *str)
 528 {
 529         int ints[5];
 530         struct ifmap map;
 531
 532         str = get_options(str, ARRAY_SIZE(ints), ints);
 533         if (!str || !*str)
 534                 return 0;
 535
 536         /* Save settings */
 537         memset(&map, 0, sizeof(map));
 538         if (ints[0] > 0)
 539                 map.irq = ints[1];
 540         if (ints[0] > 1)
 541                 map.base_addr = ints[2];
 542         if (ints[0] > 2)
 543                 map.mem_start = ints[3];
 544         if (ints[0] > 3)
 545                 map.mem_end = ints[4];
 546
 547         /* Add new entry to the list */
 548         return netdev_boot_setup_add(str, &map);
 549 }
 550
 551 __setup("netdev=", netdev_boot_setup);
 552
 553 /*******************************************************************************
 554
 555                             Device Interface Subroutines
 556
 557 *******************************************************************************/
 558
 559 /**
 560  *      __dev_get_by_name       - find a device by its name
 561  *      @net: the applicable net namespace
 562  *      @name: name to find
 563  *
 564  *      Find an interface by name. Must be called under RTNL semaphore
 565  *      or @dev_base_lock. If the name is found a pointer to the device
 566  *      is returned. If the name is not found then %NULL is returned. The
 567  *      reference counters are not incremented so the caller must be
 568  *      careful with locks.
 569  */
 570
 571 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 572 {
 573         struct hlist_node *p;
 574
 575         hlist_for_each(p, dev_name_hash(net, name)) {
 576                 struct net_device *dev
 577                         = hlist_entry(p, struct net_device, name_hlist);
 578                 if (!strncmp(dev->name, name, IFNAMSIZ))
 579                         return dev;
 580         }
 581         return NULL;
 582 }
 583
 584 /**
 585  *      dev_get_by_name         - find a device by its name
 586  *      @net: the applicable net namespace
 587  *      @name: name to find
 588  *
 589  *      Find an interface by name. This can be called from any
 590  *      context and does its own locking. The returned handle has
 591  *      the usage count incremented and the caller must use dev_put() to
 592  *      release it when it is no longer needed. %NULL is returned if no
 593  *      matching device is found.
 594  */
 595
 596 struct net_device *dev_get_by_name(struct net *net, const char *name)
 597 {
 598         struct net_device *dev;
 599
 600         read_lock(&dev_base_lock);
 601         dev = __dev_get_by_name(net, name);
 602         if (dev)
 603                 dev_hold(dev);
 604         read_unlock(&dev_base_lock);
 605         return dev;
 606 }
 607
 608 /**
 609  *      __dev_get_by_index - find a device by its ifindex
 610  *      @net: the applicable net namespace
 611  *      @ifindex: index of device
 612  *
 613  *      Search for an interface by index. Returns %NULL if the device
 614  *      is not found or a pointer to the device. The device has not
 615  *      had its reference counter increased so the caller must be careful
 616  *      about locking. The caller must hold either the RTNL semaphore
 617  *      or @dev_base_lock.
 618  */
 619
 620 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 621 {
 622         struct hlist_node *p;
 623
 624         hlist_for_each(p, dev_index_hash(net, ifindex)) {
 625                 struct net_device *dev
 626                         = hlist_entry(p, struct net_device, index_hlist);
 627                 if (dev->ifindex == ifindex)
 628                         return dev;
 629         }
 630         return NULL;
 631 }
 632
 633
 634 /**
 635  *      dev_get_by_index - find a device by its ifindex
 636  *      @net: the applicable net namespace
 637  *      @ifindex: index of device
 638  *
 639  *      Search for an interface by index. Returns NULL if the device
 640  *      is not found or a pointer to the device. The device returned has
 641  *      had a reference added and the pointer is safe until the user calls
 642  *      dev_put to indicate they have finished with it.
 643  */
 644
 645 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 646 {
 647         struct net_device *dev;
 648
 649         read_lock(&dev_base_lock);
 650         dev = __dev_get_by_index(net, ifindex);
 651         if (dev)
 652                 dev_hold(dev);
 653         read_unlock(&dev_base_lock);
 654         return dev;
 655 }
 656
 657 /**
 658  *      dev_getbyhwaddr - find a device by its hardware address
 659  *      @net: the applicable net namespace
 660  *      @type: media type of device
 661  *      @ha: hardware address
 662  *
 663  *      Search for an interface by MAC address. Returns NULL if the device
 664  *      is not found or a pointer to the device. The caller must hold the
 665  *      rtnl semaphore. The returned device has not had its ref count increased
 666  *      and the caller must therefore be careful about locking
 667  *
 668  *      BUGS:
 669  *      If the API was consistent this would be __dev_get_by_hwaddr
 670  */
 671
 672 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 673 {
 674         struct net_device *dev;
 675
 676         ASSERT_RTNL();
 677
 678         for_each_netdev(net, dev)
 679                 if (dev->type == type &&
 680                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 681                         return dev;
 682
 683         return NULL;
 684 }
 685
 686 EXPORT_SYMBOL(dev_getbyhwaddr);
 687
 688 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 689 {
 690         struct net_device *dev;
 691
 692         ASSERT_RTNL();
 693         for_each_netdev(net, dev)
 694                 if (dev->type == type)
 695                         return dev;
 696
 697         return NULL;
 698 }
 699
 700 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 701
 702 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 703 {
 704         struct net_device *dev;
 705
 706         rtnl_lock();
 707         dev = __dev_getfirstbyhwtype(net, type);
 708         if (dev)
 709                 dev_hold(dev);
 710         rtnl_unlock();
 711         return dev;
 712 }
 713
 714 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 715
 716 /**
 717  *      dev_get_by_flags - find any device with given flags
 718  *      @net: the applicable net namespace
 719  *      @if_flags: IFF_* values
 720  *      @mask: bitmask of bits in if_flags to check
 721  *
 722  *      Search for any interface with the given flags. Returns NULL if a device
 723  *      is not found or a pointer to the device. The device returned has
 724  *      had a reference added and the pointer is safe until the user calls
 725  *      dev_put to indicate they have finished with it.
 726  */
 727
 728 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
 729 {
 730         struct net_device *dev, *ret;
 731
 732         ret = NULL;
 733         read_lock(&dev_base_lock);
 734         for_each_netdev(net, dev) {
 735                 if (((dev->flags ^ if_flags) & mask) == 0) {
 736                         dev_hold(dev);
 737                         ret = dev;
 738                         break;
 739                 }
 740         }
 741         read_unlock(&dev_base_lock);
 742         return ret;
 743 }
 744
 745 /**
 746  *      dev_valid_name - check if name is okay for network device
 747  *      @name: name string
 748  *
 749  *      Network device names need to be valid file names to
 750  *      to allow sysfs to work.  We also disallow any kind of
 751  *      whitespace.
 752  */
 753 int dev_valid_name(const char *name)
 754 {
 755         if (*name == '\0')
 756                 return 0;
 757         if (strlen(name) >= IFNAMSIZ)
 758                 return 0;
 759         if (!strcmp(name, ".") || !strcmp(name, ".."))
 760                 return 0;
 761
 762         while (*name) {
 763                 if (*name == '/' || isspace(*name))
 764                         return 0;
 765                 name++;
 766         }
 767         return 1;
 768 }
 769
 770 /**
 771  *      __dev_alloc_name - allocate a name for a device
 772  *      @net: network namespace to allocate the device name in
 773  *      @name: name format string
 774  *      @buf:  scratch buffer and result name string
 775  *
 776  *      Passed a format string - eg "lt%d" it will try and find a suitable
 777  *      id. It scans list of devices to build up a free map, then chooses
 778  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 779  *      while allocating the name and adding the device in order to avoid
 780  *      duplicates.
 781  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 782  *      Returns the number of the unit assigned or a negative errno code.
 783  */
 784
 785 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 786 {
 787         int i = 0;
 788         const char *p;
 789         const int max_netdevices = 8*PAGE_SIZE;
 790         unsigned long *inuse;
 791         struct net_device *d;
 792
 793         p = strnchr(name, IFNAMSIZ-1, '%');
 794         if (p) {
 795                 /*
 796                  * Verify the string as this thing may have come from
 797                  * the user.  There must be either one "%d" and no other "%"
 798                  * characters.
 799                  */
 800                 if (p[1] != 'd' || strchr(p + 2, '%'))
 801                         return -EINVAL;
 802
 803                 /* Use one page as a bit array of possible slots */
 804                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 805                 if (!inuse)
 806                         return -ENOMEM;
 807
 808                 for_each_netdev(net, d) {
 809                         if (!sscanf(d->name, name, &i))
 810                                 continue;
 811                         if (i < 0 || i >= max_netdevices)
 812                                 continue;
 813
 814                         /*  avoid cases where sscanf is not exact inverse of printf */
 815                         snprintf(buf, IFNAMSIZ, name, i);
 816                         if (!strncmp(buf, d->name, IFNAMSIZ))
 817                                 set_bit(i, inuse);
 818                 }
 819
 820                 i = find_first_zero_bit(inuse, max_netdevices);
 821                 free_page((unsigned long) inuse);
 822         }
 823
 824         snprintf(buf, IFNAMSIZ, name, i);
 825         if (!__dev_get_by_name(net, buf))
 826                 return i;
 827
 828         /* It is possible to run out of possible slots
 829          * when the name is long and there isn't enough space left
 830          * for the digits, or if all bits are used.
 831          */
 832         return -ENFILE;
 833 }
 834
 835 /**
 836  *      dev_alloc_name - allocate a name for a device
 837  *      @dev: device
 838  *      @name: name format string
 839  *
 840  *      Passed a format string - eg "lt%d" it will try and find a suitable
 841  *      id. It scans list of devices to build up a free map, then chooses
 842  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 843  *      while allocating the name and adding the device in order to avoid
 844  *      duplicates.
 845  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 846  *      Returns the number of the unit assigned or a negative errno code.
 847  */
 848
 849 int dev_alloc_name(struct net_device *dev, const char *name)
 850 {
 851         char buf[IFNAMSIZ];
 852         struct net *net;
 853         int ret;
 854
 855         BUG_ON(!dev_net(dev));
 856         net = dev_net(dev);
 857         ret = __dev_alloc_name(net, name, buf);
 858         if (ret >= 0)
 859                 strlcpy(dev->name, buf, IFNAMSIZ);
 860         return ret;
 861 }
 862
 863
 864 /**
 865  *      dev_change_name - change name of a device
 866  *      @dev: device
 867  *      @newname: name (or format string) must be at least IFNAMSIZ
 868  *
 869  *      Change name of a device, can pass format strings "eth%d".
 870  *      for wildcarding.
 871  */
 872 int dev_change_name(struct net_device *dev, char *newname)
 873 {
 874         char oldname[IFNAMSIZ];
 875         int err = 0;
 876         int ret;
 877         struct net *net;
 878
 879         ASSERT_RTNL();
 880         BUG_ON(!dev_net(dev));
 881
 882         net = dev_net(dev);
 883         if (dev->flags & IFF_UP)
 884                 return -EBUSY;
 885
 886         if (!dev_valid_name(newname))
 887                 return -EINVAL;
 888
 889         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 890                 return 0;
 891
 892         memcpy(oldname, dev->name, IFNAMSIZ);
 893
 894         if (strchr(newname, '%')) {
 895                 err = dev_alloc_name(dev, newname);
 896                 if (err < 0)
 897                         return err;
 898                 strcpy(newname, dev->name);
 899         }
 900         else if (__dev_get_by_name(net, newname))
 901                 return -EEXIST;
 902         else
 903                 strlcpy(dev->name, newname, IFNAMSIZ);
 904
 905 rollback:
 906         err = device_rename(&dev->dev, dev->name);
 907         if (err) {
 908                 memcpy(dev->name, oldname, IFNAMSIZ);
 909                 return err;
 910         }
 911
 912         write_lock_bh(&dev_base_lock);
 913         hlist_del(&dev->name_hlist);
 914         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 915         write_unlock_bh(&dev_base_lock);
 916
 917         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 918         ret = notifier_to_errno(ret);
 919
 920         if (ret) {
 921                 if (err) {
 922                         printk(KERN_ERR
 923                                "%s: name change rollback failed: %d.\n",
 924                                dev->name, ret);
 925                 } else {
 926                         err = ret;
 927                         memcpy(dev->name, oldname, IFNAMSIZ);
 928                         goto rollback;
 929                 }
 930         }
 931
 932         return err;
 933 }
 934
 935 /**
 936  *      netdev_features_change - device changes features
 937  *      @dev: device to cause notification
 938  *
 939  *      Called to indicate a device has changed features.
 940  */
 941 void netdev_features_change(struct net_device *dev)
 942 {
 943         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 944 }
 945 EXPORT_SYMBOL(netdev_features_change);
 946
 947 /**
 948  *      netdev_state_change - device changes state
 949  *      @dev: device to cause notification
 950  *
 951  *      Called to indicate a device has changed state. This function calls
 952  *      the notifier chains for netdev_chain and sends a NEWLINK message
 953  *      to the routing socket.
 954  */
 955 void netdev_state_change(struct net_device *dev)
 956 {
 957         if (dev->flags & IFF_UP) {
 958                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
 959                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 960         }
 961 }
 962
 963 /**
 964  *      dev_load        - load a network module
 965  *      @net: the applicable net namespace
 966  *      @name: name of interface
 967  *
 968  *      If a network interface is not present and the process has suitable
 969  *      privileges this function loads the module. If module loading is not
 970  *      available in this kernel then it becomes a nop.
 971  */
 972
 973 void dev_load(struct net *net, const char *name)
 974 {
 975         struct net_device *dev;
 976
 977         read_lock(&dev_base_lock);
 978         dev = __dev_get_by_name(net, name);
 979         read_unlock(&dev_base_lock);
 980
 981         if (!dev && capable(CAP_SYS_MODULE))
 982                 request_module("%s", name);
 983 }
 984
 985 /**
 986  *      dev_open        - prepare an interface for use.
 987  *      @dev:   device to open
 988  *
 989  *      Takes a device from down to up state. The device's private open
 990  *      function is invoked and then the multicast lists are loaded. Finally
 991  *      the device is moved into the up state and a %NETDEV_UP message is
 992  *      sent to the netdev notifier chain.
 993  *
 994  *      Calling this function on an active interface is a nop. On a failure
 995  *      a negative errno code is returned.
 996  */
 997 int dev_open(struct net_device *dev)
 998 {
 999         int ret = 0;
1000
1001         ASSERT_RTNL();
1002
1003         /*
1004          *      Is it already up?
1005          */
1006
1007         if (dev->flags & IFF_UP)
1008                 return 0;
1009
1010         /*
1011          *      Is it even present?
1012          */
1013         if (!netif_device_present(dev))
1014                 return -ENODEV;
1015
1016         /*
1017          *      Call device private open method
1018          */
1019         set_bit(__LINK_STATE_START, &dev->state);
1020
1021         if (dev->validate_addr)
1022                 ret = dev->validate_addr(dev);
1023
1024         if (!ret && dev->open)
1025                 ret = dev->open(dev);
1026
1027         /*
1028          *      If it went open OK then:
1029          */
1030
1031         if (ret)
1032                 clear_bit(__LINK_STATE_START, &dev->state);
1033         else {
1034                 /*
1035                  *      Set the flags.
1036                  */
1037                 dev->flags |= IFF_UP;
1038
1039                 /*
1040                  *      Initialize multicasting status
1041                  */
1042                 dev_set_rx_mode(dev);
1043
1044                 /*
1045                  *      Wakeup transmit queue engine
1046                  */
1047                 dev_activate(dev);
1048
1049                 /*
1050                  *      ... and announce new interface.
1051                  */
1052                 call_netdevice_notifiers(NETDEV_UP, dev);
1053         }
1054
1055         return ret;
1056 }
1057
1058 /**
1059  *      dev_close - shutdown an interface.
1060  *      @dev: device to shutdown
1061  *
1062  *      This function moves an active device into down state. A
1063  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1064  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1065  *      chain.
1066  */
1067 int dev_close(struct net_device *dev)
1068 {
1069         ASSERT_RTNL();
1070
1071         might_sleep();
1072
1073         if (!(dev->flags & IFF_UP))
1074                 return 0;
1075
1076         /*
1077          *      Tell people we are going down, so that they can
1078          *      prepare to death, when device is still operating.
1079          */
1080         call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1081
1082         clear_bit(__LINK_STATE_START, &dev->state);
1083
1084         /* Synchronize to scheduled poll. We cannot touch poll list,
1085          * it can be even on different cpu. So just clear netif_running().
1086          *
1087          * dev->stop() will invoke napi_disable() on all of it's
1088          * napi_struct instances on this device.
1089          */
1090         smp_mb__after_clear_bit(); /* Commit netif_running(). */
1091
1092         dev_deactivate(dev);
1093
1094         /*
1095          *      Call the device specific close. This cannot fail.
1096          *      Only if device is UP
1097          *
1098          *      We allow it to be called even after a DETACH hot-plug
1099          *      event.
1100          */
1101         if (dev->stop)
1102                 dev->stop(dev);
1103
1104         /*
1105          *      Device is now down.
1106          */
1107
1108         dev->flags &= ~IFF_UP;
1109
1110         /*
1111          * Tell people we are down
1112          */
1113         call_netdevice_notifiers(NETDEV_DOWN, dev);
1114
1115         return 0;
1116 }
1117
1118
/*
 * Starts out as 1.  While it is non-zero register_netdevice_notifier()
 * only links the notifier into the chain and does not replay
 * REGISTER/UP events.  The place where it is cleared is not part of
 * this section of the file.
 */
static int dev_boot_phase = 1;
1120
1121 /*
1122  *      Device change register/unregister. These are not inline or static
1123  *      as we export them to the world.
1124  */
1125
1126 /**
1127  *      register_netdevice_notifier - register a network notifier block
1128  *      @nb: notifier
1129  *
1130  *      Register a notifier to be called when network device events occur.
1131  *      The notifier passed is linked into the kernel structures and must
1132  *      not be reused until it has been unregistered. A negative errno code
1133  *      is returned on a failure.
1134  *
1135  *      When registered all registration and up events are replayed
1136  *      to the new notifier to allow device to have a race free
1137  *      view of the network device list.
1138  */
1139
1140 int register_netdevice_notifier(struct notifier_block *nb)
1141 {
1142         struct net_device *dev;
1143         struct net_device *last;
1144         struct net *net;
1145         int err;
1146
1147         rtnl_lock();
1148         err = raw_notifier_chain_register(&netdev_chain, nb);
1149         if (err)
1150                 goto unlock;
1151         if (dev_boot_phase)
1152                 goto unlock;
1153         for_each_net(net) {
1154                 for_each_netdev(net, dev) {
1155                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1156                         err = notifier_to_errno(err);
1157                         if (err)
1158                                 goto rollback;
1159
1160                         if (!(dev->flags & IFF_UP))
1161                                 continue;
1162
1163                         nb->notifier_call(nb, NETDEV_UP, dev);
1164                 }
1165         }
1166
1167 unlock:
1168         rtnl_unlock();
1169         return err;
1170
1171 rollback:
1172         last = dev;
1173         for_each_net(net) {
1174                 for_each_netdev(net, dev) {
1175                         if (dev == last)
1176                                 break;
1177
1178                         if (dev->flags & IFF_UP) {
1179                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1180                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1181                         }
1182                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1183                 }
1184         }
1185
1186         raw_notifier_chain_unregister(&netdev_chain, nb);
1187         goto unlock;
1188 }
1189
1190 /**
1191  *      unregister_netdevice_notifier - unregister a network notifier block
1192  *      @nb: notifier
1193  *
1194  *      Unregister a notifier previously registered by
1195  *      register_netdevice_notifier(). The notifier is unlinked into the
1196  *      kernel structures and may then be reused. A negative errno code
1197  *      is returned on a failure.
1198  */
1199
1200 int unregister_netdevice_notifier(struct notifier_block *nb)
1201 {
1202         int err;
1203
1204         rtnl_lock();
1205         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1206         rtnl_unlock();
1207         return err;
1208 }
1209
1210 /**
1211  *      call_netdevice_notifiers - call all network notifier blocks
1212  *      @val: value passed unmodified to notifier function
1213  *      @dev: net_device pointer passed unmodified to notifier function
1214  *
1215  *      Call all network notifier blocks.  Parameters and return value
1216  *      are as for raw_notifier_call_chain().
1217  */
1218
1219 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1220 {
1221         return raw_notifier_call_chain(&netdev_chain, val, dev);
1222 }
1223
1224 /* When > 0 there are consumers of rx skb time stamps */
1225 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1226
1227 void net_enable_timestamp(void)
1228 {
1229         atomic_inc(&netstamp_needed);
1230 }
1231
1232 void net_disable_timestamp(void)
1233 {
1234         atomic_dec(&netstamp_needed);
1235 }
1236
1237 static inline void net_timestamp(struct sk_buff *skb)
1238 {
1239         if (atomic_read(&netstamp_needed))
1240                 __net_timestamp(skb);
1241         else
1242                 skb->tstamp.tv64 = 0;
1243 }
1244
1245 /*
1246  *      Support routine. Sends outgoing frames to any network
1247  *      taps currently in use.
1248  */
1249
1250 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1251 {
1252         struct packet_type *ptype;
1253
1254         net_timestamp(skb);
1255
1256         rcu_read_lock();
1257         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1258                 /* Never send packets back to the socket
1259                  * they originated from - MvS (miquels@drinkel.ow.org)
1260                  */
1261                 if ((ptype->dev == dev || !ptype->dev) &&
1262                     (ptype->af_packet_priv == NULL ||
1263                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1264                         struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1265                         if (!skb2)
1266                                 break;
1267
1268                         /* skb->nh should be correctly
1269                            set by sender, so that the second statement is
1270                            just protection against buggy protocols.
1271                          */
1272                         skb_reset_mac_header(skb2);
1273
1274                         if (skb_network_header(skb2) < skb2->data ||
1275                             skb2->network_header > skb2->tail) {
1276                                 if (net_ratelimit())
1277                                         printk(KERN_CRIT "protocol %04x is "
1278                                                "buggy, dev %s\n",
1279                                                skb2->protocol, dev->name);
1280                                 skb_reset_network_header(skb2);
1281                         }
1282
1283                         skb2->transport_header = skb2->network_header;
1284                         skb2->pkt_type = PACKET_OUTGOING;
1285                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1286                 }
1287         }
1288         rcu_read_unlock();
1289 }
1290
1291
1292 void __netif_schedule(struct net_device *dev)
1293 {
1294         if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1295                 unsigned long flags;
1296                 struct softnet_data *sd;
1297
1298                 local_irq_save(flags);
1299                 sd = &__get_cpu_var(softnet_data);
1300                 dev->next_sched = sd->output_queue;
1301                 sd->output_queue = dev;
1302                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1303                 local_irq_restore(flags);
1304         }
1305 }
1306 EXPORT_SYMBOL(__netif_schedule);
1307
1308 void dev_kfree_skb_irq(struct sk_buff *skb)
1309 {
1310         if (atomic_dec_and_test(&skb->users)) {
1311                 struct softnet_data *sd;
1312                 unsigned long flags;
1313
1314                 local_irq_save(flags);
1315                 sd = &__get_cpu_var(softnet_data);
1316                 skb->next = sd->completion_queue;
1317                 sd->completion_queue = skb;
1318                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1319                 local_irq_restore(flags);
1320         }
1321 }
1322 EXPORT_SYMBOL(dev_kfree_skb_irq);
1323
/*
 * Free @skb with dev_kfree_skb_irq() when in hard irq context or with
 * interrupts disabled, and with dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1332
1333
1334 /**
1335  * netif_device_detach - mark device as removed
1336  * @dev: network device
1337  *
1338  * Mark device as removed from system and therefore no longer available.
1339  */
1340 void netif_device_detach(struct net_device *dev)
1341 {
1342         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1343             netif_running(dev)) {
1344                 netif_stop_queue(dev);
1345         }
1346 }
1347 EXPORT_SYMBOL(netif_device_detach);
1348
1349 /**
1350  * netif_device_attach - mark device as attached
1351  * @dev: network device
1352  *
1353  * Mark device as attached from system and restart if needed.
1354  */
1355 void netif_device_attach(struct net_device *dev)
1356 {
1357         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1358             netif_running(dev)) {
1359                 netif_wake_queue(dev);
1360                 __netdev_watchdog_up(dev);
1361         }
1362 }
1363 EXPORT_SYMBOL(netif_device_attach);
1364
1365
1366 /*
1367  * Invalidate hardware checksum when packet is to be mangled, and
1368  * complete checksum manually on outgoing path.
1369  */
1370 int skb_checksum_help(struct sk_buff *skb)
1371 {
1372         __wsum csum;
1373         int ret = 0, offset;
1374
1375         if (skb->ip_summed == CHECKSUM_COMPLETE)
1376                 goto out_set_summed;
1377
1378         if (unlikely(skb_shinfo(skb)->gso_size)) {
1379                 /* Let GSO fix up the checksum. */
1380                 goto out_set_summed;
1381         }
1382
1383         offset = skb->csum_start - skb_headroom(skb);
1384         BUG_ON(offset >= skb_headlen(skb));
1385         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1386
1387         offset += skb->csum_offset;
1388         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1389
1390         if (skb_cloned(skb) &&
1391             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1392                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1393                 if (ret)
1394                         goto out;
1395         }
1396
1397         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1398 out_set_summed:
1399         skb->ip_summed = CHECKSUM_NONE;
1400 out:
1401         return ret;
1402 }
1403
/**
 *      skb_gso_segment - Perform segmentation on skb.
 *      @skb: buffer to segment
 *      @features: features for the output path (see dev->features)
 *
 *      This function segments the given skb and returns a list of segments.
 *      The work is delegated to the gso_segment method of the packet type
 *      registered for skb->protocol; if there is none,
 *      ERR_PTR(-EPROTONOSUPPORT) is returned.
 *
 *      It may return NULL if the skb requires no segmentation.  This is
 *      only possible when GSO is used for verifying header integrity.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
        /* Result if no matching packet type is found below. */
        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        int err;

        BUG_ON(skb_shinfo(skb)->frag_list);

        /* Record the mac header length and step over the mac header. */
        skb_reset_mac_header(skb);
        skb->mac_len = skb->network_header - skb->mac_header;
        __skb_pull(skb, skb->mac_len);

        /*
         * Anything but CHECKSUM_PARTIAL triggers a warning here; in that
         * case a cloned header is made private first.
         */
        if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
                if (skb_header_cloned(skb) &&
                    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
                        return ERR_PTR(err);
        }

        rcu_read_lock();
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                /* Only device-independent handlers that can segment. */
                if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                                err = ptype->gso_send_check(skb);
                                /*
                                 * ERR_PTR(0) is NULL: when the check passes
                                 * and the output path handles this skb as
                                 * is, NULL is what gets returned.
                                 */
                                segs = ERR_PTR(err);
                                if (err || skb_gso_ok(skb, features))
                                        break;
                                /* Move skb->data back to the network header. */
                                __skb_push(skb, (skb->data -
                                                 skb_network_header(skb)));
                        }
                        segs = ptype->gso_segment(skb, features);
                        break;
                }
        }
        rcu_read_unlock();

        /* Move skb->data back to the mac header. */
        __skb_push(skb, skb->data - skb_mac_header(skb));

        return segs;
}

EXPORT_SYMBOL(skb_gso_segment);
1457
/*
 * Take action when hardware reception checksum errors are detected:
 * log the failing device (rate limited) together with a stack dump.
 * @dev may be NULL.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        const char *who;

        if (!net_ratelimit())
                return;

        who = dev ? dev->name : "<unknown>";
        printk(KERN_ERR "%s: hw csum failure.\n", who);
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1470
/*
 * Returns 1 when @skb has a page fragment in high memory and @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise (always 0 without
 * CONFIG_HIGHMEM).
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        int frag;

        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
                        if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
                                return 1;
                }
        }

#endif
        return 0;
}
1491
/*
 * State kept in the control buffer (skb->cb) of an skb that carries a
 * list of GSO segments: the destructor the skb had before
 * dev_gso_segment() replaced it with dev_gso_skb_destructor().
 */
struct dev_gso_cb {
        void (*destructor)(struct sk_buff *skb);
};

/* Access skb->cb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1497
1498 static void dev_gso_skb_destructor(struct sk_buff *skb)
1499 {
1500         struct dev_gso_cb *cb;
1501
1502         do {
1503                 struct sk_buff *nskb = skb->next;
1504
1505                 skb->next = nskb->next;
1506                 nskb->next = NULL;
1507                 kfree_skb(nskb);
1508         } while (skb->next);
1509
1510         cb = DEV_GSO_CB(skb);
1511         if (cb->destructor)
1512                 cb->destructor(skb);
1513 }
1514
1515 /**
1516  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1517  *      @skb: buffer to segment
1518  *
1519  *      This function segments the given skb and stores the list of segments
1520  *      in skb->next.
1521  */
1522 static int dev_gso_segment(struct sk_buff *skb)
1523 {
1524         struct net_device *dev = skb->dev;
1525         struct sk_buff *segs;
1526         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1527                                          NETIF_F_SG : 0);
1528
1529         segs = skb_gso_segment(skb, features);
1530
1531         /* Verifying header integrity only. */
1532         if (!segs)
1533                 return 0;
1534
1535         if (IS_ERR(segs))
1536                 return PTR_ERR(segs);
1537
1538         skb->next = segs;
1539         DEV_GSO_CB(skb)->destructor = skb->destructor;
1540         skb->destructor = dev_gso_skb_destructor;
1541
1542         return 0;
1543 }
1544
/*
 * Hand @skb to the driver's hard_start_xmit method.
 *
 * An skb without a segment list (skb->next == NULL) is first shown to the
 * network taps, if any, and segmented in software when netif_needs_gso()
 * says so.  An skb that carries a segment list - freshly created, or left
 * over from an earlier partial transmission - has its segments sent one
 * by one.
 *
 * Returns the driver's return code for a single skb.  For a segment list:
 * the driver's non-zero code if a segment was refused (that segment is put
 * back at the head of the list), NETDEV_TX_BUSY if the queue stopped while
 * segments remain, or 0 once all segments are sent and @skb itself has
 * been freed.  @skb is also freed, and 0 returned, when segmentation fails.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        if (likely(!skb->next)) {
                if (!list_empty(&ptype_all))
                        dev_queue_xmit_nit(skb, dev);

                if (netif_needs_gso(dev, skb)) {
                        if (unlikely(dev_gso_segment(skb)))
                                goto out_kfree_skb;
                        /* Segments were produced: send them one by one. */
                        if (skb->next)
                                goto gso;
                }

                return dev->hard_start_xmit(skb, dev);
        }

gso:
        do {
                struct sk_buff *nskb = skb->next;
                int rc;

                /* Unlink the first segment before giving it to the driver. */
                skb->next = nskb->next;
                nskb->next = NULL;
                rc = dev->hard_start_xmit(nskb, dev);
                if (unlikely(rc)) {
                        /* Refused: re-link it at the head of the list. */
                        nskb->next = skb->next;
                        skb->next = nskb;
                        return rc;
                }
                /* Stop early if the queue filled up with segments left. */
                if (unlikely((netif_queue_stopped(dev) ||
                             netif_subqueue_stopped(dev, skb)) &&
                             skb->next))
                        return NETDEV_TX_BUSY;
        } while (skb->next);

        /*
         * All segments are gone: put the saved destructor back so that
         * dev_gso_skb_destructor() does not run on the empty list.
         */
        skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
        kfree_skb(skb);
        return 0;
}
1586
/**
 *      dev_queue_xmit - transmit a buffer
 *      @skb: buffer to transmit
 *
 *      Queue a buffer for transmission to a network device. The caller must
 *      have set the device and priority and built the buffer before calling
 *      this function. The function can be called from an interrupt.
 *
 *      A negative errno code is returned on a failure. A success does not
 *      guarantee the frame will be transmitted as it may be dropped due
 *      to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */

int dev_queue_xmit(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct Qdisc *q;
        /* Returned when linearizing or checksumming below fails. */
        int rc = -ENOMEM;

        /* GSO will handle the following emulations directly. */
        if (netif_needs_gso(dev, skb))
                goto gso;

        /* A frag_list the device cannot handle is linearized. */
        if (skb_shinfo(skb)->frag_list &&
            !(dev->features & NETIF_F_FRAGLIST) &&
            __skb_linearize(skb))
                goto out_kfree_skb;

        /* Fragmented skb is linearized if device does not support SG,
         * or if at least one of fragments is in highmem and device
         * does not support DMA from it.
         */
        if (skb_shinfo(skb)->nr_frags &&
            (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
            __skb_linearize(skb))
                goto out_kfree_skb;

        /* If packet is not checksummed and device does not support
         * checksumming for this protocol, complete checksumming here.
         */
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                skb_set_transport_header(skb, skb->csum_start -
                                              skb_headroom(skb));

                if (!(dev->features & NETIF_F_GEN_CSUM) &&
                    !((dev->features & NETIF_F_IP_CSUM) &&
                      skb->protocol == htons(ETH_P_IP)) &&
                    !((dev->features & NETIF_F_IPV6_CSUM) &&
                      skb->protocol == htons(ETH_P_IPV6)))
                        if (skb_checksum_help(skb))
                                goto out_kfree_skb;
        }

gso:
        spin_lock_prefetch(&dev->queue_lock);

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        /* Updates of qdisc are serialized by queue_lock.
         * The struct Qdisc which is pointed to by qdisc is now a
         * rcu structure - it may be accessed without acquiring
         * a lock (but the structure may be stale.) The freeing of the
         * qdisc will be deferred until it's known that there are no
         * more references to it.
         *
         * If the qdisc has an enqueue function, we still need to
         * hold the queue_lock before calling it, since queue_lock
         * also serializes access to the device queue.
         */

        q = rcu_dereference(dev->qdisc);
#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
        if (q->enqueue) {
                /* Grab device queue */
                spin_lock(&dev->queue_lock);
                /* Re-read the qdisc now that queue_lock is held. */
                q = dev->qdisc;
                if (q->enqueue) {
                        /* reset queue_mapping to zero */
                        skb_set_queue_mapping(skb, 0);
                        rc = q->enqueue(skb, q);
                        qdisc_run(dev);
                        spin_unlock(&dev->queue_lock);

                        /* NET_XMIT_BYPASS is reported as success. */
                        rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
                        goto out;
                }
                spin_unlock(&dev->queue_lock);
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           Really, it is unlikely that netif_tx_lock protection is necessary
           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.

           Original author's note: check this and drop the lock; it is
           not prone to deadlocks.  Or drop the noqueue qdisc, which is
           even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* Same cpu already owns the xmit lock: that is recursion. */
                if (dev->xmit_lock_owner != cpu) {

                        HARD_TX_LOCK(dev, cpu);

                        if (!netif_queue_stopped(dev) &&
                            !netif_subqueue_stopped(dev, skb)) {
                                rc = 0;
                                if (!dev_hard_start_xmit(skb, dev)) {
                                        HARD_TX_UNLOCK(dev);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev);
                        /* Queue stopped or xmit refused: the skb is dropped. */
                        if (net_ratelimit())
                                printk(KERN_CRIT "Virtual device %s asks to "
                                       "queue packet!\n", dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately */
                        if (net_ratelimit())
                                printk(KERN_CRIT "Dead loop on virtual device "
                                       "%s, fix it urgently!\n", dev->name);
                }
        }

        /* Device down, or the packet could not be sent without a queue. */
        rc = -ENETDOWN;
        rcu_read_unlock_bh();

out_kfree_skb:
        kfree_skb(skb);
        return rc;
out:
        rcu_read_unlock_bh();
        return rc;
}
1744
1745
1746 /*=======================================================================
1747                         Receiver routines
1748   =======================================================================*/
1749
1750 int netdev_max_backlog __read_mostly = 1000;
1751 int netdev_budget __read_mostly = 300;
1752 int weight_p __read_mostly = 64;            /* old backlog weight */
1753
1754 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1755
1756
1757 /**
1758  *      netif_rx        -       post buffer to the network code
1759  *      @skb: buffer to post
1760  *
1761  *      This function receives a packet from a device driver and queues it for
1762  *      the upper (protocol) levels to process.  It always succeeds. The buffer
1763  *      may be dropped during processing for congestion control or by the
1764  *      protocol layers.
1765  *
1766  *      return values:
1767  *      NET_RX_SUCCESS  (no congestion)
1768  *      NET_RX_DROP     (packet was dropped)
1769  *
1770  */
1771
1772 int netif_rx(struct sk_buff *skb)
1773 {
1774         struct softnet_data *queue;
1775         unsigned long flags;
1776
1777         /* if netpoll wants it, pretend we never saw it */
1778         if (netpoll_rx(skb))
1779                 return NET_RX_DROP;
1780
1781         if (!skb->tstamp.tv64)
1782                 net_timestamp(skb);
1783
1784         /*
1785          * The code is rearranged so that the path is the most
1786          * short when CPU is congested, but is still operating.
1787          */
1788         local_irq_save(flags);
1789         queue = &__get_cpu_var(softnet_data);
1790
1791         __get_cpu_var(netdev_rx_stat).total++;
1792         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1793                 if (queue->input_pkt_queue.qlen) {
1794 enqueue:
1795                         dev_hold(skb->dev);
1796                         __skb_queue_tail(&queue->input_pkt_queue, skb);
1797                         local_irq_restore(flags);
1798                         return NET_RX_SUCCESS;
1799                 }
1800
1801                 napi_schedule(&queue->backlog);
1802                 goto enqueue;
1803         }
1804
1805         __get_cpu_var(netdev_rx_stat).dropped++;
1806         local_irq_restore(flags);
1807
1808         kfree_skb(skb);
1809         return NET_RX_DROP;
1810 }
1811
/*
 *	netif_rx_ni	-	netif_rx() followed by a softirq run
 *	@skb: buffer to post
 *
 *	Posts the buffer with netif_rx() while preemption is disabled and,
 *	if a softirq is pending on this CPU afterwards, runs do_softirq()
 *	before preemption is enabled again.  Returns netif_rx()'s result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();
	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return ret;
}

EXPORT_SYMBOL(netif_rx_ni);
1826
1827 static inline struct net_device *skb_bond(struct sk_buff *skb)
1828 {
1829         struct net_device *dev = skb->dev;
1830
1831         if (dev->master) {
1832                 if (skb_bond_should_drop(skb)) {
1833                         kfree_skb(skb);
1834                         return NULL;
1835                 }
1836                 skb->dev = dev->master;
1837         }
1838
1839         return dev;
1840 }
1841
1842
1843 static void net_tx_action(struct softirq_action *h)
1844 {
1845         struct softnet_data *sd = &__get_cpu_var(softnet_data);
1846
1847         if (sd->completion_queue) {
1848                 struct sk_buff *clist;
1849
1850                 local_irq_disable();
1851                 clist = sd->completion_queue;
1852                 sd->completion_queue = NULL;
1853                 local_irq_enable();
1854
1855                 while (clist) {
1856                         struct sk_buff *skb = clist;
1857                         clist = clist->next;
1858
1859                         BUG_TRAP(!atomic_read(&skb->users));
1860                         __kfree_skb(skb);
1861                 }
1862         }
1863
1864         if (sd->output_queue) {
1865                 struct net_device *head;
1866
1867                 local_irq_disable();
1868                 head = sd->output_queue;
1869                 sd->output_queue = NULL;
1870                 local_irq_enable();
1871
1872                 while (head) {
1873                         struct net_device *dev = head;
1874                         head = head->next_sched;
1875
1876                         smp_mb__before_clear_bit();
1877                         clear_bit(__LINK_STATE_SCHED, &dev->state);
1878
1879                         if (spin_trylock(&dev->queue_lock)) {
1880                                 qdisc_run(dev);
1881                                 spin_unlock(&dev->queue_lock);
1882                         } else {
1883                                 netif_schedule(dev);
1884                         }
1885                 }
1886         }
1887 }
1888
1889 static inline int deliver_skb(struct sk_buff *skb,
1890                               struct packet_type *pt_prev,
1891                               struct net_device *orig_dev)
1892 {
1893         atomic_inc(&skb->users);
1894         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1895 }
1896
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks defined here for ATM */
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;

/*
 * Bridging receive hook; returns NULL if the packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;

/*
 * Give the packet to the bridging hook when its device has a bridge port.
 *
 * Loopback packets and packets on devices without a bridge port are returned
 * untouched.  Otherwise any handler pending in *pt_prev is run first (its
 * result goes to *ret and *pt_prev is cleared), and the value returned by
 * br_handle_frame_hook() is passed back to the caller.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	port = rcu_dereference(skb->dev->br_port);
	if (!port)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif
1930
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
/* Receive hook called for packets on devices that have a macvlan port. */
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Give the packet to the macvlan hook when its device has a macvlan port.
 *
 * Packets on other devices are returned untouched.  Otherwise any handler
 * pending in *pt_prev is run first (its result goes to *ret and *pt_prev is
 * cleared), and the hook's return value is passed back to the caller.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	if (!skb->dev->macvlan_port)
		return skb;

	if (*pt_prev != NULL) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif
1952
1953 #ifdef CONFIG_NET_CLS_ACT
1954 /* TODO: Maybe we should just force sch_ingress to be compiled in
1955  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
1956  * a compare and 2 stores extra right now if we dont have it on
1957  * but have CONFIG_NET_CLS_ACT
1958  * NOTE: This doesnt stop any functionality; if you dont have
1959  * the ingress scheduler, you just cant add policies on ingress.
1960  *
1961  */
1962 static int ing_filter(struct sk_buff *skb)
1963 {
1964         struct Qdisc *q;
1965         struct net_device *dev = skb->dev;
1966         int result = TC_ACT_OK;
1967         u32 ttl = G_TC_RTTL(skb->tc_verd);
1968
1969         if (MAX_RED_LOOP < ttl++) {
1970                 printk(KERN_WARNING
1971                        "Redir loop detected Dropping packet (%d->%d)\n",
1972                        skb->iif, dev->ifindex);
1973                 return TC_ACT_SHOT;
1974         }
1975
1976         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
1977         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1978
1979         spin_lock(&dev->ingress_lock);
1980         if ((q = dev->qdisc_ingress) != NULL)
1981                 result = q->enqueue(skb, q);
1982         spin_unlock(&dev->ingress_lock);
1983
1984         return result;
1985 }
1986
1987 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
1988                                          struct packet_type **pt_prev,
1989                                          int *ret, struct net_device *orig_dev)
1990 {
1991         if (!skb->dev->qdisc_ingress)
1992                 goto out;
1993
1994         if (*pt_prev) {
1995                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
1996                 *pt_prev = NULL;
1997         } else {
1998                 /* Huh? Why does turning on AF_PACKET affect this? */
1999                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2000         }
2001
2002         switch (ing_filter(skb)) {
2003         case TC_ACT_SHOT:
2004         case TC_ACT_STOLEN:
2005                 kfree_skb(skb);
2006                 return NULL;
2007         }
2008
2009 out:
2010         skb->tc_verd = 0;
2011         return skb;
2012 }
2013 #endif
2014
2015 /**
2016  *      netif_receive_skb - process receive buffer from network
2017  *      @skb: buffer to process
2018  *
2019  *      netif_receive_skb() is the main receive data processing function.
2020  *      It always succeeds. The buffer may be dropped during processing
2021  *      for congestion control or by the protocol layers.
2022  *
2023  *      This function may only be called from softirq context and interrupts
2024  *      should be enabled.
2025  *
2026  *      Return values (usually ignored):
2027  *      NET_RX_SUCCESS: no congestion
2028  *      NET_RX_DROP: packet was dropped
2029  */
2030 int netif_receive_skb(struct sk_buff *skb)
2031 {
2032         struct packet_type *ptype, *pt_prev;
2033         struct net_device *orig_dev;
2034         int ret = NET_RX_DROP;
2035         __be16 type;
2036
2037         /* if we've gotten here through NAPI, check netpoll */
2038         if (netpoll_receive_skb(skb))
2039                 return NET_RX_DROP;
2040
2041         if (!skb->tstamp.tv64)
2042                 net_timestamp(skb);
2043
2044         if (!skb->iif)
2045                 skb->iif = skb->dev->ifindex;
2046
2047         orig_dev = skb_bond(skb);
2048
2049         if (!orig_dev)
2050                 return NET_RX_DROP;
2051
2052         __get_cpu_var(netdev_rx_stat).total++;
2053
2054         skb_reset_network_header(skb);
2055         skb_reset_transport_header(skb);
2056         skb->mac_len = skb->network_header - skb->mac_header;
2057
2058         pt_prev = NULL;
2059
2060         rcu_read_lock();
2061
2062 #ifdef CONFIG_NET_CLS_ACT
2063         if (skb->tc_verd & TC_NCLS) {
2064                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2065                 goto ncls;
2066         }
2067 #endif
2068
2069         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2070                 if (!ptype->dev || ptype->dev == skb->dev) {
2071                         if (pt_prev)
2072                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2073                         pt_prev = ptype;
2074                 }
2075         }
2076
2077 #ifdef CONFIG_NET_CLS_ACT
2078         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2079         if (!skb)
2080                 goto out;
2081 ncls:
2082 #endif
2083
2084         skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2085         if (!skb)
2086                 goto out;
2087         skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2088         if (!skb)
2089                 goto out;
2090
2091         type = skb->protocol;
2092         list_for_each_entry_rcu(ptype,
2093                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2094                 if (ptype->type == type &&
2095                     (!ptype->dev || ptype->dev == skb->dev)) {
2096                         if (pt_prev)
2097                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2098                         pt_prev = ptype;
2099                 }
2100         }
2101
2102         if (pt_prev) {
2103                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2104         } else {
2105                 kfree_skb(skb);
2106                 /* Jamal, now you will not able to escape explaining
2107                  * me how you were going to use this. :-)
2108                  */
2109                 ret = NET_RX_DROP;
2110         }
2111
2112 out:
2113         rcu_read_unlock();
2114         return ret;
2115 }
2116
2117 static int process_backlog(struct napi_struct *napi, int quota)
2118 {
2119         int work = 0;
2120         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2121         unsigned long start_time = jiffies;
2122
2123         napi->weight = weight_p;
2124         do {
2125                 struct sk_buff *skb;
2126                 struct net_device *dev;
2127
2128                 local_irq_disable();
2129                 skb = __skb_dequeue(&queue->input_pkt_queue);
2130                 if (!skb) {
2131                         __napi_complete(napi);
2132                         local_irq_enable();
2133                         break;
2134                 }
2135
2136                 local_irq_enable();
2137
2138                 dev = skb->dev;
2139
2140                 netif_receive_skb(skb);
2141
2142                 dev_put(dev);
2143         } while (++work < quota && jiffies == start_time);
2144
2145         return work;
2146 }
2147
2148 /**
2149  * __napi_schedule - schedule for receive
2150  * @n: entry to schedule
2151  *
2152  * The entry's receive function will be scheduled to run
2153  */
2154 void __napi_schedule(struct napi_struct *n)
2155 {
2156         unsigned long flags;
2157
2158         local_irq_save(flags);
2159         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2160         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2161         local_irq_restore(flags);
2162 }
2163 EXPORT_SYMBOL(__napi_schedule);
2164
2165
2166 static void net_rx_action(struct softirq_action *h)
2167 {
2168         struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2169         unsigned long start_time = jiffies;
2170         int budget = netdev_budget;
2171         void *have;
2172
2173         local_irq_disable();
2174
2175         while (!list_empty(list)) {
2176                 struct napi_struct *n;
2177                 int work, weight;
2178
2179                 /* If softirq window is exhuasted then punt.
2180                  *
2181                  * Note that this is a slight policy change from the
2182                  * previous NAPI code, which would allow up to 2
2183                  * jiffies to pass before breaking out.  The test
2184                  * used to be "jiffies - start_time > 1".
2185                  */
2186                 if (unlikely(budget <= 0 || jiffies != start_time))
2187                         goto softnet_break;
2188
2189                 local_irq_enable();
2190
2191                 /* Even though interrupts have been re-enabled, this
2192                  * access is safe because interrupts can only add new
2193                  * entries to the tail of this list, and only ->poll()
2194                  * calls can remove this head entry from the list.
2195                  */
2196                 n = list_entry(list->next, struct napi_struct, poll_list);
2197
2198                 have = netpoll_poll_lock(n);
2199
2200                 weight = n->weight;
2201
2202                 /* This NAPI_STATE_SCHED test is for avoiding a race
2203                  * with netpoll's poll_napi().  Only the entity which
2204                  * obtains the lock and sees NAPI_STATE_SCHED set will
2205                  * actually make the ->poll() call.  Therefore we avoid
2206                  * accidently calling ->poll() when NAPI is not scheduled.
2207                  */
2208                 work = 0;
2209                 if (test_bit(NAPI_STATE_SCHED, &n->state))
2210                         work = n->poll(n, weight);
2211
2212                 WARN_ON_ONCE(work > weight);
2213
2214                 budget -= work;
2215
2216                 local_irq_disable();
2217
2218                 /* Drivers must not modify the NAPI state if they
2219                  * consume the entire weight.  In such cases this code
2220                  * still "owns" the NAPI instance and therefore can
2221                  * move the instance around on the list at-will.
2222                  */
2223                 if (unlikely(work == weight)) {
2224                         if (unlikely(napi_disable_pending(n)))
2225                                 __napi_complete(n);
2226                         else
2227                                 list_move_tail(&n->poll_list, list);
2228                 }
2229
2230                 netpoll_poll_unlock(have);
2231         }
2232 out:
2233         local_irq_enable();
2234
2235 #ifdef CONFIG_NET_DMA
2236         /*
2237          * There may not be any more sk_buffs coming right now, so push
2238          * any pending DMA copies to hardware
2239          */
2240         if (!cpus_empty(net_dma.channel_mask)) {
2241                 int chan_idx;
2242                 for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
2243                         struct dma_chan *chan = net_dma.channels[chan_idx];
2244                         if (chan)
2245                                 dma_async_memcpy_issue_pending(chan);
2246                 }
2247         }
2248 #endif
2249
2250         return;
2251
2252 softnet_break:
2253         __get_cpu_var(netdev_rx_stat).time_squeeze++;
2254         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2255         goto out;
2256 }
2257
2258 static gifconf_func_t * gifconf_list [NPROTO];
2259
2260 /**
2261  *      register_gifconf        -       register a SIOCGIF handler
2262  *      @family: Address family
2263  *      @gifconf: Function handler
2264  *
2265  *      Register protocol dependent address dumping routines. The handler
2266  *      that is passed must not be freed or reused until it has been replaced
2267  *      by another handler.
2268  */
2269 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2270 {
2271         if (family >= NPROTO)
2272                 return -EINVAL;
2273         gifconf_list[family] = gifconf;
2274         return 0;
2275 }
2276
2277
2278 /*
2279  *      Map an interface index to its name (SIOCGIFNAME)
2280  */
2281
2282 /*
2283  *      We need this ioctl for efficient implementation of the
2284  *      if_indextoname() function required by the IPv6 API.  Without
2285  *      it, we would have to search all the interfaces to find a
2286  *      match.  --pb
2287  */
2288
2289 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2290 {
2291         struct net_device *dev;
2292         struct ifreq ifr;
2293
2294         /*
2295          *      Fetch the caller's info block.
2296          */
2297
2298         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2299                 return -EFAULT;
2300
2301         read_lock(&dev_base_lock);
2302         dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2303         if (!dev) {
2304                 read_unlock(&dev_base_lock);
2305                 return -ENODEV;
2306         }
2307
2308         strcpy(ifr.ifr_name, dev->name);
2309         read_unlock(&dev_base_lock);
2310
2311         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2312                 return -EFAULT;
2313         return 0;
2314 }
2315
2316 /*
2317  *      Perform a SIOCGIFCONF call. This structure will change
2318  *      size eventually, and there is nothing I can do about it.
2319  *      Thus we will need a 'compatibility mode'.
2320  */
2321
2322 static int dev_ifconf(struct net *net, char __user *arg)
2323 {
2324         struct ifconf ifc;
2325         struct net_device *dev;
2326         char __user *pos;
2327         int len;
2328         int total;
2329         int i;
2330
2331         /*
2332          *      Fetch the caller's info block.
2333          */
2334
2335         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2336                 return -EFAULT;
2337
2338         pos = ifc.ifc_buf;
2339         len = ifc.ifc_len;
2340
2341         /*
2342          *      Loop over the interfaces, and write an info block for each.
2343          */
2344
2345         total = 0;
2346         for_each_netdev(net, dev) {
2347                 for (i = 0; i < NPROTO; i++) {
2348                         if (gifconf_list[i]) {
2349                                 int done;
2350                                 if (!pos)
2351                                         done = gifconf_list[i](dev, NULL, 0);
2352                                 else
2353                                         done = gifconf_list[i](dev, pos + total,
2354                                                                len - total);
2355                                 if (done < 0)
2356                                         return -EFAULT;
2357                                 total += done;
2358                         }
2359                 }
2360         }
2361
2362         /*
2363          *      All done.  Write the updated control block back to the caller.
2364          */
2365         ifc.ifc_len = total;
2366
2367         /*
2368          *      Both BSD and Solaris return 0 here, so we do too.
2369          */
2370         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2371 }
2372
2373 #ifdef CONFIG_PROC_FS
2374 /*
2375  *      This is invoked by the /proc filesystem handler to display a device
2376  *      in detail.
2377  */
2378 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2379         __acquires(dev_base_lock)
2380 {
2381         struct net *net = seq_file_net(seq);
2382         loff_t off;
2383         struct net_device *dev;
2384
2385         read_lock(&dev_base_lock);
2386         if (!*pos)
2387                 return SEQ_START_TOKEN;
2388
2389         off = 1;
2390         for_each_netdev(net, dev)
2391                 if (off++ == *pos)
2392                         return dev;
2393
2394         return NULL;
2395 }
2396
2397 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2398 {
2399         struct net *net = seq_file_net(seq);
2400         ++*pos;
2401         return v == SEQ_START_TOKEN ?
2402                 first_net_device(net) : next_net_device((struct net_device *)v);
2403 }
2404
2405 void dev_seq_stop(struct seq_file *seq, void *v)
2406         __releases(dev_base_lock)
2407 {
2408         read_unlock(&dev_base_lock);
2409 }
2410
2411 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2412 {
2413         struct net_device_stats *stats = dev->get_stats(dev);
2414
2415         seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2416                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2417                    dev->name, stats->rx_bytes, stats->rx_packets,
2418                    stats->rx_errors,
2419                    stats->rx_dropped + stats->rx_missed_errors,
2420                    stats->rx_fifo_errors,
2421                    stats->rx_length_errors + stats->rx_over_errors +
2422                     stats->rx_crc_errors + stats->rx_frame_errors,
2423                    stats->rx_compressed, stats->multicast,
2424                    stats->tx_bytes, stats->tx_packets,
2425                    stats->tx_errors, stats->tx_dropped,
2426                    stats->tx_fifo_errors, stats->collisions,
2427                    stats->tx_carrier_errors +
2428                     stats->tx_aborted_errors +
2429                     stats->tx_window_errors +
2430                     stats->tx_heartbeat_errors,
2431                    stats->tx_compressed);
2432 }
2433
2434 /*
2435  *      Called from the PROCfs module. This now uses the new arbitrary sized
2436  *      /proc/net interface to create /proc/net/dev
2437  */
2438 static int dev_seq_show(struct seq_file *seq, void *v)
2439 {
2440         if (v == SEQ_START_TOKEN)
2441                 seq_puts(seq, "Inter-|   Receive                            "
2442                               "                    |  Transmit\n"
2443                               " face |bytes    packets errs drop fifo frame "
2444                               "compressed multicast|bytes    packets errs "
2445                               "drop fifo colls carrier compressed\n");
2446         else
2447                 dev_seq_printf_stats(seq, v);
2448         return 0;
2449 }
2450
2451 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2452 {
2453         struct netif_rx_stats *rc = NULL;
2454
2455         while (*pos < nr_cpu_ids)
2456                 if (cpu_online(*pos)) {
2457                         rc = &per_cpu(netdev_rx_stat, *pos);
2458                         break;
2459                 } else
2460                         ++*pos;
2461         return rc;
2462 }
2463
2464 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2465 {
2466         return softnet_get_online(pos);
2467 }
2468
2469 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2470 {
2471         ++*pos;
2472         return softnet_get_online(pos);
2473 }
2474
2475 static void softnet_seq_stop(struct seq_file *seq, void *v)
2476 {
2477 }
2478
2479 static int softnet_seq_show(struct seq_file *seq, void *v)
2480 {
2481         struct netif_rx_stats *s = v;
2482
2483         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2484                    s->total, s->dropped, s->time_squeeze, 0,
2485                    0, 0, 0, 0, /* was fastroute */
2486                    s->cpu_collision );
2487         return 0;
2488 }
2489
2490 static const struct seq_operations dev_seq_ops = {
2491         .start = dev_seq_start,
2492         .next  = dev_seq_next,
2493         .stop  = dev_seq_stop,
2494         .show  = dev_seq_show,
2495 };
2496
2497 static int dev_seq_open(struct inode *inode, struct file *file)
2498 {
2499         return seq_open_net(inode, file, &dev_seq_ops,
2500                             sizeof(struct seq_net_private));
2501 }
2502
2503 static const struct file_operations dev_seq_fops = {
2504         .owner   = THIS_MODULE,
2505         .open    = dev_seq_open,
2506         .read    = seq_read,
2507         .llseek  = seq_lseek,
2508         .release = seq_release_net,
2509 };
2510
2511 static const struct seq_operations softnet_seq_ops = {
2512         .start = softnet_seq_start,
2513         .next  = softnet_seq_next,
2514         .stop  = softnet_seq_stop,
2515         .show  = softnet_seq_show,
2516 };
2517
2518 static int softnet_seq_open(struct inode *inode, struct file *file)
2519 {
2520         return seq_open(file, &softnet_seq_ops);
2521 }
2522
2523 static const struct file_operations softnet_seq_fops = {
2524         .owner   = THIS_MODULE,
2525         .open    = softnet_seq_open,
2526         .read    = seq_read,
2527         .llseek  = seq_lseek,
2528         .release = seq_release,
2529 };
2530
2531 static void *ptype_get_idx(loff_t pos)
2532 {
2533         struct packet_type *pt = NULL;
2534         loff_t i = 0;
2535         int t;
2536
2537         list_for_each_entry_rcu(pt, &ptype_all, list) {
2538                 if (i == pos)
2539                         return pt;
2540                 ++i;
2541         }
2542
2543         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
2544                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
2545                         if (i == pos)
2546                                 return pt;
2547                         ++i;
2548                 }
2549         }
2550         return NULL;
2551 }
2552
2553 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
2554         __acquires(RCU)
2555 {
2556         rcu_read_lock();
2557         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
2558 }
2559
2560 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2561 {
2562         struct packet_type *pt;
2563         struct list_head *nxt;
2564         int hash;
2565
2566         ++*pos;
2567         if (v == SEQ_START_TOKEN)
2568                 return ptype_get_idx(0);
2569
2570         pt = v;
2571         nxt = pt->list.next;
2572         if (pt->type == htons(ETH_P_ALL)) {
2573                 if (nxt != &ptype_all)
2574                         goto found;
2575                 hash = 0;
2576                 nxt = ptype_base[0].next;
2577         } else
2578                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
2579
2580         while (nxt == &ptype_base[hash]) {
2581                 if (++hash >= PTYPE_HASH_SIZE)
2582                         return NULL;
2583                 nxt = ptype_base[hash].next;
2584         }
2585 found:
2586         return list_entry(nxt, struct packet_type, list);
2587 }
2588
2589 static void ptype_seq_stop(struct seq_file *seq, void *v)
2590         __releases(RCU)
2591 {
2592         rcu_read_unlock();
2593 }
2594
/*
 * Print the address @sym symbolically.
 *
 * With CONFIG_KALLSYMS and a successful lookup the output is
 * "symbol+0xoffset", prefixed by ":module:" when a module name is reported.
 * Otherwise the raw pointer is printed as "[%p]".
 */
static void ptype_seq_decode(struct seq_file *seq, void *sym)
{
#ifdef CONFIG_KALLSYMS
	unsigned long offset = 0, symsize;
	const char *symname;
	char *modname;
	char namebuf[128];

	symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset,
				  &modname, namebuf);

	if (symname) {
		char *sep = ":";

		/* No module name: print neither the name nor the colons. */
		if (!modname) {
			sep = "";
			modname = sep;
		}
		seq_printf(seq, "%s%s%s%s+0x%lx", sep, modname, sep,
			   symname, offset);
		return;
	}
#endif

	seq_printf(seq, "[%p]", sym);
}
2619
2620 static int ptype_seq_show(struct seq_file *seq, void *v)
2621 {
2622         struct packet_type *pt = v;
2623
2624         if (v == SEQ_START_TOKEN)
2625                 seq_puts(seq, "Type Device      Function\n");
2626         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
2627                 if (pt->type == htons(ETH_P_ALL))
2628                         seq_puts(seq, "ALL ");
2629                 else
2630                         seq_printf(seq, "%04x", ntohs(pt->type));
2631
2632                 seq_printf(seq, " %-8s ",
2633                            pt->dev ? pt->dev->name : "");
2634                 ptype_seq_decode(seq,  pt->func);
2635                 seq_putc(seq, '\n');
2636         }
2637
2638         return 0;
2639 }
2640
2641 static const struct seq_operations ptype_seq_ops = {
2642         .start = ptype_seq_start,
2643         .next  = ptype_seq_next,
2644         .stop  = ptype_seq_stop,
2645         .show  = ptype_seq_show,
2646 };
2647
2648 static int ptype_seq_open(struct inode *inode, struct file *file)
2649 {
2650         return seq_open_net(inode, file, &ptype_seq_ops,
2651                         sizeof(struct seq_net_private));
2652 }
2653
2654 static const struct file_operations ptype_seq_fops = {
2655         .owner   = THIS_MODULE,
2656         .open    = ptype_seq_open,
2657         .read    = seq_read,
2658         .llseek  = seq_lseek,
2659         .release = seq_release_net,
2660 };
2661
2662
2663 static int __net_init dev_proc_net_init(struct net *net)
2664 {
2665         int rc = -ENOMEM;
2666
2667         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
2668                 goto out;
2669         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
2670                 goto out_dev;
2671         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
2672                 goto out_softnet;
2673
2674         if (wext_proc_init(net))
2675                 goto out_ptype;
2676         rc = 0;
2677 out:
2678         return rc;
2679 out_ptype:
2680         proc_net_remove(net, "ptype");
2681 out_softnet:
2682         proc_net_remove(net, "softnet_stat");
2683 out_dev:
2684         proc_net_remove(net, "dev");
2685         goto out;
2686 }
2687
2688 static void __net_exit dev_proc_net_exit(struct net *net)
2689 {
2690         wext_proc_exit(net);
2691
2692         proc_net_remove(net, "ptype");
2693         proc_net_remove(net, "softnet_stat");
2694         proc_net_remove(net, "dev");
2695 }
2696
2697 static struct pernet_operations __net_initdata dev_proc_ops = {
2698         .init = dev_proc_net_init,
2699         .exit = dev_proc_net_exit,
2700 };
2701
2702 static int __init dev_proc_init(void)
2703 {
2704         return register_pernet_subsys(&dev_proc_ops);
2705 }
2706 #else
2707 #define dev_proc_init() 0
2708 #endif  /* CONFIG_PROC_FS */
2709
2710
2711 /**
2712  *      netdev_set_master       -       set up master/slave pair
2713  *      @slave: slave device
2714  *      @master: new master device
2715  *
2716  *      Changes the master device of the slave. Pass %NULL to break the
2717  *      bonding. The caller must hold the RTNL semaphore. On a failure
2718  *      a negative errno code is returned. On success the reference counts
2719  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2720  *      function returns zero.
2721  */
2722 int netdev_set_master(struct net_device *slave, struct net_device *master)
2723 {
2724         struct net_device *old = slave->master;
2725
2726         ASSERT_RTNL();
2727
2728         if (master) {
2729                 if (old)
2730                         return -EBUSY;
2731                 dev_hold(master);
2732         }
2733
2734         slave->master = master;
2735
2736         synchronize_net();
2737
2738         if (old)
2739                 dev_put(old);
2740
2741         if (master)
2742                 slave->flags |= IFF_SLAVE;
2743         else
2744                 slave->flags &= ~IFF_SLAVE;
2745
2746         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2747         return 0;
2748 }
2749
2750 static void __dev_set_promiscuity(struct net_device *dev, int inc)
2751 {
2752         unsigned short old_flags = dev->flags;
2753
2754         ASSERT_RTNL();
2755
2756         if ((dev->promiscuity += inc) == 0)
2757                 dev->flags &= ~IFF_PROMISC;
2758         else
2759                 dev->flags |= IFF_PROMISC;
2760         if (dev->flags != old_flags) {
2761                 printk(KERN_INFO "device %s %s promiscuous mode\n",
2762                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2763                                                                "left");
2764                 if (audit_enabled)
2765                         audit_log(current->audit_context, GFP_ATOMIC,
2766                                 AUDIT_ANOM_PROMISCUOUS,
2767                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
2768                                 dev->name, (dev->flags & IFF_PROMISC),
2769                                 (old_flags & IFF_PROMISC),
2770                                 audit_get_loginuid(current),
2771                                 current->uid, current->gid,
2772                                 audit_get_sessionid(current));
2773
2774                 if (dev->change_rx_flags)
2775                         dev->change_rx_flags(dev, IFF_PROMISC);
2776         }
2777 }
2778
2779 /**
2780  *      dev_set_promiscuity     - update promiscuity count on a device
2781  *      @dev: device
2782  *      @inc: modifier
2783  *
2784  *      Add or remove promiscuity from a device. While the count in the device
2785  *      remains above zero the interface remains promiscuous. Once it hits zero
2786  *      the device reverts back to normal filtering operation. A negative inc
2787  *      value is used to drop promiscuity on the device.
2788  */
2789 void dev_set_promiscuity(struct net_device *dev, int inc)
2790 {
2791         unsigned short old_flags = dev->flags;
2792
2793         __dev_set_promiscuity(dev, inc);
2794         if (dev->flags != old_flags)
2795                 dev_set_rx_mode(dev);
2796 }
2797
2798 /**
2799  *      dev_set_allmulti        - update allmulti count on a device
2800  *      @dev: device
2801  *      @inc: modifier
2802  *
2803  *      Add or remove reception of all multicast frames to a device. While the
2804  *      count in the device remains above zero the interface remains listening
2805  *      to all interfaces. Once it hits zero the device reverts back to normal
2806  *      filtering operation. A negative @inc value is used to drop the counter
2807  *      when releasing a resource needing all multicasts.
2808  */
2809
2810 void dev_set_allmulti(struct net_device *dev, int inc)
2811 {
2812         unsigned short old_flags = dev->flags;
2813
2814         ASSERT_RTNL();
2815
2816         dev->flags |= IFF_ALLMULTI;
2817         if ((dev->allmulti += inc) == 0)
2818                 dev->flags &= ~IFF_ALLMULTI;
2819         if (dev->flags ^ old_flags) {
2820                 if (dev->change_rx_flags)
2821                         dev->change_rx_flags(dev, IFF_ALLMULTI);
2822                 dev_set_rx_mode(dev);
2823         }
2824 }
2825
2826 /*
2827  *      Upload unicast and multicast address lists to device and
2828  *      configure RX filtering. When the device doesn't support unicast
2829  *      filtering it is put in promiscuous mode while unicast addresses
2830  *      are present.
2831  */
2832 void __dev_set_rx_mode(struct net_device *dev)
2833 {
2834         /* dev_open will call this function so the list will stay sane. */
2835         if (!(dev->flags&IFF_UP))
2836                 return;
2837
2838         if (!netif_device_present(dev))
2839                 return;
2840
2841         if (dev->set_rx_mode)
2842                 dev->set_rx_mode(dev);
2843         else {
2844                 /* Unicast addresses changes may only happen under the rtnl,
2845                  * therefore calling __dev_set_promiscuity here is safe.
2846                  */
2847                 if (dev->uc_count > 0 && !dev->uc_promisc) {
2848                         __dev_set_promiscuity(dev, 1);
2849                         dev->uc_promisc = 1;
2850                 } else if (dev->uc_count == 0 && dev->uc_promisc) {
2851                         __dev_set_promiscuity(dev, -1);
2852                         dev->uc_promisc = 0;
2853                 }
2854
2855                 if (dev->set_multicast_list)
2856                         dev->set_multicast_list(dev);
2857         }
2858 }
2859
/*
 * Locked wrapper around __dev_set_rx_mode(): the RX mode of @dev is
 * reloaded while holding the device's TX lock with bottom halves off.
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_tx_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_tx_unlock_bh(dev);
}
2866
2867 int __dev_addr_delete(struct dev_addr_list **list, int *count,
2868                       void *addr, int alen, int glbl)
2869 {
2870         struct dev_addr_list *da;
2871
2872         for (; (da = *list) != NULL; list = &da->next) {
2873                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2874                     alen == da->da_addrlen) {
2875                         if (glbl) {
2876                                 int old_glbl = da->da_gusers;
2877                                 da->da_gusers = 0;
2878                                 if (old_glbl == 0)
2879                                         break;
2880                         }
2881                         if (--da->da_users)
2882                                 return 0;
2883
2884                         *list = da->next;
2885                         kfree(da);
2886                         (*count)--;
2887                         return 0;
2888                 }
2889         }
2890         return -ENOENT;
2891 }
2892
2893 int __dev_addr_add(struct dev_addr_list **list, int *count,
2894                    void *addr, int alen, int glbl)
2895 {
2896         struct dev_addr_list *da;
2897
2898         for (da = *list; da != NULL; da = da->next) {
2899                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2900                     da->da_addrlen == alen) {
2901                         if (glbl) {
2902                                 int old_glbl = da->da_gusers;
2903                                 da->da_gusers = 1;
2904                                 if (old_glbl)
2905                                         return 0;
2906                         }
2907                         da->da_users++;
2908                         return 0;
2909                 }
2910         }
2911
2912         da = kzalloc(sizeof(*da), GFP_ATOMIC);
2913         if (da == NULL)
2914                 return -ENOMEM;
2915         memcpy(da->da_addr, addr, alen);
2916         da->da_addrlen = alen;
2917         da->da_users = 1;
2918         da->da_gusers = glbl ? 1 : 0;
2919         da->next = *list;
2920         *list = da;
2921         (*count)++;
2922         return 0;
2923 }
2924
2925 /**
2926  *      dev_unicast_delete      - Release secondary unicast address.
2927  *      @dev: device
2928  *      @addr: address to delete
2929  *      @alen: length of @addr
2930  *
2931  *      Release reference to a secondary unicast address and remove it
2932  *      from the device if the reference count drops to zero.
2933  *
2934  *      The caller must hold the rtnl_mutex.
2935  */
2936 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
2937 {
2938         int err;
2939
2940         ASSERT_RTNL();
2941
2942         netif_tx_lock_bh(dev);
2943         err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2944         if (!err)
2945                 __dev_set_rx_mode(dev);
2946         netif_tx_unlock_bh(dev);
2947         return err;
2948 }
2949 EXPORT_SYMBOL(dev_unicast_delete);
2950
2951 /**
2952  *      dev_unicast_add         - add a secondary unicast address
2953  *      @dev: device
2954  *      @addr: address to delete
2955  *      @alen: length of @addr
2956  *
2957  *      Add a secondary unicast address to the device or increase
2958  *      the reference count if it already exists.
2959  *
2960  *      The caller must hold the rtnl_mutex.
2961  */
2962 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
2963 {
2964         int err;
2965
2966         ASSERT_RTNL();
2967
2968         netif_tx_lock_bh(dev);
2969         err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2970         if (!err)
2971                 __dev_set_rx_mode(dev);
2972         netif_tx_unlock_bh(dev);
2973         return err;
2974 }
2975 EXPORT_SYMBOL(dev_unicast_add);
2976
2977 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
2978                     struct dev_addr_list **from, int *from_count)
2979 {
2980         struct dev_addr_list *da, *next;
2981         int err = 0;
2982
2983         da = *from;
2984         while (da != NULL) {
2985                 next = da->next;
2986                 if (!da->da_synced) {
2987                         err = __dev_addr_add(to, to_count,
2988                                              da->da_addr, da->da_addrlen, 0);
2989                         if (err < 0)
2990                                 break;
2991                         da->da_synced = 1;
2992                         da->da_users++;
2993                 } else if (da->da_users == 1) {
2994                         __dev_addr_delete(to, to_count,
2995                                           da->da_addr, da->da_addrlen, 0);
2996                         __dev_addr_delete(from, from_count,
2997                                           da->da_addr, da->da_addrlen, 0);
2998                 }
2999                 da = next;
3000         }
3001         return err;
3002 }
3003
3004 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3005                        struct dev_addr_list **from, int *from_count)
3006 {
3007         struct dev_addr_list *da, *next;
3008
3009         da = *from;
3010         while (da != NULL) {
3011                 next = da->next;
3012                 if (da->da_synced) {
3013                         __dev_addr_delete(to, to_count,
3014                                           da->da_addr, da->da_addrlen, 0);
3015                         da->da_synced = 0;
3016                         __dev_addr_delete(from, from_count,
3017                                           da->da_addr, da->da_addrlen, 0);
3018                 }
3019                 da = next;
3020         }
3021 }
3022
3023 /**
3024  *      dev_unicast_sync - Synchronize device's unicast list to another device
3025  *      @to: destination device
3026  *      @from: source device
3027  *
3028  *      Add newly added addresses to the destination device and release
3029  *      addresses that have no users left. The source device must be
3030  *      locked by netif_tx_lock_bh.
3031  *
3032  *      This function is intended to be called from the dev->set_rx_mode
3033  *      function of layered software devices.
3034  */
3035 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3036 {
3037         int err = 0;
3038
3039         netif_tx_lock_bh(to);
3040         err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3041                               &from->uc_list, &from->uc_count);
3042         if (!err)
3043                 __dev_set_rx_mode(to);
3044         netif_tx_unlock_bh(to);
3045         return err;
3046 }
3047 EXPORT_SYMBOL(dev_unicast_sync);
3048
3049 /**
3050  *      dev_unicast_unsync - Remove synchronized addresses from the destination device
3051  *      @to: destination device
3052  *      @from: source device
3053  *
3054  *      Remove all addresses that were added to the destination device by
3055  *      dev_unicast_sync(). This function is intended to be called from the
3056  *      dev->stop function of layered software devices.
3057  */
3058 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3059 {
3060         netif_tx_lock_bh(from);
3061         netif_tx_lock_bh(to);
3062
3063         __dev_addr_unsync(&to->uc_list, &to->uc_count,
3064                           &from->uc_list, &from->uc_count);
3065         __dev_set_rx_mode(to);
3066
3067         netif_tx_unlock_bh(to);
3068         netif_tx_unlock_bh(from);
3069 }
3070 EXPORT_SYMBOL(dev_unicast_unsync);
3071
3072 static void __dev_addr_discard(struct dev_addr_list **list)
3073 {
3074         struct dev_addr_list *tmp;
3075
3076         while (*list != NULL) {
3077                 tmp = *list;
3078                 *list = tmp->next;
3079                 if (tmp->da_users > tmp->da_gusers)
3080                         printk("__dev_addr_discard: address leakage! "
3081                                "da_users=%d\n", tmp->da_users);
3082                 kfree(tmp);
3083         }
3084 }
3085
3086 static void dev_addr_discard(struct net_device *dev)
3087 {
3088         netif_tx_lock_bh(dev);
3089
3090         __dev_addr_discard(&dev->uc_list);
3091         dev->uc_count = 0;
3092
3093         __dev_addr_discard(&dev->mc_list);
3094         dev->mc_count = 0;
3095
3096         netif_tx_unlock_bh(dev);
3097 }
3098
3099 unsigned dev_get_flags(const struct net_device *dev)
3100 {
3101         unsigned flags;
3102
3103         flags = (dev->flags & ~(IFF_PROMISC |
3104                                 IFF_ALLMULTI |
3105                                 IFF_RUNNING |
3106                                 IFF_LOWER_UP |
3107                                 IFF_DORMANT)) |
3108                 (dev->gflags & (IFF_PROMISC |
3109                                 IFF_ALLMULTI));
3110
3111         if (netif_running(dev)) {
3112                 if (netif_oper_up(dev))
3113                         flags |= IFF_RUNNING;
3114                 if (netif_carrier_ok(dev))
3115                         flags |= IFF_LOWER_UP;
3116                 if (netif_dormant(dev))
3117                         flags |= IFF_DORMANT;
3118         }
3119
3120         return flags;
3121 }
3122
3123 int dev_change_flags(struct net_device *dev, unsigned flags)
3124 {
3125         int ret, changes;
3126         int old_flags = dev->flags;
3127
3128         ASSERT_RTNL();
3129
3130         /*
3131          *      Set the flags on our device.
3132          */
3133
3134         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3135                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3136                                IFF_AUTOMEDIA)) |
3137                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3138                                     IFF_ALLMULTI));
3139
3140         /*
3141          *      Load in the correct multicast list now the flags have changed.
3142          */
3143
3144         if (dev->change_rx_flags && (dev->flags ^ flags) & IFF_MULTICAST)
3145                 dev->change_rx_flags(dev, IFF_MULTICAST);
3146
3147         dev_set_rx_mode(dev);
3148
3149         /*
3150          *      Have we downed the interface. We handle IFF_UP ourselves
3151          *      according to user attempts to set it, rather than blindly
3152          *      setting it.
3153          */
3154
3155         ret = 0;
3156         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
3157                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3158
3159                 if (!ret)
3160                         dev_set_rx_mode(dev);
3161         }
3162
3163         if (dev->flags & IFF_UP &&
3164             ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3165                                           IFF_VOLATILE)))
3166                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
3167
3168         if ((flags ^ dev->gflags) & IFF_PROMISC) {
3169                 int inc = (flags & IFF_PROMISC) ? +1 : -1;
3170                 dev->gflags ^= IFF_PROMISC;
3171                 dev_set_promiscuity(dev, inc);
3172         }
3173
3174         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3175            is important. Some (broken) drivers set IFF_PROMISC, when
3176            IFF_ALLMULTI is requested not asking us and not reporting.
3177          */
3178         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3179                 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3180                 dev->gflags ^= IFF_ALLMULTI;
3181                 dev_set_allmulti(dev, inc);
3182         }
3183
3184         /* Exclude state transition flags, already notified */
3185         changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3186         if (changes)
3187                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3188
3189         return ret;
3190 }
3191
3192 int dev_set_mtu(struct net_device *dev, int new_mtu)
3193 {
3194         int err;
3195
3196         if (new_mtu == dev->mtu)
3197                 return 0;
3198
3199         /*      MTU must be positive.    */
3200         if (new_mtu < 0)
3201                 return -EINVAL;
3202
3203         if (!netif_device_present(dev))
3204                 return -ENODEV;
3205
3206         err = 0;
3207         if (dev->change_mtu)
3208                 err = dev->change_mtu(dev, new_mtu);
3209         else
3210                 dev->mtu = new_mtu;
3211         if (!err && dev->flags & IFF_UP)
3212                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3213         return err;
3214 }
3215
3216 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3217 {
3218         int err;
3219
3220         if (!dev->set_mac_address)
3221                 return -EOPNOTSUPP;
3222         if (sa->sa_family != dev->type)
3223                 return -EINVAL;
3224         if (!netif_device_present(dev))
3225                 return -ENODEV;
3226         err = dev->set_mac_address(dev, sa);
3227         if (!err)
3228                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3229         return err;
3230 }
3231
3232 /*
3233  *      Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3234  */
3235 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3236 {
3237         int err;
3238         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3239
3240         if (!dev)
3241                 return -ENODEV;
3242
3243         switch (cmd) {
3244                 case SIOCGIFFLAGS:      /* Get interface flags */
3245                         ifr->ifr_flags = dev_get_flags(dev);
3246                         return 0;
3247
3248                 case SIOCGIFMETRIC:     /* Get the metric on the interface
3249                                            (currently unused) */
3250                         ifr->ifr_metric = 0;
3251                         return 0;
3252
3253                 case SIOCGIFMTU:        /* Get the MTU of a device */
3254                         ifr->ifr_mtu = dev->mtu;
3255                         return 0;
3256
3257                 case SIOCGIFHWADDR:
3258                         if (!dev->addr_len)
3259                                 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3260                         else
3261                                 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3262                                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3263                         ifr->ifr_hwaddr.sa_family = dev->type;
3264                         return 0;
3265
3266                 case SIOCGIFSLAVE:
3267                         err = -EINVAL;
3268                         break;
3269
3270                 case SIOCGIFMAP:
3271                         ifr->ifr_map.mem_start = dev->mem_start;
3272                         ifr->ifr_map.mem_end   = dev->mem_end;
3273                         ifr->ifr_map.base_addr = dev->base_addr;
3274                         ifr->ifr_map.irq       = dev->irq;
3275                         ifr->ifr_map.dma       = dev->dma;
3276                         ifr->ifr_map.port      = dev->if_port;
3277                         return 0;
3278
3279                 case SIOCGIFINDEX:
3280                         ifr->ifr_ifindex = dev->ifindex;
3281                         return 0;
3282
3283                 case SIOCGIFTXQLEN:
3284                         ifr->ifr_qlen = dev->tx_queue_len;
3285                         return 0;
3286
3287                 default:
3288                         /* dev_ioctl() should ensure this case
3289                          * is never reached
3290                          */
3291                         WARN_ON(1);
3292                         err = -EINVAL;
3293                         break;
3294
3295         }
3296         return err;
3297 }
3298
3299 /*
3300  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
3301  */
3302 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3303 {
3304         int err;
3305         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3306
3307         if (!dev)
3308                 return -ENODEV;
3309
3310         switch (cmd) {
3311                 case SIOCSIFFLAGS:      /* Set interface flags */
3312                         return dev_change_flags(dev, ifr->ifr_flags);
3313
3314                 case SIOCSIFMETRIC:     /* Set the metric on the interface
3315                                            (currently unused) */
3316                         return -EOPNOTSUPP;
3317
3318                 case SIOCSIFMTU:        /* Set the MTU of a device */
3319                         return dev_set_mtu(dev, ifr->ifr_mtu);
3320
3321                 case SIOCSIFHWADDR:
3322                         return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3323
3324                 case SIOCSIFHWBROADCAST:
3325                         if (ifr->ifr_hwaddr.sa_family != dev->type)
3326                                 return -EINVAL;
3327                         memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3328                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3329                         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3330                         return 0;
3331
3332                 case SIOCSIFMAP:
3333                         if (dev->set_config) {
3334                                 if (!netif_device_present(dev))
3335                                         return -ENODEV;
3336                                 return dev->set_config(dev, &ifr->ifr_map);
3337                         }
3338                         return -EOPNOTSUPP;
3339
3340                 case SIOCADDMULTI:
3341                         if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
3342                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3343                                 return -EINVAL;
3344                         if (!netif_device_present(dev))
3345                                 return -ENODEV;
3346                         return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3347                                           dev->addr_len, 1);
3348
3349                 case SIOCDELMULTI:
3350                         if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
3351                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3352                                 return -EINVAL;
3353                         if (!netif_device_present(dev))
3354                                 return -ENODEV;
3355                         return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3356                                              dev->addr_len, 1);
3357
3358                 case SIOCSIFTXQLEN:
3359                         if (ifr->ifr_qlen < 0)
3360                                 return -EINVAL;
3361                         dev->tx_queue_len = ifr->ifr_qlen;
3362                         return 0;
3363
3364                 case SIOCSIFNAME:
3365                         ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3366                         return dev_change_name(dev, ifr->ifr_newname);
3367
3368                 /*
3369                  *      Unknown or private ioctl
3370                  */
3371
3372                 default:
3373                         if ((cmd >= SIOCDEVPRIVATE &&
3374                             cmd <= SIOCDEVPRIVATE + 15) ||
3375                             cmd == SIOCBONDENSLAVE ||
3376                             cmd == SIOCBONDRELEASE ||
3377                             cmd == SIOCBONDSETHWADDR ||
3378                             cmd == SIOCBONDSLAVEINFOQUERY ||
3379                             cmd == SIOCBONDINFOQUERY ||
3380                             cmd == SIOCBONDCHANGEACTIVE ||
3381                             cmd == SIOCGMIIPHY ||
3382                             cmd == SIOCGMIIREG ||
3383                             cmd == SIOCSMIIREG ||
3384                             cmd == SIOCBRADDIF ||
3385                             cmd == SIOCBRDELIF ||
3386                             cmd == SIOCWANDEV) {
3387                                 err = -EOPNOTSUPP;
3388                                 if (dev->do_ioctl) {
3389                                         if (netif_device_present(dev))
3390                                                 err = dev->do_ioctl(dev, ifr,
3391                                                                     cmd);
3392                                         else
3393                                                 err = -ENODEV;
3394                                 }
3395                         } else
3396                                 err = -EINVAL;
3397
3398         }
3399         return err;
3400 }
3401
3402 /*
3403  *      This function handles all "interface"-type I/O control requests. The actual
3404  *      'doing' part of this is dev_ifsioc above.
3405  */
3406
3407 /**
3408  *      dev_ioctl       -       network device ioctl
3409  *      @net: the applicable net namespace
3410  *      @cmd: command to issue
3411  *      @arg: pointer to a struct ifreq in user space
3412  *
3413  *      Issue ioctl functions to devices. This is normally called by the
3414  *      user space syscall interfaces but can sometimes be useful for
3415  *      other purposes. The return value is the return from the syscall if
3416  *      positive or a negative errno code on error.
3417  */
3418
3419 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3420 {
3421         struct ifreq ifr;
3422         int ret;
3423         char *colon;
3424
3425         /* One special case: SIOCGIFCONF takes ifconf argument
3426            and requires shared lock, because it sleeps writing
3427            to user space.
3428          */
3429
3430         if (cmd == SIOCGIFCONF) {
3431                 rtnl_lock();
3432                 ret = dev_ifconf(net, (char __user *) arg);
3433                 rtnl_unlock();
3434                 return ret;
3435         }
3436         if (cmd == SIOCGIFNAME)
3437                 return dev_ifname(net, (struct ifreq __user *)arg);
3438
3439         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3440                 return -EFAULT;
3441
3442         ifr.ifr_name[IFNAMSIZ-1] = 0;
3443
3444         colon = strchr(ifr.ifr_name, ':');
3445         if (colon)
3446                 *colon = 0;
3447
3448         /*
3449          *      See which interface the caller is talking about.
3450          */
3451
3452         switch (cmd) {
3453                 /*
3454                  *      These ioctl calls:
3455                  *      - can be done by all.
3456                  *      - atomic and do not require locking.
3457                  *      - return a value
3458                  */
3459                 case SIOCGIFFLAGS:
3460                 case SIOCGIFMETRIC:
3461                 case SIOCGIFMTU:
3462                 case SIOCGIFHWADDR:
3463                 case SIOCGIFSLAVE:
3464                 case SIOCGIFMAP:
3465                 case SIOCGIFINDEX:
3466                 case SIOCGIFTXQLEN:
3467                         dev_load(net, ifr.ifr_name);
3468                         read_lock(&dev_base_lock);
3469                         ret = dev_ifsioc_locked(net, &ifr, cmd);
3470                         read_unlock(&dev_base_lock);
3471                         if (!ret) {
3472                                 if (colon)
3473                                         *colon = ':';
3474                                 if (copy_to_user(arg, &ifr,
3475                                                  sizeof(struct ifreq)))
3476                                         ret = -EFAULT;
3477                         }
3478                         return ret;
3479
3480                 case SIOCETHTOOL:
3481                         dev_load(net, ifr.ifr_name);
3482                         rtnl_lock();
3483                         ret = dev_ethtool(net, &ifr);
3484                         rtnl_unlock();
3485                         if (!ret) {
3486                                 if (colon)
3487                                         *colon = ':';
3488                                 if (copy_to_user(arg, &ifr,
3489                                                  sizeof(struct ifreq)))
3490                                         ret = -EFAULT;
3491                         }
3492                         return ret;
3493
3494                 /*
3495                  *      These ioctl calls:
3496                  *      - require superuser power.
3497                  *      - require strict serialization.
3498                  *      - return a value
3499                  */
3500                 case SIOCGMIIPHY:
3501                 case SIOCGMIIREG:
3502                 case SIOCSIFNAME:
3503                         if (!capable(CAP_NET_ADMIN))
3504                                 return -EPERM;
3505                         dev_load(net, ifr.ifr_name);
3506                         rtnl_lock();
3507                         ret = dev_ifsioc(net, &ifr, cmd);
3508                         rtnl_unlock();
3509                         if (!ret) {
3510                                 if (colon)
3511                                         *colon = ':';
3512                                 if (copy_to_user(arg, &ifr,
3513                                                  sizeof(struct ifreq)))
3514                                         ret = -EFAULT;
3515                         }
3516                         return ret;
3517
3518                 /*
3519                  *      These ioctl calls:
3520                  *      - require superuser power.
3521                  *      - require strict serialization.
3522                  *      - do not return a value
3523                  */
3524                 case SIOCSIFFLAGS:
3525                 case SIOCSIFMETRIC:
3526                 case SIOCSIFMTU:
3527                 case SIOCSIFMAP:
3528                 case SIOCSIFHWADDR:
3529                 case SIOCSIFSLAVE:
3530                 case SIOCADDMULTI:
3531                 case SIOCDELMULTI:
3532                 case SIOCSIFHWBROADCAST:
3533                 case SIOCSIFTXQLEN:
3534                 case SIOCSMIIREG:
3535                 case SIOCBONDENSLAVE:
3536                 case SIOCBONDRELEASE:
3537                 case SIOCBONDSETHWADDR:
3538                 case SIOCBONDCHANGEACTIVE:
3539                 case SIOCBRADDIF:
3540                 case SIOCBRDELIF:
3541                         if (!capable(CAP_NET_ADMIN))
3542                                 return -EPERM;
3543                         /* fall through */
3544                 case SIOCBONDSLAVEINFOQUERY:
3545                 case SIOCBONDINFOQUERY:
3546                         dev_load(net, ifr.ifr_name);
3547                         rtnl_lock();
3548                         ret = dev_ifsioc(net, &ifr, cmd);
3549                         rtnl_unlock();
3550                         return ret;
3551
3552                 case SIOCGIFMEM:
3553                         /* Get the per device memory space. We can add this but
3554                          * currently do not support it */
3555                 case SIOCSIFMEM:
3556                         /* Set the per device memory buffer space.
3557                          * Not applicable in our case */
3558                 case SIOCSIFLINK:
3559                         return -EINVAL;
3560
3561                 /*
3562                  *      Unknown or private ioctl.
3563                  */
3564                 default:
3565                         if (cmd == SIOCWANDEV ||
3566                             (cmd >= SIOCDEVPRIVATE &&
3567                              cmd <= SIOCDEVPRIVATE + 15)) {
3568                                 dev_load(net, ifr.ifr_name);
3569                                 rtnl_lock();
3570                                 ret = dev_ifsioc(net, &ifr, cmd);
3571                                 rtnl_unlock();
3572                                 if (!ret && copy_to_user(arg, &ifr,
3573                                                          sizeof(struct ifreq)))
3574                                         ret = -EFAULT;
3575                                 return ret;
3576                         }
3577                         /* Take care of Wireless Extensions */
3578                         if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
3579                                 return wext_handle_ioctl(net, &ifr, cmd, arg);
3580                         return -EINVAL;
3581         }
3582 }
3583
3584
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number: the next positive value of a function-static counter for
 *	which __dev_get_by_index() finds no device in @net. The caller must
 *	hold the rtnl semaphore or the dev_base_lock to be sure it remains
 *	unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		/* Restart at 1 once the counter is no longer positive. */
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
3603
3604 /* Delayed registration/unregisteration */
3605 static DEFINE_SPINLOCK(net_todo_list_lock);
3606 static LIST_HEAD(net_todo_list);
3607
3608 static void net_set_todo(struct net_device *dev)
3609 {
3610         spin_lock(&net_todo_list_lock);
3611         list_add_tail(&dev->todo_list, &net_todo_list);
3612         spin_unlock(&net_todo_list_lock);
3613 }
3614
3615 static void rollback_registered(struct net_device *dev)
3616 {
3617         BUG_ON(dev_boot_phase);
3618         ASSERT_RTNL();
3619
3620         /* Some devices call without registering for initialization unwind. */
3621         if (dev->reg_state == NETREG_UNINITIALIZED) {
3622                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3623                                   "was registered\n", dev->name, dev);
3624
3625                 WARN_ON(1);
3626                 return;
3627         }
3628
3629         BUG_ON(dev->reg_state != NETREG_REGISTERED);
3630
3631         /* If device is running, close it first. */
3632         dev_close(dev);
3633
3634         /* And unlink it from device chain. */
3635         unlist_netdevice(dev);
3636
3637         dev->reg_state = NETREG_UNREGISTERING;
3638
3639         synchronize_net();
3640
3641         /* Shutdown queueing discipline. */
3642         dev_shutdown(dev);
3643
3644
3645         /* Notify protocols, that we are about to destroy
3646            this device. They should clean all the things.
3647         */
3648         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3649
3650         /*
3651          *      Flush the unicast and multicast chains
3652          */
3653         dev_addr_discard(dev);
3654
3655         if (dev->uninit)
3656                 dev->uninit(dev);
3657
3658         /* Notifier chain MUST detach us from master device. */
3659         BUG_TRAP(!dev->master);
3660
3661         /* Remove entries from kobject tree */
3662         netdev_unregister_kobject(dev);
3663
3664         synchronize_net();
3665
3666         dev_put(dev);
3667 }
3668
3669 /**
3670  *      register_netdevice      - register a network device
3671  *      @dev: device to register
3672  *
3673  *      Take a completed network device structure and add it to the kernel
3674  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3675  *      chain. 0 is returned on success. A negative errno code is returned
3676  *      on a failure to set up the device, or if the name is a duplicate.
3677  *
3678  *      Callers must hold the rtnl semaphore. You may want
3679  *      register_netdev() instead of this.
3680  *
3681  *      BUGS:
3682  *      The locking appears insufficient to guarantee two parallel registers
3683  *      will not get the same name.
3684  */
3685
3686 int register_netdevice(struct net_device *dev)
3687 {
3688         struct hlist_head *head;
3689         struct hlist_node *p;
3690         int ret;
3691         struct net *net;
3692
3693         BUG_ON(dev_boot_phase);
3694         ASSERT_RTNL();
3695
3696         might_sleep();
3697
3698         /* When net_device's are persistent, this will be fatal. */
3699         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
3700         BUG_ON(!dev_net(dev));
3701         net = dev_net(dev);
3702
3703         spin_lock_init(&dev->queue_lock);
3704         spin_lock_init(&dev->_xmit_lock);
3705         netdev_set_lockdep_class(&dev->_xmit_lock, dev->type);
3706         dev->xmit_lock_owner = -1;
3707         spin_lock_init(&dev->ingress_lock);
3708
3709         dev->iflink = -1;
3710
3711         /* Init, if this function is available */
3712         if (dev->init) {
3713                 ret = dev->init(dev);
3714                 if (ret) {
3715                         if (ret > 0)
3716                                 ret = -EIO;
3717                         goto out;
3718                 }
3719         }
3720
3721         if (!dev_valid_name(dev->name)) {
3722                 ret = -EINVAL;
3723                 goto err_uninit;
3724         }
3725
3726         dev->ifindex = dev_new_index(net);
3727         if (dev->iflink == -1)
3728                 dev->iflink = dev->ifindex;
3729
3730         /* Check for existence of name */
3731         head = dev_name_hash(net, dev->name);
3732         hlist_for_each(p, head) {
3733                 struct net_device *d
3734                         = hlist_entry(p, struct net_device, name_hlist);
3735                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3736                         ret = -EEXIST;
3737                         goto err_uninit;
3738                 }
3739         }
3740
3741         /* Fix illegal checksum combinations */
3742         if ((dev->features & NETIF_F_HW_CSUM) &&
3743             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3744                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
3745                        dev->name);
3746                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
3747         }
3748
3749         if ((dev->features & NETIF_F_NO_CSUM) &&
3750             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3751                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
3752                        dev->name);
3753                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
3754         }
3755
3756
3757         /* Fix illegal SG+CSUM combinations. */
3758         if ((dev->features & NETIF_F_SG) &&
3759             !(dev->features & NETIF_F_ALL_CSUM)) {
3760                 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3761                        dev->name);
3762                 dev->features &= ~NETIF_F_SG;
3763         }
3764
3765         /* TSO requires that SG is present as well. */
3766         if ((dev->features & NETIF_F_TSO) &&
3767             !(dev->features & NETIF_F_SG)) {
3768                 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3769                        dev->name);
3770                 dev->features &= ~NETIF_F_TSO;
3771         }
3772         if (dev->features & NETIF_F_UFO) {
3773                 if (!(dev->features & NETIF_F_HW_CSUM)) {
3774                         printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3775                                         "NETIF_F_HW_CSUM feature.\n",
3776                                                         dev->name);
3777                         dev->features &= ~NETIF_F_UFO;
3778                 }
3779                 if (!(dev->features & NETIF_F_SG)) {
3780                         printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3781                                         "NETIF_F_SG feature.\n",
3782                                         dev->name);
3783                         dev->features &= ~NETIF_F_UFO;
3784                 }
3785         }
3786
3787         netdev_initialize_kobject(dev);
3788         ret = netdev_register_kobject(dev);
3789         if (ret)
3790                 goto err_uninit;
3791         dev->reg_state = NETREG_REGISTERED;
3792
3793         /*
3794          *      Default initial state at registry is that the
3795          *      device is present.
3796          */
3797
3798         set_bit(__LINK_STATE_PRESENT, &dev->state);
3799
3800         dev_init_scheduler(dev);
3801         dev_hold(dev);
3802         list_netdevice(dev);
3803
3804         /* Notify protocols, that a new device appeared. */
3805         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
3806         ret = notifier_to_errno(ret);
3807         if (ret) {
3808                 rollback_registered(dev);
3809                 dev->reg_state = NETREG_UNREGISTERED;
3810         }
3811
3812 out:
3813         return ret;
3814
3815 err_uninit:
3816         if (dev->uninit)
3817                 dev->uninit(dev);
3818         goto out;
3819 }
3820
3821 /**
3822  *      register_netdev - register a network device
3823  *      @dev: device to register
3824  *
3825  *      Take a completed network device structure and add it to the kernel
3826  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3827  *      chain. 0 is returned on success. A negative errno code is returned
3828  *      on a failure to set up the device, or if the name is a duplicate.
3829  *
3830  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
3831  *      and expands the device name if you passed a format string to
3832  *      alloc_netdev.
3833  */
3834 int register_netdev(struct net_device *dev)
3835 {
3836         int err;
3837
3838         rtnl_lock();
3839
3840         /*
3841          * If the name is a format string the caller wants us to do a
3842          * name allocation.
3843          */
3844         if (strchr(dev->name, '%')) {
3845                 err = dev_alloc_name(dev, dev->name);
3846                 if (err < 0)
3847                         goto out;
3848         }
3849
3850         err = register_netdevice(dev);
3851 out:
3852         rtnl_unlock();
3853         return err;
3854 }
3855 EXPORT_SYMBOL(register_netdev);
3856
3857 /*
3858  * netdev_wait_allrefs - wait until all references are gone.
3859  *
3860  * This is called when unregistering network devices.
3861  *
3862  * Any protocol or device that holds a reference should register
3863  * for netdevice notification, and cleanup and put back the
3864  * reference if they receive an UNREGISTER event.
3865  * We can get stuck here if buggy protocols don't correctly
3866  * call dev_put.
3867  */
3868 static void netdev_wait_allrefs(struct net_device *dev)
3869 {
3870         unsigned long rebroadcast_time, warning_time;
3871
3872         rebroadcast_time = warning_time = jiffies;
3873         while (atomic_read(&dev->refcnt) != 0) {
3874                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
3875                         rtnl_lock();
3876
3877                         /* Rebroadcast unregister notification */
3878                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3879
3880                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
3881                                      &dev->state)) {
3882                                 /* We must not have linkwatch events
3883                                  * pending on unregister. If this
3884                                  * happens, we simply run the queue
3885                                  * unscheduled, resulting in a noop
3886                                  * for this device.
3887                                  */
3888                                 linkwatch_run_queue();
3889                         }
3890
3891                         __rtnl_unlock();
3892
3893                         rebroadcast_time = jiffies;
3894                 }
3895
3896                 msleep(250);
3897
3898                 if (time_after(jiffies, warning_time + 10 * HZ)) {
3899                         printk(KERN_EMERG "unregister_netdevice: "
3900                                "waiting for %s to become free. Usage "
3901                                "count = %d\n",
3902                                dev->name, atomic_read(&dev->refcnt));
3903                         warning_time = jiffies;
3904                 }
3905         }
3906 }
3907
3908 /* The sequence is:
3909  *
3910  *      rtnl_lock();
3911  *      ...
3912  *      register_netdevice(x1);
3913  *      register_netdevice(x2);
3914  *      ...
3915  *      unregister_netdevice(y1);
3916  *      unregister_netdevice(y2);
3917  *      ...
3918  *      rtnl_unlock();
3919  *      free_netdev(y1);
3920  *      free_netdev(y2);
3921  *
3922  * We are invoked by rtnl_unlock() after it drops the semaphore.
3923  * This allows us to deal with problems:
3924  * 1) We can delete sysfs objects which invoke hotplug
3925  *    without deadlocking with linkwatch via keventd.
3926  * 2) Since we run with the RTNL semaphore not held, we can sleep
3927  *    safely in order to wait for the netdev refcnt to drop to zero.
3928  */
3929 static DEFINE_MUTEX(net_todo_run_mutex);
3930 void netdev_run_todo(void)
3931 {
3932         struct list_head list;
3933
3934         /* Need to guard against multiple cpu's getting out of order. */
3935         mutex_lock(&net_todo_run_mutex);
3936
3937         /* Not safe to do outside the semaphore.  We must not return
3938          * until all unregister events invoked by the local processor
3939          * have been completed (either by this todo run, or one on
3940          * another cpu).
3941          */
3942         if (list_empty(&net_todo_list))
3943                 goto out;
3944
3945         /* Snapshot list, allow later requests */
3946         spin_lock(&net_todo_list_lock);
3947         list_replace_init(&net_todo_list, &list);
3948         spin_unlock(&net_todo_list_lock);
3949
3950         while (!list_empty(&list)) {
3951                 struct net_device *dev
3952                         = list_entry(list.next, struct net_device, todo_list);
3953                 list_del(&dev->todo_list);
3954
3955                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
3956                         printk(KERN_ERR "network todo '%s' but state %d\n",
3957                                dev->name, dev->reg_state);
3958                         dump_stack();
3959                         continue;
3960                 }
3961
3962                 dev->reg_state = NETREG_UNREGISTERED;
3963
3964                 netdev_wait_allrefs(dev);
3965
3966                 /* paranoia */
3967                 BUG_ON(atomic_read(&dev->refcnt));
3968                 BUG_TRAP(!dev->ip_ptr);
3969                 BUG_TRAP(!dev->ip6_ptr);
3970                 BUG_TRAP(!dev->dn_ptr);
3971
3972                 if (dev->destructor)
3973                         dev->destructor(dev);
3974
3975                 /* Free network device */
3976                 kobject_put(&dev->dev.kobj);
3977         }
3978
3979 out:
3980         mutex_unlock(&net_todo_run_mutex);
3981 }
3982
3983 static struct net_device_stats *internal_stats(struct net_device *dev)
3984 {
3985         return &dev->stats;
3986 }
3987
3988 /**
3989  *      alloc_netdev_mq - allocate network device
3990  *      @sizeof_priv:   size of private data to allocate space for
3991  *      @name:          device name format string
3992  *      @setup:         callback to initialize device
3993  *      @queue_count:   the number of subqueues to allocate
3994  *
3995  *      Allocates a struct net_device with private data area for driver use
3996  *      and performs basic initialization.  Also allocates subquue structs
3997  *      for each queue on the device at the end of the netdevice.
3998  */
3999 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4000                 void (*setup)(struct net_device *), unsigned int queue_count)
4001 {
4002         void *p;
4003         struct net_device *dev;
4004         int alloc_size;
4005
4006         BUG_ON(strlen(name) >= sizeof(dev->name));
4007
4008         alloc_size = sizeof(struct net_device) +
4009                      sizeof(struct net_device_subqueue) * (queue_count - 1);
4010         if (sizeof_priv) {
4011                 /* ensure 32-byte alignment of private area */
4012                 alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4013                 alloc_size += sizeof_priv;
4014         }
4015         /* ensure 32-byte alignment of whole construct */
4016         alloc_size += NETDEV_ALIGN_CONST;
4017
4018         p = kzalloc(alloc_size, GFP_KERNEL);
4019         if (!p) {
4020                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4021                 return NULL;
4022         }
4023
4024         dev = (struct net_device *)
4025                 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4026         dev->padded = (char *)dev - (char *)p;
4027         dev_net_set(dev, &init_net);
4028
4029         if (sizeof_priv) {
4030                 dev->priv = ((char *)dev +
4031                              ((sizeof(struct net_device) +
4032                                (sizeof(struct net_device_subqueue) *
4033                                 (queue_count - 1)) + NETDEV_ALIGN_CONST)
4034                               & ~NETDEV_ALIGN_CONST));
4035         }
4036
4037         dev->egress_subqueue_count = queue_count;
4038         dev->gso_max_size = GSO_MAX_SIZE;
4039
4040         dev->get_stats = internal_stats;
4041         netpoll_netdev_init(dev);
4042         setup(dev);
4043         strcpy(dev->name, name);
4044         return dev;
4045 }
4046 EXPORT_SYMBOL(alloc_netdev_mq);
4047
4048 /**
4049  *      free_netdev - free network device
4050  *      @dev: device
4051  *
4052  *      This function does the last stage of destroying an allocated device
4053  *      interface. The reference to the device object is released.
4054  *      If this is the last reference then it will be freed.
4055  */
4056 void free_netdev(struct net_device *dev)
4057 {
4058         release_net(dev_net(dev));
4059
4060         /*  Compatibility with error handling in drivers */
4061         if (dev->reg_state == NETREG_UNINITIALIZED) {
4062                 kfree((char *)dev - dev->padded);
4063                 return;
4064         }
4065
4066         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4067         dev->reg_state = NETREG_RELEASED;
4068
4069         /* will free via device release */
4070         put_device(&dev->dev);
4071 }
4072
/*
 * synchronize_net - synchronize with packet receive processing.
 *
 * May sleep: this is a might_sleep() check followed by synchronize_rcu().
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
4079
/**
 *	unregister_netdevice - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables. The device is then queued on the todo
 *	list so that the rest of the work happens in netdev_run_todo().
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	rollback_registered(dev);

	/* Finish processing unregister after unlock */
	net_set_todo(dev);
}
4099
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore around the call.  In general you want to use
 *	this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
4119
4120 /**
4121  *      dev_change_net_namespace - move device to different nethost namespace
4122  *      @dev: device
4123  *      @net: network namespace
4124  *      @pat: If not NULL name pattern to try if the current device name
4125  *            is already taken in the destination network namespace.
4126  *
4127  *      This function shuts down a device interface and moves it
4128  *      to a new network namespace. On success 0 is returned, on
4129  *      a failure a netagive errno code is returned.
4130  *
4131  *      Callers must hold the rtnl semaphore.
4132  */
4133
4134 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4135 {
4136         char buf[IFNAMSIZ];
4137         const char *destname;
4138         int err;
4139
4140         ASSERT_RTNL();
4141
4142         /* Don't allow namespace local devices to be moved. */
4143         err = -EINVAL;
4144         if (dev->features & NETIF_F_NETNS_LOCAL)
4145                 goto out;
4146
4147         /* Ensure the device has been registrered */
4148         err = -EINVAL;
4149         if (dev->reg_state != NETREG_REGISTERED)
4150                 goto out;
4151
4152         /* Get out if there is nothing todo */
4153         err = 0;
4154         if (net_eq(dev_net(dev), net))
4155                 goto out;
4156
4157         /* Pick the destination device name, and ensure
4158          * we can use it in the destination network namespace.
4159          */
4160         err = -EEXIST;
4161         destname = dev->name;
4162         if (__dev_get_by_name(net, destname)) {
4163                 /* We get here if we can't use the current device name */
4164                 if (!pat)
4165                         goto out;
4166                 if (!dev_valid_name(pat))
4167                         goto out;
4168                 if (strchr(pat, '%')) {
4169                         if (__dev_alloc_name(net, pat, buf) < 0)
4170                                 goto out;
4171                         destname = buf;
4172                 } else
4173                         destname = pat;
4174                 if (__dev_get_by_name(net, destname))
4175                         goto out;
4176         }
4177
4178         /*
4179          * And now a mini version of register_netdevice unregister_netdevice.
4180          */
4181
4182         /* If device is running close it first. */
4183         dev_close(dev);
4184
4185         /* And unlink it from device chain */
4186         err = -ENODEV;
4187         unlist_netdevice(dev);
4188
4189         synchronize_net();
4190
4191         /* Shutdown queueing discipline. */
4192         dev_shutdown(dev);
4193
4194         /* Notify protocols, that we are about to destroy
4195            this device. They should clean all the things.
4196         */
4197         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4198
4199         /*
4200          *      Flush the unicast and multicast chains
4201          */
4202         dev_addr_discard(dev);
4203
4204         /* Actually switch the network namespace */
4205         dev_net_set(dev, net);
4206
4207         /* Assign the new device name */
4208         if (destname != dev->name)
4209                 strcpy(dev->name, destname);
4210
4211         /* If there is an ifindex conflict assign a new one */
4212         if (__dev_get_by_index(net, dev->ifindex)) {
4213                 int iflink = (dev->iflink == dev->ifindex);
4214                 dev->ifindex = dev_new_index(net);
4215                 if (iflink)
4216                         dev->iflink = dev->ifindex;
4217         }
4218
4219         /* Fixup kobjects */
4220         netdev_unregister_kobject(dev);
4221         err = netdev_register_kobject(dev);
4222         WARN_ON(err);
4223
4224         /* Add the device back in the hashes */
4225         list_netdevice(dev);
4226
4227         /* Notify protocols, that a new device appeared. */
4228         call_netdevice_notifiers(NETDEV_REGISTER, dev);
4229
4230         synchronize_net();
4231         err = 0;
4232 out:
4233         return err;
4234 }
4235
4236 static int dev_cpu_callback(struct notifier_block *nfb,
4237                             unsigned long action,
4238                             void *ocpu)
4239 {
4240         struct sk_buff **list_skb;
4241         struct net_device **list_net;
4242         struct sk_buff *skb;
4243         unsigned int cpu, oldcpu = (unsigned long)ocpu;
4244         struct softnet_data *sd, *oldsd;
4245
4246         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4247                 return NOTIFY_OK;
4248
4249         local_irq_disable();
4250         cpu = smp_processor_id();
4251         sd = &per_cpu(softnet_data, cpu);
4252         oldsd = &per_cpu(softnet_data, oldcpu);
4253
4254         /* Find end of our completion_queue. */
4255         list_skb = &sd->completion_queue;
4256         while (*list_skb)
4257                 list_skb = &(*list_skb)->next;
4258         /* Append completion queue from offline CPU. */
4259         *list_skb = oldsd->completion_queue;
4260         oldsd->completion_queue = NULL;
4261
4262         /* Find end of our output_queue. */
4263         list_net = &sd->output_queue;
4264         while (*list_net)
4265                 list_net = &(*list_net)->next_sched;
4266         /* Append output queue from offline CPU. */
4267         *list_net = oldsd->output_queue;
4268         oldsd->output_queue = NULL;
4269
4270         raise_softirq_irqoff(NET_TX_SOFTIRQ);
4271         local_irq_enable();
4272
4273         /* Process offline CPU's input_pkt_queue */
4274         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4275                 netif_rx(skb);
4276
4277         return NOTIFY_OK;
4278 }
4279
4280 #ifdef CONFIG_NET_DMA
4281 /**
4282  * net_dma_rebalance - try to maintain one DMA channel per CPU
4283  * @net_dma: DMA client and associated data (lock, channels, channel_mask)
4284  *
4285  * This is called when the number of channels allocated to the net_dma client
4286  * changes.  The net_dma client tries to have one DMA channel per CPU.
4287  */
4288
4289 static void net_dma_rebalance(struct net_dma *net_dma)
4290 {
4291         unsigned int cpu, i, n, chan_idx;
4292         struct dma_chan *chan;
4293
4294         if (cpus_empty(net_dma->channel_mask)) {
4295                 for_each_online_cpu(cpu)
4296                         rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
4297                 return;
4298         }
4299
4300         i = 0;
4301         cpu = first_cpu(cpu_online_map);
4302
4303         for_each_cpu_mask(chan_idx, net_dma->channel_mask) {
4304                 chan = net_dma->channels[chan_idx];
4305
4306                 n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
4307                    + (i < (num_online_cpus() %
4308                         cpus_weight(net_dma->channel_mask)) ? 1 : 0));
4309
4310                 while(n) {
4311                         per_cpu(softnet_data, cpu).net_dma = chan;
4312                         cpu = next_cpu(cpu, cpu_online_map);
4313                         n--;
4314                 }
4315                 i++;
4316         }
4317 }
4318
4319 /**
4320  * netdev_dma_event - event callback for the net_dma_client
4321  * @client: should always be net_dma_client
4322  * @chan: DMA channel for the event
4323  * @state: DMA state to be handled
4324  */
4325 static enum dma_state_client
4326 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
4327         enum dma_state state)
4328 {
4329         int i, found = 0, pos = -1;
4330         struct net_dma *net_dma =
4331                 container_of(client, struct net_dma, client);
4332         enum dma_state_client ack = DMA_DUP; /* default: take no action */
4333
4334         spin_lock(&net_dma->lock);
4335         switch (state) {
4336         case DMA_RESOURCE_AVAILABLE:
4337                 for (i = 0; i < nr_cpu_ids; i++)
4338                         if (net_dma->channels[i] == chan) {
4339                                 found = 1;
4340                                 break;
4341                         } else if (net_dma->channels[i] == NULL && pos < 0)
4342                                 pos = i;
4343
4344                 if (!found && pos >= 0) {
4345                         ack = DMA_ACK;
4346                         net_dma->channels[pos] = chan;
4347                         cpu_set(pos, net_dma->channel_mask);
4348                         net_dma_rebalance(net_dma);
4349                 }
4350                 break;
4351         case DMA_RESOURCE_REMOVED:
4352                 for (i = 0; i < nr_cpu_ids; i++)
4353                         if (net_dma->channels[i] == chan) {
4354                                 found = 1;
4355                                 pos = i;
4356                                 break;
4357                         }
4358
4359                 if (found) {
4360                         ack = DMA_ACK;
4361                         cpu_clear(pos, net_dma->channel_mask);
4362                         net_dma->channels[i] = NULL;
4363                         net_dma_rebalance(net_dma);
4364                 }
4365                 break;
4366         default:
4367                 break;
4368         }
4369         spin_unlock(&net_dma->lock);
4370
4371         return ack;
4372 }
4373
4374 /**
4375  * netdev_dma_regiser - register the networking subsystem as a DMA client
4376  */
4377 static int __init netdev_dma_register(void)
4378 {
4379         net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct net_dma),
4380                                                                 GFP_KERNEL);
4381         if (unlikely(!net_dma.channels)) {
4382                 printk(KERN_NOTICE
4383                                 "netdev_dma: no memory for net_dma.channels\n");
4384                 return -ENOMEM;
4385         }
4386         spin_lock_init(&net_dma.lock);
4387         dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
4388         dma_async_client_register(&net_dma.client);
4389         dma_async_client_chan_request(&net_dma.client);
4390         return 0;
4391 }
4392
4393 #else
4394 static int __init netdev_dma_register(void) { return -ENODEV; }
4395 #endif /* CONFIG_NET_DMA */
4396
4397 /**
4398  *      netdev_compute_feature - compute conjunction of two feature sets
4399  *      @all: first feature set
4400  *      @one: second feature set
4401  *
4402  *      Computes a new feature set after adding a device with feature set
4403  *      @one to the master device with current feature set @all.  Returns
4404  *      the new feature set.
4405  */
4406 int netdev_compute_features(unsigned long all, unsigned long one)
4407 {
4408         /* if device needs checksumming, downgrade to hw checksumming */
4409         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4410                 all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
4411
4412         /* if device can't do all checksum, downgrade to ipv4/ipv6 */
4413         if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
4414                 all ^= NETIF_F_HW_CSUM
4415                         | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
4416
4417         if (one & NETIF_F_GSO)
4418                 one |= NETIF_F_GSO_SOFTWARE;
4419         one |= NETIF_F_GSO;
4420
4421         /* If even one device supports robust GSO, enable it for all. */
4422         if (one & NETIF_F_GSO_ROBUST)
4423                 all |= NETIF_F_GSO_ROBUST;
4424
4425         all &= one | NETIF_F_LLTX;
4426
4427         if (!(all & NETIF_F_ALL_CSUM))
4428                 all &= ~NETIF_F_SG;
4429         if (!(all & NETIF_F_SG))
4430                 all &= ~NETIF_F_GSO_MASK;
4431
4432         return all;
4433 }
4434 EXPORT_SYMBOL(netdev_compute_features);
4435
4436 static struct hlist_head *netdev_create_hash(void)
4437 {
4438         int i;
4439         struct hlist_head *hash;
4440
4441         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4442         if (hash != NULL)
4443                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
4444                         INIT_HLIST_HEAD(&hash[i]);
4445
4446         return hash;
4447 }
4448
4449 /* Initialize per network namespace state */
4450 static int __net_init netdev_init(struct net *net)
4451 {
4452         INIT_LIST_HEAD(&net->dev_base_head);
4453
4454         net->dev_name_head = netdev_create_hash();
4455         if (net->dev_name_head == NULL)
4456                 goto err_name;
4457
4458         net->dev_index_head = netdev_create_hash();
4459         if (net->dev_index_head == NULL)
4460                 goto err_idx;
4461
4462         return 0;
4463
4464 err_idx:
4465         kfree(net->dev_name_head);
4466 err_name:
4467         return -ENOMEM;
4468 }
4469
4470 static void __net_exit netdev_exit(struct net *net)
4471 {
4472         kfree(net->dev_name_head);
4473         kfree(net->dev_index_head);
4474 }
4475
/*
 * Per-namespace setup/teardown of the device list and hash tables;
 * registered with register_pernet_subsys() from net_dev_init().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};
4480
4481 static void __net_exit default_device_exit(struct net *net)
4482 {
4483         struct net_device *dev, *next;
4484         /*
4485          * Push all migratable of the network devices back to the
4486          * initial network namespace
4487          */
4488         rtnl_lock();
4489         for_each_netdev_safe(net, dev, next) {
4490                 int err;
4491                 char fb_name[IFNAMSIZ];
4492
4493                 /* Ignore unmoveable devices (i.e. loopback) */
4494                 if (dev->features & NETIF_F_NETNS_LOCAL)
4495                         continue;
4496
4497                 /* Push remaing network devices to init_net */
4498                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
4499                 err = dev_change_net_namespace(dev, &init_net, fb_name);
4500                 if (err) {
4501                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
4502                                 __func__, dev->name, err);
4503                         BUG();
4504                 }
4505         }
4506         rtnl_unlock();
4507 }
4508
/*
 * Exit-only per-namespace hook that moves devices back to init_net;
 * registered with register_pernet_device() from net_dev_init().
 */
static struct pernet_operations __net_initdata default_device_ops = {
        .exit = default_device_exit,
};
4512
4513 /*
4514  *      Initialize the DEV module. At boot time this walks the device list and
4515  *      unhooks any devices that fail to initialise (normally hardware not
4516  *      present) and leaves us with a valid list of present and active devices.
4517  *
4518  */
4519
4520 /*
4521  *       This is called single threaded during boot, so no need
4522  *       to take the rtnl semaphore.
4523  */
4524 static int __init net_dev_init(void)
4525 {
4526         int i, rc = -ENOMEM;
4527
4528         BUG_ON(!dev_boot_phase);
4529
4530         if (dev_proc_init())
4531                 goto out;
4532
4533         if (netdev_kobject_init())
4534                 goto out;
4535
4536         INIT_LIST_HEAD(&ptype_all);
4537         for (i = 0; i < PTYPE_HASH_SIZE; i++)
4538                 INIT_LIST_HEAD(&ptype_base[i]);
4539
4540         if (register_pernet_subsys(&netdev_net_ops))
4541                 goto out;
4542
4543         if (register_pernet_device(&default_device_ops))
4544                 goto out;
4545
4546         /*
4547          *      Initialise the packet receive queues.
4548          */
4549
4550         for_each_possible_cpu(i) {
4551                 struct softnet_data *queue;
4552
4553                 queue = &per_cpu(softnet_data, i);
4554                 skb_queue_head_init(&queue->input_pkt_queue);
4555                 queue->completion_queue = NULL;
4556                 INIT_LIST_HEAD(&queue->poll_list);
4557
4558                 queue->backlog.poll = process_backlog;
4559                 queue->backlog.weight = weight_p;
4560         }
4561
4562         netdev_dma_register();
4563
4564         dev_boot_phase = 0;
4565
4566         open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
4567         open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
4568
4569         hotcpu_notifier(dev_cpu_callback, 0);
4570         dst_init();
4571         dev_mcast_init();
4572         rc = 0;
4573 out:
4574         return rc;
4575 }
4576
4577 subsys_initcall(net_dev_init);
4578
/* Symbols from this file exported with EXPORT_SYMBOL. */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(dev_valid_name);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(dev_get_flags);

/* Bridge hooks are exported only when bridging is built in or as a module. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
#endif

/* dev_load is exported only when CONFIG_KMOD is set. */
#ifdef CONFIG_KMOD
EXPORT_SYMBOL(dev_load);
#endif

/* The per-CPU softnet_data is exported as a per-CPU symbol. */
EXPORT_PER_CPU_SYMBOL(softnet_data);