net/sched/sch_generic.c

   1 /*
   2  * net/sched/sch_generic.c      Generic packet scheduler routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
  11  *              - Ingress support
  12  */
  13
  14 #include <linux/bitops.h>
  15 #include <linux/module.h>
  16 #include <linux/types.h>
  17 #include <linux/kernel.h>
  18 #include <linux/sched.h>
  19 #include <linux/string.h>
  20 #include <linux/errno.h>
  21 #include <linux/netdevice.h>
  22 #include <linux/skbuff.h>
  23 #include <linux/rtnetlink.h>
  24 #include <linux/init.h>
  25 #include <linux/rcupdate.h>
  26 #include <linux/list.h>
  27 #include <net/pkt_sched.h>
  28
  29 /* Main transmission queue. */
  30
  31 /* Modifications to data participating in scheduling must be protected with
  32  * dev->queue_lock spinlock.
  33  *
  34  * The idea is the following:
  35  * - enqueue, dequeue are serialized via top level device
  36  *   spinlock dev->queue_lock.
  37  * - ingress filtering is serialized via top level device
  38  *   spinlock dev->ingress_lock.
  39  * - updates to tree and tree walking are only done under the rtnl mutex.
  40  */
  41
  42 void qdisc_lock_tree(struct net_device *dev)
  43         __acquires(dev->queue_lock)
  44         __acquires(dev->ingress_lock)
  45 {
  46         spin_lock_bh(&dev->queue_lock);
  47         spin_lock(&dev->ingress_lock);
  48 }
  49
  50 void qdisc_unlock_tree(struct net_device *dev)
  51         __releases(dev->ingress_lock)
  52         __releases(dev->queue_lock)
  53 {
  54         spin_unlock(&dev->ingress_lock);
  55         spin_unlock_bh(&dev->queue_lock);
  56 }
  57
  58 static inline int qdisc_qlen(struct Qdisc *q)
  59 {
  60         return q->q.qlen;
  61 }
  62
  63 static inline int dev_requeue_skb(struct sk_buff *skb, struct net_device *dev,
  64                                   struct Qdisc *q)
  65 {
  66         if (unlikely(skb->next))
  67                 dev->gso_skb = skb;
  68         else
  69                 q->ops->requeue(skb, q);
  70
  71         netif_schedule(dev);
  72         return 0;
  73 }
  74
  75 static inline struct sk_buff *dev_dequeue_skb(struct net_device *dev,
  76                                               struct Qdisc *q)
  77 {
  78         struct sk_buff *skb;
  79
  80         if ((skb = dev->gso_skb))
  81                 dev->gso_skb = NULL;
  82         else
  83                 skb = q->dequeue(q);
  84
  85         return skb;
  86 }
  87
  88 static inline int handle_dev_cpu_collision(struct sk_buff *skb,
  89                                            struct net_device *dev,
  90                                            struct Qdisc *q)
  91 {
  92         int ret;
  93
  94         if (unlikely(dev->xmit_lock_owner == smp_processor_id())) {
  95                 /*
  96                  * Same CPU holding the lock. It may be a transient
  97                  * configuration error, when hard_start_xmit() recurses. We
  98                  * detect it by checking xmit owner and drop the packet when
  99                  * deadloop is detected. Return OK to try the next skb.
 100                  */
 101                 kfree_skb(skb);
 102                 if (net_ratelimit())
 103                         printk(KERN_WARNING "Dead loop on netdevice %s, "
 104                                "fix it urgently!\n", dev->name);
 105                 ret = qdisc_qlen(q);
 106         } else {
 107                 /*
 108                  * Another cpu is holding lock, requeue & delay xmits for
 109                  * some time.
 110                  */
 111                 __get_cpu_var(netdev_rx_stat).cpu_collision++;
 112                 ret = dev_requeue_skb(skb, dev, q);
 113         }
 114
 115         return ret;
 116 }
 117
 118 /*
 119  * NOTE: Called under dev->queue_lock with locally disabled BH.
 120  *
 121  * __LINK_STATE_QDISC_RUNNING guarantees only one CPU can process this
 122  * device at a time. dev->queue_lock serializes queue accesses for
 123  * this device AND dev->qdisc pointer itself.
 124  *
 125  *  netif_tx_lock serializes accesses to device driver.
 126  *
 127  *  dev->queue_lock and netif_tx_lock are mutually exclusive,
 128  *  if one is grabbed, another must be free.
 129  *
 130  * Note, that this procedure can be called by a watchdog timer
 131  *
 132  * Returns to the caller:
 133  *                              0  - queue is empty or throttled.
 134  *                              >0 - queue is not empty.
 135  *
 136  */
 137 static inline int qdisc_restart(struct net_device *dev)
 138 {
 139         struct Qdisc *q = dev->qdisc;
 140         struct sk_buff *skb;
 141         int ret = NETDEV_TX_BUSY;
 142
 143         /* Dequeue packet */
 144         if (unlikely((skb = dev_dequeue_skb(dev, q)) == NULL))
 145                 return 0;
 146
 147
 148         /* And release queue */
 149         spin_unlock(&dev->queue_lock);
 150
 151         HARD_TX_LOCK(dev, smp_processor_id());
 152         if (!netif_subqueue_stopped(dev, skb))
 153                 ret = dev_hard_start_xmit(skb, dev);
 154         HARD_TX_UNLOCK(dev);
 155
 156         spin_lock(&dev->queue_lock);
 157         q = dev->qdisc;
 158
 159         switch (ret) {
 160         case NETDEV_TX_OK:
 161                 /* Driver sent out skb successfully */
 162                 ret = qdisc_qlen(q);
 163                 break;
 164
 165         case NETDEV_TX_LOCKED:
 166                 /* Driver try lock failed */
 167                 ret = handle_dev_cpu_collision(skb, dev, q);
 168                 break;
 169
 170         default:
 171                 /* Driver returned NETDEV_TX_BUSY - requeue skb */
 172                 if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
 173                         printk(KERN_WARNING "BUG %s code %d qlen %d\n",
 174                                dev->name, ret, q->q.qlen);
 175
 176                 ret = dev_requeue_skb(skb, dev, q);
 177                 break;
 178         }
 179
 180         return ret;
 181 }
 182
 183 void __qdisc_run(struct net_device *dev)
 184 {
 185         do {
 186                 if (!qdisc_restart(dev))
 187                         break;
 188         } while (!netif_queue_stopped(dev));
 189
 190         clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
 191 }
 192
 193 static void dev_watchdog(unsigned long arg)
 194 {
 195         struct net_device *dev = (struct net_device *)arg;
 196
 197         netif_tx_lock(dev);
 198         if (dev->qdisc != &noop_qdisc) {
 199                 if (netif_device_present(dev) &&
 200                     netif_running(dev) &&
 201                     netif_carrier_ok(dev)) {
 202                         if (netif_queue_stopped(dev) &&
 203                             time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
 204
 205                                 printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
 206                                        dev->name);
 207                                 dev->tx_timeout(dev);
 208                         }
 209                         if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
 210                                 dev_hold(dev);
 211                 }
 212         }
 213         netif_tx_unlock(dev);
 214
 215         dev_put(dev);
 216 }
 217
 218 void __netdev_watchdog_up(struct net_device *dev)
 219 {
 220         if (dev->tx_timeout) {
 221                 if (dev->watchdog_timeo <= 0)
 222                         dev->watchdog_timeo = 5*HZ;
 223                 if (!mod_timer(&dev->watchdog_timer,
 224                                round_jiffies(jiffies + dev->watchdog_timeo)))
 225                         dev_hold(dev);
 226         }
 227 }
 228
 229 static void dev_watchdog_up(struct net_device *dev)
 230 {
 231         __netdev_watchdog_up(dev);
 232 }
 233
 234 static void dev_watchdog_down(struct net_device *dev)
 235 {
 236         netif_tx_lock_bh(dev);
 237         if (del_timer(&dev->watchdog_timer))
 238                 dev_put(dev);
 239         netif_tx_unlock_bh(dev);
 240 }
 241
 242 /**
 243  *      netif_carrier_on - set carrier
 244  *      @dev: network device
 245  *
 246  * Device has detected that carrier.
 247  */
 248 void netif_carrier_on(struct net_device *dev)
 249 {
 250         if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
 251                 linkwatch_fire_event(dev);
 252                 if (netif_running(dev))
 253                         __netdev_watchdog_up(dev);
 254         }
 255 }
 256
 257 /**
 258  *      netif_carrier_off - clear carrier
 259  *      @dev: network device
 260  *
 261  * Device has detected loss of carrier.
 262  */
 263 void netif_carrier_off(struct net_device *dev)
 264 {
 265         if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
 266                 linkwatch_fire_event(dev);
 267 }
 268
 269 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
 270    under all circumstances. It is difficult to invent anything faster or
 271    cheaper.
 272  */
 273
 274 static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
 275 {
 276         kfree_skb(skb);
 277         return NET_XMIT_CN;
 278 }
 279
 280 static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
 281 {
 282         return NULL;
 283 }
 284
 285 static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
 286 {
 287         if (net_ratelimit())
 288                 printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
 289                        skb->dev->name);
 290         kfree_skb(skb);
 291         return NET_XMIT_CN;
 292 }
 293
 294 struct Qdisc_ops noop_qdisc_ops __read_mostly = {
 295         .id             =       "noop",
 296         .priv_size      =       0,
 297         .enqueue        =       noop_enqueue,
 298         .dequeue        =       noop_dequeue,
 299         .requeue        =       noop_requeue,
 300         .owner          =       THIS_MODULE,
 301 };
 302
 303 struct Qdisc noop_qdisc = {
 304         .enqueue        =       noop_enqueue,
 305         .dequeue        =       noop_dequeue,
 306         .flags          =       TCQ_F_BUILTIN,
 307         .ops            =       &noop_qdisc_ops,
 308         .list           =       LIST_HEAD_INIT(noop_qdisc.list),
 309 };
 310
 311 static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
 312         .id             =       "noqueue",
 313         .priv_size      =       0,
 314         .enqueue        =       noop_enqueue,
 315         .dequeue        =       noop_dequeue,
 316         .requeue        =       noop_requeue,
 317         .owner          =       THIS_MODULE,
 318 };
 319
 320 static struct Qdisc noqueue_qdisc = {
 321         .enqueue        =       NULL,
 322         .dequeue        =       noop_dequeue,
 323         .flags          =       TCQ_F_BUILTIN,
 324         .ops            =       &noqueue_qdisc_ops,
 325         .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
 326 };
 327
 328
 329 static const u8 prio2band[TC_PRIO_MAX+1] =
 330         { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
 331
 332 /* 3-band FIFO queue: old style, but should be a bit faster than
 333    generic prio+fifo combination.
 334  */
 335
 336 #define PFIFO_FAST_BANDS 3
 337
 338 static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
 339                                              struct Qdisc *qdisc)
 340 {
 341         struct sk_buff_head *list = qdisc_priv(qdisc);
 342         return list + prio2band[skb->priority & TC_PRIO_MAX];
 343 }
 344
 345 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
 346 {
 347         struct sk_buff_head *list = prio2list(skb, qdisc);
 348
 349         if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
 350                 qdisc->q.qlen++;
 351                 return __qdisc_enqueue_tail(skb, qdisc, list);
 352         }
 353
 354         return qdisc_drop(skb, qdisc);
 355 }
 356
 357 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
 358 {
 359         int prio;
 360         struct sk_buff_head *list = qdisc_priv(qdisc);
 361
 362         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
 363                 if (!skb_queue_empty(list + prio)) {
 364                         qdisc->q.qlen--;
 365                         return __qdisc_dequeue_head(qdisc, list + prio);
 366                 }
 367         }
 368
 369         return NULL;
 370 }
 371
 372 static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
 373 {
 374         qdisc->q.qlen++;
 375         return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
 376 }
 377
 378 static void pfifo_fast_reset(struct Qdisc* qdisc)
 379 {
 380         int prio;
 381         struct sk_buff_head *list = qdisc_priv(qdisc);
 382
 383         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
 384                 __qdisc_reset_queue(qdisc, list + prio);
 385
 386         qdisc->qstats.backlog = 0;
 387         qdisc->q.qlen = 0;
 388 }
 389
 390 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
 391 {
 392         struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
 393
 394         memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
 395         RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
 396         return skb->len;
 397
 398 rtattr_failure:
 399         return -1;
 400 }
 401
 402 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
 403 {
 404         int prio;
 405         struct sk_buff_head *list = qdisc_priv(qdisc);
 406
 407         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
 408                 skb_queue_head_init(list + prio);
 409
 410         return 0;
 411 }
 412
 413 static struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 414         .id             =       "pfifo_fast",
 415         .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
 416         .enqueue        =       pfifo_fast_enqueue,
 417         .dequeue        =       pfifo_fast_dequeue,
 418         .requeue        =       pfifo_fast_requeue,
 419         .init           =       pfifo_fast_init,
 420         .reset          =       pfifo_fast_reset,
 421         .dump           =       pfifo_fast_dump,
 422         .owner          =       THIS_MODULE,
 423 };
 424
 425 struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
 426 {
 427         void *p;
 428         struct Qdisc *sch;
 429         unsigned int size;
 430         int err = -ENOBUFS;
 431
 432         /* ensure that the Qdisc and the private data are 32-byte aligned */
 433         size = QDISC_ALIGN(sizeof(*sch));
 434         size += ops->priv_size + (QDISC_ALIGNTO - 1);
 435
 436         p = kzalloc(size, GFP_KERNEL);
 437         if (!p)
 438                 goto errout;
 439         sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
 440         sch->padded = (char *) sch - (char *) p;
 441
 442         INIT_LIST_HEAD(&sch->list);
 443         skb_queue_head_init(&sch->q);
 444         sch->ops = ops;
 445         sch->enqueue = ops->enqueue;
 446         sch->dequeue = ops->dequeue;
 447         sch->dev = dev;
 448         dev_hold(dev);
 449         atomic_set(&sch->refcnt, 1);
 450
 451         return sch;
 452 errout:
 453         return ERR_PTR(-err);
 454 }
 455
 456 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
 457                                  unsigned int parentid)
 458 {
 459         struct Qdisc *sch;
 460
 461         sch = qdisc_alloc(dev, ops);
 462         if (IS_ERR(sch))
 463                 goto errout;
 464         sch->stats_lock = &dev->queue_lock;
 465         sch->parent = parentid;
 466
 467         if (!ops->init || ops->init(sch, NULL) == 0)
 468                 return sch;
 469
 470         qdisc_destroy(sch);
 471 errout:
 472         return NULL;
 473 }
 474
 475 /* Under dev->queue_lock and BH! */
 476
 477 void qdisc_reset(struct Qdisc *qdisc)
 478 {
 479         const struct Qdisc_ops *ops = qdisc->ops;
 480
 481         if (ops->reset)
 482                 ops->reset(qdisc);
 483 }
 484
 485 /* this is the rcu callback function to clean up a qdisc when there
 486  * are no further references to it */
 487
 488 static void __qdisc_destroy(struct rcu_head *head)
 489 {
 490         struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
 491         kfree((char *) qdisc - qdisc->padded);
 492 }
 493
 494 /* Under dev->queue_lock and BH! */
 495
 496 void qdisc_destroy(struct Qdisc *qdisc)
 497 {
 498         const struct Qdisc_ops  *ops = qdisc->ops;
 499
 500         if (qdisc->flags & TCQ_F_BUILTIN ||
 501             !atomic_dec_and_test(&qdisc->refcnt))
 502                 return;
 503
 504         list_del(&qdisc->list);
 505         gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
 506         if (ops->reset)
 507                 ops->reset(qdisc);
 508         if (ops->destroy)
 509                 ops->destroy(qdisc);
 510
 511         module_put(ops->owner);
 512         dev_put(qdisc->dev);
 513         call_rcu(&qdisc->q_rcu, __qdisc_destroy);
 514 }
 515
 516 void dev_activate(struct net_device *dev)
 517 {
 518         /* No queueing discipline is attached to device;
 519            create default one i.e. pfifo_fast for devices,
 520            which need queueing and noqueue_qdisc for
 521            virtual interfaces
 522          */
 523
 524         if (dev->qdisc_sleeping == &noop_qdisc) {
 525                 struct Qdisc *qdisc;
 526                 if (dev->tx_queue_len) {
 527                         qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
 528                                                   TC_H_ROOT);
 529                         if (qdisc == NULL) {
 530                                 printk(KERN_INFO "%s: activation failed\n", dev->name);
 531                                 return;
 532                         }
 533                         list_add_tail(&qdisc->list, &dev->qdisc_list);
 534                 } else {
 535                         qdisc =  &noqueue_qdisc;
 536                 }
 537                 dev->qdisc_sleeping = qdisc;
 538         }
 539
 540         if (!netif_carrier_ok(dev))
 541                 /* Delay activation until next carrier-on event */
 542                 return;
 543
 544         spin_lock_bh(&dev->queue_lock);
 545         rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
 546         if (dev->qdisc != &noqueue_qdisc) {
 547                 dev->trans_start = jiffies;
 548                 dev_watchdog_up(dev);
 549         }
 550         spin_unlock_bh(&dev->queue_lock);
 551 }
 552
 553 void dev_deactivate(struct net_device *dev)
 554 {
 555         struct Qdisc *qdisc;
 556         struct sk_buff *skb;
 557         int running;
 558
 559         spin_lock_bh(&dev->queue_lock);
 560         qdisc = dev->qdisc;
 561         dev->qdisc = &noop_qdisc;
 562
 563         qdisc_reset(qdisc);
 564
 565         skb = dev->gso_skb;
 566         dev->gso_skb = NULL;
 567         spin_unlock_bh(&dev->queue_lock);
 568
 569         kfree_skb(skb);
 570
 571         dev_watchdog_down(dev);
 572
 573         /* Wait for outstanding qdisc-less dev_queue_xmit calls. */
 574         synchronize_rcu();
 575
 576         /* Wait for outstanding qdisc_run calls. */
 577         do {
 578                 while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 579                         yield();
 580
 581                 /*
 582                  * Double-check inside queue lock to ensure that all effects
 583                  * of the queue run are visible when we return.
 584                  */
 585                 spin_lock_bh(&dev->queue_lock);
 586                 running = test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
 587                 spin_unlock_bh(&dev->queue_lock);
 588
 589                 /*
 590                  * The running flag should never be set at this point because
 591                  * we've already set dev->qdisc to noop_qdisc *inside* the same
 592                  * pair of spin locks.  That is, if any qdisc_run starts after
 593                  * our initial test it should see the noop_qdisc and then
 594                  * clear the RUNNING bit before dropping the queue lock.  So
 595                  * if it is set here then we've found a bug.
 596                  */
 597         } while (WARN_ON_ONCE(running));
 598 }
 599
 600 void dev_init_scheduler(struct net_device *dev)
 601 {
 602         qdisc_lock_tree(dev);
 603         dev->qdisc = &noop_qdisc;
 604         dev->qdisc_sleeping = &noop_qdisc;
 605         INIT_LIST_HEAD(&dev->qdisc_list);
 606         qdisc_unlock_tree(dev);
 607
 608         setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
 609 }
 610
 611 void dev_shutdown(struct net_device *dev)
 612 {
 613         struct Qdisc *qdisc;
 614
 615         qdisc_lock_tree(dev);
 616         qdisc = dev->qdisc_sleeping;
 617         dev->qdisc = &noop_qdisc;
 618         dev->qdisc_sleeping = &noop_qdisc;
 619         qdisc_destroy(qdisc);
 620 #if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
 621         if ((qdisc = dev->qdisc_ingress) != NULL) {
 622                 dev->qdisc_ingress = NULL;
 623                 qdisc_destroy(qdisc);
 624         }
 625 #endif
 626         BUG_TRAP(!timer_pending(&dev->watchdog_timer));
 627         qdisc_unlock_tree(dev);
 628 }
 629
 630 EXPORT_SYMBOL(netif_carrier_on);
 631 EXPORT_SYMBOL(netif_carrier_off);
 632 EXPORT_SYMBOL(noop_qdisc);
 633 EXPORT_SYMBOL(qdisc_create_dflt);
 634 EXPORT_SYMBOL(qdisc_destroy);
 635 EXPORT_SYMBOL(qdisc_reset);
 636 EXPORT_SYMBOL(qdisc_lock_tree);
 637 EXPORT_SYMBOL(qdisc_unlock_tree);