/*
 * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Martin Devera, <devik@cdi.cz>
 *
 * Credits (in time order) for older HTB versions:
 *		Stef Coene <stef.coene@docum.org>
 *			HTB support at LARTC mailing list
 *		Ondrej Kraus, <krauso@barr.cz>
 *			found missing INIT_QDISC(htb)
 *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
 *			helped a lot to locate nasty class stall bug
 *		Andi Kleen, Jamal Hadi, Bert Hubert
 *			code review and helpful comments on shaping
 *		Tomasz Wrona, <tw@eter.tym.pl>
 *			created test case so that I was able to fix nasty bug
 *			spotted bug in dequeue code and helped with fix
 *			fixed requeue routine
 *		and many others. thanks.
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
/* HTB algorithm.
 ========================================================================
 HTB is like TBF with multiple classes. It is also similar to CBQ because
 it allows one to assign a priority to each class in the hierarchy.
 In fact it is another implementation of Floyd's formal sharing.

 Levels:
 Each class is assigned a level. Leaves ALWAYS have level 0 and root
 classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
 one less than their parent.
*/
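/* Illustration (assuming the usual TC_HTB_MAXDEPTH == 8; classids below are
   made up for the example): a root class gets level 7, a child of the root
   that itself has children gets level 6, and any childless class is a leaf
   at level 0:

	1:1   root  (level 7)
	 +-- 1:10   inner (level 6)
	      +-- 1:100  leaf (level 0)
 */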
static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
#define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */

#if HTB_VER >> 16 != TC_HTB_PROTOVER
#error "Mismatched sch_htb.c and pkt_sch.h"
#endif

/* Module parameter and sysfs export */
module_param(htb_hysteresis, int, 0640);
MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
/* used internally to keep the status of a single class */
enum htb_cmode {
	HTB_CANT_SEND,		/* class can't send and can't borrow */
	HTB_MAY_BORROW,		/* class can't send but may borrow */
	HTB_CAN_SEND		/* class can send */
};
/* interior & leaf nodes; props specific to leaves are marked L: */
struct htb_class {
	struct Qdisc_class_common common;
	/* general class parameters */
	struct gnet_stats_basic bstats;
	struct gnet_stats_queue qstats;
	struct gnet_stats_rate_est rate_est;
	struct tc_htb_xstats xstats;	/* our special stats */
	int refcnt;		/* usage count of this class */

	/* topology */
	int level;		/* our level (see above) */
	unsigned int children;
	struct htb_class *parent;	/* parent class */

	union {
		struct htb_class_leaf {
			struct Qdisc *q;
			int prio;	/* L: configured priority */
			int aprio;	/* L: prio snapshot while active */
			int quantum;	/* L: DRR quantum in bytes */
			int deficit[TC_HTB_MAXDEPTH];
			struct list_head drop_list;
		} leaf;
		struct htb_class_inner {
			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
			/* When class changes from state 1->2 and disconnects from
			   parent's feed then we lose the ptr value and start from
			   the first child again. Here we store the classid of the
			   last valid ptr (used when ptr is NULL). */
			u32 last_ptr_id[TC_HTB_NUMPRIO];
		} inner;
	} un;
	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
	struct rb_node pq_node;	/* node for event queue */
	psched_time_t pq_key;

	int prio_activity;	/* for which prios are we active */
	enum htb_cmode cmode;	/* current mode of the class */

	/* class attached filters */
	struct tcf_proto *filter_list;
	int filter_cnt;

	int warned;		/* only one warning about non-work-conserving class */

	/* token bucket parameters */
	struct qdisc_rate_table *rate;	/* rate table of the class itself */
	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
	long buffer, cbuffer;	/* token bucket depth/rate */
	psched_tdiff_t mbuffer;	/* max wait time */
	long tokens, ctokens;	/* current number of tokens */
	psched_time_t t_c;	/* checkpoint time */

	int prio;		/* backup of un.leaf.prio, for parent-to-leaf return */
	int quantum;		/* backup of un.leaf.quantum; eventually a full
				   replacement of the un.leaf originals should be done */
};
static inline long L2T(struct htb_class *cl, struct qdisc_rate_table *rate,
		       int size)
{
	long result = qdisc_l2t(rate, size);
	return result;
}
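/* Rough worked example (not taken from a real rate table): at a configured
   rate of 1 Mbit/s, a 1500 byte packet needs 1500 * 8 / 1000000 = 12 ms of
   link time, so L2T() would return roughly 12 ms worth of psched ticks. */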
struct htb_sched {
	struct Qdisc_class_hash clhash;
	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */

	/* self list - roots of self generating tree */
	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
	int row_mask[TC_HTB_MAXDEPTH];
	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];

	/* self wait list - roots of wait PQs per row */
	struct rb_root wait_pq[TC_HTB_MAXDEPTH];

	/* time of nearest event per level (row) */
	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];

	/* whether we hit a non-work-conserving class during this dequeue;
	   used to disable the mindelay complaint in dequeue */
	int nwc_hit;

	int defcls;		/* class where unclassified flows go to */

	/* filters for qdisc itself */
	struct tcf_proto *filter_list;
	int filter_cnt;

	int rate2quantum;	/* quant = rate / rate2quantum */
	psched_time_t now;	/* cached dequeue time */
	struct qdisc_watchdog watchdog;

	/* non-shaped skbs; let them go directly thru */
	struct sk_buff_head direct_queue;
	int direct_qlen;	/* max qlen of above */

	long direct_pkts;
};
/* find class in global hash table using given handle */
static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, handle);
	if (clc == NULL)
		return NULL;
	return container_of(clc, struct htb_class, common);
}
/**
 * htb_classify - classify a packet into class
 *
 * It returns NULL if the packet should be dropped or -1 if the packet
 * should be passed directly thru. In all other cases leaf class is returned.
 * We allow direct class selection by classid in skb->priority. Then we
 * examine filters in qdisc and in inner nodes (if a higher filter points to
 * the inner node). If we end up with classid MAJOR:0 we enqueue the skb
 * into special internal fifo (direct). These packets then go directly thru.
 * If we still have no valid leaf we try to use the MAJOR:default leaf. If
 * that is still unsuccessful we finish and return the direct queue.
 */
#define HTB_DIRECT ((struct htb_class *)-1)
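/* Illustrative usage (hypothetical setup): with "tc qdisc add dev eth0 root
   handle 1: htb", sch->handle is the classid 1:0, so a packet whose
   skb->priority is set to 1:0 (or that a filter maps to MAJOR:0) takes the
   direct path and bypasses shaping. */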
static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
				      int *qerr)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl;
	struct tcf_result res;
	struct tcf_proto *tcf;
	int result;

	/* allow selecting the class by setting skb->priority to a valid
	   classid; note that nfmark can be used too by attaching filter fw
	   with no rules in it */
	if (skb->priority == sch->handle)
		return HTB_DIRECT;	/* X:0 (direct flow) selected */
	if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
		return cl;

	*qerr = NET_XMIT_BYPASS;
	tcf = q->filter_list;
	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
			*qerr = NET_XMIT_SUCCESS;
		case TC_ACT_SHOT:
			return NULL;
		}
#endif
		if ((cl = (void *)res.class) == NULL) {
			if (res.classid == sch->handle)
				return HTB_DIRECT;	/* X:0 (direct flow) */
			if ((cl = htb_find(res.classid, sch)) == NULL)
				break;	/* filter selected invalid classid */
		}
		if (!cl->level)
			return cl;	/* we hit leaf; return it */

		/* we have got inner class; apply inner filter chain */
		tcf = cl->filter_list;
	}
	/* classification failed; try to use default class */
	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
	if (!cl || cl->level)
		return HTB_DIRECT;	/* bad default .. this is safe bet */
	return cl;
}
/**
 * htb_add_to_id_tree - adds class to the round robin list
 *
 * Routine adds class to the list (actually tree) sorted by classid.
 * Make sure that class is not already on such list for given prio.
 */
static void htb_add_to_id_tree(struct rb_root *root,
			       struct htb_class *cl, int prio)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;

	while (*p) {
		struct htb_class *c;
		parent = *p;
		c = rb_entry(parent, struct htb_class, node[prio]);

		if (cl->common.classid > c->common.classid)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&cl->node[prio], parent, p);
	rb_insert_color(&cl->node[prio], root);
}
/**
 * htb_add_to_wait_tree - adds class to the event queue with delay
 *
 * The class is added to the priority event queue to indicate that the class
 * will change its mode in cl->pq_key microseconds. Make sure that the class
 * is not already in the queue.
 */
static void htb_add_to_wait_tree(struct htb_sched *q,
				 struct htb_class *cl, long delay)
{
	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;

	cl->pq_key = q->now + delay;
	if (cl->pq_key == q->now)
		cl->pq_key++;

	/* update the nearest event cache */
	if (q->near_ev_cache[cl->level] > cl->pq_key)
		q->near_ev_cache[cl->level] = cl->pq_key;

	while (*p) {
		struct htb_class *c;
		parent = *p;
		c = rb_entry(parent, struct htb_class, pq_node);
		if (cl->pq_key >= c->pq_key)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&cl->pq_node, parent, p);
	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
}
/**
 * htb_next_rb_node - finds next node in binary tree
 *
 * When we are past the last key, *n is set to NULL.
 * Average complexity is 2 steps per call (amortized: each tree edge is
 * traversed at most twice during a full in-order walk).
 */
static inline void htb_next_rb_node(struct rb_node **n)
{
	*n = rb_next(*n);
}
/**
 * htb_add_class_to_row - add class to its row
 *
 * The class is added to the row at the priorities marked in mask.
 * It does nothing if mask == 0.
 */
static inline void htb_add_class_to_row(struct htb_sched *q,
					struct htb_class *cl, int mask)
{
	q->row_mask[cl->level] |= mask;
	while (mask) {
		int prio = ffz(~mask);
		mask &= ~(1 << prio);
		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
	}
}
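/* Worked example: mask == 0x5 selects prios 0 and 2. First iteration:
   ffz(~0x5) == 0, mask becomes 0x4, insert at prio 0; second iteration:
   ffz(~0x4) == 2, mask becomes 0, insert at prio 2 and the loop ends. */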
/* If this triggers, it is a bug in this code, but it need not be fatal */
static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
{
	if (RB_EMPTY_NODE(rb)) {
		WARN_ON(1);
	} else {
		rb_erase(rb, root);
		RB_CLEAR_NODE(rb);
	}
}
/**
 * htb_remove_class_from_row - removes class from its row
 *
 * The class is removed from the row at the priorities marked in mask.
 * It does nothing if mask == 0.
 */
static inline void htb_remove_class_from_row(struct htb_sched *q,
					     struct htb_class *cl, int mask)
{
	int m = 0;

	while (mask) {
		int prio = ffz(~mask);

		mask &= ~(1 << prio);
		if (q->ptr[cl->level][prio] == cl->node + prio)
			htb_next_rb_node(q->ptr[cl->level] + prio);

		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
		if (!q->row[cl->level][prio].rb_node)
			m |= 1 << prio;
	}
	q->row_mask[cl->level] &= ~m;
}
/**
 * htb_activate_prios - creates active class's feed chain
 *
 * The class is connected to ancestors and/or appropriate rows
 * for the priorities it is participating in. cl->cmode must be the new
 * (activated) mode. It does nothing if cl->prio_activity == 0.
 */
static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
{
	struct htb_class *p = cl->parent;
	long m, mask = cl->prio_activity;

	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
		m = mask;
		while (m) {
			int prio = ffz(~m);
			m &= ~(1 << prio);

			if (p->un.inner.feed[prio].rb_node)
				/* parent already has its feed in use, so
				   reset the bit in mask as parent is already ok */
				mask &= ~(1 << prio);

			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
		}
		p->prio_activity |= mask;
		cl = p;
		p = cl->parent;
	}
	if (cl->cmode == HTB_CAN_SEND && mask)
		htb_add_class_to_row(q, cl, mask);
}
/**
 * htb_deactivate_prios - remove class from feed chain
 *
 * cl->cmode must represent the old mode (before deactivation). It does
 * nothing if cl->prio_activity == 0. The class is removed from all feed
 * chains and rows.
 */
static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
{
	struct htb_class *p = cl->parent;
	long m, mask = cl->prio_activity;

	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
		m = mask;
		mask = 0;
		while (m) {
			int prio = ffz(~m);
			m &= ~(1 << prio);

			if (p->un.inner.ptr[prio] == cl->node + prio) {
				/* we are removing child which is pointed to from
				   parent feed - forget the pointer but remember
				   classid */
				p->un.inner.last_ptr_id[prio] = cl->common.classid;
				p->un.inner.ptr[prio] = NULL;
			}

			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);

			if (!p->un.inner.feed[prio].rb_node)
				mask |= 1 << prio;
		}

		p->prio_activity &= ~mask;
		cl = p;
		p = cl->parent;
	}
	if (cl->cmode == HTB_CAN_SEND && mask)
		htb_remove_class_from_row(q, cl, mask);
}
static inline long htb_lowater(const struct htb_class *cl)
{
	if (htb_hysteresis)
		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
	else
		return 0;
}

static inline long htb_hiwater(const struct htb_class *cl)
{
	if (htb_hysteresis)
		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
	else
		return 0;
}
/**
 * htb_class_mode - computes and returns current class mode
 *
 * It computes cl's mode at time cl->t_c+diff and returns it. If mode
 * is not HTB_CAN_SEND then cl->pq_key is updated to the time difference
 * from now to the time when cl will change its state.
 * It is also worth noting that the class mode doesn't change simply
 * at cl->{c,}tokens == 0; rather there can be hysteresis over the
 * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
 * mode transitions per time unit. The speed gain is about 1/6.
 */
static inline enum htb_cmode
htb_class_mode(struct htb_class *cl, long *diff)
{
	long toks;

	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
		*diff = -toks;
		return HTB_CANT_SEND;
	}

	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
		return HTB_CAN_SEND;

	*diff = -toks;
	return HTB_MAY_BORROW;
}
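/* Illustration of the hysteresis band (when htb_hysteresis is set): a class
   in HTB_CAN_SEND keeps that mode until tokens fall below -buffer (because
   htb_hiwater() == -buffer), but once it has lost HTB_CAN_SEND it must climb
   back to tokens >= 0 (htb_hiwater() == 0) to regain it. The same asymmetry
   applies to ctokens via htb_lowater(); this is what suppresses rapid mode
   flapping around zero tokens. */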
/**
 * htb_change_class_mode - changes class's mode
 *
 * This should be the only way to change a class's mode under normal
 * circumstances. The routine will update the feed list linkage, change the
 * mode and add the class to the wait event queue if appropriate. The new
 * mode should be different from the old one and cl->pq_key has to be valid
 * if changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
 */
static void
htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
{
	enum htb_cmode new_mode = htb_class_mode(cl, diff);

	if (new_mode == cl->cmode)
		return;

	if (cl->prio_activity) {	/* not necessary: speed optimization */
		if (cl->cmode != HTB_CANT_SEND)
			htb_deactivate_prios(q, cl);
		cl->cmode = new_mode;
		if (new_mode != HTB_CANT_SEND)
			htb_activate_prios(q, cl);
	} else
		cl->cmode = new_mode;
}
/**
 * htb_activate - inserts leaf cl into appropriate active feeds
 *
 * Routine learns (new) priority of leaf and activates feed chain
 * for the prio. It can be called on already active leaf safely.
 * It also adds leaf into droplist.
 */
static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
{
	BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen);

	if (!cl->prio_activity) {
		cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio);
		htb_activate_prios(q, cl);
		list_add_tail(&cl->un.leaf.drop_list,
			      q->drops + cl->un.leaf.aprio);
	}
}
/**
 * htb_deactivate - remove leaf cl from active feeds
 *
 * Make sure that leaf is active. In other words, it can't be called
 * with a non-active leaf. It also removes the class from the drop list.
 */
static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
{
	BUG_TRAP(cl->prio_activity);

	htb_deactivate_prios(q, cl);
	cl->prio_activity = 0;
	list_del_init(&cl->un.leaf.drop_list);
}
static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	int ret;
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = htb_classify(skb, sch, &ret);

	if (cl == HTB_DIRECT) {
		/* enqueue to helper queue */
		if (q->direct_queue.qlen < q->direct_qlen) {
			__skb_queue_tail(&q->direct_queue, skb);
			q->direct_pkts++;
		} else {
			kfree_skb(skb);
			sch->qstats.drops++;
			return NET_XMIT_DROP;
		}
#ifdef CONFIG_NET_CLS_ACT
	} else if (!cl) {
		if (ret == NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
#endif
	} else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) !=
		   NET_XMIT_SUCCESS) {
		sch->qstats.drops++;
		cl->qstats.drops++;
		return NET_XMIT_DROP;
	} else {
		cl->bstats.packets +=
			skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
		cl->bstats.bytes += skb->len;
		htb_activate(q, cl);
	}

	sch->q.qlen++;
	sch->bstats.packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
	sch->bstats.bytes += skb->len;
	return NET_XMIT_SUCCESS;
}
/* TODO: requeuing a packet charges it to policers again !! */
static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	int ret;
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = htb_classify(skb, sch, &ret);
	struct sk_buff *tskb;

	if (cl == HTB_DIRECT) {
		/* enqueue to helper queue */
		if (q->direct_queue.qlen < q->direct_qlen) {
			__skb_queue_head(&q->direct_queue, skb);
		} else {
			__skb_queue_head(&q->direct_queue, skb);
			tskb = __skb_dequeue_tail(&q->direct_queue);
			kfree_skb(tskb);
			sch->qstats.drops++;
			return NET_XMIT_CN;
		}
#ifdef CONFIG_NET_CLS_ACT
	} else if (!cl) {
		if (ret == NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
#endif
	} else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) !=
		   NET_XMIT_SUCCESS) {
		sch->qstats.drops++;
		cl->qstats.drops++;
		return NET_XMIT_DROP;
	} else
		htb_activate(q, cl);

	sch->q.qlen++;
	sch->qstats.requeues++;
	return NET_XMIT_SUCCESS;
}
/**
 * htb_charge_class - charges amount "bytes" to leaf and ancestors
 *
 * Routine assumes that a packet "bytes" long was dequeued from leaf cl
 * borrowing from "level". It accounts bytes to the ceil leaky bucket for
 * the leaf and all ancestors, and to the rate bucket for ancestors at levels
 * "level" and higher. It also handles a possible change of mode resulting
 * from the update. Note that the mode can also increase here (MAY_BORROW to
 * CAN_SEND) because we can use a more precise clock than the event queue.
 * In such a case we remove the class from the event queue first.
 */
static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
			     int level, struct sk_buff *skb)
{
	int bytes = skb->len;
	long toks, diff;
	enum htb_cmode old_mode;

#define HTB_ACCNT(T,B,R) toks = diff + cl->T; \
	if (toks > cl->B) toks = cl->B; \
	toks -= L2T(cl, cl->R, bytes); \
	if (toks <= -cl->mbuffer) toks = 1 - cl->mbuffer; \
	cl->T = toks
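	/* Illustrative trace of HTB_ACCNT(tokens, buffer, rate): tokens first
	   grow by diff (the time since the last checkpoint), are clamped to
	   at most cl->buffer (the bucket depth), then pay the L2T()
	   transmission time of "bytes", and finally are floored just above
	   -cl->mbuffer so a single large burst cannot put the class into
	   unbounded debt. */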
	while (cl) {
		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
		if (cl->level >= level) {
			if (cl->level == level)
				cl->xstats.lends++;
			HTB_ACCNT(tokens, buffer, rate);
		} else {
			cl->xstats.borrows++;
			cl->tokens += diff;	/* we moved t_c; update tokens */
		}
		HTB_ACCNT(ctokens, cbuffer, ceil);
		cl->t_c = q->now;

		old_mode = cl->cmode;
		diff = 0;
		htb_change_class_mode(q, cl, &diff);
		if (old_mode != cl->cmode) {
			if (old_mode != HTB_CAN_SEND)
				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
			if (cl->cmode != HTB_CAN_SEND)
				htb_add_to_wait_tree(q, cl, diff);
		}

		/* update byte stats except for leaves which are already updated */
		if (cl->level) {
			cl->bstats.bytes += bytes;
			cl->bstats.packets += skb_is_gso(skb) ?
					skb_shinfo(skb)->gso_segs : 1;
		}
		cl = cl->parent;
	}
}
/**
 * htb_do_events - make mode changes to classes at the level
 *
 * Scans the event queue for pending events and applies them. Returns the
 * time of the next pending event (0 for no event in pq).
 * Note: only events with cl->pq_key <= q->now are applied.
 */
static psched_time_t htb_do_events(struct htb_sched *q, int level)
{
	/* don't run for longer than 2 jiffies; 2 is used instead of
	   1 to simplify things when jiffy is going to be incremented
	   too soon */
	unsigned long stop_at = jiffies + 2;
	while (time_before(jiffies, stop_at)) {
		struct htb_class *cl;
		long diff;
		struct rb_node *p = rb_first(&q->wait_pq[level]);

		if (!p)
			return 0;

		cl = rb_entry(p, struct htb_class, pq_node);
		if (cl->pq_key > q->now)
			return cl->pq_key;

		htb_safe_rb_erase(p, q->wait_pq + level);
		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
		htb_change_class_mode(q, cl, &diff);
		if (cl->cmode != HTB_CAN_SEND)
			htb_add_to_wait_tree(q, cl, diff);
	}
	/* too much load - let's continue on the next jiffy */
	return q->now + PSCHED_TICKS_PER_SEC / HZ;
}
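/* E.g. with HZ == 1000 the overload fallback above postpones further event
   processing by one jiffy, i.e. roughly 1 ms worth of psched ticks. */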
/* Returns class->node+prio from the id-tree where the class's id is >= id.
   NULL is returned if no such one exists. */
static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
					      u32 id)
{
	struct rb_node *r = NULL;
	while (n) {
		struct htb_class *cl =
		    rb_entry(n, struct htb_class, node[prio]);
		if (id == cl->common.classid)
			return n;

		if (id > cl->common.classid) {
			n = n->rb_right;
		} else {
			r = n;
			n = n->rb_left;
		}
	}
	return r;
}
/**
 * htb_lookup_leaf - returns next leaf class in DRR order
 *
 * Find the leaf where the current feed pointer points to.
 */
static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
					 struct rb_node **pptr, u32 *pid)
{
	int i;
	struct {
		struct rb_node *root;
		struct rb_node **pptr;
		u32 *pid;
	} stk[TC_HTB_MAXDEPTH], *sp = stk;

	BUG_TRAP(tree->rb_node);
	sp->root = tree->rb_node;
	sp->pptr = pptr;
	sp->pid = pid;

	for (i = 0; i < 65535; i++) {
		if (!*sp->pptr && *sp->pid) {
			/* ptr was invalidated but id is valid - try to recover
			   the original or next ptr */
			*sp->pptr =
			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
		}
		*sp->pid = 0;	/* ptr is valid now, so remove this hint as
				   it can become stale quickly */
		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
			*sp->pptr = sp->root;
			while ((*sp->pptr)->rb_left)
				*sp->pptr = (*sp->pptr)->rb_left;
			if (sp > stk) {
				sp--;
				BUG_TRAP(*sp->pptr);
				if (!*sp->pptr)
					return NULL;
				htb_next_rb_node(sp->pptr);
			}
		} else {
			struct htb_class *cl;
			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
			if (!cl->level)
				return cl;
			(++sp)->root = cl->un.inner.feed[prio].rb_node;
			sp->pptr = cl->un.inner.ptr + prio;
			sp->pid = cl->un.inner.last_ptr_id + prio;
		}
	}
	BUG_TRAP(0);
	return NULL;
}
/* dequeues packet at given priority and level; call only if
   you are sure that there is an active class at prio/level */
static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
					int level)
{
	struct sk_buff *skb = NULL;
	struct htb_class *cl, *start;
	/* look initial class up in the row */
	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
				     q->ptr[level] + prio,
				     q->last_ptr_id[level] + prio);

	do {
next:
		BUG_TRAP(cl);
		if (!cl)
			return NULL;

		/* class can be empty - it is unlikely but can be true if leaf
		   qdisc drops packets in enqueue routine or if someone used
		   the graft operation on the leaf since last dequeue;
		   simply deactivate and skip such class */
		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
			struct htb_class *next;
			htb_deactivate(q, cl);

			/* row/level might become empty */
			if ((q->row_mask[level] & (1 << prio)) == 0)
				return NULL;

			next = htb_lookup_leaf(q->row[level] + prio,
					       prio, q->ptr[level] + prio,
					       q->last_ptr_id[level] + prio);

			if (cl == start)	/* fix start if we just deleted it */
				start = next;
			cl = next;
			goto next;
		}

		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
		if (likely(skb != NULL))
			break;
		if (!cl->warned) {
			printk(KERN_WARNING
			       "htb: class %X isn't work conserving ?!\n",
			       cl->common.classid);
			cl->warned = 1;
		}
		q->nwc_hit++;
		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
				  ptr[0]) + prio);
		cl = htb_lookup_leaf(q->row[level] + prio, prio,
				     q->ptr[level] + prio,
				     q->last_ptr_id[level] + prio);

	} while (cl != start);
	if (likely(skb != NULL)) {
		if ((cl->un.leaf.deficit[level] -= skb->len) < 0) {
			cl->un.leaf.deficit[level] += cl->un.leaf.quantum;
			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
					  ptr[0]) + prio);
		}
		/* this used to be after charge_class but this constellation
		   gives us slightly better performance */
		if (!cl->un.leaf.q->q.qlen)
			htb_deactivate(q, cl);
		htb_charge_class(q, cl, level, skb);
	}
	return skb;
}
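/* Illustrative DRR step: with quantum == 1500, deficit == 500 and a
   1000 byte packet just dequeued, the deficit drops to -500; being negative,
   it is replenished by one quantum (to 1000) and the round robin pointer
   advances so the next class at this prio gets its turn. */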
static struct sk_buff *htb_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb = NULL;
	struct htb_sched *q = qdisc_priv(sch);
	int level;
	psched_time_t next_event;

	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
	skb = __skb_dequeue(&q->direct_queue);
	if (skb != NULL) {
		sch->flags &= ~TCQ_F_THROTTLED;
		sch->q.qlen--;
		return skb;
	}

	if (!sch->q.qlen)
		goto fin;
	q->now = psched_get_time();

	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
	q->nwc_hit = 0;
	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
		/* common case optimization - skip event handler quickly */
		int m;
		psched_time_t event;

		if (q->now >= q->near_ev_cache[level]) {
			event = htb_do_events(q, level);
			if (!event)
				event = q->now + PSCHED_TICKS_PER_SEC;
			q->near_ev_cache[level] = event;
		} else
			event = q->near_ev_cache[level];

		if (event && next_event > event)
			next_event = event;

		m = ~q->row_mask[level];
		while (m != (int)(-1)) {
			int prio = ffz(m);
			m |= 1 << prio;
			skb = htb_dequeue_tree(q, prio, level);
			if (likely(skb != NULL)) {
				sch->q.qlen--;
				sch->flags &= ~TCQ_F_THROTTLED;
				goto fin;
			}
		}
	}
	sch->qstats.overlimits++;
	qdisc_watchdog_schedule(&q->watchdog, next_event);
fin:
	return skb;
}
/* try to drop from each class (by prio) until one succeeds */
static unsigned int htb_drop(struct Qdisc *sch)
{
	struct htb_sched *q = qdisc_priv(sch);
	int prio;

	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
		struct list_head *p;
		list_for_each(p, q->drops + prio) {
			struct htb_class *cl = list_entry(p, struct htb_class,
							  un.leaf.drop_list);
			unsigned int len;
			if (cl->un.leaf.q->ops->drop &&
			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
				sch->q.qlen--;
				if (!cl->un.leaf.q->q.qlen)
					htb_deactivate(q, cl);
				return len;
			}
		}
	}
	return 0;
}
/* reset all classes */
/* always called under BH & queue lock */
static void htb_reset(struct Qdisc *sch)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl;
	struct hlist_node *n;
	unsigned int i;

	for (i = 0; i < q->clhash.hashsize; i++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
			if (cl->level)
				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
			else {
				if (cl->un.leaf.q)
					qdisc_reset(cl->un.leaf.q);
				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
			}
			cl->prio_activity = 0;
			cl->cmode = HTB_CAN_SEND;
		}
	}
	qdisc_watchdog_cancel(&q->watchdog);
	__skb_queue_purge(&q->direct_queue);
	sch->q.qlen = 0;
	memset(q->row, 0, sizeof(q->row));
	memset(q->row_mask, 0, sizeof(q->row_mask));
	memset(q->wait_pq, 0, sizeof(q->wait_pq));
	memset(q->ptr, 0, sizeof(q->ptr));
	for (i = 0; i < TC_HTB_NUMPRIO; i++)
		INIT_LIST_HEAD(q->drops + i);
}
static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
	[TCA_HTB_PARMS] = { .len = sizeof(struct tc_htb_opt) },
	[TCA_HTB_INIT]  = { .len = sizeof(struct tc_htb_glob) },
	[TCA_HTB_CTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_HTB_RTAB]  = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
};
static int htb_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_HTB_INIT + 1];
	struct tc_htb_glob *gopt;
	int err;
	int i;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
	if (err < 0)
		return err;

	if (tb[TCA_HTB_INIT] == NULL) {
		printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
		return -EINVAL;
	}
	gopt = nla_data(tb[TCA_HTB_INIT]);
	if (gopt->version != HTB_VER >> 16) {
		printk(KERN_ERR
		       "HTB: need tc/htb version %d (minor is %d), you have %d\n",
		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
		return -EINVAL;
	}

	err = qdisc_class_hash_init(&q->clhash);
	if (err < 0)
		return err;
	for (i = 0; i < TC_HTB_NUMPRIO; i++)
		INIT_LIST_HEAD(q->drops + i);

	qdisc_watchdog_init(&q->watchdog, sch);
	skb_queue_head_init(&q->direct_queue);

	q->direct_qlen = sch->dev->tx_queue_len;
	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
		q->direct_qlen = 2;

	if ((q->rate2quantum = gopt->rate2quantum) < 1)
		q->rate2quantum = 1;
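	/* Illustrative default: with rate2quantum == 10 (tc's default r2q),
	   a class configured for 125000 B/s (1 Mbit/s) will get quantum
	   125000 / 10 = 12500 bytes per DRR round (see htb_change_class). */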
	q->defcls = gopt->defcls;

	return 0;
}
static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_htb_glob gopt;

	spin_lock_bh(&sch->dev->queue_lock);

	gopt.direct_pkts = q->direct_pkts;
	gopt.version = HTB_VER;
	gopt.rate2quantum = q->rate2quantum;
	gopt.defcls = q->defcls;
	gopt.debug = 0;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
	nla_nest_end(skb, nest);

	spin_unlock_bh(&sch->dev->queue_lock);
	return skb->len;

nla_put_failure:
	spin_unlock_bh(&sch->dev->queue_lock);
	nla_nest_cancel(skb, nest);
	return -1;
}
static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct htb_class *cl = (struct htb_class *)arg;
	struct nlattr *nest;
	struct tc_htb_opt opt;

	spin_lock_bh(&sch->dev->queue_lock);
	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
	tcm->tcm_handle = cl->common.classid;
	if (!cl->level && cl->un.leaf.q)
		tcm->tcm_info = cl->un.leaf.q->handle;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	memset(&opt, 0, sizeof(opt));

	opt.rate = cl->rate->rate;
	opt.buffer = cl->buffer;
	opt.ceil = cl->ceil->rate;
	opt.cbuffer = cl->cbuffer;
	opt.quantum = cl->un.leaf.quantum;
	opt.prio = cl->un.leaf.prio;
	opt.level = cl->level;
	NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);

	nla_nest_end(skb, nest);
	spin_unlock_bh(&sch->dev->queue_lock);
	return skb->len;

nla_put_failure:
	spin_unlock_bh(&sch->dev->queue_lock);
	nla_nest_cancel(skb, nest);
	return -1;
}
static int
htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
{
	struct htb_class *cl = (struct htb_class *)arg;

	if (!cl->level && cl->un.leaf.q)
		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
	cl->xstats.tokens = cl->tokens;
	cl->xstats.ctokens = cl->ctokens;

	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
		return -1;

	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}
static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct htb_class *cl = (struct htb_class *)arg;

	if (cl && !cl->level) {
		if (new == NULL &&
		    (new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
					     cl->common.classid))
		    == NULL)
			return -ENOBUFS;
		sch_tree_lock(sch);
		if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) {
			qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
			qdisc_reset(*old);
		}
		sch_tree_unlock(sch);
		return 0;
	}
	return -ENOENT;
}
static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct htb_class *cl = (struct htb_class *)arg;
	return (cl && !cl->level) ? cl->un.leaf.q : NULL;
}
static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
	struct htb_class *cl = (struct htb_class *)arg;

	if (cl->un.leaf.q->q.qlen == 0)
		htb_deactivate(qdisc_priv(sch), cl);
}
static unsigned long htb_get(struct Qdisc *sch, u32 classid)
{
	struct htb_class *cl = htb_find(classid, sch);
	if (cl)
		cl->refcnt++;
	return (unsigned long)cl;
}
static inline int htb_parent_last_child(struct htb_class *cl)
{
	if (!cl->parent)
		/* the root class */
		return 0;
	if (cl->parent->children > 1)
		/* not the last child */
		return 0;
	return 1;
}
static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
			       struct Qdisc *new_q)
{
	struct htb_class *parent = cl->parent;

	BUG_TRAP(!cl->level && cl->un.leaf.q && !cl->prio_activity);

	if (parent->cmode != HTB_CAN_SEND)
		htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);

	parent->level = 0;
	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
	parent->un.leaf.quantum = parent->quantum;
	parent->un.leaf.prio = parent->prio;
	parent->tokens = parent->buffer;
	parent->ctokens = parent->cbuffer;
	parent->t_c = psched_get_time();
	parent->cmode = HTB_CAN_SEND;
}
static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
{
	if (!cl->level) {
		BUG_TRAP(cl->un.leaf.q);
		qdisc_destroy(cl->un.leaf.q);
	}
	gen_kill_estimator(&cl->bstats, &cl->rate_est);
	qdisc_put_rtab(cl->rate);
	qdisc_put_rtab(cl->ceil);

	tcf_destroy_chain(&cl->filter_list);
	kfree(cl);
}
/* always called under BH & queue lock */
static void htb_destroy(struct Qdisc *sch)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct hlist_node *n, *next;
	struct htb_class *cl;
	unsigned int i;

	qdisc_watchdog_cancel(&q->watchdog);
	/* This line used to be after the htb_destroy_class call below
	   and surprisingly it worked in 2.4. But it must precede it
	   because filters need their target class alive to be able to call
	   unbind_filter on it (without an Oops). */
	tcf_destroy_chain(&q->filter_list);

	for (i = 0; i < q->clhash.hashsize; i++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
			tcf_destroy_chain(&cl->filter_list);
	}
	for (i = 0; i < q->clhash.hashsize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
					  common.hnode)
			htb_destroy_class(sch, cl);
	}
	qdisc_class_hash_destroy(&q->clhash);
	__skb_queue_purge(&q->direct_queue);
}
static int htb_delete(struct Qdisc *sch, unsigned long arg)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = (struct htb_class *)arg;
	unsigned int qlen;
	struct Qdisc *new_q = NULL;
	int last_child = 0;

	// TODO: why don't we allow deleting a subtree? references? does the
	// tc subsystem guarantee us that in htb_destroy it holds no class
	// refs so that we can remove children safely there?
	if (cl->children || cl->filter_cnt)
		return -EBUSY;

	if (!cl->level && htb_parent_last_child(cl)) {
		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
					  cl->parent->common.classid);
		last_child = 1;
	}

	sch_tree_lock(sch);

	if (!cl->level) {
		qlen = cl->un.leaf.q->q.qlen;
		qdisc_reset(cl->un.leaf.q);
		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
	}

	/* delete from hash and active; remainder in destroy_class */
	qdisc_class_hash_remove(&q->clhash, &cl->common);
	if (cl->parent)
		cl->parent->children--;

	if (cl->prio_activity)
		htb_deactivate(q, cl);

	if (cl->cmode != HTB_CAN_SEND)
		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);

	if (last_child)
		htb_parent_to_leaf(q, cl, new_q);

	if (--cl->refcnt == 0)
		htb_destroy_class(sch, cl);

	sch_tree_unlock(sch);
	return 0;
}
static void htb_put(struct Qdisc *sch, unsigned long arg)
{
	struct htb_class *cl = (struct htb_class *)arg;

	if (--cl->refcnt == 0)
		htb_destroy_class(sch, cl);
}
static int htb_change_class(struct Qdisc *sch, u32 classid,
			    u32 parentid, struct nlattr **tca,
			    unsigned long *arg)
{
	int err = -EINVAL;
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = (struct htb_class *)*arg, *parent;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
	struct nlattr *tb[TCA_HTB_RTAB + 1];
	struct tc_htb_opt *hopt;

	/* extract all subattrs from opt attr */
	if (!opt)
		goto failure;

	err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy);
	if (err < 0)
		goto failure;

	err = -EINVAL;
	if (tb[TCA_HTB_PARMS] == NULL)
		goto failure;

	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);

	hopt = nla_data(tb[TCA_HTB_PARMS]);

	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
	if (!rtab || !ctab)
		goto failure;

	if (!cl) {		/* new class */
		struct Qdisc *new_q;
		int prio;
		struct {
			struct nlattr		nla;
			struct gnet_estimator	opt;
		} est = {
			.nla = {
				.nla_len	= nla_attr_size(sizeof(est.opt)),
				.nla_type	= TCA_RATE,
			},
			.opt = {
				/* 4s interval, 16s averaging constant */
				.interval	= 2,
				.ewma_log	= 2,
			},
		};

		/* check for valid classid */
		if (!classid || TC_H_MAJ(classid ^ sch->handle)
		    || htb_find(classid, sch))
			goto failure;

		/* check maximal depth */
		if (parent && parent->parent && parent->parent->level < 2) {
			printk(KERN_ERR "htb: tree is too deep\n");
			goto failure;
		}
		err = -ENOBUFS;
		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
			goto failure;

		gen_new_estimator(&cl->bstats, &cl->rate_est,
				  &sch->dev->queue_lock,
				  tca[TCA_RATE] ? : &est.nla);
		cl->refcnt = 1;
		cl->children = 0;
		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
		RB_CLEAR_NODE(&cl->pq_node);

		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
			RB_CLEAR_NODE(&cl->node[prio]);

		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
		   which can't be used inside sch_tree_lock
		   -- thanks to Karlis Peisenieks */
		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid);
		sch_tree_lock(sch);
		if (parent && !parent->level) {
			unsigned int qlen = parent->un.leaf.q->q.qlen;

			/* turn parent into inner node */
			qdisc_reset(parent->un.leaf.q);
			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
			qdisc_destroy(parent->un.leaf.q);
			if (parent->prio_activity)
				htb_deactivate(q, parent);

			/* remove from evt list because of level change */
			if (parent->cmode != HTB_CAN_SEND) {
				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
				parent->cmode = HTB_CAN_SEND;
			}
			parent->level = (parent->parent ? parent->parent->level
					 : TC_HTB_MAXDEPTH) - 1;
			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
		}
		/* leaf (we) needs elementary qdisc */
		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;

		cl->common.classid = classid;
		cl->parent = parent;

		/* set class to be in HTB_CAN_SEND state */
		cl->tokens = hopt->buffer;
		cl->ctokens = hopt->cbuffer;
		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
		cl->t_c = psched_get_time();
		cl->cmode = HTB_CAN_SEND;

		/* attach to the hash list and parent's family */
		qdisc_class_hash_insert(&q->clhash, &cl->common);
		if (parent)
			parent->children++;
	} else {
		if (tca[TCA_RATE])
			gen_replace_estimator(&cl->bstats, &cl->rate_est,
					      &sch->dev->queue_lock,
					      tca[TCA_RATE]);
		sch_tree_lock(sch);
	}

	/* there used to be a nasty bug here: we have to check that the node
	   is really a leaf before changing cl->un.leaf ! */
	if (!cl->level) {
		cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum;
		if (!hopt->quantum && cl->un.leaf.quantum < 1000) {
			printk(KERN_WARNING
			       "HTB: quantum of class %X is small. Consider r2q change.\n",
			       cl->common.classid);
			cl->un.leaf.quantum = 1000;
		}
		if (!hopt->quantum && cl->un.leaf.quantum > 200000) {
			printk(KERN_WARNING
			       "HTB: quantum of class %X is big. Consider r2q change.\n",
			       cl->common.classid);
			cl->un.leaf.quantum = 200000;
		}
		if (hopt->quantum)
			cl->un.leaf.quantum = hopt->quantum;
		if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO)
			cl->un.leaf.prio = TC_HTB_NUMPRIO - 1;

		/* backup for htb_parent_to_leaf */
		cl->quantum = cl->un.leaf.quantum;
		cl->prio = cl->un.leaf.prio;
	}

	cl->buffer = hopt->buffer;
	cl->cbuffer = hopt->cbuffer;
	if (cl->rate)
		qdisc_put_rtab(cl->rate);
	cl->rate = rtab;
	if (cl->ceil)
		qdisc_put_rtab(cl->ceil);
	cl->ceil = ctab;
	sch_tree_unlock(sch);

	qdisc_class_hash_grow(sch, &q->clhash);

	*arg = (unsigned long)cl;
	return 0;

failure:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ctab)
		qdisc_put_rtab(ctab);
	return err;
}
static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = (struct htb_class *)arg;
	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;

	return fl;
}
static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
				     u32 classid)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = htb_find(classid, sch);

	/*if (cl && !cl->level) return 0;
	   The line above used to be there to prevent attaching filters to
	   leaves. But at least the tc_index filter uses this just to get a
	   class for other reasons, so we have to allow it.
	   ----
	   19.6.2002 As Werner explained it is ok - bind filter is just
	   another way to "lock" the class - unlike "get" this lock can
	   be broken by class during destroy IIUC.
	 */
	if (cl)
		cl->filter_cnt++;
	else
		q->filter_cnt++;
	return (unsigned long)cl;
}
static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = (struct htb_class *)arg;

	if (cl)
		cl->filter_cnt--;
	else
		q->filter_cnt--;
}
static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl;
	struct hlist_node *n;
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->clhash.hashsize; i++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}
static const struct Qdisc_class_ops htb_class_ops = {
	.graft		=	htb_graft,
	.leaf		=	htb_leaf,
	.qlen_notify	=	htb_qlen_notify,
	.get		=	htb_get,
	.put		=	htb_put,
	.change		=	htb_change_class,
	.delete		=	htb_delete,
	.walk		=	htb_walk,
	.tcf_chain	=	htb_find_tcf,
	.bind_tcf	=	htb_bind_filter,
	.unbind_tcf	=	htb_unbind_filter,
	.dump		=	htb_dump_class,
	.dump_stats	=	htb_dump_class_stats,
};
static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&htb_class_ops,
	.id		=	"htb",
	.priv_size	=	sizeof(struct htb_sched),
	.enqueue	=	htb_enqueue,
	.dequeue	=	htb_dequeue,
	.requeue	=	htb_requeue,
	.drop		=	htb_drop,
	.init		=	htb_init,
	.reset		=	htb_reset,
	.destroy	=	htb_destroy,
	.change		=	NULL /* htb_change */,
	.dump		=	htb_dump,
	.owner		=	THIS_MODULE,
};
static int __init htb_module_init(void)
{
	return register_qdisc(&htb_qdisc_ops);
}
static void __exit htb_module_exit(void)
{
	unregister_qdisc(&htb_qdisc_ops);
}

module_init(htb_module_init)
module_exit(htb_module_exit)
MODULE_LICENSE("GPL");