2 * net/sched/sch_api.c Packet scheduler API.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
32 #include <net/net_namespace.h>
34 #include <net/netlink.h>
35 #include <net/pkt_sched.h>
37 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
38 struct Qdisc *old, struct Qdisc *new);
39 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
40 struct Qdisc *q, unsigned long cl, int event);
47 This file consists of two interrelated parts:
49 1. queueing disciplines manager frontend.
50 2. traffic classes manager frontend.
52 In general, a queueing discipline ("qdisc") is a black box,
53 which is able to enqueue packets and to dequeue them (when
54 the device is ready to send something) in an order and at times
55 determined by the algorithm hidden inside it.
57 Qdiscs are divided into two categories:
58 - "queues", which have no internal structure visible from the outside.
59 - "schedulers", which split all the packets into "traffic classes",
60 using "packet classifiers" (see cls_api.c).
62 In turn, classes may have child qdiscs (as a rule, queues)
63 attached to them, and so on.
65 The goal of the routines in this file is to translate
66 the information supplied by the user in the form of handles
67 into a form more intelligible to the kernel, to perform some sanity
68 checks and the part of the work that is common to all qdiscs,
69 and to provide rtnetlink notifications.
71 All the really intelligent work is done inside the qdisc modules.
75 Every discipline has two major routines: enqueue and dequeue.
79 dequeue usually returns an skb to send. It is allowed to return NULL,
80 but that does not mean the queue is empty; it just means that the
81 discipline does not want to send anything at this time.
82 The queue is really empty only when q->q.qlen == 0.
83 For complicated disciplines with multiple queues, q->q is not the
84 real packet queue, but q->q.qlen must nevertheless be valid.
88 enqueue returns 0 if the packet was enqueued successfully.
89 If a packet (this one or another one) was dropped, it returns
91 NET_XMIT_DROP - this packet was dropped.
92 Expected action: do not back off, but wait until the queue clears.
93 NET_XMIT_CN - this packet was probably enqueued, but another one was dropped.
94 Expected action: back off or ignore.
95 NET_XMIT_POLICED - dropped by a policer.
96 Expected action: back off or report an error to real-time applications.
102 requeues a packet that has already been dequeued. It is used for non-standard or
103 simply buggy devices, which can defer output even when netif_queue_stopped() == 0.
107 like dequeue, but without removing the packet from the queue
111 returns the qdisc to its initial state: purges all buffers, clears all
112 timers and counters (except for statistics), etc.
116 initializes a newly created qdisc.
120 destroys resources allocated by init and during the lifetime of the qdisc.
124 changes qdisc parameters. (An illustrative, non-authoritative sketch of a minimal Qdisc_ops built on these hooks follows this comment block.)
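/*
 * An illustrative, non-authoritative sketch (not part of sch_api.c): a
 * minimal FIFO-style qdisc wired onto the hooks described above.  All
 * "example_" identifiers are hypothetical; the Qdisc_ops fields and the
 * NET_XMIT_* return codes are the ones this file itself references, and the
 * headers already included above are assumed to be sufficient.  Real qdiscs
 * live in net/sched/sch_*.c; treat this purely as a reading aid.
 */
static int example_fifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (skb_queue_len(&sch->q) >= qdisc_dev(sch)->tx_queue_len) {
		/* This very packet is refused: NET_XMIT_DROP. */
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_DROP;
	}
	__skb_queue_tail(&sch->q, skb);
	sch->bstats.bytes += qdisc_pkt_len(skb);
	sch->bstats.packets++;
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *example_fifo_dequeue(struct Qdisc *sch)
{
	/* NULL only means "nothing to send right now";
	 * real emptiness is sch->q.qlen == 0. */
	return __skb_dequeue(&sch->q);
}

static struct sk_buff *example_fifo_peek(struct Qdisc *sch)
{
	/* Like dequeue, but the packet stays queued. */
	return skb_peek(&sch->q);
}

static void example_fifo_reset(struct Qdisc *sch)
{
	/* Back to the initial state: purge whatever is still queued. */
	skb_queue_purge(&sch->q);
}

static struct Qdisc_ops example_fifo_qdisc_ops __read_mostly = {
	.id		= "example_fifo",
	.priv_size	= 0,
	.enqueue	= example_fifo_enqueue,
	.dequeue	= example_fifo_dequeue,
	.peek		= example_fifo_peek,
	.reset		= example_fifo_reset,
	.owner		= THIS_MODULE,
};

/* Registration is ordinary module boilerplate around register_qdisc(). */
static int __init example_fifo_module_init(void)
{
	return register_qdisc(&example_fifo_qdisc_ops);
}

static void __exit example_fifo_module_exit(void)
{
	unregister_qdisc(&example_fifo_qdisc_ops);
}
module_init(example_fifo_module_init);
module_exit(example_fifo_module_exit);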
127 /* Protects list of registered TC modules. It is pure SMP lock. */
128 static DEFINE_RWLOCK(qdisc_mod_lock);
131 /************************************************
132 * Queueing disciplines manipulation. *
133 ************************************************/
136 /* The list of all installed queueing disciplines. */
138 static struct Qdisc_ops *qdisc_base;
140 /* Register/unregister queueing discipline */
142 int register_qdisc(struct Qdisc_ops *qops)
144 struct Qdisc_ops *q, **qp;
147 write_lock(&qdisc_mod_lock);
148 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
149 if (!strcmp(qops->id, q->id))
152 if (qops->enqueue == NULL)
153 qops->enqueue = noop_qdisc_ops.enqueue;
154 if (qops->requeue == NULL)
155 qops->requeue = noop_qdisc_ops.requeue;
156 if (qops->peek == NULL) {
157 if (qops->dequeue == NULL) {
158 qops->peek = noop_qdisc_ops.peek;
164 if (qops->dequeue == NULL)
165 qops->dequeue = noop_qdisc_ops.dequeue;
171 write_unlock(&qdisc_mod_lock);
174 EXPORT_SYMBOL(register_qdisc);
176 int unregister_qdisc(struct Qdisc_ops *qops)
178 struct Qdisc_ops *q, **qp;
181 write_lock(&qdisc_mod_lock);
182 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
190 write_unlock(&qdisc_mod_lock);
193 EXPORT_SYMBOL(unregister_qdisc);
195 /* We know the handle. Find the qdisc among all qdiscs attached to the device
196 (the root qdisc, all its children, children of children, etc.)
199 struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
203 if (!(root->flags & TCQ_F_BUILTIN) &&
204 root->handle == handle)
207 list_for_each_entry(q, &root->list, list) {
208 if (q->handle == handle)
215 * This lock is needed until some qdiscs stop calling qdisc_tree_decrease_qlen()
216 * without rtnl_lock(); currently hfsc_dequeue(), netem_dequeue(), tbf_dequeue()
218 static DEFINE_SPINLOCK(qdisc_list_lock);
220 static void qdisc_list_add(struct Qdisc *q)
222 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
223 spin_lock_bh(&qdisc_list_lock);
224 list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
225 spin_unlock_bh(&qdisc_list_lock);
229 void qdisc_list_del(struct Qdisc *q)
231 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
232 spin_lock_bh(&qdisc_list_lock);
234 spin_unlock_bh(&qdisc_list_lock);
237 EXPORT_SYMBOL(qdisc_list_del);
239 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
244 spin_lock_bh(&qdisc_list_lock);
246 for (i = 0; i < dev->num_tx_queues; i++) {
247 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
248 struct Qdisc *txq_root = txq->qdisc_sleeping;
250 q = qdisc_match_from_root(txq_root, handle);
255 q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
258 spin_unlock_bh(&qdisc_list_lock);
263 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
267 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
271 cl = cops->get(p, classid);
275 leaf = cops->leaf(p, cl);
280 /* Find queueing discipline by name */
282 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
284 struct Qdisc_ops *q = NULL;
287 read_lock(&qdisc_mod_lock);
288 for (q = qdisc_base; q; q = q->next) {
289 if (nla_strcmp(kind, q->id) == 0) {
290 if (!try_module_get(q->owner))
295 read_unlock(&qdisc_mod_lock);
300 static struct qdisc_rate_table *qdisc_rtab_list;
302 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
304 struct qdisc_rate_table *rtab;
306 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
307 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
313 if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
314 nla_len(tab) != TC_RTAB_SIZE)
317 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
321 memcpy(rtab->data, nla_data(tab), 1024);
322 rtab->next = qdisc_rtab_list;
323 qdisc_rtab_list = rtab;
327 EXPORT_SYMBOL(qdisc_get_rtab);
329 void qdisc_put_rtab(struct qdisc_rate_table *tab)
331 struct qdisc_rate_table *rtab, **rtabp;
333 if (!tab || --tab->refcnt)
336 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
344 EXPORT_SYMBOL(qdisc_put_rtab);
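/*
 * An illustrative sketch (not from this file) of how a shaping qdisc
 * typically consumes the rate-table API above.  The "example_" names and
 * the way the two attributes reach this helper are assumptions of this
 * sketch; compare the real users, e.g. sch_tbf.c and sch_htb.c.  The table
 * is looked up (and reference-counted) at init/change time and released in
 * destroy.
 */
struct example_rate_qdisc {
	struct qdisc_rate_table	*rtab;	/* shared, refcounted lookup table */
};

static int example_rate_apply(struct Qdisc *sch, struct nlattr *parms,
			      struct nlattr *rtab_attr)
{
	/* parms carries a struct tc_ratespec, rtab_attr the TC_RTAB_SIZE
	 * byte table; both attribute choices are hypothetical here. */
	struct example_rate_qdisc *q = qdisc_priv(sch);
	struct qdisc_rate_table *new_rtab;

	new_rtab = qdisc_get_rtab(nla_data(parms), rtab_attr);
	if (new_rtab == NULL)
		return -EINVAL;

	qdisc_put_rtab(q->rtab);	/* NULL-safe: drops any table held so far */
	q->rtab = new_rtab;
	return 0;
}

static void example_rate_destroy(struct Qdisc *sch)
{
	struct example_rate_qdisc *q = qdisc_priv(sch);

	qdisc_put_rtab(q->rtab);	/* the table is freed once the last user is gone */
}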
346 static LIST_HEAD(qdisc_stab_list);
347 static DEFINE_SPINLOCK(qdisc_stab_lock);
349 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
350 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
351 [TCA_STAB_DATA] = { .type = NLA_BINARY },
354 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
356 struct nlattr *tb[TCA_STAB_MAX + 1];
357 struct qdisc_size_table *stab;
358 struct tc_sizespec *s;
359 unsigned int tsize = 0;
363 err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
366 if (!tb[TCA_STAB_BASE])
367 return ERR_PTR(-EINVAL);
369 s = nla_data(tb[TCA_STAB_BASE]);
372 if (!tb[TCA_STAB_DATA])
373 return ERR_PTR(-EINVAL);
374 tab = nla_data(tb[TCA_STAB_DATA]);
375 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
378 if (!s || tsize != s->tsize || (!tab && tsize > 0))
379 return ERR_PTR(-EINVAL);
381 spin_lock(&qdisc_stab_lock);
383 list_for_each_entry(stab, &qdisc_stab_list, list) {
384 if (memcmp(&stab->szopts, s, sizeof(*s)))
386 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
389 spin_unlock(&qdisc_stab_lock);
393 spin_unlock(&qdisc_stab_lock);
395 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
397 return ERR_PTR(-ENOMEM);
402 memcpy(stab->data, tab, tsize * sizeof(u16));
404 spin_lock(&qdisc_stab_lock);
405 list_add_tail(&stab->list, &qdisc_stab_list);
406 spin_unlock(&qdisc_stab_lock);
411 void qdisc_put_stab(struct qdisc_size_table *tab)
416 spin_lock(&qdisc_stab_lock);
418 if (--tab->refcnt == 0) {
419 list_del(&tab->list);
423 spin_unlock(&qdisc_stab_lock);
425 EXPORT_SYMBOL(qdisc_put_stab);
427 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
431 nest = nla_nest_start(skb, TCA_STAB);
432 NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
433 nla_nest_end(skb, nest);
441 void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
445 pkt_len = skb->len + stab->szopts.overhead;
446 if (unlikely(!stab->szopts.tsize))
449 slot = pkt_len + stab->szopts.cell_align;
450 if (unlikely(slot < 0))
453 slot >>= stab->szopts.cell_log;
454 if (likely(slot < stab->szopts.tsize))
455 pkt_len = stab->data[slot];
457 pkt_len = stab->data[stab->szopts.tsize - 1] *
458 (slot / stab->szopts.tsize) +
459 stab->data[slot % stab->szopts.tsize];
461 pkt_len <<= stab->szopts.size_log;
463 if (unlikely(pkt_len < 1))
465 qdisc_skb_cb(skb)->pkt_len = pkt_len;
467 EXPORT_SYMBOL(qdisc_calculate_pkt_len);
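/*
 * Worked example (an added illustration; the table values are hypothetical):
 * with szopts = { .cell_log = 6, .size_log = 0, .cell_align = 0,
 * .overhead = 0, .tsize = 4 } and data[] = { 64, 128, 192, 256 }
 * (i.e. "round up to 64-byte cells"), a 130-byte skb gives
 * slot = 130 >> 6 = 2 and pkt_len = data[2] = 192.
 * A 600-byte skb overflows the table: slot = 600 >> 6 = 9, so
 * pkt_len = data[3] * (9 / 4) + data[9 % 4] = 256 * 2 + 128 = 640,
 * which is again ten 64-byte cells.
 */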
469 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
471 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
474 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
476 __netif_schedule(qdisc_root(wd->qdisc));
478 return HRTIMER_NORESTART;
481 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
483 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
484 wd->timer.function = qdisc_watchdog;
487 EXPORT_SYMBOL(qdisc_watchdog_init);
489 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
493 if (test_bit(__QDISC_STATE_DEACTIVATED,
494 &qdisc_root_sleeping(wd->qdisc)->state))
497 wd->qdisc->flags |= TCQ_F_THROTTLED;
498 time = ktime_set(0, 0);
499 time = ktime_add_ns(time, PSCHED_US2NS(expires));
500 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
502 EXPORT_SYMBOL(qdisc_watchdog_schedule);
504 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
506 hrtimer_cancel(&wd->timer);
507 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
509 EXPORT_SYMBOL(qdisc_watchdog_cancel);
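/*
 * An illustrative sketch (not from this file) of the watchdog's usual life
 * cycle inside a rate-limiting qdisc.  The "example_" names and the
 * next_send bookkeeping are hypothetical; sch_tbf.c and sch_netem.c are
 * real users of this API.
 */
struct example_wd_qdisc {
	struct qdisc_watchdog	watchdog;
	psched_time_t		next_send;	/* earliest time the head packet may go */
};

static int example_wd_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct example_wd_qdisc *q = qdisc_priv(sch);

	qdisc_watchdog_init(&q->watchdog, sch);
	q->next_send = psched_get_time();
	return 0;
}

static struct sk_buff *example_wd_dequeue(struct Qdisc *sch)
{
	struct example_wd_qdisc *q = qdisc_priv(sch);

	if (psched_get_time() < q->next_send) {
		/* Too early: arm the hrtimer.  When it fires it clears
		 * TCQ_F_THROTTLED and reschedules the qdisc (see above). */
		qdisc_watchdog_schedule(&q->watchdog, q->next_send);
		return NULL;
	}
	return __skb_dequeue(&sch->q);
}

static void example_wd_destroy(struct Qdisc *sch)
{
	struct example_wd_qdisc *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
}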
511 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
513 unsigned int size = n * sizeof(struct hlist_head), i;
514 struct hlist_head *h;
516 if (size <= PAGE_SIZE)
517 h = kmalloc(size, GFP_KERNEL);
519 h = (struct hlist_head *)
520 __get_free_pages(GFP_KERNEL, get_order(size));
523 for (i = 0; i < n; i++)
524 INIT_HLIST_HEAD(&h[i]);
529 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
531 unsigned int size = n * sizeof(struct hlist_head);
533 if (size <= PAGE_SIZE)
536 free_pages((unsigned long)h, get_order(size));
539 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
541 struct Qdisc_class_common *cl;
542 struct hlist_node *n, *next;
543 struct hlist_head *nhash, *ohash;
544 unsigned int nsize, nmask, osize;
547 /* Rehash when load factor exceeds 0.75 */
548 if (clhash->hashelems * 4 <= clhash->hashsize * 3)
550 nsize = clhash->hashsize * 2;
552 nhash = qdisc_class_hash_alloc(nsize);
556 ohash = clhash->hash;
557 osize = clhash->hashsize;
560 for (i = 0; i < osize; i++) {
561 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
562 h = qdisc_class_hash(cl->classid, nmask);
563 hlist_add_head(&cl->hnode, &nhash[h]);
566 clhash->hash = nhash;
567 clhash->hashsize = nsize;
568 clhash->hashmask = nmask;
569 sch_tree_unlock(sch);
571 qdisc_class_hash_free(ohash, osize);
573 EXPORT_SYMBOL(qdisc_class_hash_grow);
575 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
577 unsigned int size = 4;
579 clhash->hash = qdisc_class_hash_alloc(size);
580 if (clhash->hash == NULL)
582 clhash->hashsize = size;
583 clhash->hashmask = size - 1;
584 clhash->hashelems = 0;
587 EXPORT_SYMBOL(qdisc_class_hash_init);
589 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
591 qdisc_class_hash_free(clhash->hash, clhash->hashsize);
593 EXPORT_SYMBOL(qdisc_class_hash_destroy);
595 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
596 struct Qdisc_class_common *cl)
600 INIT_HLIST_NODE(&cl->hnode);
601 h = qdisc_class_hash(cl->classid, clhash->hashmask);
602 hlist_add_head(&cl->hnode, &clhash->hash[h]);
605 EXPORT_SYMBOL(qdisc_class_hash_insert);
607 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
608 struct Qdisc_class_common *cl)
610 hlist_del(&cl->hnode);
613 EXPORT_SYMBOL(qdisc_class_hash_remove);
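/*
 * An illustrative sketch (not from this file) of how a classful qdisc uses
 * the class-hash helpers above.  The "example_" names are hypothetical;
 * sch_htb.c is a real user.  Each class embeds a Qdisc_class_common whose
 * classid keys the hash.
 */
struct example_hash_class {
	struct Qdisc_class_common common;	/* classid + hash linkage */
	/* ... per-class scheduling state would follow ... */
};

struct example_hash_qdisc {
	struct Qdisc_class_hash clhash;
};

static int example_hash_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct example_hash_qdisc *q = qdisc_priv(sch);

	return qdisc_class_hash_init(&q->clhash);
}

static int example_hash_new_class(struct Qdisc *sch, u32 classid)
{
	struct example_hash_qdisc *q = qdisc_priv(sch);
	struct example_hash_class *cl = kzalloc(sizeof(*cl), GFP_KERNEL);

	if (cl == NULL)
		return -ENOBUFS;
	cl->common.classid = classid;

	sch_tree_lock(sch);
	qdisc_class_hash_insert(&q->clhash, &cl->common);
	sch_tree_unlock(sch);

	/* Rehash outside the tree lock once the load factor gets too high. */
	qdisc_class_hash_grow(sch, &q->clhash);
	return 0;
}

static void example_hash_destroy(struct Qdisc *sch)
{
	struct example_hash_qdisc *q = qdisc_priv(sch);

	/* The classes themselves must already have been freed by a walk;
	 * this only releases the bucket array. */
	qdisc_class_hash_destroy(&q->clhash);
}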
615 /* Allocate a unique handle from the space managed by the kernel */
617 static u32 qdisc_alloc_handle(struct net_device *dev)
620 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
623 autohandle += TC_H_MAKE(0x10000U, 0);
624 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
625 autohandle = TC_H_MAKE(0x80000000U, 0);
626 } while (qdisc_lookup(dev, autohandle) && --i > 0);
628 return i>0 ? autohandle : 0;
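/*
 * Worked example (an added illustration): starting from
 * TC_H_MAKE(0x80000000U, 0), the automatically allocated handles come out as
 * 0x80010000, 0x80020000, ... i.e. 8001:0, 8002:0, ... in major:minor
 * notation, because each step adds 0x10000 to the major part.  If the
 * counter reaches the value reserved for TC_H_ROOT it wraps back to 8000:0,
 * and 0 is returned when no unused handle is found within the bounded
 * number of attempts.
 */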
631 /* Attach toplevel qdisc to device queue. */
633 static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
636 struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
637 spinlock_t *root_lock;
639 root_lock = qdisc_lock(oqdisc);
640 spin_lock_bh(root_lock);
642 /* Prune old scheduler */
643 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
646 /* ... and graft new one */
649 dev_queue->qdisc_sleeping = qdisc;
650 rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
652 spin_unlock_bh(root_lock);
657 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
659 const struct Qdisc_class_ops *cops;
665 while ((parentid = sch->parent)) {
666 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
669 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
671 WARN_ON(parentid != TC_H_ROOT);
674 cops = sch->ops->cl_ops;
675 if (cops->qlen_notify) {
676 cl = cops->get(sch, parentid);
677 cops->qlen_notify(sch, cl);
683 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
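/*
 * An illustrative sketch (not from this file): a qdisc that discards packets
 * outside its normal enqueue/dequeue path must tell its ancestors that their
 * cached queue lengths shrank, or classful parents keep counting ghosts.
 * The "example_" name is hypothetical; real callers include the dequeue
 * paths named in the qdisc_list_lock comment above (hfsc, netem, tbf).
 */
static void example_purge_some(struct Qdisc *sch, unsigned int to_drop)
{
	unsigned int dropped = 0;
	struct sk_buff *skb;

	/* __skb_dequeue() keeps this qdisc's own q.qlen in sync... */
	while (dropped < to_drop && (skb = __skb_dequeue(&sch->q)) != NULL) {
		sch->qstats.drops++;
		kfree_skb(skb);
		dropped++;
	}
	/* ...but every ancestor still believes it holds the old count. */
	qdisc_tree_decrease_qlen(sch, dropped);
}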
685 static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
686 struct Qdisc *old, struct Qdisc *new)
689 qdisc_notify(skb, n, clid, old, new);
695 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
698 * When appropriate send a netlink notification using 'skb'
701 * On success, destroy old qdisc.
704 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
705 struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
706 struct Qdisc *new, struct Qdisc *old)
708 struct Qdisc *q = old;
711 if (parent == NULL) {
712 unsigned int i, num_q, ingress;
715 num_q = dev->num_tx_queues;
716 if ((q && q->flags & TCQ_F_INGRESS) ||
717 (new && new->flags & TCQ_F_INGRESS)) {
722 if (dev->flags & IFF_UP)
725 for (i = 0; i < num_q; i++) {
726 struct netdev_queue *dev_queue = &dev->rx_queue;
729 dev_queue = netdev_get_tx_queue(dev, i);
731 old = dev_graft_qdisc(dev_queue, new);
733 atomic_inc(&new->refcnt);
735 notify_and_destroy(skb, n, classid, old, new);
738 if (dev->flags & IFF_UP)
741 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
746 unsigned long cl = cops->get(parent, classid);
748 err = cops->graft(parent, cl, new, &old);
749 cops->put(parent, cl);
753 notify_and_destroy(skb, n, classid, old, new);
758 /* lockdep annotation is needed for ingress; egress gets it only for name */
759 static struct lock_class_key qdisc_tx_lock;
760 static struct lock_class_key qdisc_rx_lock;
763 Allocate and initialize a new qdisc.
765 Parameters are passed via opt.
768 static struct Qdisc *
769 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
770 u32 parent, u32 handle, struct nlattr **tca, int *errp)
773 struct nlattr *kind = tca[TCA_KIND];
775 struct Qdisc_ops *ops;
776 struct qdisc_size_table *stab;
778 ops = qdisc_lookup_ops(kind);
779 #ifdef CONFIG_MODULES
780 if (ops == NULL && kind != NULL) {
782 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
783 /* We dropped the RTNL semaphore in order to
784 * perform the module load. So, even if we
785 * succeeded in loading the module we have to
786 * tell the caller to replay the request. We
787 * indicate this using -EAGAIN.
788 * We replay the request because the device may
789 * go away in the meantime.
792 request_module("sch_%s", name);
794 ops = qdisc_lookup_ops(kind);
796 /* We will try qdisc_lookup_ops again,
797 * so don't keep a reference.
799 module_put(ops->owner);
811 sch = qdisc_alloc(dev_queue, ops);
817 sch->parent = parent;
819 if (handle == TC_H_INGRESS) {
820 sch->flags |= TCQ_F_INGRESS;
821 handle = TC_H_MAKE(TC_H_INGRESS, 0);
822 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
825 handle = qdisc_alloc_handle(dev);
830 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
833 sch->handle = handle;
835 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
837 stab = qdisc_get_stab(tca[TCA_STAB]);
845 spinlock_t *root_lock;
847 if ((sch->parent != TC_H_ROOT) &&
848 !(sch->flags & TCQ_F_INGRESS))
849 root_lock = qdisc_root_sleeping_lock(sch);
851 root_lock = qdisc_lock(sch);
853 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
854 root_lock, tca[TCA_RATE]);
857 * Any broken qdiscs that would require
858 * an ops->reset() here? The qdisc was never
859 * in action, so it shouldn't be necessary.
872 qdisc_put_stab(sch->stab);
874 kfree((char *) sch - sch->padded);
876 module_put(ops->owner);
882 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
884 struct qdisc_size_table *stab = NULL;
887 if (tca[TCA_OPTIONS]) {
888 if (sch->ops->change == NULL)
890 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
896 stab = qdisc_get_stab(tca[TCA_STAB]);
898 return PTR_ERR(stab);
901 qdisc_put_stab(sch->stab);
905 gen_replace_estimator(&sch->bstats, &sch->rate_est,
906 qdisc_root_sleeping_lock(sch),
911 struct check_loop_arg
913 struct qdisc_walker w;
918 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
920 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
922 struct check_loop_arg arg;
924 if (q->ops->cl_ops == NULL)
927 arg.w.stop = arg.w.skip = arg.w.count = 0;
928 arg.w.fn = check_loop_fn;
931 q->ops->cl_ops->walk(q, &arg.w);
932 return arg.w.stop ? -ELOOP : 0;
936 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
939 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
940 struct check_loop_arg *arg = (struct check_loop_arg *)w;
942 leaf = cops->leaf(q, cl);
944 if (leaf == arg->p || arg->depth > 7)
946 return check_loop(leaf, arg->p, arg->depth + 1);
955 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
957 struct net *net = sock_net(skb->sk);
958 struct tcmsg *tcm = NLMSG_DATA(n);
959 struct nlattr *tca[TCA_MAX + 1];
960 struct net_device *dev;
961 u32 clid = tcm->tcm_parent;
962 struct Qdisc *q = NULL;
963 struct Qdisc *p = NULL;
966 if (net != &init_net)
969 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
972 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
977 if (clid != TC_H_ROOT) {
978 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
979 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
981 q = qdisc_leaf(p, clid);
982 } else { /* ingress */
983 q = dev->rx_queue.qdisc_sleeping;
986 struct netdev_queue *dev_queue;
987 dev_queue = netdev_get_tx_queue(dev, 0);
988 q = dev_queue->qdisc_sleeping;
993 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
996 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1000 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1003 if (n->nlmsg_type == RTM_DELQDISC) {
1008 if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
1011 qdisc_notify(skb, n, clid, NULL, q);
1017 Create/change qdisc.
1020 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1022 struct net *net = sock_net(skb->sk);
1024 struct nlattr *tca[TCA_MAX + 1];
1025 struct net_device *dev;
1027 struct Qdisc *q, *p;
1030 if (net != &init_net)
1034 /* Reinit, just in case something touches this. */
1035 tcm = NLMSG_DATA(n);
1036 clid = tcm->tcm_parent;
1039 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1042 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1047 if (clid != TC_H_ROOT) {
1048 if (clid != TC_H_INGRESS) {
1049 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1051 q = qdisc_leaf(p, clid);
1052 } else { /*ingress */
1053 q = dev->rx_queue.qdisc_sleeping;
1056 struct netdev_queue *dev_queue;
1057 dev_queue = netdev_get_tx_queue(dev, 0);
1058 q = dev_queue->qdisc_sleeping;
1061 /* It may be the default qdisc; ignore it */
1062 if (q && q->handle == 0)
1065 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1066 if (tcm->tcm_handle) {
1067 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1069 if (TC_H_MIN(tcm->tcm_handle))
1071 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1072 goto create_n_graft;
1073 if (n->nlmsg_flags&NLM_F_EXCL)
1075 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1078 (p && check_loop(q, p, 0)))
1080 atomic_inc(&q->refcnt);
1084 goto create_n_graft;
1086 /* This magic test requires explanation.
1088 * We know that some child q is already
1089 * attached to this parent and we have a choice:
1090 * either to change it or to create/graft a new one.
1092 * 1. We are allowed to create/graft only
1093 * if the CREATE and REPLACE flags are set.
1095 * 2. If EXCL is set, the requestor wanted to say
1096 * that the qdisc tcm_handle is not expected
1097 * to exist, so we choose create/graft too.
1099 * 3. The last case is when no flags are set.
1100 * Alas, it is a sort of hole in the API: we
1101 * cannot decide what to do unambiguously.
1102 * For now we select create/graft if the
1103 * user gave a KIND which does not match the existing one.
1105 if ((n->nlmsg_flags&NLM_F_CREATE) &&
1106 (n->nlmsg_flags&NLM_F_REPLACE) &&
1107 ((n->nlmsg_flags&NLM_F_EXCL) ||
1109 nla_strcmp(tca[TCA_KIND], q->ops->id))))
1110 goto create_n_graft;
1114 if (!tcm->tcm_handle)
1116 q = qdisc_lookup(dev, tcm->tcm_handle);
1119 /* Change qdisc parameters */
1122 if (n->nlmsg_flags&NLM_F_EXCL)
1124 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1126 err = qdisc_change(q, tca);
1128 qdisc_notify(skb, n, clid, NULL, q);
1132 if (!(n->nlmsg_flags&NLM_F_CREATE))
1134 if (clid == TC_H_INGRESS)
1135 q = qdisc_create(dev, &dev->rx_queue,
1136 tcm->tcm_parent, tcm->tcm_parent,
1139 q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
1140 tcm->tcm_parent, tcm->tcm_handle,
1149 err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1159 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1160 u32 pid, u32 seq, u16 flags, int event)
1163 struct nlmsghdr *nlh;
1164 unsigned char *b = skb_tail_pointer(skb);
1167 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1168 tcm = NLMSG_DATA(nlh);
1169 tcm->tcm_family = AF_UNSPEC;
1172 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1173 tcm->tcm_parent = clid;
1174 tcm->tcm_handle = q->handle;
1175 tcm->tcm_info = atomic_read(&q->refcnt);
1176 NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1177 if (q->ops->dump && q->ops->dump(q, skb) < 0)
1178 goto nla_put_failure;
1179 q->qstats.qlen = q->q.qlen;
1181 if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1182 goto nla_put_failure;
1184 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1185 qdisc_root_sleeping_lock(q), &d) < 0)
1186 goto nla_put_failure;
1188 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1189 goto nla_put_failure;
1191 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1192 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
1193 gnet_stats_copy_queue(&d, &q->qstats) < 0)
1194 goto nla_put_failure;
1196 if (gnet_stats_finish_copy(&d) < 0)
1197 goto nla_put_failure;
1199 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1208 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1209 u32 clid, struct Qdisc *old, struct Qdisc *new)
1211 struct sk_buff *skb;
1212 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1214 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1218 if (old && old->handle) {
1219 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1223 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1228 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1235 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1237 return (q->flags & TCQ_F_BUILTIN) ? true : false;
1240 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1241 struct netlink_callback *cb,
1242 int *q_idx_p, int s_q_idx)
1244 int ret = 0, q_idx = *q_idx_p;
1251 if (q_idx < s_q_idx) {
1254 if (!tc_qdisc_dump_ignore(q) &&
1255 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1256 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1260 list_for_each_entry(q, &root->list, list) {
1261 if (q_idx < s_q_idx) {
1265 if (!tc_qdisc_dump_ignore(q) &&
1266 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1267 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1280 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1282 struct net *net = sock_net(skb->sk);
1285 struct net_device *dev;
1287 if (net != &init_net)
1290 s_idx = cb->args[0];
1291 s_q_idx = q_idx = cb->args[1];
1292 read_lock(&dev_base_lock);
1294 for_each_netdev(&init_net, dev) {
1295 struct netdev_queue *dev_queue;
1303 dev_queue = netdev_get_tx_queue(dev, 0);
1304 if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1307 dev_queue = &dev->rx_queue;
1308 if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1316 read_unlock(&dev_base_lock);
1319 cb->args[1] = q_idx;
1326 /************************************************
1327 * Traffic classes manipulation. *
1328 ************************************************/
1332 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1334 struct net *net = sock_net(skb->sk);
1335 struct netdev_queue *dev_queue;
1336 struct tcmsg *tcm = NLMSG_DATA(n);
1337 struct nlattr *tca[TCA_MAX + 1];
1338 struct net_device *dev;
1339 struct Qdisc *q = NULL;
1340 const struct Qdisc_class_ops *cops;
1341 unsigned long cl = 0;
1342 unsigned long new_cl;
1343 u32 pid = tcm->tcm_parent;
1344 u32 clid = tcm->tcm_handle;
1345 u32 qid = TC_H_MAJ(clid);
1348 if (net != &init_net)
1351 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1354 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1359 parent == TC_H_UNSPEC - unspecified parent.
1360 parent == TC_H_ROOT - class is root, which has no parent.
1361 parent == X:0 - parent is root class.
1362 parent == X:Y - parent is a node in hierarchy.
1363 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
1365 handle == 0:0 - generate handle from kernel pool.
1366 handle == 0:Y - class is X:Y, where X:0 is qdisc.
1367 handle == X:Y - the class is exactly X:Y (self-explanatory).
1368 handle == X:0 - root class.
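   A worked example (added illustration; the values are arbitrary):
   TC_H_MAKE(0x10000, 0x1) builds classid 1:1 == 0x00010001;
   TC_H_MAJ() of that is 0x00010000 and TC_H_MIN() is 0x1.  So a request
   carrying parent == 0:1 (major left unspecified) is completed below as
   pid = TC_H_MAKE(qid, pid) once qid, the owning qdisc's X:0 handle,
   has been determined.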
1371 /* Step 1. Determine qdisc handle X:0 */
1373 dev_queue = netdev_get_tx_queue(dev, 0);
1374 if (pid != TC_H_ROOT) {
1375 u32 qid1 = TC_H_MAJ(pid);
1378 /* If both majors are known, they must be identical. */
1383 } else if (qid == 0)
1384 qid = dev_queue->qdisc_sleeping->handle;
1386 /* Now qid is a genuine qdisc handle, consistent
1387 with both parent and child.
1389 TC_H_MAJ(pid) may still be unspecified; complete it now.
1392 pid = TC_H_MAKE(qid, pid);
1395 qid = dev_queue->qdisc_sleeping->handle;
1398 /* OK. Locate qdisc */
1399 if ((q = qdisc_lookup(dev, qid)) == NULL)
1402 /* And check that it supports classes */
1403 cops = q->ops->cl_ops;
1407 /* Now try to get class */
1409 if (pid == TC_H_ROOT)
1412 clid = TC_H_MAKE(qid, clid);
1415 cl = cops->get(q, clid);
1419 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1422 switch (n->nlmsg_type) {
1425 if (n->nlmsg_flags&NLM_F_EXCL)
1429 err = cops->delete(q, cl);
1431 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1434 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1443 err = cops->change(q, clid, pid, tca, &new_cl);
1445 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1455 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1457 u32 pid, u32 seq, u16 flags, int event)
1460 struct nlmsghdr *nlh;
1461 unsigned char *b = skb_tail_pointer(skb);
1463 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1465 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1466 tcm = NLMSG_DATA(nlh);
1467 tcm->tcm_family = AF_UNSPEC;
1468 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1469 tcm->tcm_parent = q->handle;
1470 tcm->tcm_handle = q->handle;
1472 NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1473 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1474 goto nla_put_failure;
1476 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1477 qdisc_root_sleeping_lock(q), &d) < 0)
1478 goto nla_put_failure;
1480 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1481 goto nla_put_failure;
1483 if (gnet_stats_finish_copy(&d) < 0)
1484 goto nla_put_failure;
1486 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1495 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1496 struct Qdisc *q, unsigned long cl, int event)
1498 struct sk_buff *skb;
1499 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1501 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1505 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1510 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1513 struct qdisc_dump_args
1515 struct qdisc_walker w;
1516 struct sk_buff *skb;
1517 struct netlink_callback *cb;
1520 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1522 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1524 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1525 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1528 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1529 struct tcmsg *tcm, struct netlink_callback *cb,
1532 struct qdisc_dump_args arg;
1534 if (tc_qdisc_dump_ignore(q) ||
1535 *t_p < s_t || !q->ops->cl_ops ||
1537 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1542 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1543 arg.w.fn = qdisc_class_dump;
1547 arg.w.skip = cb->args[1];
1549 q->ops->cl_ops->walk(q, &arg.w);
1550 cb->args[1] = arg.w.count;
1557 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1558 struct tcmsg *tcm, struct netlink_callback *cb,
1566 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1569 list_for_each_entry(q, &root->list, list) {
1570 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1577 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1579 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1580 struct net *net = sock_net(skb->sk);
1581 struct netdev_queue *dev_queue;
1582 struct net_device *dev;
1585 if (net != &init_net)
1588 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1590 if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1596 dev_queue = netdev_get_tx_queue(dev, 0);
1597 if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1600 dev_queue = &dev->rx_queue;
1601 if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1611 /* Main classifier routine: scans the classifier chain attached
1612 to this qdisc, (optionally) tests for the protocol and asks
1613 the specific classifiers. An illustrative usage sketch follows tc_classify() below.
1615 int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1616 struct tcf_result *res)
1618 __be16 protocol = skb->protocol;
1621 for (; tp; tp = tp->next) {
1622 if ((tp->protocol == protocol ||
1623 tp->protocol == htons(ETH_P_ALL)) &&
1624 (err = tp->classify(skb, tp, res)) >= 0) {
1625 #ifdef CONFIG_NET_CLS_ACT
1626 if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1627 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1634 EXPORT_SYMBOL(tc_classify_compat);
1636 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1637 struct tcf_result *res)
1641 #ifdef CONFIG_NET_CLS_ACT
1642 struct tcf_proto *otp = tp;
1645 protocol = skb->protocol;
1647 err = tc_classify_compat(skb, tp, res);
1648 #ifdef CONFIG_NET_CLS_ACT
1649 if (err == TC_ACT_RECLASSIFY) {
1650 u32 verd = G_TC_VERD(skb->tc_verd);
1653 if (verd++ >= MAX_REC_LOOP) {
1654 printk("rule prio %u protocol %02x reclassify loop, "
1656 tp->prio&0xffff, ntohs(tp->protocol));
1659 skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
1665 EXPORT_SYMBOL(tc_classify);
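/*
 * An illustrative sketch (not part of this file) of how a classful qdisc's
 * enqueue path typically drives tc_classify().  The "example_" struct and
 * the convention that 0 means "no usable verdict" are assumptions of this
 * sketch; compare the real logic in sch_prio.c or sch_htb.c.
 */
struct example_cls_qdisc {
	struct tcf_proto *filter_list;		/* head of the filter chain */
};

static u32 example_cls_classify(struct sk_buff *skb, struct Qdisc *sch)
{
	struct example_cls_qdisc *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	result = tc_classify(skb, q->filter_list, &res);
#ifdef CONFIG_NET_CLS_ACT
	switch (result) {
	case TC_ACT_QUEUED:
	case TC_ACT_STOLEN:
	case TC_ACT_SHOT:
		/* An attached action consumed or killed the packet.  A real
		 * qdisc would also tell its caller not to touch the skb again
		 * (cf. __NET_XMIT_STOLEN); here we simply report no verdict. */
		return 0;
	}
#endif
	if (result < 0)
		return 0;	/* no filter matched: caller picks a default class */
	return res.classid;	/* caller maps this onto one of its classes */
}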
1667 void tcf_destroy(struct tcf_proto *tp)
1669 tp->ops->destroy(tp);
1670 module_put(tp->ops->owner);
1674 void tcf_destroy_chain(struct tcf_proto **fl)
1676 struct tcf_proto *tp;
1678 while ((tp = *fl) != NULL) {
1683 EXPORT_SYMBOL(tcf_destroy_chain);
1685 #ifdef CONFIG_PROC_FS
1686 static int psched_show(struct seq_file *seq, void *v)
1690 hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1691 seq_printf(seq, "%08x %08x %08x %08x\n",
1692 (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
1694 (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1699 static int psched_open(struct inode *inode, struct file *file)
1701 return single_open(file, psched_show, PDE(inode)->data);
1704 static const struct file_operations psched_fops = {
1705 .owner = THIS_MODULE,
1706 .open = psched_open,
1708 .llseek = seq_lseek,
1709 .release = single_release,
1713 static int __init pktsched_init(void)
1715 register_qdisc(&pfifo_qdisc_ops);
1716 register_qdisc(&bfifo_qdisc_ops);
1717 proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
1719 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1720 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1721 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1722 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1723 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1724 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1729 subsys_initcall(pktsched_init);