 * net/sched/sch_api.c	Packet scheduler API.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>

#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);
/*

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from the outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some sanity
   checks and the parts of the work that are common to all qdiscs,
   and to provide rtnetlink notifications.

   All real intelligent work is done inside the qdisc modules.
   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but this does not mean that the queue is empty, it just means that
   the discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not a
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
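
/*
 * Purely illustrative sketch (not compiled into this file) of how a
 * discipline wires the routines described above into a Qdisc_ops.
 * The helpers used here (qdisc_enqueue_tail, qdisc_dequeue_head,
 * qdisc_peek_head, qdisc_reset_queue, qdisc_reshape_fail) are assumed
 * to be the generic ones from sch_generic.h of this kernel; a real
 * discipline lives in its own sch_*.c module and calls register_qdisc().
 */
#if 0	/* example only */
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	/* 0 (NET_XMIT_SUCCESS) on success, NET_XMIT_DROP/CN otherwise */
	if (likely(skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len))
		return qdisc_enqueue_tail(skb, sch);
	return qdisc_reshape_fail(skb, sch);
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	/* NULL only means "nothing to send right now", see above */
	return qdisc_dequeue_head(sch);
}

static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",
	.priv_size	= 0,
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.peek		= qdisc_peek_head,
	.reset		= qdisc_reset_queue,
	.owner		= THIS_MODULE,
};
#endif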
/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
	struct Qdisc_ops *q, **qp;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL) {
			qops->peek = noop_qdisc_ops.peek;

	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	write_unlock(&qdisc_mod_lock);
EXPORT_SYMBOL(register_qdisc);
int unregister_qdisc(struct Qdisc_ops *qops)
	struct Qdisc_ops *q, **qp;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)

	write_unlock(&qdisc_mod_lock);
EXPORT_SYMBOL(unregister_qdisc);
/* We know the handle. Find the qdisc among all qdiscs attached to the device
   (root qdisc, all its children, children of children etc.)
 */
static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)

static void qdisc_list_add(struct Qdisc *q)
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);

void qdisc_list_del(struct Qdisc *q)
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))

EXPORT_SYMBOL(qdisc_list_del);
struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
		struct Qdisc *txq_root = txq->qdisc_sleeping;

		q = qdisc_match_from_root(txq_root, handle);

	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	cl = cops->get(p, classid);
	leaf = cops->leaf(p, cl);
/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
	struct Qdisc_ops *q = NULL;

	read_lock(&qdisc_mod_lock);
	for (q = qdisc_base; q; q = q->next) {
		if (nla_strcmp(kind, q->id) == 0) {
			if (!try_module_get(q->owner))

	read_unlock(&qdisc_mod_lock);
static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);

	memcpy(rtab->data, nla_data(tab), 1024);
	rtab->next = qdisc_rtab_list;
	qdisc_rtab_list = rtab;
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
EXPORT_SYMBOL(qdisc_put_rtab);
static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);

	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (!tb[TCA_STAB_DATA])
		return ERR_PTR(-EINVAL);
	tab = nla_data(tb[TCA_STAB_DATA]);
	tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);

	if (!s || tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
		spin_unlock(&qdisc_stab_lock);

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);

		return ERR_PTR(-ENOMEM);

	memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

void qdisc_put_stab(struct qdisc_size_table *tab)
	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);

	spin_unlock(&qdisc_stab_lock);
EXPORT_SYMBOL(qdisc_put_stab);
static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
	nest = nla_nest_start(skb, TCA_STAB);
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];

		pkt_len = stab->data[stab->szopts.tsize - 1] *
			(slot / stab->szopts.tsize) +
			stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;

	if (unlikely(pkt_len < 1))

	qdisc_skb_cb(skb)->pkt_len = pkt_len;
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_US2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
EXPORT_SYMBOL(qdisc_watchdog_cancel);
static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);

	h = (struct hlist_head *)
		__get_free_pages(GFP_KERNEL, get_order(size));

	for (i = 0; i < n; i++)
		INIT_HLIST_HEAD(&h[i]);

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)

	free_pages((unsigned long)h, get_order(size));

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)

	nsize = clhash->hashsize * 2;

	nhash = qdisc_class_hash_alloc(nsize);

	ohash = clhash->hash;
	osize = clhash->hashsize;

	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);

	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
EXPORT_SYMBOL(qdisc_class_hash_grow);
int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)

	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
	hlist_del(&cl->hnode);
EXPORT_SYMBOL(qdisc_class_hash_remove);
/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
/* Attach toplevel qdisc to device queue. */

static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)

	/* ... and graft new one */
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
	const struct Qdisc_class_ops *cops;

	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
			WARN_ON(parentid != TC_H_ROOT);

		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
	qdisc_notify(skb, n, clid, old, new);
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 */
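/*
 * For illustration (under assumptions about the caller, not something this
 * function checks itself): a request such as "tc qdisc add dev eth0 root
 * handle 1: ..." reaches this function with parent == NULL and
 * classid == TC_H_ROOT, so the branch below attaches the new qdisc to every
 * tx queue of the device, while an explicit "parent X:Y" arrives with a
 * non-NULL parent and goes through the parent's ->graft() class operation.
 */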
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
	struct Qdisc *q = old;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {

		if (dev->flags & IFF_UP)

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
				atomic_inc(&new->refcnt);

			notify_and_destroy(skb, n, classid, old, new);

		if (dev->flags & IFF_UP)

		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

			unsigned long cl = cops->get(parent, classid);

				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);

			notify_and_destroy(skb, n, classid, old, new);

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;
/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */
static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			request_module("sch_%s", name);

			ops = qdisc_lookup_ops(kind);
			/* We will try qdisc_lookup_ops again,
			 * so don't keep a reference.
			 */
				module_put(ops->owner);

	sch = qdisc_alloc(dev_queue, ops);

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);

		handle = qdisc_alloc_handle(dev);

		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		stab = qdisc_get_stab(tca[TCA_STAB]);

			spinlock_t *root_lock;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS))
				root_lock = qdisc_root_sleeping_lock(sch);

				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */

	qdisc_put_stab(sch->stab);
	kfree((char *) sch - sch->padded);
	module_put(ops->owner);
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
	struct qdisc_size_table *stab = NULL;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)

		err = sch->ops->change(sch, tca[TCA_OPTIONS]);

	stab = qdisc_get_stab(tca[TCA_STAB]);
		return PTR_ERR(stab);

	qdisc_put_stab(sch->stab);

		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
struct check_loop_arg
	struct qdisc_walker	w;

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;

check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
		if (leaf == arg->p || arg->depth > 7)
		return check_loop(leaf, arg->p, arg->depth + 1);
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;

	if (net != &init_net)

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);

	if (clid != TC_H_ROOT) {
		if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
			if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
			q = qdisc_leaf(p, clid);
		} else { /* ingress */
			q = dev->rx_queue.qdisc_sleeping;

		struct netdev_queue *dev_queue;
		dev_queue = netdev_get_tx_queue(dev, 0);
		q = dev_queue->qdisc_sleeping;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)

		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))

	if (n->nlmsg_type == RTM_DELQDISC) {
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)

		qdisc_notify(skb, n, clid, NULL, q);
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
	struct net *net = sock_net(skb->sk);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q, *p;

	if (net != &init_net)

	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);

	if (clid != TC_H_ROOT) {
		if (clid != TC_H_INGRESS) {
			if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
			q = qdisc_leaf(p, clid);
		} else { /* ingress */
			q = dev->rx_queue.qdisc_sleeping;

		struct netdev_queue *dev_queue;
		dev_queue = netdev_get_tx_queue(dev, 0);
		q = dev_queue->qdisc_sleeping;

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
				if (TC_H_MIN(tcm->tcm_handle))
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
				    (p && check_loop(q, p, 0)))
				atomic_inc(&q->refcnt);

					goto create_n_graft;
			/* This magic test requires explanation.
			 *
			 *   We know that some child q is already
			 *   attached to this parent and has the choice:
			 *   either to change it or to create/graft a new one.
			 *
			 *   1. We are allowed to create/graft only
			 *   if both CREATE and REPLACE flags are set.
			 *
			 *   2. If EXCL is set, the requestor wanted to say
			 *   that the qdisc tcm_handle is not expected
			 *   to exist, so we choose create/graft too.
			 *
			 *   3. The last case is when no flags are set.
			 *   Alas, it is sort of a hole in the API; we
			 *   cannot decide what to do unambiguously.
			 *   For now we select create/graft if the
			 *   user gave a KIND which does not match the existing one.
			 */
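			/* For illustration (assuming the usual iproute2
			 * behaviour): "tc qdisc add" sends
			 * NLM_F_CREATE|NLM_F_EXCL, "tc qdisc change" sends
			 * neither, and "tc qdisc replace" sends
			 * NLM_F_CREATE|NLM_F_REPLACE, which is the
			 * combination the test below is about.
			 */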
			if ((n->nlmsg_flags&NLM_F_CREATE) &&
			    (n->nlmsg_flags&NLM_F_REPLACE) &&
			    ((n->nlmsg_flags&NLM_F_EXCL) ||
			      nla_strcmp(tca[TCA_KIND], q->ops->id))))
				goto create_n_graft;
		if (!tcm->tcm_handle)

		q = qdisc_lookup(dev, tcm->tcm_handle);

	/* Change qdisc parameters */
	if (n->nlmsg_flags&NLM_F_EXCL)
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
	err = qdisc_change(q, tca);
		qdisc_notify(skb, n, clid, NULL, q);

	if (!(n->nlmsg_flags&NLM_F_CREATE))
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue,
				 tcm->tcm_parent, tcm->tcm_parent,

		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
				 tcm->tcm_parent, tcm->tcm_handle,

	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;

	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)

		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)

		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
	int ret = 0, q_idx = *q_idx_p;

	if (q_idx < s_q_idx) {

		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)

	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {

		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
	struct net *net = sock_net(skb->sk);
	struct net_device *dev;

	if (net != &init_net)

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);

	for_each_netdev(&init_net, dev) {
		struct netdev_queue *dev_queue;

		dev_queue = netdev_get_tx_queue(dev, 0);
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)

	read_unlock(&dev_base_lock);

	cb->args[1] = q_idx;
/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);

	if (net != &init_net)

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */
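	/* Worked example (values assumed purely for illustration): for
	 * "tc class add dev eth0 parent 1: classid 1:10 ..." the request
	 * carries tcm_parent == 0x00010000 and tcm_handle == 0x00010010,
	 * so TC_H_MAJ(clid) == 0x00010000, TC_H_MIN(clid) == 0x10, and the
	 * local qid above starts out as 0x00010000.
	 */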
	/* Step 1. Determine qdisc handle X:0 */

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

			/* If both majors are known, they must be identical. */

		} else if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
			pid = TC_H_MAKE(qid, pid);

			qid = dev_queue->qdisc_sleeping->handle;

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)

	/* And check that it supports classes */
	cops = q->ops->cl_ops;

	/* Now try to get class */
	if (pid == TC_H_ROOT)

		clid = TC_H_MAKE(qid, clid);

		cl = cops->get(q, clid);

		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))

	switch (n->nlmsg_type) {
		if (n->nlmsg_flags&NLM_F_EXCL)

		err = cops->delete(q, cl);
			tclass_notify(skb, n, q, cl, RTM_DELTCLASS);

		err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);

	err = cops->change(q, clid, pid, tca, &new_cl);
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  u32 pid, u32 seq, u16 flags, int event)
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);

	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;

	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {

	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

struct qdisc_dump_args
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {

		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;

	arg.w.skip = cb->args[1];

	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;

	if (net != &init_net)

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))

	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol and asks
   specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
	__be16 protocol = skb->protocol;

	for (; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);

EXPORT_SYMBOL(tc_classify_compat);
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;

	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);

		if (verd++ >= MAX_REC_LOOP) {
			printk("rule prio %u protocol %02x reclassify loop, "
			       tp->prio&0xffff, ntohs(tp->protocol));

		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);

EXPORT_SYMBOL(tc_classify);
void tcf_destroy(struct tcf_proto *tp)
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);

void tcf_destroy_chain(struct tcf_proto **fl)
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {

EXPORT_SYMBOL(tcf_destroy_chain);
#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

static int psched_open(struct inode *inode, struct file *file)
	return single_open(file, psched_show, PDE(inode)->data);

static const struct file_operations psched_fops = {
	.owner		= THIS_MODULE,
	.open		= psched_open,
	.llseek		= seq_lseek,
	.release	= single_release,
static int __init pktsched_init(void)
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

subsys_initcall(pktsched_init);