/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>

#include <net/net_namespace.h>
#include <net/sock.h>		/* for sock_net() */
#include <net/netlink.h>
#include <net/pkt_sched.h>
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);
/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   that is able to enqueue packets and to dequeue them (when
   the device is ready to send something), in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the part of the work that is common to all
   qdiscs, and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it only means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   the real packet queue, but q->q.qlen must still be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a packet that was dequeued once. It is used for non-standard
   or just buggy devices, which can defer output even when
   netif_queue_stopped() == 0.

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime
   of the qdisc.

   ---change

   changes qdisc parameters.
 */
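/*
 * Illustrative sketch (not part of the original file): what the
 * enqueue/dequeue contract above looks like for a trivial
 * "queue"-category qdisc. qdisc_enqueue_tail(), qdisc_dequeue_head()
 * and qdisc_drop() are the generic FIFO helpers from
 * <net/sch_generic.h>; the tx_queue_len limit check is a hypothetical
 * policy chosen just for the example.
 */
#if 0
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (likely(skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len))
		return qdisc_enqueue_tail(skb, sch);	/* returns 0 on success */

	return qdisc_drop(skb, sch);			/* returns NET_XMIT_DROP */
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	/* NULL here would not necessarily mean "empty": per the contract
	 * above, only sch->q.qlen == 0 means the queue is empty. */
	return qdisc_dequeue_head(sch);
}
#endif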
/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);
/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;
/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}
EXPORT_SYMBOL(register_qdisc);
int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);
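/*
 * Illustrative sketch (hypothetical module, not part of this file):
 * a qdisc module registers its Qdisc_ops on load and unregisters on
 * unload, just as pktsched_init() at the bottom of this file does for
 * the built-in FIFOs.
 */
#if 0
static struct Qdisc_ops example_qdisc_ops;	/* filled in elsewhere */

static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}

module_init(example_module_init);
module_exit(example_module_exit);
#endif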
/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
		struct Qdisc *q, *txq_root = txq->qdisc;

		if (!(txq_root->flags & TCQ_F_BUILTIN) &&
		    txq_root->handle == handle)
			return txq_root;

		list_for_each_entry(q, &txq_root->list, list) {
			if (q->handle == handle)
				return q;
		}
	}
	return NULL;
}
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);
	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}
/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}
static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);	/* 1024 == TC_RTAB_SIZE */
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);
void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
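/*
 * Illustrative note (hypothetical caller, not part of this file): a
 * shaping qdisc typically takes a reference on its rate table in
 * ->init()/->change() and drops it in ->destroy(), along these lines:
 *
 *	struct qdisc_rate_table *rtab;
 *
 *	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_XXX_RTAB]);
 *	if (rtab == NULL)
 *		return -EINVAL;
 *	...
 *	qdisc_put_rtab(rtab);
 *
 * where TCA_XXX_RTAB stands in for the qdisc-specific attribute that
 * carries the 1024-byte table (e.g. TCA_TBF_RTAB).
 */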
static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (!s || tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}
void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree(tab);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);
static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}
void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
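/*
 * Worked example (hypothetical numbers): with overhead = 24,
 * cell_align = -1, cell_log = 6, size_log = 0 and tsize = 512, a
 * 40-byte skb gives pkt_len = 40 + 24 = 64 and slot = (64 - 1) >> 6 = 0,
 * so the reported length becomes stab->data[0]. This is how, e.g.,
 * per-cell padding on ATM links can be accounted for without touching
 * skb->len itself.
 */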
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	smp_wmb();
	__netif_schedule(wd->qdisc);

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);
void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_US2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
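/*
 * Illustrative sketch (hypothetical qdisc, not part of this file): a
 * rate-limiting qdisc embeds a qdisc_watchdog, initializes it from
 * ->init() with qdisc_watchdog_init(&q->watchdog, sch), arms it from
 * ->dequeue() when the next packet is not yet eligible, and cancels it
 * in ->reset(). This mirrors how sch_tbf uses the API; the next_send
 * field and its computation are assumptions of the example.
 */
#if 0
struct example_sched_data {
	struct qdisc_watchdog	watchdog;
	psched_time_t		next_send;	/* computed by the rate logic */
};

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);
	psched_time_t now = psched_get_time();

	if (q->next_send > now) {
		/* Not eligible yet: arm the watchdog and report "nothing
		 * to send". sch->q.qlen is untouched, so the qdisc is
		 * throttled rather than empty. */
		qdisc_watchdog_schedule(&q->watchdog, q->next_send);
		return NULL;
	}
	return qdisc_dequeue_head(sch);
}
#endif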
static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}
static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
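/*
 * Illustrative sketch (hypothetical classful qdisc, not part of this
 * file): a class embeds a Qdisc_class_common, and the qdisc keeps all
 * of its classes in one Qdisc_class_hash, growing it as classes are
 * added. This mirrors how sch_htb uses these helpers; note that
 * qdisc_class_hash_grow() takes the qdisc tree lock itself, so it is
 * called after the insert, outside any sch_tree_lock() section.
 */
#if 0
struct example_class {
	struct Qdisc_class_common common;	/* classid + hash node */
	/* ... per-class state ... */
};

static void example_add_class(struct Qdisc *sch,
			      struct Qdisc_class_hash *clhash,
			      struct example_class *cl, u32 classid)
{
	cl->common.classid = classid;

	sch_tree_lock(sch);
	qdisc_class_hash_insert(clhash, &cl->common);
	sch_tree_unlock(sch);

	qdisc_class_hash_grow(sch, clhash);
}
#endif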
/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while	(qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
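/*
 * Example: the first call hands out 8001:0, the next 8002:0, and so
 * on; when the counter would collide with TC_H_ROOT's major (ffff:0)
 * it wraps back to 8000:0. The "i" counter bounds the scan to 0x10000
 * attempts, so a device whose whole auto-handle space is busy gets 0
 * back instead of looping forever.
 */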
/* Attach toplevel qdisc to device queue. */

static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
				     struct Qdisc *qdisc)
{
	spinlock_t *root_lock;
	struct Qdisc *oqdisc;
	int ingress;

	ingress = 0;
	if (qdisc && qdisc->flags&TCQ_F_INGRESS)
		ingress = 1;

	if (ingress) {
		oqdisc = dev_queue->qdisc;
	} else {
		oqdisc = dev_queue->qdisc_sleeping;
	}

	root_lock = qdisc_root_lock(oqdisc);
	spin_lock_bh(root_lock);

	if (ingress) {
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev_queue->qdisc = NULL;
		} else {  /* new */
			dev_queue->qdisc = qdisc;
		}

	} else {
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev_queue->qdisc_sleeping = qdisc;
		dev_queue->qdisc = &noop_qdisc;
	}

	spin_unlock_bh(root_lock);

	return oqdisc;
}
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
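/*
 * Illustrative note (the caller is hypothetical): when a child qdisc
 * drops n packets outside the normal dequeue path, for example in a
 * ->change() that shrinks its limit, it calls
 *
 *	qdisc_tree_decrease_qlen(sch, dropped);
 *
 * so that every ancestor's q.qlen stays consistent and classful
 * parents get a ->qlen_notify() chance to deactivate the now-empty
 * class.
 */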
static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(skb, n, clid, old, new);

	if (old) {
		spin_lock_bh(&old->q.lock);
		qdisc_destroy(old);
		spin_unlock_bh(&old->q.lock);
	}
}
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if (q && q->flags & TCQ_F_INGRESS) {
			num_q = 1;
			ingress = 1;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			if (ingress) {
				old = dev_graft_qdisc(dev_queue, q);
			} else {
				old = dev_graft_qdisc(dev_queue, new);
				if (new && i > 0)
					atomic_inc(&new->refcnt);
			}
			notify_and_destroy(skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			}
		}
		if (!err)
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}
/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops again,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out3;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						qdisc_root_lock(sch),
						tca[TCA_RATE]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * an ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
		if (parent)
			list_add_tail(&sch->list, &dev_queue->qdisc->list);

		return sch;
	}
err_out3:
	qdisc_put_stab(sch->stab);
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_lock(sch), tca[TCA_RATE]);

	return 0;
}
struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}
/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (net != &init_net)
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requester wanted to say
				 *   that the qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of a hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND that does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else
		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		spinlock_t *root_lock;

		err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
		if (err) {
			if (q) {
				root_lock = qdisc_root_lock(q);
				spin_lock_bh(root_lock);
				qdisc_destroy(q);
				spin_unlock_bh(root_lock);
			}
			return err;
		}
	}
	return 0;
}
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
					 TCA_XSTATS, qdisc_root_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;

done:
	ret = -1;
	goto out;
}
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	if (net != &init_net)
		return 0;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	idx = 0;
	for_each_netdev(&init_net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		dev_queue = netdev_get_tx_queue(dev, 0);
		if (tc_dump_qdisc_root(dev_queue->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;
	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in the hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is the qdisc.

	   handle == 0:0	 - generate a handle from the kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is the qdisc.
	   handle == X:Y	 - handle is fully specified.
	   handle == X:0	 - root class.
	 */
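	/*
	 * Example (assuming the usual tc(8) syntax): for
	 *
	 *	tc class add dev eth0 parent 1:1 classid 1:10 ...
	 *
	 * the request arrives with tcm_parent = 1:1 and tcm_handle = 1:10,
	 * so qid resolves to 1:0 and both handles fall under the "X:Y"
	 * cases above.
	 */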
	/* Step 1. Determine qdisc handle X:0 */

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle, consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
					 TCA_XSTATS, qdisc_root_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (net != &init_net)
		return 0;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (tc_dump_tclass_root(dev_queue->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for the protocol, and asks the
   specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err = 0;

	for (; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			printk("rule prio %u protocol %02x reclassify loop, "
			       "packet dropped\n",
			       tp->prio&0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
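/*
 * Illustrative sketch (hypothetical classful qdisc, not part of this
 * file): ->enqueue() typically runs the filter chain via tc_classify()
 * and maps the returned classid onto one of its bands, the way
 * sch_prio does. The band layout here is an assumption of the example.
 */
#if 0
static struct Qdisc *example_classify(struct sk_buff *skb,
				      struct tcf_proto *filter_list,
				      struct Qdisc **bands, int nbands)
{
	struct tcf_result res;

	if (tc_classify(skb, filter_list, &res) >= 0) {
		int band = TC_H_MIN(res.classid) - 1;

		if (band >= 0 && band < nbands)
			return bands[band];
	}
	return bands[0];	/* fall back to the first band */
}
#endif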
void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);
#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}
static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif
static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);