/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/bitops.h>

#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);
/*

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box that can
   enqueue packets and dequeue them (when the device is ready to send
   something) in an order and at times determined by the algorithm
   hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes"
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a form
   more intelligible to the kernel, to perform some sanity checks and
   the part of the work that is common to all qdiscs, and to provide
   rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.


   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a packet that was just dequeued. It is used for non-standard
   or simply buggy devices, which can defer output even if dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
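/*
 * Editorial sketch, not part of the original file: a minimal
 * "queue"-category qdisc following the contract described above. The
 * name "example_fifo" and the 128-packet limit are invented for
 * illustration; the real FIFO qdiscs live in sch_fifo.c.
 */
static int example_fifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (skb_queue_len(&sch->q) < 128) {
		__skb_queue_tail(&sch->q, skb);	/* also bumps q->q.qlen */
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
		return 0;
	}
	sch->qstats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;	/* sender should not back off */
}

static struct sk_buff *example_fifo_dequeue(struct Qdisc *sch)
{
	/* NULL here does not have to mean "empty"; q->q.qlen == 0 does */
	return __skb_dequeue(&sch->q);
}

static struct Qdisc_ops example_fifo_qdisc_ops = {
	.id		= "example_fifo",
	.enqueue	= example_fifo_enqueue,
	.dequeue	= example_fifo_dequeue,
	.owner		= THIS_MODULE,
};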
/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;
/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}
int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)

	write_unlock(&qdisc_mod_lock);
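/*
 * Usage sketch (editorial addition): how a qdisc module would wire the
 * two calls above into its init/exit paths. example_fifo_qdisc_ops is
 * the illustrative ops table defined earlier, not part of this file.
 */
static int __init example_fifo_module_init(void)
{
	return register_qdisc(&example_fifo_qdisc_ops);
}

static void __exit example_fifo_module_exit(void)
{
	unregister_qdisc(&example_fifo_qdisc_ops);
}

module_init(example_fifo_module_init);
module_exit(example_fifo_module_exit);
MODULE_LICENSE("GPL");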
/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	read_lock(&qdisc_tree_lock);
	q = __qdisc_lookup(dev, handle);
	read_unlock(&qdisc_tree_lock);
	return q;
}
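/*
 * Illustrative sketch (editorial addition): handles are 32-bit
 * major:minor pairs, so tc(8)'s "1:0" is the u32 0x00010000. A
 * hypothetical caller looking up that root qdisc:
 */
static struct Qdisc *example_lookup_one_colon_zero(struct net_device *dev)
{
	u32 handle = TC_H_MAKE(0x10000U, 0);	/* "1:0" */

	return qdisc_lookup(dev, handle);
}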
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	cl = cops->get(p, classid);

	leaf = cops->leaf(p, cl);

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	read_lock(&qdisc_mod_lock);
	for (q = qdisc_base; q; q = q->next) {
		if (rtattr_strcmp(kind, q->id) == 0) {
			if (!try_module_get(q->owner))

	read_unlock(&qdisc_mod_lock);
static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);

	memcpy(rtab->data, RTA_DATA(tab), 1024);
	rtab->next = qdisc_rtab_list;
	qdisc_rtab_list = rtab;

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
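/*
 * Illustrative sketch (editorial addition): how a qdisc's configuration
 * path might obtain a shared rate table from two netlink attributes.
 * The helper name and the explicit attribute parameters are
 * hypothetical; real users such as sch_tbf.c define their own layout.
 */
static struct qdisc_rate_table *example_fetch_rtab(struct rtattr *rate_attr,
						   struct rtattr *rtab_attr)
{
	struct tc_ratespec *r;

	if (rate_attr == NULL || RTA_PAYLOAD(rate_attr) < sizeof(*r))
		return NULL;
	r = RTA_DATA(rate_attr);
	/* qdisc_get_rtab() insists on a 1024-byte table attribute */
	return qdisc_get_rtab(r, rtab_attr);
}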
/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
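/*
 * Worked example (editorial addition): automatically allocated handles
 * advance one major per call: 8001:0, 8002:0, and so on. At ffff:0,
 * which would equal TC_H_MAKE(TC_H_ROOT, 0), the counter wraps back
 * to 8000:0 before the handle is handed out.
 */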
/* Attach toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)

	qdisc_lock_tree(dev);
	if (qdisc && qdisc->flags & TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {

			dev->qdisc_ingress = NULL;

			dev->qdisc_ingress = qdisc;

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)

		/* ... and graft new one */

		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;

	qdisc_unlock_tree(dev);

	if (dev->flags & IFF_UP)
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	struct Qdisc_class_ops *cops;

	while ((parentid = sch->parent)) {
		sch = __qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);

EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
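/*
 * Usage sketch (editorial addition): a classful qdisc that drops
 * packets outside its enqueue/dequeue paths fixes its own counters and
 * then lets the call above walk the ancestors. The helper name and
 * its "dropped" parameter are hypothetical.
 */
static void example_drop_backlog(struct Qdisc *sch, unsigned int dropped)
{
	sch->qstats.drops += dropped;
	sch->q.qlen -= dropped;	/* the caller fixes its own qlen */
	qdisc_tree_decrease_qlen(sch, dropped);
}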
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to the device "dev".

   The old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       u32 classid,
		       struct Qdisc *new, struct Qdisc **old)
{
	struct Qdisc *q = *old;

	if (parent == NULL) {
		if (q && q->flags & TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);

			*old = dev_graft_qdisc(dev, new);
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		unsigned long cl = cops->get(parent, classid);

		err = cops->graft(parent, cl, new, old);

			new->parent = classid;
		cops->put(parent, cl);
/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	struct rtattr *kind = tca[TCA_KIND-1];

	struct Qdisc_ops *ops;

	ops = qdisc_lookup_ops(kind);

	if (ops == NULL && kind != NULL) {

		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */

			request_module("sch_%s", name);

			ops = qdisc_lookup_ops(kind);

			/* We will try qdisc_lookup_ops again,
			 * so don't keep a reference.
			 */
				module_put(ops->owner);

	sch = qdisc_alloc(dev, ops);

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else if (handle == 0) {
		handle = qdisc_alloc_handle(dev);

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,

			/*
			 * Any broken qdiscs that would require
			 * an ops->reset() here? The qdisc was never
			 * in action so it shouldn't be necessary.
			 */

	qdisc_lock_tree(dev);
	list_add_tail(&sch->list, &dev->qdisc_list);
	qdisc_unlock_tree(dev);

	kfree((char *) sch - sch->padded);

	module_put(ops->owner);
static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {

		if (sch->ops->change == NULL)

		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);

#ifdef CONFIG_NET_ESTIMATOR

		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      sch->stats_lock, tca[TCA_RATE-1]);
struct check_loop_arg
{
	struct qdisc_walker	w;

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;

	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);

	if (leaf == arg->p || arg->depth > 7)

	return check_loop(leaf, arg->p, arg->depth + 1);
}
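/*
 * Usage sketch (editorial addition): before grafting qdisc q under
 * parent p, the caller must reject configurations where p already
 * sits somewhere beneath q, otherwise the tree would gain a cycle.
 * The helper name is hypothetical; tc_modify_qdisc() below performs
 * the equivalent test inline.
 */
static int example_check_graft(struct Qdisc *q, struct Qdisc *p)
{
	if (q == p || check_loop(q, p, 0))
		return -ELOOP;	/* grafting would create a cycle */
	return 0;
}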
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)

	if (clid != TC_H_ROOT) {
		if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
			if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)

			q = qdisc_leaf(p, clid);
		} else { /* ingress */
			q = dev->qdisc_ingress;

		q = dev->qdisc_sleeping;

	if (tcm->tcm_handle && q->handle != tcm->tcm_handle)

		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))

	if (n->nlmsg_type == RTM_DELQDISC) {

		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)

		qdisc_notify(skb, n, clid, q, NULL);
		spin_lock_bh(&dev->queue_lock);

		spin_unlock_bh(&dev->queue_lock);

		qdisc_notify(skb, n, clid, NULL, q);
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{

	struct net_device *dev;

	/* Reinit, just in case something touches this. */

	clid = tcm->tcm_parent;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)

	if (clid != TC_H_ROOT) {
		if (clid != TC_H_INGRESS) {
			if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)

			q = qdisc_leaf(p, clid);
		} else { /* ingress */
			q = dev->qdisc_ingress;

		q = dev->qdisc_sleeping;

	/* It may be the default qdisc; ignore it. */
	if (q && q->handle == 0)

	if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
		if (tcm->tcm_handle) {
			if (q && !(n->nlmsg_flags & NLM_F_REPLACE))

			if (TC_H_MIN(tcm->tcm_handle))

			if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)

			if (n->nlmsg_flags & NLM_F_EXCL)

			if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))

			    (p && check_loop(q, p, 0)))

			atomic_inc(&q->refcnt);
			/* This magic test requires explanation.
			 *
			 *   We know that some child q is already
			 *   attached to this parent and have a choice:
			 *   either to change it or to create/graft a new one.
			 *
			 *   1. We are allowed to create/graft only
			 *   if CREATE and REPLACE flags are set.
			 *
			 *   2. If EXCL is set, the requestor wanted to say
			 *   that the qdisc tcm_handle is not expected
			 *   to exist, so we choose create/graft too.
			 *
			 *   3. The last case is when no flags are set.
			 *   Alas, it is a sort of hole in the API; we
			 *   cannot decide what to do unambiguously.
			 *   For now we select create/graft if the
			 *   user gave a KIND which does not match the existing one.
			 */
			if ((n->nlmsg_flags & NLM_F_CREATE) &&
			    (n->nlmsg_flags & NLM_F_REPLACE) &&
			    ((n->nlmsg_flags & NLM_F_EXCL) ||
			     (tca[TCA_KIND-1] &&
			      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
	if (!tcm->tcm_handle)

	q = qdisc_lookup(dev, tcm->tcm_handle);

	/* Change qdisc parameters */

	if (n->nlmsg_flags & NLM_F_EXCL)

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))

	err = qdisc_change(q, tca);

		qdisc_notify(skb, n, clid, NULL, q);

	if (!(n->nlmsg_flags & NLM_F_CREATE))

	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);

		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);

		spin_lock_bh(&dev->queue_lock);

		spin_unlock_bh(&dev->queue_lock);

		qdisc_notify(skb, n, clid, old_q, q);

		spin_lock_bh(&dev->queue_lock);
		qdisc_destroy(old_q);
		spin_unlock_bh(&dev->queue_lock);
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;

	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)

	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
					 TCA_XSTATS, q->stats_lock, &d) < 0)

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)

	if (gnet_stats_finish_copy(&d) < 0)

	nlh->nlmsg_len = skb->tail - b;

	skb_trim(skb, b - skb->data);
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)

		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)

	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO);
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net_device *dev;

	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {

		read_lock(&qdisc_tree_lock);

		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {

			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock(&qdisc_tree_lock);

		read_unlock(&qdisc_tree_lock);

	read_unlock(&dev_base_lock);
/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in the hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is the qdisc.

	   handle == 0:0	 - generate a handle from the kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is the qdisc.
	   handle == X:Y	 - fully specified.
	   handle == X:0	 - root class.
	 */
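	/*
	 * Worked example (editorial addition): tcm_parent "1:2" is the
	 * u32 0x00010002 and tcm_handle "0:3" is 0x00000003. Step 1
	 * below derives qid = TC_H_MAJ(0x00010002) = 0x00010000, i.e.
	 * "1:0", and the class lookup later completes the handle to
	 * TC_H_MAKE(qid, clid) = 0x00010003, i.e. "1:3".
	 */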
	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		/* If both majors are known, they must be identical. */

			qid = dev->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */

			pid = TC_H_MAKE(qid, pid);

		qid = dev->qdisc_sleeping->handle;

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)

	/* And check that it supports classes */
	cops = q->ops->cl_ops;

	/* Now try to get class */

	if (pid == TC_H_ROOT)

		clid = TC_H_MAKE(qid, clid);

	cl = cops->get(q, clid);

		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags & NLM_F_CREATE))

	switch (n->nlmsg_type) {

		if (n->nlmsg_flags & NLM_F_EXCL)

		err = cops->delete(q, cl);

			tclass_notify(skb, n, q, cl, RTM_DELTCLASS);

		err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);

	err = cops->change(q, clid, pid, tca, &new_cl);

		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;

	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;

	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
					 TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;

	skb_trim(skb, b - skb->data);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {

	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO);
struct qdisc_dump_args
{
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net_device *dev;

	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))

	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)

	read_lock(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {

		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;

		arg.w.skip = cb->args[1];

		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;

	read_unlock(&qdisc_tree_lock);
/* The main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol, and asks
   specific classifiers.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	for ( ; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == __constant_htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (TC_ACT_RECLASSIFY == err) {
				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);

				if (MAX_REC_LOOP < verd++) {
					printk("rule prio %d protocol %02x reclassify is buggy, packet dropped\n",
					       tp->prio & 0xffff, ntohs(tp->protocol));

				skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);

			skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
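/*
 * Usage sketch (editorial addition): schedulers typically call
 * tc_classify() from their ->enqueue() to choose a class. The helper
 * name and its fallback behaviour are hypothetical.
 */
static u32 example_pick_class(struct sk_buff *skb, struct tcf_proto *chain)
{
	struct tcf_result res;

	if (tc_classify(skb, chain, &res) >= 0)
		return res.classid;	/* e.g. "1:10" as a u32 handle */
	return 0;			/* no match: caller applies its default */
}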
static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   psched_tick_per_us, psched_us_per_tick,

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static struct file_operations psched_fops = {
	.owner		= THIS_MODULE,
	.open		= psched_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#ifdef CONFIG_NET_SCH_CLK_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
EXPORT_SYMBOL(psched_clock_per_hz);
EXPORT_SYMBOL(psched_clock_scale);

psched_time_t psched_time_base;
cycles_t psched_time_mark;
EXPORT_SYMBOL(psched_time_mark);
EXPORT_SYMBOL(psched_time_base);

/*
 * Periodically adjust psched_time_base to avoid overflow
 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
 */
static void psched_tick(unsigned long);
static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);

static void psched_tick(unsigned long dummy)
{
	if (sizeof(cycles_t) == sizeof(u32)) {
		psched_time_t dummy_stamp;
		PSCHED_GET_TIME(dummy_stamp);
		psched_timer.expires = jiffies + 1*HZ;
		add_timer(&psched_timer);
int __init psched_calibrate_clock(void)
{
	psched_time_t stamp, stamp1;
	struct timeval tv, tv1;
	psched_tdiff_t delay;

	stop = jiffies + HZ/10;
	PSCHED_GET_TIME(stamp);
	do_gettimeofday(&tv);
	while (time_before(jiffies, stop)) {

	PSCHED_GET_TIME(stamp1);
	do_gettimeofday(&tv1);

	delay = PSCHED_TDIFF(stamp1, stamp);
	rdelay = tv1.tv_usec - tv.tv_usec;
	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;

	delay /= rdelay;	/* now PSCHED ticks per microsecond */
	psched_tick_per_us = delay;
	while ((delay >>= 1) != 0)
		psched_clock_scale++;
	psched_us_per_tick = 1 << psched_clock_scale;
	psched_clock_per_hz = (psched_tick_per_us * (1000000/HZ)) >> psched_clock_scale;
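/*
 * Worked example (editorial addition, hypothetical numbers): on a
 * 1 GHz clock source the loop above measures roughly 1000 ticks per
 * microsecond, so psched_tick_per_us = 1000. Halving 1000 until it
 * reaches zero takes 9 steps, so psched_clock_scale = 9 and
 * psched_us_per_tick = 512. With HZ = 1000 that gives
 * psched_clock_per_hz = (1000 * 1000) >> 9 = 1953 ticks per jiffy.
 */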
static int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#ifdef CONFIG_NET_SCH_CLK_CPU
	if (psched_calibrate_clock() < 0)

#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Set up the rtnetlink links. It is done here to avoid
	   exporting a large number of public symbols.
	 */

	link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
	link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
	link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
	link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
	link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
	link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
	link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
	link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create("psched", 0, &psched_fops);
1306 EXPORT_SYMBOL(qdisc_get_rtab);
1307 EXPORT_SYMBOL(qdisc_put_rtab);
1308 EXPORT_SYMBOL(register_qdisc);
1309 EXPORT_SYMBOL(unregister_qdisc);
1310 EXPORT_SYMBOL(tc_classify);