Merge branch 'devel' of master.kernel.org:/home/rmk/linux-2.6-arm
[linux-2.6] / net / sched / sch_generic.c
1 /*
2  * net/sched/sch_generic.c      Generic packet scheduler routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
11  *              - Ingress support
12  */
13
14 #include <asm/uaccess.h>
15 #include <asm/system.h>
16 #include <linux/bitops.h>
17 #include <linux/module.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/netdevice.h>
29 #include <linux/skbuff.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/init.h>
32 #include <linux/rcupdate.h>
33 #include <linux/list.h>
34 #include <net/sock.h>
35 #include <net/pkt_sched.h>
36
37 /* Main transmission queue. */
38
39 /* Main qdisc structure lock. 
40
41    However, modifications
42    to data, participating in scheduling must be additionally
43    protected with dev->queue_lock spinlock.
44
45    The idea is the following:
46    - enqueue, dequeue are serialized via top level device
47      spinlock dev->queue_lock.
48    - tree walking is protected by read_lock_bh(qdisc_tree_lock)
49      and this lock is used only in process context.
50    - updates to tree are made under rtnl semaphore or
51      from softirq context (__qdisc_destroy rcu-callback)
52      hence this lock needs local bh disabling.
53
54    qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
55  */
56 DEFINE_RWLOCK(qdisc_tree_lock);
57
58 void qdisc_lock_tree(struct net_device *dev)
59 {
60         write_lock_bh(&qdisc_tree_lock);
61         spin_lock_bh(&dev->queue_lock);
62 }
63
64 void qdisc_unlock_tree(struct net_device *dev)
65 {
66         spin_unlock_bh(&dev->queue_lock);
67         write_unlock_bh(&qdisc_tree_lock);
68 }
69
70 /* 
71    dev->queue_lock serializes queue accesses for this device
72    AND dev->qdisc pointer itself.
73
74    netif_tx_lock serializes accesses to device driver.
75
76    dev->queue_lock and netif_tx_lock are mutually exclusive,
77    if one is grabbed, another must be free.
78  */
79
80
81 /* Kick device.
82    Note, that this procedure can be called by a watchdog timer, so that
83    we do not check dev->tbusy flag here.
84
85    Returns:  0  - queue is empty.
86             >0  - queue is not empty, but throttled.
87             <0  - queue is not empty. Device is throttled, if dev->tbusy != 0.
88
89    NOTE: Called under dev->queue_lock with locally disabled BH.
90 */
91
92 static inline int qdisc_restart(struct net_device *dev)
93 {
94         struct Qdisc *q = dev->qdisc;
95         struct sk_buff *skb;
96
97         /* Dequeue packet */
98         if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
99                 unsigned nolock = (dev->features & NETIF_F_LLTX);
100
101                 dev->gso_skb = NULL;
102
103                 /*
104                  * When the driver has LLTX set it does its own locking
105                  * in start_xmit. No need to add additional overhead by
106                  * locking again. These checks are worth it because
107                  * even uncongested locks can be quite expensive.
108                  * The driver can do trylock like here too, in case
109                  * of lock congestion it should return -1 and the packet
110                  * will be requeued.
111                  */
112                 if (!nolock) {
113                         if (!netif_tx_trylock(dev)) {
114                         collision:
115                                 /* So, someone grabbed the driver. */
116                                 
117                                 /* It may be transient configuration error,
118                                    when hard_start_xmit() recurses. We detect
119                                    it by checking xmit owner and drop the
120                                    packet when deadloop is detected.
121                                 */
122                                 if (dev->xmit_lock_owner == smp_processor_id()) {
123                                         kfree_skb(skb);
124                                         if (net_ratelimit())
125                                                 printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
126                                         return -1;
127                                 }
128                                 __get_cpu_var(netdev_rx_stat).cpu_collision++;
129                                 goto requeue;
130                         }
131                 }
132                 
133                 {
134                         /* And release queue */
135                         spin_unlock(&dev->queue_lock);
136
137                         if (!netif_queue_stopped(dev)) {
138                                 int ret;
139
140                                 ret = dev_hard_start_xmit(skb, dev);
141                                 if (ret == NETDEV_TX_OK) { 
142                                         if (!nolock) {
143                                                 netif_tx_unlock(dev);
144                                         }
145                                         spin_lock(&dev->queue_lock);
146                                         return -1;
147                                 }
148                                 if (ret == NETDEV_TX_LOCKED && nolock) {
149                                         spin_lock(&dev->queue_lock);
150                                         goto collision; 
151                                 }
152                         }
153
154                         /* NETDEV_TX_BUSY - we need to requeue */
155                         /* Release the driver */
156                         if (!nolock) { 
157                                 netif_tx_unlock(dev);
158                         } 
159                         spin_lock(&dev->queue_lock);
160                         q = dev->qdisc;
161                 }
162
163                 /* Device kicked us out :(
164                    This is possible in three cases:
165
166                    0. driver is locked
167                    1. fastroute is enabled
168                    2. device cannot determine busy state
169                       before start of transmission (f.e. dialout)
170                    3. device is buggy (ppp)
171                  */
172
173 requeue:
174                 if (skb->next)
175                         dev->gso_skb = skb;
176                 else
177                         q->ops->requeue(skb, q);
178                 netif_schedule(dev);
179                 return 1;
180         }
181         BUG_ON((int) q->q.qlen < 0);
182         return q->q.qlen;
183 }
184
185 void __qdisc_run(struct net_device *dev)
186 {
187         if (unlikely(dev->qdisc == &noop_qdisc))
188                 goto out;
189
190         while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
191                 /* NOTHING */;
192
193 out:
194         clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
195 }
196
197 static void dev_watchdog(unsigned long arg)
198 {
199         struct net_device *dev = (struct net_device *)arg;
200
201         netif_tx_lock(dev);
202         if (dev->qdisc != &noop_qdisc) {
203                 if (netif_device_present(dev) &&
204                     netif_running(dev) &&
205                     netif_carrier_ok(dev)) {
206                         if (netif_queue_stopped(dev) &&
207                             time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
208
209                                 printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
210                                        dev->name);
211                                 dev->tx_timeout(dev);
212                         }
213                         if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
214                                 dev_hold(dev);
215                 }
216         }
217         netif_tx_unlock(dev);
218
219         dev_put(dev);
220 }
221
222 static void dev_watchdog_init(struct net_device *dev)
223 {
224         init_timer(&dev->watchdog_timer);
225         dev->watchdog_timer.data = (unsigned long)dev;
226         dev->watchdog_timer.function = dev_watchdog;
227 }
228
229 void __netdev_watchdog_up(struct net_device *dev)
230 {
231         if (dev->tx_timeout) {
232                 if (dev->watchdog_timeo <= 0)
233                         dev->watchdog_timeo = 5*HZ;
234                 if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
235                         dev_hold(dev);
236         }
237 }
238
239 static void dev_watchdog_up(struct net_device *dev)
240 {
241         netif_tx_lock_bh(dev);
242         __netdev_watchdog_up(dev);
243         netif_tx_unlock_bh(dev);
244 }
245
246 static void dev_watchdog_down(struct net_device *dev)
247 {
248         netif_tx_lock_bh(dev);
249         if (del_timer(&dev->watchdog_timer))
250                 dev_put(dev);
251         netif_tx_unlock_bh(dev);
252 }
253
254 void netif_carrier_on(struct net_device *dev)
255 {
256         if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
257                 linkwatch_fire_event(dev);
258         if (netif_running(dev))
259                 __netdev_watchdog_up(dev);
260 }
261
262 void netif_carrier_off(struct net_device *dev)
263 {
264         if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
265                 linkwatch_fire_event(dev);
266 }
267
268 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
269    under all circumstances. It is difficult to invent anything faster or
270    cheaper.
271  */
272
273 static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
274 {
275         kfree_skb(skb);
276         return NET_XMIT_CN;
277 }
278
279 static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
280 {
281         return NULL;
282 }
283
284 static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
285 {
286         if (net_ratelimit())
287                 printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
288                        skb->dev->name);
289         kfree_skb(skb);
290         return NET_XMIT_CN;
291 }
292
293 struct Qdisc_ops noop_qdisc_ops = {
294         .id             =       "noop",
295         .priv_size      =       0,
296         .enqueue        =       noop_enqueue,
297         .dequeue        =       noop_dequeue,
298         .requeue        =       noop_requeue,
299         .owner          =       THIS_MODULE,
300 };
301
302 struct Qdisc noop_qdisc = {
303         .enqueue        =       noop_enqueue,
304         .dequeue        =       noop_dequeue,
305         .flags          =       TCQ_F_BUILTIN,
306         .ops            =       &noop_qdisc_ops,        
307         .list           =       LIST_HEAD_INIT(noop_qdisc.list),
308 };
309
310 static struct Qdisc_ops noqueue_qdisc_ops = {
311         .id             =       "noqueue",
312         .priv_size      =       0,
313         .enqueue        =       noop_enqueue,
314         .dequeue        =       noop_dequeue,
315         .requeue        =       noop_requeue,
316         .owner          =       THIS_MODULE,
317 };
318
319 static struct Qdisc noqueue_qdisc = {
320         .enqueue        =       NULL,
321         .dequeue        =       noop_dequeue,
322         .flags          =       TCQ_F_BUILTIN,
323         .ops            =       &noqueue_qdisc_ops,
324         .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
325 };
326
327
328 static const u8 prio2band[TC_PRIO_MAX+1] =
329         { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
330
331 /* 3-band FIFO queue: old style, but should be a bit faster than
332    generic prio+fifo combination.
333  */
334
335 #define PFIFO_FAST_BANDS 3
336
337 static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
338                                              struct Qdisc *qdisc)
339 {
340         struct sk_buff_head *list = qdisc_priv(qdisc);
341         return list + prio2band[skb->priority & TC_PRIO_MAX];
342 }
343
344 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
345 {
346         struct sk_buff_head *list = prio2list(skb, qdisc);
347
348         if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
349                 qdisc->q.qlen++;
350                 return __qdisc_enqueue_tail(skb, qdisc, list);
351         }
352
353         return qdisc_drop(skb, qdisc);
354 }
355
356 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
357 {
358         int prio;
359         struct sk_buff_head *list = qdisc_priv(qdisc);
360
361         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
362                 if (!skb_queue_empty(list + prio)) {
363                         qdisc->q.qlen--;
364                         return __qdisc_dequeue_head(qdisc, list + prio);
365                 }
366         }
367
368         return NULL;
369 }
370
371 static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
372 {
373         qdisc->q.qlen++;
374         return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
375 }
376
377 static void pfifo_fast_reset(struct Qdisc* qdisc)
378 {
379         int prio;
380         struct sk_buff_head *list = qdisc_priv(qdisc);
381
382         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
383                 __qdisc_reset_queue(qdisc, list + prio);
384
385         qdisc->qstats.backlog = 0;
386         qdisc->q.qlen = 0;
387 }
388
389 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
390 {
391         struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
392
393         memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
394         RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
395         return skb->len;
396
397 rtattr_failure:
398         return -1;
399 }
400
401 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
402 {
403         int prio;
404         struct sk_buff_head *list = qdisc_priv(qdisc);
405
406         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
407                 skb_queue_head_init(list + prio);
408
409         return 0;
410 }
411
412 static struct Qdisc_ops pfifo_fast_ops = {
413         .id             =       "pfifo_fast",
414         .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
415         .enqueue        =       pfifo_fast_enqueue,
416         .dequeue        =       pfifo_fast_dequeue,
417         .requeue        =       pfifo_fast_requeue,
418         .init           =       pfifo_fast_init,
419         .reset          =       pfifo_fast_reset,
420         .dump           =       pfifo_fast_dump,
421         .owner          =       THIS_MODULE,
422 };
423
424 struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
425 {
426         void *p;
427         struct Qdisc *sch;
428         unsigned int size;
429         int err = -ENOBUFS;
430
431         /* ensure that the Qdisc and the private data are 32-byte aligned */
432         size = QDISC_ALIGN(sizeof(*sch));
433         size += ops->priv_size + (QDISC_ALIGNTO - 1);
434
435         p = kmalloc(size, GFP_KERNEL);
436         if (!p)
437                 goto errout;
438         memset(p, 0, size);
439         sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
440         sch->padded = (char *) sch - (char *) p;
441
442         INIT_LIST_HEAD(&sch->list);
443         skb_queue_head_init(&sch->q);
444         sch->ops = ops;
445         sch->enqueue = ops->enqueue;
446         sch->dequeue = ops->dequeue;
447         sch->dev = dev;
448         dev_hold(dev);
449         sch->stats_lock = &dev->queue_lock;
450         atomic_set(&sch->refcnt, 1);
451
452         return sch;
453 errout:
454         return ERR_PTR(-err);
455 }
456
457 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
458 {
459         struct Qdisc *sch;
460         
461         sch = qdisc_alloc(dev, ops);
462         if (IS_ERR(sch))
463                 goto errout;
464
465         if (!ops->init || ops->init(sch, NULL) == 0)
466                 return sch;
467
468         qdisc_destroy(sch);
469 errout:
470         return NULL;
471 }
472
473 /* Under dev->queue_lock and BH! */
474
475 void qdisc_reset(struct Qdisc *qdisc)
476 {
477         struct Qdisc_ops *ops = qdisc->ops;
478
479         if (ops->reset)
480                 ops->reset(qdisc);
481 }
482
483 /* this is the rcu callback function to clean up a qdisc when there 
484  * are no further references to it */
485
486 static void __qdisc_destroy(struct rcu_head *head)
487 {
488         struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
489         struct Qdisc_ops  *ops = qdisc->ops;
490
491 #ifdef CONFIG_NET_ESTIMATOR
492         gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
493 #endif
494         write_lock(&qdisc_tree_lock);
495         if (ops->reset)
496                 ops->reset(qdisc);
497         if (ops->destroy)
498                 ops->destroy(qdisc);
499         write_unlock(&qdisc_tree_lock);
500         module_put(ops->owner);
501
502         dev_put(qdisc->dev);
503         kfree((char *) qdisc - qdisc->padded);
504 }
505
506 /* Under dev->queue_lock and BH! */
507
508 void qdisc_destroy(struct Qdisc *qdisc)
509 {
510         struct list_head cql = LIST_HEAD_INIT(cql);
511         struct Qdisc *cq, *q, *n;
512
513         if (qdisc->flags & TCQ_F_BUILTIN ||
514                 !atomic_dec_and_test(&qdisc->refcnt))
515                 return;
516
517         if (!list_empty(&qdisc->list)) {
518                 if (qdisc->ops->cl_ops == NULL)
519                         list_del(&qdisc->list);
520                 else
521                         list_move(&qdisc->list, &cql);
522         }
523
524         /* unlink inner qdiscs from dev->qdisc_list immediately */
525         list_for_each_entry(cq, &cql, list)
526                 list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
527                         if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
528                                 if (q->ops->cl_ops == NULL)
529                                         list_del_init(&q->list);
530                                 else
531                                         list_move_tail(&q->list, &cql);
532                         }
533         list_for_each_entry_safe(cq, n, &cql, list)
534                 list_del_init(&cq->list);
535
536         call_rcu(&qdisc->q_rcu, __qdisc_destroy);
537 }
538
539 void dev_activate(struct net_device *dev)
540 {
541         /* No queueing discipline is attached to device;
542            create default one i.e. pfifo_fast for devices,
543            which need queueing and noqueue_qdisc for
544            virtual interfaces
545          */
546
547         if (dev->qdisc_sleeping == &noop_qdisc) {
548                 struct Qdisc *qdisc;
549                 if (dev->tx_queue_len) {
550                         qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
551                         if (qdisc == NULL) {
552                                 printk(KERN_INFO "%s: activation failed\n", dev->name);
553                                 return;
554                         }
555                         write_lock_bh(&qdisc_tree_lock);
556                         list_add_tail(&qdisc->list, &dev->qdisc_list);
557                         write_unlock_bh(&qdisc_tree_lock);
558                 } else {
559                         qdisc =  &noqueue_qdisc;
560                 }
561                 write_lock_bh(&qdisc_tree_lock);
562                 dev->qdisc_sleeping = qdisc;
563                 write_unlock_bh(&qdisc_tree_lock);
564         }
565
566         if (!netif_carrier_ok(dev))
567                 /* Delay activation until next carrier-on event */
568                 return;
569
570         spin_lock_bh(&dev->queue_lock);
571         rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
572         if (dev->qdisc != &noqueue_qdisc) {
573                 dev->trans_start = jiffies;
574                 dev_watchdog_up(dev);
575         }
576         spin_unlock_bh(&dev->queue_lock);
577 }
578
579 void dev_deactivate(struct net_device *dev)
580 {
581         struct Qdisc *qdisc;
582
583         spin_lock_bh(&dev->queue_lock);
584         qdisc = dev->qdisc;
585         dev->qdisc = &noop_qdisc;
586
587         qdisc_reset(qdisc);
588
589         spin_unlock_bh(&dev->queue_lock);
590
591         dev_watchdog_down(dev);
592
593         /* Wait for outstanding dev_queue_xmit calls. */
594         synchronize_rcu();
595
596         /* Wait for outstanding qdisc_run calls. */
597         while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
598                 yield();
599
600         if (dev->gso_skb) {
601                 kfree_skb(dev->gso_skb);
602                 dev->gso_skb = NULL;
603         }
604 }
605
606 void dev_init_scheduler(struct net_device *dev)
607 {
608         qdisc_lock_tree(dev);
609         dev->qdisc = &noop_qdisc;
610         dev->qdisc_sleeping = &noop_qdisc;
611         INIT_LIST_HEAD(&dev->qdisc_list);
612         qdisc_unlock_tree(dev);
613
614         dev_watchdog_init(dev);
615 }
616
617 void dev_shutdown(struct net_device *dev)
618 {
619         struct Qdisc *qdisc;
620
621         qdisc_lock_tree(dev);
622         qdisc = dev->qdisc_sleeping;
623         dev->qdisc = &noop_qdisc;
624         dev->qdisc_sleeping = &noop_qdisc;
625         qdisc_destroy(qdisc);
626 #if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
627         if ((qdisc = dev->qdisc_ingress) != NULL) {
628                 dev->qdisc_ingress = NULL;
629                 qdisc_destroy(qdisc);
630         }
631 #endif
632         BUG_TRAP(!timer_pending(&dev->watchdog_timer));
633         qdisc_unlock_tree(dev);
634 }
635
636 EXPORT_SYMBOL(__netdev_watchdog_up);
637 EXPORT_SYMBOL(netif_carrier_on);
638 EXPORT_SYMBOL(netif_carrier_off);
639 EXPORT_SYMBOL(noop_qdisc);
640 EXPORT_SYMBOL(noop_qdisc_ops);
641 EXPORT_SYMBOL(qdisc_create_dflt);
642 EXPORT_SYMBOL(qdisc_alloc);
643 EXPORT_SYMBOL(qdisc_destroy);
644 EXPORT_SYMBOL(qdisc_reset);
645 EXPORT_SYMBOL(qdisc_lock_tree);
646 EXPORT_SYMBOL(qdisc_unlock_tree);