Merge branch 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/dtor/input
[linux-2.6] / net / sched / sch_generic.c
1 /*
2  * net/sched/sch_generic.c      Generic packet scheduler routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
11  *              - Ingress support
12  */
13
14 #include <asm/uaccess.h>
15 #include <asm/system.h>
16 #include <linux/bitops.h>
17 #include <linux/module.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/netdevice.h>
29 #include <linux/skbuff.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/init.h>
32 #include <linux/rcupdate.h>
33 #include <linux/list.h>
34 #include <net/sock.h>
35 #include <net/pkt_sched.h>
36
37 /* Main transmission queue. */
38
39 /* Main qdisc structure lock. 
40
41    However, modifications
42    to data, participating in scheduling must be additionally
43    protected with dev->queue_lock spinlock.
44
45    The idea is the following:
46    - enqueue, dequeue are serialized via top level device
47      spinlock dev->queue_lock.
48    - tree walking is protected by read_lock_bh(qdisc_tree_lock)
49      and this lock is used only in process context.
50    - updates to tree are made under rtnl semaphore or
51      from softirq context (__qdisc_destroy rcu-callback)
52      hence this lock needs local bh disabling.
53
54    qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
55  */
56 DEFINE_RWLOCK(qdisc_tree_lock);
57
58 void qdisc_lock_tree(struct net_device *dev)
59 {
60         write_lock_bh(&qdisc_tree_lock);
61         spin_lock_bh(&dev->queue_lock);
62 }
63
64 void qdisc_unlock_tree(struct net_device *dev)
65 {
66         spin_unlock_bh(&dev->queue_lock);
67         write_unlock_bh(&qdisc_tree_lock);
68 }
69
70 /* 
71    dev->queue_lock serializes queue accesses for this device
72    AND dev->qdisc pointer itself.
73
74    netif_tx_lock serializes accesses to device driver.
75
76    dev->queue_lock and netif_tx_lock are mutually exclusive,
77    if one is grabbed, another must be free.
78  */
79
80
81 /* Kick device.
82    Note, that this procedure can be called by a watchdog timer, so that
83    we do not check dev->tbusy flag here.
84
85    Returns:  0  - queue is empty.
86             >0  - queue is not empty, but throttled.
87             <0  - queue is not empty. Device is throttled, if dev->tbusy != 0.
88
89    NOTE: Called under dev->queue_lock with locally disabled BH.
90 */
91
92 static inline int qdisc_restart(struct net_device *dev)
93 {
94         struct Qdisc *q = dev->qdisc;
95         struct sk_buff *skb;
96
97         /* Dequeue packet */
98         if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
99                 unsigned nolock = (dev->features & NETIF_F_LLTX);
100
101                 dev->gso_skb = NULL;
102
103                 /*
104                  * When the driver has LLTX set it does its own locking
105                  * in start_xmit. No need to add additional overhead by
106                  * locking again. These checks are worth it because
107                  * even uncongested locks can be quite expensive.
108                  * The driver can do trylock like here too, in case
109                  * of lock congestion it should return -1 and the packet
110                  * will be requeued.
111                  */
112                 if (!nolock) {
113                         if (!netif_tx_trylock(dev)) {
114                         collision:
115                                 /* So, someone grabbed the driver. */
116                                 
117                                 /* It may be transient configuration error,
118                                    when hard_start_xmit() recurses. We detect
119                                    it by checking xmit owner and drop the
120                                    packet when deadloop is detected.
121                                 */
122                                 if (dev->xmit_lock_owner == smp_processor_id()) {
123                                         kfree_skb(skb);
124                                         if (net_ratelimit())
125                                                 printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
126                                         return -1;
127                                 }
128                                 __get_cpu_var(netdev_rx_stat).cpu_collision++;
129                                 goto requeue;
130                         }
131                 }
132                 
133                 {
134                         /* And release queue */
135                         spin_unlock(&dev->queue_lock);
136
137                         if (!netif_queue_stopped(dev)) {
138                                 int ret;
139
140                                 ret = dev_hard_start_xmit(skb, dev);
141                                 if (ret == NETDEV_TX_OK) { 
142                                         if (!nolock) {
143                                                 netif_tx_unlock(dev);
144                                         }
145                                         spin_lock(&dev->queue_lock);
146                                         return -1;
147                                 }
148                                 if (ret == NETDEV_TX_LOCKED && nolock) {
149                                         spin_lock(&dev->queue_lock);
150                                         goto collision; 
151                                 }
152                         }
153
154                         /* NETDEV_TX_BUSY - we need to requeue */
155                         /* Release the driver */
156                         if (!nolock) { 
157                                 netif_tx_unlock(dev);
158                         } 
159                         spin_lock(&dev->queue_lock);
160                         q = dev->qdisc;
161                 }
162
163                 /* Device kicked us out :(
164                    This is possible in three cases:
165
166                    0. driver is locked
167                    1. fastroute is enabled
168                    2. device cannot determine busy state
169                       before start of transmission (f.e. dialout)
170                    3. device is buggy (ppp)
171                  */
172
173 requeue:
174                 if (skb->next)
175                         dev->gso_skb = skb;
176                 else
177                         q->ops->requeue(skb, q);
178                 netif_schedule(dev);
179                 return 1;
180         }
181         BUG_ON((int) q->q.qlen < 0);
182         return q->q.qlen;
183 }
184
185 void __qdisc_run(struct net_device *dev)
186 {
187         if (unlikely(dev->qdisc == &noop_qdisc))
188                 goto out;
189
190         while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
191                 /* NOTHING */;
192
193 out:
194         clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
195 }
196
197 static void dev_watchdog(unsigned long arg)
198 {
199         struct net_device *dev = (struct net_device *)arg;
200
201         netif_tx_lock(dev);
202         if (dev->qdisc != &noop_qdisc) {
203                 if (netif_device_present(dev) &&
204                     netif_running(dev) &&
205                     netif_carrier_ok(dev)) {
206                         if (netif_queue_stopped(dev) &&
207                             time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
208
209                                 printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
210                                        dev->name);
211                                 dev->tx_timeout(dev);
212                         }
213                         if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
214                                 dev_hold(dev);
215                 }
216         }
217         netif_tx_unlock(dev);
218
219         dev_put(dev);
220 }
221
222 static void dev_watchdog_init(struct net_device *dev)
223 {
224         init_timer(&dev->watchdog_timer);
225         dev->watchdog_timer.data = (unsigned long)dev;
226         dev->watchdog_timer.function = dev_watchdog;
227 }
228
229 void __netdev_watchdog_up(struct net_device *dev)
230 {
231         if (dev->tx_timeout) {
232                 if (dev->watchdog_timeo <= 0)
233                         dev->watchdog_timeo = 5*HZ;
234                 if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
235                         dev_hold(dev);
236         }
237 }
238
239 static void dev_watchdog_up(struct net_device *dev)
240 {
241         netif_tx_lock_bh(dev);
242         __netdev_watchdog_up(dev);
243         netif_tx_unlock_bh(dev);
244 }
245
246 static void dev_watchdog_down(struct net_device *dev)
247 {
248         netif_tx_lock_bh(dev);
249         if (del_timer(&dev->watchdog_timer))
250                 dev_put(dev);
251         netif_tx_unlock_bh(dev);
252 }
253
254 void netif_carrier_on(struct net_device *dev)
255 {
256         if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
257                 linkwatch_fire_event(dev);
258         if (netif_running(dev))
259                 __netdev_watchdog_up(dev);
260 }
261
262 void netif_carrier_off(struct net_device *dev)
263 {
264         if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
265                 linkwatch_fire_event(dev);
266 }
267
268 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
269    under all circumstances. It is difficult to invent anything faster or
270    cheaper.
271  */
272
273 static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
274 {
275         kfree_skb(skb);
276         return NET_XMIT_CN;
277 }
278
279 static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
280 {
281         return NULL;
282 }
283
284 static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
285 {
286         if (net_ratelimit())
287                 printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
288                        skb->dev->name);
289         kfree_skb(skb);
290         return NET_XMIT_CN;
291 }
292
293 struct Qdisc_ops noop_qdisc_ops = {
294         .id             =       "noop",
295         .priv_size      =       0,
296         .enqueue        =       noop_enqueue,
297         .dequeue        =       noop_dequeue,
298         .requeue        =       noop_requeue,
299         .owner          =       THIS_MODULE,
300 };
301
302 struct Qdisc noop_qdisc = {
303         .enqueue        =       noop_enqueue,
304         .dequeue        =       noop_dequeue,
305         .flags          =       TCQ_F_BUILTIN,
306         .ops            =       &noop_qdisc_ops,        
307         .list           =       LIST_HEAD_INIT(noop_qdisc.list),
308 };
309
310 static struct Qdisc_ops noqueue_qdisc_ops = {
311         .id             =       "noqueue",
312         .priv_size      =       0,
313         .enqueue        =       noop_enqueue,
314         .dequeue        =       noop_dequeue,
315         .requeue        =       noop_requeue,
316         .owner          =       THIS_MODULE,
317 };
318
319 static struct Qdisc noqueue_qdisc = {
320         .enqueue        =       NULL,
321         .dequeue        =       noop_dequeue,
322         .flags          =       TCQ_F_BUILTIN,
323         .ops            =       &noqueue_qdisc_ops,
324         .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
325 };
326
327
328 static const u8 prio2band[TC_PRIO_MAX+1] =
329         { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
330
331 /* 3-band FIFO queue: old style, but should be a bit faster than
332    generic prio+fifo combination.
333  */
334
335 #define PFIFO_FAST_BANDS 3
336
337 static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
338                                              struct Qdisc *qdisc)
339 {
340         struct sk_buff_head *list = qdisc_priv(qdisc);
341         return list + prio2band[skb->priority & TC_PRIO_MAX];
342 }
343
344 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
345 {
346         struct sk_buff_head *list = prio2list(skb, qdisc);
347
348         if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
349                 qdisc->q.qlen++;
350                 return __qdisc_enqueue_tail(skb, qdisc, list);
351         }
352
353         return qdisc_drop(skb, qdisc);
354 }
355
356 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
357 {
358         int prio;
359         struct sk_buff_head *list = qdisc_priv(qdisc);
360
361         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
362                 if (!skb_queue_empty(list + prio)) {
363                         qdisc->q.qlen--;
364                         return __qdisc_dequeue_head(qdisc, list + prio);
365                 }
366         }
367
368         return NULL;
369 }
370
371 static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
372 {
373         qdisc->q.qlen++;
374         return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
375 }
376
377 static void pfifo_fast_reset(struct Qdisc* qdisc)
378 {
379         int prio;
380         struct sk_buff_head *list = qdisc_priv(qdisc);
381
382         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
383                 __qdisc_reset_queue(qdisc, list + prio);
384
385         qdisc->qstats.backlog = 0;
386         qdisc->q.qlen = 0;
387 }
388
389 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
390 {
391         struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
392
393         memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
394         RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
395         return skb->len;
396
397 rtattr_failure:
398         return -1;
399 }
400
401 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
402 {
403         int prio;
404         struct sk_buff_head *list = qdisc_priv(qdisc);
405
406         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
407                 skb_queue_head_init(list + prio);
408
409         return 0;
410 }
411
412 static struct Qdisc_ops pfifo_fast_ops = {
413         .id             =       "pfifo_fast",
414         .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
415         .enqueue        =       pfifo_fast_enqueue,
416         .dequeue        =       pfifo_fast_dequeue,
417         .requeue        =       pfifo_fast_requeue,
418         .init           =       pfifo_fast_init,
419         .reset          =       pfifo_fast_reset,
420         .dump           =       pfifo_fast_dump,
421         .owner          =       THIS_MODULE,
422 };
423
424 struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
425 {
426         void *p;
427         struct Qdisc *sch;
428         unsigned int size;
429         int err = -ENOBUFS;
430
431         /* ensure that the Qdisc and the private data are 32-byte aligned */
432         size = QDISC_ALIGN(sizeof(*sch));
433         size += ops->priv_size + (QDISC_ALIGNTO - 1);
434
435         p = kzalloc(size, GFP_KERNEL);
436         if (!p)
437                 goto errout;
438         sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
439         sch->padded = (char *) sch - (char *) p;
440
441         INIT_LIST_HEAD(&sch->list);
442         skb_queue_head_init(&sch->q);
443         sch->ops = ops;
444         sch->enqueue = ops->enqueue;
445         sch->dequeue = ops->dequeue;
446         sch->dev = dev;
447         dev_hold(dev);
448         sch->stats_lock = &dev->queue_lock;
449         atomic_set(&sch->refcnt, 1);
450
451         return sch;
452 errout:
453         return ERR_PTR(-err);
454 }
455
456 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
457 {
458         struct Qdisc *sch;
459         
460         sch = qdisc_alloc(dev, ops);
461         if (IS_ERR(sch))
462                 goto errout;
463
464         if (!ops->init || ops->init(sch, NULL) == 0)
465                 return sch;
466
467         qdisc_destroy(sch);
468 errout:
469         return NULL;
470 }
471
472 /* Under dev->queue_lock and BH! */
473
474 void qdisc_reset(struct Qdisc *qdisc)
475 {
476         struct Qdisc_ops *ops = qdisc->ops;
477
478         if (ops->reset)
479                 ops->reset(qdisc);
480 }
481
482 /* this is the rcu callback function to clean up a qdisc when there 
483  * are no further references to it */
484
485 static void __qdisc_destroy(struct rcu_head *head)
486 {
487         struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
488         struct Qdisc_ops  *ops = qdisc->ops;
489
490 #ifdef CONFIG_NET_ESTIMATOR
491         gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
492 #endif
493         write_lock(&qdisc_tree_lock);
494         if (ops->reset)
495                 ops->reset(qdisc);
496         if (ops->destroy)
497                 ops->destroy(qdisc);
498         write_unlock(&qdisc_tree_lock);
499         module_put(ops->owner);
500
501         dev_put(qdisc->dev);
502         kfree((char *) qdisc - qdisc->padded);
503 }
504
505 /* Under dev->queue_lock and BH! */
506
507 void qdisc_destroy(struct Qdisc *qdisc)
508 {
509         struct list_head cql = LIST_HEAD_INIT(cql);
510         struct Qdisc *cq, *q, *n;
511
512         if (qdisc->flags & TCQ_F_BUILTIN ||
513                 !atomic_dec_and_test(&qdisc->refcnt))
514                 return;
515
516         if (!list_empty(&qdisc->list)) {
517                 if (qdisc->ops->cl_ops == NULL)
518                         list_del(&qdisc->list);
519                 else
520                         list_move(&qdisc->list, &cql);
521         }
522
523         /* unlink inner qdiscs from dev->qdisc_list immediately */
524         list_for_each_entry(cq, &cql, list)
525                 list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
526                         if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
527                                 if (q->ops->cl_ops == NULL)
528                                         list_del_init(&q->list);
529                                 else
530                                         list_move_tail(&q->list, &cql);
531                         }
532         list_for_each_entry_safe(cq, n, &cql, list)
533                 list_del_init(&cq->list);
534
535         call_rcu(&qdisc->q_rcu, __qdisc_destroy);
536 }
537
538 void dev_activate(struct net_device *dev)
539 {
540         /* No queueing discipline is attached to device;
541            create default one i.e. pfifo_fast for devices,
542            which need queueing and noqueue_qdisc for
543            virtual interfaces
544          */
545
546         if (dev->qdisc_sleeping == &noop_qdisc) {
547                 struct Qdisc *qdisc;
548                 if (dev->tx_queue_len) {
549                         qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
550                         if (qdisc == NULL) {
551                                 printk(KERN_INFO "%s: activation failed\n", dev->name);
552                                 return;
553                         }
554                         write_lock_bh(&qdisc_tree_lock);
555                         list_add_tail(&qdisc->list, &dev->qdisc_list);
556                         write_unlock_bh(&qdisc_tree_lock);
557                 } else {
558                         qdisc =  &noqueue_qdisc;
559                 }
560                 write_lock_bh(&qdisc_tree_lock);
561                 dev->qdisc_sleeping = qdisc;
562                 write_unlock_bh(&qdisc_tree_lock);
563         }
564
565         if (!netif_carrier_ok(dev))
566                 /* Delay activation until next carrier-on event */
567                 return;
568
569         spin_lock_bh(&dev->queue_lock);
570         rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
571         if (dev->qdisc != &noqueue_qdisc) {
572                 dev->trans_start = jiffies;
573                 dev_watchdog_up(dev);
574         }
575         spin_unlock_bh(&dev->queue_lock);
576 }
577
578 void dev_deactivate(struct net_device *dev)
579 {
580         struct Qdisc *qdisc;
581
582         spin_lock_bh(&dev->queue_lock);
583         qdisc = dev->qdisc;
584         dev->qdisc = &noop_qdisc;
585
586         qdisc_reset(qdisc);
587
588         spin_unlock_bh(&dev->queue_lock);
589
590         dev_watchdog_down(dev);
591
592         /* Wait for outstanding dev_queue_xmit calls. */
593         synchronize_rcu();
594
595         /* Wait for outstanding qdisc_run calls. */
596         while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
597                 yield();
598
599         if (dev->gso_skb) {
600                 kfree_skb(dev->gso_skb);
601                 dev->gso_skb = NULL;
602         }
603 }
604
605 void dev_init_scheduler(struct net_device *dev)
606 {
607         qdisc_lock_tree(dev);
608         dev->qdisc = &noop_qdisc;
609         dev->qdisc_sleeping = &noop_qdisc;
610         INIT_LIST_HEAD(&dev->qdisc_list);
611         qdisc_unlock_tree(dev);
612
613         dev_watchdog_init(dev);
614 }
615
616 void dev_shutdown(struct net_device *dev)
617 {
618         struct Qdisc *qdisc;
619
620         qdisc_lock_tree(dev);
621         qdisc = dev->qdisc_sleeping;
622         dev->qdisc = &noop_qdisc;
623         dev->qdisc_sleeping = &noop_qdisc;
624         qdisc_destroy(qdisc);
625 #if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
626         if ((qdisc = dev->qdisc_ingress) != NULL) {
627                 dev->qdisc_ingress = NULL;
628                 qdisc_destroy(qdisc);
629         }
630 #endif
631         BUG_TRAP(!timer_pending(&dev->watchdog_timer));
632         qdisc_unlock_tree(dev);
633 }
634
635 EXPORT_SYMBOL(__netdev_watchdog_up);
636 EXPORT_SYMBOL(netif_carrier_on);
637 EXPORT_SYMBOL(netif_carrier_off);
638 EXPORT_SYMBOL(noop_qdisc);
639 EXPORT_SYMBOL(noop_qdisc_ops);
640 EXPORT_SYMBOL(qdisc_create_dflt);
641 EXPORT_SYMBOL(qdisc_alloc);
642 EXPORT_SYMBOL(qdisc_destroy);
643 EXPORT_SYMBOL(qdisc_reset);
644 EXPORT_SYMBOL(qdisc_lock_tree);
645 EXPORT_SYMBOL(qdisc_unlock_tree);