Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
[linux-2.6] / net / sched / sch_generic.c
1 /*
2  * net/sched/sch_generic.c      Generic packet scheduler routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
11  *              - Ingress support
12  */
13
14 #include <asm/uaccess.h>
15 #include <asm/system.h>
16 #include <linux/bitops.h>
17 #include <linux/module.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/netdevice.h>
29 #include <linux/skbuff.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/init.h>
32 #include <linux/rcupdate.h>
33 #include <linux/list.h>
34 #include <net/sock.h>
35 #include <net/pkt_sched.h>
36
37 /* Main transmission queue. */
38
39 /* Main qdisc structure lock. 
40
41    However, modifications
42    to data, participating in scheduling must be additionally
43    protected with dev->queue_lock spinlock.
44
45    The idea is the following:
46    - enqueue, dequeue are serialized via top level device
47      spinlock dev->queue_lock.
48    - tree walking is protected by read_lock(qdisc_tree_lock)
49      and this lock is used only in process context.
50    - updates to tree are made only under rtnl semaphore,
51      hence this lock may be made without local bh disabling.
52
53    qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
54  */
55 DEFINE_RWLOCK(qdisc_tree_lock);
56
57 void qdisc_lock_tree(struct net_device *dev)
58 {
59         write_lock(&qdisc_tree_lock);
60         spin_lock_bh(&dev->queue_lock);
61 }
62
63 void qdisc_unlock_tree(struct net_device *dev)
64 {
65         spin_unlock_bh(&dev->queue_lock);
66         write_unlock(&qdisc_tree_lock);
67 }
68
69 /* 
70    dev->queue_lock serializes queue accesses for this device
71    AND dev->qdisc pointer itself.
72
73    netif_tx_lock serializes accesses to device driver.
74
75    dev->queue_lock and netif_tx_lock are mutually exclusive,
76    if one is grabbed, another must be free.
77  */
78
79
80 /* Kick device.
81    Note, that this procedure can be called by a watchdog timer, so that
82    we do not check dev->tbusy flag here.
83
84    Returns:  0  - queue is empty.
85             >0  - queue is not empty, but throttled.
86             <0  - queue is not empty. Device is throttled, if dev->tbusy != 0.
87
88    NOTE: Called under dev->queue_lock with locally disabled BH.
89 */
90
91 static inline int qdisc_restart(struct net_device *dev)
92 {
93         struct Qdisc *q = dev->qdisc;
94         struct sk_buff *skb;
95
96         /* Dequeue packet */
97         if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
98                 unsigned nolock = (dev->features & NETIF_F_LLTX);
99
100                 dev->gso_skb = NULL;
101
102                 /*
103                  * When the driver has LLTX set it does its own locking
104                  * in start_xmit. No need to add additional overhead by
105                  * locking again. These checks are worth it because
106                  * even uncongested locks can be quite expensive.
107                  * The driver can do trylock like here too, in case
108                  * of lock congestion it should return -1 and the packet
109                  * will be requeued.
110                  */
111                 if (!nolock) {
112                         if (!netif_tx_trylock(dev)) {
113                         collision:
114                                 /* So, someone grabbed the driver. */
115                                 
116                                 /* It may be transient configuration error,
117                                    when hard_start_xmit() recurses. We detect
118                                    it by checking xmit owner and drop the
119                                    packet when deadloop is detected.
120                                 */
121                                 if (dev->xmit_lock_owner == smp_processor_id()) {
122                                         kfree_skb(skb);
123                                         if (net_ratelimit())
124                                                 printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
125                                         return -1;
126                                 }
127                                 __get_cpu_var(netdev_rx_stat).cpu_collision++;
128                                 goto requeue;
129                         }
130                 }
131                 
132                 {
133                         /* And release queue */
134                         spin_unlock(&dev->queue_lock);
135
136                         if (!netif_queue_stopped(dev)) {
137                                 int ret;
138
139                                 ret = dev_hard_start_xmit(skb, dev);
140                                 if (ret == NETDEV_TX_OK) { 
141                                         if (!nolock) {
142                                                 netif_tx_unlock(dev);
143                                         }
144                                         spin_lock(&dev->queue_lock);
145                                         return -1;
146                                 }
147                                 if (ret == NETDEV_TX_LOCKED && nolock) {
148                                         spin_lock(&dev->queue_lock);
149                                         goto collision; 
150                                 }
151                         }
152
153                         /* NETDEV_TX_BUSY - we need to requeue */
154                         /* Release the driver */
155                         if (!nolock) { 
156                                 netif_tx_unlock(dev);
157                         } 
158                         spin_lock(&dev->queue_lock);
159                         q = dev->qdisc;
160                 }
161
162                 /* Device kicked us out :(
163                    This is possible in three cases:
164
165                    0. driver is locked
166                    1. fastroute is enabled
167                    2. device cannot determine busy state
168                       before start of transmission (f.e. dialout)
169                    3. device is buggy (ppp)
170                  */
171
172 requeue:
173                 if (skb->next)
174                         dev->gso_skb = skb;
175                 else
176                         q->ops->requeue(skb, q);
177                 netif_schedule(dev);
178                 return 1;
179         }
180         BUG_ON((int) q->q.qlen < 0);
181         return q->q.qlen;
182 }
183
184 void __qdisc_run(struct net_device *dev)
185 {
186         if (unlikely(dev->qdisc == &noop_qdisc))
187                 goto out;
188
189         while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
190                 /* NOTHING */;
191
192 out:
193         clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
194 }
195
196 static void dev_watchdog(unsigned long arg)
197 {
198         struct net_device *dev = (struct net_device *)arg;
199
200         netif_tx_lock(dev);
201         if (dev->qdisc != &noop_qdisc) {
202                 if (netif_device_present(dev) &&
203                     netif_running(dev) &&
204                     netif_carrier_ok(dev)) {
205                         if (netif_queue_stopped(dev) &&
206                             time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
207
208                                 printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
209                                        dev->name);
210                                 dev->tx_timeout(dev);
211                         }
212                         if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
213                                 dev_hold(dev);
214                 }
215         }
216         netif_tx_unlock(dev);
217
218         dev_put(dev);
219 }
220
221 static void dev_watchdog_init(struct net_device *dev)
222 {
223         init_timer(&dev->watchdog_timer);
224         dev->watchdog_timer.data = (unsigned long)dev;
225         dev->watchdog_timer.function = dev_watchdog;
226 }
227
228 void __netdev_watchdog_up(struct net_device *dev)
229 {
230         if (dev->tx_timeout) {
231                 if (dev->watchdog_timeo <= 0)
232                         dev->watchdog_timeo = 5*HZ;
233                 if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
234                         dev_hold(dev);
235         }
236 }
237
/* File-local alias for __netdev_watchdog_up(): start the tx watchdog. */
static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}
242
243 static void dev_watchdog_down(struct net_device *dev)
244 {
245         netif_tx_lock_bh(dev);
246         if (del_timer(&dev->watchdog_timer))
247                 dev_put(dev);
248         netif_tx_unlock_bh(dev);
249 }
250
251 void netif_carrier_on(struct net_device *dev)
252 {
253         if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
254                 linkwatch_fire_event(dev);
255         if (netif_running(dev))
256                 __netdev_watchdog_up(dev);
257 }
258
259 void netif_carrier_off(struct net_device *dev)
260 {
261         if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
262                 linkwatch_fire_event(dev);
263 }
264
265 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
266    under all circumstances. It is difficult to invent anything faster or
267    cheaper.
268  */
269
270 static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
271 {
272         kfree_skb(skb);
273         return NET_XMIT_CN;
274 }
275
/*
 * Dequeue operation of the noop scheduler: there is never anything
 * queued, so this always returns NULL.  @qdisc is not examined.
 */
static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
	struct sk_buff *nothing = NULL;

	return nothing;
}
280
281 static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
282 {
283         if (net_ratelimit())
284                 printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
285                        skb->dev->name);
286         kfree_skb(skb);
287         return NET_XMIT_CN;
288 }
289
290 struct Qdisc_ops noop_qdisc_ops = {
291         .id             =       "noop",
292         .priv_size      =       0,
293         .enqueue        =       noop_enqueue,
294         .dequeue        =       noop_dequeue,
295         .requeue        =       noop_requeue,
296         .owner          =       THIS_MODULE,
297 };
298
299 struct Qdisc noop_qdisc = {
300         .enqueue        =       noop_enqueue,
301         .dequeue        =       noop_dequeue,
302         .flags          =       TCQ_F_BUILTIN,
303         .ops            =       &noop_qdisc_ops,        
304         .list           =       LIST_HEAD_INIT(noop_qdisc.list),
305 };
306
307 static struct Qdisc_ops noqueue_qdisc_ops = {
308         .id             =       "noqueue",
309         .priv_size      =       0,
310         .enqueue        =       noop_enqueue,
311         .dequeue        =       noop_dequeue,
312         .requeue        =       noop_requeue,
313         .owner          =       THIS_MODULE,
314 };
315
316 static struct Qdisc noqueue_qdisc = {
317         .enqueue        =       NULL,
318         .dequeue        =       noop_dequeue,
319         .flags          =       TCQ_F_BUILTIN,
320         .ops            =       &noqueue_qdisc_ops,
321         .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
322 };
323
324
325 static const u8 prio2band[TC_PRIO_MAX+1] =
326         { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
327
328 /* 3-band FIFO queue: old style, but should be a bit faster than
329    generic prio+fifo combination.
330  */
331
332 #define PFIFO_FAST_BANDS 3
333
334 static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
335                                              struct Qdisc *qdisc)
336 {
337         struct sk_buff_head *list = qdisc_priv(qdisc);
338         return list + prio2band[skb->priority & TC_PRIO_MAX];
339 }
340
341 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
342 {
343         struct sk_buff_head *list = prio2list(skb, qdisc);
344
345         if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
346                 qdisc->q.qlen++;
347                 return __qdisc_enqueue_tail(skb, qdisc, list);
348         }
349
350         return qdisc_drop(skb, qdisc);
351 }
352
353 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
354 {
355         int prio;
356         struct sk_buff_head *list = qdisc_priv(qdisc);
357
358         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
359                 if (!skb_queue_empty(list + prio)) {
360                         qdisc->q.qlen--;
361                         return __qdisc_dequeue_head(qdisc, list + prio);
362                 }
363         }
364
365         return NULL;
366 }
367
368 static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
369 {
370         qdisc->q.qlen++;
371         return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
372 }
373
374 static void pfifo_fast_reset(struct Qdisc* qdisc)
375 {
376         int prio;
377         struct sk_buff_head *list = qdisc_priv(qdisc);
378
379         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
380                 __qdisc_reset_queue(qdisc, list + prio);
381
382         qdisc->qstats.backlog = 0;
383         qdisc->q.qlen = 0;
384 }
385
386 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
387 {
388         struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
389
390         memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
391         RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
392         return skb->len;
393
394 rtattr_failure:
395         return -1;
396 }
397
398 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
399 {
400         int prio;
401         struct sk_buff_head *list = qdisc_priv(qdisc);
402
403         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
404                 skb_queue_head_init(list + prio);
405
406         return 0;
407 }
408
409 static struct Qdisc_ops pfifo_fast_ops = {
410         .id             =       "pfifo_fast",
411         .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
412         .enqueue        =       pfifo_fast_enqueue,
413         .dequeue        =       pfifo_fast_dequeue,
414         .requeue        =       pfifo_fast_requeue,
415         .init           =       pfifo_fast_init,
416         .reset          =       pfifo_fast_reset,
417         .dump           =       pfifo_fast_dump,
418         .owner          =       THIS_MODULE,
419 };
420
421 struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
422 {
423         void *p;
424         struct Qdisc *sch;
425         unsigned int size;
426         int err = -ENOBUFS;
427
428         /* ensure that the Qdisc and the private data are 32-byte aligned */
429         size = QDISC_ALIGN(sizeof(*sch));
430         size += ops->priv_size + (QDISC_ALIGNTO - 1);
431
432         p = kzalloc(size, GFP_KERNEL);
433         if (!p)
434                 goto errout;
435         sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
436         sch->padded = (char *) sch - (char *) p;
437
438         INIT_LIST_HEAD(&sch->list);
439         skb_queue_head_init(&sch->q);
440         sch->ops = ops;
441         sch->enqueue = ops->enqueue;
442         sch->dequeue = ops->dequeue;
443         sch->dev = dev;
444         dev_hold(dev);
445         sch->stats_lock = &dev->queue_lock;
446         atomic_set(&sch->refcnt, 1);
447
448         return sch;
449 errout:
450         return ERR_PTR(-err);
451 }
452
453 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
454                                  unsigned int parentid)
455 {
456         struct Qdisc *sch;
457         
458         sch = qdisc_alloc(dev, ops);
459         if (IS_ERR(sch))
460                 goto errout;
461         sch->parent = parentid;
462
463         if (!ops->init || ops->init(sch, NULL) == 0)
464                 return sch;
465
466         qdisc_destroy(sch);
467 errout:
468         return NULL;
469 }
470
471 /* Under dev->queue_lock and BH! */
472
473 void qdisc_reset(struct Qdisc *qdisc)
474 {
475         struct Qdisc_ops *ops = qdisc->ops;
476
477         if (ops->reset)
478                 ops->reset(qdisc);
479 }
480
481 /* this is the rcu callback function to clean up a qdisc when there 
482  * are no further references to it */
483
484 static void __qdisc_destroy(struct rcu_head *head)
485 {
486         struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
487         kfree((char *) qdisc - qdisc->padded);
488 }
489
490 /* Under dev->queue_lock and BH! */
491
492 void qdisc_destroy(struct Qdisc *qdisc)
493 {
494         struct Qdisc_ops  *ops = qdisc->ops;
495
496         if (qdisc->flags & TCQ_F_BUILTIN ||
497             !atomic_dec_and_test(&qdisc->refcnt))
498                 return;
499
500         list_del(&qdisc->list);
501 #ifdef CONFIG_NET_ESTIMATOR
502         gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
503 #endif
504         if (ops->reset)
505                 ops->reset(qdisc);
506         if (ops->destroy)
507                 ops->destroy(qdisc);
508
509         module_put(ops->owner);
510         dev_put(qdisc->dev);
511         call_rcu(&qdisc->q_rcu, __qdisc_destroy);
512 }
513
514 void dev_activate(struct net_device *dev)
515 {
516         /* No queueing discipline is attached to device;
517            create default one i.e. pfifo_fast for devices,
518            which need queueing and noqueue_qdisc for
519            virtual interfaces
520          */
521
522         if (dev->qdisc_sleeping == &noop_qdisc) {
523                 struct Qdisc *qdisc;
524                 if (dev->tx_queue_len) {
525                         qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
526                                                   TC_H_ROOT);
527                         if (qdisc == NULL) {
528                                 printk(KERN_INFO "%s: activation failed\n", dev->name);
529                                 return;
530                         }
531                         write_lock(&qdisc_tree_lock);
532                         list_add_tail(&qdisc->list, &dev->qdisc_list);
533                         write_unlock(&qdisc_tree_lock);
534                 } else {
535                         qdisc =  &noqueue_qdisc;
536                 }
537                 write_lock(&qdisc_tree_lock);
538                 dev->qdisc_sleeping = qdisc;
539                 write_unlock(&qdisc_tree_lock);
540         }
541
542         if (!netif_carrier_ok(dev))
543                 /* Delay activation until next carrier-on event */
544                 return;
545
546         spin_lock_bh(&dev->queue_lock);
547         rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
548         if (dev->qdisc != &noqueue_qdisc) {
549                 dev->trans_start = jiffies;
550                 dev_watchdog_up(dev);
551         }
552         spin_unlock_bh(&dev->queue_lock);
553 }
554
555 void dev_deactivate(struct net_device *dev)
556 {
557         struct Qdisc *qdisc;
558
559         spin_lock_bh(&dev->queue_lock);
560         qdisc = dev->qdisc;
561         dev->qdisc = &noop_qdisc;
562
563         qdisc_reset(qdisc);
564
565         spin_unlock_bh(&dev->queue_lock);
566
567         dev_watchdog_down(dev);
568
569         /* Wait for outstanding dev_queue_xmit calls. */
570         synchronize_rcu();
571
572         /* Wait for outstanding qdisc_run calls. */
573         while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
574                 yield();
575
576         if (dev->gso_skb) {
577                 kfree_skb(dev->gso_skb);
578                 dev->gso_skb = NULL;
579         }
580 }
581
582 void dev_init_scheduler(struct net_device *dev)
583 {
584         qdisc_lock_tree(dev);
585         dev->qdisc = &noop_qdisc;
586         dev->qdisc_sleeping = &noop_qdisc;
587         INIT_LIST_HEAD(&dev->qdisc_list);
588         qdisc_unlock_tree(dev);
589
590         dev_watchdog_init(dev);
591 }
592
593 void dev_shutdown(struct net_device *dev)
594 {
595         struct Qdisc *qdisc;
596
597         qdisc_lock_tree(dev);
598         qdisc = dev->qdisc_sleeping;
599         dev->qdisc = &noop_qdisc;
600         dev->qdisc_sleeping = &noop_qdisc;
601         qdisc_destroy(qdisc);
602 #if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
603         if ((qdisc = dev->qdisc_ingress) != NULL) {
604                 dev->qdisc_ingress = NULL;
605                 qdisc_destroy(qdisc);
606         }
607 #endif
608         BUG_TRAP(!timer_pending(&dev->watchdog_timer));
609         qdisc_unlock_tree(dev);
610 }
611
/* Carrier state notification. */
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);

/* The built-in noop qdisc and generic qdisc lifetime helpers. */
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);

/* Qdisc tree locking. */
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);