[NET]: Add netif_tx_lock
/*
 * net/sched/sch_generic.c      Generic packet scheduler routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   However, modifications to data that participates in scheduling
   must additionally be protected by the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top-level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock_bh(qdisc_tree_lock)
     and this lock is used only in process context.
   - updates to the tree are made under the rtnl semaphore or
     from softirq context (the __qdisc_destroy rcu-callback),
     hence this lock needs local bh disabling.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
DEFINE_RWLOCK(qdisc_tree_lock);

void qdisc_lock_tree(struct net_device *dev)
{
        write_lock_bh(&qdisc_tree_lock);
        spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
        spin_unlock_bh(&dev->queue_lock);
        write_unlock_bh(&qdisc_tree_lock);
}
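
/* Example (illustrative sketch only, not part of the original code).
 * A minimal picture of how the rules above are obeyed: code that
 * changes the qdisc tree takes both locks in the documented order,
 * as dev_init_scheduler() below does, while a read-only walk in
 * process context takes only the read side of qdisc_tree_lock:
 *
 *      struct Qdisc *q;
 *
 *      qdisc_lock_tree(dev);
 *      ... attach or detach dev->qdisc here ...
 *      qdisc_unlock_tree(dev);
 *
 *      read_lock_bh(&qdisc_tree_lock);
 *      list_for_each_entry(q, &dev->qdisc_list, list) {
 *              ... inspect q, no modification ...
 *      }
 *      read_unlock_bh(&qdisc_tree_lock);
 */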

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is held, the other must be free.
 */


/* Kick device.
   Note that this procedure is invoked not only from dev_queue_xmit()
   but also from the TX softirq after netif_schedule(), so the device's
   queue-stopped state is (re)checked here under netif_tx_lock.

   Returns:  0  - queue is empty.
            >0  - queue is not empty: either the qdisc is throttled or
                  the packet was requeued and transmission rescheduled.
            <0  - one packet was handed to the driver (or dropped); the
                  caller may call again to drain the queue.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/
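
/* Example (illustrative sketch only): how a caller is expected to drive
 * qdisc_restart() given the return convention above.  This is a
 * simplified version of what qdisc_run() in include/net/pkt_sched.h and
 * dev_queue_xmit() do together, shown here only for orientation:
 *
 *      spin_lock_bh(&dev->queue_lock);
 *      while (!netif_queue_stopped(dev) && qdisc_restart(dev) < 0)
 *              ;
 *      spin_unlock_bh(&dev->queue_lock);
 */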

int qdisc_restart(struct net_device *dev)
{
        struct Qdisc *q = dev->qdisc;
        struct sk_buff *skb;

        /* Dequeue packet */
        if ((skb = q->dequeue(q)) != NULL) {
                unsigned nolock = (dev->features & NETIF_F_LLTX);
                /*
                 * When the driver has LLTX set it does its own locking
                 * in start_xmit. No need to add additional overhead by
                 * locking again. These checks are worth it because
                 * even uncongested locks can be quite expensive.
                 * The driver can do a trylock, like we do here too: on
                 * lock contention it should return NETDEV_TX_LOCKED and
                 * the packet will be requeued (see the sketch after this
                 * function).
                 */
                if (!nolock) {
                        if (!netif_tx_trylock(dev)) {
                        collision:
                                /* So, someone grabbed the driver. */

                                /* It may be a transient configuration error,
                                   when hard_start_xmit() recurses. We detect
                                   it by checking the xmit owner and drop the
                                   packet when a dead loop is detected.
                                */
                                if (dev->xmit_lock_owner == smp_processor_id()) {
                                        kfree_skb(skb);
                                        if (net_ratelimit())
                                                printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
                                        return -1;
                                }
                                __get_cpu_var(netdev_rx_stat).cpu_collision++;
                                goto requeue;
                        }
                }

                {
                        /* And release queue */
                        spin_unlock(&dev->queue_lock);

                        if (!netif_queue_stopped(dev)) {
                                int ret;
                                if (netdev_nit)
                                        dev_queue_xmit_nit(skb, dev);

                                ret = dev->hard_start_xmit(skb, dev);
                                if (ret == NETDEV_TX_OK) {
                                        if (!nolock) {
                                                netif_tx_unlock(dev);
                                        }
                                        spin_lock(&dev->queue_lock);
                                        return -1;
                                }
                                if (ret == NETDEV_TX_LOCKED && nolock) {
                                        spin_lock(&dev->queue_lock);
                                        goto collision;
                                }
                        }

                        /* NETDEV_TX_BUSY - we need to requeue */
                        /* Release the driver */
                        if (!nolock) {
                                netif_tx_unlock(dev);
                        }
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
                }

                /* Device kicked us out :(
                   This is possible in the following cases:

                   0. driver is locked
                   1. fastroute is enabled
                   2. device cannot determine busy state
                      before start of transmission (e.g. dialout)
                   3. device is buggy (ppp)
                 */

requeue:
                q->ops->requeue(skb, q);
                netif_schedule(dev);
                return 1;
        }
        BUG_ON((int) q->q.qlen < 0);
        return q->q.qlen;
}
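
/* Example (illustrative sketch only, hypothetical driver): the
 * NETIF_F_LLTX convention referenced in qdisc_restart() above.
 * foo_start_xmit, foo_priv and fp are made-up names; the point is the
 * trylock-or-return-NETDEV_TX_LOCKED pattern.  On contention the skb is
 * requeued by qdisc_restart():
 *
 *      static int foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
 *      {
 *              struct foo_priv *fp = netdev_priv(dev);
 *
 *              if (!spin_trylock(&fp->tx_lock))
 *                      return NETDEV_TX_LOCKED;
 *              ... queue skb to the hardware ring ...
 *              dev->trans_start = jiffies;
 *              spin_unlock(&fp->tx_lock);
 *              return NETDEV_TX_OK;
 *      }
 *
 * and the device sets dev->features |= NETIF_F_LLTX at init time.
 */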

static void dev_watchdog(unsigned long arg)
{
        struct net_device *dev = (struct net_device *)arg;

        netif_tx_lock(dev);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
                    netif_carrier_ok(dev)) {
                        if (netif_queue_stopped(dev) &&
                            time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

                                printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
                                       dev->name);
                                dev->tx_timeout(dev);
                        }
                        if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                                dev_hold(dev);
                }
        }
        netif_tx_unlock(dev);

        dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
        init_timer(&dev->watchdog_timer);
        dev->watchdog_timer.data = (unsigned long)dev;
        dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
        if (dev->tx_timeout) {
                if (dev->watchdog_timeo <= 0)
                        dev->watchdog_timeo = 5*HZ;
                if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                        dev_hold(dev);
        }
}

static void dev_watchdog_up(struct net_device *dev)
{
        netif_tx_lock_bh(dev);
        __netdev_watchdog_up(dev);
        netif_tx_unlock_bh(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
        netif_tx_lock_bh(dev);
        if (del_timer(&dev->watchdog_timer))
                dev_put(dev);
        netif_tx_unlock_bh(dev);
}
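
/* Example (illustrative sketch only, hypothetical driver): how a driver
 * hooks into the watchdog above.  It supplies a tx_timeout handler,
 * which dev_watchdog() calls under netif_tx_lock, and a timeout; a
 * timeout of 0 falls back to the 5*HZ default in __netdev_watchdog_up():
 *
 *      static void foo_tx_timeout(struct net_device *dev)
 *      {
 *              ... reset the transmit hardware ...
 *              netif_wake_queue(dev);
 *      }
 *
 * and at probe time, before register_netdev():
 *
 *      dev->tx_timeout = foo_tx_timeout;
 *      dev->watchdog_timeo = 2 * HZ;
 */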

void netif_carrier_on(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
        if (netif_running(dev))
                __netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
}
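
/* Example (illustrative sketch only, hypothetical driver): a driver
 * reports link changes from its link-state interrupt or poll routine,
 * and the linkwatch and watchdog code above reacts to them.
 * link_is_up() and fp stand in for whatever the hardware provides:
 *
 *      if (link_is_up(fp))
 *              netif_carrier_on(dev);
 *      else
 *              netif_carrier_off(dev);
 */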

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
        kfree_skb(skb);
        return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
        return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        if (net_ratelimit())
                printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
                       skb->dev->name);
        kfree_skb(skb);
        return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
        .id             =       "noop",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

struct Qdisc noop_qdisc = {
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noop_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
        .id             =       "noqueue",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
        .enqueue        =       NULL,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noqueue_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   the generic prio+fifo combination.
 */
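
/* A worked example of the mapping above (assuming the usual TC_PRIO_*
 * values from <linux/pkt_sched.h>): skb->priority is masked with
 * TC_PRIO_MAX and used as an index into prio2band, so
 *
 *      band = prio2band[skb->priority & TC_PRIO_MAX];
 *      prio2band[TC_PRIO_INTERACTIVE] == 0     (served first)
 *      prio2band[TC_PRIO_BESTEFFORT]  == 1
 *      prio2band[TC_PRIO_BULK]        == 2     (served last)
 */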

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
                                             struct Qdisc *qdisc)
{
        struct sk_buff_head *list = qdisc_priv(qdisc);
        return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        struct sk_buff_head *list = prio2list(skb, qdisc);

        if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
                qdisc->q.qlen++;
                return __qdisc_enqueue_tail(skb, qdisc, list);
        }

        return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
                if (!skb_queue_empty(list + prio)) {
                        qdisc->q.qlen--;
                        return __qdisc_dequeue_head(qdisc, list + prio);
                }
        }

        return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        qdisc->q.qlen++;
        return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                __qdisc_reset_queue(qdisc, list + prio);

        qdisc->qstats.backlog = 0;
        qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
        struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

        memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

rtattr_failure:
        return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                skb_queue_head_init(list + prio);

        return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
        .id             =       "pfifo_fast",
        .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
        .enqueue        =       pfifo_fast_enqueue,
        .dequeue        =       pfifo_fast_dequeue,
        .requeue        =       pfifo_fast_requeue,
        .init           =       pfifo_fast_init,
        .reset          =       pfifo_fast_reset,
        .dump           =       pfifo_fast_dump,
        .owner          =       THIS_MODULE,
};

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
        void *p;
        struct Qdisc *sch;
        unsigned int size;
        int err = -ENOBUFS;

        /* ensure that the Qdisc and the private data are 32-byte aligned */
        size = QDISC_ALIGN(sizeof(*sch));
        size += ops->priv_size + (QDISC_ALIGNTO - 1);

        p = kmalloc(size, GFP_KERNEL);
        if (!p)
                goto errout;
        memset(p, 0, size);
        sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
        sch->padded = (char *) sch - (char *) p;

        INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        dev_hold(dev);
        sch->stats_lock = &dev->queue_lock;
        atomic_set(&sch->refcnt, 1);

        return sch;
errout:
        /* err is already negative (-ENOBUFS); do not negate it again,
           or IS_ERR() in the callers will not see the failure. */
        return ERR_PTR(err);
}
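
/* A worked example of the alignment trick in qdisc_alloc() above,
 * assuming QDISC_ALIGNTO is 32 as the in-function comment says.  If
 * kmalloc() returns an address ending in 0xe8, then
 *
 *      p           = 0x...0e8          (from kmalloc)
 *      sch         = 0x...100          (rounded up to a 32-byte boundary)
 *      sch->padded = 0x18              (24 bytes, undone in __qdisc_destroy)
 *
 * The extra (QDISC_ALIGNTO - 1) bytes added to size guarantee that
 * rounding up never runs past the end of the allocation, and
 * qdisc_priv() finds the private area right after the aligned Qdisc.
 */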

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
        struct Qdisc *sch;

        sch = qdisc_alloc(dev, ops);
        if (IS_ERR(sch))
                goto errout;

        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;

        qdisc_destroy(sch);
errout:
        return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (ops->reset)
                ops->reset(qdisc);
}

/* This is the RCU callback that cleans up a qdisc once there
 * are no further references to it. */

static void __qdisc_destroy(struct rcu_head *head)
{
        struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
        struct Qdisc_ops *ops = qdisc->ops;

#ifdef CONFIG_NET_ESTIMATOR
        gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
        write_lock(&qdisc_tree_lock);
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);
        write_unlock(&qdisc_tree_lock);
        module_put(ops->owner);

        dev_put(qdisc->dev);
        kfree((char *) qdisc - qdisc->padded);
}
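
/* Example (illustrative sketch only): the reader side that the RCU
 * deferral above protects, roughly what dev_queue_xmit() does:
 *
 *      rcu_read_lock_bh();
 *      q = rcu_dereference(dev->qdisc);
 *      ... q->enqueue(skb, q) under dev->queue_lock ...
 *      rcu_read_unlock_bh();
 *
 * Readers that found the old qdisc may still be running when
 * qdisc_destroy() below is called, so the actual freeing is deferred
 * for a grace period via call_rcu().
 */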

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
        struct list_head cql = LIST_HEAD_INIT(cql);
        struct Qdisc *cq, *q, *n;

        if (qdisc->flags & TCQ_F_BUILTIN ||
            !atomic_dec_and_test(&qdisc->refcnt))
                return;

        if (!list_empty(&qdisc->list)) {
                if (qdisc->ops->cl_ops == NULL)
                        list_del(&qdisc->list);
                else
                        list_move(&qdisc->list, &cql);
        }

        /* unlink inner qdiscs from dev->qdisc_list immediately */
        list_for_each_entry(cq, &cql, list)
                list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
                        if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
                                if (q->ops->cl_ops == NULL)
                                        list_del_init(&q->list);
                                else
                                        list_move_tail(&q->list, &cql);
                        }
        list_for_each_entry_safe(cq, n, &cql, list)
                list_del_init(&cq->list);

        call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
        /* If no queueing discipline is attached to the device, create a
           default one: pfifo_fast for devices that need queueing, and
           noqueue_qdisc for virtual interfaces.
         */

        if (dev->qdisc_sleeping == &noop_qdisc) {
                struct Qdisc *qdisc;
                if (dev->tx_queue_len) {
                        qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
                        if (qdisc == NULL) {
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }
                        write_lock_bh(&qdisc_tree_lock);
                        list_add_tail(&qdisc->list, &dev->qdisc_list);
                        write_unlock_bh(&qdisc_tree_lock);
                } else {
                        qdisc = &noqueue_qdisc;
                }
                write_lock_bh(&qdisc_tree_lock);
                dev->qdisc_sleeping = qdisc;
                write_unlock_bh(&qdisc_tree_lock);
        }

        if (!netif_carrier_ok(dev))
                /* Delay activation until next carrier-on event */
                return;

        spin_lock_bh(&dev->queue_lock);
        rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
        if (dev->qdisc != &noqueue_qdisc) {
                dev->trans_start = jiffies;
                dev_watchdog_up(dev);
        }
        spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
        struct Qdisc *qdisc;

        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc;
        dev->qdisc = &noop_qdisc;

        qdisc_reset(qdisc);

        spin_unlock_bh(&dev->queue_lock);

        dev_watchdog_down(dev);

        while (test_bit(__LINK_STATE_SCHED, &dev->state))
                yield();

        spin_unlock_wait(&dev->_xmit_lock);
}
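
/* A rough sketch of when the two functions above run (assumption: the
 * usual net/core/dev.c call sites, shown only for orientation):
 *
 *      dev_open(dev)  -> ... -> dev_activate(dev);
 *      dev_close(dev) -> dev_deactivate(dev) -> ...;
 *
 * so a device transmits through dev->qdisc only between open and close,
 * and dev_deactivate() waits for any in-flight TX softirq work to
 * finish before returning.
 */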

void dev_init_scheduler(struct net_device *dev)
{
        qdisc_lock_tree(dev);
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        INIT_LIST_HEAD(&dev->qdisc_list);
        qdisc_unlock_tree(dev);

        dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
        struct Qdisc *qdisc;

        qdisc_lock_tree(dev);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
        if ((qdisc = dev->qdisc_ingress) != NULL) {
                dev->qdisc_ingress = NULL;
                qdisc_destroy(qdisc);
        }
#endif
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
        qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_alloc);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_restart);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);