git.oblomov.eu Git - linux-2.6/blob - net/sched/sch_sfq.c

   1 /*
   2  * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  */
  11
  12 #include <linux/module.h>
  13 #include <asm/uaccess.h>
  14 #include <asm/system.h>
  15 #include <linux/bitops.h>
  16 #include <linux/types.h>
  17 #include <linux/kernel.h>
  18 #include <linux/jiffies.h>
  19 #include <linux/string.h>
  20 #include <linux/mm.h>
  21 #include <linux/socket.h>
  22 #include <linux/sockios.h>
  23 #include <linux/in.h>
  24 #include <linux/errno.h>
  25 #include <linux/interrupt.h>
  26 #include <linux/if_ether.h>
  27 #include <linux/inet.h>
  28 #include <linux/netdevice.h>
  29 #include <linux/etherdevice.h>
  30 #include <linux/notifier.h>
  31 #include <linux/init.h>
  32 #include <net/ip.h>
  33 #include <net/netlink.h>
  34 #include <linux/ipv6.h>
  35 #include <net/route.h>
  36 #include <linux/skbuff.h>
  37 #include <net/sock.h>
  38 #include <net/pkt_sched.h>
  39
  40
  41 /*      Stochastic Fairness Queuing algorithm.
  42         =======================================
  43
  44         Source:
  45         Paul E. McKenney "Stochastic Fairness Queuing",
  46         IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
  47
  48         Paul E. McKenney "Stochastic Fairness Queuing",
  49         "Interworking: Research and Experience", v.2, 1991, p.113-131.
  50
  51
  52         See also:
  53         M. Shreedhar and George Varghese "Efficient Fair
  54         Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
  55
  56
  57         This is not the thing that is usually called (W)FQ nowadays.
  58         It does not use any timestamp mechanism, but instead
  59         processes queues in round-robin order.
  60
  61         ADVANTAGE:
  62
  63         - It is very cheap. Both CPU and memory requirements are minimal.
  64
  65         DRAWBACKS:
  66
  67         - "Stochastic" -> It is not 100% fair.
  68         When hash collisions occur, several flows are considered as one.
  69
  70         - "Round-robin" -> It introduces larger delays than virtual clock
  71         based schemes, and should not be used for isolating interactive
  72         traffic from non-interactive. It means, that this scheduler
  73         should be used as leaf of CBQ or P3, which put interactive traffic
  74         to higher priority band.
  75
  76         We still need true WFQ for top level CSZ, but using WFQ
  77         for the best effort traffic is absolutely pointless:
  78         SFQ is superior for this purpose.
  79
  80         IMPLEMENTATION:
  81         This implementation limits maximal queue length to 128;
  82         maximal mtu to 2^15-1; number of hash buckets to 1024.
  83         The only goal of this restrictions was that all data
  84         fit into one 4K page :-). Struct sfq_sched_data is
  85         organized in anti-cache manner: all the data for a bucket
  86         are scattered over different locations. This is not good,
  87         but it allowed me to put it into 4K.
  88
  89         It is easy to increase these values, but not in flight.  */
  90
  91 #define SFQ_DEPTH               128
  92 #define SFQ_HASH_DIVISOR        1024
  93
  94 /* This type should contain at least SFQ_DEPTH*2 values */
  95 typedef unsigned char sfq_index;
  96
  97 struct sfq_head
  98 {
  99         sfq_index       next;
 100         sfq_index       prev;
 101 };
 102
 103 struct sfq_sched_data
 104 {
 105 /* Parameters */
 106         int             perturb_period;
 107         unsigned        quantum;        /* Allotment per round: MUST BE >= MTU */
 108         int             limit;
 109
 110 /* Variables */
 111         struct timer_list perturb_timer;
 112         int             perturbation;
 113         sfq_index       tail;           /* Index of current slot in round */
 114         sfq_index       max_depth;      /* Maximal depth */
 115
 116         sfq_index       ht[SFQ_HASH_DIVISOR];   /* Hash table */
 117         sfq_index       next[SFQ_DEPTH];        /* Active slots link */
 118         short           allot[SFQ_DEPTH];       /* Current allotment per slot */
 119         unsigned short  hash[SFQ_DEPTH];        /* Hash value indexed by slots */
 120         struct sk_buff_head     qs[SFQ_DEPTH];          /* Slot queue */
 121         struct sfq_head dep[SFQ_DEPTH*2];       /* Linked list of slots, indexed by depth */
 122 };
 123
 124 static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
 125 {
 126         int pert = q->perturbation;
 127
 128         /* Have we any rotation primitives? If not, WHY? */
 129         h ^= (h1<<pert) ^ (h1>>(0x1F - pert));
 130         h ^= h>>10;
 131         return h & 0x3FF;
 132 }
 133
 134 static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 135 {
 136         u32 h, h2;
 137
 138         switch (skb->protocol) {
 139         case __constant_htons(ETH_P_IP):
 140         {
 141                 const struct iphdr *iph = ip_hdr(skb);
 142                 h = iph->daddr;
 143                 h2 = iph->saddr^iph->protocol;
 144                 if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
 145                     (iph->protocol == IPPROTO_TCP ||
 146                      iph->protocol == IPPROTO_UDP ||
 147                      iph->protocol == IPPROTO_UDPLITE ||
 148                      iph->protocol == IPPROTO_SCTP ||
 149                      iph->protocol == IPPROTO_DCCP ||
 150                      iph->protocol == IPPROTO_ESP))
 151                         h2 ^= *(((u32*)iph) + iph->ihl);
 152                 break;
 153         }
 154         case __constant_htons(ETH_P_IPV6):
 155         {
 156                 struct ipv6hdr *iph = ipv6_hdr(skb);
 157                 h = iph->daddr.s6_addr32[3];
 158                 h2 = iph->saddr.s6_addr32[3]^iph->nexthdr;
 159                 if (iph->nexthdr == IPPROTO_TCP ||
 160                     iph->nexthdr == IPPROTO_UDP ||
 161                     iph->nexthdr == IPPROTO_UDPLITE ||
 162                     iph->nexthdr == IPPROTO_SCTP ||
 163                     iph->nexthdr == IPPROTO_DCCP ||
 164                     iph->nexthdr == IPPROTO_ESP)
 165                         h2 ^= *(u32*)&iph[1];
 166                 break;
 167         }
 168         default:
 169                 h = (u32)(unsigned long)skb->dst^skb->protocol;
 170                 h2 = (u32)(unsigned long)skb->sk;
 171         }
 172         return sfq_fold_hash(q, h, h2);
 173 }
 174
 175 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 176 {
 177         sfq_index p, n;
 178         int d = q->qs[x].qlen + SFQ_DEPTH;
 179
 180         p = d;
 181         n = q->dep[d].next;
 182         q->dep[x].next = n;
 183         q->dep[x].prev = p;
 184         q->dep[p].next = q->dep[n].prev = x;
 185 }
 186
 187 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
 188 {
 189         sfq_index p, n;
 190
 191         n = q->dep[x].next;
 192         p = q->dep[x].prev;
 193         q->dep[p].next = n;
 194         q->dep[n].prev = p;
 195
 196         if (n == p && q->max_depth == q->qs[x].qlen + 1)
 197                 q->max_depth--;
 198
 199         sfq_link(q, x);
 200 }
 201
 202 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
 203 {
 204         sfq_index p, n;
 205         int d;
 206
 207         n = q->dep[x].next;
 208         p = q->dep[x].prev;
 209         q->dep[p].next = n;
 210         q->dep[n].prev = p;
 211         d = q->qs[x].qlen;
 212         if (q->max_depth < d)
 213                 q->max_depth = d;
 214
 215         sfq_link(q, x);
 216 }
 217
 218 static unsigned int sfq_drop(struct Qdisc *sch)
 219 {
 220         struct sfq_sched_data *q = qdisc_priv(sch);
 221         sfq_index d = q->max_depth;
 222         struct sk_buff *skb;
 223         unsigned int len;
 224
 225         /* Queue is full! Find the longest slot and
 226            drop a packet from it */
 227
 228         if (d > 1) {
 229                 sfq_index x = q->dep[d+SFQ_DEPTH].next;
 230                 skb = q->qs[x].prev;
 231                 len = skb->len;
 232                 __skb_unlink(skb, &q->qs[x]);
 233                 kfree_skb(skb);
 234                 sfq_dec(q, x);
 235                 sch->q.qlen--;
 236                 sch->qstats.drops++;
 237                 sch->qstats.backlog -= len;
 238                 return len;
 239         }
 240
 241         if (d == 1) {
 242                 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
 243                 d = q->next[q->tail];
 244                 q->next[q->tail] = q->next[d];
 245                 q->allot[q->next[d]] += q->quantum;
 246                 skb = q->qs[d].prev;
 247                 len = skb->len;
 248                 __skb_unlink(skb, &q->qs[d]);
 249                 kfree_skb(skb);
 250                 sfq_dec(q, d);
 251                 sch->q.qlen--;
 252                 q->ht[q->hash[d]] = SFQ_DEPTH;
 253                 sch->qstats.drops++;
 254                 sch->qstats.backlog -= len;
 255                 return len;
 256         }
 257
 258         return 0;
 259 }
 260
 261 static int
 262 sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 263 {
 264         struct sfq_sched_data *q = qdisc_priv(sch);
 265         unsigned hash = sfq_hash(q, skb);
 266         sfq_index x;
 267
 268         x = q->ht[hash];
 269         if (x == SFQ_DEPTH) {
 270                 q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
 271                 q->hash[x] = hash;
 272         }
 273         sch->qstats.backlog += skb->len;
 274         __skb_queue_tail(&q->qs[x], skb);
 275         sfq_inc(q, x);
 276         if (q->qs[x].qlen == 1) {               /* The flow is new */
 277                 if (q->tail == SFQ_DEPTH) {     /* It is the first flow */
 278                         q->tail = x;
 279                         q->next[x] = x;
 280                         q->allot[x] = q->quantum;
 281                 } else {
 282                         q->next[x] = q->next[q->tail];
 283                         q->next[q->tail] = x;
 284                         q->tail = x;
 285                 }
 286         }
 287         if (++sch->q.qlen < q->limit-1) {
 288                 sch->bstats.bytes += skb->len;
 289                 sch->bstats.packets++;
 290                 return 0;
 291         }
 292
 293         sfq_drop(sch);
 294         return NET_XMIT_CN;
 295 }
 296
 297 static int
 298 sfq_requeue(struct sk_buff *skb, struct Qdisc* sch)
 299 {
 300         struct sfq_sched_data *q = qdisc_priv(sch);
 301         unsigned hash = sfq_hash(q, skb);
 302         sfq_index x;
 303
 304         x = q->ht[hash];
 305         if (x == SFQ_DEPTH) {
 306                 q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
 307                 q->hash[x] = hash;
 308         }
 309         sch->qstats.backlog += skb->len;
 310         __skb_queue_head(&q->qs[x], skb);
 311         sfq_inc(q, x);
 312         if (q->qs[x].qlen == 1) {               /* The flow is new */
 313                 if (q->tail == SFQ_DEPTH) {     /* It is the first flow */
 314                         q->tail = x;
 315                         q->next[x] = x;
 316                         q->allot[x] = q->quantum;
 317                 } else {
 318                         q->next[x] = q->next[q->tail];
 319                         q->next[q->tail] = x;
 320                         q->tail = x;
 321                 }
 322         }
 323         if (++sch->q.qlen < q->limit - 1) {
 324                 sch->qstats.requeues++;
 325                 return 0;
 326         }
 327
 328         sch->qstats.drops++;
 329         sfq_drop(sch);
 330         return NET_XMIT_CN;
 331 }
 332
 333
 334
 335
 336 static struct sk_buff *
 337 sfq_dequeue(struct Qdisc* sch)
 338 {
 339         struct sfq_sched_data *q = qdisc_priv(sch);
 340         struct sk_buff *skb;
 341         sfq_index a, old_a;
 342
 343         /* No active slots */
 344         if (q->tail == SFQ_DEPTH)
 345                 return NULL;
 346
 347         a = old_a = q->next[q->tail];
 348
 349         /* Grab packet */
 350         skb = __skb_dequeue(&q->qs[a]);
 351         sfq_dec(q, a);
 352         sch->q.qlen--;
 353         sch->qstats.backlog -= skb->len;
 354
 355         /* Is the slot empty? */
 356         if (q->qs[a].qlen == 0) {
 357                 q->ht[q->hash[a]] = SFQ_DEPTH;
 358                 a = q->next[a];
 359                 if (a == old_a) {
 360                         q->tail = SFQ_DEPTH;
 361                         return skb;
 362                 }
 363                 q->next[q->tail] = a;
 364                 q->allot[a] += q->quantum;
 365         } else if ((q->allot[a] -= skb->len) <= 0) {
 366                 q->tail = a;
 367                 a = q->next[a];
 368                 q->allot[a] += q->quantum;
 369         }
 370         return skb;
 371 }
 372
 373 static void
 374 sfq_reset(struct Qdisc* sch)
 375 {
 376         struct sk_buff *skb;
 377
 378         while ((skb = sfq_dequeue(sch)) != NULL)
 379                 kfree_skb(skb);
 380 }
 381
 382 static void sfq_perturbation(unsigned long arg)
 383 {
 384         struct Qdisc *sch = (struct Qdisc*)arg;
 385         struct sfq_sched_data *q = qdisc_priv(sch);
 386
 387         q->perturbation = net_random()&0x1F;
 388
 389         if (q->perturb_period) {
 390                 q->perturb_timer.expires = jiffies + q->perturb_period;
 391                 add_timer(&q->perturb_timer);
 392         }
 393 }
 394
 395 static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
 396 {
 397         struct sfq_sched_data *q = qdisc_priv(sch);
 398         struct tc_sfq_qopt *ctl = RTA_DATA(opt);
 399         unsigned int qlen;
 400
 401         if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
 402                 return -EINVAL;
 403
 404         sch_tree_lock(sch);
 405         q->quantum = ctl->quantum ? : psched_mtu(sch->dev);
 406         q->perturb_period = ctl->perturb_period*HZ;
 407         if (ctl->limit)
 408                 q->limit = min_t(u32, ctl->limit, SFQ_DEPTH);
 409
 410         qlen = sch->q.qlen;
 411         while (sch->q.qlen >= q->limit-1)
 412                 sfq_drop(sch);
 413         qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
 414
 415         del_timer(&q->perturb_timer);
 416         if (q->perturb_period) {
 417                 q->perturb_timer.expires = jiffies + q->perturb_period;
 418                 add_timer(&q->perturb_timer);
 419         }
 420         sch_tree_unlock(sch);
 421         return 0;
 422 }
 423
 424 static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
 425 {
 426         struct sfq_sched_data *q = qdisc_priv(sch);
 427         int i;
 428
 429         init_timer(&q->perturb_timer);
 430         q->perturb_timer.data = (unsigned long)sch;
 431         q->perturb_timer.function = sfq_perturbation;
 432
 433         for (i=0; i<SFQ_HASH_DIVISOR; i++)
 434                 q->ht[i] = SFQ_DEPTH;
 435         for (i=0; i<SFQ_DEPTH; i++) {
 436                 skb_queue_head_init(&q->qs[i]);
 437                 q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH;
 438                 q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH;
 439         }
 440         q->limit = SFQ_DEPTH;
 441         q->max_depth = 0;
 442         q->tail = SFQ_DEPTH;
 443         if (opt == NULL) {
 444                 q->quantum = psched_mtu(sch->dev);
 445                 q->perturb_period = 0;
 446         } else {
 447                 int err = sfq_change(sch, opt);
 448                 if (err)
 449                         return err;
 450         }
 451         for (i=0; i<SFQ_DEPTH; i++)
 452                 sfq_link(q, i);
 453         return 0;
 454 }
 455
 456 static void sfq_destroy(struct Qdisc *sch)
 457 {
 458         struct sfq_sched_data *q = qdisc_priv(sch);
 459         del_timer(&q->perturb_timer);
 460 }
 461
 462 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 463 {
 464         struct sfq_sched_data *q = qdisc_priv(sch);
 465         unsigned char *b = skb_tail_pointer(skb);
 466         struct tc_sfq_qopt opt;
 467
 468         opt.quantum = q->quantum;
 469         opt.perturb_period = q->perturb_period/HZ;
 470
 471         opt.limit = q->limit;
 472         opt.divisor = SFQ_HASH_DIVISOR;
 473         opt.flows = q->limit;
 474
 475         RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
 476
 477         return skb->len;
 478
 479 rtattr_failure:
 480         nlmsg_trim(skb, b);
 481         return -1;
 482 }
 483
 484 static struct Qdisc_ops sfq_qdisc_ops = {
 485         .next           =       NULL,
 486         .cl_ops         =       NULL,
 487         .id             =       "sfq",
 488         .priv_size      =       sizeof(struct sfq_sched_data),
 489         .enqueue        =       sfq_enqueue,
 490         .dequeue        =       sfq_dequeue,
 491         .requeue        =       sfq_requeue,
 492         .drop           =       sfq_drop,
 493         .init           =       sfq_init,
 494         .reset          =       sfq_reset,
 495         .destroy        =       sfq_destroy,
 496         .change         =       NULL,
 497         .dump           =       sfq_dump,
 498         .owner          =       THIS_MODULE,
 499 };
 500
 501 static int __init sfq_module_init(void)
 502 {
 503         return register_qdisc(&sfq_qdisc_ops);
 504 }
 505 static void __exit sfq_module_exit(void)
 506 {
 507         unregister_qdisc(&sfq_qdisc_ops);
 508 }
 509 module_init(sfq_module_init)
 510 module_exit(sfq_module_exit)
 511 MODULE_LICENSE("GPL");