Documentation: move nfsroot.txt to filesystems/
[linux-2.6] / net / ipv4 / inet_lro.c
1 /*
2  *  linux/net/ipv4/inet_lro.c
3  *
4  *  Large Receive Offload (ipv4 / tcp)
5  *
6  *  (C) Copyright IBM Corp. 2007
7  *
8  *  Authors:
9  *       Jan-Bernd Themann <themann@de.ibm.com>
10  *       Christoph Raisch <raisch@de.ibm.com>
11  *
12  *
13  * This program is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2, or (at your option)
16  * any later version.
17  *
18  * This program is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with this program; if not, write to the Free Software
25  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26  */
27
28
29 #include <linux/module.h>
30 #include <linux/if_vlan.h>
31 #include <linux/inet_lro.h>
32
33 MODULE_LICENSE("GPL");
34 MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
35 MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
36
/*
 * Header lengths in bytes: the doff / ihl header fields multiplied by four.
 * Arguments are parenthesized so that any pointer expression may be passed.
 */
#define TCP_HDR_LEN(tcph) ((tcph)->doff << 2)
#define IP_HDR_LEN(iph) ((iph)->ihl << 2)
/* IP total length minus the IP and TCP header lengths */
#define TCP_PAYLOAD_LENGTH(iph, tcph) \
        (ntohs((iph)->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))

/* values compared against iph->ihl and tcph->doff */
#define IPH_LEN_WO_OPTIONS 5
#define TCPH_LEN_WO_OPTIONS 5
#define TCPH_LEN_W_TIMESTAMP 8

/* header length used for page-based frames whose headers were not parsed */
#define LRO_MAX_PG_HLEN 64

/* do { } while (0) keeps the macro a single statement, e.g. in if/else */
#define LRO_INC_STATS(lro_mgr, attr) \
        do { (lro_mgr)->stats.attr++; } while (0)
49
50 /*
51  * Basic tcp checks whether packet is suitable for LRO
52  */
53
54 static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph,
55                             int len, struct net_lro_desc *lro_desc)
56 {
57         /* check ip header: don't aggregate padded frames */
58         if (ntohs(iph->tot_len) != len)
59                 return -1;
60
61         if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0)
62                 return -1;
63
64         if (iph->ihl != IPH_LEN_WO_OPTIONS)
65                 return -1;
66
67         if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack
68             || tcph->rst || tcph->syn || tcph->fin)
69                 return -1;
70
71         if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
72                 return -1;
73
74         if (tcph->doff != TCPH_LEN_WO_OPTIONS
75             && tcph->doff != TCPH_LEN_W_TIMESTAMP)
76                 return -1;
77
78         /* check tcp options (only timestamp allowed) */
79         if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
80                 __be32 *topt = (__be32 *)(tcph + 1);
81
82                 if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
83                                    | (TCPOPT_TIMESTAMP << 8)
84                                    | TCPOLEN_TIMESTAMP))
85                         return -1;
86
87                 /* timestamp should be in right order */
88                 topt++;
89                 if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval),
90                                       ntohl(*topt)))
91                         return -1;
92
93                 /* timestamp reply should not be zero */
94                 topt++;
95                 if (*topt == 0)
96                         return -1;
97         }
98
99         return 0;
100 }
101
102 static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
103 {
104         struct iphdr *iph = lro_desc->iph;
105         struct tcphdr *tcph = lro_desc->tcph;
106         __be32 *p;
107         __wsum tcp_hdr_csum;
108
109         tcph->ack_seq = lro_desc->tcp_ack;
110         tcph->window = lro_desc->tcp_window;
111
112         if (lro_desc->tcp_saw_tstamp) {
113                 p = (__be32 *)(tcph + 1);
114                 *(p+2) = lro_desc->tcp_rcv_tsecr;
115         }
116
117         iph->tot_len = htons(lro_desc->ip_tot_len);
118
119         iph->check = 0;
120         iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
121
122         tcph->check = 0;
123         tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), 0);
124         lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
125         tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
126                                         lro_desc->ip_tot_len -
127                                         IP_HDR_LEN(iph), IPPROTO_TCP,
128                                         lro_desc->data_csum);
129 }
130
131 static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
132 {
133         __wsum tcp_csum;
134         __wsum tcp_hdr_csum;
135         __wsum tcp_ps_hdr_csum;
136
137         tcp_csum = ~csum_unfold(tcph->check);
138         tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), tcp_csum);
139
140         tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
141                                              len + TCP_HDR_LEN(tcph),
142                                              IPPROTO_TCP, 0);
143
144         return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum),
145                         tcp_ps_hdr_csum);
146 }
147
148 static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
149                           struct iphdr *iph, struct tcphdr *tcph,
150                           u16 vlan_tag, struct vlan_group *vgrp)
151 {
152         int nr_frags;
153         __be32 *ptr;
154         u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
155
156         nr_frags = skb_shinfo(skb)->nr_frags;
157         lro_desc->parent = skb;
158         lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]);
159         lro_desc->iph = iph;
160         lro_desc->tcph = tcph;
161         lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
162         lro_desc->tcp_ack = tcph->ack_seq;
163         lro_desc->tcp_window = tcph->window;
164
165         lro_desc->pkt_aggr_cnt = 1;
166         lro_desc->ip_tot_len = ntohs(iph->tot_len);
167
168         if (tcph->doff == 8) {
169                 ptr = (__be32 *)(tcph+1);
170                 lro_desc->tcp_saw_tstamp = 1;
171                 lro_desc->tcp_rcv_tsval = *(ptr+1);
172                 lro_desc->tcp_rcv_tsecr = *(ptr+2);
173         }
174
175         lro_desc->mss = tcp_data_len;
176         lro_desc->vgrp = vgrp;
177         lro_desc->vlan_tag = vlan_tag;
178         lro_desc->active = 1;
179
180         lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
181                                                 tcp_data_len);
182 }
183
184 static inline void lro_clear_desc(struct net_lro_desc *lro_desc)
185 {
186         memset(lro_desc, 0, sizeof(struct net_lro_desc));
187 }
188
189 static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph,
190                            struct tcphdr *tcph, int tcp_data_len)
191 {
192         struct sk_buff *parent = lro_desc->parent;
193         __be32 *topt;
194
195         lro_desc->pkt_aggr_cnt++;
196         lro_desc->ip_tot_len += tcp_data_len;
197         lro_desc->tcp_next_seq += tcp_data_len;
198         lro_desc->tcp_window = tcph->window;
199         lro_desc->tcp_ack = tcph->ack_seq;
200
201         /* don't update tcp_rcv_tsval, would not work with PAWS */
202         if (lro_desc->tcp_saw_tstamp) {
203                 topt = (__be32 *) (tcph + 1);
204                 lro_desc->tcp_rcv_tsecr = *(topt + 2);
205         }
206
207         lro_desc->data_csum = csum_block_add(lro_desc->data_csum,
208                                              lro_tcp_data_csum(iph, tcph,
209                                                                tcp_data_len),
210                                              parent->len);
211
212         parent->len += tcp_data_len;
213         parent->data_len += tcp_data_len;
214         if (tcp_data_len > lro_desc->mss)
215                 lro_desc->mss = tcp_data_len;
216 }
217
218 static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
219                            struct iphdr *iph, struct tcphdr *tcph)
220 {
221         struct sk_buff *parent = lro_desc->parent;
222         int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
223
224         lro_add_common(lro_desc, iph, tcph, tcp_data_len);
225
226         skb_pull(skb, (skb->len - tcp_data_len));
227         parent->truesize += skb->truesize;
228
229         if (lro_desc->last_skb)
230                 lro_desc->last_skb->next = skb;
231         else
232                 skb_shinfo(parent)->frag_list = skb;
233
234         lro_desc->last_skb = skb;
235 }
236
237 static void lro_add_frags(struct net_lro_desc *lro_desc,
238                           int len, int hlen, int truesize,
239                           struct skb_frag_struct *skb_frags,
240                           struct iphdr *iph, struct tcphdr *tcph)
241 {
242         struct sk_buff *skb = lro_desc->parent;
243         int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
244
245         lro_add_common(lro_desc, iph, tcph, tcp_data_len);
246
247         skb->truesize += truesize;
248
249         skb_frags[0].page_offset += hlen;
250         skb_frags[0].size -= hlen;
251
252         while (tcp_data_len > 0) {
253                 *(lro_desc->next_frag) = *skb_frags;
254                 tcp_data_len -= skb_frags->size;
255                 lro_desc->next_frag++;
256                 skb_frags++;
257                 skb_shinfo(skb)->nr_frags++;
258         }
259 }
260
261 static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
262                               struct iphdr *iph,
263                               struct tcphdr *tcph)
264 {
265         if ((lro_desc->iph->saddr != iph->saddr)
266             || (lro_desc->iph->daddr != iph->daddr)
267             || (lro_desc->tcph->source != tcph->source)
268             || (lro_desc->tcph->dest != tcph->dest))
269                 return -1;
270         return 0;
271 }
272
273 static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr,
274                                          struct net_lro_desc *lro_arr,
275                                          struct iphdr *iph,
276                                          struct tcphdr *tcph)
277 {
278         struct net_lro_desc *lro_desc = NULL;
279         struct net_lro_desc *tmp;
280         int max_desc = lro_mgr->max_desc;
281         int i;
282
283         for (i = 0; i < max_desc; i++) {
284                 tmp = &lro_arr[i];
285                 if (tmp->active)
286                         if (!lro_check_tcp_conn(tmp, iph, tcph)) {
287                                 lro_desc = tmp;
288                                 goto out;
289                         }
290         }
291
292         for (i = 0; i < max_desc; i++) {
293                 if (!lro_arr[i].active) {
294                         lro_desc = &lro_arr[i];
295                         goto out;
296                 }
297         }
298
299         LRO_INC_STATS(lro_mgr, no_desc);
300 out:
301         return lro_desc;
302 }
303
304 static void lro_flush(struct net_lro_mgr *lro_mgr,
305                       struct net_lro_desc *lro_desc)
306 {
307         if (lro_desc->pkt_aggr_cnt > 1)
308                 lro_update_tcp_ip_header(lro_desc);
309
310         skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
311
312         if (lro_desc->vgrp) {
313                 if (lro_mgr->features & LRO_F_NAPI)
314                         vlan_hwaccel_receive_skb(lro_desc->parent,
315                                                  lro_desc->vgrp,
316                                                  lro_desc->vlan_tag);
317                 else
318                         vlan_hwaccel_rx(lro_desc->parent,
319                                         lro_desc->vgrp,
320                                         lro_desc->vlan_tag);
321
322         } else {
323                 if (lro_mgr->features & LRO_F_NAPI)
324                         netif_receive_skb(lro_desc->parent);
325                 else
326                         netif_rx(lro_desc->parent);
327         }
328
329         LRO_INC_STATS(lro_mgr, flushed);
330         lro_clear_desc(lro_desc);
331 }
332
333 static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
334                           struct vlan_group *vgrp, u16 vlan_tag, void *priv)
335 {
336         struct net_lro_desc *lro_desc;
337         struct iphdr *iph;
338         struct tcphdr *tcph;
339         u64 flags;
340         int vlan_hdr_len = 0;
341
342         if (!lro_mgr->get_skb_header
343             || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
344                                        &flags, priv))
345                 goto out;
346
347         if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
348                 goto out;
349
350         lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
351         if (!lro_desc)
352                 goto out;
353
354         if ((skb->protocol == htons(ETH_P_8021Q))
355             && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
356                 vlan_hdr_len = VLAN_HLEN;
357
358         if (!lro_desc->active) { /* start new lro session */
359                 if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL))
360                         goto out;
361
362                 skb->ip_summed = lro_mgr->ip_summed_aggr;
363                 lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp);
364                 LRO_INC_STATS(lro_mgr, aggregated);
365                 return 0;
366         }
367
368         if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
369                 goto out2;
370
371         if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc))
372                 goto out2;
373
374         lro_add_packet(lro_desc, skb, iph, tcph);
375         LRO_INC_STATS(lro_mgr, aggregated);
376
377         if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) ||
378             lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
379                 lro_flush(lro_mgr, lro_desc);
380
381         return 0;
382
383 out2: /* send aggregated SKBs to stack */
384         lro_flush(lro_mgr, lro_desc);
385
386 out:  /* Original SKB has to be posted to stack */
387         skb->ip_summed = lro_mgr->ip_summed;
388         return 1;
389 }
390
391
392 static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr,
393                                    struct skb_frag_struct *frags,
394                                    int len, int true_size,
395                                    void *mac_hdr,
396                                    int hlen, __wsum sum,
397                                    u32 ip_summed)
398 {
399         struct sk_buff *skb;
400         struct skb_frag_struct *skb_frags;
401         int data_len = len;
402         int hdr_len = min(len, hlen);
403
404         skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad);
405         if (!skb)
406                 return NULL;
407
408         skb_reserve(skb, lro_mgr->frag_align_pad);
409         skb->len = len;
410         skb->data_len = len - hdr_len;
411         skb->truesize += true_size;
412         skb->tail += hdr_len;
413
414         memcpy(skb->data, mac_hdr, hdr_len);
415
416         skb_frags = skb_shinfo(skb)->frags;
417         while (data_len > 0) {
418                 *skb_frags = *frags;
419                 data_len -= frags->size;
420                 skb_frags++;
421                 frags++;
422                 skb_shinfo(skb)->nr_frags++;
423         }
424
425         skb_shinfo(skb)->frags[0].page_offset += hdr_len;
426         skb_shinfo(skb)->frags[0].size -= hdr_len;
427
428         skb->ip_summed = ip_summed;
429         skb->csum = sum;
430         skb->protocol = eth_type_trans(skb, lro_mgr->dev);
431         return skb;
432 }
433
434 static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
435                                           struct skb_frag_struct *frags,
436                                           int len, int true_size,
437                                           struct vlan_group *vgrp,
438                                           u16 vlan_tag, void *priv, __wsum sum)
439 {
440         struct net_lro_desc *lro_desc;
441         struct iphdr *iph;
442         struct tcphdr *tcph;
443         struct sk_buff *skb;
444         u64 flags;
445         void *mac_hdr;
446         int mac_hdr_len;
447         int hdr_len = LRO_MAX_PG_HLEN;
448         int vlan_hdr_len = 0;
449
450         if (!lro_mgr->get_frag_header
451             || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
452                                         (void *)&tcph, &flags, priv)) {
453                 mac_hdr = page_address(frags->page) + frags->page_offset;
454                 goto out1;
455         }
456
457         if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
458                 goto out1;
459
460         hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr);
461         mac_hdr_len = (int)((void *)(iph) - mac_hdr);
462
463         lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
464         if (!lro_desc)
465                 goto out1;
466
467         if (!lro_desc->active) { /* start new lro session */
468                 if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL))
469                         goto out1;
470
471                 skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
472                                   hdr_len, 0, lro_mgr->ip_summed_aggr);
473                 if (!skb)
474                         goto out;
475
476                 if ((skb->protocol == htons(ETH_P_8021Q))
477                     && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
478                         vlan_hdr_len = VLAN_HLEN;
479
480                 iph = (void *)(skb->data + vlan_hdr_len);
481                 tcph = (void *)((u8 *)skb->data + vlan_hdr_len
482                                 + IP_HDR_LEN(iph));
483
484                 lro_init_desc(lro_desc, skb, iph, tcph, 0, NULL);
485                 LRO_INC_STATS(lro_mgr, aggregated);
486                 return NULL;
487         }
488
489         if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
490                 goto out2;
491
492         if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc))
493                 goto out2;
494
495         lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph);
496         LRO_INC_STATS(lro_mgr, aggregated);
497
498         if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) ||
499             lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
500                 lro_flush(lro_mgr, lro_desc);
501
502         return NULL;
503
504 out2: /* send aggregated packets to the stack */
505         lro_flush(lro_mgr, lro_desc);
506
507 out1:  /* Original packet has to be posted to the stack */
508         skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
509                           hdr_len, sum, lro_mgr->ip_summed);
510 out:
511         return skb;
512 }
513
514 void lro_receive_skb(struct net_lro_mgr *lro_mgr,
515                      struct sk_buff *skb,
516                      void *priv)
517 {
518         if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) {
519                 if (lro_mgr->features & LRO_F_NAPI)
520                         netif_receive_skb(skb);
521                 else
522                         netif_rx(skb);
523         }
524 }
525 EXPORT_SYMBOL(lro_receive_skb);
526
527 void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
528                                   struct sk_buff *skb,
529                                   struct vlan_group *vgrp,
530                                   u16 vlan_tag,
531                                   void *priv)
532 {
533         if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) {
534                 if (lro_mgr->features & LRO_F_NAPI)
535                         vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
536                 else
537                         vlan_hwaccel_rx(skb, vgrp, vlan_tag);
538         }
539 }
540 EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb);
541
542 void lro_receive_frags(struct net_lro_mgr *lro_mgr,
543                        struct skb_frag_struct *frags,
544                        int len, int true_size, void *priv, __wsum sum)
545 {
546         struct sk_buff *skb;
547
548         skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0,
549                                  priv, sum);
550         if (!skb)
551                 return;
552
553         if (lro_mgr->features & LRO_F_NAPI)
554                 netif_receive_skb(skb);
555         else
556                 netif_rx(skb);
557 }
558 EXPORT_SYMBOL(lro_receive_frags);
559
560 void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
561                                     struct skb_frag_struct *frags,
562                                     int len, int true_size,
563                                     struct vlan_group *vgrp,
564                                     u16 vlan_tag, void *priv, __wsum sum)
565 {
566         struct sk_buff *skb;
567
568         skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp,
569                                  vlan_tag, priv, sum);
570         if (!skb)
571                 return;
572
573         if (lro_mgr->features & LRO_F_NAPI)
574                 vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
575         else
576                 vlan_hwaccel_rx(skb, vgrp, vlan_tag);
577 }
578 EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags);
579
580 void lro_flush_all(struct net_lro_mgr *lro_mgr)
581 {
582         int i;
583         struct net_lro_desc *lro_desc = lro_mgr->lro_arr;
584
585         for (i = 0; i < lro_mgr->max_desc; i++) {
586                 if (lro_desc[i].active)
587                         lro_flush(lro_mgr, &lro_desc[i]);
588         }
589 }
590 EXPORT_SYMBOL(lro_flush_all);
591
592 void lro_flush_pkt(struct net_lro_mgr *lro_mgr,
593                   struct iphdr *iph, struct tcphdr *tcph)
594 {
595         struct net_lro_desc *lro_desc;
596
597         lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
598         if (lro_desc->active)
599                 lro_flush(lro_mgr, lro_desc);
600 }
601 EXPORT_SYMBOL(lro_flush_pkt);