IB/ipath: Fix RDMA reads
drivers/infiniband/hw/ipath/ipath_rc.c
/*
 * Copyright (c) 2006 QLogic, Inc. All rights reserved.
 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ipath_verbs.h"
#include "ipath_kernel.h"

/* cut down ridiculously long IB macro names */
#define OP(x) IB_OPCODE_RC_##x

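/*
 * For example, OP(RDMA_READ_REQUEST) expands to
 * IB_OPCODE_RC_RDMA_READ_REQUEST.
 */
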
/**
 * ipath_init_restart - initialize the qp->s_sge after a restart
 * @qp: the QP whose SGE we're restarting
 * @wqe: the work queue to initialize the QP's SGE from
 *
 * The QP s_lock should be held and interrupts disabled.
 */
static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
{
        struct ipath_ibdev *dev;
        u32 len;

        len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) *
                ib_mtu_enum_to_int(qp->path_mtu);
        qp->s_sge.sge = wqe->sg_list[0];
        qp->s_sge.sg_list = wqe->sg_list + 1;
        qp->s_sge.num_sge = wqe->wr.num_sge;
        ipath_skip_sge(&qp->s_sge, len);
        qp->s_len = wqe->length - len;
        dev = to_idev(qp->ibqp.device);
        spin_lock(&dev->pending_lock);
        if (list_empty(&qp->timerwait))
                list_add_tail(&qp->timerwait,
                              &dev->pending[dev->pending_index]);
        spin_unlock(&dev->pending_lock);
}
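
/*
 * Illustrative example: if qp->s_psn is three PSNs past wqe->psn and the
 * path MTU is 2048 bytes, ipath_init_restart() above skips 3 * 2048 =
 * 6144 bytes of the SGE list and leaves qp->s_len = wqe->length - 6144
 * still to be (re)sent.
 */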

/**
 * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @pmtu: the path MTU
 *
 * Return bth0 if constructed; otherwise, return 0.
 * Note the QP s_lock must be held.
 */
u32 ipath_make_rc_ack(struct ipath_qp *qp,
                      struct ipath_other_headers *ohdr,
                      u32 pmtu)
{
        u32 hwords;
        u32 len;
        u32 bth0;

        /* header size in 32-bit words LRH+BTH = (8+12)/4. */
        hwords = 5;

        /*
         * Send a response.  Note that we are in the responder's
         * side of the QP context.
         */
        switch (qp->s_ack_state) {
        case OP(RDMA_READ_REQUEST):
                qp->s_cur_sge = &qp->s_rdma_sge;
                len = qp->s_rdma_len;
                if (len > pmtu) {
                        len = pmtu;
                        qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
                } else
                        qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
                qp->s_rdma_len -= len;
                bth0 = qp->s_ack_state << 24;
                ohdr->u.aeth = ipath_compute_aeth(qp);
                hwords++;
                break;

        case OP(RDMA_READ_RESPONSE_FIRST):
                qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
                /* FALLTHROUGH */
        case OP(RDMA_READ_RESPONSE_MIDDLE):
                qp->s_cur_sge = &qp->s_rdma_sge;
                len = qp->s_rdma_len;
                if (len > pmtu)
                        len = pmtu;
                else {
                        ohdr->u.aeth = ipath_compute_aeth(qp);
                        hwords++;
                        qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
                }
                qp->s_rdma_len -= len;
                bth0 = qp->s_ack_state << 24;
                break;

        case OP(RDMA_READ_RESPONSE_LAST):
        case OP(RDMA_READ_RESPONSE_ONLY):
                /*
                 * We have to prevent new requests from changing
                 * the r_sge state while an ipath_verbs_send()
                 * is in progress.
                 */
                qp->s_ack_state = OP(ACKNOWLEDGE);
                bth0 = 0;
                goto bail;

        case OP(COMPARE_SWAP):
        case OP(FETCH_ADD):
                qp->s_cur_sge = NULL;
                len = 0;
                /*
                 * Set the s_ack_state so the receive interrupt handler
                 * won't try to send an ACK (out of order) until this one
                 * is actually sent.
                 */
                qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
                bth0 = OP(ATOMIC_ACKNOWLEDGE) << 24;
                ohdr->u.at.aeth = ipath_compute_aeth(qp);
                ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->r_atomic_data);
                hwords += sizeof(ohdr->u.at) / 4;
                break;

        default:
                /* Send a regular ACK. */
                qp->s_cur_sge = NULL;
                len = 0;
                /*
                 * Set the s_ack_state so the receive interrupt handler
                 * won't try to send an ACK (out of order) until this one
                 * is actually sent.
                 */
                qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
                bth0 = OP(ACKNOWLEDGE) << 24;
                if (qp->s_nak_state)
                        ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
                                                    (qp->s_nak_state <<
                                                     IPATH_AETH_CREDIT_SHIFT));
                else
                        ohdr->u.aeth = ipath_compute_aeth(qp);
                hwords++;
        }
        qp->s_hdrwords = hwords;
        qp->s_cur_size = len;

bail:
        return bth0;
}
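
/*
 * Worked example (illustrative): an RDMA read of 4096 bytes with a
 * 2048-byte pmtu enters the switch above in OP(RDMA_READ_REQUEST) state.
 * The first call sends 2048 bytes as RDMA_READ_RESPONSE_FIRST (with an
 * AETH); the second call falls through FIRST -> MIDDLE, finds the
 * remaining 2048 bytes <= pmtu, appends the AETH, and sends
 * RDMA_READ_RESPONSE_LAST.
 */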

/**
 * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @pmtu: the path MTU
 * @bth0p: pointer to the BTH opcode word
 * @bth2p: pointer to the BTH PSN word
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note the QP s_lock must be held and interrupts disabled.
 */
int ipath_make_rc_req(struct ipath_qp *qp,
                      struct ipath_other_headers *ohdr,
                      u32 pmtu, u32 *bth0p, u32 *bth2p)
{
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
        struct ipath_sge_state *ss;
        struct ipath_swqe *wqe;
        u32 hwords;
        u32 len;
        u32 bth0;
        u32 bth2;
        char newreq;

        if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
            qp->s_rnr_timeout)
                goto done;

        /* Limit the number of packets sent without an ACK. */
        if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT) > 0) {
                qp->s_wait_credit = 1;
                dev->n_rc_stalls++;
                spin_lock(&dev->pending_lock);
                if (list_empty(&qp->timerwait))
                        list_add_tail(&qp->timerwait,
                                      &dev->pending[dev->pending_index]);
                spin_unlock(&dev->pending_lock);
                goto done;
        }

        /* header size in 32-bit words LRH+BTH = (8+12)/4. */
        hwords = 5;
        bth0 = 0;

        /* Send a request. */
        wqe = get_swqe_ptr(qp, qp->s_cur);
        switch (qp->s_state) {
        default:
                /*
                 * Resend an old request or start a new one.
                 *
                 * We keep track of the current SWQE so that
                 * we don't reset the "furthest progress" state
                 * if we need to back up.
                 */
                newreq = 0;
                if (qp->s_cur == qp->s_tail) {
                        /* Check if send work queue is empty. */
                        if (qp->s_tail == qp->s_head)
                                goto done;
                        wqe->psn = qp->s_next_psn;
                        newreq = 1;
                }
                /*
                 * Note that we have to be careful not to modify the
                 * original work request since we may need to resend
                 * it.
                 */
                len = wqe->length;
                ss = &qp->s_sge;
                bth2 = 0;
                switch (wqe->wr.opcode) {
                case IB_WR_SEND:
                case IB_WR_SEND_WITH_IMM:
                        /* If no credit, return. */
                        if (qp->s_lsn != (u32) -1 &&
                            ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
                                goto done;
                        wqe->lpsn = wqe->psn;
                        if (len > pmtu) {
                                wqe->lpsn += (len - 1) / pmtu;
                                qp->s_state = OP(SEND_FIRST);
                                len = pmtu;
                                break;
                        }
                        if (wqe->wr.opcode == IB_WR_SEND)
                                qp->s_state = OP(SEND_ONLY);
                        else {
                                qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
                                /* Immediate data comes after the BTH */
                                ohdr->u.imm_data = wqe->wr.imm_data;
                                hwords += 1;
                        }
                        if (wqe->wr.send_flags & IB_SEND_SOLICITED)
                                bth0 |= 1 << 23;
                        bth2 = 1 << 31; /* Request ACK. */
                        if (++qp->s_cur == qp->s_size)
                                qp->s_cur = 0;
                        break;

                case IB_WR_RDMA_WRITE:
                        if (newreq && qp->s_lsn != (u32) -1)
                                qp->s_lsn++;
                        /* FALLTHROUGH */
                case IB_WR_RDMA_WRITE_WITH_IMM:
                        /* If no credit, return. */
                        if (qp->s_lsn != (u32) -1 &&
                            ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
                                goto done;
                        ohdr->u.rc.reth.vaddr =
                                cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
                        ohdr->u.rc.reth.rkey =
                                cpu_to_be32(wqe->wr.wr.rdma.rkey);
                        ohdr->u.rc.reth.length = cpu_to_be32(len);
                        hwords += sizeof(struct ib_reth) / 4;
                        wqe->lpsn = wqe->psn;
                        if (len > pmtu) {
                                wqe->lpsn += (len - 1) / pmtu;
                                qp->s_state = OP(RDMA_WRITE_FIRST);
                                len = pmtu;
                                break;
                        }
                        if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
                                qp->s_state = OP(RDMA_WRITE_ONLY);
                        else {
                                qp->s_state =
                                        OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
                                /* Immediate data comes after RETH */
                                ohdr->u.rc.imm_data = wqe->wr.imm_data;
                                hwords += 1;
                                if (wqe->wr.send_flags & IB_SEND_SOLICITED)
                                        bth0 |= 1 << 23;
                        }
                        bth2 = 1 << 31; /* Request ACK. */
                        if (++qp->s_cur == qp->s_size)
                                qp->s_cur = 0;
                        break;

                case IB_WR_RDMA_READ:
                        ohdr->u.rc.reth.vaddr =
                                cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
                        ohdr->u.rc.reth.rkey =
                                cpu_to_be32(wqe->wr.wr.rdma.rkey);
                        ohdr->u.rc.reth.length = cpu_to_be32(len);
                        qp->s_state = OP(RDMA_READ_REQUEST);
                        hwords += sizeof(ohdr->u.rc.reth) / 4;
                        if (newreq) {
                                if (qp->s_lsn != (u32) -1)
                                        qp->s_lsn++;
                                /*
                                 * Adjust s_next_psn to count the
                                 * expected number of responses.
                                 */
                                if (len > pmtu)
                                        qp->s_next_psn += (len - 1) / pmtu;
                                wqe->lpsn = qp->s_next_psn++;
                        }
                        ss = NULL;
                        len = 0;
                        if (++qp->s_cur == qp->s_size)
                                qp->s_cur = 0;
                        break;

                case IB_WR_ATOMIC_CMP_AND_SWP:
                case IB_WR_ATOMIC_FETCH_AND_ADD:
                        if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP)
                                qp->s_state = OP(COMPARE_SWAP);
                        else
                                qp->s_state = OP(FETCH_ADD);
                        ohdr->u.atomic_eth.vaddr = cpu_to_be64(
                                wqe->wr.wr.atomic.remote_addr);
                        ohdr->u.atomic_eth.rkey = cpu_to_be32(
                                wqe->wr.wr.atomic.rkey);
                        ohdr->u.atomic_eth.swap_data = cpu_to_be64(
                                wqe->wr.wr.atomic.swap);
                        ohdr->u.atomic_eth.compare_data = cpu_to_be64(
                                wqe->wr.wr.atomic.compare_add);
                        hwords += sizeof(struct ib_atomic_eth) / 4;
                        if (newreq) {
                                if (qp->s_lsn != (u32) -1)
                                        qp->s_lsn++;
                                wqe->lpsn = wqe->psn;
                        }
                        if (++qp->s_cur == qp->s_size)
                                qp->s_cur = 0;
                        ss = NULL;
                        len = 0;
                        break;

                default:
                        goto done;
                }
                qp->s_sge.sge = wqe->sg_list[0];
                qp->s_sge.sg_list = wqe->sg_list + 1;
                qp->s_sge.num_sge = wqe->wr.num_sge;
                qp->s_len = wqe->length;
                if (newreq) {
                        qp->s_tail++;
                        if (qp->s_tail >= qp->s_size)
                                qp->s_tail = 0;
                }
                bth2 |= qp->s_psn & IPATH_PSN_MASK;
                if (wqe->wr.opcode == IB_WR_RDMA_READ)
                        qp->s_psn = wqe->lpsn + 1;
                else {
                        qp->s_psn++;
                        if ((int)(qp->s_psn - qp->s_next_psn) > 0)
                                qp->s_next_psn = qp->s_psn;
                }
                /*
                 * Put the QP on the pending list so lost ACKs will cause
                 * a retry.  More than one request can be pending so the
                 * QP may already be on the dev->pending list.
                 */
                spin_lock(&dev->pending_lock);
                if (list_empty(&qp->timerwait))
                        list_add_tail(&qp->timerwait,
                                      &dev->pending[dev->pending_index]);
                spin_unlock(&dev->pending_lock);
                break;

        case OP(RDMA_READ_RESPONSE_FIRST):
                /*
                 * This case can only happen if a send is restarted.
                 * See ipath_restart_rc().
                 */
                ipath_init_restart(qp, wqe);
                /* FALLTHROUGH */
        case OP(SEND_FIRST):
                qp->s_state = OP(SEND_MIDDLE);
                /* FALLTHROUGH */
        case OP(SEND_MIDDLE):
                bth2 = qp->s_psn++ & IPATH_PSN_MASK;
                if ((int)(qp->s_psn - qp->s_next_psn) > 0)
                        qp->s_next_psn = qp->s_psn;
                ss = &qp->s_sge;
                len = qp->s_len;
                if (len > pmtu) {
                        len = pmtu;
                        break;
                }
                if (wqe->wr.opcode == IB_WR_SEND)
                        qp->s_state = OP(SEND_LAST);
                else {
                        qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
                        /* Immediate data comes after the BTH */
                        ohdr->u.imm_data = wqe->wr.imm_data;
                        hwords += 1;
                }
                if (wqe->wr.send_flags & IB_SEND_SOLICITED)
                        bth0 |= 1 << 23;
                bth2 |= 1 << 31;        /* Request ACK. */
                qp->s_cur++;
                if (qp->s_cur >= qp->s_size)
                        qp->s_cur = 0;
                break;

        case OP(RDMA_READ_RESPONSE_LAST):
                /*
                 * This case can only happen if an RDMA write is restarted.
                 * See ipath_restart_rc().
                 */
                ipath_init_restart(qp, wqe);
                /* FALLTHROUGH */
        case OP(RDMA_WRITE_FIRST):
                qp->s_state = OP(RDMA_WRITE_MIDDLE);
                /* FALLTHROUGH */
        case OP(RDMA_WRITE_MIDDLE):
                bth2 = qp->s_psn++ & IPATH_PSN_MASK;
                if ((int)(qp->s_psn - qp->s_next_psn) > 0)
                        qp->s_next_psn = qp->s_psn;
                ss = &qp->s_sge;
                len = qp->s_len;
                if (len > pmtu) {
                        len = pmtu;
                        break;
                }
                if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
                        qp->s_state = OP(RDMA_WRITE_LAST);
                else {
                        qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
                        /* Immediate data comes after the BTH */
                        ohdr->u.imm_data = wqe->wr.imm_data;
                        hwords += 1;
                        if (wqe->wr.send_flags & IB_SEND_SOLICITED)
                                bth0 |= 1 << 23;
                }
                bth2 |= 1 << 31;        /* Request ACK. */
                qp->s_cur++;
                if (qp->s_cur >= qp->s_size)
                        qp->s_cur = 0;
                break;

        case OP(RDMA_READ_RESPONSE_MIDDLE):
                /*
                 * This case can only happen if an RDMA read is restarted.
                 * See ipath_restart_rc().
                 */
                ipath_init_restart(qp, wqe);
                len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
                ohdr->u.rc.reth.vaddr =
                        cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
                ohdr->u.rc.reth.rkey =
                        cpu_to_be32(wqe->wr.wr.rdma.rkey);
                ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
                qp->s_state = OP(RDMA_READ_REQUEST);
                hwords += sizeof(ohdr->u.rc.reth) / 4;
                bth2 = qp->s_psn++ & IPATH_PSN_MASK;
                if ((int)(qp->s_psn - qp->s_next_psn) > 0)
                        qp->s_next_psn = qp->s_psn;
                ss = NULL;
                len = 0;
                qp->s_cur++;
                if (qp->s_cur == qp->s_size)
                        qp->s_cur = 0;
                break;

        case OP(RDMA_READ_REQUEST):
        case OP(COMPARE_SWAP):
        case OP(FETCH_ADD):
                /*
                 * We shouldn't start anything new until this request is
                 * finished.  The ACK will handle rescheduling us.  XXX The
                 * number of outstanding ones is negotiated at connection
                 * setup time (see pg. 258,289)?  XXX Also, if we support
                 * multiple outstanding requests, we need to check the WQE
                 * IB_SEND_FENCE flag and not send a new request if an RDMA
                 * read or atomic is pending.
                 */
                goto done;
        }
        if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
                bth2 |= 1 << 31;        /* Request ACK. */
        qp->s_len -= len;
        qp->s_hdrwords = hwords;
        qp->s_cur_sge = ss;
        qp->s_cur_size = len;
        *bth0p = bth0 | (qp->s_state << 24);
        *bth2p = bth2;
        return 1;

done:
        return 0;
}
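
/*
 * PSN accounting sketch (illustrative): for a new 8192-byte RDMA read
 * with a 2048-byte pmtu, the IB_WR_RDMA_READ case above advances
 * qp->s_next_psn by (8192 - 1) / 2048 = 3 and then assigns
 * wqe->lpsn = qp->s_next_psn++, reserving one PSN for each of the four
 * expected response packets.
 */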

/**
 * send_rc_ack - Construct an ACK packet and send it
 * @qp: a pointer to the QP
 *
 * This is called from ipath_rc_rcv() and only uses the receive
 * side QP state.
 * Note that RDMA reads are handled in the send side QP state and tasklet.
 */
static void send_rc_ack(struct ipath_qp *qp)
{
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
        u16 lrh0;
        u32 bth0;
        u32 hwords;
        struct ipath_ib_header hdr;
        struct ipath_other_headers *ohdr;

        /* Construct the header. */
        ohdr = &hdr.u.oth;
        lrh0 = IPATH_LRH_BTH;
        /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
        hwords = 6;
        if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
                hwords += ipath_make_grh(dev, &hdr.u.l.grh,
                                         &qp->remote_ah_attr.grh,
                                         hwords, 0);
                ohdr = &hdr.u.l.oth;
                lrh0 = IPATH_LRH_GRH;
        }
        /* read pkey_index w/o lock (it's atomic) */
        bth0 = ipath_get_pkey(dev->dd, qp->s_pkey_index);
        if (qp->r_nak_state)
                ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
                                            (qp->r_nak_state <<
                                             IPATH_AETH_CREDIT_SHIFT));
        else
                ohdr->u.aeth = ipath_compute_aeth(qp);
        if (qp->r_ack_state >= OP(COMPARE_SWAP)) {
                bth0 |= OP(ATOMIC_ACKNOWLEDGE) << 24;
                ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->r_atomic_data);
                hwords += sizeof(ohdr->u.at.atomic_ack_eth) / 4;
        } else
                bth0 |= OP(ACKNOWLEDGE) << 24;
        lrh0 |= qp->remote_ah_attr.sl << 4;
        hdr.lrh[0] = cpu_to_be16(lrh0);
        hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
        hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
        hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid);
        ohdr->bth[0] = cpu_to_be32(bth0);
        ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
        ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);

        /*
         * If we can send the ACK, clear the ACK state.
         */
        if (ipath_verbs_send(dev->dd, hwords, (u32 *) &hdr, 0, NULL) == 0) {
                qp->r_ack_state = OP(ACKNOWLEDGE);
                dev->n_unicast_xmit++;
        } else {
                /*
                 * We are out of PIO buffers at the moment.
                 * Pass responsibility for sending the ACK to the
                 * send tasklet so that when a PIO buffer becomes
                 * available, the ACK is sent ahead of other outgoing
                 * packets.
                 */
                dev->n_rc_qacks++;
                spin_lock_irq(&qp->s_lock);
                /* Don't coalesce if an RDMA read or atomic is pending. */
                if (qp->s_ack_state == OP(ACKNOWLEDGE) ||
                    qp->s_ack_state < OP(RDMA_READ_REQUEST)) {
                        qp->s_ack_state = qp->r_ack_state;
                        qp->s_nak_state = qp->r_nak_state;
                        qp->s_ack_psn = qp->r_ack_psn;
                        qp->r_ack_state = OP(ACKNOWLEDGE);
                }
                spin_unlock_irq(&qp->s_lock);

                /* Call ipath_do_rc_send() in another thread. */
                tasklet_hi_schedule(&qp->s_task);
        }
}
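
/*
 * Header-size note (illustrative): hwords = 6 in send_rc_ack() covers
 * LRH (8 bytes) + BTH (12) + AETH (4) = 24 bytes = 6 32-bit words; an
 * ATOMIC_ACKNOWLEDGE appends the 8-byte atomic_ack_eth, i.e. two more
 * words.
 */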

/**
 * reset_psn - reset the QP state to send starting from PSN
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 *
 * This is called from ipath_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 */
static void reset_psn(struct ipath_qp *qp, u32 psn)
{
        u32 n = qp->s_last;
        struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
        u32 opcode;

        qp->s_cur = n;

        /*
         * If we are starting the request from the beginning,
         * let the normal send code handle initialization.
         */
        if (ipath_cmp24(psn, wqe->psn) <= 0) {
                qp->s_state = OP(SEND_LAST);
                goto done;
        }

        /* Find the work request opcode corresponding to the given PSN. */
        opcode = wqe->wr.opcode;
        for (;;) {
                int diff;

                if (++n == qp->s_size)
                        n = 0;
                if (n == qp->s_tail)
                        break;
                wqe = get_swqe_ptr(qp, n);
                diff = ipath_cmp24(psn, wqe->psn);
                if (diff < 0)
                        break;
                qp->s_cur = n;
                /*
                 * If we are starting the request from the beginning,
                 * let the normal send code handle initialization.
                 */
                if (diff == 0) {
                        qp->s_state = OP(SEND_LAST);
                        goto done;
                }
                opcode = wqe->wr.opcode;
        }

        /*
         * Set the state to restart in the middle of a request.
         * Don't change the s_sge, s_cur_sge, or s_cur_size.
         * See ipath_do_rc_send().
         */
        switch (opcode) {
        case IB_WR_SEND:
        case IB_WR_SEND_WITH_IMM:
                qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
                break;

        case IB_WR_RDMA_WRITE:
        case IB_WR_RDMA_WRITE_WITH_IMM:
                qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
                break;

        case IB_WR_RDMA_READ:
                qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
                break;

        default:
                /*
                 * This case shouldn't happen since there is only
                 * one PSN per request.
                 */
                qp->s_state = OP(SEND_LAST);
        }
done:
        qp->s_psn = psn;
}
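
/*
 * Note: the RDMA_READ_RESPONSE_* values assigned above are never put on
 * the wire here; they are reused as internal "restart in the middle"
 * markers that steer ipath_make_rc_req() into its corresponding restart
 * cases, each of which calls ipath_init_restart() to rebuild qp->s_sge.
 */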

/**
 * ipath_restart_rc - back up requester to resend the last un-ACKed request
 * @qp: the QP to restart
 * @psn: packet sequence number for the request
 * @wc: the work completion request
 *
 * The QP s_lock should be held and interrupts disabled.
 */
void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
{
        struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
        struct ipath_ibdev *dev;

        if (qp->s_retry == 0) {
                wc->wr_id = wqe->wr.wr_id;
                wc->status = IB_WC_RETRY_EXC_ERR;
                wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
                wc->vendor_err = 0;
                wc->byte_len = 0;
                wc->qp_num = qp->ibqp.qp_num;
                wc->src_qp = qp->remote_qpn;
                wc->pkey_index = 0;
                wc->slid = qp->remote_ah_attr.dlid;
                wc->sl = qp->remote_ah_attr.sl;
                wc->dlid_path_bits = 0;
                wc->port_num = 0;
                ipath_sqerror_qp(qp, wc);
                goto bail;
        }
        qp->s_retry--;

        /*
         * Remove the QP from the timeout queue.
         * Note: it may already have been removed by ipath_ib_timer().
         */
        dev = to_idev(qp->ibqp.device);
        spin_lock(&dev->pending_lock);
        if (!list_empty(&qp->timerwait))
                list_del_init(&qp->timerwait);
        spin_unlock(&dev->pending_lock);

        if (wqe->wr.opcode == IB_WR_RDMA_READ)
                dev->n_rc_resends++;
        else
                dev->n_rc_resends += (int)qp->s_psn - (int)psn;

        reset_psn(qp, psn);
        tasklet_hi_schedule(&qp->s_task);

bail:
        return;
}

static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
{
        if (qp->s_wait_credit) {
                qp->s_wait_credit = 0;
                tasklet_hi_schedule(&qp->s_task);
        }
        qp->s_last_psn = psn;
}

/**
 * do_rc_ack - process an incoming RC ACK
 * @qp: the QP the ACK came in on
 * @psn: the packet sequence number of the ACK
 * @opcode: the opcode of the request that resulted in the ACK
 *
 * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held and interrupts disabled.
 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
 */
static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
{
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
        struct ib_wc wc;
        struct ipath_swqe *wqe;
        int ret = 0;
        u32 ack_psn;

        /*
         * Remove the QP from the timeout queue (or RNR timeout queue).
         * If ipath_ib_timer() has already removed it,
         * it's OK since we hold the QP s_lock and ipath_restart_rc()
         * just won't find anything to restart if we ACK everything.
         */
        spin_lock(&dev->pending_lock);
        if (!list_empty(&qp->timerwait))
                list_del_init(&qp->timerwait);
        spin_unlock(&dev->pending_lock);

        /* Nothing is pending to ACK/NAK. */
        if (unlikely(qp->s_last == qp->s_tail))
                goto bail;

        /*
         * Note that NAKs implicitly ACK outstanding SEND and RDMA write
         * requests and implicitly NAK RDMA read and atomic requests issued
         * before the NAK'ed request.  The MSN won't include the NAK'ed
         * request but will include any ACK'ed requests.
         */
        ack_psn = psn;
        if (aeth >> 29)
                ack_psn--;
        wqe = get_swqe_ptr(qp, qp->s_last);

        /*
         * The MSN might be for a later WQE than the PSN indicates so
         * only complete WQEs that the PSN finishes.
         */
        while (ipath_cmp24(ack_psn, wqe->lpsn) >= 0) {
                /*
                 * If this request is an RDMA read or atomic, and the ACK is
                 * for a later operation, this ACK NAKs the RDMA read or
                 * atomic.  In other words, only an RDMA_READ_LAST or ONLY
                 * can ACK an RDMA read and likewise for atomic ops.  Note
                 * that the NAK case can only happen if relaxed ordering is
                 * used and requests are sent after an RDMA read or atomic
                 * is sent but before the response is received.
                 */
                if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
                     (opcode != OP(RDMA_READ_RESPONSE_LAST) ||
                       ipath_cmp24(ack_psn, wqe->lpsn) != 0)) ||
                    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
                      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
                     (opcode != OP(ATOMIC_ACKNOWLEDGE) ||
                      ipath_cmp24(wqe->psn, psn) != 0))) {
                        /*
                         * The last valid PSN seen is the previous
                         * request's.
                         */
                        update_last_psn(qp, wqe->psn - 1);
                        /* Retry this request. */
                        ipath_restart_rc(qp, wqe->psn, &wc);
                        /*
                         * No need to process the ACK/NAK since we are
                         * restarting an earlier request.
                         */
                        goto bail;
                }
                if (wqe->wr.opcode == IB_WR_RDMA_READ ||
                    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
                        tasklet_hi_schedule(&qp->s_task);
                /* Post a send completion queue entry if requested. */
                if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) ||
                    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
                        wc.wr_id = wqe->wr.wr_id;
                        wc.status = IB_WC_SUCCESS;
                        wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
                        wc.vendor_err = 0;
                        wc.byte_len = wqe->length;
                        wc.qp_num = qp->ibqp.qp_num;
                        wc.src_qp = qp->remote_qpn;
                        wc.pkey_index = 0;
                        wc.slid = qp->remote_ah_attr.dlid;
                        wc.sl = qp->remote_ah_attr.sl;
                        wc.dlid_path_bits = 0;
                        wc.port_num = 0;
                        ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
                }
                qp->s_retry = qp->s_retry_cnt;
                /*
                 * If we are completing a request which is in the process of
                 * being resent, we can stop resending it since we know the
                 * responder has already seen it.
                 */
                if (qp->s_last == qp->s_cur) {
                        if (++qp->s_cur >= qp->s_size)
                                qp->s_cur = 0;
                        wqe = get_swqe_ptr(qp, qp->s_cur);
                        qp->s_state = OP(SEND_LAST);
                        qp->s_psn = wqe->psn;
                }
                if (++qp->s_last >= qp->s_size)
                        qp->s_last = 0;
                wqe = get_swqe_ptr(qp, qp->s_last);
                if (qp->s_last == qp->s_tail)
                        break;
        }

        switch (aeth >> 29) {
        case 0:         /* ACK */
                dev->n_rc_acks++;
                /* If this is a partial ACK, reset the retransmit timer. */
                if (qp->s_last != qp->s_tail) {
                        spin_lock(&dev->pending_lock);
                        list_add_tail(&qp->timerwait,
                                      &dev->pending[dev->pending_index]);
                        spin_unlock(&dev->pending_lock);
                }
                ipath_get_credit(qp, aeth);
                qp->s_rnr_retry = qp->s_rnr_retry_cnt;
                qp->s_retry = qp->s_retry_cnt;
                update_last_psn(qp, psn);
                ret = 1;
                goto bail;

        case 1:         /* RNR NAK */
                dev->n_rnr_naks++;
                if (qp->s_rnr_retry == 0) {
                        if (qp->s_last == qp->s_tail)
                                goto bail;

                        wc.status = IB_WC_RNR_RETRY_EXC_ERR;
                        goto class_b;
                }
                if (qp->s_rnr_retry_cnt < 7)
                        qp->s_rnr_retry--;
                if (qp->s_last == qp->s_tail)
                        goto bail;

                /* The last valid PSN is the previous PSN. */
                update_last_psn(qp, psn - 1);

                dev->n_rc_resends += (int)qp->s_psn - (int)psn;

                reset_psn(qp, psn);

                qp->s_rnr_timeout =
                        ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
                                           IPATH_AETH_CREDIT_MASK];
                ipath_insert_rnr_queue(qp);
                goto bail;

        case 3:         /* NAK */
                /* The last valid PSN seen is the previous request's. */
                if (qp->s_last != qp->s_tail)
                        update_last_psn(qp, wqe->psn - 1);
                switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
                        IPATH_AETH_CREDIT_MASK) {
                case 0: /* PSN sequence error */
                        dev->n_seq_naks++;
                        /*
                         * Back up to the responder's expected PSN.  XXX
                         * Note that we might get a NAK in the middle of an
                         * RDMA READ response which terminates the RDMA
                         * READ.
                         */
                        if (qp->s_last == qp->s_tail)
                                break;

                        if (ipath_cmp24(psn, wqe->psn) < 0)
                                break;

                        /* Retry the request. */
                        ipath_restart_rc(qp, psn, &wc);
                        break;

                case 1: /* Invalid Request */
                        wc.status = IB_WC_REM_INV_REQ_ERR;
                        dev->n_other_naks++;
                        goto class_b;

                case 2: /* Remote Access Error */
                        wc.status = IB_WC_REM_ACCESS_ERR;
                        dev->n_other_naks++;
                        goto class_b;

                case 3: /* Remote Operation Error */
                        wc.status = IB_WC_REM_OP_ERR;
                        dev->n_other_naks++;
                class_b:
                        wc.wr_id = wqe->wr.wr_id;
                        wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
                        wc.vendor_err = 0;
                        wc.byte_len = 0;
                        wc.qp_num = qp->ibqp.qp_num;
                        wc.src_qp = qp->remote_qpn;
                        wc.pkey_index = 0;
                        wc.slid = qp->remote_ah_attr.dlid;
                        wc.sl = qp->remote_ah_attr.sl;
                        wc.dlid_path_bits = 0;
                        wc.port_num = 0;
                        ipath_sqerror_qp(qp, &wc);
                        break;

                default:
                        /* Ignore other reserved NAK error codes */
                        goto reserved;
                }
                qp->s_rnr_retry = qp->s_rnr_retry_cnt;
                goto bail;

        default:                /* 2: reserved */
        reserved:
                /* Ignore reserved NAK codes. */
                goto bail;
        }

bail:
        return ret;
}
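
/*
 * AETH decoding sketch (illustrative): the switch above keys on
 * aeth >> 29, the top bits of the AETH syndrome: 0 = ACK (credits are
 * read by ipath_get_credit()), 1 = RNR NAK (the value extracted with
 * IPATH_AETH_CREDIT_SHIFT/MASK indexes ib_ipath_rnr_table), 3 = NAK
 * (the same field selects the error code), and 2 is reserved.
 */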

/**
 * ipath_rc_rcv_resp - process an incoming RC response packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @hdrsize: the header length
 * @pmtu: the path MTU
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an incoming RC response
 * packet for the given QP.
 * Called at interrupt level.
 */
static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
                                     struct ipath_other_headers *ohdr,
                                     void *data, u32 tlen,
                                     struct ipath_qp *qp,
                                     u32 opcode,
                                     u32 psn, u32 hdrsize, u32 pmtu,
                                     int header_in_data)
{
        unsigned long flags;
        struct ib_wc wc;
        int diff;
        u32 pad;
        u32 aeth;

        spin_lock_irqsave(&qp->s_lock, flags);

        /* Ignore invalid responses. */
        if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
                goto ack_done;

        /* Ignore duplicate responses. */
        diff = ipath_cmp24(psn, qp->s_last_psn);
        if (unlikely(diff <= 0)) {
                /* Update credits for "ghost" ACKs */
                if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
                        if (!header_in_data)
                                aeth = be32_to_cpu(ohdr->u.aeth);
                        else {
                                aeth = be32_to_cpu(((__be32 *) data)[0]);
                                data += sizeof(__be32);
                        }
                        if ((aeth >> 29) == 0)
                                ipath_get_credit(qp, aeth);
                }
                goto ack_done;
        }

        switch (opcode) {
        case OP(ACKNOWLEDGE):
        case OP(ATOMIC_ACKNOWLEDGE):
        case OP(RDMA_READ_RESPONSE_FIRST):
                if (!header_in_data)
                        aeth = be32_to_cpu(ohdr->u.aeth);
                else {
                        aeth = be32_to_cpu(((__be32 *) data)[0]);
                        data += sizeof(__be32);
                }
                if (opcode == OP(ATOMIC_ACKNOWLEDGE))
                        *(u64 *) qp->s_sge.sge.vaddr = *(u64 *) data;
                if (!do_rc_ack(qp, aeth, psn, opcode) ||
                    opcode != OP(RDMA_READ_RESPONSE_FIRST))
                        goto ack_done;
                hdrsize += 4;
                /*
                 * do_rc_ack() has already checked the PSN so skip
                 * the sequence check.
                 */
                goto rdma_read;

        case OP(RDMA_READ_RESPONSE_MIDDLE):
                /* no AETH, no ACK */
                if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
                        dev->n_rdma_seq++;
                        if (qp->s_last != qp->s_tail)
                                ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
                        goto ack_done;
                }
        rdma_read:
                if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST)))
                        goto ack_done;
                if (unlikely(tlen != (hdrsize + pmtu + 4)))
                        goto ack_done;
                if (unlikely(pmtu >= qp->s_len))
                        goto ack_done;
                /* We got a response so update the timeout. */
                if (unlikely(qp->s_last == qp->s_tail ||
                             get_swqe_ptr(qp, qp->s_last)->wr.opcode !=
                             IB_WR_RDMA_READ))
                        goto ack_done;
                spin_lock(&dev->pending_lock);
                if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
                        list_move_tail(&qp->timerwait,
                                       &dev->pending[dev->pending_index]);
                spin_unlock(&dev->pending_lock);
                /*
                 * Update the RDMA receive state but do the copy w/o
                 * holding the locks and blocking interrupts.
                 * XXX Yet another place that affects relaxed RDMA order
                 * since we don't want s_sge modified.
                 */
                qp->s_len -= pmtu;
                update_last_psn(qp, psn);
                spin_unlock_irqrestore(&qp->s_lock, flags);
                ipath_copy_sge(&qp->s_sge, data, pmtu);
                goto bail;

        case OP(RDMA_READ_RESPONSE_LAST):
                /* ACKs READ req. */
                if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
                        dev->n_rdma_seq++;
                        if (qp->s_last != qp->s_tail)
                                ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
                        goto ack_done;
                }
                /* FALLTHROUGH */
        case OP(RDMA_READ_RESPONSE_ONLY):
                if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST)))
                        goto ack_done;
                /*
                 * Get the number of bytes the message was padded by.
                 */
                pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
                /*
                 * Check that the data size is >= 1 && <= pmtu.
                 * Remember to account for the AETH header (4) and
                 * ICRC (4).
                 */
                if (unlikely(tlen <= (hdrsize + pad + 8))) {
                        /* XXX Need to generate an error CQ entry. */
                        goto ack_done;
                }
                tlen -= hdrsize + pad + 8;
                if (unlikely(tlen != qp->s_len)) {
                        /* XXX Need to generate an error CQ entry. */
                        goto ack_done;
                }
                if (!header_in_data)
                        aeth = be32_to_cpu(ohdr->u.aeth);
                else {
                        aeth = be32_to_cpu(((__be32 *) data)[0]);
                        data += sizeof(__be32);
                }
                ipath_copy_sge(&qp->s_sge, data, tlen);
                if (do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST))) {
                        /*
                         * Change the state so we continue
                         * processing new requests and wake up the
                         * tasklet if there are posted sends.
                         */
                        qp->s_state = OP(SEND_LAST);
                        if (qp->s_tail != qp->s_head)
                                tasklet_hi_schedule(&qp->s_task);
                }
                goto ack_done;
        }

ack_done:
        spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
        return;
}
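
/*
 * Length-check sketch (illustrative): for a middle read-response packet
 * the code above requires tlen == hdrsize + pmtu + 4, i.e. a full pmtu
 * of payload plus the 4-byte ICRC; for the last packet it subtracts
 * hdrsize + pad + 8, accounting for the 4-byte AETH plus the ICRC.
 */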

/**
 * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @diff: the difference between the PSN and the expected PSN
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an unexpected
 * incoming RC packet for the given QP.
 * Called at interrupt level.
 * Return 1 if no more processing is needed; otherwise return 0 to
 * schedule a response to be sent and the s_lock unlocked.
 */
static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
                                     struct ipath_other_headers *ohdr,
                                     void *data,
                                     struct ipath_qp *qp,
                                     u32 opcode,
                                     u32 psn,
                                     int diff,
                                     int header_in_data)
{
        struct ib_reth *reth;

        if (diff > 0) {
                /*
                 * Packet sequence error.
                 * A NAK will ACK earlier sends and RDMA writes.
                 * Don't queue the NAK if an RDMA read, atomic, or
                 * NAK is pending though.
                 */
                if (qp->s_ack_state != OP(ACKNOWLEDGE) ||
                    qp->r_nak_state != 0)
                        goto done;
                if (qp->r_ack_state < OP(COMPARE_SWAP)) {
                        qp->r_ack_state = OP(SEND_ONLY);
                        qp->r_nak_state = IB_NAK_PSN_ERROR;
                        /* Use the expected PSN. */
                        qp->r_ack_psn = qp->r_psn;
                }
                goto send_ack;
        }
1196
1197         /*
1198          * Handle a duplicate request.  Don't re-execute SEND, RDMA
1199          * write or atomic op.  Don't NAK errors, just silently drop
1200          * the duplicate request.  Note that r_sge, r_len, and
1201          * r_rcv_len may be in use so don't modify them.
1202          *
1203          * We are supposed to ACK the earliest duplicate PSN but we
1204          * can coalesce an outstanding duplicate ACK.  We have to
1205          * send the earliest so that RDMA reads can be restarted at
1206          * the requester's expected PSN.
1207          */
1208         if (opcode == OP(RDMA_READ_REQUEST)) {
1209                 /* RETH comes after BTH */
1210                 if (!header_in_data)
1211                         reth = &ohdr->u.rc.reth;
1212                 else {
1213                         reth = (struct ib_reth *)data;
1214                         data += sizeof(*reth);
1215                 }
1216                 /*
1217                  * If we receive a duplicate RDMA request, it means the
1218                  * requester saw a sequence error and needs to restart
1219                  * from an earlier point.  We can abort the current
1220                  * RDMA read send in that case.
1221                  */
1222                 spin_lock_irq(&qp->s_lock);
1223                 if (qp->s_ack_state != OP(ACKNOWLEDGE) &&
1224                     (qp->s_hdrwords || ipath_cmp24(psn, qp->s_ack_psn) >= 0)) {
1225                         /*
1226                          * We are already sending earlier requested data.
1227                          * Don't abort it to send later out of sequence data.
1228                          */
1229                         spin_unlock_irq(&qp->s_lock);
1230                         goto done;
1231                 }
1232                 qp->s_rdma_len = be32_to_cpu(reth->length);
1233                 if (qp->s_rdma_len != 0) {
1234                         u32 rkey = be32_to_cpu(reth->rkey);
1235                         u64 vaddr = be64_to_cpu(reth->vaddr);
1236                         int ok;
1237
1238                         /*
1239                          * Address range must be a subset of the original
1240                          * request and start on pmtu boundaries.
1241                          */
1242                         ok = ipath_rkey_ok(qp, &qp->s_rdma_sge,
1243                                            qp->s_rdma_len, vaddr, rkey,
1244                                            IB_ACCESS_REMOTE_READ);
1245                         if (unlikely(!ok)) {
1246                                 spin_unlock_irq(&qp->s_lock);
1247                                 goto done;
1248                         }
1249                 } else {
1250                         qp->s_rdma_sge.sg_list = NULL;
1251                         qp->s_rdma_sge.num_sge = 0;
1252                         qp->s_rdma_sge.sge.mr = NULL;
1253                         qp->s_rdma_sge.sge.vaddr = NULL;
1254                         qp->s_rdma_sge.sge.length = 0;
1255                         qp->s_rdma_sge.sge.sge_length = 0;
1256                 }
1257                 qp->s_ack_state = opcode;
1258                 qp->s_ack_psn = psn;
1259                 spin_unlock_irq(&qp->s_lock);
1260                 tasklet_hi_schedule(&qp->s_task);
1261                 goto send_ack;
1262         }
1263
1264         /*
1265          * A pending RDMA read will ACK anything before it so
1266          * ignore earlier duplicate requests.
1267          */
1268         if (qp->s_ack_state != OP(ACKNOWLEDGE))
1269                 goto done;
1270
1271         /*
1272          * If an ACK is pending, don't replace the pending ACK
1273          * with an earlier one since the later one will ACK the earlier.
1274          * Also, if we already have a pending atomic, send it.
1275          */
1276         if (qp->r_ack_state != OP(ACKNOWLEDGE) &&
1277             (ipath_cmp24(psn, qp->r_ack_psn) <= 0 ||
1278              qp->r_ack_state >= OP(COMPARE_SWAP)))
1279                 goto send_ack;
1280         switch (opcode) {
1281         case OP(COMPARE_SWAP):
1282         case OP(FETCH_ADD):
1283                 /*
1284                  * Check for the PSN of the last atomic operation
1285                  * performed and resend the result if found.
1286                  */
1287                 if ((psn & IPATH_PSN_MASK) != qp->r_atomic_psn)
1288                         goto done;
1289                 break;
1290         }
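        /* Queue the duplicate so an ACK (or the saved atomic result) is resent. */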
1291         qp->r_ack_state = opcode;
1292         qp->r_nak_state = 0;
1293         qp->r_ack_psn = psn;
1294 send_ack:
1295         return 0;
1296
1297 done:
1298         return 1;
1299 }
1300
1301 static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
1302 {
1303         spin_lock_irq(&qp->s_lock);
1304         qp->state = IB_QPS_ERR;
1305         ipath_error_qp(qp, err);
1306         spin_unlock_irq(&qp->s_lock);
1307 }
1308
1309 /**
1310  * ipath_rc_rcv - process an incoming RC packet
1311  * @dev: the device this packet came in on
1312  * @hdr: the header of this packet
1313  * @has_grh: true if the header has a GRH
1314  * @data: the packet data
1315  * @tlen: the packet length
1316  * @qp: the QP for this packet
1317  *
1318  * This is called from ipath_qp_rcv() to process an incoming RC packet
1319  * for the given QP.
1320  * Called at interrupt level.
1321  */
1322 void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
1323                   int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
1324 {
1325         struct ipath_other_headers *ohdr;
1326         u32 opcode;
1327         u32 hdrsize;
1328         u32 psn;
1329         u32 pad;
1330         struct ib_wc wc;
1331         u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
1332         int diff;
1333         struct ib_reth *reth;
1334         int header_in_data;
1335
1336         /* Validate the SLID. See Ch. 9.6.1.5 */
1337         if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
1338                 goto done;
1339
1340         /* Check for GRH */
1341         if (!has_grh) {
1342                 ohdr = &hdr->u.oth;
1343                 hdrsize = 8 + 12;       /* LRH + BTH */
1344                 psn = be32_to_cpu(ohdr->bth[2]);
1345                 header_in_data = 0;
1346         } else {
1347                 ohdr = &hdr->u.l.oth;
1348                 hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
1349                 /*
1350                  * The header with GRH is 60 bytes and the core driver sets
1351                  * the eager header buffer size to 56 bytes, so the last 4
1352                  * bytes of the BTH header (the PSN) are in the data buffer.
1353                  */
1354                 header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
1355                 if (header_in_data) {
1356                         psn = be32_to_cpu(((__be32 *) data)[0]);
1357                         data += sizeof(__be32);
1358                 } else
1359                         psn = be32_to_cpu(ohdr->bth[2]);
1360         }
1361
1362         /*
1363          * Process responses (ACKs) before anything else.  Note that the
1364          * packet sequence number will be for something in the send work
1365          * queue rather than the expected receive packet sequence number.
1366          * In other words, this QP is the requester.
1367          */
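        /* The opcode is in the high byte of the first BTH word. */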
1368         opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
1369         if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1370             opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1371                 ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
1372                                   hdrsize, pmtu, header_in_data);
1373                 goto done;
1374         }
1375
1376         /* Compute 24 bits worth of difference. */
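        /*
         * ipath_cmp24() compares PSNs modulo 2^24, so a PSN just past a
         * wrap (e.g. 0x000001) compares greater than 0xffffff.
         */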
1377         diff = ipath_cmp24(psn, qp->r_psn);
1378         if (unlikely(diff)) {
1379                 if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
1380                                        psn, diff, header_in_data))
1381                         goto done;
1382                 goto send_ack;
1383         }
1384
1385         /* Check for opcode sequence errors. */
1386         switch (qp->r_state) {
1387         case OP(SEND_FIRST):
1388         case OP(SEND_MIDDLE):
1389                 if (opcode == OP(SEND_MIDDLE) ||
1390                     opcode == OP(SEND_LAST) ||
1391                     opcode == OP(SEND_LAST_WITH_IMMEDIATE))
1392                         break;
1393         nack_inv:
1394                 /*
1395                  * A NAK will ACK earlier sends and RDMA writes.
1396                  * Don't queue the NAK if an RDMA read, atomic, or NAK
1397                  * is already pending, though.
1398                  */
1399                 if (qp->r_ack_state >= OP(COMPARE_SWAP))
1400                         goto send_ack;
1401                 ipath_rc_error(qp, IB_WC_REM_INV_REQ_ERR);
1402                 qp->r_ack_state = OP(SEND_ONLY);
1403                 qp->r_nak_state = IB_NAK_INVALID_REQUEST;
1404                 qp->r_ack_psn = qp->r_psn;
1405                 goto send_ack;
1406
1407         case OP(RDMA_WRITE_FIRST):
1408         case OP(RDMA_WRITE_MIDDLE):
1409                 if (opcode == OP(RDMA_WRITE_MIDDLE) ||
1410                     opcode == OP(RDMA_WRITE_LAST) ||
1411                     opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1412                         break;
1413                 goto nack_inv;
1414
1415         default:
1416                 if (opcode == OP(SEND_MIDDLE) ||
1417                     opcode == OP(SEND_LAST) ||
1418                     opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
1419                     opcode == OP(RDMA_WRITE_MIDDLE) ||
1420                     opcode == OP(RDMA_WRITE_LAST) ||
1421                     opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1422                         goto nack_inv;
1423                 /*
1424                  * Note that it is up to the requester not to send a new
1425                  * RDMA read or atomic operation before receiving an ACK
1426                  * for the previous operation.
1427                  */
1428                 break;
1429         }
1430
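        /*
         * Clear the completion fields that only some opcodes set; the rest
         * of the work completion is filled in at send_last below.
         */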
1431         wc.imm_data = 0;
1432         wc.wc_flags = 0;
1433
1434         /* OK, process the packet. */
1435         switch (opcode) {
1436         case OP(SEND_FIRST):
1437                 if (!ipath_get_rwqe(qp, 0)) {
1438                 rnr_nak:
1439                         /*
1440                          * An RNR NAK will ACK earlier sends and RDMA writes.
1441                          * Don't queue the NAK if an RDMA read or atomic
1442                          * is already pending, though.
1443                          */
1444                         if (qp->r_ack_state >= OP(COMPARE_SWAP))
1445                                 goto send_ack;
1446                         qp->r_ack_state = OP(SEND_ONLY);
1447                         qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
1448                         qp->r_ack_psn = qp->r_psn;
1449                         goto send_ack;
1450                 }
1451                 qp->r_rcv_len = 0;
1452                 /* FALLTHROUGH */
1453         case OP(SEND_MIDDLE):
1454         case OP(RDMA_WRITE_MIDDLE):
1455         send_middle:
1456                 /* Check for invalid length: exactly one PMTU, within the posted length. */
1457                 if (unlikely(tlen != (hdrsize + pmtu + 4)))
1458                         goto nack_inv;
1459                 qp->r_rcv_len += pmtu;
1460                 if (unlikely(qp->r_rcv_len > qp->r_len))
1461                         goto nack_inv;
1462                 ipath_copy_sge(&qp->r_sge, data, pmtu);
1463                 break;
1464
1465         case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
1466                 /* consume RWQE */
1467                 if (!ipath_get_rwqe(qp, 1))
1468                         goto rnr_nak;
1469                 goto send_last_imm;
1470
1471         case OP(SEND_ONLY):
1472         case OP(SEND_ONLY_WITH_IMMEDIATE):
1473                 if (!ipath_get_rwqe(qp, 0))
1474                         goto rnr_nak;
1475                 qp->r_rcv_len = 0;
1476                 if (opcode == OP(SEND_ONLY))
1477                         goto send_last;
1478                 /* FALLTHROUGH */
1479         case OP(SEND_LAST_WITH_IMMEDIATE):
1480         send_last_imm:
1481                 if (header_in_data) {
1482                         wc.imm_data = *(__be32 *) data;
1483                         data += sizeof(__be32);
1484                 } else {
1485                         /* Immediate data comes after BTH */
1486                         wc.imm_data = ohdr->u.imm_data;
1487                 }
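                /* The 4-byte immediate counts as header, not payload. */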
1488                 hdrsize += 4;
1489                 wc.wc_flags = IB_WC_WITH_IMM;
1490                 /* FALLTHROUGH */
1491         case OP(SEND_LAST):
1492         case OP(RDMA_WRITE_LAST):
1493         send_last:
1494                 /* Get the number of bytes the message was padded by. */
1495                 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1496                 /* Check for invalid length. */
1497                 /* XXX LAST len should be >= 1 */
1498                 if (unlikely(tlen < (hdrsize + pad + 4)))
1499                         goto nack_inv;
1500                 /* Don't count the CRC. */
1501                 tlen -= (hdrsize + pad + 4);
1502                 wc.byte_len = tlen + qp->r_rcv_len;
1503                 if (unlikely(wc.byte_len > qp->r_len))
1504                         goto nack_inv;
1505                 ipath_copy_sge(&qp->r_sge, data, tlen);
1506                 qp->r_msn++;
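                /* Only generate a completion if an RWQE was consumed. */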
1507                 if (!qp->r_wrid_valid)
1508                         break;
1509                 qp->r_wrid_valid = 0;
1510                 wc.wr_id = qp->r_wr_id;
1511                 wc.status = IB_WC_SUCCESS;
1512                 wc.opcode = IB_WC_RECV;
1513                 wc.vendor_err = 0;
1514                 wc.qp_num = qp->ibqp.qp_num;
1515                 wc.src_qp = qp->remote_qpn;
1516                 wc.pkey_index = 0;
1517                 wc.slid = qp->remote_ah_attr.dlid;
1518                 wc.sl = qp->remote_ah_attr.sl;
1519                 wc.dlid_path_bits = 0;
1520                 wc.port_num = 0;
1521                 /* Signal completion event if the solicited bit is set. */
1522                 ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
1523                                (ohdr->bth[0] &
1524                                 __constant_cpu_to_be32(1 << 23)) != 0);
1525                 break;
1526
1527         case OP(RDMA_WRITE_FIRST):
1528         case OP(RDMA_WRITE_ONLY):
1529         case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
1530                 /* An RWQE is consumed only for the immediate case below. */
1531                 /* RETH comes after BTH */
1532                 if (!header_in_data)
1533                         reth = &ohdr->u.rc.reth;
1534                 else {
1535                         reth = (struct ib_reth *)data;
1536                         data += sizeof(*reth);
1537                 }
1538                 hdrsize += sizeof(*reth);
1539                 qp->r_len = be32_to_cpu(reth->length);
1540                 qp->r_rcv_len = 0;
1541                 if (qp->r_len != 0) {
1542                         u32 rkey = be32_to_cpu(reth->rkey);
1543                         u64 vaddr = be64_to_cpu(reth->vaddr);
1544                         int ok;
1545
1546                         /* Check rkey & NAK */
1547                         ok = ipath_rkey_ok(qp, &qp->r_sge,
1548                                            qp->r_len, vaddr, rkey,
1549                                            IB_ACCESS_REMOTE_WRITE);
1550                         if (unlikely(!ok))
1551                                 goto nack_acc;
1552                 } else {
1553                         qp->r_sge.sg_list = NULL;
1554                         qp->r_sge.sge.mr = NULL;
1555                         qp->r_sge.sge.vaddr = NULL;
1556                         qp->r_sge.sge.length = 0;
1557                         qp->r_sge.sge.sge_length = 0;
1558                 }
1559                 if (unlikely(!(qp->qp_access_flags &
1560                                IB_ACCESS_REMOTE_WRITE)))
1561                         goto nack_acc;
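                /* Reuse the send code paths to copy the write payload. */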
1562                 if (opcode == OP(RDMA_WRITE_FIRST))
1563                         goto send_middle;
1564                 else if (opcode == OP(RDMA_WRITE_ONLY))
1565                         goto send_last;
1566                 if (!ipath_get_rwqe(qp, 1))
1567                         goto rnr_nak;
1568                 goto send_last_imm;
1569
1570         case OP(RDMA_READ_REQUEST):
1571                 /* RETH comes after BTH */
1572                 if (!header_in_data)
1573                         reth = &ohdr->u.rc.reth;
1574                 else {
1575                         reth = (struct ib_reth *)data;
1576                         data += sizeof(*reth);
1577                 }
1578                 if (unlikely(!(qp->qp_access_flags &
1579                                IB_ACCESS_REMOTE_READ)))
1580                         goto nack_acc;
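                /* The s_lock protects ACK state shared with the send tasklet. */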
1581                 spin_lock_irq(&qp->s_lock);
1582                 qp->s_rdma_len = be32_to_cpu(reth->length);
1583                 if (qp->s_rdma_len != 0) {
1584                         u32 rkey = be32_to_cpu(reth->rkey);
1585                         u64 vaddr = be64_to_cpu(reth->vaddr);
1586                         int ok;
1587
1588                         /* Check rkey & NAK */
1589                         ok = ipath_rkey_ok(qp, &qp->s_rdma_sge,
1590                                            qp->s_rdma_len, vaddr, rkey,
1591                                            IB_ACCESS_REMOTE_READ);
1592                         if (unlikely(!ok)) {
1593                                 spin_unlock_irq(&qp->s_lock);
1594                                 goto nack_acc;
1595                         }
1596                         /*
1597                          * Update the next expected PSN.  We add 1 below,
1598                          * so only add the remainder here.
1599                          */
1600                         if (qp->s_rdma_len > pmtu)
1601                                 qp->r_psn += (qp->s_rdma_len - 1) / pmtu;
1602                 } else {
1603                         qp->s_rdma_sge.sg_list = NULL;
1604                         qp->s_rdma_sge.num_sge = 0;
1605                         qp->s_rdma_sge.sge.mr = NULL;
1606                         qp->s_rdma_sge.sge.vaddr = NULL;
1607                         qp->s_rdma_sge.sge.length = 0;
1608                         qp->s_rdma_sge.sge.sge_length = 0;
1609                 }
1610                 /*
1611                  * We need to increment the MSN here instead of when we
1612                  * finish sending the result; otherwise a duplicate request
1613                  * would increment it more than once.
1614                  */
1615                 qp->r_msn++;
1616
1617                 qp->s_ack_state = opcode;
1618                 qp->s_ack_psn = psn;
1619                 spin_unlock_irq(&qp->s_lock);
1620
1621                 qp->r_psn++;
1622                 qp->r_state = opcode;
1623                 qp->r_nak_state = 0;
1624
1625                 /* Schedule the send tasklet, which calls ipath_do_rc_send(). */
1626                 tasklet_hi_schedule(&qp->s_task);
1627
1628                 goto done;
1629
1630         case OP(COMPARE_SWAP):
1631         case OP(FETCH_ADD): {
1632                 struct ib_atomic_eth *ateth;
1633                 u64 vaddr;
1634                 u64 sdata;
1635                 u32 rkey;
1636
1637                 if (!header_in_data)
1638                         ateth = &ohdr->u.atomic_eth;
1639                 else {
1640                         ateth = (struct ib_atomic_eth *)data;
1641                         data += sizeof(*ateth);
1642                 }
1643                 vaddr = be64_to_cpu(ateth->vaddr);
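                /* The target address must be aligned to 8 bytes. */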
1644                 if (unlikely(vaddr & (sizeof(u64) - 1)))
1645                         goto nack_inv;
1646                 rkey = be32_to_cpu(ateth->rkey);
1647                 /* Check rkey & NAK */
1648                 if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
1649                                             sizeof(u64), vaddr, rkey,
1650                                             IB_ACCESS_REMOTE_ATOMIC)))
1651                         goto nack_acc;
1652                 if (unlikely(!(qp->qp_access_flags &
1653                                IB_ACCESS_REMOTE_ATOMIC)))
1654                         goto nack_acc;
1655                 /* Perform atomic OP and save result. */
1656                 sdata = be64_to_cpu(ateth->swap_data);
1657                 spin_lock_irq(&dev->pending_lock);
1658                 qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr;
1659                 if (opcode == OP(FETCH_ADD))
1660                         *(u64 *) qp->r_sge.sge.vaddr =
1661                                 qp->r_atomic_data + sdata;
1662                 else if (qp->r_atomic_data ==
1663                          be64_to_cpu(ateth->compare_data))
1664                         *(u64 *) qp->r_sge.sge.vaddr = sdata;
1665                 spin_unlock_irq(&dev->pending_lock);
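                /*
                 * Record the PSN of this atomic so a duplicate can be
                 * answered with the saved result instead of re-executed.
                 */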
1666                 qp->r_msn++;
1667                 qp->r_atomic_psn = psn & IPATH_PSN_MASK;
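                /* Force the "ACK requested" bit so the atomic result goes out. */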
1668                 psn |= 1 << 31;
1669                 break;
1670         }
1671
1672         default:
1673                 /* Drop packet for unknown opcodes. */
1674                 goto done;
1675         }
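        /* The request has been processed; advance the receive state. */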
1676         qp->r_psn++;
1677         qp->r_state = opcode;
1678         qp->r_nak_state = 0;
1679         /* Send an ACK if requested or required. */
1680         if (psn & (1 << 31)) {
1681                 /*
1682                  * Coalesce ACKs unless there is an RDMA READ or
1683                  * ATOMIC pending.
1684                  */
1685                 if (qp->r_ack_state < OP(COMPARE_SWAP)) {
1686                         qp->r_ack_state = opcode;
1687                         qp->r_ack_psn = psn;
1688                 }
1689                 goto send_ack;
1690         }
1691         goto done;
1692
1693 nack_acc:
1694         /*
1695          * A NAK will ACK earlier sends and RDMA writes.
1696          * Don't queue the NAK if an RDMA read, atomic, or NAK
1697          * is already pending, though.
1698          */
1699         if (qp->r_ack_state < OP(COMPARE_SWAP)) {
1700                 ipath_rc_error(qp, IB_WC_REM_ACCESS_ERR);
1701                 qp->r_ack_state = OP(RDMA_WRITE_ONLY);
1702                 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
1703                 qp->r_ack_psn = qp->r_psn;
1704         }
1705 send_ack:
1706         /* Send ACK right away unless the send tasklet has a pending ACK. */
1707         if (qp->s_ack_state == OP(ACKNOWLEDGE))
1708                 send_rc_ack(qp);
1709
1710 done:
1711         return;
1712 }