2  * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
 
   3  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
 
   5  * This software is available to you under a choice of one of two
 
   6  * licenses.  You may choose to be licensed under the terms of the GNU
 
   7  * General Public License (GPL) Version 2, available from the file
 
   8  * COPYING in the main directory of this source tree, or the
 
   9  * OpenIB.org BSD license below:
 
  11  *     Redistribution and use in source and binary forms, with or
 
  12  *     without modification, are permitted provided that the following
 
  15  *      - Redistributions of source code must retain the above
 
  16  *        copyright notice, this list of conditions and the following
 
  19  *      - Redistributions in binary form must reproduce the above
 
  20  *        copyright notice, this list of conditions and the following
 
  21  *        disclaimer in the documentation and/or other materials
 
  22  *        provided with the distribution.
 
  24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 
  25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 
  26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 
  27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 
  28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 
  29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 
  30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 
  34 #include "ipath_verbs.h"
 
  35 #include "ipath_kernel.h"
 
  37 /* Shorthand for the long IB RC opcode names: OP(x) expands to IB_OPCODE_RC_x. */
 
  38 #define OP(x) IB_OPCODE_RC_##x
 
  /*
   * restart_sge - reposition an SGE state so a request can resume at a PSN.
   *
   * Turns the distance between psn and the request's first PSN (wqe->psn)
   * into a byte offset using pmtu, reloads ss from the start of the WQE's
   * scatter/gather list, skips that many bytes, and returns what is left of
   * the request (wqe->length minus the skipped offset).
   *
   * NOTE(review): the rest of the signature and the local declarations are
   * not present in this listing; psn and pmtu are presumably the remaining
   * parameters - confirm against the full source.
   */
  40 static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
 
             /* Packets between wqe->psn and psn (masked with IPATH_PSN_MASK), times pmtu. */
  45         len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
 
             /* Reset ss to the first SGE of the request; the remaining SGEs follow it. */
  46         ss->sge = wqe->sg_list[0];
 
  47         ss->sg_list = wqe->sg_list + 1;
 
  48         ss->num_sge = wqe->wr.num_sge;
 
             /* Move ss forward by the computed byte offset. */
  49         ipath_skip_sge(ss, len);
 
  50         return wqe->length - len;
 
  54  * ipath_init_restart - initialize the qp->s_sge after a restart
 
  55  * @qp: the QP whose SGE we're restarting
 
  56  * @wqe: the work queue entry to initialize the QP's SGE from
 
  58  * The QP s_lock should be held and interrupts disabled.
 
  /*
   * Prepare the QP to resume sending wqe from qp->s_psn: qp->s_sge is
   * repositioned by restart_sge() and the returned remaining byte count is
   * stored in qp->s_len.  The QP is then put on the device's pending list
   * (if it is not already on one) while holding dev->pending_lock.
   */
  60 static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
 
  62         struct ipath_ibdev *dev;
 
             /* The path MTU is stored as an enum; convert it to a byte count. */
  64         qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
 
  65                                 ib_mtu_enum_to_int(qp->path_mtu));
 
  66         dev = to_idev(qp->ibqp.device);
 
  67         spin_lock(&dev->pending_lock);
 
             /* Only queue the QP if timerwait is not already linked into a list. */
  68         if (list_empty(&qp->timerwait))
 
  69                 list_add_tail(&qp->timerwait,
 
  70                               &dev->pending[dev->pending_index]);
 
  71         spin_unlock(&dev->pending_lock);
 
  75  * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 
  76  * @qp: a pointer to the QP
 
  77  * @ohdr: a pointer to the IB header being constructed
 
  80  * Return 1 if constructed; otherwise, return 0.
 
  81  * Note that we are in the responder's side of the QP context.
 
  82  * Note the QP s_lock must be held.
 
  /*
   * Build the next responder-side packet for this QP, selected by
   * qp->s_ack_state: the first (or only) packet answering a queued RDMA read
   * or atomic request, a follow-on RDMA read response packet, or a plain ACK.
   * The visible code stores the header length in qp->s_hdrwords, the payload
   * length in qp->s_cur_size and the opcode word through bth0p.
   *
   * NOTE(review): several source lines (local declarations, some branch
   * conditions, the exit paths and any store through bth2p) are not present
   * in this listing; the comments below describe only visible statements.
   */
  84 static int ipath_make_rc_ack(struct ipath_qp *qp,
 
  85                              struct ipath_other_headers *ohdr,
 
  86                              u32 pmtu, u32 *bth0p, u32 *bth2p)
 
  88         struct ipath_ack_entry *e;
 
  94         /* header size in 32-bit words LRH+BTH = (8+12)/4. */
 
  97         switch (qp->s_ack_state) {
 
  98         case OP(RDMA_READ_RESPONSE_LAST):
 
  99         case OP(RDMA_READ_RESPONSE_ONLY):
 
 100         case OP(ATOMIC_ACKNOWLEDGE):
 
 102                  * We can increment the tail pointer now that the last
 
 103                  * response has been sent instead of only being
 
                     /* Advance the ack queue tail; it wraps to 0 after passing IPATH_MAX_RDMA_ATOMIC. */
 106                 if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
 
 107                         qp->s_tail_ack_queue = 0;
 
 110         case OP(ACKNOWLEDGE):
 
 111                 /* Check for no next entry in the queue. */
 
 112                 if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
 
 113                         if (qp->s_flags & IPATH_S_ACK_PENDING)
 
 115                         qp->s_ack_state = OP(ACKNOWLEDGE);
 
                     /* Take the entry at the tail of the ack queue. */
 119                 e = &qp->s_ack_queue[qp->s_tail_ack_queue];
 
 120                 if (e->opcode == OP(RDMA_READ_REQUEST)) {
 
 121                         /* Copy SGE state in case we need to resend */
 
 122                         qp->s_ack_rdma_sge = e->rdma_sge;
 
 123                         qp->s_cur_sge = &qp->s_ack_rdma_sge;
 
 124                         len = e->rdma_sge.sge.sge_length;
 
 127                                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
 
 129                                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
 
 132                         ohdr->u.aeth = ipath_compute_aeth(qp);
 
                             /* Response PSNs start at the request's PSN and advance by one per packet. */
 134                         qp->s_ack_rdma_psn = e->psn;
 
 135                         bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
 
 137                         /* COMPARE_SWAP or FETCH_ADD */
 
                             /* No SGE is attached to an atomic acknowledge. */
 138                         qp->s_cur_sge = NULL;
 
 140                         qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
 
 141                         ohdr->u.at.aeth = ipath_compute_aeth(qp);
 
                             /* The 64-bit atomic_data is sent as two big-endian words, upper 32 bits first. */
 142                         ohdr->u.at.atomic_ack_eth[0] =
 
 143                                 cpu_to_be32(e->atomic_data >> 32);
 
 144                         ohdr->u.at.atomic_ack_eth[1] =
 
 145                                 cpu_to_be32(e->atomic_data);
 
                             /* Add the size of the atomic ack header, in 32-bit words. */
 146                         hwords += sizeof(ohdr->u.at) / sizeof(u32);
 
                     /* The opcode occupies the top byte of BTH word 0. */
 150                 bth0 = qp->s_ack_state << 24;
 
 153         case OP(RDMA_READ_RESPONSE_FIRST):
 
 154                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
 
 156         case OP(RDMA_READ_RESPONSE_MIDDLE):
 
                     /* len is read from the saved copy of the read SGE state made when the response started. */
 157                 len = qp->s_ack_rdma_sge.sge.sge_length;
 
 161                         ohdr->u.aeth = ipath_compute_aeth(qp);
 
 163                         qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
 
                             /* Flag the queue entry as sent when the last response packet is selected. */
 164                         qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
 
 166                 bth0 = qp->s_ack_state << 24;
 
 167                 bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
 
 173                  * Send a regular ACK.
 
 174                  * Set the s_ack_state so we wait until after sending
 
 175                  * the ACK before setting s_ack_state to ACKNOWLEDGE
 
 178                 qp->s_ack_state = OP(SEND_ONLY);
 
                     /* The pending-ACK request is consumed here; no SGE is attached to the packet. */
 179                 qp->s_flags &= ~IPATH_S_ACK_PENDING;
 
 180                 qp->s_cur_sge = NULL;
 
 183                                 cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
 
 185                                              IPATH_AETH_CREDIT_SHIFT));
 
 187                         ohdr->u.aeth = ipath_compute_aeth(qp);
 
                     /* Plain ACK: ACKNOWLEDGE opcode with the PSN saved in qp->s_ack_psn. */
 190                 bth0 = OP(ACKNOWLEDGE) << 24;
 
 191                 bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
 
             /* Record the header length (32-bit words) and payload length of the packet just built. */
 193         qp->s_hdrwords = hwords;
 
 194         qp->s_cur_size = len;
 
 195         *bth0p = bth0 | (1 << 22); /* Set M bit */
 
 204  * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 
 205  * @qp: a pointer to the QP
 
 206  * @ohdr: a pointer to the IB header being constructed
 
 207  * @pmtu: the path MTU
 
 208  * @bth0p: pointer to the BTH opcode word
 
 209  * @bth2p: pointer to the BTH PSN word
 
 211  * Return 1 if constructed; otherwise, return 0.
 
 212  * Note the QP s_lock must be held and interrupts disabled.
 
 /*
  * Build the next packet to transmit on an RC QP.  A pending response is
  * given priority (via ipath_make_rc_ack()); otherwise the function works on
  * the send work queue entry at qp->s_cur, using qp->s_state to decide
  * whether to start or resend a request or to continue a multi-packet one.
  * The visible code records the header length (qp->s_hdrwords), the payload
  * length (qp->s_cur_size) and the opcode word (*bth0p) for the caller.
  *
  * NOTE(review): this listing omits a number of source lines (local
  * declarations, some case labels and conditions, wrap-around assignments
  * and the exit paths); the comments below describe only what is visible.
  */
 214 int ipath_make_rc_req(struct ipath_qp *qp,
 
 215                       struct ipath_other_headers *ohdr,
 
 216                       u32 pmtu, u32 *bth0p, u32 *bth2p)
 
 218         struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
 
 219         struct ipath_sge_state *ss;
 
 220         struct ipath_swqe *wqe;
 
 227         /* Sending responses has higher priority over sending requests. */
 
 228         if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
 
 229              (qp->s_flags & IPATH_S_ACK_PENDING) ||
 
 230              qp->s_ack_state != OP(ACKNOWLEDGE)) &&
 
 231             ipath_make_rc_ack(qp, ohdr, pmtu, bth0p, bth2p))
 
             /*
              * Requests need a QP state that permits send processing, and
              * neither an RNR timeout nor a credit wait may be in progress.
              */
 234         if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
 
 235             qp->s_rnr_timeout || qp->s_wait_credit)
 
 238         /* Limit the number of packets sent without an ACK. */
 
 239         if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT) > 0) {
 
                     /* Stall sending; update_last_psn() clears this flag and reschedules the tasklet. */
 240                 qp->s_wait_credit = 1;
 
 245         /* header size in 32-bit words LRH+BTH = (8+12)/4. */
 
 247         bth0 = 1 << 22; /* Set M bit */
 
 249         /* Send a request. */
 
 250         wqe = get_swqe_ptr(qp, qp->s_cur);
 
 251         switch (qp->s_state) {
 
 254                  * Resend an old request or start a new one.
 
 256                  * We keep track of the current SWQE so that
 
 257                  * we don't reset the "furthest progress" state
 
 258                  * if we need to back up.
 
 261                 if (qp->s_cur == qp->s_tail) {
 
 262                         /* Check if send work queue is empty. */
 
 263                         if (qp->s_tail == qp->s_head)
 
 266                          * If a fence is requested, wait for previous
 
 267                          * RDMA read and atomic operations to finish.
 
 269                         if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
 
 270                             qp->s_num_rd_atomic) {
 
 271                                 qp->s_flags |= IPATH_S_FENCE_PENDING;
 
                             /* A request taken on this path starts at the next unused PSN. */
 274                         wqe->psn = qp->s_next_psn;
 
 278                  * Note that we have to be careful not to modify the
 
 279                  * original work request since we may need to resend
 
 285                 switch (wqe->wr.opcode) {
 
 287                 case IB_WR_SEND_WITH_IMM:
 
 288                         /* If no credit, return. */
 
                             /* NOTE(review): s_lsn == (u32) -1 appears to mean the credit limit is not enforced - confirm. */
 289                         if (qp->s_lsn != (u32) -1 &&
 
 290                             ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
 
                             /* lpsn is the PSN of the request's last packet: psn plus (len - 1) / pmtu when it spans several. */
 292                         wqe->lpsn = wqe->psn;
 
 294                                 wqe->lpsn += (len - 1) / pmtu;
 
 295                                 qp->s_state = OP(SEND_FIRST);
 
 299                         if (wqe->wr.opcode == IB_WR_SEND)
 
 300                                 qp->s_state = OP(SEND_ONLY);
 
 302                                 qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
 
 303                                 /* Immediate data comes after the BTH */
 
 304                                 ohdr->u.imm_data = wqe->wr.imm_data;
 
 307                         if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 
 309                         bth2 = 1 << 31; /* Request ACK. */
 
                             /* Advance to the next send queue entry (the wrap assignment is not shown in this listing). */
 310                         if (++qp->s_cur == qp->s_size)
 
 314                 case IB_WR_RDMA_WRITE:
 
 315                         if (newreq && qp->s_lsn != (u32) -1)
 
 318                 case IB_WR_RDMA_WRITE_WITH_IMM:
 
 319                         /* If no credit, return. */
 
 320                         if (qp->s_lsn != (u32) -1 &&
 
 321                             ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
 
                             /* RETH: remote address, rkey and total length, each converted to big-endian. */
 323                         ohdr->u.rc.reth.vaddr =
 
 324                                 cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
 
 325                         ohdr->u.rc.reth.rkey =
 
 326                                 cpu_to_be32(wqe->wr.wr.rdma.rkey);
 
 327                         ohdr->u.rc.reth.length = cpu_to_be32(len);
 
 328                         hwords += sizeof(struct ib_reth) / sizeof(u32);
 
 329                         wqe->lpsn = wqe->psn;
 
 331                                 wqe->lpsn += (len - 1) / pmtu;
 
 332                                 qp->s_state = OP(RDMA_WRITE_FIRST);
 
 336                         if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
 
 337                                 qp->s_state = OP(RDMA_WRITE_ONLY);
 
 340                                         OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
 
 341                                 /* Immediate data comes after RETH */
 
 342                                 ohdr->u.rc.imm_data = wqe->wr.imm_data;
 
 344                                 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 
 347                         bth2 = 1 << 31; /* Request ACK. */
 
 348                         if (++qp->s_cur == qp->s_size)
 
 352                 case IB_WR_RDMA_READ:
 
 354                          * Don't allow more operations to be started
 
 355                          * than the QP limits allow.
 
 358                                 if (qp->s_num_rd_atomic >=
 
 359                                     qp->s_max_rd_atomic) {
 
 360                                         qp->s_flags |= IPATH_S_RDMAR_PENDING;
 
                                     /* Count this operation against the outstanding RDMA read/atomic limit. */
 363                                 qp->s_num_rd_atomic++;
 
 364                                 if (qp->s_lsn != (u32) -1)
 
 367                                  * Adjust s_next_psn to count the
 
 368                                  * expected number of responses.
 
 371                                         qp->s_next_psn += (len - 1) / pmtu;
 
                                     /* lpsn takes the last response PSN; s_next_psn then moves past it. */
 372                                 wqe->lpsn = qp->s_next_psn++;
 
 374                         ohdr->u.rc.reth.vaddr =
 
 375                                 cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
 
 376                         ohdr->u.rc.reth.rkey =
 
 377                                 cpu_to_be32(wqe->wr.wr.rdma.rkey);
 
 378                         ohdr->u.rc.reth.length = cpu_to_be32(len);
 
 379                         qp->s_state = OP(RDMA_READ_REQUEST);
 
 380                         hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
 
 383                         if (++qp->s_cur == qp->s_size)
 
 387                 case IB_WR_ATOMIC_CMP_AND_SWP:
 
 388                 case IB_WR_ATOMIC_FETCH_AND_ADD:
 
 390                          * Don't allow more operations to be started
 
 391                          * than the QP limits allow.
 
 394                                 if (qp->s_num_rd_atomic >=
 
 395                                     qp->s_max_rd_atomic) {
 
 396                                         qp->s_flags |= IPATH_S_RDMAR_PENDING;
 
 399                                 qp->s_num_rd_atomic++;
 
 400                                 if (qp->s_lsn != (u32) -1)
 
                                     /* An atomic is a single packet, so its last PSN equals its first. */
 402                                 wqe->lpsn = wqe->psn;
 
                             /*
                              * Compare-and-swap sends both operands; fetch-and-add
                              * puts the addend in swap_data and zeroes compare_data.
                              */
 404                         if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
 
 405                                 qp->s_state = OP(COMPARE_SWAP);
 
 406                                 ohdr->u.atomic_eth.swap_data = cpu_to_be64(
 
 407                                         wqe->wr.wr.atomic.swap);
 
 408                                 ohdr->u.atomic_eth.compare_data = cpu_to_be64(
 
 409                                         wqe->wr.wr.atomic.compare_add);
 
 411                                 qp->s_state = OP(FETCH_ADD);
 
 412                                 ohdr->u.atomic_eth.swap_data = cpu_to_be64(
 
 413                                         wqe->wr.wr.atomic.compare_add);
 
 414                                 ohdr->u.atomic_eth.compare_data = 0;
 
                             /* The 64-bit remote address is written as two big-endian words, upper 32 bits first. */
 416                         ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
 
 417                                 wqe->wr.wr.atomic.remote_addr >> 32);
 
 418                         ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
 
 419                                 wqe->wr.wr.atomic.remote_addr);
 
 420                         ohdr->u.atomic_eth.rkey = cpu_to_be32(
 
 421                                 wqe->wr.wr.atomic.rkey);
 
 422                         hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
 
 425                         if (++qp->s_cur == qp->s_size)
 
                     /* Load the send SGE state and length from the start of the request. */
 432                 qp->s_sge.sge = wqe->sg_list[0];
 
 433                 qp->s_sge.sg_list = wqe->sg_list + 1;
 
 434                 qp->s_sge.num_sge = wqe->wr.num_sge;
 
 435                 qp->s_len = wqe->length;
 
 438                         if (qp->s_tail >= qp->s_size)
 
                     /* Masked PSN goes in the low bits of BTH word 2; bit 31, if set above, requests an ACK. */
 441                 bth2 |= qp->s_psn & IPATH_PSN_MASK;
 
                     /* After an RDMA read the next PSN follows the read's last PSN. */
 442                 if (wqe->wr.opcode == IB_WR_RDMA_READ)
 
 443                         qp->s_psn = wqe->lpsn + 1;
 
 446                         if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
 
 447                                 qp->s_next_psn = qp->s_psn;
 
 450                  * Put the QP on the pending list so lost ACKs will cause
 
 451                  * a retry.  More than one request can be pending so the
 
 452                  * QP may already be on the dev->pending list.
 
 454                 spin_lock(&dev->pending_lock);
 
 455                 if (list_empty(&qp->timerwait))
 
 456                         list_add_tail(&qp->timerwait,
 
 457                                       &dev->pending[dev->pending_index]);
 
 458                 spin_unlock(&dev->pending_lock);
 
             /* reset_psn() sets this state when a SEND has to resume in mid-message. */
 461         case OP(RDMA_READ_RESPONSE_FIRST):
 
 463                  * This case can only happen if a send is restarted.
 
 464                  * See ipath_restart_rc().
 
 466                 ipath_init_restart(qp, wqe);
 
 469                 qp->s_state = OP(SEND_MIDDLE);
 
 471         case OP(SEND_MIDDLE):
 
                     /* Use one PSN for this packet and extend s_next_psn if s_psn moved past it. */
 472                 bth2 = qp->s_psn++ & IPATH_PSN_MASK;
 
 473                 if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
 
 474                         qp->s_next_psn = qp->s_psn;
 
 481                 if (wqe->wr.opcode == IB_WR_SEND)
 
 482                         qp->s_state = OP(SEND_LAST);
 
 484                         qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
 
 485                         /* Immediate data comes after the BTH */
 
 486                         ohdr->u.imm_data = wqe->wr.imm_data;
 
 489                 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 
 491                 bth2 |= 1 << 31;        /* Request ACK. */
 
 493                 if (qp->s_cur >= qp->s_size)
 
             /* reset_psn() sets this state when an RDMA write has to resume in mid-message. */
 497         case OP(RDMA_READ_RESPONSE_LAST):
 
 499                  * This case can only happen if a RDMA write is restarted.
 
 500                  * See ipath_restart_rc().
 
 502                 ipath_init_restart(qp, wqe);
 
 504         case OP(RDMA_WRITE_FIRST):
 
 505                 qp->s_state = OP(RDMA_WRITE_MIDDLE);
 
 507         case OP(RDMA_WRITE_MIDDLE):
 
 508                 bth2 = qp->s_psn++ & IPATH_PSN_MASK;
 
 509                 if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
 
 510                         qp->s_next_psn = qp->s_psn;
 
 517                 if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
 
 518                         qp->s_state = OP(RDMA_WRITE_LAST);
 
 520                         qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
 
 521                         /* Immediate data comes after the BTH */
 
 522                         ohdr->u.imm_data = wqe->wr.imm_data;
 
 524                         if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 
 527                 bth2 |= 1 << 31;        /* Request ACK. */
 
 529                 if (qp->s_cur >= qp->s_size)
 
             /* reset_psn() sets this state when an RDMA read has to be re-issued part-way through. */
 533         case OP(RDMA_READ_RESPONSE_MIDDLE):
 
 535                  * This case can only happen if a RDMA read is restarted.
 
 536                  * See ipath_restart_rc().
 
 538                 ipath_init_restart(qp, wqe);
 
                     /*
                      * Byte offset implied by how far s_psn is past the request's
                      * first PSN.  The re-issued read starts at remote_addr + len
                      * and asks for the qp->s_len bytes set by ipath_init_restart().
                      */
 539                 len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
 
 540                 ohdr->u.rc.reth.vaddr =
 
 541                         cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
 
 542                 ohdr->u.rc.reth.rkey =
 
 543                         cpu_to_be32(wqe->wr.wr.rdma.rkey);
 
 544                 ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
 
 545                 qp->s_state = OP(RDMA_READ_REQUEST);
 
 546                 hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
 
 547                 bth2 = qp->s_psn++ & IPATH_PSN_MASK;
 
 548                 if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
 
 549                         qp->s_next_psn = qp->s_psn;
 
 553                 if (qp->s_cur == qp->s_size)
 
             /* Also request an ACK once s_psn reaches s_last_psn + IPATH_PSN_CREDIT - 1. */
 557         if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
 
 558                 bth2 |= 1 << 31;        /* Request ACK. */
 
             /* Record the header length (32-bit words) and payload length of the packet just built. */
 560         qp->s_hdrwords = hwords;
 
 562         qp->s_cur_size = len;
 
             /* The opcode (current s_state) occupies the top byte of BTH word 0. */
 563         *bth0p = bth0 | (qp->s_state << 24);
 
 573  * send_rc_ack - Construct an ACK packet and send it
 
 574  * @qp: a pointer to the QP
 
 576  * This is called from ipath_rc_rcv() and only uses the receive
 
 578  * Note that RDMA reads and atomics are handled in the
 
 579  * send side QP state and tasklet.
 
 /*
  * Build an ACK/NAK header on the stack and try to send it immediately with
  * ipath_verbs_send().  If that call does not return 0, the ACK is handed to
  * the send side instead: IPATH_S_ACK_PENDING is set, the receive-side NAK
  * state and ACK PSN are copied to their send-side fields under qp->s_lock,
  * and the send tasklet is scheduled.
  *
  * NOTE(review): some source lines (local declarations, early returns and
  * a few conditions) are not present in this listing.
  */
 581 static void send_rc_ack(struct ipath_qp *qp)
 
 583         struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
 
 587         struct ipath_ib_header hdr;
 
 588         struct ipath_other_headers *ohdr;
 
 591         /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
 
 592         if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
 
 593             (qp->s_flags & IPATH_S_ACK_PENDING) ||
 
 594             qp->s_ack_state != OP(ACKNOWLEDGE))
 
 597         /* Construct the header. */
 
 599         lrh0 = IPATH_LRH_BTH;
 
 600         /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
 
             /* With a global route the GRH is built too and its size is added to hwords. */
 602         if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
 
 603                 hwords += ipath_make_grh(dev, &hdr.u.l.grh,
 
 604                                          &qp->remote_ah_attr.grh,
 
 607                 lrh0 = IPATH_LRH_GRH;
 
 609         /* read pkey_index w/o lock (it's atomic) */
 
             /* BTH word 0: pkey, ACKNOWLEDGE opcode in the top byte, and bit 22 set. */
 610         bth0 = ipath_get_pkey(dev->dd, qp->s_pkey_index) |
 
 611                 (OP(ACKNOWLEDGE) << 24) | (1 << 22);
 
 613                 ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
 
 615                                              IPATH_AETH_CREDIT_SHIFT));
 
 617                 ohdr->u.aeth = ipath_compute_aeth(qp);
 
             /* The service level is shifted into place in LRH word 0. */
 618         lrh0 |= qp->remote_ah_attr.sl << 4;
 
 619         hdr.lrh[0] = cpu_to_be16(lrh0);
 
 620         hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
 
             /* The length field counts the header words plus SIZE_OF_CRC. */
 621         hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
 
 622         hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid);
 
 623         ohdr->bth[0] = cpu_to_be32(bth0);
 
 624         ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
 
 625         ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);
 
 628          * If we can send the ACK, clear the ACK state.
 
 630         if (ipath_verbs_send(dev->dd, hwords, (u32 *) &hdr, 0, NULL) == 0) {
 
 631                 dev->n_unicast_xmit++;
 
 636          * We are out of PIO buffers at the moment.
 
 637          * Pass responsibility for sending the ACK to the
 
 638          * send tasklet so that when a PIO buffer becomes
 
 639          * available, the ACK is sent ahead of other outgoing
 
             /* Hand the ACK to the send side: copy receive-side ACK state under s_lock. */
 645         spin_lock_irqsave(&qp->s_lock, flags);
 
 646         qp->s_flags |= IPATH_S_ACK_PENDING;
 
 647         qp->s_nak_state = qp->r_nak_state;
 
 648         qp->s_ack_psn = qp->r_ack_psn;
 
 649         spin_unlock_irqrestore(&qp->s_lock, flags);
 
 651         /* Call ipath_do_rc_send() in another thread. */
 
 652         tasklet_hi_schedule(&qp->s_task);
 
 659  * reset_psn - reset the QP state to send starting from PSN
 
 661  * @psn: the packet sequence number to restart at
 
 663  * This is called from ipath_rc_rcv() to process an incoming RC ACK
 
 665  * Called at interrupt level with the QP s_lock held.
 
 /*
  * Choose the send state needed to resume transmission at psn.  If psn is
  * at or before the start of a work request, s_state is set to
  * OP(SEND_LAST) so the normal send code starts that request from the
  * beginning.  Otherwise the opcode of the request containing psn selects
  * one of the RDMA_READ_RESPONSE_* values, which ipath_make_rc_req() handles
  * in its "restarted" cases.
  *
  * NOTE(review): local declarations, the loop header and the exit paths are
  * not present in this listing.
  */
 667 static void reset_psn(struct ipath_qp *qp, u32 psn)
 
 670         struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
 
 676          * If we are starting the request from the beginning,
 
 677          * let the normal send code handle initialization.
 
 679         if (ipath_cmp24(psn, wqe->psn) <= 0) {
 
 680                 qp->s_state = OP(SEND_LAST);
 
 684         /* Find the work request opcode corresponding to the given PSN. */
 
 685         opcode = wqe->wr.opcode;
 
                     /* Step to the next send queue slot (presumably wrapping at qp->s_size; the assignment is not shown). */
 689                 if (++n == qp->s_size)
 
 693                 wqe = get_swqe_ptr(qp, n);
 
                     /* Compare psn with the first PSN of this request. */
 694                 diff = ipath_cmp24(psn, wqe->psn);
 
 699                  * If we are starting the request from the beginning,
 
 700                  * let the normal send code handle initialization.
 
 703                         qp->s_state = OP(SEND_LAST);
 
 706                 opcode = wqe->wr.opcode;
 
 710          * Set the state to restart in the middle of a request.
 
 711          * Don't change the s_sge, s_cur_sge, or s_cur_size.
 
 712          * See ipath_do_rc_send().
 
             /* The response opcodes below serve as restart markers for ipath_make_rc_req(). */
 716         case IB_WR_SEND_WITH_IMM:
 
 717                 qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
 
 720         case IB_WR_RDMA_WRITE:
 
 721         case IB_WR_RDMA_WRITE_WITH_IMM:
 
 722                 qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
 
 725         case IB_WR_RDMA_READ:
 
 726                 qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
 
 731                  * This case shouldn't happen since its only
 
 734                 qp->s_state = OP(SEND_LAST);
 
 741  * ipath_restart_rc - back up requester to resend the last un-ACKed request
 
 742  * @qp: the QP to restart
 
 743  * @psn: packet sequence number for the request
 
 744  * @wc: the work completion request
 
 746  * The QP s_lock should be held and interrupts disabled.
 
 /*
  * Back up the requester so the request at qp->s_last is sent again from
  * psn.  When qp->s_retry has reached 0, the caller's work completion is
  * filled in with IB_WC_RETRY_EXC_ERR and passed to ipath_sqerror_qp().  The
  * visible remainder takes the QP off the timeout list, updates a resend
  * counter and schedules the send tasklet.
  *
  * NOTE(review): some lines (remaining wc fields, the retry decrement, the
  * call that resets the PSN and the exit paths) are not in this listing.
  */
 748 void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
 
 750         struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
 
 751         struct ipath_ibdev *dev;
 
             /* No retries left: report the oldest outstanding request as failed. */
 753         if (qp->s_retry == 0) {
 
 754                 wc->wr_id = wqe->wr.wr_id;
 
 755                 wc->status = IB_WC_RETRY_EXC_ERR;
 
 756                 wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
 
 760                 wc->src_qp = qp->remote_qpn;
 
                     /* slid and sl are taken from the QP's remote address attributes. */
 762                 wc->slid = qp->remote_ah_attr.dlid;
 
 763                 wc->sl = qp->remote_ah_attr.sl;
 
 764                 wc->dlid_path_bits = 0;
 
 766                 ipath_sqerror_qp(qp, wc);
 
 772          * Remove the QP from the timeout queue.
 
 773          * Note: it may already have been removed by ipath_ib_timer().
 
 775         dev = to_idev(qp->ibqp.device);
 
 776         spin_lock(&dev->pending_lock);
 
 777         if (!list_empty(&qp->timerwait))
 
 778                 list_del_init(&qp->timerwait);
 
 779         spin_unlock(&dev->pending_lock);
 
 781         if (wqe->wr.opcode == IB_WR_RDMA_READ)
 
                     /* Add the PSN distance from psn to s_psn (masked with IPATH_PSN_MASK) to the resend counter. */
 784                 dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;
 
             /* Let the send tasklet carry out the resend. */
 787         tasklet_hi_schedule(&qp->s_task);
 
 /*
  * Store psn in qp->s_last_psn.  If sending was stalled by s_wait_credit
  * (set in ipath_make_rc_req() when s_psn runs too far ahead of s_last_psn),
  * clear the flag and schedule the send tasklet so sending can continue.
  */
 793 static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
 
 795         if (qp->s_wait_credit) {
 
 796                 qp->s_wait_credit = 0;
 
 797                 tasklet_hi_schedule(&qp->s_task);
 
 799         qp->s_last_psn = psn;
 
 803  * do_rc_ack - process an incoming RC ACK
 
 804  * @qp: the QP the ACK came in on
 
 805  * @psn: the packet sequence number of the ACK
 
 806  * @opcode: the opcode of the request that resulted in the ACK
 
 808  * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
 
 810  * Called at interrupt level with the QP s_lock held and interrupts disabled.
 
 811  * Returns 1 if OK, 0 if current operation should be aborted (NAK).
 
 813 static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
 
 816         struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
 
 818         struct ipath_swqe *wqe;
 
 824          * Remove the QP from the timeout queue (or RNR timeout queue).
 
 825          * If ipath_ib_timer() has already removed it,
 
 826          * it's OK since we hold the QP s_lock and ipath_restart_rc()
 
 827          * just won't find anything to restart if we ACK everything.
 
 829         spin_lock(&dev->pending_lock);
 
 830         if (!list_empty(&qp->timerwait))
 
 831                 list_del_init(&qp->timerwait);
 
 832         spin_unlock(&dev->pending_lock);
 
 835          * Note that NAKs implicitly ACK outstanding SEND and RDMA write
 
 836          * requests and implicitly NAK RDMA read and atomic requests issued
 
 837          * before the NAK'ed request.  The MSN won't include the NAK'ed
 
 838          * request but will include an ACK'ed request(s).
 
 843         wqe = get_swqe_ptr(qp, qp->s_last);
 
 846          * The MSN might be for a later WQE than the PSN indicates so
 
 847          * only complete WQEs that the PSN finishes.
 
 849         while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {
 
 851                  * RDMA_READ_RESPONSE_ONLY is a special case since
 
 852                  * we want to generate completion events for everything
 
 853                  * before the RDMA read, copy the data, then generate
 
 854                  * the completion for the read.
 
 856                 if (wqe->wr.opcode == IB_WR_RDMA_READ &&
 
 857                     opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
 
 863                  * If this request is a RDMA read or atomic, and the ACK is
 
 864                  * for a later operation, this ACK NAKs the RDMA read or
 
 865                  * atomic.  In other words, only a RDMA_READ_LAST or ONLY
 
 866                  * can ACK a RDMA read and likewise for atomic ops.  Note
 
 867                  * that the NAK case can only happen if relaxed ordering is
 
 868                  * used and requests are sent after an RDMA read or atomic
 
 869                  * is sent but before the response is received.
 
 871                 if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
 
 872                      (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
 
 873                     ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
 
 874                       wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
 
 875                      (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
 
 877                          * The last valid PSN seen is the previous
 
 880                         update_last_psn(qp, wqe->psn - 1);
 
 881                         /* Retry this request. */
 
 882                         ipath_restart_rc(qp, wqe->psn, &wc);
 
 884                          * No need to process the ACK/NAK since we are
 
 885                          * restarting an earlier request.
 
 889                 if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
 
 890                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
 
 891                         *(u64 *) wqe->sg_list[0].vaddr = val;
 
 892                 if (qp->s_num_rd_atomic &&
 
 893                     (wqe->wr.opcode == IB_WR_RDMA_READ ||
 
 894                      wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
 
 895                      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
 
 896                         qp->s_num_rd_atomic--;
 
 897                         /* Restart sending task if fence is complete */
 
 898                         if ((qp->s_flags & IPATH_S_FENCE_PENDING) &&
 
 899                             !qp->s_num_rd_atomic) {
 
 900                                 qp->s_flags &= ~IPATH_S_FENCE_PENDING;
 
 901                                 tasklet_hi_schedule(&qp->s_task);
 
 902                         } else if (qp->s_flags & IPATH_S_RDMAR_PENDING) {
 
 903                                 qp->s_flags &= ~IPATH_S_RDMAR_PENDING;
 
 904                                 tasklet_hi_schedule(&qp->s_task);
 
 907                 /* Post a send completion queue entry if requested. */
 
 908                 if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
 
 909                     (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
 
 910                         wc.wr_id = wqe->wr.wr_id;
 
 911                         wc.status = IB_WC_SUCCESS;
 
 912                         wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
 
 914                         wc.byte_len = wqe->length;
 
 917                         wc.src_qp = qp->remote_qpn;
 
 920                         wc.slid = qp->remote_ah_attr.dlid;
 
 921                         wc.sl = qp->remote_ah_attr.sl;
 
 922                         wc.dlid_path_bits = 0;
 
 924                         ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
 
 926                 qp->s_retry = qp->s_retry_cnt;
 
 928                  * If we are completing a request which is in the process of
 
 929                  * being resent, we can stop resending it since we know the
 
 930                  * responder has already seen it.
 
 932                 if (qp->s_last == qp->s_cur) {
 
 933                         if (++qp->s_cur >= qp->s_size)
 
 935                         qp->s_last = qp->s_cur;
 
 936                         if (qp->s_last == qp->s_tail)
 
 938                         wqe = get_swqe_ptr(qp, qp->s_cur);
 
 939                         qp->s_state = OP(SEND_LAST);
 
 940                         qp->s_psn = wqe->psn;
 
 942                         if (++qp->s_last >= qp->s_size)
 
 944                         if (qp->s_last == qp->s_tail)
 
 946                         wqe = get_swqe_ptr(qp, qp->s_last);
 
 950         switch (aeth >> 29) {
 
 953                 /* If this is a partial ACK, reset the retransmit timer. */
 
 954                 if (qp->s_last != qp->s_tail) {
 
 955                         spin_lock(&dev->pending_lock);
 
 956                         list_add_tail(&qp->timerwait,
 
 957                                       &dev->pending[dev->pending_index]);
 
 958                         spin_unlock(&dev->pending_lock);
 
 960                          * If we get a partial ACK for a resent operation,
 
 961                          * we can stop resending the earlier packets and
 
 962                          * continue with the next packet the receiver wants.
 
 964                         if (ipath_cmp24(qp->s_psn, psn) <= 0) {
 
 965                                 reset_psn(qp, psn + 1);
 
 966                                 tasklet_hi_schedule(&qp->s_task);
 
 968                 } else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
 
 969                         qp->s_state = OP(SEND_LAST);
 
 972                 ipath_get_credit(qp, aeth);
 
 973                 qp->s_rnr_retry = qp->s_rnr_retry_cnt;
 
 974                 qp->s_retry = qp->s_retry_cnt;
 
 975                 update_last_psn(qp, psn);
 
 979         case 1:         /* RNR NAK */
 
 981                 if (qp->s_last == qp->s_tail)
 
 983                 if (qp->s_rnr_retry == 0) {
 
 984                         wc.status = IB_WC_RNR_RETRY_EXC_ERR;
 
 987                 if (qp->s_rnr_retry_cnt < 7)
 
 990                 /* The last valid PSN is the previous PSN. */
 
 991                 update_last_psn(qp, psn - 1);
 
 993                 if (wqe->wr.opcode == IB_WR_RDMA_READ)
 
 997                                 (qp->s_psn - psn) & IPATH_PSN_MASK;
 
1002                         ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
 
1003                                            IPATH_AETH_CREDIT_MASK];
 
1004                 ipath_insert_rnr_queue(qp);
 
1008                 if (qp->s_last == qp->s_tail)
 
1010                 /* The last valid PSN is the previous PSN. */
 
1011                 update_last_psn(qp, psn - 1);
 
1012                 switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
 
1013                         IPATH_AETH_CREDIT_MASK) {
 
1014                 case 0: /* PSN sequence error */
 
1017                          * Back up to the responder's expected PSN.
 
1018                          * Note that we might get a NAK in the middle of an
 
1019                          * RDMA READ response which terminates the RDMA
 
1022                         ipath_restart_rc(qp, psn, &wc);
 
1025                 case 1: /* Invalid Request */
 
1026                         wc.status = IB_WC_REM_INV_REQ_ERR;
 
1027                         dev->n_other_naks++;
 
1030                 case 2: /* Remote Access Error */
 
1031                         wc.status = IB_WC_REM_ACCESS_ERR;
 
1032                         dev->n_other_naks++;
 
1035                 case 3: /* Remote Operation Error */
 
1036                         wc.status = IB_WC_REM_OP_ERR;
 
1037                         dev->n_other_naks++;
 
1039                         wc.wr_id = wqe->wr.wr_id;
 
1040                         wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
 
1044                         wc.src_qp = qp->remote_qpn;
 
1046                         wc.slid = qp->remote_ah_attr.dlid;
 
1047                         wc.sl = qp->remote_ah_attr.sl;
 
1048                         wc.dlid_path_bits = 0;
 
1050                         ipath_sqerror_qp(qp, &wc);
 
1054                         /* Ignore other reserved NAK error codes */
 
1057                 qp->s_rnr_retry = qp->s_rnr_retry_cnt;
 
1060         default:                /* 2: reserved */
 
1062                 /* Ignore reserved NAK codes. */
 
/*
 * Handles RC response packets (ACKNOWLEDGE, ATOMIC_ACKNOWLEDGE and the
 * four RDMA_READ_RESPONSE opcodes) for a QP acting as requester.
 * NOTE(review): the embedded line numbers in this listing skip (for
 * example 1091 and 1093-1094), so part of the parameter list, several
 * local declarations, the labels and most branch targets are not shown.
 */
1071  * ipath_rc_rcv_resp - process an incoming RC response packet
 
1072  * @dev: the device this packet came in on
 
1073  * @ohdr: the other headers for this packet
 
1074  * @data: the packet data
 
1075  * @tlen: the packet length
 
1076  * @qp: the QP for this packet
 
1077  * @opcode: the opcode for this packet
 
1078  * @psn: the packet sequence number for this packet
 
1079  * @hdrsize: the header length
 
1080  * @pmtu: the path MTU
 
1081  * @header_in_data: true if part of the header data is in the data buffer
 
1083  * This is called from ipath_rc_rcv() to process an incoming RC response
 
1084  * packet for the given QP.
 
1085  * Called at interrupt level.
 
1087 static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
 
1088                                      struct ipath_other_headers *ohdr,
 
1089                                      void *data, u32 tlen,
 
1090                                      struct ipath_qp *qp,
 
1092                                      u32 psn, u32 hdrsize, u32 pmtu,
 
1095         struct ipath_swqe *wqe;
 
1096         unsigned long flags;
 
             /* The checks below run under qp->s_lock; the interrupt state is saved in flags. */
1103         spin_lock_irqsave(&qp->s_lock, flags);
 
1105         /* Ignore invalid responses. */
 
             /* Invalid here means the PSN compares at or beyond qp->s_next_psn. */
1106         if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
 
1109         /* Ignore duplicate responses. */
 
             /* diff <= 0 means the PSN is not newer than qp->s_last_psn. */
1110         diff = ipath_cmp24(psn, qp->s_last_psn);
 
1111         if (unlikely(diff <= 0)) {
 
1112                 /* Update credits for "ghost" ACKs */
 
                     /*
                      * Only a plain ACKNOWLEDGE whose PSN equals s_last_psn is
                      * examined.  Its AETH is read from the header, or from the
                      * start of the data buffer when header_in_data is set.
                      */
1113                 if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
 
1114                         if (!header_in_data)
 
1115                                 aeth = be32_to_cpu(ohdr->u.aeth);
 
1117                                 aeth = be32_to_cpu(((__be32 *) data)[0]);
 
1118                                 data += sizeof(__be32);
 
                             /* Credits are only taken when aeth >> 29 is zero. */
1120                         if ((aeth >> 29) == 0)
 
1121                                 ipath_get_credit(qp, aeth);
 
             /*
              * NOTE(review): presumably s_last == s_tail means no send request
              * is outstanding; the statement executed in that case is not
              * shown in this listing.
              */
1126         if (unlikely(qp->s_last == qp->s_tail))
 
             /* wqe is the send work request at index qp->s_last. */
1128         wqe = get_swqe_ptr(qp, qp->s_last);
 
             /* These three opcodes share one path, which starts by reading the AETH. */
1131         case OP(ACKNOWLEDGE):
 
1132         case OP(ATOMIC_ACKNOWLEDGE):
 
1133         case OP(RDMA_READ_RESPONSE_FIRST):
 
1134                 if (!header_in_data)
 
1135                         aeth = be32_to_cpu(ohdr->u.aeth);
 
1137                         aeth = be32_to_cpu(((__be32 *) data)[0]);
 
1138                         data += sizeof(__be32);
 
                     /*
                      * An atomic ACK also supplies a 64-bit value: it is assembled
                      * from big-endian header words, or read big-endian from the
                      * data buffer.
                      */
1140                 if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
 
1141                         if (!header_in_data) {
 
1142                                 __be32 *p = ohdr->u.at.atomic_ack_eth;
 
1144                                 val = ((u64) be32_to_cpu(p[0]) << 32) |
 
1147                                 val = be64_to_cpu(((__be64 *) data)[0]);
 
                     /*
                      * The statement under this test (not shown) is taken when
                      * do_rc_ack() returns zero or when the opcode is not
                      * RDMA_READ_RESPONSE_FIRST.
                      */
1150                 if (!do_rc_ack(qp, aeth, psn, opcode, val) ||
 
1151                     opcode != OP(RDMA_READ_RESPONSE_FIRST))
 
                     /* Re-read wqe at qp->s_last after do_rc_ack() and require an RDMA read. */
1154                 wqe = get_swqe_ptr(qp, qp->s_last);
 
1155                 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
 
1158                  * If this is a response to a resent RDMA read, we
 
1159                  * have to be careful to copy the data to the right
 
1162                 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
 
1166         case OP(RDMA_READ_RESPONSE_MIDDLE):
 
1167                 /* no AETH, no ACK */
 
                     /*
                      * A middle response is expected to carry exactly
                      * s_last_psn + 1; otherwise ipath_restart_rc() is called
                      * with that PSN.
                      */
1168                 if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
 
1170                         ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
 
1173                 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
 
                     /*
                      * The packet length must be exactly hdrsize + pmtu + 4, and
                      * more than pmtu bytes must still be expected.
                      * NOTE(review): the 4 presumably covers the trailing CRC;
                      * TODO confirm.
                      */
1176                 if (unlikely(tlen != (hdrsize + pmtu + 4)))
 
1178                 if (unlikely(pmtu >= qp->s_rdma_read_len))
 
1181                 /* We got a response so update the timeout. */
 
                     /* The timer list is protected by dev->pending_lock, nested inside s_lock. */
1182                 spin_lock(&dev->pending_lock);
 
1183                 if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
 
1184                         list_move_tail(&qp->timerwait,
 
1185                                        &dev->pending[dev->pending_index]);
 
1186                 spin_unlock(&dev->pending_lock);
 
1188                  * Update the RDMA receive state but do the copy w/o
 
1189                  * holding the locks and blocking interrupts.
 
1191                 qp->s_rdma_read_len -= pmtu;
 
1192                 update_last_psn(qp, psn);
 
1193                 spin_unlock_irqrestore(&qp->s_lock, flags);
 
1194                 ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
 
1197         case OP(RDMA_READ_RESPONSE_ONLY):
 
1198                 if (!header_in_data)
 
1199                         aeth = be32_to_cpu(ohdr->u.aeth);
 
                     /*
                      * NOTE(review): unlike the FIRST and LAST cases, data is not
                      * advanced past the AETH here when header_in_data is set;
                      * confirm this is intended.
                      */
1201                         aeth = be32_to_cpu(((__be32 *) data)[0]);
 
1202                 if (!do_rc_ack(qp, aeth, psn, opcode, 0))
 
1204                 /* Get the number of bytes the message was padded by. */
 
                     /* The pad count is bits 20-21 of the first BTH word. */
1205                 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
 
1207                  * Check that the data size is >= 0 && <= pmtu.
 
1208                  * Remember to account for the AETH header (4) and
 
1211                 if (unlikely(tlen < (hdrsize + pad + 8)))
 
1214                  * If this is a response to a resent RDMA read, we
 
1215                  * have to be careful to copy the data to the right
 
1218                 wqe = get_swqe_ptr(qp, qp->s_last);
 
1219                 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
 
1223         case OP(RDMA_READ_RESPONSE_LAST):
 
1224                 /* ACKs READ req. */
 
1225                 if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
 
1227                         ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
 
1230                 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
 
1232                 /* Get the number of bytes the message was padded by. */
 
1233                 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
 
1235                  * Check that the data size is >= 1 && <= pmtu.
 
1236                  * Remember to account for the AETH header (4) and
 
1239                 if (unlikely(tlen <= (hdrsize + pad + 8)))
 
                     /*
                      * After this subtraction tlen is the payload byte count,
                      * which must equal the remaining read length.
                      */
1242                 tlen -= hdrsize + pad + 8;
 
1243                 if (unlikely(tlen != qp->s_rdma_read_len))
 
1245                 if (!header_in_data)
 
1246                         aeth = be32_to_cpu(ohdr->u.aeth);
 
1248                         aeth = be32_to_cpu(((__be32 *) data)[0]);
 
1249                         data += sizeof(__be32);
 
                     /* In this case the copy is done before do_rc_ack() and with s_lock still held. */
1251                 ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
 
1252                 (void) do_rc_ack(qp, aeth, psn,
 
1253                                  OP(RDMA_READ_RESPONSE_LAST), 0);
 
             /* Exit path that only releases s_lock (its label is not shown). */
1258         spin_unlock_irqrestore(&qp->s_lock, flags);
 
             /*
              * Error exits (labels not shown): one of two statuses is chosen,
              * the work completion is filled in from wqe and the QP address
              * attributes, ipath_sqerror_qp() is called and s_lock is released.
              */
1262         wc.status = IB_WC_LOC_QP_OP_ERR;
 
1266         wc.status = IB_WC_LOC_LEN_ERR;
 
1268         wc.wr_id = wqe->wr.wr_id;
 
1269         wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
 
1274         wc.src_qp = qp->remote_qpn;
 
1277         wc.slid = qp->remote_ah_attr.dlid;
 
1278         wc.sl = qp->remote_ah_attr.sl;
 
1279         wc.dlid_path_bits = 0;
 
1281         ipath_sqerror_qp(qp, &wc);
 
1282         spin_unlock_irqrestore(&qp->s_lock, flags);
 
/*
 * Handles an RC packet whose PSN differs from the expected one: it either
 * records a PSN-error NAK or looks the duplicate up in the ack queue so
 * an earlier response can be sent again.
 * NOTE(review): the embedded line numbers in this listing skip (for
 * example 1306 and 1308-1312), so part of the parameter list, several
 * local declarations, the labels, the return statements and most branch
 * targets are not shown.
 */
1288  * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
 
1289  * @dev: the device this packet came in on
 
1290  * @ohdr: the other headers for this packet
 
1291  * @data: the packet data
 
1292  * @qp: the QP for this packet
 
1293  * @opcode: the opcode for this packet
 
1294  * @psn: the packet sequence number for this packet
 
1295  * @diff: the difference between the PSN and the expected PSN
 
1296  * @header_in_data: true if part of the header data is in the data buffer
 
1298  * This is called from ipath_rc_rcv() to process an unexpected
 
1299  * incoming RC packet for the given QP.
 
1300  * Called at interrupt level.
 
1301  * Return 1 if no more processing is needed; otherwise return 0 to
 
1302  * schedule a response to be sent.
 
1304 static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
 
1305                                      struct ipath_other_headers *ohdr,
 
1307                                      struct ipath_qp *qp,
 
1313         struct ipath_ack_entry *e;
 
1316         unsigned long flags;
 
1320                  * Packet sequence error.
 
1321                  * A NAK will ACK earlier sends and RDMA writes.
 
1322                  * Don't queue the NAK if we already sent one.
 
                     /* r_nak_state and r_ack_psn are only written when r_nak_state is currently zero. */
1324                 if (!qp->r_nak_state) {
 
1325                         qp->r_nak_state = IB_NAK_PSN_ERROR;
 
1326                         /* Use the expected PSN. */
 
1327                         qp->r_ack_psn = qp->r_psn;
 
1334          * Handle a duplicate request.  Don't re-execute SEND, RDMA
 
1335          * write or atomic op.  Don't NAK errors, just silently drop
 
1336          * the duplicate request.  Note that r_sge, r_len, and
 
1337          * r_rcv_len may be in use so don't modify them.
 
1339          * We are supposed to ACK the earliest duplicate PSN but we
 
1340          * can coalesce an outstanding duplicate ACK.  We have to
 
1341          * send the earliest so that RDMA reads can be restarted at
 
1342          * the requester's expected PSN.
 
1344          * First, find where this duplicate PSN falls within the
 
1345          * ACKs previously sent.
 
             /* Keep only the PSN bits selected by IPATH_PSN_MASK. */
1347         psn &= IPATH_PSN_MASK;
 
             /* The ack queue search below runs under qp->s_lock. */
1350         spin_lock_irqsave(&qp->s_lock, flags);
 
             /*
              * Walk the ack queue starting at r_head_ack_queue, stepping i to
              * prev each iteration.  How prev is derived is only partly shown;
              * NOTE(review): the visible assignment to IPATH_MAX_RDMA_ATOMIC is
              * presumably the wrap-around of a decrement; TODO confirm.
              */
1351         for (i = qp->r_head_ack_queue; ; i = prev) {
 
1352                 if (i == qp->s_tail_ack_queue)
 
1357                         prev = IPATH_MAX_RDMA_ATOMIC;
 
1358                 if (prev == qp->r_head_ack_queue) {
 
1362                 e = &qp->s_ack_queue[prev];
 
                     /* Entry prev is a candidate when psn compares at or after its PSN. */
1367                 if (ipath_cmp24(psn, e->psn) >= 0) {
 
1368                         if (prev == qp->s_tail_ack_queue)
 
1374         case OP(RDMA_READ_REQUEST): {
 
1375                 struct ib_reth *reth;
 
1380                  * If we didn't find the RDMA read request in the ack queue,
 
1381                  * or the send tasklet is already backed up to send an
 
1382                  * earlier entry, we can ignore this request.
 
1384                 if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
 
1386                 /* RETH comes after BTH */
 
1387                 if (!header_in_data)
 
1388                         reth = &ohdr->u.rc.reth;
 
1390                         reth = (struct ib_reth *)data;
 
1391                         data += sizeof(*reth);
 
1394                  * Address range must be a subset of the original
 
1395                  * request and start on pmtu boundaries.
 
1396                  * We reuse the old ack_queue slot since the requester
 
1397                  * should not back up and request an earlier PSN for the
 
                     /*
                      * offset is the masked PSN distance from the start of the
                      * saved request multiplied by the path MTU value.
                      */
1400                 offset = ((psn - e->psn) & IPATH_PSN_MASK) *
 
1401                         ib_mtu_enum_to_int(qp->path_mtu);
 
1402                 len = be32_to_cpu(reth->length);
 
                     /*
                      * NOTE(review): len comes from the packet and the types of
                      * offset and len are declared on lines not shown; confirm
                      * that offset + len cannot wrap before this comparison.
                      */
1403                 if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
 
1406                         u32 rkey = be32_to_cpu(reth->rkey);
 
1407                         u64 vaddr = be64_to_cpu(reth->vaddr);
 
1410                         ok = ipath_rkey_ok(qp, &e->rdma_sge,
 
1412                                            IB_ACCESS_REMOTE_READ);
 
                             /*
                              * Reset every field of e->rdma_sge.  The condition
                              * guarding this reset is not shown in this listing.
                              */
1416                         e->rdma_sge.sg_list = NULL;
 
1417                         e->rdma_sge.num_sge = 0;
 
1418                         e->rdma_sge.sge.mr = NULL;
 
1419                         e->rdma_sge.sge.vaddr = NULL;
 
1420                         e->rdma_sge.sge.length = 0;
 
1421                         e->rdma_sge.sge.sge_length = 0;
 
                     /* Point s_tail_ack_queue at the matching entry and reset s_ack_state. */
1424                 qp->s_ack_state = OP(ACKNOWLEDGE);
 
1425                 qp->s_tail_ack_queue = prev;
 
1429         case OP(COMPARE_SWAP):
 
1430         case OP(FETCH_ADD): {
 
1432                  * If we didn't find the atomic request in the ack queue
 
1433                  * or the send tasklet is already backed up to send an
 
1434                  * earlier entry, we can ignore this request.
 
                     /* The saved entry must hold the same opcode as this duplicate. */
1436                 if (!e || e->opcode != (u8) opcode || old_req)
 
1438                 qp->s_ack_state = OP(ACKNOWLEDGE);
 
1439                 qp->s_tail_ack_queue = prev;
 
1447                  * Resend the most recent ACK if this request is
 
1448                  * after all the previous RDMA reads and atomics.
 
                     /*
                      * In this branch s_lock is released first; r_nak_state and
                      * r_ack_psn are then written without it.
                      */
1450                 if (i == qp->r_head_ack_queue) {
 
1451                         spin_unlock_irqrestore(&qp->s_lock, flags);
 
1452                         qp->r_nak_state = 0;
 
1453                         qp->r_ack_psn = qp->r_psn - 1;
 
1457                  * Resend the RDMA read or atomic op which
 
1458                  * ACKs this duplicate request.
 
1460                 qp->s_ack_state = OP(ACKNOWLEDGE);
 
1461                 qp->s_tail_ack_queue = i;
 
             /* Clear the NAK state and schedule the tasklet qp->s_task. */
1464         qp->r_nak_state = 0;
 
1465         tasklet_hi_schedule(&qp->s_task);
 
             /* Release s_lock (the label for this exit is not shown). */
1468         spin_unlock_irqrestore(&qp->s_lock, flags);
 
/*
 * ipath_rc_error - mark the QP as being in the error state
 * @qp: the QP to update
 * @err: the completion status passed on to ipath_error_qp()
 *
 * Sets qp->state to IB_QPS_ERR and calls ipath_error_qp() with @err,
 * both while holding qp->s_lock.
 * NOTE(review): the embedded line numbers skip (1477, 1479, 1484), so
 * the braces of this function are not shown in this listing.
 */
1476 static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
 
1478         unsigned long flags;
 
             /* Take qp->s_lock, saving the interrupt state in flags. */
1480         spin_lock_irqsave(&qp->s_lock, flags);
 
             /* The state change and the ipath_error_qp() call happen under the same lock hold. */
1481         qp->state = IB_QPS_ERR;
 
1482         ipath_error_qp(qp, err);
 
1483         spin_unlock_irqrestore(&qp->s_lock, flags);
 
/*
 * ipath_update_ack_queue - move the ack queue tail off entry n
 * @qp: the QP whose ack queue is updated
 * @n: the ack queue index to compare against s_tail_ack_queue
 *
 * If @n is the current s_tail_ack_queue, the tail is set to next and
 * s_ack_state is reset to OP(ACKNOWLEDGE); otherwise nothing changes.
 * NOTE(review): the declaration and assignment of next, and the body of
 * the range test on it, are on lines not shown in this listing;
 * presumably next is n + 1 wrapped to the start of the queue, TODO
 * confirm.
 */
1486 static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
 
1488         unsigned long flags;
 
             /* Range test on next; the statement it guards is not shown. */
1492         if (next > IPATH_MAX_RDMA_ATOMIC)
 
             /* The compare and update of s_tail_ack_queue are done under qp->s_lock. */
1494         spin_lock_irqsave(&qp->s_lock, flags);
 
1495         if (n == qp->s_tail_ack_queue) {
 
1496                 qp->s_tail_ack_queue = next;
 
1497                 qp->s_ack_state = OP(ACKNOWLEDGE);
 
1499         spin_unlock_irqrestore(&qp->s_lock, flags);
 
1503  * ipath_rc_rcv - process an incoming RC packet
 
1504  * @dev: the device this packet came in on
 
1505  * @hdr: the header of this packet
 
1506  * @has_grh: true if the header has a GRH
 
1507  * @data: the packet data
 
1508  * @tlen: the packet length
 
1509  * @qp: the QP for this packet
 
1511  * This is called from ipath_qp_rcv() to process an incoming RC packet
 
1513  * Called at interrupt level.
 
1515 void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 
1516                   int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
 
1518         struct ipath_other_headers *ohdr;
 
1524         u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
 
1526         struct ib_reth *reth;
 
1529         /* Validate the SLID. See Ch. 9.6.1.5 */
 
1530         if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
 
1536                 hdrsize = 8 + 12;       /* LRH + BTH */
 
1537                 psn = be32_to_cpu(ohdr->bth[2]);
 
1540                 ohdr = &hdr->u.l.oth;
 
1541                 hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
 
1543                  * The header with GRH is 60 bytes and the core driver sets
 
1544                  * the eager header buffer size to 56 bytes so the last 4
 
1545                  * bytes of the BTH header (PSN) is in the data buffer.
 
1547                 header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
 
1548                 if (header_in_data) {
 
1549                         psn = be32_to_cpu(((__be32 *) data)[0]);
 
1550                         data += sizeof(__be32);
 
1552                         psn = be32_to_cpu(ohdr->bth[2]);
 
1556          * Process responses (ACKs) before anything else.  Note that the
 
1557          * packet sequence number will be for something in the send work
 
1558          * queue rather than the expected receive packet sequence number.
 
1559          * In other words, this QP is the requester.
 
1561         opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
 
1562         if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
 
1563             opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
 
1564                 ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
 
1565                                   hdrsize, pmtu, header_in_data);
 
1569         /* Compute 24 bits worth of difference. */
 
1570         diff = ipath_cmp24(psn, qp->r_psn);
 
1571         if (unlikely(diff)) {
 
1572                 if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
 
1573                                        psn, diff, header_in_data))
 
1578         /* Check for opcode sequence errors. */
 
1579         switch (qp->r_state) {
 
1580         case OP(SEND_FIRST):
 
1581         case OP(SEND_MIDDLE):
 
1582                 if (opcode == OP(SEND_MIDDLE) ||
 
1583                     opcode == OP(SEND_LAST) ||
 
1584                     opcode == OP(SEND_LAST_WITH_IMMEDIATE))
 
1587                 ipath_rc_error(qp, IB_WC_REM_INV_REQ_ERR);
 
1588                 qp->r_nak_state = IB_NAK_INVALID_REQUEST;
 
1589                 qp->r_ack_psn = qp->r_psn;
 
1592         case OP(RDMA_WRITE_FIRST):
 
1593         case OP(RDMA_WRITE_MIDDLE):
 
1594                 if (opcode == OP(RDMA_WRITE_MIDDLE) ||
 
1595                     opcode == OP(RDMA_WRITE_LAST) ||
 
1596                     opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
 
1601                 if (opcode == OP(SEND_MIDDLE) ||
 
1602                     opcode == OP(SEND_LAST) ||
 
1603                     opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
 
1604                     opcode == OP(RDMA_WRITE_MIDDLE) ||
 
1605                     opcode == OP(RDMA_WRITE_LAST) ||
 
1606                     opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
 
1609                  * Note that it is up to the requester to not send a new
 
1610                  * RDMA read or atomic operation before receiving an ACK
 
1611                  * for the previous operation.
 
1619         /* OK, process the packet. */
 
1621         case OP(SEND_FIRST):
 
1622                 if (!ipath_get_rwqe(qp, 0)) {
 
1625                          * A RNR NAK will ACK earlier sends and RDMA writes.
 
1626                          * Don't queue the NAK if a RDMA read or atomic
 
1627                          * is pending though.
 
1629                         if (qp->r_nak_state)
 
1631                         qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
 
1632                         qp->r_ack_psn = qp->r_psn;
 
1637         case OP(SEND_MIDDLE):
 
1638         case OP(RDMA_WRITE_MIDDLE):
 
1640                 /* Check for invalid length PMTU or posted rwqe len. */
 
1641                 if (unlikely(tlen != (hdrsize + pmtu + 4)))
 
1643                 qp->r_rcv_len += pmtu;
 
1644                 if (unlikely(qp->r_rcv_len > qp->r_len))
 
1646                 ipath_copy_sge(&qp->r_sge, data, pmtu);
 
1649         case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
 
1651                 if (!ipath_get_rwqe(qp, 1))
 
1656         case OP(SEND_ONLY_WITH_IMMEDIATE):
 
1657                 if (!ipath_get_rwqe(qp, 0))
 
1660                 if (opcode == OP(SEND_ONLY))
 
1663         case OP(SEND_LAST_WITH_IMMEDIATE):
 
1665                 if (header_in_data) {
 
1666                         wc.imm_data = *(__be32 *) data;
 
1667                         data += sizeof(__be32);
 
1669                         /* Immediate data comes after BTH */
 
1670                         wc.imm_data = ohdr->u.imm_data;
 
1673                 wc.wc_flags = IB_WC_WITH_IMM;
 
1676         case OP(RDMA_WRITE_LAST):
 
1678                 /* Get the number of bytes the message was padded by. */
 
1679                 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
 
1680                 /* Check for invalid length. */
 
1681                 /* XXX LAST len should be >= 1 */
 
1682                 if (unlikely(tlen < (hdrsize + pad + 4)))
 
1684                 /* Don't count the CRC. */
 
1685                 tlen -= (hdrsize + pad + 4);
 
1686                 wc.byte_len = tlen + qp->r_rcv_len;
 
1687                 if (unlikely(wc.byte_len > qp->r_len))
 
1689                 ipath_copy_sge(&qp->r_sge, data, tlen);
 
1691                 if (!qp->r_wrid_valid)
 
1693                 qp->r_wrid_valid = 0;
 
1694                 wc.wr_id = qp->r_wr_id;
 
1695                 wc.status = IB_WC_SUCCESS;
 
1696                 wc.opcode = IB_WC_RECV;
 
1699                 wc.src_qp = qp->remote_qpn;
 
1701                 wc.slid = qp->remote_ah_attr.dlid;
 
1702                 wc.sl = qp->remote_ah_attr.sl;
 
1703                 wc.dlid_path_bits = 0;
 
1705                 /* Signal completion event if the solicited bit is set. */
 
1706                 ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
 
1708                                 __constant_cpu_to_be32(1 << 23)) != 0);
 
1711         case OP(RDMA_WRITE_FIRST):
 
1712         case OP(RDMA_WRITE_ONLY):
 
1713         case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
 
1714                 if (unlikely(!(qp->qp_access_flags &
 
1715                                IB_ACCESS_REMOTE_WRITE)))
 
1718                 /* RETH comes after BTH */
 
1719                 if (!header_in_data)
 
1720                         reth = &ohdr->u.rc.reth;
 
1722                         reth = (struct ib_reth *)data;
 
1723                         data += sizeof(*reth);
 
1725                 hdrsize += sizeof(*reth);
 
1726                 qp->r_len = be32_to_cpu(reth->length);
 
1728                 if (qp->r_len != 0) {
 
1729                         u32 rkey = be32_to_cpu(reth->rkey);
 
1730                         u64 vaddr = be64_to_cpu(reth->vaddr);
 
1733                         /* Check rkey & NAK */
 
1734                         ok = ipath_rkey_ok(qp, &qp->r_sge,
 
1735                                            qp->r_len, vaddr, rkey,
 
1736                                            IB_ACCESS_REMOTE_WRITE);
 
1740                         qp->r_sge.sg_list = NULL;
 
1741                         qp->r_sge.sge.mr = NULL;
 
1742                         qp->r_sge.sge.vaddr = NULL;
 
1743                         qp->r_sge.sge.length = 0;
 
1744                         qp->r_sge.sge.sge_length = 0;
 
1746                 if (opcode == OP(RDMA_WRITE_FIRST))
 
1748                 else if (opcode == OP(RDMA_WRITE_ONLY))
 
1750                 if (!ipath_get_rwqe(qp, 1))
 
1754         case OP(RDMA_READ_REQUEST): {
 
1755                 struct ipath_ack_entry *e;
 
1759                 if (unlikely(!(qp->qp_access_flags &
 
1760                                IB_ACCESS_REMOTE_READ)))
 
1762                 next = qp->r_head_ack_queue + 1;
 
1763                 if (next > IPATH_MAX_RDMA_ATOMIC)
 
1765                 if (unlikely(next == qp->s_tail_ack_queue)) {
 
1766                         if (!qp->s_ack_queue[next].sent)
 
1768                         ipath_update_ack_queue(qp, next);
 
1770                 e = &qp->s_ack_queue[qp->r_head_ack_queue];
 
1771                 /* RETH comes after BTH */
 
1772                 if (!header_in_data)
 
1773                         reth = &ohdr->u.rc.reth;
 
1775                         reth = (struct ib_reth *)data;
 
1776                         data += sizeof(*reth);
 
1778                 len = be32_to_cpu(reth->length);
 
1780                         u32 rkey = be32_to_cpu(reth->rkey);
 
1781                         u64 vaddr = be64_to_cpu(reth->vaddr);
 
1784                         /* Check rkey & NAK */
 
1785                         ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
 
1786                                            rkey, IB_ACCESS_REMOTE_READ);
 
1790                          * Update the next expected PSN.  We add 1 later
 
1791                          * below, so only add the remainder here.
 
1794                                 qp->r_psn += (len - 1) / pmtu;
 
1796                         e->rdma_sge.sg_list = NULL;
 
1797                         e->rdma_sge.num_sge = 0;
 
1798                         e->rdma_sge.sge.mr = NULL;
 
1799                         e->rdma_sge.sge.vaddr = NULL;
 
1800                         e->rdma_sge.sge.length = 0;
 
1801                         e->rdma_sge.sge.sge_length = 0;
 
1807                  * We need to increment the MSN here instead of when we
 
1808                  * finish sending the result since a duplicate request would
 
1809                  * increment it more than once.
 
1813                 qp->r_state = opcode;
 
1814                 qp->r_nak_state = 0;
 
1816                 qp->r_head_ack_queue = next;
 
1818                 /* Call ipath_do_rc_send() in another thread. */
 
1819                 tasklet_hi_schedule(&qp->s_task);
 
1824         case OP(COMPARE_SWAP):
 
1825         case OP(FETCH_ADD): {
 
1826                 struct ib_atomic_eth *ateth;
 
1827                 struct ipath_ack_entry *e;
 
1834                 if (unlikely(!(qp->qp_access_flags &
 
1835                                IB_ACCESS_REMOTE_ATOMIC)))
 
1837                 next = qp->r_head_ack_queue + 1;
 
1838                 if (next > IPATH_MAX_RDMA_ATOMIC)
 
1840                 if (unlikely(next == qp->s_tail_ack_queue)) {
 
1841                         if (!qp->s_ack_queue[next].sent)
 
1843                         ipath_update_ack_queue(qp, next);
 
1845                 if (!header_in_data)
 
1846                         ateth = &ohdr->u.atomic_eth;
 
1848                         ateth = (struct ib_atomic_eth *)data;
 
1849                 vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
 
1850                         be32_to_cpu(ateth->vaddr[1]);
 
1851                 if (unlikely(vaddr & (sizeof(u64) - 1)))
 
1853                 rkey = be32_to_cpu(ateth->rkey);
 
1854                 /* Check rkey & NAK */
 
1855                 if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
 
1856                                             sizeof(u64), vaddr, rkey,
 
1857                                             IB_ACCESS_REMOTE_ATOMIC)))
 
1859                 /* Perform atomic OP and save result. */
 
1860                 maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
 
1861                 sdata = be64_to_cpu(ateth->swap_data);
 
1862                 e = &qp->s_ack_queue[qp->r_head_ack_queue];
 
1863                 e->atomic_data = (opcode == OP(FETCH_ADD)) ?
 
1864                         (u64) atomic64_add_return(sdata, maddr) - sdata :
 
1865                         (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
 
1866                                       be64_to_cpu(ateth->compare_data),
 
1870                 e->psn = psn & IPATH_PSN_MASK;
 
1873                 qp->r_state = opcode;
 
1874                 qp->r_nak_state = 0;
 
1876                 qp->r_head_ack_queue = next;
 
1878                 /* Call ipath_do_rc_send() in another thread. */
 
1879                 tasklet_hi_schedule(&qp->s_task);
 
1885                 /* NAK unknown opcodes. */
 
1889         qp->r_state = opcode;
 
1890         qp->r_ack_psn = psn;
 
1891         qp->r_nak_state = 0;
 
1892         /* Send an ACK if requested or required. */
 
1893         if (psn & (1 << 31))
 
1898         ipath_rc_error(qp, IB_WC_REM_ACCESS_ERR);
 
1899         qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
 
1900         qp->r_ack_psn = qp->r_psn;