[PATCH] KVM: MMU: Support emulated writes into RAM
[linux-2.6] / drivers / infiniband / hw / ipath / ipath_verbs.c
1 /*
2  * Copyright (c) 2006 QLogic, Inc. All rights reserved.
3  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include <rdma/ib_mad.h>
35 #include <rdma/ib_user_verbs.h>
36 #include <linux/io.h>
37 #include <linux/utsname.h>
38
39 #include "ipath_kernel.h"
40 #include "ipath_verbs.h"
41 #include "ipath_common.h"
42
43 static unsigned int ib_ipath_qp_table_size = 251;
44 module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO);
45 MODULE_PARM_DESC(qp_table_size, "QP table size");
46
47 unsigned int ib_ipath_lkey_table_size = 12;
48 module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint,
49                    S_IRUGO);
50 MODULE_PARM_DESC(lkey_table_size,
51                  "LKEY table size in bits (2^n, 1 <= n <= 23)");
52
53 static unsigned int ib_ipath_max_pds = 0xFFFF;
54 module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO);
55 MODULE_PARM_DESC(max_pds,
56                  "Maximum number of protection domains to support");
57
58 static unsigned int ib_ipath_max_ahs = 0xFFFF;
59 module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO);
60 MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
61
62 unsigned int ib_ipath_max_cqes = 0x2FFFF;
63 module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO);
64 MODULE_PARM_DESC(max_cqes,
65                  "Maximum number of completion queue entries to support");
66
67 unsigned int ib_ipath_max_cqs = 0x1FFFF;
68 module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO);
69 MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
70
71 unsigned int ib_ipath_max_qp_wrs = 0x3FFF;
72 module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint,
73                    S_IWUSR | S_IRUGO);
74 MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
75
76 unsigned int ib_ipath_max_qps = 16384;
77 module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO);
78 MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
79
80 unsigned int ib_ipath_max_sges = 0x60;
81 module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO);
82 MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
83
84 unsigned int ib_ipath_max_mcast_grps = 16384;
85 module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint,
86                    S_IWUSR | S_IRUGO);
87 MODULE_PARM_DESC(max_mcast_grps,
88                  "Maximum number of multicast groups to support");
89
90 unsigned int ib_ipath_max_mcast_qp_attached = 16;
91 module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached,
92                    uint, S_IWUSR | S_IRUGO);
93 MODULE_PARM_DESC(max_mcast_qp_attached,
94                  "Maximum number of attached QPs to support");
95
96 unsigned int ib_ipath_max_srqs = 1024;
97 module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO);
98 MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
99
100 unsigned int ib_ipath_max_srq_sges = 128;
101 module_param_named(max_srq_sges, ib_ipath_max_srq_sges,
102                    uint, S_IWUSR | S_IRUGO);
103 MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
104
105 unsigned int ib_ipath_max_srq_wrs = 0x1FFFF;
106 module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs,
107                    uint, S_IWUSR | S_IRUGO);
108 MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
109
110 static unsigned int ib_ipath_disable_sma;
111 module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
112 MODULE_PARM_DESC(ib_ipath_disable_sma, "Disable the SMA");
113
114 const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
115         [IB_QPS_RESET] = 0,
116         [IB_QPS_INIT] = IPATH_POST_RECV_OK,
117         [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
118         [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
119             IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
120         [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
121             IPATH_POST_SEND_OK,
122         [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
123         [IB_QPS_ERR] = 0,
124 };
125
126 struct ipath_ucontext {
127         struct ib_ucontext ibucontext;
128 };
129
130 static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext
131                                                   *ibucontext)
132 {
133         return container_of(ibucontext, struct ipath_ucontext, ibucontext);
134 }
135
136 /*
137  * Translate ib_wr_opcode into ib_wc_opcode.
138  */
139 const enum ib_wc_opcode ib_ipath_wc_opcode[] = {
140         [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
141         [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
142         [IB_WR_SEND] = IB_WC_SEND,
143         [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
144         [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
145         [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
146         [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
147 };
148
149 /*
150  * System image GUID.
151  */
152 static __be64 sys_image_guid;
153
154 /**
155  * ipath_copy_sge - copy data to SGE memory
156  * @ss: the SGE state
157  * @data: the data to copy
158  * @length: the length of the data
159  */
160 void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
161 {
162         struct ipath_sge *sge = &ss->sge;
163
164         while (length) {
165                 u32 len = sge->length;
166
167                 BUG_ON(len == 0);
168                 if (len > length)
169                         len = length;
170                 memcpy(sge->vaddr, data, len);
171                 sge->vaddr += len;
172                 sge->length -= len;
173                 sge->sge_length -= len;
174                 if (sge->sge_length == 0) {
175                         if (--ss->num_sge)
176                                 *sge = *ss->sg_list++;
177                 } else if (sge->length == 0 && sge->mr != NULL) {
178                         if (++sge->n >= IPATH_SEGSZ) {
179                                 if (++sge->m >= sge->mr->mapsz)
180                                         break;
181                                 sge->n = 0;
182                         }
183                         sge->vaddr =
184                                 sge->mr->map[sge->m]->segs[sge->n].vaddr;
185                         sge->length =
186                                 sge->mr->map[sge->m]->segs[sge->n].length;
187                 }
188                 data += len;
189                 length -= len;
190         }
191 }
192
193 /**
194  * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func
195  * @ss: the SGE state
196  * @length: the number of bytes to skip
197  */
198 void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
199 {
200         struct ipath_sge *sge = &ss->sge;
201
202         while (length) {
203                 u32 len = sge->length;
204
205                 BUG_ON(len == 0);
206                 if (len > length)
207                         len = length;
208                 sge->vaddr += len;
209                 sge->length -= len;
210                 sge->sge_length -= len;
211                 if (sge->sge_length == 0) {
212                         if (--ss->num_sge)
213                                 *sge = *ss->sg_list++;
214                 } else if (sge->length == 0 && sge->mr != NULL) {
215                         if (++sge->n >= IPATH_SEGSZ) {
216                                 if (++sge->m >= sge->mr->mapsz)
217                                         break;
218                                 sge->n = 0;
219                         }
220                         sge->vaddr =
221                                 sge->mr->map[sge->m]->segs[sge->n].vaddr;
222                         sge->length =
223                                 sge->mr->map[sge->m]->segs[sge->n].length;
224                 }
225                 length -= len;
226         }
227 }
228
229 /**
230  * ipath_post_send - post a send on a QP
231  * @ibqp: the QP to post the send on
232  * @wr: the list of work requests to post
233  * @bad_wr: the first bad WR is put here
234  *
235  * This may be called from interrupt context.
236  */
237 static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
238                            struct ib_send_wr **bad_wr)
239 {
240         struct ipath_qp *qp = to_iqp(ibqp);
241         int err = 0;
242
243         /* Check that state is OK to post send. */
244         if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)) {
245                 *bad_wr = wr;
246                 err = -EINVAL;
247                 goto bail;
248         }
249
250         for (; wr; wr = wr->next) {
251                 switch (qp->ibqp.qp_type) {
252                 case IB_QPT_UC:
253                 case IB_QPT_RC:
254                         err = ipath_post_ruc_send(qp, wr);
255                         break;
256
257                 case IB_QPT_SMI:
258                 case IB_QPT_GSI:
259                 case IB_QPT_UD:
260                         err = ipath_post_ud_send(qp, wr);
261                         break;
262
263                 default:
264                         err = -EINVAL;
265                 }
266                 if (err) {
267                         *bad_wr = wr;
268                         break;
269                 }
270         }
271
272 bail:
273         return err;
274 }
275
276 /**
277  * ipath_post_receive - post a receive on a QP
278  * @ibqp: the QP to post the receive on
279  * @wr: the WR to post
280  * @bad_wr: the first bad WR is put here
281  *
282  * This may be called from interrupt context.
283  */
284 static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
285                               struct ib_recv_wr **bad_wr)
286 {
287         struct ipath_qp *qp = to_iqp(ibqp);
288         struct ipath_rwq *wq = qp->r_rq.wq;
289         unsigned long flags;
290         int ret;
291
292         /* Check that state is OK to post receive. */
293         if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) {
294                 *bad_wr = wr;
295                 ret = -EINVAL;
296                 goto bail;
297         }
298
299         for (; wr; wr = wr->next) {
300                 struct ipath_rwqe *wqe;
301                 u32 next;
302                 int i;
303
304                 if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
305                         *bad_wr = wr;
306                         ret = -ENOMEM;
307                         goto bail;
308                 }
309
310                 spin_lock_irqsave(&qp->r_rq.lock, flags);
311                 next = wq->head + 1;
312                 if (next >= qp->r_rq.size)
313                         next = 0;
314                 if (next == wq->tail) {
315                         spin_unlock_irqrestore(&qp->r_rq.lock, flags);
316                         *bad_wr = wr;
317                         ret = -ENOMEM;
318                         goto bail;
319                 }
320
321                 wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
322                 wqe->wr_id = wr->wr_id;
323                 wqe->num_sge = wr->num_sge;
324                 for (i = 0; i < wr->num_sge; i++)
325                         wqe->sg_list[i] = wr->sg_list[i];
326                 wq->head = next;
327                 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
328         }
329         ret = 0;
330
331 bail:
332         return ret;
333 }
334
335 /**
336  * ipath_qp_rcv - processing an incoming packet on a QP
337  * @dev: the device the packet came on
338  * @hdr: the packet header
339  * @has_grh: true if the packet has a GRH
340  * @data: the packet data
341  * @tlen: the packet length
342  * @qp: the QP the packet came on
343  *
344  * This is called from ipath_ib_rcv() to process an incoming packet
345  * for the given QP.
346  * Called at interrupt level.
347  */
348 static void ipath_qp_rcv(struct ipath_ibdev *dev,
349                          struct ipath_ib_header *hdr, int has_grh,
350                          void *data, u32 tlen, struct ipath_qp *qp)
351 {
352         /* Check for valid receive state. */
353         if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
354                 dev->n_pkt_drops++;
355                 return;
356         }
357
358         switch (qp->ibqp.qp_type) {
359         case IB_QPT_SMI:
360         case IB_QPT_GSI:
361                 if (ib_ipath_disable_sma)
362                         break;
363                 /* FALLTHROUGH */
364         case IB_QPT_UD:
365                 ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp);
366                 break;
367
368         case IB_QPT_RC:
369                 ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp);
370                 break;
371
372         case IB_QPT_UC:
373                 ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp);
374                 break;
375
376         default:
377                 break;
378         }
379 }
380
381 /**
382  * ipath_ib_rcv - process an incoming packet
383  * @arg: the device pointer
384  * @rhdr: the header of the packet
385  * @data: the packet data
386  * @tlen: the packet length
387  *
388  * This is called from ipath_kreceive() to process an incoming packet at
389  * interrupt level. Tlen is the length of the header + data + CRC in bytes.
390  */
391 void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data,
392                   u32 tlen)
393 {
394         struct ipath_ib_header *hdr = rhdr;
395         struct ipath_other_headers *ohdr;
396         struct ipath_qp *qp;
397         u32 qp_num;
398         int lnh;
399         u8 opcode;
400         u16 lid;
401
402         if (unlikely(dev == NULL))
403                 goto bail;
404
405         if (unlikely(tlen < 24)) {      /* LRH+BTH+CRC */
406                 dev->rcv_errors++;
407                 goto bail;
408         }
409
410         /* Check for a valid destination LID (see ch. 7.11.1). */
411         lid = be16_to_cpu(hdr->lrh[1]);
412         if (lid < IPATH_MULTICAST_LID_BASE) {
413                 lid &= ~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1);
414                 if (unlikely(lid != dev->dd->ipath_lid)) {
415                         dev->rcv_errors++;
416                         goto bail;
417                 }
418         }
419
420         /* Check for GRH */
421         lnh = be16_to_cpu(hdr->lrh[0]) & 3;
422         if (lnh == IPATH_LRH_BTH)
423                 ohdr = &hdr->u.oth;
424         else if (lnh == IPATH_LRH_GRH)
425                 ohdr = &hdr->u.l.oth;
426         else {
427                 dev->rcv_errors++;
428                 goto bail;
429         }
430
431         opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
432         dev->opstats[opcode].n_bytes += tlen;
433         dev->opstats[opcode].n_packets++;
434
435         /* Get the destination QP number. */
436         qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK;
437         if (qp_num == IPATH_MULTICAST_QPN) {
438                 struct ipath_mcast *mcast;
439                 struct ipath_mcast_qp *p;
440
441                 mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
442                 if (mcast == NULL) {
443                         dev->n_pkt_drops++;
444                         goto bail;
445                 }
446                 dev->n_multicast_rcv++;
447                 list_for_each_entry_rcu(p, &mcast->qp_list, list)
448                         ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data,
449                                      tlen, p->qp);
450                 /*
451                  * Notify ipath_multicast_detach() if it is waiting for us
452                  * to finish.
453                  */
454                 if (atomic_dec_return(&mcast->refcount) <= 1)
455                         wake_up(&mcast->wait);
456         } else {
457                 qp = ipath_lookup_qpn(&dev->qp_table, qp_num);
458                 if (qp) {
459                         dev->n_unicast_rcv++;
460                         ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data,
461                                      tlen, qp);
462                         /*
463                          * Notify ipath_destroy_qp() if it is waiting
464                          * for us to finish.
465                          */
466                         if (atomic_dec_and_test(&qp->refcount))
467                                 wake_up(&qp->wait);
468                 } else
469                         dev->n_pkt_drops++;
470         }
471
472 bail:;
473 }
474
475 /**
476  * ipath_ib_timer - verbs timer
477  * @arg: the device pointer
478  *
479  * This is called from ipath_do_rcv_timer() at interrupt level to check for
480  * QPs which need retransmits and to collect performance numbers.
481  */
482 void ipath_ib_timer(struct ipath_ibdev *dev)
483 {
484         struct ipath_qp *resend = NULL;
485         struct list_head *last;
486         struct ipath_qp *qp;
487         unsigned long flags;
488
489         if (dev == NULL)
490                 return;
491
492         spin_lock_irqsave(&dev->pending_lock, flags);
493         /* Start filling the next pending queue. */
494         if (++dev->pending_index >= ARRAY_SIZE(dev->pending))
495                 dev->pending_index = 0;
496         /* Save any requests still in the new queue, they have timed out. */
497         last = &dev->pending[dev->pending_index];
498         while (!list_empty(last)) {
499                 qp = list_entry(last->next, struct ipath_qp, timerwait);
500                 list_del_init(&qp->timerwait);
501                 qp->timer_next = resend;
502                 resend = qp;
503                 atomic_inc(&qp->refcount);
504         }
505         last = &dev->rnrwait;
506         if (!list_empty(last)) {
507                 qp = list_entry(last->next, struct ipath_qp, timerwait);
508                 if (--qp->s_rnr_timeout == 0) {
509                         do {
510                                 list_del_init(&qp->timerwait);
511                                 tasklet_hi_schedule(&qp->s_task);
512                                 if (list_empty(last))
513                                         break;
514                                 qp = list_entry(last->next, struct ipath_qp,
515                                                 timerwait);
516                         } while (qp->s_rnr_timeout == 0);
517                 }
518         }
519         /*
520          * We should only be in the started state if pma_sample_start != 0
521          */
522         if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED &&
523             --dev->pma_sample_start == 0) {
524                 dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
525                 ipath_snapshot_counters(dev->dd, &dev->ipath_sword,
526                                         &dev->ipath_rword,
527                                         &dev->ipath_spkts,
528                                         &dev->ipath_rpkts,
529                                         &dev->ipath_xmit_wait);
530         }
531         if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
532                 if (dev->pma_sample_interval == 0) {
533                         u64 ta, tb, tc, td, te;
534
535                         dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
536                         ipath_snapshot_counters(dev->dd, &ta, &tb,
537                                                 &tc, &td, &te);
538
539                         dev->ipath_sword = ta - dev->ipath_sword;
540                         dev->ipath_rword = tb - dev->ipath_rword;
541                         dev->ipath_spkts = tc - dev->ipath_spkts;
542                         dev->ipath_rpkts = td - dev->ipath_rpkts;
543                         dev->ipath_xmit_wait = te - dev->ipath_xmit_wait;
544                 }
545                 else
546                         dev->pma_sample_interval--;
547         }
548         spin_unlock_irqrestore(&dev->pending_lock, flags);
549
550         /* XXX What if timer fires again while this is running? */
551         for (qp = resend; qp != NULL; qp = qp->timer_next) {
552                 struct ib_wc wc;
553
554                 spin_lock_irqsave(&qp->s_lock, flags);
555                 if (qp->s_last != qp->s_tail && qp->state == IB_QPS_RTS) {
556                         dev->n_timeouts++;
557                         ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
558                 }
559                 spin_unlock_irqrestore(&qp->s_lock, flags);
560
561                 /* Notify ipath_destroy_qp() if it is waiting. */
562                 if (atomic_dec_and_test(&qp->refcount))
563                         wake_up(&qp->wait);
564         }
565 }
566
567 static void update_sge(struct ipath_sge_state *ss, u32 length)
568 {
569         struct ipath_sge *sge = &ss->sge;
570
571         sge->vaddr += length;
572         sge->length -= length;
573         sge->sge_length -= length;
574         if (sge->sge_length == 0) {
575                 if (--ss->num_sge)
576                         *sge = *ss->sg_list++;
577         } else if (sge->length == 0 && sge->mr != NULL) {
578                 if (++sge->n >= IPATH_SEGSZ) {
579                         if (++sge->m >= sge->mr->mapsz)
580                                 return;
581                         sge->n = 0;
582                 }
583                 sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
584                 sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
585         }
586 }
587
588 #ifdef __LITTLE_ENDIAN
589 static inline u32 get_upper_bits(u32 data, u32 shift)
590 {
591         return data >> shift;
592 }
593
594 static inline u32 set_upper_bits(u32 data, u32 shift)
595 {
596         return data << shift;
597 }
598
599 static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
600 {
601         data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
602         data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
603         return data;
604 }
605 #else
606 static inline u32 get_upper_bits(u32 data, u32 shift)
607 {
608         return data << shift;
609 }
610
611 static inline u32 set_upper_bits(u32 data, u32 shift)
612 {
613         return data >> shift;
614 }
615
616 static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
617 {
618         data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
619         data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
620         return data;
621 }
622 #endif
623
624 static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
625                     u32 length)
626 {
627         u32 extra = 0;
628         u32 data = 0;
629         u32 last;
630
631         while (1) {
632                 u32 len = ss->sge.length;
633                 u32 off;
634
635                 BUG_ON(len == 0);
636                 if (len > length)
637                         len = length;
638                 if (len > ss->sge.sge_length)
639                         len = ss->sge.sge_length;
640                 /* If the source address is not aligned, try to align it. */
641                 off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
642                 if (off) {
643                         u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
644                                             ~(sizeof(u32) - 1));
645                         u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
646                         u32 y;
647
648                         y = sizeof(u32) - off;
649                         if (len > y)
650                                 len = y;
651                         if (len + extra >= sizeof(u32)) {
652                                 data |= set_upper_bits(v, extra *
653                                                        BITS_PER_BYTE);
654                                 len = sizeof(u32) - extra;
655                                 if (len == length) {
656                                         last = data;
657                                         break;
658                                 }
659                                 __raw_writel(data, piobuf);
660                                 piobuf++;
661                                 extra = 0;
662                                 data = 0;
663                         } else {
664                                 /* Clear unused upper bytes */
665                                 data |= clear_upper_bytes(v, len, extra);
666                                 if (len == length) {
667                                         last = data;
668                                         break;
669                                 }
670                                 extra += len;
671                         }
672                 } else if (extra) {
673                         /* Source address is aligned. */
674                         u32 *addr = (u32 *) ss->sge.vaddr;
675                         int shift = extra * BITS_PER_BYTE;
676                         int ushift = 32 - shift;
677                         u32 l = len;
678
679                         while (l >= sizeof(u32)) {
680                                 u32 v = *addr;
681
682                                 data |= set_upper_bits(v, shift);
683                                 __raw_writel(data, piobuf);
684                                 data = get_upper_bits(v, ushift);
685                                 piobuf++;
686                                 addr++;
687                                 l -= sizeof(u32);
688                         }
689                         /*
690                          * We still have 'extra' number of bytes leftover.
691                          */
692                         if (l) {
693                                 u32 v = *addr;
694
695                                 if (l + extra >= sizeof(u32)) {
696                                         data |= set_upper_bits(v, shift);
697                                         len -= l + extra - sizeof(u32);
698                                         if (len == length) {
699                                                 last = data;
700                                                 break;
701                                         }
702                                         __raw_writel(data, piobuf);
703                                         piobuf++;
704                                         extra = 0;
705                                         data = 0;
706                                 } else {
707                                         /* Clear unused upper bytes */
708                                         data |= clear_upper_bytes(v, l,
709                                                                   extra);
710                                         if (len == length) {
711                                                 last = data;
712                                                 break;
713                                         }
714                                         extra += l;
715                                 }
716                         } else if (len == length) {
717                                 last = data;
718                                 break;
719                         }
720                 } else if (len == length) {
721                         u32 w;
722
723                         /*
724                          * Need to round up for the last dword in the
725                          * packet.
726                          */
727                         w = (len + 3) >> 2;
728                         __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1);
729                         piobuf += w - 1;
730                         last = ((u32 *) ss->sge.vaddr)[w - 1];
731                         break;
732                 } else {
733                         u32 w = len >> 2;
734
735                         __iowrite32_copy(piobuf, ss->sge.vaddr, w);
736                         piobuf += w;
737
738                         extra = len & (sizeof(u32) - 1);
739                         if (extra) {
740                                 u32 v = ((u32 *) ss->sge.vaddr)[w];
741
742                                 /* Clear unused upper bytes */
743                                 data = clear_upper_bytes(v, extra, 0);
744                         }
745                 }
746                 update_sge(ss, len);
747                 length -= len;
748         }
749         /* Update address before sending packet. */
750         update_sge(ss, length);
751         /* must flush early everything before trigger word */
752         ipath_flush_wc();
753         __raw_writel(last, piobuf);
754         /* be sure trigger word is written */
755         ipath_flush_wc();
756 }
757
758 /**
759  * ipath_verbs_send - send a packet
760  * @dd: the infinipath device
761  * @hdrwords: the number of words in the header
762  * @hdr: the packet header
763  * @len: the length of the packet in bytes
764  * @ss: the SGE to send
765  */
766 int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords,
767                      u32 *hdr, u32 len, struct ipath_sge_state *ss)
768 {
769         u32 __iomem *piobuf;
770         u32 plen;
771         int ret;
772
773         /* +1 is for the qword padding of pbc */
774         plen = hdrwords + ((len + 3) >> 2) + 1;
775         if (unlikely((plen << 2) > dd->ipath_ibmaxlen)) {
776                 ipath_dbg("packet len 0x%x too long, failing\n", plen);
777                 ret = -EINVAL;
778                 goto bail;
779         }
780
781         /* Get a PIO buffer to use. */
782         piobuf = ipath_getpiobuf(dd, NULL);
783         if (unlikely(piobuf == NULL)) {
784                 ret = -EBUSY;
785                 goto bail;
786         }
787
788         /*
789          * Write len to control qword, no flags.
790          * We have to flush after the PBC for correctness on some cpus
791          * or WC buffer can be written out of order.
792          */
793         writeq(plen, piobuf);
794         ipath_flush_wc();
795         piobuf += 2;
796         if (len == 0) {
797                 /*
798                  * If there is just the header portion, must flush before
799                  * writing last word of header for correctness, and after
800                  * the last header word (trigger word).
801                  */
802                 __iowrite32_copy(piobuf, hdr, hdrwords - 1);
803                 ipath_flush_wc();
804                 __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
805                 ipath_flush_wc();
806                 ret = 0;
807                 goto bail;
808         }
809
810         __iowrite32_copy(piobuf, hdr, hdrwords);
811         piobuf += hdrwords;
812
813         /* The common case is aligned and contained in one segment. */
814         if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
815                    !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
816                 u32 w;
817                 u32 *addr = (u32 *) ss->sge.vaddr;
818
819                 /* Update address before sending packet. */
820                 update_sge(ss, len);
821                 /* Need to round up for the last dword in the packet. */
822                 w = (len + 3) >> 2;
823                 __iowrite32_copy(piobuf, addr, w - 1);
824                 /* must flush early everything before trigger word */
825                 ipath_flush_wc();
826                 __raw_writel(addr[w - 1], piobuf + w - 1);
827                 /* be sure trigger word is written */
828                 ipath_flush_wc();
829                 ret = 0;
830                 goto bail;
831         }
832         copy_io(piobuf, ss, len);
833         ret = 0;
834
835 bail:
836         return ret;
837 }
838
839 int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
840                             u64 *rwords, u64 *spkts, u64 *rpkts,
841                             u64 *xmit_wait)
842 {
843         int ret;
844
845         if (!(dd->ipath_flags & IPATH_INITTED)) {
846                 /* no hardware, freeze, etc. */
847                 ipath_dbg("unit %u not usable\n", dd->ipath_unit);
848                 ret = -EINVAL;
849                 goto bail;
850         }
851         *swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
852         *rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
853         *spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
854         *rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
855         *xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt);
856
857         ret = 0;
858
859 bail:
860         return ret;
861 }
862
863 /**
864  * ipath_get_counters - get various chip counters
865  * @dd: the infinipath device
866  * @cntrs: counters are placed here
867  *
868  * Return the counters needed by recv_pma_get_portcounters().
869  */
870 int ipath_get_counters(struct ipath_devdata *dd,
871                        struct ipath_verbs_counters *cntrs)
872 {
873         int ret;
874
875         if (!(dd->ipath_flags & IPATH_INITTED)) {
876                 /* no hardware, freeze, etc. */
877                 ipath_dbg("unit %u not usable\n", dd->ipath_unit);
878                 ret = -EINVAL;
879                 goto bail;
880         }
881         cntrs->symbol_error_counter =
882                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_ibsymbolerrcnt);
883         cntrs->link_error_recovery_counter =
884                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkerrrecovcnt);
885         /*
886          * The link downed counter counts when the other side downs the
887          * connection.  We add in the number of times we downed the link
888          * due to local link integrity errors to compensate.
889          */
890         cntrs->link_downed_counter =
891                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkdowncnt);
892         cntrs->port_rcv_errors =
893                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_rxdroppktcnt) +
894                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvovflcnt) +
895                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_portovflcnt) +
896                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_err_rlencnt) +
897                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_invalidrlencnt) +
898                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_erricrccnt) +
899                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_errvcrccnt) +
900                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_errlpcrccnt) +
901                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_badformatcnt) +
902                 dd->ipath_rxfc_unsupvl_errs;
903         cntrs->port_rcv_remphys_errors =
904                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvebpcnt);
905         cntrs->port_xmit_discards =
906                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_unsupvlcnt);
907         cntrs->port_xmit_data =
908                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
909         cntrs->port_rcv_data =
910                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
911         cntrs->port_xmit_packets =
912                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
913         cntrs->port_rcv_packets =
914                 ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
915         cntrs->local_link_integrity_errors =
916                 (dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
917                 dd->ipath_lli_errs : dd->ipath_lli_errors;
918         cntrs->excessive_buffer_overrun_errors = dd->ipath_overrun_thresh_errs;
919
920         ret = 0;
921
922 bail:
923         return ret;
924 }
925
926 /**
927  * ipath_ib_piobufavail - callback when a PIO buffer is available
928  * @arg: the device pointer
929  *
930  * This is called from ipath_intr() at interrupt level when a PIO buffer is
931  * available after ipath_verbs_send() returned an error that no buffers were
932  * available.  Return 1 if we consumed all the PIO buffers and we still have
933  * QPs waiting for buffers (for now, just do a tasklet_hi_schedule and
934  * return zero).
935  */
936 int ipath_ib_piobufavail(struct ipath_ibdev *dev)
937 {
938         struct ipath_qp *qp;
939         unsigned long flags;
940
941         if (dev == NULL)
942                 goto bail;
943
944         spin_lock_irqsave(&dev->pending_lock, flags);
945         while (!list_empty(&dev->piowait)) {
946                 qp = list_entry(dev->piowait.next, struct ipath_qp,
947                                 piowait);
948                 list_del_init(&qp->piowait);
949                 tasklet_hi_schedule(&qp->s_task);
950         }
951         spin_unlock_irqrestore(&dev->pending_lock, flags);
952
953 bail:
954         return 0;
955 }
956
957 static int ipath_query_device(struct ib_device *ibdev,
958                               struct ib_device_attr *props)
959 {
960         struct ipath_ibdev *dev = to_idev(ibdev);
961
962         memset(props, 0, sizeof(*props));
963
964         props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
965                 IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
966                 IB_DEVICE_SYS_IMAGE_GUID;
967         props->page_size_cap = PAGE_SIZE;
968         props->vendor_id = dev->dd->ipath_vendorid;
969         props->vendor_part_id = dev->dd->ipath_deviceid;
970         props->hw_ver = dev->dd->ipath_pcirev;
971
972         props->sys_image_guid = dev->sys_image_guid;
973
974         props->max_mr_size = ~0ull;
975         props->max_qp = ib_ipath_max_qps;
976         props->max_qp_wr = ib_ipath_max_qp_wrs;
977         props->max_sge = ib_ipath_max_sges;
978         props->max_cq = ib_ipath_max_cqs;
979         props->max_ah = ib_ipath_max_ahs;
980         props->max_cqe = ib_ipath_max_cqes;
981         props->max_mr = dev->lk_table.max;
982         props->max_pd = ib_ipath_max_pds;
983         props->max_qp_rd_atom = 1;
984         props->max_qp_init_rd_atom = 1;
985         /* props->max_res_rd_atom */
986         props->max_srq = ib_ipath_max_srqs;
987         props->max_srq_wr = ib_ipath_max_srq_wrs;
988         props->max_srq_sge = ib_ipath_max_srq_sges;
989         /* props->local_ca_ack_delay */
990         props->atomic_cap = IB_ATOMIC_HCA;
991         props->max_pkeys = ipath_get_npkeys(dev->dd);
992         props->max_mcast_grp = ib_ipath_max_mcast_grps;
993         props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached;
994         props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
995                 props->max_mcast_grp;
996
997         return 0;
998 }
999
1000 const u8 ipath_cvt_physportstate[16] = {
1001         [INFINIPATH_IBCS_LT_STATE_DISABLED] = 3,
1002         [INFINIPATH_IBCS_LT_STATE_LINKUP] = 5,
1003         [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = 2,
1004         [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = 2,
1005         [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = 1,
1006         [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = 1,
1007         [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] = 4,
1008         [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] = 4,
1009         [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] = 4,
1010         [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = 4,
1011         [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] = 6,
1012         [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] = 6,
1013         [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] = 6,
1014 };
1015
1016 u32 ipath_get_cr_errpkey(struct ipath_devdata *dd)
1017 {
1018         return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
1019 }
1020
1021 static int ipath_query_port(struct ib_device *ibdev,
1022                             u8 port, struct ib_port_attr *props)
1023 {
1024         struct ipath_ibdev *dev = to_idev(ibdev);
1025         enum ib_mtu mtu;
1026         u16 lid = dev->dd->ipath_lid;
1027         u64 ibcstat;
1028
1029         memset(props, 0, sizeof(*props));
1030         props->lid = lid ? lid : __constant_be16_to_cpu(IB_LID_PERMISSIVE);
1031         props->lmc = dev->mkeyprot_resv_lmc & 7;
1032         props->sm_lid = dev->sm_lid;
1033         props->sm_sl = dev->sm_sl;
1034         ibcstat = dev->dd->ipath_lastibcstat;
1035         props->state = ((ibcstat >> 4) & 0x3) + 1;
1036         /* See phys_state_show() */
1037         props->phys_state = ipath_cvt_physportstate[
1038                 dev->dd->ipath_lastibcstat & 0xf];
1039         props->port_cap_flags = dev->port_cap_flags;
1040         props->gid_tbl_len = 1;
1041         props->max_msg_sz = 0x80000000;
1042         props->pkey_tbl_len = ipath_get_npkeys(dev->dd);
1043         props->bad_pkey_cntr = ipath_get_cr_errpkey(dev->dd) -
1044                 dev->z_pkey_violations;
1045         props->qkey_viol_cntr = dev->qkey_violations;
1046         props->active_width = IB_WIDTH_4X;
1047         /* See rate_show() */
1048         props->active_speed = 1;        /* Regular 10Mbs speed. */
1049         props->max_vl_num = 1;          /* VLCap = VL0 */
1050         props->init_type_reply = 0;
1051
1052         props->max_mtu = IB_MTU_4096;
1053         switch (dev->dd->ipath_ibmtu) {
1054         case 4096:
1055                 mtu = IB_MTU_4096;
1056                 break;
1057         case 2048:
1058                 mtu = IB_MTU_2048;
1059                 break;
1060         case 1024:
1061                 mtu = IB_MTU_1024;
1062                 break;
1063         case 512:
1064                 mtu = IB_MTU_512;
1065                 break;
1066         case 256:
1067                 mtu = IB_MTU_256;
1068                 break;
1069         default:
1070                 mtu = IB_MTU_2048;
1071         }
1072         props->active_mtu = mtu;
1073         props->subnet_timeout = dev->subnet_timeout;
1074
1075         return 0;
1076 }
1077
1078 static int ipath_modify_device(struct ib_device *device,
1079                                int device_modify_mask,
1080                                struct ib_device_modify *device_modify)
1081 {
1082         int ret;
1083
1084         if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
1085                                    IB_DEVICE_MODIFY_NODE_DESC)) {
1086                 ret = -EOPNOTSUPP;
1087                 goto bail;
1088         }
1089
1090         if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)
1091                 memcpy(device->node_desc, device_modify->node_desc, 64);
1092
1093         if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
1094                 to_idev(device)->sys_image_guid =
1095                         cpu_to_be64(device_modify->sys_image_guid);
1096
1097         ret = 0;
1098
1099 bail:
1100         return ret;
1101 }
1102
1103 static int ipath_modify_port(struct ib_device *ibdev,
1104                              u8 port, int port_modify_mask,
1105                              struct ib_port_modify *props)
1106 {
1107         struct ipath_ibdev *dev = to_idev(ibdev);
1108
1109         dev->port_cap_flags |= props->set_port_cap_mask;
1110         dev->port_cap_flags &= ~props->clr_port_cap_mask;
1111         if (port_modify_mask & IB_PORT_SHUTDOWN)
1112                 ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
1113         if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
1114                 dev->qkey_violations = 0;
1115         return 0;
1116 }
1117
1118 static int ipath_query_gid(struct ib_device *ibdev, u8 port,
1119                            int index, union ib_gid *gid)
1120 {
1121         struct ipath_ibdev *dev = to_idev(ibdev);
1122         int ret;
1123
1124         if (index >= 1) {
1125                 ret = -EINVAL;
1126                 goto bail;
1127         }
1128         gid->global.subnet_prefix = dev->gid_prefix;
1129         gid->global.interface_id = dev->dd->ipath_guid;
1130
1131         ret = 0;
1132
1133 bail:
1134         return ret;
1135 }
1136
1137 static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev,
1138                                     struct ib_ucontext *context,
1139                                     struct ib_udata *udata)
1140 {
1141         struct ipath_ibdev *dev = to_idev(ibdev);
1142         struct ipath_pd *pd;
1143         struct ib_pd *ret;
1144
1145         /*
1146          * This is actually totally arbitrary.  Some correctness tests
1147          * assume there's a maximum number of PDs that can be allocated.
1148          * We don't actually have this limit, but we fail the test if
1149          * we allow allocations of more than we report for this value.
1150          */
1151
1152         pd = kmalloc(sizeof *pd, GFP_KERNEL);
1153         if (!pd) {
1154                 ret = ERR_PTR(-ENOMEM);
1155                 goto bail;
1156         }
1157
1158         spin_lock(&dev->n_pds_lock);
1159         if (dev->n_pds_allocated == ib_ipath_max_pds) {
1160                 spin_unlock(&dev->n_pds_lock);
1161                 kfree(pd);
1162                 ret = ERR_PTR(-ENOMEM);
1163                 goto bail;
1164         }
1165
1166         dev->n_pds_allocated++;
1167         spin_unlock(&dev->n_pds_lock);
1168
1169         /* ib_alloc_pd() will initialize pd->ibpd. */
1170         pd->user = udata != NULL;
1171
1172         ret = &pd->ibpd;
1173
1174 bail:
1175         return ret;
1176 }
1177
1178 static int ipath_dealloc_pd(struct ib_pd *ibpd)
1179 {
1180         struct ipath_pd *pd = to_ipd(ibpd);
1181         struct ipath_ibdev *dev = to_idev(ibpd->device);
1182
1183         spin_lock(&dev->n_pds_lock);
1184         dev->n_pds_allocated--;
1185         spin_unlock(&dev->n_pds_lock);
1186
1187         kfree(pd);
1188
1189         return 0;
1190 }
1191
1192 /**
1193  * ipath_create_ah - create an address handle
1194  * @pd: the protection domain
1195  * @ah_attr: the attributes of the AH
1196  *
1197  * This may be called from interrupt context.
1198  */
1199 static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
1200                                      struct ib_ah_attr *ah_attr)
1201 {
1202         struct ipath_ah *ah;
1203         struct ib_ah *ret;
1204         struct ipath_ibdev *dev = to_idev(pd->device);
1205         unsigned long flags;
1206
1207         /* A multicast address requires a GRH (see ch. 8.4.1). */
1208         if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
1209             ah_attr->dlid != IPATH_PERMISSIVE_LID &&
1210             !(ah_attr->ah_flags & IB_AH_GRH)) {
1211                 ret = ERR_PTR(-EINVAL);
1212                 goto bail;
1213         }
1214
1215         if (ah_attr->dlid == 0) {
1216                 ret = ERR_PTR(-EINVAL);
1217                 goto bail;
1218         }
1219
1220         if (ah_attr->port_num < 1 ||
1221             ah_attr->port_num > pd->device->phys_port_cnt) {
1222                 ret = ERR_PTR(-EINVAL);
1223                 goto bail;
1224         }
1225
1226         ah = kmalloc(sizeof *ah, GFP_ATOMIC);
1227         if (!ah) {
1228                 ret = ERR_PTR(-ENOMEM);
1229                 goto bail;
1230         }
1231
1232         spin_lock_irqsave(&dev->n_ahs_lock, flags);
1233         if (dev->n_ahs_allocated == ib_ipath_max_ahs) {
1234                 spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
1235                 kfree(ah);
1236                 ret = ERR_PTR(-ENOMEM);
1237                 goto bail;
1238         }
1239
1240         dev->n_ahs_allocated++;
1241         spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
1242
1243         /* ib_create_ah() will initialize ah->ibah. */
1244         ah->attr = *ah_attr;
1245
1246         ret = &ah->ibah;
1247
1248 bail:
1249         return ret;
1250 }
1251
1252 /**
1253  * ipath_destroy_ah - destroy an address handle
1254  * @ibah: the AH to destroy
1255  *
1256  * This may be called from interrupt context.
1257  */
1258 static int ipath_destroy_ah(struct ib_ah *ibah)
1259 {
1260         struct ipath_ibdev *dev = to_idev(ibah->device);
1261         struct ipath_ah *ah = to_iah(ibah);
1262         unsigned long flags;
1263
1264         spin_lock_irqsave(&dev->n_ahs_lock, flags);
1265         dev->n_ahs_allocated--;
1266         spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
1267
1268         kfree(ah);
1269
1270         return 0;
1271 }
1272
1273 static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
1274 {
1275         struct ipath_ah *ah = to_iah(ibah);
1276
1277         *ah_attr = ah->attr;
1278
1279         return 0;
1280 }
1281
1282 /**
1283  * ipath_get_npkeys - return the size of the PKEY table for port 0
1284  * @dd: the infinipath device
1285  */
1286 unsigned ipath_get_npkeys(struct ipath_devdata *dd)
1287 {
1288         return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys);
1289 }
1290
1291 /**
1292  * ipath_get_pkey - return the indexed PKEY from the port 0 PKEY table
1293  * @dd: the infinipath device
1294  * @index: the PKEY index
1295  */
1296 unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index)
1297 {
1298         unsigned ret;
1299
1300         if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
1301                 ret = 0;
1302         else
1303                 ret = dd->ipath_pd[0]->port_pkeys[index];
1304
1305         return ret;
1306 }
1307
1308 static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
1309                             u16 *pkey)
1310 {
1311         struct ipath_ibdev *dev = to_idev(ibdev);
1312         int ret;
1313
1314         if (index >= ipath_get_npkeys(dev->dd)) {
1315                 ret = -EINVAL;
1316                 goto bail;
1317         }
1318
1319         *pkey = ipath_get_pkey(dev->dd, index);
1320         ret = 0;
1321
1322 bail:
1323         return ret;
1324 }
1325
1326 /**
1327  * ipath_alloc_ucontext - allocate a ucontest
1328  * @ibdev: the infiniband device
1329  * @udata: not used by the InfiniPath driver
1330  */
1331
1332 static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev,
1333                                                 struct ib_udata *udata)
1334 {
1335         struct ipath_ucontext *context;
1336         struct ib_ucontext *ret;
1337
1338         context = kmalloc(sizeof *context, GFP_KERNEL);
1339         if (!context) {
1340                 ret = ERR_PTR(-ENOMEM);
1341                 goto bail;
1342         }
1343
1344         ret = &context->ibucontext;
1345
1346 bail:
1347         return ret;
1348 }
1349
/* Free the driver ucontext that embeds @context.  Always returns 0. */
static int ipath_dealloc_ucontext(struct ib_ucontext *context)
{
	struct ipath_ucontext *uctx = to_iucontext(context);

	kfree(uctx);
	return 0;
}
1355
1356 static int ipath_verbs_register_sysfs(struct ib_device *dev);
1357
1358 static void __verbs_timer(unsigned long arg)
1359 {
1360         struct ipath_devdata *dd = (struct ipath_devdata *) arg;
1361
1362         /*
1363          * If port 0 receive packet interrupts are not available, or
1364          * can be missed, poll the receive queue
1365          */
1366         if (dd->ipath_flags & IPATH_POLL_RX_INTR)
1367                 ipath_kreceive(dd);
1368
1369         /* Handle verbs layer timeouts. */
1370         ipath_ib_timer(dd->verbs_dev);
1371
1372         mod_timer(&dd->verbs_timer, jiffies + 1);
1373 }
1374
1375 static int enable_timer(struct ipath_devdata *dd)
1376 {
1377         /*
1378          * Early chips had a design flaw where the chip and kernel idea
1379          * of the tail register don't always agree, and therefore we won't
1380          * get an interrupt on the next packet received.
1381          * If the board supports per packet receive interrupts, use it.
1382          * Otherwise, the timer function periodically checks for packets
1383          * to cover this case.
1384          * Either way, the timer is needed for verbs layer related
1385          * processing.
1386          */
1387         if (dd->ipath_flags & IPATH_GPIO_INTR) {
1388                 u64 val;
1389                 ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
1390                                  0x2074076542310ULL);
1391                 /* Enable GPIO bit 2 interrupt */
1392                 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask);
1393                 val |= (u64) (1 << IPATH_GPIO_PORT0_BIT);
1394                 ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val);
1395         }
1396
1397         init_timer(&dd->verbs_timer);
1398         dd->verbs_timer.function = __verbs_timer;
1399         dd->verbs_timer.data = (unsigned long)dd;
1400         dd->verbs_timer.expires = jiffies + 1;
1401         add_timer(&dd->verbs_timer);
1402
1403         return 0;
1404 }
1405
1406 static int disable_timer(struct ipath_devdata *dd)
1407 {
1408         /* Disable GPIO bit 2 interrupt */
1409         if (dd->ipath_flags & IPATH_GPIO_INTR) {
1410                 u64 val;
1411                 /* Disable GPIO bit 2 interrupt */
1412                 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask);
1413                 val &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
1414                 ipath_write_kreg( dd, dd->ipath_kregs->kr_gpio_mask, val);
1415                 /*
1416                  * We might want to undo changes to debugportselect,
1417                  * but how?
1418                  */
1419         }
1420
1421         del_timer_sync(&dd->verbs_timer);
1422
1423         return 0;
1424 }
1425
1426 /**
1427  * ipath_register_ib_device - register our device with the infiniband core
1428  * @dd: the device data structure
1429  * Return the allocated ipath_ibdev pointer or NULL on error.
1430  */
1431 int ipath_register_ib_device(struct ipath_devdata *dd)
1432 {
1433         struct ipath_verbs_counters cntrs;
1434         struct ipath_ibdev *idev;
1435         struct ib_device *dev;
1436         int ret;
1437
1438         idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
1439         if (idev == NULL) {
1440                 ret = -ENOMEM;
1441                 goto bail;
1442         }
1443
1444         dev = &idev->ibdev;
1445
1446         /* Only need to initialize non-zero fields. */
1447         spin_lock_init(&idev->n_pds_lock);
1448         spin_lock_init(&idev->n_ahs_lock);
1449         spin_lock_init(&idev->n_cqs_lock);
1450         spin_lock_init(&idev->n_qps_lock);
1451         spin_lock_init(&idev->n_srqs_lock);
1452         spin_lock_init(&idev->n_mcast_grps_lock);
1453
1454         spin_lock_init(&idev->qp_table.lock);
1455         spin_lock_init(&idev->lk_table.lock);
1456         idev->sm_lid = __constant_be16_to_cpu(IB_LID_PERMISSIVE);
1457         /* Set the prefix to the default value (see ch. 4.1.1) */
1458         idev->gid_prefix = __constant_cpu_to_be64(0xfe80000000000000ULL);
1459
1460         ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size);
1461         if (ret)
1462                 goto err_qp;
1463
1464         /*
1465          * The top ib_ipath_lkey_table_size bits are used to index the
1466          * table.  The lower 8 bits can be owned by the user (copied from
1467          * the LKEY).  The remaining bits act as a generation number or tag.
1468          */
1469         idev->lk_table.max = 1 << ib_ipath_lkey_table_size;
1470         idev->lk_table.table = kzalloc(idev->lk_table.max *
1471                                        sizeof(*idev->lk_table.table),
1472                                        GFP_KERNEL);
1473         if (idev->lk_table.table == NULL) {
1474                 ret = -ENOMEM;
1475                 goto err_lk;
1476         }
1477         spin_lock_init(&idev->pending_lock);
1478         INIT_LIST_HEAD(&idev->pending[0]);
1479         INIT_LIST_HEAD(&idev->pending[1]);
1480         INIT_LIST_HEAD(&idev->pending[2]);
1481         INIT_LIST_HEAD(&idev->piowait);
1482         INIT_LIST_HEAD(&idev->rnrwait);
1483         idev->pending_index = 0;
1484         idev->port_cap_flags =
1485                 IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
1486         idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
1487         idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
1488         idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
1489         idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
1490         idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
1491         idev->link_width_enabled = 3;   /* 1x or 4x */
1492
1493         /* Snapshot current HW counters to "clear" them. */
1494         ipath_get_counters(dd, &cntrs);
1495         idev->z_symbol_error_counter = cntrs.symbol_error_counter;
1496         idev->z_link_error_recovery_counter =
1497                 cntrs.link_error_recovery_counter;
1498         idev->z_link_downed_counter = cntrs.link_downed_counter;
1499         idev->z_port_rcv_errors = cntrs.port_rcv_errors;
1500         idev->z_port_rcv_remphys_errors =
1501                 cntrs.port_rcv_remphys_errors;
1502         idev->z_port_xmit_discards = cntrs.port_xmit_discards;
1503         idev->z_port_xmit_data = cntrs.port_xmit_data;
1504         idev->z_port_rcv_data = cntrs.port_rcv_data;
1505         idev->z_port_xmit_packets = cntrs.port_xmit_packets;
1506         idev->z_port_rcv_packets = cntrs.port_rcv_packets;
1507         idev->z_local_link_integrity_errors =
1508                 cntrs.local_link_integrity_errors;
1509         idev->z_excessive_buffer_overrun_errors =
1510                 cntrs.excessive_buffer_overrun_errors;
1511
1512         /*
1513          * The system image GUID is supposed to be the same for all
1514          * IB HCAs in a single system but since there can be other
1515          * device types in the system, we can't be sure this is unique.
1516          */
1517         if (!sys_image_guid)
1518                 sys_image_guid = dd->ipath_guid;
1519         idev->sys_image_guid = sys_image_guid;
1520         idev->ib_unit = dd->ipath_unit;
1521         idev->dd = dd;
1522
1523         strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX);
1524         dev->owner = THIS_MODULE;
1525         dev->node_guid = dd->ipath_guid;
1526         dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION;
1527         dev->uverbs_cmd_mask =
1528                 (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
1529                 (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
1530                 (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
1531                 (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
1532                 (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
1533                 (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
1534                 (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
1535                 (1ull << IB_USER_VERBS_CMD_QUERY_AH)            |
1536                 (1ull << IB_USER_VERBS_CMD_REG_MR)              |
1537                 (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
1538                 (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
1539                 (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
1540                 (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
1541                 (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
1542                 (1ull << IB_USER_VERBS_CMD_POLL_CQ)             |
1543                 (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)       |
1544                 (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
1545                 (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
1546                 (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
1547                 (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
1548                 (1ull << IB_USER_VERBS_CMD_POST_SEND)           |
1549                 (1ull << IB_USER_VERBS_CMD_POST_RECV)           |
1550                 (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
1551                 (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
1552                 (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
1553                 (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
1554                 (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
1555                 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
1556                 (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
1557         dev->node_type = RDMA_NODE_IB_CA;
1558         dev->phys_port_cnt = 1;
1559         dev->dma_device = &dd->pcidev->dev;
1560         dev->class_dev.dev = dev->dma_device;
1561         dev->query_device = ipath_query_device;
1562         dev->modify_device = ipath_modify_device;
1563         dev->query_port = ipath_query_port;
1564         dev->modify_port = ipath_modify_port;
1565         dev->query_pkey = ipath_query_pkey;
1566         dev->query_gid = ipath_query_gid;
1567         dev->alloc_ucontext = ipath_alloc_ucontext;
1568         dev->dealloc_ucontext = ipath_dealloc_ucontext;
1569         dev->alloc_pd = ipath_alloc_pd;
1570         dev->dealloc_pd = ipath_dealloc_pd;
1571         dev->create_ah = ipath_create_ah;
1572         dev->destroy_ah = ipath_destroy_ah;
1573         dev->query_ah = ipath_query_ah;
1574         dev->create_srq = ipath_create_srq;
1575         dev->modify_srq = ipath_modify_srq;
1576         dev->query_srq = ipath_query_srq;
1577         dev->destroy_srq = ipath_destroy_srq;
1578         dev->create_qp = ipath_create_qp;
1579         dev->modify_qp = ipath_modify_qp;
1580         dev->query_qp = ipath_query_qp;
1581         dev->destroy_qp = ipath_destroy_qp;
1582         dev->post_send = ipath_post_send;
1583         dev->post_recv = ipath_post_receive;
1584         dev->post_srq_recv = ipath_post_srq_receive;
1585         dev->create_cq = ipath_create_cq;
1586         dev->destroy_cq = ipath_destroy_cq;
1587         dev->resize_cq = ipath_resize_cq;
1588         dev->poll_cq = ipath_poll_cq;
1589         dev->req_notify_cq = ipath_req_notify_cq;
1590         dev->get_dma_mr = ipath_get_dma_mr;
1591         dev->reg_phys_mr = ipath_reg_phys_mr;
1592         dev->reg_user_mr = ipath_reg_user_mr;
1593         dev->dereg_mr = ipath_dereg_mr;
1594         dev->alloc_fmr = ipath_alloc_fmr;
1595         dev->map_phys_fmr = ipath_map_phys_fmr;
1596         dev->unmap_fmr = ipath_unmap_fmr;
1597         dev->dealloc_fmr = ipath_dealloc_fmr;
1598         dev->attach_mcast = ipath_multicast_attach;
1599         dev->detach_mcast = ipath_multicast_detach;
1600         dev->process_mad = ipath_process_mad;
1601         dev->mmap = ipath_mmap;
1602         dev->dma_ops = &ipath_dma_mapping_ops;
1603
1604         snprintf(dev->node_desc, sizeof(dev->node_desc),
1605                  IPATH_IDSTR " %s", init_utsname()->nodename);
1606
1607         ret = ib_register_device(dev);
1608         if (ret)
1609                 goto err_reg;
1610
1611         if (ipath_verbs_register_sysfs(dev))
1612                 goto err_class;
1613
1614         enable_timer(dd);
1615
1616         goto bail;
1617
1618 err_class:
1619         ib_unregister_device(dev);
1620 err_reg:
1621         kfree(idev->lk_table.table);
1622 err_lk:
1623         kfree(idev->qp_table.table);
1624 err_qp:
1625         ib_dealloc_device(dev);
1626         ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret);
1627         idev = NULL;
1628
1629 bail:
1630         dd->verbs_dev = idev;
1631         return ret;
1632 }
1633
1634 void ipath_unregister_ib_device(struct ipath_ibdev *dev)
1635 {
1636         struct ib_device *ibdev = &dev->ibdev;
1637
1638         disable_timer(dev->dd);
1639
1640         ib_unregister_device(ibdev);
1641
1642         if (!list_empty(&dev->pending[0]) ||
1643             !list_empty(&dev->pending[1]) ||
1644             !list_empty(&dev->pending[2]))
1645                 ipath_dev_err(dev->dd, "pending list not empty!\n");
1646         if (!list_empty(&dev->piowait))
1647                 ipath_dev_err(dev->dd, "piowait list not empty!\n");
1648         if (!list_empty(&dev->rnrwait))
1649                 ipath_dev_err(dev->dd, "rnrwait list not empty!\n");
1650         if (!ipath_mcast_tree_empty())
1651                 ipath_dev_err(dev->dd, "multicast table memory leak!\n");
1652         /*
1653          * Note that ipath_unregister_ib_device() can be called before all
1654          * the QPs are destroyed!
1655          */
1656         ipath_free_all_qps(&dev->qp_table);
1657         kfree(dev->qp_table.table);
1658         kfree(dev->lk_table.table);
1659         ib_dealloc_device(ibdev);
1660 }
1661
1662 static ssize_t show_rev(struct class_device *cdev, char *buf)
1663 {
1664         struct ipath_ibdev *dev =
1665                 container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
1666
1667         return sprintf(buf, "%x\n", dev->dd->ipath_pcirev);
1668 }
1669
1670 static ssize_t show_hca(struct class_device *cdev, char *buf)
1671 {
1672         struct ipath_ibdev *dev =
1673                 container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
1674         int ret;
1675
1676         ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128);
1677         if (ret < 0)
1678                 goto bail;
1679         strcat(buf, "\n");
1680         ret = strlen(buf);
1681
1682 bail:
1683         return ret;
1684 }
1685
1686 static ssize_t show_stats(struct class_device *cdev, char *buf)
1687 {
1688         struct ipath_ibdev *dev =
1689                 container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
1690         int i;
1691         int len;
1692
1693         len = sprintf(buf,
1694                       "RC resends  %d\n"
1695                       "RC no QACK  %d\n"
1696                       "RC ACKs     %d\n"
1697                       "RC SEQ NAKs %d\n"
1698                       "RC RDMA seq %d\n"
1699                       "RC RNR NAKs %d\n"
1700                       "RC OTH NAKs %d\n"
1701                       "RC timeouts %d\n"
1702                       "RC RDMA dup %d\n"
1703                       "RC stalls   %d\n"
1704                       "piobuf wait %d\n"
1705                       "no piobuf   %d\n"
1706                       "PKT drops   %d\n"
1707                       "WQE errs    %d\n",
1708                       dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
1709                       dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
1710                       dev->n_other_naks, dev->n_timeouts,
1711                       dev->n_rdma_dup_busy, dev->n_rc_stalls, dev->n_piowait,
1712                       dev->n_no_piobuf, dev->n_pkt_drops, dev->n_wqe_errs);
1713         for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
1714                 const struct ipath_opcode_stats *si = &dev->opstats[i];
1715
1716                 if (!si->n_packets && !si->n_bytes)
1717                         continue;
1718                 len += sprintf(buf + len, "%02x %llu/%llu\n", i,
1719                                (unsigned long long) si->n_packets,
1720                                (unsigned long long) si->n_bytes);
1721         }
1722         return len;
1723 }
1724
/*
 * Class device attributes exported by this driver.  Each is declared
 * with mode S_IRUGO and NULL as the final macro argument.  Note that
 * hca_type and board_id are both served by show_hca().
 */
static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
static CLASS_DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
1729
1730 static struct class_device_attribute *ipath_class_attributes[] = {
1731         &class_device_attr_hw_rev,
1732         &class_device_attr_hca_type,
1733         &class_device_attr_board_id,
1734         &class_device_attr_stats
1735 };
1736
1737 static int ipath_verbs_register_sysfs(struct ib_device *dev)
1738 {
1739         int i;
1740         int ret;
1741
1742         for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
1743                 if (class_device_create_file(&dev->class_dev,
1744                                              ipath_class_attributes[i])) {
1745                         ret = 1;
1746                         goto bail;
1747                 }
1748
1749         ret = 0;
1750
1751 bail:
1752         return ret;
1753 }