IPoIB/cm: Factor out ipoib_cm_free_rx_ring()
[linux-2.6] drivers/infiniband/ulp/ipoib/ipoib_cm.c
1 /*
2  * Copyright (c) 2006 Mellanox Technologies. All rights reserved
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  *
32  * $Id$
33  */
34
35 #include <rdma/ib_cm.h>
36 #include <rdma/ib_cache.h>
37 #include <net/dst.h>
38 #include <net/icmp.h>
39 #include <linux/icmpv6.h>
40 #include <linux/delay.h>
41
42 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
43 static int data_debug_level;
44
45 module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
46 MODULE_PARM_DESC(cm_data_debug_level,
47                  "Enable data path debug tracing for connected mode if > 0");
48 #endif
49
50 #include "ipoib.h"
51
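/* Prefix of the CM service ID used for IPoIB connected mode; the low 24 bits
 * carry the IPoIB UD QPN (see the ib_cm_listen() call in ipoib_cm_dev_open()). */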
52 #define IPOIB_CM_IETF_ID 0x1000000000000000ULL
53
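/* Passive (RX) connection aging: a connection's LRU timestamp is refreshed at
 * most once per RX_UPDATE_TIME, connections idle longer than RX_TIMEOUT are
 * moved to the error state by the stale task, which reschedules itself every
 * RX_DELAY.  RX_UPDATE_MASK restricts the timestamp check to one receive
 * completion in four. */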
54 #define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
55 #define IPOIB_CM_RX_TIMEOUT     (2 * 256 * HZ)
56 #define IPOIB_CM_RX_DELAY       (3 * 256 * HZ)
57 #define IPOIB_CM_RX_UPDATE_MASK (0x3)
58
59 static struct ib_qp_attr ipoib_cm_err_attr = {
60         .qp_state = IB_QPS_ERR
61 };
62
63 #define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
64
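/* Dummy send WR posted on an RX QP that is already in the error state: its
 * flush completion, matched by IPOIB_CM_RX_DRAIN_WRID in the RX completion
 * handler, signals that the QPs on the drain list can be handed to the reap
 * task. */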
65 static struct ib_send_wr ipoib_cm_rx_drain_wr = {
66         .wr_id = IPOIB_CM_RX_DRAIN_WRID,
67         .opcode = IB_WR_SEND,
68 };
69
70 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
71                                struct ib_cm_event *event);
72
73 static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
74                                   u64 mapping[IPOIB_CM_RX_SG])
75 {
76         int i;
77
78         ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
79
80         for (i = 0; i < frags; ++i)
81                 ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
82 }
83
84 static int ipoib_cm_post_receive(struct net_device *dev, int id)
85 {
86         struct ipoib_dev_priv *priv = netdev_priv(dev);
87         struct ib_recv_wr *bad_wr;
88         int i, ret;
89
90         priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
91
92         for (i = 0; i < IPOIB_CM_RX_SG; ++i)
93                 priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
94
95         ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
96         if (unlikely(ret)) {
97                 ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
98                 ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
99                                       priv->cm.srq_ring[id].mapping);
100                 dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
101                 priv->cm.srq_ring[id].skb = NULL;
102         }
103
104         return ret;
105 }
106
107 static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int frags,
108                                              u64 mapping[IPOIB_CM_RX_SG])
109 {
110         struct ipoib_dev_priv *priv = netdev_priv(dev);
111         struct sk_buff *skb;
112         int i;
113
114         skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
115         if (unlikely(!skb))
116                 return NULL;
117
118         /*
119          * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
120          * IP header to a multiple of 16.
121          */
122         skb_reserve(skb, 12);
123
124         mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
125                                        DMA_FROM_DEVICE);
126         if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
127                 dev_kfree_skb_any(skb);
128                 return NULL;
129         }
130
131         for (i = 0; i < frags; i++) {
132                 struct page *page = alloc_page(GFP_ATOMIC);
133
134                 if (!page)
135                         goto partial_error;
136                 skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);
137
138                 mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page,
139                                                  0, PAGE_SIZE, DMA_FROM_DEVICE);
140                 if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
141                         goto partial_error;
142         }
143
144         priv->cm.srq_ring[id].skb = skb;
145         return skb;
146
147 partial_error:
148
149         ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
150
151         for (; i > 0; --i)
152                 ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);
153
154         dev_kfree_skb_any(skb);
155         return NULL;
156 }
157
158 static void ipoib_cm_free_rx_ring(struct net_device *dev,
159                                   struct ipoib_cm_rx_buf *rx_ring)
160 {
161         struct ipoib_dev_priv *priv = netdev_priv(dev);
162         int i;
163
164         for (i = 0; i < ipoib_recvq_size; ++i)
165                 if (rx_ring[i].skb) {
166                         ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
167                                               rx_ring[i].mapping);
168                         dev_kfree_skb_any(rx_ring[i].skb);
169                 }
170
171         kfree(rx_ring);
172 }
173
174 static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
175 {
176         struct ib_send_wr *bad_wr;
177         struct ipoib_cm_rx *p;
178
179         /* We only reserved 1 extra slot in CQ for drain WRs, so
180          * make sure we have at most 1 outstanding WR. */
181         if (list_empty(&priv->cm.rx_flush_list) ||
182             !list_empty(&priv->cm.rx_drain_list))
183                 return;
184
185         /*
186          * QPs on the flush list are in the error state.  This way, a "flush
187          * error" WC will be immediately generated for each WR we post.
188          */
189         p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
190         if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
191                 ipoib_warn(priv, "failed to post drain wr\n");
192
193         list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
194 }
195
196 static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
197 {
198         struct ipoib_cm_rx *p = ctx;
199         struct ipoib_dev_priv *priv = netdev_priv(p->dev);
200         unsigned long flags;
201
202         if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
203                 return;
204
205         spin_lock_irqsave(&priv->lock, flags);
206         list_move(&p->list, &priv->cm.rx_flush_list);
207         p->state = IPOIB_CM_RX_FLUSH;
208         ipoib_cm_start_rx_drain(priv);
209         spin_unlock_irqrestore(&priv->lock, flags);
210 }
211
212 static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
213                                            struct ipoib_cm_rx *p)
214 {
215         struct ipoib_dev_priv *priv = netdev_priv(dev);
216         struct ib_qp_init_attr attr = {
217                 .event_handler = ipoib_cm_rx_event_handler,
218                 .send_cq = priv->cq, /* For drain WR */
219                 .recv_cq = priv->cq,
220                 .srq = priv->cm.srq,
221                 .cap.max_send_wr = 1, /* For drain WR */
222                 .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
223                 .sq_sig_type = IB_SIGNAL_ALL_WR,
224                 .qp_type = IB_QPT_RC,
225                 .qp_context = p,
226         };
227         return ib_create_qp(priv->pd, &attr);
228 }
229
230 static int ipoib_cm_modify_rx_qp(struct net_device *dev,
231                                   struct ib_cm_id *cm_id, struct ib_qp *qp,
232                                   unsigned psn)
233 {
234         struct ipoib_dev_priv *priv = netdev_priv(dev);
235         struct ib_qp_attr qp_attr;
236         int qp_attr_mask, ret;
237
238         qp_attr.qp_state = IB_QPS_INIT;
239         ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
240         if (ret) {
241                 ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
242                 return ret;
243         }
244         ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
245         if (ret) {
246                 ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
247                 return ret;
248         }
249         qp_attr.qp_state = IB_QPS_RTR;
250         ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
251         if (ret) {
252                 ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
253                 return ret;
254         }
255         qp_attr.rq_psn = psn;
256         ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
257         if (ret) {
258                 ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
259                 return ret;
260         }
261
262         /*
263          * Current Mellanox HCA firmware won't generate completions
264          * with error for drain WRs unless the QP has been moved to
265          * RTS first. This work-around leaves a window where a QP has
266          * moved to error asynchronously, but this will eventually get
267          * fixed in firmware, so let's not error out if modify QP
268          * fails.
269          */
270         qp_attr.qp_state = IB_QPS_RTS;
271         ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
272         if (ret) {
273                 ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
274                 return 0;
275         }
276         ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
277         if (ret) {
278                 ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
279                 return 0;
280         }
281
282         return 0;
283 }
284
285 static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
286                              struct ib_qp *qp, struct ib_cm_req_event_param *req,
287                              unsigned psn)
288 {
289         struct ipoib_dev_priv *priv = netdev_priv(dev);
290         struct ipoib_cm_data data = {};
291         struct ib_cm_rep_param rep = {};
292
293         data.qpn = cpu_to_be32(priv->qp->qp_num);
294         data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
295
296         rep.private_data = &data;
297         rep.private_data_len = sizeof data;
298         rep.flow_control = 0;
299         rep.rnr_retry_count = req->rnr_retry_count;
300         rep.srq = 1;
301         rep.qp_num = qp->qp_num;
302         rep.starting_psn = psn;
303         return ib_send_cm_rep(cm_id, &rep);
304 }
305
306 static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
307 {
308         struct net_device *dev = cm_id->context;
309         struct ipoib_dev_priv *priv = netdev_priv(dev);
310         struct ipoib_cm_rx *p;
311         unsigned psn;
312         int ret;
313
314         ipoib_dbg(priv, "REQ arrived\n");
315         p = kzalloc(sizeof *p, GFP_KERNEL);
316         if (!p)
317                 return -ENOMEM;
318         p->dev = dev;
319         p->id = cm_id;
320         cm_id->context = p;
321         p->state = IPOIB_CM_RX_LIVE;
322         p->jiffies = jiffies;
323         INIT_LIST_HEAD(&p->list);
324
325         p->qp = ipoib_cm_create_rx_qp(dev, p);
326         if (IS_ERR(p->qp)) {
327                 ret = PTR_ERR(p->qp);
328                 goto err_qp;
329         }
330
331         psn = random32() & 0xffffff;
332         ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
333         if (ret)
334                 goto err_modify;
335
336         spin_lock_irq(&priv->lock);
337         queue_delayed_work(ipoib_workqueue,
338                            &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
339         /* Add this entry to passive ids list head, but do not re-add it
340          * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
341         p->jiffies = jiffies;
342         if (p->state == IPOIB_CM_RX_LIVE)
343                 list_move(&p->list, &priv->cm.passive_ids);
344         spin_unlock_irq(&priv->lock);
345
346         ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
347         if (ret) {
348                 ipoib_warn(priv, "failed to send REP: %d\n", ret);
349                 if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
350                         ipoib_warn(priv, "unable to move qp to error state\n");
351         }
352         return 0;
353
354 err_modify:
355         ib_destroy_qp(p->qp);
356 err_qp:
357         kfree(p);
358         return ret;
359 }
360
361 static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
362                                struct ib_cm_event *event)
363 {
364         struct ipoib_cm_rx *p;
365         struct ipoib_dev_priv *priv;
366
367         switch (event->event) {
368         case IB_CM_REQ_RECEIVED:
369                 return ipoib_cm_req_handler(cm_id, event);
370         case IB_CM_DREQ_RECEIVED:
371                 p = cm_id->context;
372                 ib_send_cm_drep(cm_id, NULL, 0);
373                 /* Fall through */
374         case IB_CM_REJ_RECEIVED:
375                 p = cm_id->context;
376                 priv = netdev_priv(p->dev);
377                 if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
378                         ipoib_warn(priv, "unable to move qp to error state\n");
379                 /* Fall through */
380         default:
381                 return 0;
382         }
383 }
384 /* Adjust length of skb with fragments to match received data */
385 static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
386                           unsigned int length, struct sk_buff *toskb)
387 {
388         int i, num_frags;
389         unsigned int size;
390
391         /* put header into skb */
392         size = min(length, hdr_space);
393         skb->tail += size;
394         skb->len += size;
395         length -= size;
396
397         num_frags = skb_shinfo(skb)->nr_frags;
398         for (i = 0; i < num_frags; i++) {
399                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
400
401                 if (length == 0) {
402                         /* don't need this page */
403                         skb_fill_page_desc(toskb, i, frag->page, 0, PAGE_SIZE);
404                         --skb_shinfo(skb)->nr_frags;
405                 } else {
406                         size = min(length, (unsigned) PAGE_SIZE);
407
408                         frag->size = size;
409                         skb->data_len += size;
410                         skb->truesize += size;
411                         skb->len += size;
412                         length -= size;
413                 }
414         }
415 }
416
417 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
418 {
419         struct ipoib_dev_priv *priv = netdev_priv(dev);
420         unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
421         struct sk_buff *skb, *newskb;
422         struct ipoib_cm_rx *p;
423         unsigned long flags;
424         u64 mapping[IPOIB_CM_RX_SG];
425         int frags;
426
427         ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
428                        wr_id, wc->status);
429
430         if (unlikely(wr_id >= ipoib_recvq_size)) {
431                 if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
432                         spin_lock_irqsave(&priv->lock, flags);
433                         list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
434                         ipoib_cm_start_rx_drain(priv);
435                         queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
436                         spin_unlock_irqrestore(&priv->lock, flags);
437                 } else
438                         ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
439                                    wr_id, ipoib_recvq_size);
440                 return;
441         }
442
443         skb  = priv->cm.srq_ring[wr_id].skb;
444
445         if (unlikely(wc->status != IB_WC_SUCCESS)) {
446                 ipoib_dbg(priv, "cm recv error "
447                            "(status=%d, wrid=%d vend_err %x)\n",
448                            wc->status, wr_id, wc->vendor_err);
449                 ++dev->stats.rx_dropped;
450                 goto repost;
451         }
452
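        /* Only one receive slot in four (wr_id & IPOIB_CM_RX_UPDATE_MASK == 0)
         * checks whether this connection's LRU timestamp needs refreshing. */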
453         if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
454                 p = wc->qp->qp_context;
455                 if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
456                         spin_lock_irqsave(&priv->lock, flags);
457                         p->jiffies = jiffies;
458                         /* Move this entry to list head, but do not re-add it
459                          * if it has been moved out of list. */
460                         if (p->state == IPOIB_CM_RX_LIVE)
461                                 list_move(&p->list, &priv->cm.passive_ids);
462                         spin_unlock_irqrestore(&priv->lock, flags);
463                 }
464         }
465
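        /* Page fragments used by this packet beyond the IPOIB_CM_HEAD_SIZE
         * linear part; allocate the replacement skb with the same layout. */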
466         frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
467                                               (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
468
469         newskb = ipoib_cm_alloc_rx_skb(dev, wr_id, frags, mapping);
470         if (unlikely(!newskb)) {
471                 /*
472                  * If we can't allocate a new RX buffer, dump
473                  * this packet and reuse the old buffer.
474                  */
475                 ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
476                 ++dev->stats.rx_dropped;
477                 goto repost;
478         }
479
480         ipoib_cm_dma_unmap_rx(priv, frags, priv->cm.srq_ring[wr_id].mapping);
481         memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
482
483         ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
484                        wc->byte_len, wc->slid);
485
486         skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
487
488         skb->protocol = ((struct ipoib_header *) skb->data)->proto;
489         skb_reset_mac_header(skb);
490         skb_pull(skb, IPOIB_ENCAP_LEN);
491
492         dev->last_rx = jiffies;
493         ++dev->stats.rx_packets;
494         dev->stats.rx_bytes += skb->len;
495
496         skb->dev = dev;
497         /* XXX get correct PACKET_ type here */
498         skb->pkt_type = PACKET_HOST;
499         netif_receive_skb(skb);
500
501 repost:
502         if (unlikely(ipoib_cm_post_receive(dev, wr_id)))
503                 ipoib_warn(priv, "ipoib_cm_post_receive failed "
504                            "for buf %d\n", wr_id);
505 }
506
507 static inline int post_send(struct ipoib_dev_priv *priv,
508                             struct ipoib_cm_tx *tx,
509                             unsigned int wr_id,
510                             u64 addr, int len)
511 {
512         struct ib_send_wr *bad_wr;
513
514         priv->tx_sge.addr       = addr;
515         priv->tx_sge.length     = len;
516
517         priv->tx_wr.wr_id       = wr_id | IPOIB_OP_CM;
518
519         return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
520 }
521
522 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
523 {
524         struct ipoib_dev_priv *priv = netdev_priv(dev);
525         struct ipoib_tx_buf *tx_req;
526         u64 addr;
527
528         if (unlikely(skb->len > tx->mtu)) {
529                 ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
530                            skb->len, tx->mtu);
531                 ++dev->stats.tx_dropped;
532                 ++dev->stats.tx_errors;
533                 ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN);
534                 return;
535         }
536
537         ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
538                        tx->tx_head, skb->len, tx->qp->qp_num);
539
540         /*
541          * We put the skb into the tx_ring _before_ we call post_send()
542          * because it's entirely possible that the completion handler will
543          * run before we execute anything after the post_send().  That
544          * means we have to make sure everything is properly recorded and
545          * our state is consistent before we call post_send().
546          */
547         tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
548         tx_req->skb = skb;
549         addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
550         if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
551                 ++dev->stats.tx_errors;
552                 dev_kfree_skb_any(skb);
553                 return;
554         }
555
556         tx_req->mapping = addr;
557
558         if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
559                                addr, skb->len))) {
560                 ipoib_warn(priv, "post_send failed\n");
561                 ++dev->stats.tx_errors;
562                 ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
563                 dev_kfree_skb_any(skb);
564         } else {
565                 dev->trans_start = jiffies;
566                 ++tx->tx_head;
567
568                 if (++priv->tx_outstanding == ipoib_sendq_size) {
569                         ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
570                                   tx->qp->qp_num);
571                         netif_stop_queue(dev);
572                 }
573         }
574 }
575
576 void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
577 {
578         struct ipoib_dev_priv *priv = netdev_priv(dev);
579         struct ipoib_cm_tx *tx = wc->qp->qp_context;
580         unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
581         struct ipoib_tx_buf *tx_req;
582         unsigned long flags;
583
584         ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
585                        wr_id, wc->status);
586
587         if (unlikely(wr_id >= ipoib_sendq_size)) {
588                 ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
589                            wr_id, ipoib_sendq_size);
590                 return;
591         }
592
593         tx_req = &tx->tx_ring[wr_id];
594
595         ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
596
597         /* FIXME: is this right? Shouldn't we only increment on success? */
598         ++dev->stats.tx_packets;
599         dev->stats.tx_bytes += tx_req->skb->len;
600
601         dev_kfree_skb_any(tx_req->skb);
602
603         spin_lock_irqsave(&priv->tx_lock, flags);
604         ++tx->tx_tail;
605         if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
606             netif_queue_stopped(dev) &&
607             test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
608                 netif_wake_queue(dev);
609
610         if (wc->status != IB_WC_SUCCESS &&
611             wc->status != IB_WC_WR_FLUSH_ERR) {
612                 struct ipoib_neigh *neigh;
613
614                 ipoib_dbg(priv, "failed cm send event "
615                            "(status=%d, wrid=%d vend_err %x)\n",
616                            wc->status, wr_id, wc->vendor_err);
617
618                 spin_lock(&priv->lock);
619                 neigh = tx->neigh;
620
621                 if (neigh) {
622                         neigh->cm = NULL;
623                         list_del(&neigh->list);
624                         if (neigh->ah)
625                                 ipoib_put_ah(neigh->ah);
626                         ipoib_neigh_free(dev, neigh);
627
628                         tx->neigh = NULL;
629                 }
630
631                 if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
632                         list_move(&tx->list, &priv->cm.reap_list);
633                         queue_work(ipoib_workqueue, &priv->cm.reap_task);
634                 }
635
636                 clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
637
638                 spin_unlock(&priv->lock);
639         }
640
641         spin_unlock_irqrestore(&priv->tx_lock, flags);
642 }
643
644 int ipoib_cm_dev_open(struct net_device *dev)
645 {
646         struct ipoib_dev_priv *priv = netdev_priv(dev);
647         int ret;
648
649         if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
650                 return 0;
651
652         priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
653         if (IS_ERR(priv->cm.id)) {
654                 printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
655                 ret = PTR_ERR(priv->cm.id);
656                 goto err_cm;
657         }
658
659         ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
660                            0, NULL);
661         if (ret) {
662                 printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
663                        IPOIB_CM_IETF_ID | priv->qp->qp_num);
664                 goto err_listen;
665         }
666
667         return 0;
668
669 err_listen:
670         ib_destroy_cm_id(priv->cm.id);
671 err_cm:
672         priv->cm.id = NULL;
673         return ret;
674 }
675
676 void ipoib_cm_dev_stop(struct net_device *dev)
677 {
678         struct ipoib_dev_priv *priv = netdev_priv(dev);
679         struct ipoib_cm_rx *p, *n;
680         unsigned long begin;
681         LIST_HEAD(list);
682         int ret;
683
684         if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
685                 return;
686
687         ib_destroy_cm_id(priv->cm.id);
688         priv->cm.id = NULL;
689
690         spin_lock_irq(&priv->lock);
691         while (!list_empty(&priv->cm.passive_ids)) {
692                 p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
693                 list_move(&p->list, &priv->cm.rx_error_list);
694                 p->state = IPOIB_CM_RX_ERROR;
695                 spin_unlock_irq(&priv->lock);
696                 ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
697                 if (ret)
698                         ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
699                 spin_lock_irq(&priv->lock);
700         }
701
702         /* Wait for all RX to be drained */
703         begin = jiffies;
704
705         while (!list_empty(&priv->cm.rx_error_list) ||
706                !list_empty(&priv->cm.rx_flush_list) ||
707                !list_empty(&priv->cm.rx_drain_list)) {
708                 if (time_after(jiffies, begin + 5 * HZ)) {
709                         ipoib_warn(priv, "RX drain timing out\n");
710
711                         /*
712                          * assume the HW is wedged and just free up everything.
713                          */
714                         list_splice_init(&priv->cm.rx_flush_list, &list);
715                         list_splice_init(&priv->cm.rx_error_list, &list);
716                         list_splice_init(&priv->cm.rx_drain_list, &list);
717                         break;
718                 }
719                 spin_unlock_irq(&priv->lock);
720                 msleep(1);
721                 ipoib_drain_cq(dev);
722                 spin_lock_irq(&priv->lock);
723         }
724
725         list_splice_init(&priv->cm.rx_reap_list, &list);
726
727         spin_unlock_irq(&priv->lock);
728
729         list_for_each_entry_safe(p, n, &list, list) {
730                 ib_destroy_cm_id(p->id);
731                 ib_destroy_qp(p->qp);
732                 kfree(p);
733         }
734
735         cancel_delayed_work(&priv->cm.stale_task);
736 }
737
738 static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
739 {
740         struct ipoib_cm_tx *p = cm_id->context;
741         struct ipoib_dev_priv *priv = netdev_priv(p->dev);
742         struct ipoib_cm_data *data = event->private_data;
743         struct sk_buff_head skqueue;
744         struct ib_qp_attr qp_attr;
745         int qp_attr_mask, ret;
746         struct sk_buff *skb;
747
748         p->mtu = be32_to_cpu(data->mtu);
749
750         if (p->mtu <= IPOIB_ENCAP_LEN) {
751                 ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
752                            p->mtu, IPOIB_ENCAP_LEN);
753                 return -EINVAL;
754         }
755
756         qp_attr.qp_state = IB_QPS_RTR;
757         ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
758         if (ret) {
759                 ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
760                 return ret;
761         }
762
763         qp_attr.rq_psn = 0 /* FIXME */;
764         ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
765         if (ret) {
766                 ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
767                 return ret;
768         }
769
770         qp_attr.qp_state = IB_QPS_RTS;
771         ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
772         if (ret) {
773                 ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
774                 return ret;
775         }
776         ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
777         if (ret) {
778                 ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
779                 return ret;
780         }
781
782         skb_queue_head_init(&skqueue);
783
784         spin_lock_irq(&priv->lock);
785         set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
786         if (p->neigh)
787                 while ((skb = __skb_dequeue(&p->neigh->queue)))
788                         __skb_queue_tail(&skqueue, skb);
789         spin_unlock_irq(&priv->lock);
790
791         while ((skb = __skb_dequeue(&skqueue))) {
792                 skb->dev = p->dev;
793                 if (dev_queue_xmit(skb))
794                         ipoib_warn(priv, "dev_queue_xmit failed "
795                                    "to requeue packet\n");
796         }
797
798         ret = ib_send_cm_rtu(cm_id, NULL, 0);
799         if (ret) {
800                 ipoib_warn(priv, "failed to send RTU: %d\n", ret);
801                 return ret;
802         }
803         return 0;
804 }
805
806 static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx)
807 {
808         struct ipoib_dev_priv *priv = netdev_priv(dev);
809         struct ib_qp_init_attr attr = {
810                 .send_cq                = priv->cq,
811                 .recv_cq                = priv->cq,
812                 .srq                    = priv->cm.srq,
813                 .cap.max_send_wr        = ipoib_sendq_size,
814                 .cap.max_send_sge       = 1,
815                 .sq_sig_type            = IB_SIGNAL_ALL_WR,
816                 .qp_type                = IB_QPT_RC,
817                 .qp_context             = tx
818         };
819
820         return ib_create_qp(priv->pd, &attr);
821 }
822
823 static int ipoib_cm_send_req(struct net_device *dev,
824                              struct ib_cm_id *id, struct ib_qp *qp,
825                              u32 qpn,
826                              struct ib_sa_path_rec *pathrec)
827 {
828         struct ipoib_dev_priv *priv = netdev_priv(dev);
829         struct ipoib_cm_data data = {};
830         struct ib_cm_req_param req = {};
831
832         data.qpn = cpu_to_be32(priv->qp->qp_num);
833         data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
834
835         req.primary_path                = pathrec;
836         req.alternate_path              = NULL;
837         req.service_id                  = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
838         req.qp_num                      = qp->qp_num;
839         req.qp_type                     = qp->qp_type;
840         req.private_data                = &data;
841         req.private_data_len            = sizeof data;
842         req.flow_control                = 0;
843
844         req.starting_psn                = 0; /* FIXME */
845
846         /*
847          * Pick some arbitrary defaults here; we could make these
848          * module parameters if anyone cared about setting them.
849          */
850         req.responder_resources         = 4;
851         req.remote_cm_response_timeout  = 20;
852         req.local_cm_response_timeout   = 20;
853         req.retry_count                 = 0; /* RFC draft warns against retries */
854         req.rnr_retry_count             = 0; /* RFC draft warns against retries */
855         req.max_cm_retries              = 15;
856         req.srq                         = 1;
857         return ib_send_cm_req(id, &req);
858 }
859
860 static int ipoib_cm_modify_tx_init(struct net_device *dev,
861                                   struct ib_cm_id *cm_id, struct ib_qp *qp)
862 {
863         struct ipoib_dev_priv *priv = netdev_priv(dev);
864         struct ib_qp_attr qp_attr;
865         int qp_attr_mask, ret;
866         ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
867         if (ret) {
868                 ipoib_warn(priv, "pkey 0x%x not in cache: %d\n", priv->pkey, ret);
869                 return ret;
870         }
871
872         qp_attr.qp_state = IB_QPS_INIT;
873         qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
874         qp_attr.port_num = priv->port;
875         qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
876
877         ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
878         if (ret) {
879                 ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
880                 return ret;
881         }
882         return 0;
883 }
884
885 static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
886                             struct ib_sa_path_rec *pathrec)
887 {
888         struct ipoib_dev_priv *priv = netdev_priv(p->dev);
889         int ret;
890
891         p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring,
892                                 GFP_KERNEL);
893         if (!p->tx_ring) {
894                 ipoib_warn(priv, "failed to allocate tx ring\n");
895                 ret = -ENOMEM;
896                 goto err_tx;
897         }
898
899         p->qp = ipoib_cm_create_tx_qp(p->dev, p);
900         if (IS_ERR(p->qp)) {
901                 ret = PTR_ERR(p->qp);
902                 ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
903                 goto err_qp;
904         }
905
906         p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
907         if (IS_ERR(p->id)) {
908                 ret = PTR_ERR(p->id);
909                 ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
910                 goto err_id;
911         }
912
913         ret = ipoib_cm_modify_tx_init(p->dev, p->id,  p->qp);
914         if (ret) {
915                 ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
916                 goto err_modify;
917         }
918
919         ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
920         if (ret) {
921                 ipoib_warn(priv, "failed to send cm req: %d\n", ret);
922                 goto err_send_cm;
923         }
924
925         ipoib_dbg(priv, "Request connection 0x%x for gid " IPOIB_GID_FMT " qpn 0x%x\n",
926                   p->qp->qp_num, IPOIB_GID_ARG(pathrec->dgid), qpn);
927
928         return 0;
929
930 err_send_cm:
931 err_modify:
932         ib_destroy_cm_id(p->id);
933 err_id:
934         p->id = NULL;
935         ib_destroy_qp(p->qp);
936 err_qp:
937         p->qp = NULL;
938 err_tx:
939         return ret;
940 }
941
942 static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
943 {
944         struct ipoib_dev_priv *priv = netdev_priv(p->dev);
945         struct ipoib_tx_buf *tx_req;
946         unsigned long flags;
947         unsigned long begin;
948
949         ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
950                   p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
951
952         if (p->id)
953                 ib_destroy_cm_id(p->id);
954
955         if (p->tx_ring) {
956                 /* Wait for all sends to complete */
957                 begin = jiffies;
958                 while ((int) p->tx_tail - (int) p->tx_head < 0) {
959                         if (time_after(jiffies, begin + 5 * HZ)) {
960                                 ipoib_warn(priv, "timing out; %d sends not completed\n",
961                                            p->tx_head - p->tx_tail);
962                                 goto timeout;
963                         }
964
965                         msleep(1);
966                 }
967         }
968
969 timeout:
970
971         while ((int) p->tx_tail - (int) p->tx_head < 0) {
972                 tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
973                 ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
974                                     DMA_TO_DEVICE);
975                 dev_kfree_skb_any(tx_req->skb);
976                 ++p->tx_tail;
977                 spin_lock_irqsave(&priv->tx_lock, flags);
978                 if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
979                     netif_queue_stopped(p->dev) &&
980                     test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
981                         netif_wake_queue(p->dev);
982                 spin_unlock_irqrestore(&priv->tx_lock, flags);
983         }
984
985         if (p->qp)
986                 ib_destroy_qp(p->qp);
987
988         kfree(p->tx_ring);
989         kfree(p);
990 }
991
992 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
993                                struct ib_cm_event *event)
994 {
995         struct ipoib_cm_tx *tx = cm_id->context;
996         struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
997         struct net_device *dev = priv->dev;
998         struct ipoib_neigh *neigh;
999         int ret;
1000
1001         switch (event->event) {
1002         case IB_CM_DREQ_RECEIVED:
1003                 ipoib_dbg(priv, "DREQ received.\n");
1004                 ib_send_cm_drep(cm_id, NULL, 0);
1005                 break;
1006         case IB_CM_REP_RECEIVED:
1007                 ipoib_dbg(priv, "REP received.\n");
1008                 ret = ipoib_cm_rep_handler(cm_id, event);
1009                 if (ret)
1010                         ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
1011                                        NULL, 0, NULL, 0);
1012                 break;
1013         case IB_CM_REQ_ERROR:
1014         case IB_CM_REJ_RECEIVED:
1015         case IB_CM_TIMEWAIT_EXIT:
1016                 ipoib_dbg(priv, "CM error %d.\n", event->event);
1017                 spin_lock_irq(&priv->tx_lock);
1018                 spin_lock(&priv->lock);
1019                 neigh = tx->neigh;
1020
1021                 if (neigh) {
1022                         neigh->cm = NULL;
1023                         list_del(&neigh->list);
1024                         if (neigh->ah)
1025                                 ipoib_put_ah(neigh->ah);
1026                         ipoib_neigh_free(dev, neigh);
1027
1028                         tx->neigh = NULL;
1029                 }
1030
1031                 if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
1032                         list_move(&tx->list, &priv->cm.reap_list);
1033                         queue_work(ipoib_workqueue, &priv->cm.reap_task);
1034                 }
1035
1036                 spin_unlock(&priv->lock);
1037                 spin_unlock_irq(&priv->tx_lock);
1038                 break;
1039         default:
1040                 break;
1041         }
1042
1043         return 0;
1044 }
1045
1046 struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
1047                                        struct ipoib_neigh *neigh)
1048 {
1049         struct ipoib_dev_priv *priv = netdev_priv(dev);
1050         struct ipoib_cm_tx *tx;
1051
1052         tx = kzalloc(sizeof *tx, GFP_ATOMIC);
1053         if (!tx)
1054                 return NULL;
1055
1056         neigh->cm = tx;
1057         tx->neigh = neigh;
1058         tx->path = path;
1059         tx->dev = dev;
1060         list_add(&tx->list, &priv->cm.start_list);
1061         set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
1062         queue_work(ipoib_workqueue, &priv->cm.start_task);
1063         return tx;
1064 }
1065
1066 void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
1067 {
1068         struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
1069         if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
1070                 list_move(&tx->list, &priv->cm.reap_list);
1071                 queue_work(ipoib_workqueue, &priv->cm.reap_task);
1072                 ipoib_dbg(priv, "Reap connection for gid " IPOIB_GID_FMT "\n",
1073                           IPOIB_GID_ARG(tx->neigh->dgid));
1074                 tx->neigh = NULL;
1075         }
1076 }
1077
1078 static void ipoib_cm_tx_start(struct work_struct *work)
1079 {
1080         struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1081                                                    cm.start_task);
1082         struct net_device *dev = priv->dev;
1083         struct ipoib_neigh *neigh;
1084         struct ipoib_cm_tx *p;
1085         unsigned long flags;
1086         int ret;
1087
1088         struct ib_sa_path_rec pathrec;
1089         u32 qpn;
1090
1091         spin_lock_irqsave(&priv->tx_lock, flags);
1092         spin_lock(&priv->lock);
1093         while (!list_empty(&priv->cm.start_list)) {
1094                 p = list_entry(priv->cm.start_list.next, typeof(*p), list);
1095                 list_del_init(&p->list);
1096                 neigh = p->neigh;
1097                 qpn = IPOIB_QPN(neigh->neighbour->ha);
1098                 memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
1099                 spin_unlock(&priv->lock);
1100                 spin_unlock_irqrestore(&priv->tx_lock, flags);
1101                 ret = ipoib_cm_tx_init(p, qpn, &pathrec);
1102                 spin_lock_irqsave(&priv->tx_lock, flags);
1103                 spin_lock(&priv->lock);
1104                 if (ret) {
1105                         neigh = p->neigh;
1106                         if (neigh) {
1107                                 neigh->cm = NULL;
1108                                 list_del(&neigh->list);
1109                                 if (neigh->ah)
1110                                         ipoib_put_ah(neigh->ah);
1111                                 ipoib_neigh_free(dev, neigh);
1112                         }
1113                         list_del(&p->list);
1114                         kfree(p);
1115                 }
1116         }
1117         spin_unlock(&priv->lock);
1118         spin_unlock_irqrestore(&priv->tx_lock, flags);
1119 }
1120
1121 static void ipoib_cm_tx_reap(struct work_struct *work)
1122 {
1123         struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1124                                                    cm.reap_task);
1125         struct ipoib_cm_tx *p;
1126
1127         spin_lock_irq(&priv->tx_lock);
1128         spin_lock(&priv->lock);
1129         while (!list_empty(&priv->cm.reap_list)) {
1130                 p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
1131                 list_del(&p->list);
1132                 spin_unlock(&priv->lock);
1133                 spin_unlock_irq(&priv->tx_lock);
1134                 ipoib_cm_tx_destroy(p);
1135                 spin_lock_irq(&priv->tx_lock);
1136                 spin_lock(&priv->lock);
1137         }
1138         spin_unlock(&priv->lock);
1139         spin_unlock_irq(&priv->tx_lock);
1140 }
1141
1142 static void ipoib_cm_skb_reap(struct work_struct *work)
1143 {
1144         struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1145                                                    cm.skb_task);
1146         struct sk_buff *skb;
1147
1148         unsigned mtu = priv->mcast_mtu;
1149
1150         spin_lock_irq(&priv->tx_lock);
1151         spin_lock(&priv->lock);
1152         while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
1153                 spin_unlock(&priv->lock);
1154                 spin_unlock_irq(&priv->tx_lock);
1155                 if (skb->protocol == htons(ETH_P_IP))
1156                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
1157 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1158                 else if (skb->protocol == htons(ETH_P_IPV6))
1159                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, priv->dev);
1160 #endif
1161                 dev_kfree_skb_any(skb);
1162                 spin_lock_irq(&priv->tx_lock);
1163                 spin_lock(&priv->lock);
1164         }
1165         spin_unlock(&priv->lock);
1166         spin_unlock_irq(&priv->tx_lock);
1167 }
1168
1169 void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
1170                            unsigned int mtu)
1171 {
1172         struct ipoib_dev_priv *priv = netdev_priv(dev);
1173         int e = skb_queue_empty(&priv->cm.skb_queue);
1174
1175         if (skb->dst)
1176                 skb->dst->ops->update_pmtu(skb->dst, mtu);
1177
1178         skb_queue_tail(&priv->cm.skb_queue, skb);
1179         if (e)
1180                 queue_work(ipoib_workqueue, &priv->cm.skb_task);
1181 }
1182
1183 static void ipoib_cm_rx_reap(struct work_struct *work)
1184 {
1185         struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1186                                                    cm.rx_reap_task);
1187         struct ipoib_cm_rx *p, *n;
1188         LIST_HEAD(list);
1189
1190         spin_lock_irq(&priv->lock);
1191         list_splice_init(&priv->cm.rx_reap_list, &list);
1192         spin_unlock_irq(&priv->lock);
1193
1194         list_for_each_entry_safe(p, n, &list, list) {
1195                 ib_destroy_cm_id(p->id);
1196                 ib_destroy_qp(p->qp);
1197                 kfree(p);
1198         }
1199 }
1200
1201 static void ipoib_cm_stale_task(struct work_struct *work)
1202 {
1203         struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1204                                                    cm.stale_task.work);
1205         struct ipoib_cm_rx *p;
1206         int ret;
1207
1208         spin_lock_irq(&priv->lock);
1209         while (!list_empty(&priv->cm.passive_ids)) {
1210                 /* List is sorted by LRU, start from tail,
1211                  * stop when we see a recently used entry */
1212                 p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
1213                 if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
1214                         break;
1215                 list_move(&p->list, &priv->cm.rx_error_list);
1216                 p->state = IPOIB_CM_RX_ERROR;
1217                 spin_unlock_irq(&priv->lock);
1218                 ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
1219                 if (ret)
1220                         ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
1221                 spin_lock_irq(&priv->lock);
1222         }
1223
1224         if (!list_empty(&priv->cm.passive_ids))
1225                 queue_delayed_work(ipoib_workqueue,
1226                                    &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
1227         spin_unlock_irq(&priv->lock);
1228 }
1229
1230
1231 static ssize_t show_mode(struct device *d, struct device_attribute *attr,
1232                          char *buf)
1233 {
1234         struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d));
1235
1236         if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
1237                 return sprintf(buf, "connected\n");
1238         else
1239                 return sprintf(buf, "datagram\n");
1240 }
1241
1242 static ssize_t set_mode(struct device *d, struct device_attribute *attr,
1243                         const char *buf, size_t count)
1244 {
1245         struct net_device *dev = to_net_dev(d);
1246         struct ipoib_dev_priv *priv = netdev_priv(dev);
1247
1248         /* flush paths if we switch modes so that connections are restarted */
1249         if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
1250                 set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
1251                 ipoib_warn(priv, "enabling connected mode "
1252                            "will cause multicast packet drops\n");
1253                 ipoib_flush_paths(dev);
1254                 return count;
1255         }
1256
1257         if (!strcmp(buf, "datagram\n")) {
1258                 clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
1259                 dev->mtu = min(priv->mcast_mtu, dev->mtu);
1260                 ipoib_flush_paths(dev);
1261                 return count;
1262         }
1263
1264         return -EINVAL;
1265 }
1266
1267 static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode);
1268
1269 int ipoib_cm_add_mode_attr(struct net_device *dev)
1270 {
1271         return device_create_file(&dev->dev, &dev_attr_mode);
1272 }
1273
1274 int ipoib_cm_dev_init(struct net_device *dev)
1275 {
1276         struct ipoib_dev_priv *priv = netdev_priv(dev);
1277         struct ib_srq_init_attr srq_init_attr = {
1278                 .attr = {
1279                         .max_wr  = ipoib_recvq_size,
1280                         .max_sge = IPOIB_CM_RX_SG
1281                 }
1282         };
1283         int ret, i;
1284
1285         INIT_LIST_HEAD(&priv->cm.passive_ids);
1286         INIT_LIST_HEAD(&priv->cm.reap_list);
1287         INIT_LIST_HEAD(&priv->cm.start_list);
1288         INIT_LIST_HEAD(&priv->cm.rx_error_list);
1289         INIT_LIST_HEAD(&priv->cm.rx_flush_list);
1290         INIT_LIST_HEAD(&priv->cm.rx_drain_list);
1291         INIT_LIST_HEAD(&priv->cm.rx_reap_list);
1292         INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
1293         INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
1294         INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
1295         INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
1296         INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
1297
1298         skb_queue_head_init(&priv->cm.skb_queue);
1299
1300         priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
1301         if (IS_ERR(priv->cm.srq)) {
1302                 ret = PTR_ERR(priv->cm.srq);
1303                 priv->cm.srq = NULL;
1304                 return ret;
1305         }
1306
1307         priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring,
1308                                     GFP_KERNEL);
1309         if (!priv->cm.srq_ring) {
1310                 printk(KERN_WARNING "%s: failed to allocate CM ring (%d entries)\n",
1311                        priv->ca->name, ipoib_recvq_size);
1312                 ipoib_cm_dev_cleanup(dev);
1313                 return -ENOMEM;
1314         }
1315
1316         for (i = 0; i < IPOIB_CM_RX_SG; ++i)
1317                 priv->cm.rx_sge[i].lkey = priv->mr->lkey;
1318
1319         priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE;
1320         for (i = 1; i < IPOIB_CM_RX_SG; ++i)
1321                 priv->cm.rx_sge[i].length = PAGE_SIZE;
1322         priv->cm.rx_wr.next = NULL;
1323         priv->cm.rx_wr.sg_list = priv->cm.rx_sge;
1324         priv->cm.rx_wr.num_sge = IPOIB_CM_RX_SG;
1325
1326         for (i = 0; i < ipoib_recvq_size; ++i) {
1327                 if (!ipoib_cm_alloc_rx_skb(dev, i, IPOIB_CM_RX_SG - 1,
1328                                            priv->cm.srq_ring[i].mapping)) {
1329                         ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
1330                         ipoib_cm_dev_cleanup(dev);
1331                         return -ENOMEM;
1332                 }
1333                 if (ipoib_cm_post_receive(dev, i)) {
1334                         ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
1335                         ipoib_cm_dev_cleanup(dev);
1336                         return -EIO;
1337                 }
1338         }
1339
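        /* Advertise connected-mode (RC) support in the flags byte of the
         * hardware address, so IPOIB_CM_SUPPORTED() tests on it succeed. */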
1340         priv->dev->dev_addr[0] = IPOIB_FLAGS_RC;
1341         return 0;
1342 }
1343
1344 void ipoib_cm_dev_cleanup(struct net_device *dev)
1345 {
1346         struct ipoib_dev_priv *priv = netdev_priv(dev);
1347         int ret;
1348
1349         if (!priv->cm.srq)
1350                 return;
1351
1352         ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
1353
1354         ret = ib_destroy_srq(priv->cm.srq);
1355         if (ret)
1356                 ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);
1357
1358         priv->cm.srq = NULL;
1359         if (!priv->cm.srq_ring)
1360                 return;
1361
1362         ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring);
1363         priv->cm.srq_ring = NULL;
1364 }