[linux-2.6-omap-h63xx.git] net/sunrpc/xprtrdma/verbs.c
1 /*
2  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the BSD-type
8  * license below:
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  *      Redistributions of source code must retain the above copyright
15  *      notice, this list of conditions and the following disclaimer.
16  *
17  *      Redistributions in binary form must reproduce the above
18  *      copyright notice, this list of conditions and the following
19  *      disclaimer in the documentation and/or other materials provided
20  *      with the distribution.
21  *
22  *      Neither the name of the Network Appliance, Inc. nor the names of
23  *      its contributors may be used to endorse or promote products
24  *      derived from this software without specific prior written
25  *      permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38  */
39
40 /*
41  * verbs.c
42  *
43  * Encapsulates the major functions managing:
44  *  o adapters
45  *  o endpoints
46  *  o connections
47  *  o buffer memory
48  */
49
50 #include <linux/pci.h>  /* for Tavor hack below */
51
52 #include "xprt_rdma.h"
53
54 /*
55  * Globals/Macros
56  */
57
58 #ifdef RPC_DEBUG
59 # define RPCDBG_FACILITY        RPCDBG_TRANS
60 #endif
61
62 /*
63  * internal functions
64  */
65
66 /*
67  * Handle replies in tasklet context, using a single, global list.
68  * The rdma tasklet function simply walks the list, calling the
69  * registered reply handler for each reply (or returning the buffer).
70  */
71
72 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
73 static LIST_HEAD(rpcrdma_tasklets_g);
74
75 static void
76 rpcrdma_run_tasklet(unsigned long data)
77 {
78         struct rpcrdma_rep *rep;
79         void (*func)(struct rpcrdma_rep *);
80         unsigned long flags;
81
82         data = data;    /* tasklet data argument is unused */
83         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
84         while (!list_empty(&rpcrdma_tasklets_g)) {
85                 rep = list_entry(rpcrdma_tasklets_g.next,
86                                  struct rpcrdma_rep, rr_list);
87                 list_del(&rep->rr_list);
88                 func = rep->rr_func;
89                 rep->rr_func = NULL;
90                 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
91
92                 if (func)
93                         func(rep);
94                 else
95                         rpcrdma_recv_buffer_put(rep);
96
97                 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
98         }
99         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
100 }
101
102 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
103
104 static inline void
105 rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
106 {
107         unsigned long flags;
108
109         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
110         list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
111         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
112         tasklet_schedule(&rpcrdma_tasklet_g);
113 }
114
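/*
 * Asynchronous QP/CQ error upcalls: such errors are fatal to the
 * connection, so mark the transport disconnected and wake any waiters.
 */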
115 static void
116 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
117 {
118         struct rpcrdma_ep *ep = context;
119
120         dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
121                 __func__, event->event, event->device->name, context);
122         if (ep->rep_connected == 1) {
123                 ep->rep_connected = -EIO;
124                 ep->rep_func(ep);
125                 wake_up_all(&ep->rep_connect_wait);
126         }
127 }
128
129 static void
130 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
131 {
132         struct rpcrdma_ep *ep = context;
133
134         dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
135                 __func__, event->event, event->device->name, context);
136         if (ep->rep_connected == 1) {
137                 ep->rep_connected = -EIO;
138                 ep->rep_func(ep);
139                 wake_up_all(&ep->rep_connect_wait);
140         }
141 }
142
143 static inline
144 void rpcrdma_event_process(struct ib_wc *wc)
145 {
146         struct rpcrdma_rep *rep =
147                         (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
148
149         dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
150                 __func__, rep, wc->status, wc->opcode, wc->byte_len);
151
152         if (!rep) /* send or bind completion that we don't care about */
153                 return;
154
155         if (IB_WC_SUCCESS != wc->status) {
156                 dprintk("RPC:       %s: %s WC status %X, connection lost\n",
157                         __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
158                          wc->status);
159                 rep->rr_len = ~0U;      /* mark the reply invalid */
160                 rpcrdma_schedule_tasklet(rep);
161                 return;
162         }
163
164         switch (wc->opcode) {
165         case IB_WC_RECV:
166                 rep->rr_len = wc->byte_len;
167                 ib_dma_sync_single_for_cpu(
168                         rdmab_to_ia(rep->rr_buffer)->ri_id->device,
169                         rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
170                 /* Keep (only) the most recent credit count, after checking validity */
171                 if (rep->rr_len >= 16) {
172                         struct rpcrdma_msg *p =
173                                         (struct rpcrdma_msg *) rep->rr_base;
174                         unsigned int credits = ntohl(p->rm_credit);
175                         if (credits == 0) {
176                                 dprintk("RPC:       %s: server"
177                                         " dropped credits to 0!\n", __func__);
178                                 /* don't deadlock */
179                                 credits = 1;
180                         } else if (credits > rep->rr_buffer->rb_max_requests) {
181                                 dprintk("RPC:       %s: server"
182                                         " over-crediting: %d (%d)\n",
183                                         __func__, credits,
184                                         rep->rr_buffer->rb_max_requests);
185                                 credits = rep->rr_buffer->rb_max_requests;
186                         }
187                         atomic_set(&rep->rr_buffer->rb_credits, credits);
188                 }
189                 /* fall through */
190         case IB_WC_BIND_MW:
191                 rpcrdma_schedule_tasklet(rep);
192                 break;
193         default:
194                 dprintk("RPC:       %s: unexpected WC event %X\n",
195                         __func__, wc->opcode);
196                 break;
197         }
198 }
199
200 static inline int
201 rpcrdma_cq_poll(struct ib_cq *cq)
202 {
203         struct ib_wc wc;
204         int rc;
205
206         for (;;) {
207                 rc = ib_poll_cq(cq, 1, &wc);
208                 if (rc < 0) {
209                         dprintk("RPC:       %s: ib_poll_cq failed %i\n",
210                                 __func__, rc);
211                         return rc;
212                 }
213                 if (rc == 0)
214                         break;
215
216                 rpcrdma_event_process(&wc);
217         }
218
219         return 0;
220 }
221
222 /*
223  * rpcrdma_cq_event_upcall
224  *
225  * This upcall handles recv, send, bind and unbind events.
226  * It is reentrant but processes single events in order to maintain
227  * the ordering of receives, which keeps server credit accounting correct.
228  *
229  * It is the responsibility of the scheduled tasklet to return
230  * recv buffers to the pool. NOTE: this affects synchronization of
231  * connection shutdown. That is, the structures required for
232  * the completion of the reply handler must remain intact until
233  * all memory has been reclaimed.
234  *
235  * Note that send events are suppressed and do not result in an upcall.
236  */
237 static void
238 rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
239 {
240         int rc;
241
242         rc = rpcrdma_cq_poll(cq);
243         if (rc)
244                 return;
245
246         rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
247         if (rc) {
248                 dprintk("RPC:       %s: ib_req_notify_cq failed %i\n",
249                         __func__, rc);
250                 return;
251         }
252
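        /*
         * Poll once more after re-arming the CQ: this closes the race
         * where a completion arrives between the first poll and the
         * re-arm, which would otherwise not generate another upcall.
         */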
253         rpcrdma_cq_poll(cq);
254 }
255
256 #ifdef RPC_DEBUG
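/* Human-readable names for the RDMA_CM_EVENT_* values 0..11, debug only */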
257 static const char * const conn[] = {
258         "address resolved",
259         "address error",
260         "route resolved",
261         "route error",
262         "connect request",
263         "connect response",
264         "connect error",
265         "unreachable",
266         "rejected",
267         "established",
268         "disconnected",
269         "device removal"
270 };
271 #endif
272
273 static int
274 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
275 {
276         struct rpcrdma_xprt *xprt = id->context;
277         struct rpcrdma_ia *ia = &xprt->rx_ia;
278         struct rpcrdma_ep *ep = &xprt->rx_ep;
279         struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
280         struct ib_qp_attr attr;
281         struct ib_qp_init_attr iattr;
282         int connstate = 0;
283
284         switch (event->event) {
285         case RDMA_CM_EVENT_ADDR_RESOLVED:
286         case RDMA_CM_EVENT_ROUTE_RESOLVED:
287                 ia->ri_async_rc = 0;
288                 complete(&ia->ri_done);
289                 break;
290         case RDMA_CM_EVENT_ADDR_ERROR:
291                 ia->ri_async_rc = -EHOSTUNREACH;
292                 dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
293                         __func__, ep);
294                 complete(&ia->ri_done);
295                 break;
296         case RDMA_CM_EVENT_ROUTE_ERROR:
297                 ia->ri_async_rc = -ENETUNREACH;
298                 dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
299                         __func__, ep);
300                 complete(&ia->ri_done);
301                 break;
302         case RDMA_CM_EVENT_ESTABLISHED:
303                 connstate = 1;
304                 ib_query_qp(ia->ri_id->qp, &attr,
305                         IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
306                         &iattr);
307                 dprintk("RPC:       %s: %d responder resources"
308                         " (%d initiator)\n",
309                         __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
310                 goto connected;
311         case RDMA_CM_EVENT_CONNECT_ERROR:
312                 connstate = -ENOTCONN;
313                 goto connected;
314         case RDMA_CM_EVENT_UNREACHABLE:
315                 connstate = -ENETDOWN;
316                 goto connected;
317         case RDMA_CM_EVENT_REJECTED:
318                 connstate = -ECONNREFUSED;
319                 goto connected;
320         case RDMA_CM_EVENT_DISCONNECTED:
321                 connstate = -ECONNABORTED;
322                 goto connected;
323         case RDMA_CM_EVENT_DEVICE_REMOVAL:
324                 connstate = -ENODEV;
325 connected:
326                 dprintk("RPC:       %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
327                         __func__,
328                         (event->event <= 11) ? conn[event->event] :
329                                                 "unknown connection error",
330                         &addr->sin_addr.s_addr,
331                         ntohs(addr->sin_port),
332                         ep, event->event);
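                /*
                 * Reset to a single credit on any connection state change;
                 * the next reply from the server re-advertises its actual
                 * credit limit (see rpcrdma_event_process above).
                 */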
333                 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
334                 dprintk("RPC:       %s: %sconnected\n",
335                                         __func__, connstate > 0 ? "" : "dis");
336                 ep->rep_connected = connstate;
337                 ep->rep_func(ep);
338                 wake_up_all(&ep->rep_connect_wait);
339                 break;
340         default:
341                 dprintk("RPC:       %s: unexpected CM event %d\n",
342                         __func__, event->event);
343                 break;
344         }
345
346 #ifdef RPC_DEBUG
347         if (connstate == 1) {
348                 int ird = attr.max_dest_rd_atomic;
349                 int tird = ep->rep_remote_cma.responder_resources;
350                 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
351                         "on %s, memreg %d slots %d ird %d%s\n",
352                         &addr->sin_addr.s_addr,
353                         ntohs(addr->sin_port),
354                         ia->ri_id->device->name,
355                         ia->ri_memreg_strategy,
356                         xprt->rx_buf.rb_max_requests,
357                         ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
358         } else if (connstate < 0) {
359                 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
360                         &addr->sin_addr.s_addr,
361                         ntohs(addr->sin_port),
362                         connstate);
363         }
364 #endif
365
366         return 0;
367 }
368
369 static struct rdma_cm_id *
370 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
371                         struct rpcrdma_ia *ia, struct sockaddr *addr)
372 {
373         struct rdma_cm_id *id;
374         int rc;
375
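        /*
         * Address and route resolution are asynchronous: rpcrdma_conn_upcall()
         * records the result in ri_async_rc and completes ri_done. The
         * explicit timeouts below guard against a lost or delayed upcall.
         */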
376         init_completion(&ia->ri_done);
377
378         id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
379         if (IS_ERR(id)) {
380                 rc = PTR_ERR(id);
381                 dprintk("RPC:       %s: rdma_create_id() failed %i\n",
382                         __func__, rc);
383                 return id;
384         }
385
386         ia->ri_async_rc = -ETIMEDOUT;
387         rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
388         if (rc) {
389                 dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
390                         __func__, rc);
391                 goto out;
392         }
393         wait_for_completion_interruptible_timeout(&ia->ri_done,
394                                 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
395         rc = ia->ri_async_rc;
396         if (rc)
397                 goto out;
398
399         ia->ri_async_rc = -ETIMEDOUT;
400         rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
401         if (rc) {
402                 dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
403                         __func__, rc);
404                 goto out;
405         }
406         wait_for_completion_interruptible_timeout(&ia->ri_done,
407                                 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
408         rc = ia->ri_async_rc;
409         if (rc)
410                 goto out;
411
412         return id;
413
414 out:
415         rdma_destroy_id(id);
416         return ERR_PTR(rc);
417 }
418
419 /*
420  * Drain any cq, prior to teardown.
421  */
422 static void
423 rpcrdma_clean_cq(struct ib_cq *cq)
424 {
425         struct ib_wc wc;
426         int count = 0;
427
428         while (1 == ib_poll_cq(cq, 1, &wc))
429                 ++count;
430
431         if (count)
432                 dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
433                         __func__, count, wc.opcode);
434 }
435
436 /*
437  * Exported functions.
438  */
439
440 /*
441  * Open and initialize an Interface Adapter.
442  *  o initializes fields of struct rpcrdma_ia, including
443  *    interface and provider attributes and protection zone.
444  */
445 int
446 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
447 {
448         int rc, mem_priv;
449         struct ib_device_attr devattr;
450         struct rpcrdma_ia *ia = &xprt->rx_ia;
451
452         ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
453         if (IS_ERR(ia->ri_id)) {
454                 rc = PTR_ERR(ia->ri_id);
455                 goto out1;
456         }
457
458         ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
459         if (IS_ERR(ia->ri_pd)) {
460                 rc = PTR_ERR(ia->ri_pd);
461                 dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
462                         __func__, rc);
463                 goto out2;
464         }
465
466         /*
467          * Query the device to determine if the requested memory
468          * registration strategy is supported. If it isn't, set the
469          * strategy to a globally supported model.
470          */
471         rc = ib_query_device(ia->ri_id->device, &devattr);
472         if (rc) {
473                 dprintk("RPC:       %s: ib_query_device failed %d\n",
474                         __func__, rc);
475                 goto out2;
476         }
477
478         if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
479                 ia->ri_have_dma_lkey = 1;
480                 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
481         }
482
483         switch (memreg) {
484         case RPCRDMA_MEMWINDOWS:
485         case RPCRDMA_MEMWINDOWS_ASYNC:
486                 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
487                         dprintk("RPC:       %s: MEMWINDOWS registration "
488                                 "specified but not supported by adapter, "
489                                 "using slower RPCRDMA_REGISTER\n",
490                                 __func__);
491                         memreg = RPCRDMA_REGISTER;
492                 }
493                 break;
494         case RPCRDMA_MTHCAFMR:
495                 if (!ia->ri_id->device->alloc_fmr) {
496 #if RPCRDMA_PERSISTENT_REGISTRATION
497                         dprintk("RPC:       %s: MTHCAFMR registration "
498                                 "specified but not supported by adapter, "
499                                 "using riskier RPCRDMA_ALLPHYSICAL\n",
500                                 __func__);
501                         memreg = RPCRDMA_ALLPHYSICAL;
502 #else
503                         dprintk("RPC:       %s: MTHCAFMR registration "
504                                 "specified but not supported by adapter, "
505                                 "using slower RPCRDMA_REGISTER\n",
506                                 __func__);
507                         memreg = RPCRDMA_REGISTER;
508 #endif
509                 }
510                 break;
511         case RPCRDMA_FRMR:
512                 /* Requires both frmr reg and local dma lkey */
513                 if ((devattr.device_cap_flags &
514                      (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
515                     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
516 #if RPCRDMA_PERSISTENT_REGISTRATION
517                         dprintk("RPC:       %s: FRMR registration "
518                                 "specified but not supported by adapter, "
519                                 "using riskier RPCRDMA_ALLPHYSICAL\n",
520                                 __func__);
521                         memreg = RPCRDMA_ALLPHYSICAL;
522 #else
523                         dprintk("RPC:       %s: FRMR registration "
524                                 "specified but not supported by adapter, "
525                                 "using slower RPCRDMA_REGISTER\n",
526                                 __func__);
527                         memreg = RPCRDMA_REGISTER;
528 #endif
529                 }
530                 break;
531         }
532
533         /*
534          * Optionally obtain an underlying physical identity mapping in
535          * order to do a memory window-based bind. This base registration
536          * is protected from remote access - that is enabled only by binding
537          * for the specific bytes targeted during each RPC operation, and
538          * revoked after the corresponding completion similar to a storage
539          * adapter.
540          */
541         switch (memreg) {
542         case RPCRDMA_BOUNCEBUFFERS:
543         case RPCRDMA_REGISTER:
544         case RPCRDMA_FRMR:
545                 break;
546 #if RPCRDMA_PERSISTENT_REGISTRATION
547         case RPCRDMA_ALLPHYSICAL:
548                 mem_priv = IB_ACCESS_LOCAL_WRITE |
549                                 IB_ACCESS_REMOTE_WRITE |
550                                 IB_ACCESS_REMOTE_READ;
551                 goto register_setup;
552 #endif
553         case RPCRDMA_MEMWINDOWS_ASYNC:
554         case RPCRDMA_MEMWINDOWS:
555                 mem_priv = IB_ACCESS_LOCAL_WRITE |
556                                 IB_ACCESS_MW_BIND;
557                 goto register_setup;
558         case RPCRDMA_MTHCAFMR:
559                 if (ia->ri_have_dma_lkey)
560                         break;
561                 mem_priv = IB_ACCESS_LOCAL_WRITE;
562         register_setup:
563                 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
564                 if (IS_ERR(ia->ri_bind_mem)) {
565                         printk(KERN_ALERT "%s: ib_get_dma_mr for "
566                                 "phys register failed with %lX\n\t"
567                                 "Will continue with degraded performance\n",
568                                 __func__, PTR_ERR(ia->ri_bind_mem));
569                         memreg = RPCRDMA_REGISTER;
570                         ia->ri_bind_mem = NULL;
571                 }
572                 break;
573         default:
574                 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
575                                 __func__, memreg);
576                 rc = -EINVAL;
577                 goto out2;
578         }
579         dprintk("RPC:       %s: memory registration strategy is %d\n",
580                 __func__, memreg);
581
582         /* Otherwise, memory will be registered/deregistered per chunk */
583         ia->ri_memreg_strategy = memreg;
584
585         return 0;
586 out2:
587         rdma_destroy_id(ia->ri_id);
588         ia->ri_id = NULL;
589 out1:
590         return rc;
591 }
592
593 /*
594  * Clean up/close an IA.
595  *   o if event handles and PD have been initialized, free them.
596  *   o close the IA
597  */
598 void
599 rpcrdma_ia_close(struct rpcrdma_ia *ia)
600 {
601         int rc;
602
603         dprintk("RPC:       %s: entering\n", __func__);
604         if (ia->ri_bind_mem != NULL) {
605                 rc = ib_dereg_mr(ia->ri_bind_mem);
606                 dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
607                         __func__, rc);
608         }
609         if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
610                 if (ia->ri_id->qp)
611                         rdma_destroy_qp(ia->ri_id);
612                 rdma_destroy_id(ia->ri_id);
613                 ia->ri_id = NULL;
614         }
615         if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
616                 rc = ib_dealloc_pd(ia->ri_pd);
617                 dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
618                         __func__, rc);
619         }
620 }
621
622 /*
623  * Create unconnected endpoint.
624  */
625 int
626 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
627                                 struct rpcrdma_create_data_internal *cdata)
628 {
629         struct ib_device_attr devattr;
630         int rc, err;
631
632         rc = ib_query_device(ia->ri_id->device, &devattr);
633         if (rc) {
634                 dprintk("RPC:       %s: ib_query_device failed %d\n",
635                         __func__, rc);
636                 return rc;
637         }
638
639         /* check provider's send/recv wr limits */
640         if (cdata->max_requests > devattr.max_qp_wr)
641                 cdata->max_requests = devattr.max_qp_wr;
642
643         ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
644         ep->rep_attr.qp_context = ep;
645         /* send_cq and recv_cq initialized below */
646         ep->rep_attr.srq = NULL;
647         ep->rep_attr.cap.max_send_wr = cdata->max_requests;
648         switch (ia->ri_memreg_strategy) {
649         case RPCRDMA_FRMR:
650                 /* Add room for frmr register and invalidate WRs */
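                /* (roughly one fast-register and one invalidate per send, hence the factor of three) */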
651                 ep->rep_attr.cap.max_send_wr *= 3;
652                 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
653                         return -EINVAL;
654                 break;
655         case RPCRDMA_MEMWINDOWS_ASYNC:
656         case RPCRDMA_MEMWINDOWS:
657                 /* Add room for mw_binds+unbinds - overkill! */
658                 ep->rep_attr.cap.max_send_wr++;
659                 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
660                 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
661                         return -EINVAL;
662                 break;
663         default:
664                 break;
665         }
666         ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
667         ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
668         ep->rep_attr.cap.max_recv_sge = 1;
669         ep->rep_attr.cap.max_inline_data = 0;
670         ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
671         ep->rep_attr.qp_type = IB_QPT_RC;
672         ep->rep_attr.port_num = ~0;
673
674         dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
675                 "iovs: send %d recv %d\n",
676                 __func__,
677                 ep->rep_attr.cap.max_send_wr,
678                 ep->rep_attr.cap.max_recv_wr,
679                 ep->rep_attr.cap.max_send_sge,
680                 ep->rep_attr.cap.max_recv_sge);
681
682         /* set trigger for requesting send completion */
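        /*
         * Most sends are posted unsignaled (IB_SIGNAL_REQ_WR below); roughly
         * one in every rep_cqinit sends requests a completion, so the send
         * queue is reaped without taking an interrupt per WR.
         */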
683         ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
684         switch (ia->ri_memreg_strategy) {
685         case RPCRDMA_MEMWINDOWS_ASYNC:
686         case RPCRDMA_MEMWINDOWS:
687                 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
688                 break;
689         default:
690                 break;
691         }
692         if (ep->rep_cqinit <= 2)
693                 ep->rep_cqinit = 0;
694         INIT_CQCOUNT(ep);
695         ep->rep_ia = ia;
696         init_waitqueue_head(&ep->rep_connect_wait);
697
698         /*
699          * Create a single cq for receive dto and mw_bind (only ever
700          * care about unbind, really). Send completions are suppressed.
701          * Use single threaded tasklet upcalls to maintain ordering.
702          */
703         ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
704                                   rpcrdma_cq_async_error_upcall, NULL,
705                                   ep->rep_attr.cap.max_recv_wr +
706                                   ep->rep_attr.cap.max_send_wr + 1, 0);
707         if (IS_ERR(ep->rep_cq)) {
708                 rc = PTR_ERR(ep->rep_cq);
709                 dprintk("RPC:       %s: ib_create_cq failed: %i\n",
710                         __func__, rc);
711                 goto out1;
712         }
713
714         rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
715         if (rc) {
716                 dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
717                         __func__, rc);
718                 goto out2;
719         }
720
721         ep->rep_attr.send_cq = ep->rep_cq;
722         ep->rep_attr.recv_cq = ep->rep_cq;
723
724         /* Initialize cma parameters */
725
726         /* RPC/RDMA does not use private data */
727         ep->rep_remote_cma.private_data = NULL;
728         ep->rep_remote_cma.private_data_len = 0;
729
730         /* Client offers RDMA Read but does not initiate */
731         ep->rep_remote_cma.initiator_depth = 0;
732         if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
733                 ep->rep_remote_cma.responder_resources = 0;
734         else if (devattr.max_qp_rd_atom > 32)   /* arbitrary but <= 255 */
735                 ep->rep_remote_cma.responder_resources = 32;
736         else
737                 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
738
739         ep->rep_remote_cma.retry_count = 7;
740         ep->rep_remote_cma.flow_control = 0;
741         ep->rep_remote_cma.rnr_retry_count = 0;
742
743         return 0;
744
745 out2:
746         err = ib_destroy_cq(ep->rep_cq);
747         if (err)
748                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
749                         __func__, err);
750 out1:
751         return rc;
752 }
753
754 /*
755  * rpcrdma_ep_destroy
756  *
757  * Disconnect and destroy endpoint. After this, the only
758  * valid operations on the ep are to free it (if dynamically
759  * allocated) or re-create it.
760  *
761  * The caller's error handling must be sure to not leak the endpoint
762  * if this function fails.
763  */
764 int
765 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
766 {
767         int rc;
768
769         dprintk("RPC:       %s: entering, connected is %d\n",
770                 __func__, ep->rep_connected);
771
772         if (ia->ri_id->qp) {
773                 rc = rpcrdma_ep_disconnect(ep, ia);
774                 if (rc)
775                         dprintk("RPC:       %s: rpcrdma_ep_disconnect"
776                                 " returned %i\n", __func__, rc);
777                 rdma_destroy_qp(ia->ri_id);
778                 ia->ri_id->qp = NULL;
779         }
780
781         /* padding - could be done in rpcrdma_buffer_destroy... */
782         if (ep->rep_pad_mr) {
783                 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
784                 ep->rep_pad_mr = NULL;
785         }
786
787         rpcrdma_clean_cq(ep->rep_cq);
788         rc = ib_destroy_cq(ep->rep_cq);
789         if (rc)
790                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
791                         __func__, rc);
792
793         return rc;
794 }
795
796 /*
797  * Connect unconnected endpoint.
798  */
799 int
800 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
801 {
802         struct rdma_cm_id *id;
803         int rc = 0;
804         int retry_count = 0;
805
806         if (ep->rep_connected != 0) {
807                 struct rpcrdma_xprt *xprt;
808 retry:
809                 rc = rpcrdma_ep_disconnect(ep, ia);
810                 if (rc && rc != -ENOTCONN)
811                         dprintk("RPC:       %s: rpcrdma_ep_disconnect"
812                                 " status %i\n", __func__, rc);
813                 rpcrdma_clean_cq(ep->rep_cq);
814
815                 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
816                 id = rpcrdma_create_id(xprt, ia,
817                                 (struct sockaddr *)&xprt->rx_data.addr);
818                 if (IS_ERR(id)) {
819                         rc = PTR_ERR(id);
820                         goto out;
821                 }
822                 /* TEMP TEMP TEMP - fail if new device:
823                  * Deregister/remarshal *all* requests!
824                  * Close and recreate adapter, pd, etc!
825                  * Re-determine all attributes still sane!
826                  * More stuff I haven't thought of!
827                  * Rrrgh!
828                  */
829                 if (ia->ri_id->device != id->device) {
830                         printk("RPC:       %s: can't reconnect on "
831                                 "different device!\n", __func__);
832                         rdma_destroy_id(id);
833                         rc = -ENETDOWN;
834                         goto out;
835                 }
836                 /* END TEMP */
837                 rdma_destroy_qp(ia->ri_id);
838                 rdma_destroy_id(ia->ri_id);
839                 ia->ri_id = id;
840         }
841
842         rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
843         if (rc) {
844                 dprintk("RPC:       %s: rdma_create_qp failed %i\n",
845                         __func__, rc);
846                 goto out;
847         }
848
849         /* XXX Tavor device performs badly with 2K MTU! */
850         if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
851                 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
852                 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
853                     (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
854                      pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
855                         struct ib_qp_attr attr = {
856                                 .path_mtu = IB_MTU_1024
857                         };
858                         rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
859                 }
860         }
861
862         ep->rep_connected = 0;
863
864         rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
865         if (rc) {
866                 dprintk("RPC:       %s: rdma_connect() failed with %i\n",
867                                 __func__, rc);
868                 goto out;
869         }
870
871         wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
872
873         /*
874          * Check state. A non-peer reject indicates no listener
875          * (ECONNREFUSED), which may be a transient state. All
876          * others indicate a transport condition which has already
877          * undergone best-effort recovery.
878          */
879         if (ep->rep_connected == -ECONNREFUSED
880             && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
881                 dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
882                 goto retry;
883         }
884         if (ep->rep_connected <= 0) {
885                 /* Sometimes, the only way to reliably connect to remote
886                  * CMs is to use the same nonzero values for ORD and IRD. */
887                 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
888                     (ep->rep_remote_cma.responder_resources == 0 ||
889                      ep->rep_remote_cma.initiator_depth !=
890                                 ep->rep_remote_cma.responder_resources)) {
891                         if (ep->rep_remote_cma.responder_resources == 0)
892                                 ep->rep_remote_cma.responder_resources = 1;
893                         ep->rep_remote_cma.initiator_depth =
894                                 ep->rep_remote_cma.responder_resources;
895                         goto retry;
896                 }
897                 rc = ep->rep_connected;
898         } else {
899                 dprintk("RPC:       %s: connected\n", __func__);
900         }
901
902 out:
903         if (rc)
904                 ep->rep_connected = rc;
905         return rc;
906 }
907
908 /*
909  * rpcrdma_ep_disconnect
910  *
911  * This is separate from destroy to facilitate the ability
912  * to reconnect without recreating the endpoint.
913  *
914  * This call is not reentrant, and must not be made in parallel
915  * on the same endpoint.
916  */
917 int
918 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
919 {
920         int rc;
921
922         rpcrdma_clean_cq(ep->rep_cq);
923         rc = rdma_disconnect(ia->ri_id);
924         if (!rc) {
925                 /* returns without wait if not connected */
926                 wait_event_interruptible(ep->rep_connect_wait,
927                                                         ep->rep_connected != 1);
928                 dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
929                         (ep->rep_connected == 1) ? "still " : "dis");
930         } else {
931                 dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
932                 ep->rep_connected = rc;
933         }
934         return rc;
935 }
936
937 /*
938  * Initialize buffer memory
939  */
940 int
941 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
942         struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
943 {
944         char *p;
945         size_t len;
946         int i, rc;
947         struct rpcrdma_mw *r;
948
949         buf->rb_max_requests = cdata->max_requests;
950         spin_lock_init(&buf->rb_lock);
951         atomic_set(&buf->rb_credits, 1);
952
953         /* Need to allocate:
954          *   1.  arrays for send and recv pointers
955          *   2.  arrays of struct rpcrdma_req to fill in pointers
956          *   3.  array of struct rpcrdma_rep for replies
957          *   4.  padding, if any
958          *   5.  mw's, fmr's or frmr's, if any
959          * Send/recv buffers in req/rep need to be registered
960          */
961
962         len = buf->rb_max_requests *
963                 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
964         len += cdata->padding;
965         switch (ia->ri_memreg_strategy) {
966         case RPCRDMA_FRMR:
967                 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
968                                 sizeof(struct rpcrdma_mw);
969                 break;
970         case RPCRDMA_MTHCAFMR:
971                 /* TBD we are perhaps overallocating here */
972                 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
973                                 sizeof(struct rpcrdma_mw);
974                 break;
975         case RPCRDMA_MEMWINDOWS_ASYNC:
976         case RPCRDMA_MEMWINDOWS:
977                 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
978                                 sizeof(struct rpcrdma_mw);
979                 break;
980         default:
981                 break;
982         }
983
984         /* allocate 1, 4 and 5 in one shot */
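        /* (2 and 3, the req/rep structures themselves, are kmalloc'ed individually below) */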
985         p = kzalloc(len, GFP_KERNEL);
986         if (p == NULL) {
987                 dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
988                         __func__, len);
989                 rc = -ENOMEM;
990                 goto out;
991         }
992         buf->rb_pool = p;       /* for freeing it later */
993
994         buf->rb_send_bufs = (struct rpcrdma_req **) p;
995         p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
996         buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
997         p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
998
999         /*
1000          * Register the zeroed pad buffer, if any.
1001          */
1002         if (cdata->padding) {
1003                 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1004                                             &ep->rep_pad_mr, &ep->rep_pad);
1005                 if (rc)
1006                         goto out;
1007         }
1008         p += cdata->padding;
1009
1010         /*
1011          * Allocate the fmr's, or mw's for mw_bind chunk registration.
1012          * We "cycle" the mw's in order to minimize rkey reuse,
1013          * and also reduce unbind-to-bind collision.
1014          */
1015         INIT_LIST_HEAD(&buf->rb_mws);
1016         r = (struct rpcrdma_mw *)p;
1017         switch (ia->ri_memreg_strategy) {
1018         case RPCRDMA_FRMR:
1019                 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1020                         r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1021                                                          RPCRDMA_MAX_SEGS);
1022                         if (IS_ERR(r->r.frmr.fr_mr)) {
1023                                 rc = PTR_ERR(r->r.frmr.fr_mr);
1024                                 dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
1025                                         " failed %i\n", __func__, rc);
1026                                 goto out;
1027                         }
1028                         r->r.frmr.fr_pgl =
1029                                 ib_alloc_fast_reg_page_list(ia->ri_id->device,
1030                                                             RPCRDMA_MAX_SEGS);
1031                         if (IS_ERR(r->r.frmr.fr_pgl)) {
1032                                 rc = PTR_ERR(r->r.frmr.fr_pgl);
1033                                 dprintk("RPC:       %s: "
1034                                         "ib_alloc_fast_reg_page_list "
1035                                         "failed %i\n", __func__, rc);
1036                                 goto out;
1037                         }
1038                         list_add(&r->mw_list, &buf->rb_mws);
1039                         ++r;
1040                 }
1041                 break;
1042         case RPCRDMA_MTHCAFMR:
1043                 /* TBD we are perhaps overallocating here */
1044                 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1045                         static struct ib_fmr_attr fa =
1046                                 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1047                         r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1048                                 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1049                                 &fa);
1050                         if (IS_ERR(r->r.fmr)) {
1051                                 rc = PTR_ERR(r->r.fmr);
1052                                 dprintk("RPC:       %s: ib_alloc_fmr"
1053                                         " failed %i\n", __func__, rc);
1054                                 goto out;
1055                         }
1056                         list_add(&r->mw_list, &buf->rb_mws);
1057                         ++r;
1058                 }
1059                 break;
1060         case RPCRDMA_MEMWINDOWS_ASYNC:
1061         case RPCRDMA_MEMWINDOWS:
1062                 /* Allocate one extra request's worth, for full cycling */
1063                 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1064                         r->r.mw = ib_alloc_mw(ia->ri_pd);
1065                         if (IS_ERR(r->r.mw)) {
1066                                 rc = PTR_ERR(r->r.mw);
1067                                 dprintk("RPC:       %s: ib_alloc_mw"
1068                                         " failed %i\n", __func__, rc);
1069                                 goto out;
1070                         }
1071                         list_add(&r->mw_list, &buf->rb_mws);
1072                         ++r;
1073                 }
1074                 break;
1075         default:
1076                 break;
1077         }
1078
1079         /*
1080          * Allocate/init the request/reply buffers. Doing this
1081          * using kmalloc for now -- one for each buf.
1082          */
1083         for (i = 0; i < buf->rb_max_requests; i++) {
1084                 struct rpcrdma_req *req;
1085                 struct rpcrdma_rep *rep;
1086
1087                 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1088                 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1089                 /* Typical ~2400b, so rounding up saves work later */
1090                 if (len < 4096)
1091                         len = 4096;
1092                 req = kmalloc(len, GFP_KERNEL);
1093                 if (req == NULL) {
1094                         dprintk("RPC:       %s: request buffer %d alloc"
1095                                 " failed\n", __func__, i);
1096                         rc = -ENOMEM;
1097                         goto out;
1098                 }
1099                 memset(req, 0, sizeof(struct rpcrdma_req));
1100                 buf->rb_send_bufs[i] = req;
1101                 buf->rb_send_bufs[i]->rl_buffer = buf;
1102
1103                 rc = rpcrdma_register_internal(ia, req->rl_base,
1104                                 len - offsetof(struct rpcrdma_req, rl_base),
1105                                 &buf->rb_send_bufs[i]->rl_handle,
1106                                 &buf->rb_send_bufs[i]->rl_iov);
1107                 if (rc)
1108                         goto out;
1109
1110                 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1111
1112                 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1113                 rep = kmalloc(len, GFP_KERNEL);
1114                 if (rep == NULL) {
1115                         dprintk("RPC:       %s: reply buffer %d alloc failed\n",
1116                                 __func__, i);
1117                         rc = -ENOMEM;
1118                         goto out;
1119                 }
1120                 memset(rep, 0, sizeof(struct rpcrdma_rep));
1121                 buf->rb_recv_bufs[i] = rep;
1122                 buf->rb_recv_bufs[i]->rr_buffer = buf;
1123                 init_waitqueue_head(&rep->rr_unbind);
1124
1125                 rc = rpcrdma_register_internal(ia, rep->rr_base,
1126                                 len - offsetof(struct rpcrdma_rep, rr_base),
1127                                 &buf->rb_recv_bufs[i]->rr_handle,
1128                                 &buf->rb_recv_bufs[i]->rr_iov);
1129                 if (rc)
1130                         goto out;
1131
1132         }
1133         dprintk("RPC:       %s: max_requests %d\n",
1134                 __func__, buf->rb_max_requests);
1135         /* done */
1136         return 0;
1137 out:
1138         rpcrdma_buffer_destroy(buf);
1139         return rc;
1140 }
1141
1142 /*
1143  * Unregister and destroy buffer memory. Need to deal with
1144  * partial initialization, so it's callable from failed create.
1145  * Must be called before destroying endpoint, as registrations
1146  * reference it.
1147  */
1148 void
1149 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1150 {
1151         int rc, i;
1152         struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1153         struct rpcrdma_mw *r;
1154
1155         /* clean up in reverse order from create
1156          *   1.  recv mr memory (mr free, then kfree)
1157          *   1a. bind mw memory
1158          *   2.  send mr memory (mr free, then kfree)
1159          *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
1160          *   4.  arrays
1161          */
1162         dprintk("RPC:       %s: entering\n", __func__);
1163
1164         for (i = 0; i < buf->rb_max_requests; i++) {
1165                 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1166                         rpcrdma_deregister_internal(ia,
1167                                         buf->rb_recv_bufs[i]->rr_handle,
1168                                         &buf->rb_recv_bufs[i]->rr_iov);
1169                         kfree(buf->rb_recv_bufs[i]);
1170                 }
1171                 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1172                         while (!list_empty(&buf->rb_mws)) {
1173                                 r = list_entry(buf->rb_mws.next,
1174                                         struct rpcrdma_mw, mw_list);
1175                                 list_del(&r->mw_list);
1176                                 switch (ia->ri_memreg_strategy) {
1177                                 case RPCRDMA_FRMR:
1178                                         rc = ib_dereg_mr(r->r.frmr.fr_mr);
1179                                         if (rc)
1180                                                 dprintk("RPC:       %s:"
1181                                                         " ib_dereg_mr"
1182                                                         " failed %i\n",
1183                                                         __func__, rc);
1184                                         ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1185                                         break;
1186                                 case RPCRDMA_MTHCAFMR:
1187                                         rc = ib_dealloc_fmr(r->r.fmr);
1188                                         if (rc)
1189                                                 dprintk("RPC:       %s:"
1190                                                         " ib_dealloc_fmr"
1191                                                         " failed %i\n",
1192                                                         __func__, rc);
1193                                         break;
1194                                 case RPCRDMA_MEMWINDOWS_ASYNC:
1195                                 case RPCRDMA_MEMWINDOWS:
1196                                         rc = ib_dealloc_mw(r->r.mw);
1197                                         if (rc)
1198                                                 dprintk("RPC:       %s:"
1199                                                         " ib_dealloc_mw"
1200                                                         " failed %i\n",
1201                                                         __func__, rc);
1202                                         break;
1203                                 default:
1204                                         break;
1205                                 }
1206                         }
1207                         rpcrdma_deregister_internal(ia,
1208                                         buf->rb_send_bufs[i]->rl_handle,
1209                                         &buf->rb_send_bufs[i]->rl_iov);
1210                         kfree(buf->rb_send_bufs[i]);
1211                 }
1212         }
1213
1214         kfree(buf->rb_pool);
1215 }
1216
1217 /*
1218  * Get a set of request/reply buffers.
1219  *
1220  * Reply buffer (if needed) is attached to send buffer upon return.
1221  * Rule:
1222  *    rb_send_index and rb_recv_index MUST always be pointing to the
1223  *    *next* available buffer (non-NULL). They are incremented after
1224  *    removing buffers, and decremented *before* returning them.
1225  */
1226 struct rpcrdma_req *
1227 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1228 {
1229         struct rpcrdma_req *req;
1230         unsigned long flags;
1231         int i;
1232         struct rpcrdma_mw *r;
1233
1234         spin_lock_irqsave(&buffers->rb_lock, flags);
1235         if (buffers->rb_send_index == buffers->rb_max_requests) {
1236                 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1237                 dprintk("RPC:       %s: out of request buffers\n", __func__);
1238                 return NULL;
1239         }
1240
1241         req = buffers->rb_send_bufs[buffers->rb_send_index];
1242         if (buffers->rb_send_index < buffers->rb_recv_index) {
1243                 dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
1244                         __func__,
1245                         buffers->rb_recv_index - buffers->rb_send_index);
1246                 req->rl_reply = NULL;
1247         } else {
1248                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1249                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1250         }
1251         buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
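        /*
         * Reserve a worst-case set of MWs (one per segment) for this
         * request; rpcrdma_buffer_put() returns them to rb_mws.
         */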
1252         if (!list_empty(&buffers->rb_mws)) {
1253                 i = RPCRDMA_MAX_SEGS - 1;
1254                 do {
1255                         r = list_entry(buffers->rb_mws.next,
1256                                         struct rpcrdma_mw, mw_list);
1257                         list_del(&r->mw_list);
1258                         req->rl_segments[i].mr_chunk.rl_mw = r;
1259                 } while (--i >= 0);
1260         }
1261         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1262         return req;
1263 }
1264
1265 /*
1266  * Put request/reply buffers back into pool.
1267  * Pre-decrement counter/array index.
1268  */
1269 void
1270 rpcrdma_buffer_put(struct rpcrdma_req *req)
1271 {
1272         struct rpcrdma_buffer *buffers = req->rl_buffer;
1273         struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1274         int i;
1275         unsigned long flags;
1276
1277         BUG_ON(req->rl_nchunks != 0);
1278         spin_lock_irqsave(&buffers->rb_lock, flags);
1279         buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1280         req->rl_niovs = 0;
1281         if (req->rl_reply) {
1282                 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1283                 init_waitqueue_head(&req->rl_reply->rr_unbind);
1284                 req->rl_reply->rr_func = NULL;
1285                 req->rl_reply = NULL;
1286         }
1287         switch (ia->ri_memreg_strategy) {
1288         case RPCRDMA_FRMR:
1289         case RPCRDMA_MTHCAFMR:
1290         case RPCRDMA_MEMWINDOWS_ASYNC:
1291         case RPCRDMA_MEMWINDOWS:
1292                 /*
1293                  * Cycle mw's back in reverse order, and "spin" them.
1294                  * This delays and scrambles reuse as much as possible.
1295                  */
1296                 i = 1;
1297                 do {
1298                         struct rpcrdma_mw **mw;
1299                         mw = &req->rl_segments[i].mr_chunk.rl_mw;
1300                         list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1301                         *mw = NULL;
1302                 } while (++i < RPCRDMA_MAX_SEGS);
1303                 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1304                                         &buffers->rb_mws);
1305                 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1306                 break;
1307         default:
1308                 break;
1309         }
1310         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1311 }
1312
1313 /*
1314  * Recover reply buffers from pool.
1315  * This happens when recovering from error conditions.
1316  * Post-increment counter/array index.
1317  */
1318 void
1319 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1320 {
1321         struct rpcrdma_buffer *buffers = req->rl_buffer;
1322         unsigned long flags;
1323
1324         if (req->rl_iov.length == 0)    /* special case xprt_rdma_allocate() */
1325                 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1326         spin_lock_irqsave(&buffers->rb_lock, flags);
1327         if (buffers->rb_recv_index < buffers->rb_max_requests) {
1328                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1329                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1330         }
1331         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1332 }
1333
1334 /*
1335  * Put reply buffers back into pool when not attached to
1336  * request. This happens in error conditions, and when
1337  * aborting unbinds. Pre-decrement counter/array index.
1338  */
1339 void
1340 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1341 {
1342         struct rpcrdma_buffer *buffers = rep->rr_buffer;
1343         unsigned long flags;
1344
1345         rep->rr_func = NULL;
1346         spin_lock_irqsave(&buffers->rb_lock, flags);
1347         buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1348         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1349 }
1350
1351 /*
1352  * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1353  */
1354
1355 int
1356 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1357                                 struct ib_mr **mrp, struct ib_sge *iov)
1358 {
1359         struct ib_phys_buf ipb;
1360         struct ib_mr *mr;
1361         int rc;
1362
1363         /*
1364          * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1365          */
1366         iov->addr = ib_dma_map_single(ia->ri_id->device,
1367                         va, len, DMA_BIDIRECTIONAL);
1368         iov->length = len;
1369
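        /*
         * Prefer the device's global DMA lkey, then the adapter-wide DMA MR
         * set up at IA open; only fall back to registering this buffer with
         * ib_reg_phys_mr() when neither is available.
         */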
1370         if (ia->ri_have_dma_lkey) {
1371                 *mrp = NULL;
1372                 iov->lkey = ia->ri_dma_lkey;
1373                 return 0;
1374         } else if (ia->ri_bind_mem != NULL) {
1375                 *mrp = NULL;
1376                 iov->lkey = ia->ri_bind_mem->lkey;
1377                 return 0;
1378         }
1379
1380         ipb.addr = iov->addr;
1381         ipb.size = iov->length;
1382         mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1383                         IB_ACCESS_LOCAL_WRITE, &iov->addr);
1384
1385         dprintk("RPC:       %s: phys convert: 0x%llx "
1386                         "registered 0x%llx length %d\n",
1387                         __func__, (unsigned long long)ipb.addr,
1388                         (unsigned long long)iov->addr, len);
1389
1390         if (IS_ERR(mr)) {
1391                 *mrp = NULL;
1392                 rc = PTR_ERR(mr);
1393                 dprintk("RPC:       %s: failed with %i\n", __func__, rc);
1394         } else {
1395                 *mrp = mr;
1396                 iov->lkey = mr->lkey;
1397                 rc = 0;
1398         }
1399
1400         return rc;
1401 }
1402
1403 int
1404 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1405                                 struct ib_mr *mr, struct ib_sge *iov)
1406 {
1407         int rc;
1408
1409         ib_dma_unmap_single(ia->ri_id->device,
1410                         iov->addr, iov->length, DMA_BIDIRECTIONAL);
1411
1412         if (mr == NULL)
1413                 return 0;
1414
1415         rc = ib_dereg_mr(mr);
1416         if (rc)
1417                 dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
1418         return rc;
1419 }
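
rpcrdma_register_internal() prefers the cheapest way to obtain a usable lkey for a kmalloc'ed buffer: the device's local DMA lkey if the HCA provides one, else the lkey of the long-lived all-memory MR (ri_bind_mem) set up for this memreg strategy, else a one-off ib_reg_phys_mr(). A compressed sketch of that preference order, with hypothetical stand-in types and no InfiniBand calls:

        #include <stdio.h>
        #include <stdbool.h>
        #include <stdint.h>

        /* Hypothetical stand-ins for the fields consulted above. */
        struct ia_caps {
                bool     have_dma_lkey;         /* ri_have_dma_lkey */
                uint32_t dma_lkey;              /* ri_dma_lkey */
                bool     have_bind_mem;         /* ri_bind_mem != NULL */
                uint32_t bind_lkey;             /* ri_bind_mem->lkey */
        };

        /* Returns true if no per-buffer MR is needed; *lkey is then usable as-is. */
        static bool pick_lkey(const struct ia_caps *ia, uint32_t *lkey)
        {
                if (ia->have_dma_lkey) {
                        *lkey = ia->dma_lkey;   /* cheapest: device-wide lkey */
                        return true;
                }
                if (ia->have_bind_mem) {
                        *lkey = ia->bind_lkey;  /* reuse the long-lived MR */
                        return true;
                }
                return false;   /* caller must register this buffer individually */
        }

        int main(void)
        {
                struct ia_caps ia = { .have_dma_lkey = false,
                                      .have_bind_mem = true, .bind_lkey = 0x1234 };
                uint32_t lkey;

                if (pick_lkey(&ia, &lkey))
                        printf("using lkey 0x%x, no extra MR\n", (unsigned)lkey);
                else
                        printf("fall back to ib_reg_phys_mr()\n");
                return 0;
        }
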
1420
1421 /*
1422  * Wrappers for chunk registration, shared by read/write chunk code.
1423  */
1424
1425 static void
1426 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1427 {
1428         seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1429         seg->mr_dmalen = seg->mr_len;
1430         if (seg->mr_page)
1431                 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1432                                 seg->mr_page, offset_in_page(seg->mr_offset),
1433                                 seg->mr_dmalen, seg->mr_dir);
1434         else
1435                 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1436                                 seg->mr_offset,
1437                                 seg->mr_dmalen, seg->mr_dir);
1438 }
1439
1440 static void
1441 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1442 {
1443         if (seg->mr_page)
1444                 ib_dma_unmap_page(ia->ri_id->device,
1445                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1446         else
1447                 ib_dma_unmap_single(ia->ri_id->device,
1448                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1449 }
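
The DMA direction chosen in rpcrdma_map_one() follows the RDMA data direction: "writing" means the remote peer will RDMA Write into this memory (a write/reply chunk target), so the device fills host memory and DMA_FROM_DEVICE is used; otherwise the peer RDMA Reads it and the data flows host-to-device. A tiny sketch of just that choice (hypothetical enum, not the DMA API):

        #include <stdio.h>

        enum dma_dir { DIR_TO_DEVICE, DIR_FROM_DEVICE };

        /* Mirrors the ternary at the top of rpcrdma_map_one(). */
        static enum dma_dir chunk_dma_dir(int remote_will_write)
        {
                /*
                 * remote_will_write != 0: the server RDMA-Writes reply data
                 * into this buffer, so the device fills host memory.
                 * Otherwise the server RDMA-Reads it: host memory feeds the device.
                 */
                return remote_will_write ? DIR_FROM_DEVICE : DIR_TO_DEVICE;
        }

        int main(void)
        {
                printf("write chunk target: %s\n",
                       chunk_dma_dir(1) == DIR_FROM_DEVICE ? "FROM_DEVICE" : "TO_DEVICE");
                printf("read chunk source:  %s\n",
                       chunk_dma_dir(0) == DIR_FROM_DEVICE ? "FROM_DEVICE" : "TO_DEVICE");
                return 0;
        }
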
1450
1451 static int
1452 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1453                         int *nsegs, int writing, struct rpcrdma_ia *ia,
1454                         struct rpcrdma_xprt *r_xprt)
1455 {
1456         struct rpcrdma_mr_seg *seg1 = seg;
1457         struct ib_send_wr frmr_wr, *bad_wr;
1458         u8 key;
1459         int len, pageoff;
1460         int i, rc;
1461
1462         pageoff = offset_in_page(seg1->mr_offset);
1463         seg1->mr_offset -= pageoff;     /* start of page */
1464         seg1->mr_len += pageoff;
1465         len = -pageoff;
1466         if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1467                 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1468         for (i = 0; i < *nsegs;) {
1469                 rpcrdma_map_one(ia, seg, writing);
1470                 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1471                 len += seg->mr_len;
1472                 ++seg;
1473                 ++i;
1474                 /* Check for holes */
1475                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1476                     offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1477                         break;
1478         }
1479         dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
1480                 __func__, seg1->mr_chunk.rl_mw, i);
1481
1482         /* Bump the key */
1483         key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1484         ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1485
1486         /* Prepare FRMR WR */
1487         memset(&frmr_wr, 0, sizeof frmr_wr);
1488         frmr_wr.opcode = IB_WR_FAST_REG_MR;
1489         frmr_wr.send_flags = 0;                 /* unsignaled */
1490         frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma;
1491         frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1492         frmr_wr.wr.fast_reg.page_list_len = i;
1493         frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1494         frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1495         frmr_wr.wr.fast_reg.access_flags = (writing ?
1496                                 IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ);
1497         frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1498         DECR_CQCOUNT(&r_xprt->rx_ep);
1499
1500         rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);
1501
1502         if (rc) {
1503                 dprintk("RPC:       %s: failed ib_post_send for register,"
1504                         " status %i\n", __func__, rc);
1505                 while (i--)
1506                         rpcrdma_unmap_one(ia, --seg);
1507         } else {
1508                 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1509                 seg1->mr_base = seg1->mr_dma + pageoff;
1510                 seg1->mr_nsegs = i;
1511                 seg1->mr_len = len;
1512         }
1513         *nsegs = i;
1514         return rc;
1515 }
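
The "bump the key" step relies on ib_update_fast_reg_key() replacing only the low-order byte of the FRMR's keys (as in kernels of this vintage), so each fast-register of the same MR carries a new key generation and rkeys handed to the peer by an earlier registration stop matching. A standalone sketch of that byte rotation (plain userspace C, not the verbs helper itself):

        #include <stdio.h>
        #include <stdint.h>

        /*
         * Minimal model of the rkey "bump": keep the upper 24 bits,
         * substitute an incremented low byte as the generation number.
         */
        static uint32_t bump_rkey(uint32_t rkey)
        {
                uint8_t key = (uint8_t)(rkey & 0xff);

                return (rkey & 0xffffff00) | (uint8_t)(key + 1);
        }

        int main(void)
        {
                uint32_t rkey = 0xabcd12ff;

                rkey = bump_rkey(rkey);
                /* 0xabcd1200: the generation wraps within the low 8 bits */
                printf("next rkey = 0x%08x\n", (unsigned)rkey);
                return 0;
        }
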
1516
1517 static int
1518 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1519                         struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1520 {
1521         struct rpcrdma_mr_seg *seg1 = seg;
1522         struct ib_send_wr invalidate_wr, *bad_wr;
1523         int rc;
1524
1525         while (seg1->mr_nsegs--)
1526                 rpcrdma_unmap_one(ia, seg++);
1527
1528         memset(&invalidate_wr, 0, sizeof invalidate_wr);
1529         invalidate_wr.opcode = IB_WR_LOCAL_INV;
1530         invalidate_wr.send_flags = 0;                   /* unsignaled */
1531         invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1532         DECR_CQCOUNT(&r_xprt->rx_ep);
1533
1534         rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1535         if (rc)
1536                 dprintk("RPC:       %s: failed ib_post_send for invalidate,"
1537                         " status %i\n", __func__, rc);
1538         return rc;
1539 }
1540
1541 static int
1542 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1543                         int *nsegs, int writing, struct rpcrdma_ia *ia)
1544 {
1545         struct rpcrdma_mr_seg *seg1 = seg;
1546         u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1547         int len, pageoff, i, rc;
1548
1549         pageoff = offset_in_page(seg1->mr_offset);
1550         seg1->mr_offset -= pageoff;     /* start of page */
1551         seg1->mr_len += pageoff;
1552         len = -pageoff;
1553         if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1554                 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1555         for (i = 0; i < *nsegs;) {
1556                 rpcrdma_map_one(ia, seg, writing);
1557                 physaddrs[i] = seg->mr_dma;
1558                 len += seg->mr_len;
1559                 ++seg;
1560                 ++i;
1561                 /* Check for holes */
1562                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1563                     offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1564                         break;
1565         }
1566         rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1567                                 physaddrs, i, seg1->mr_dma);
1568         if (rc) {
1569                 dprintk("RPC:       %s: failed ib_map_phys_fmr "
1570                         "%u@0x%llx+%i (%d)... status %i\n", __func__,
1571                         len, (unsigned long long)seg1->mr_dma,
1572                         pageoff, i, rc);
1573                 while (i--)
1574                         rpcrdma_unmap_one(ia, --seg);
1575         } else {
1576                 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1577                 seg1->mr_base = seg1->mr_dma + pageoff;
1578                 seg1->mr_nsegs = i;
1579                 seg1->mr_len = len;
1580         }
1581         *nsegs = i;
1582         return rc;
1583 }
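
The "Check for holes" tests in the FRMR and FMR loops above, and in the default-registration loop below, implement one rule: segments may share a single registration only while every interior boundary lands on a page boundary. The first segment has already been pulled back to a page start, and the last accumulated segment may end mid-page, but nothing may follow it, otherwise the single MR would expose bytes that are not RPC data. A standalone sketch of that predicate (hypothetical types, fixed 4 KB pages):

        #include <stdio.h>

        #define PAGE_SIZE 4096UL
        #define page_off(a) ((unsigned long)(a) & (PAGE_SIZE - 1))

        struct seg { unsigned long addr; unsigned long len; };

        /*
         * Count how many of the n segments can be folded into one
         * registration under the same rule as the loops above.
         */
        static int coalesce(const struct seg *s, int n)
        {
                int i = 0;

                while (i < n) {
                        i++;
                        if ((i < n && page_off(s[i].addr)) ||
                            page_off(s[i - 1].addr + s[i - 1].len))
                                break;
                }
                return i;
        }

        int main(void)
        {
                struct seg a[] = {
                        { 0x10000,        PAGE_SIZE },   /* full page          */
                        { 0x10000 + 4096, 100       },   /* short tail         */
                        { 0x20000,        PAGE_SIZE },   /* excluded: the tail */
                };                                       /* left a hole        */

                printf("coalesced %d of 3 segments\n", coalesce(a, 3));  /* 2 */
                return 0;
        }
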
1584
1585 static int
1586 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1587                         struct rpcrdma_ia *ia)
1588 {
1589         struct rpcrdma_mr_seg *seg1 = seg;
1590         LIST_HEAD(l);
1591         int rc;
1592
1593         list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1594         rc = ib_unmap_fmr(&l);
1595         while (seg1->mr_nsegs--)
1596                 rpcrdma_unmap_one(ia, seg++);
1597         if (rc)
1598                 dprintk("RPC:       %s: failed ib_unmap_fmr,"
1599                         " status %i\n", __func__, rc);
1600         return rc;
1601 }
1602
1603 static int
1604 rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1605                         int *nsegs, int writing, struct rpcrdma_ia *ia,
1606                         struct rpcrdma_xprt *r_xprt)
1607 {
1608         int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1609                                   IB_ACCESS_REMOTE_READ);
1610         struct ib_mw_bind param;
1611         int rc;
1612
1613         *nsegs = 1;
1614         rpcrdma_map_one(ia, seg, writing);
1615         param.mr = ia->ri_bind_mem;
1616         param.wr_id = 0ULL;     /* no send cookie */
1617         param.addr = seg->mr_dma;
1618         param.length = seg->mr_len;
1619         param.send_flags = 0;
1620         param.mw_access_flags = mem_priv;
1621
1622         DECR_CQCOUNT(&r_xprt->rx_ep);
1623         rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1624         if (rc) {
1625                 dprintk("RPC:       %s: failed ib_bind_mw "
1626                         "%u@0x%llx status %i\n",
1627                         __func__, seg->mr_len,
1628                         (unsigned long long)seg->mr_dma, rc);
1629                 rpcrdma_unmap_one(ia, seg);
1630         } else {
1631                 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1632                 seg->mr_base = param.addr;
1633                 seg->mr_nsegs = 1;
1634         }
1635         return rc;
1636 }
1637
1638 static int
1639 rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1640                         struct rpcrdma_ia *ia,
1641                         struct rpcrdma_xprt *r_xprt, void **r)
1642 {
1643         struct ib_mw_bind param;
1644         LIST_HEAD(l);
1645         int rc;
1646
1647         BUG_ON(seg->mr_nsegs != 1);
1648         param.mr = ia->ri_bind_mem;
1649         param.addr = 0ULL;      /* unbind */
1650         param.length = 0;
1651         param.mw_access_flags = 0;
1652         if (*r) {
1653                 param.wr_id = (u64) (unsigned long) *r;
1654                 param.send_flags = IB_SEND_SIGNALED;
1655                 INIT_CQCOUNT(&r_xprt->rx_ep);
1656         } else {
1657                 param.wr_id = 0ULL;
1658                 param.send_flags = 0;
1659                 DECR_CQCOUNT(&r_xprt->rx_ep);
1660         }
1661         rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1662         rpcrdma_unmap_one(ia, seg);
1663         if (rc)
1664                 dprintk("RPC:       %s: failed ib_(un)bind_mw,"
1665                         " status %i\n", __func__, rc);
1666         else
1667                 *r = NULL;      /* will upcall on completion */
1668         return rc;
1669 }
1670
1671 static int
1672 rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1673                         int *nsegs, int writing, struct rpcrdma_ia *ia)
1674 {
1675         int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1676                                   IB_ACCESS_REMOTE_READ);
1677         struct rpcrdma_mr_seg *seg1 = seg;
1678         struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1679         int len, i, rc = 0;
1680
1681         if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1682                 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1683         for (len = 0, i = 0; i < *nsegs;) {
1684                 rpcrdma_map_one(ia, seg, writing);
1685                 ipb[i].addr = seg->mr_dma;
1686                 ipb[i].size = seg->mr_len;
1687                 len += seg->mr_len;
1688                 ++seg;
1689                 ++i;
1690                 /* Check for holes */
1691                 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1692                     offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1693                         break;
1694         }
1695         seg1->mr_base = seg1->mr_dma;
1696         seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1697                                 ipb, i, mem_priv, &seg1->mr_base);
1698         if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1699                 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1700                 dprintk("RPC:       %s: failed ib_reg_phys_mr "
1701                         "%u@0x%llx (%d)... status %i\n",
1702                         __func__, len,
1703                         (unsigned long long)seg1->mr_dma, i, rc);
1704                 while (i--)
1705                         rpcrdma_unmap_one(ia, --seg);
1706         } else {
1707                 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1708                 seg1->mr_nsegs = i;
1709                 seg1->mr_len = len;
1710         }
1711         *nsegs = i;
1712         return rc;
1713 }
1714
1715 static int
1716 rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1717                         struct rpcrdma_ia *ia)
1718 {
1719         struct rpcrdma_mr_seg *seg1 = seg;
1720         int rc;
1721
1722         rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1723         seg1->mr_chunk.rl_mr = NULL;
1724         while (seg1->mr_nsegs--)
1725                 rpcrdma_unmap_one(ia, seg++);
1726         if (rc)
1727                 dprintk("RPC:       %s: failed ib_dereg_mr,"
1728                         " status %i\n", __func__, rc);
1729         return rc;
1730 }
1731
1732 int
1733 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1734                         int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1735 {
1736         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1737         int rc = 0;
1738
1739         switch (ia->ri_memreg_strategy) {
1740
1741 #if RPCRDMA_PERSISTENT_REGISTRATION
1742         case RPCRDMA_ALLPHYSICAL:
1743                 rpcrdma_map_one(ia, seg, writing);
1744                 seg->mr_rkey = ia->ri_bind_mem->rkey;
1745                 seg->mr_base = seg->mr_dma;
1746                 seg->mr_nsegs = 1;
1747                 nsegs = 1;
1748                 break;
1749 #endif
1750
1751         /* Registration using fast registration memory regions (FRMR) */
1752         case RPCRDMA_FRMR:
1753                 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1754                 break;
1755
1756         /* Registration using fast memory regions (FMR) */
1757         case RPCRDMA_MTHCAFMR:
1758                 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1759                 break;
1760
1761         /* Registration using memory windows */
1762         case RPCRDMA_MEMWINDOWS_ASYNC:
1763         case RPCRDMA_MEMWINDOWS:
1764                 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1765                 break;
1766
1767         /* Default registration each time */
1768         default:
1769                 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1770                 break;
1771         }
1772         if (rc)
1773                 return -1;
1774
1775         return nsegs;
1776 }
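
Whatever strategy is selected, rpcrdma_register_external() consumes at most nsegs segments starting at seg, covers them with a single rkey, and returns how many it actually took (or -1), so the chunk-marshaling code elsewhere in the transport can walk a long segment list one registration at a time. A hypothetical caller loop with a stub in place of the real function (an assumed per-call limit of 3 segments, not a property of the code above):

        #include <stdio.h>

        /*
         * Stub with the same contract as rpcrdma_register_external():
         * register up to nsegs segments, return how many were covered, or -1.
         */
        static int register_external_stub(int first, int nsegs)
        {
                int covered = nsegs < 3 ? nsegs : 3;

                printf("one chunk: segs %d..%d under one rkey\n",
                       first, first + covered - 1);
                return covered;
        }

        int main(void)
        {
                int total = 7, done = 0, n;

                while (done < total) {
                        n = register_external_stub(done, total - done);
                        if (n < 0)
                                return 1;   /* marshaling fails, caller backs out */
                        done += n;          /* next chunk starts after this one */
                }
                return 0;
        }
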
1777
1778 int
1779 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1780                 struct rpcrdma_xprt *r_xprt, void *r)
1781 {
1782         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1783         int nsegs = seg->mr_nsegs, rc;
1784
1785         switch (ia->ri_memreg_strategy) {
1786
1787 #if RPCRDMA_PERSISTENT_REGISTRATION
1788         case RPCRDMA_ALLPHYSICAL:
1789                 BUG_ON(nsegs != 1);
1790                 rpcrdma_unmap_one(ia, seg);
1791                 rc = 0;
1792                 break;
1793 #endif
1794
1795         case RPCRDMA_FRMR:
1796                 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1797                 break;
1798
1799         case RPCRDMA_MTHCAFMR:
1800                 rc = rpcrdma_deregister_fmr_external(seg, ia);
1801                 break;
1802
1803         case RPCRDMA_MEMWINDOWS_ASYNC:
1804         case RPCRDMA_MEMWINDOWS:
1805                 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1806                 break;
1807
1808         default:
1809                 rc = rpcrdma_deregister_default_external(seg, ia);
1810                 break;
1811         }
1812         if (r) {
1813                 struct rpcrdma_rep *rep = r;
1814                 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1815                 rep->rr_func = NULL;
1816                 func(rep);      /* dereg done, callback now */
1817         }
1818         return nsegs;
1819 }
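
The tail of rpcrdma_deregister_external() handles the reply callback in one of two ways: a strategy that must wait for a work completion (the signaled memory-window unbind above) consumes the reply pointer by setting it to NULL, deferring the upcall to the completion handler; otherwise the callback runs right here, synchronously. A minimal sketch of that immediate-versus-deferred split (hypothetical names):

        #include <stdio.h>
        #include <stddef.h>

        struct rep { const char *tag; };

        static void reply_done(struct rep *r)   /* stands in for rep->rr_func */
        {
                printf("callback for %s\n", r->tag);
        }

        /* Mirrors the "if (r) { ... func(rep); }" tail above. */
        static void dereg_tail(struct rep **r)
        {
                if (*r) {
                        reply_done(*r);         /* dereg done, callback now */
                        *r = NULL;
                }
        }

        int main(void)
        {
                struct rep rep = { "sync case" };
                struct rep *r = &rep;

                dereg_tail(&r);         /* prints immediately */

                r = NULL;               /* async case: pointer consumed earlier */
                dereg_tail(&r);         /* nothing; completion handler will upcall */
                return 0;
        }
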
1820
1821 /*
1822  * Prepost any receive buffer, then post send.
1823  *
1824  * Receive buffer is donated to hardware, reclaimed upon recv completion.
1825  */
1826 int
1827 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1828                 struct rpcrdma_ep *ep,
1829                 struct rpcrdma_req *req)
1830 {
1831         struct ib_send_wr send_wr, *send_wr_fail;
1832         struct rpcrdma_rep *rep = req->rl_reply;
1833         int rc;
1834
1835         if (rep) {
1836                 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1837                 if (rc)
1838                         goto out;
1839                 req->rl_reply = NULL;
1840         }
1841
1842         send_wr.next = NULL;
1843         send_wr.wr_id = 0ULL;   /* no send cookie */
1844         send_wr.sg_list = req->rl_send_iov;
1845         send_wr.num_sge = req->rl_niovs;
1846         send_wr.opcode = IB_WR_SEND;
1847         if (send_wr.num_sge == 4)       /* no need to sync any pad (constant) */
1848                 ib_dma_sync_single_for_device(ia->ri_id->device,
1849                         req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1850                         DMA_TO_DEVICE);
1851         ib_dma_sync_single_for_device(ia->ri_id->device,
1852                 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1853                 DMA_TO_DEVICE);
1854         ib_dma_sync_single_for_device(ia->ri_id->device,
1855                 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1856                 DMA_TO_DEVICE);
1857
1858         if (DECR_CQCOUNT(ep) > 0)
1859                 send_wr.send_flags = 0;
1860         else { /* Provider must take a send completion every now and then */
1861                 INIT_CQCOUNT(ep);
1862                 send_wr.send_flags = IB_SEND_SIGNALED;
1863         }
1864
1865         rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1866         if (rc)
1867                 dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
1868                         rc);
1869 out:
1870         return rc;
1871 }
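
The DECR_CQCOUNT/INIT_CQCOUNT pair throttles send completions: most sends go out unsignaled, and only when the countdown reaches zero is IB_SEND_SIGNALED set and the budget reloaded, so the provider still retires work requests periodically. A standalone sketch of that countdown (CQ_BUDGET is an assumed stand-in for the value INIT_CQCOUNT() loads from the endpoint; the real macros operate on an atomic counter):

        #include <stdio.h>

        #define CQ_BUDGET 8     /* assumed stand-in for ep->rep_cqinit */

        static int cqcount = CQ_BUDGET;

        /* Returns nonzero when this send should carry IB_SEND_SIGNALED. */
        static int need_signal(void)
        {
                if (--cqcount > 0)
                        return 0;       /* unsignaled: no completion requested */
                cqcount = CQ_BUDGET;    /* INIT_CQCOUNT(): reload the budget */
                return 1;               /* every Nth send asks for a CQE */
        }

        int main(void)
        {
                int i;

                for (i = 1; i <= 20; i++)
                        if (need_signal())
                                printf("send %d signaled\n", i);   /* 8, 16 */
                return 0;
        }
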
1872
1873 /*
1874  * (Re)post a receive buffer.
1875  */
1876 int
1877 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1878                      struct rpcrdma_ep *ep,
1879                      struct rpcrdma_rep *rep)
1880 {
1881         struct ib_recv_wr recv_wr, *recv_wr_fail;
1882         int rc;
1883
1884         recv_wr.next = NULL;
1885         recv_wr.wr_id = (u64) (unsigned long) rep;
1886         recv_wr.sg_list = &rep->rr_iov;
1887         recv_wr.num_sge = 1;
1888
1889         ib_dma_sync_single_for_cpu(ia->ri_id->device,
1890                 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1891
1892         DECR_CQCOUNT(ep);
1893         rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1894
1895         if (rc)
1896                 dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
1897                         rc);
1898         return rc;
1899 }
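
recv_wr.wr_id carries the rpcrdma_rep pointer itself, so the receive completion handler can recover the buffer that was just filled directly from the work completion cookie. A minimal sketch of round-tripping a pointer through the 64-bit wr_id (plain userspace C, hypothetical struct):

        #include <stdio.h>
        #include <stdint.h>

        struct rep { int credits; };

        int main(void)
        {
                struct rep r = { 32 };

                /* Post side: stash the buffer pointer in the 64-bit cookie. */
                uint64_t wr_id = (uint64_t)(uintptr_t)&r;

                /* Completion side: the handler recovers the same buffer. */
                struct rep *back = (struct rep *)(uintptr_t)wr_id;

                printf("recovered rep with credits=%d\n", back->credits);
                return 0;
        }
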