}
 
        if (xdrbuf->tail[0].iov_len) {
+               /* The RPC/RDMA protocol allows us to omit any trailing
+                * XDR pad bytes, saving the server an extra RDMA operation. */
+               if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
+                       return n;
                if (n == nsegs)
                        return 0;
                seg[n].mr_page = NULL;
  * Scatter inline received data back into provided iov's.
  */
 static void
-rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
+rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 {
        int i, npages, curlen, olen;
        char *destp;
        } else
                rqst->rq_rcv_buf.tail[0].iov_len = 0;
 
+       if (pad) {
+               /* implicit padding on terminal chunk */
+               unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
+               while (pad--)
+                       p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
+       }
+
        if (copy_len)
                dprintk("RPC:       %s: %d bytes in"
                        " %d extra segments (%d lost)\n",
                            ((unsigned char *)iptr - (unsigned char *)headerp);
                        status = rep->rr_len + rdmalen;
                        r_xprt->rx_stats.total_rdma_reply += rdmalen;
+                       /* special case - last chunk may omit padding */
+                       if (rdmalen &= 3) {
+                               rdmalen = 4 - rdmalen;
+                               status += rdmalen;
+                       }
                } else {
                        /* else ordinary inline */
+                       rdmalen = 0;
                        iptr = (__be32 *)((unsigned char *)headerp + 28);
                        rep->rr_len -= 28; /*sizeof *headerp;*/
                        status = rep->rr_len;
                }
                /* Fix up the rpc results for upper layer */
-               rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len);
+               rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
                break;
 
        case __constant_htonl(RDMA_NOMSG):
 
 static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_inline_write_padding;
 static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
+                int xprt_rdma_pad_optimize = 0;
 
 #ifdef RPC_DEBUG
 
                .extra1         = &min_memreg,
                .extra2         = &max_memreg,
        },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "rdma_pad_optimize",
+               .data           = &xprt_rdma_pad_optimize,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
        {
                .ctl_name = 0,
        },
 
 #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
 #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
 
+/* Setting this to 0 ensures interoperability with early servers
+ * that do not understand omitted tail padding.
+ * Setting this to 1 improves certain unaligned read/write performance.
+ * Default is 0; see the sysctl entry in transport.c and
+ * rpcrdma_convert_iovs() in rpc_rdma.c. */
+extern int xprt_rdma_pad_optimize;
+
 /*
  * Interface Adapter calls - xprtrdma/verbs.c
  */