1*c0dd49bdSEiji Ota /* 2*c0dd49bdSEiji Ota * CDDL HEADER START 3*c0dd49bdSEiji Ota * 4*c0dd49bdSEiji Ota * The contents of this file are subject to the terms of the 5*c0dd49bdSEiji Ota * Common Development and Distribution License (the "License"). 6*c0dd49bdSEiji Ota * You may not use this file except in compliance with the License. 7*c0dd49bdSEiji Ota * 8*c0dd49bdSEiji Ota * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*c0dd49bdSEiji Ota * or http://www.opensolaris.org/os/licensing. 10*c0dd49bdSEiji Ota * See the License for the specific language governing permissions 11*c0dd49bdSEiji Ota * and limitations under the License. 12*c0dd49bdSEiji Ota * 13*c0dd49bdSEiji Ota * When distributing Covered Code, include this CDDL HEADER in each 14*c0dd49bdSEiji Ota * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*c0dd49bdSEiji Ota * If applicable, add the following below this CDDL HEADER, with the 16*c0dd49bdSEiji Ota * fields enclosed by brackets "[]" replaced with your own identifying 17*c0dd49bdSEiji Ota * information: Portions Copyright [yyyy] [name of copyright owner] 18*c0dd49bdSEiji Ota * 19*c0dd49bdSEiji Ota * CDDL HEADER END 20*c0dd49bdSEiji Ota */ 21*c0dd49bdSEiji Ota /* 22*c0dd49bdSEiji Ota * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23*c0dd49bdSEiji Ota */ 24*c0dd49bdSEiji Ota 25*c0dd49bdSEiji Ota /* 26*c0dd49bdSEiji Ota * Copyright (c) 2006 Oracle. All rights reserved. 27*c0dd49bdSEiji Ota * 28*c0dd49bdSEiji Ota * This software is available to you under a choice of one of two 29*c0dd49bdSEiji Ota * licenses. 
You may choose to be licensed under the terms of the GNU 30*c0dd49bdSEiji Ota * General Public License (GPL) Version 2, available from the file 31*c0dd49bdSEiji Ota * COPYING in the main directory of this source tree, or the 32*c0dd49bdSEiji Ota * OpenIB.org BSD license below: 33*c0dd49bdSEiji Ota * 34*c0dd49bdSEiji Ota * Redistribution and use in source and binary forms, with or 35*c0dd49bdSEiji Ota * without modification, are permitted provided that the following 36*c0dd49bdSEiji Ota * conditions are met: 37*c0dd49bdSEiji Ota * 38*c0dd49bdSEiji Ota * - Redistributions of source code must retain the above 39*c0dd49bdSEiji Ota * copyright notice, this list of conditions and the following 40*c0dd49bdSEiji Ota * disclaimer. 41*c0dd49bdSEiji Ota * 42*c0dd49bdSEiji Ota * - Redistributions in binary form must reproduce the above 43*c0dd49bdSEiji Ota * copyright notice, this list of conditions and the following 44*c0dd49bdSEiji Ota * disclaimer in the documentation and/or other materials 45*c0dd49bdSEiji Ota * provided with the distribution. 46*c0dd49bdSEiji Ota * 47*c0dd49bdSEiji Ota * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48*c0dd49bdSEiji Ota * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49*c0dd49bdSEiji Ota * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50*c0dd49bdSEiji Ota * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51*c0dd49bdSEiji Ota * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52*c0dd49bdSEiji Ota * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53*c0dd49bdSEiji Ota * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54*c0dd49bdSEiji Ota * SOFTWARE. 
 *
 */
#include <sys/stropts.h>
#include <sys/systm.h>

#include <sys/rds.h>
#include <sys/socket.h>
#include <sys/socketvar.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/rdma.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

/*
 * When transmitting messages in rdsv3_send_xmit, we need to emerge from
 * time to time and briefly release the CPU. Otherwise the softlock watchdog
 * will kick our shin.
 * Also, it seems fairer to not let one busy connection stall all the
 * others.
 *
 * send_batch_count is the number of times we'll loop in send_xmit. Setting
 * it to 0 will restore the old behavior (where we looped until we had
 * drained the queue).
 */
static int send_batch_count = 64;

/*
 * NOTE(review): presumably provided by the IB transport
 * (rdsv3_ib_send.c); releases the transport mapping for an RDMA op.
 * Confirm against the transport source.
 */
extern void rdsv3_ib_send_unmap_rdma(void *ic, struct rdsv3_rdma_op *op);
/*
 * Reset the send state. Caller must hold c_send_lock when calling here.
 */
void
rdsv3_send_reset(struct rdsv3_connection *conn)
{
	struct rdsv3_message *rm, *tmp;
	struct rdsv3_rdma_op *ro;

	RDSV3_DPRINTF4("rdsv3_send_reset", "Enter(conn: %p)", conn);

	/*
	 * If a message was being transmitted when the connection went down,
	 * drop the connection's reference to it and release any RDMA
	 * resources the transport still has mapped for it.
	 */
	if (conn->c_xmit_rm) {
		rm = conn->c_xmit_rm;
		ro = rm->m_rdma_op;
		if (ro && ro->r_mapped) {
			RDSV3_DPRINTF2("rdsv3_send_reset",
			    "rm %p mflg 0x%x map %d mihdl %p sgl %p",
			    rm, rm->m_flags, ro->r_mapped,
			    ro->r_rdma_sg[0].mihdl,
			    ro->r_rdma_sg[0].swr.wr_sgl);
			rdsv3_ib_send_unmap_rdma(conn->c_transport_data, ro);
		}
		/*
		 * Tell the user the RDMA op is no longer mapped by the
		 * transport. This isn't entirely true (it's flushed out
		 * independently) but as the connection is down, there's
		 * no ongoing RDMA to/from that memory
		 */
		rdsv3_message_unmapped(conn->c_xmit_rm);
		rdsv3_message_put(conn->c_xmit_rm);
		conn->c_xmit_rm = NULL;
	}

	/* Rewind the per-connection transmit cursor to the beginning. */
	conn->c_xmit_sg = 0;
	conn->c_xmit_hdr_off = 0;
	conn->c_xmit_data_off = 0;
	conn->c_xmit_rdma_sent = 0;

	conn->c_map_queued = 0;

	/* Start a fresh ACK accounting window. */
	conn->c_unacked_packets = rdsv3_sysctl_max_unacked_packets;
	conn->c_unacked_bytes = rdsv3_sysctl_max_unacked_bytes;

	/* Mark messages as retransmissions, and move them to the send q */
	mutex_enter(&conn->c_lock);
	RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_retrans, m_conn_item) {
		set_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags);
		set_bit(RDSV3_MSG_RETRANSMITTED, &rm->m_flags);
		if (rm->m_rdma_op && rm->m_rdma_op->r_mapped) {
			RDSV3_DPRINTF4("_send_reset",
			    "RT rm %p mflg 0x%x sgl %p",
			    rm, rm->m_flags,
			    rm->m_rdma_op->r_rdma_sg[0].swr.wr_sgl);
		}
	}
	/*
	 * NOTE(review): this moves the retransmit list onto the send queue
	 * so the messages are picked up again by rdsv3_send_xmit; confirm
	 * list_move_tail(dst, src) argument order against list(9F).
	 */
	list_move_tail(&conn->c_send_queue, &conn->c_retrans);
	mutex_exit(&conn->c_lock);

	RDSV3_DPRINTF4("rdsv3_send_reset", "Return(conn: %p)", conn);
}

/*
 * We're making the concious trade-off here to only send one message
 * down the connection at a
time. 145*c0dd49bdSEiji Ota * Pro: 146*c0dd49bdSEiji Ota * - tx queueing is a simple fifo list 147*c0dd49bdSEiji Ota * - reassembly is optional and easily done by transports per conn 148*c0dd49bdSEiji Ota * - no per flow rx lookup at all, straight to the socket 149*c0dd49bdSEiji Ota * - less per-frag memory and wire overhead 150*c0dd49bdSEiji Ota * Con: 151*c0dd49bdSEiji Ota * - queued acks can be delayed behind large messages 152*c0dd49bdSEiji Ota * Depends: 153*c0dd49bdSEiji Ota * - small message latency is higher behind queued large messages 154*c0dd49bdSEiji Ota * - large message latency isn't starved by intervening small sends 155*c0dd49bdSEiji Ota */ 156*c0dd49bdSEiji Ota int 157*c0dd49bdSEiji Ota rdsv3_send_xmit(struct rdsv3_connection *conn) 158*c0dd49bdSEiji Ota { 159*c0dd49bdSEiji Ota struct rdsv3_message *rm; 160*c0dd49bdSEiji Ota unsigned int tmp; 161*c0dd49bdSEiji Ota unsigned int send_quota = send_batch_count; 162*c0dd49bdSEiji Ota struct rdsv3_scatterlist *sg; 163*c0dd49bdSEiji Ota int ret = 0; 164*c0dd49bdSEiji Ota int was_empty = 0; 165*c0dd49bdSEiji Ota list_t to_be_dropped; 166*c0dd49bdSEiji Ota 167*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_xmit", "Enter(conn: %p)", conn); 168*c0dd49bdSEiji Ota 169*c0dd49bdSEiji Ota list_create(&to_be_dropped, sizeof (struct rdsv3_message), 170*c0dd49bdSEiji Ota offsetof(struct rdsv3_message, m_conn_item)); 171*c0dd49bdSEiji Ota 172*c0dd49bdSEiji Ota /* 173*c0dd49bdSEiji Ota * sendmsg calls here after having queued its message on the send 174*c0dd49bdSEiji Ota * queue. We only have one task feeding the connection at a time. If 175*c0dd49bdSEiji Ota * another thread is already feeding the queue then we back off. This 176*c0dd49bdSEiji Ota * avoids blocking the caller and trading per-connection data between 177*c0dd49bdSEiji Ota * caches per message. 
178*c0dd49bdSEiji Ota * 179*c0dd49bdSEiji Ota * The sem holder will issue a retry if they notice that someone queued 180*c0dd49bdSEiji Ota * a message after they stopped walking the send queue but before they 181*c0dd49bdSEiji Ota * dropped the sem. 182*c0dd49bdSEiji Ota */ 183*c0dd49bdSEiji Ota if (!mutex_tryenter(&conn->c_send_lock)) { 184*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_xmit", 185*c0dd49bdSEiji Ota "Another thread running(conn: %p)", conn); 186*c0dd49bdSEiji Ota rdsv3_stats_inc(s_send_sem_contention); 187*c0dd49bdSEiji Ota ret = -ENOMEM; 188*c0dd49bdSEiji Ota goto out; 189*c0dd49bdSEiji Ota } 190*c0dd49bdSEiji Ota 191*c0dd49bdSEiji Ota if (conn->c_trans->xmit_prepare) 192*c0dd49bdSEiji Ota conn->c_trans->xmit_prepare(conn); 193*c0dd49bdSEiji Ota 194*c0dd49bdSEiji Ota /* 195*c0dd49bdSEiji Ota * spin trying to push headers and data down the connection until 196*c0dd49bdSEiji Ota * the connection doens't make forward progress. 197*c0dd49bdSEiji Ota */ 198*c0dd49bdSEiji Ota while (--send_quota) { 199*c0dd49bdSEiji Ota /* 200*c0dd49bdSEiji Ota * See if need to send a congestion map update if we're 201*c0dd49bdSEiji Ota * between sending messages. The send_sem protects our sole 202*c0dd49bdSEiji Ota * use of c_map_offset and _bytes. 203*c0dd49bdSEiji Ota * Note this is used only by transports that define a special 204*c0dd49bdSEiji Ota * xmit_cong_map function. For all others, we create allocate 205*c0dd49bdSEiji Ota * a cong_map message and treat it just like any other send. 
206*c0dd49bdSEiji Ota */ 207*c0dd49bdSEiji Ota if (conn->c_map_bytes) { 208*c0dd49bdSEiji Ota ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong, 209*c0dd49bdSEiji Ota conn->c_map_offset); 210*c0dd49bdSEiji Ota if (ret <= 0) 211*c0dd49bdSEiji Ota break; 212*c0dd49bdSEiji Ota 213*c0dd49bdSEiji Ota conn->c_map_offset += ret; 214*c0dd49bdSEiji Ota conn->c_map_bytes -= ret; 215*c0dd49bdSEiji Ota if (conn->c_map_bytes) 216*c0dd49bdSEiji Ota continue; 217*c0dd49bdSEiji Ota } 218*c0dd49bdSEiji Ota 219*c0dd49bdSEiji Ota /* 220*c0dd49bdSEiji Ota * If we're done sending the current message, clear the 221*c0dd49bdSEiji Ota * offset and S/G temporaries. 222*c0dd49bdSEiji Ota */ 223*c0dd49bdSEiji Ota rm = conn->c_xmit_rm; 224*c0dd49bdSEiji Ota if (rm != NULL && 225*c0dd49bdSEiji Ota conn->c_xmit_hdr_off == sizeof (struct rdsv3_header) && 226*c0dd49bdSEiji Ota conn->c_xmit_sg == rm->m_nents) { 227*c0dd49bdSEiji Ota conn->c_xmit_rm = NULL; 228*c0dd49bdSEiji Ota conn->c_xmit_sg = 0; 229*c0dd49bdSEiji Ota conn->c_xmit_hdr_off = 0; 230*c0dd49bdSEiji Ota conn->c_xmit_data_off = 0; 231*c0dd49bdSEiji Ota conn->c_xmit_rdma_sent = 0; 232*c0dd49bdSEiji Ota 233*c0dd49bdSEiji Ota /* Release the reference to the previous message. */ 234*c0dd49bdSEiji Ota rdsv3_message_put(rm); 235*c0dd49bdSEiji Ota rm = NULL; 236*c0dd49bdSEiji Ota } 237*c0dd49bdSEiji Ota 238*c0dd49bdSEiji Ota /* If we're asked to send a cong map update, do so. 
*/ 239*c0dd49bdSEiji Ota if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) { 240*c0dd49bdSEiji Ota if (conn->c_trans->xmit_cong_map != NULL) { 241*c0dd49bdSEiji Ota conn->c_map_offset = 0; 242*c0dd49bdSEiji Ota conn->c_map_bytes = 243*c0dd49bdSEiji Ota sizeof (struct rdsv3_header) + 244*c0dd49bdSEiji Ota RDSV3_CONG_MAP_BYTES; 245*c0dd49bdSEiji Ota continue; 246*c0dd49bdSEiji Ota } 247*c0dd49bdSEiji Ota 248*c0dd49bdSEiji Ota rm = rdsv3_cong_update_alloc(conn); 249*c0dd49bdSEiji Ota if (IS_ERR(rm)) { 250*c0dd49bdSEiji Ota ret = PTR_ERR(rm); 251*c0dd49bdSEiji Ota break; 252*c0dd49bdSEiji Ota } 253*c0dd49bdSEiji Ota 254*c0dd49bdSEiji Ota conn->c_xmit_rm = rm; 255*c0dd49bdSEiji Ota } 256*c0dd49bdSEiji Ota 257*c0dd49bdSEiji Ota /* 258*c0dd49bdSEiji Ota * Grab the next message from the send queue, if there is one. 259*c0dd49bdSEiji Ota * 260*c0dd49bdSEiji Ota * c_xmit_rm holds a ref while we're sending this message down 261*c0dd49bdSEiji Ota * the connction. We can use this ref while holding the 262*c0dd49bdSEiji Ota * send_sem.. rdsv3_send_reset() is serialized with it. 263*c0dd49bdSEiji Ota */ 264*c0dd49bdSEiji Ota if (rm == NULL) { 265*c0dd49bdSEiji Ota unsigned int len; 266*c0dd49bdSEiji Ota 267*c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 268*c0dd49bdSEiji Ota 269*c0dd49bdSEiji Ota if (!list_is_empty(&conn->c_send_queue)) { 270*c0dd49bdSEiji Ota rm = list_remove_head(&conn->c_send_queue); 271*c0dd49bdSEiji Ota rdsv3_message_addref(rm); 272*c0dd49bdSEiji Ota 273*c0dd49bdSEiji Ota /* 274*c0dd49bdSEiji Ota * Move the message from the send queue to 275*c0dd49bdSEiji Ota * the retransmit 276*c0dd49bdSEiji Ota * list right away. 
277*c0dd49bdSEiji Ota */ 278*c0dd49bdSEiji Ota list_insert_tail(&conn->c_retrans, rm); 279*c0dd49bdSEiji Ota } 280*c0dd49bdSEiji Ota 281*c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 282*c0dd49bdSEiji Ota 283*c0dd49bdSEiji Ota if (rm == NULL) { 284*c0dd49bdSEiji Ota was_empty = 1; 285*c0dd49bdSEiji Ota break; 286*c0dd49bdSEiji Ota } 287*c0dd49bdSEiji Ota 288*c0dd49bdSEiji Ota /* 289*c0dd49bdSEiji Ota * Unfortunately, the way Infiniband deals with 290*c0dd49bdSEiji Ota * RDMA to a bad MR key is by moving the entire 291*c0dd49bdSEiji Ota * queue pair to error state. We cold possibly 292*c0dd49bdSEiji Ota * recover from that, but right now we drop the 293*c0dd49bdSEiji Ota * connection. 294*c0dd49bdSEiji Ota * Therefore, we never retransmit messages with 295*c0dd49bdSEiji Ota * RDMA ops. 296*c0dd49bdSEiji Ota */ 297*c0dd49bdSEiji Ota if (rm->m_rdma_op && 298*c0dd49bdSEiji Ota test_bit(RDSV3_MSG_RETRANSMITTED, &rm->m_flags)) { 299*c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 300*c0dd49bdSEiji Ota if (test_and_clear_bit(RDSV3_MSG_ON_CONN, 301*c0dd49bdSEiji Ota &rm->m_flags)) 302*c0dd49bdSEiji Ota list_remove_node(&rm->m_conn_item); 303*c0dd49bdSEiji Ota list_insert_tail(&to_be_dropped, rm); 304*c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 305*c0dd49bdSEiji Ota rdsv3_message_put(rm); 306*c0dd49bdSEiji Ota continue; 307*c0dd49bdSEiji Ota } 308*c0dd49bdSEiji Ota 309*c0dd49bdSEiji Ota /* Require an ACK every once in a while */ 310*c0dd49bdSEiji Ota len = ntohl(rm->m_inc.i_hdr.h_len); 311*c0dd49bdSEiji Ota if (conn->c_unacked_packets == 0 || 312*c0dd49bdSEiji Ota conn->c_unacked_bytes < len) { 313*c0dd49bdSEiji Ota set_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags); 314*c0dd49bdSEiji Ota 315*c0dd49bdSEiji Ota conn->c_unacked_packets = 316*c0dd49bdSEiji Ota rdsv3_sysctl_max_unacked_packets; 317*c0dd49bdSEiji Ota conn->c_unacked_bytes = 318*c0dd49bdSEiji Ota rdsv3_sysctl_max_unacked_bytes; 319*c0dd49bdSEiji Ota rdsv3_stats_inc(s_send_ack_required); 320*c0dd49bdSEiji Ota } else { 
321*c0dd49bdSEiji Ota conn->c_unacked_bytes -= len; 322*c0dd49bdSEiji Ota conn->c_unacked_packets--; 323*c0dd49bdSEiji Ota } 324*c0dd49bdSEiji Ota 325*c0dd49bdSEiji Ota conn->c_xmit_rm = rm; 326*c0dd49bdSEiji Ota } 327*c0dd49bdSEiji Ota 328*c0dd49bdSEiji Ota /* 329*c0dd49bdSEiji Ota * Try and send an rdma message. Let's see if we can 330*c0dd49bdSEiji Ota * keep this simple and require that the transport either 331*c0dd49bdSEiji Ota * send the whole rdma or none of it. 332*c0dd49bdSEiji Ota */ 333*c0dd49bdSEiji Ota if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) { 334*c0dd49bdSEiji Ota ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op); 335*c0dd49bdSEiji Ota if (ret) 336*c0dd49bdSEiji Ota break; 337*c0dd49bdSEiji Ota conn->c_xmit_rdma_sent = 1; 338*c0dd49bdSEiji Ota /* 339*c0dd49bdSEiji Ota * The transport owns the mapped memory for now. 340*c0dd49bdSEiji Ota * You can't unmap it while it's on the send queue 341*c0dd49bdSEiji Ota */ 342*c0dd49bdSEiji Ota set_bit(RDSV3_MSG_MAPPED, &rm->m_flags); 343*c0dd49bdSEiji Ota } 344*c0dd49bdSEiji Ota 345*c0dd49bdSEiji Ota if (conn->c_xmit_hdr_off < sizeof (struct rdsv3_header) || 346*c0dd49bdSEiji Ota conn->c_xmit_sg < rm->m_nents) { 347*c0dd49bdSEiji Ota ret = conn->c_trans->xmit(conn, rm, 348*c0dd49bdSEiji Ota conn->c_xmit_hdr_off, 349*c0dd49bdSEiji Ota conn->c_xmit_sg, 350*c0dd49bdSEiji Ota conn->c_xmit_data_off); 351*c0dd49bdSEiji Ota if (ret <= 0) 352*c0dd49bdSEiji Ota break; 353*c0dd49bdSEiji Ota 354*c0dd49bdSEiji Ota if (conn->c_xmit_hdr_off < 355*c0dd49bdSEiji Ota sizeof (struct rdsv3_header)) { 356*c0dd49bdSEiji Ota tmp = min(ret, 357*c0dd49bdSEiji Ota sizeof (struct rdsv3_header) - 358*c0dd49bdSEiji Ota conn->c_xmit_hdr_off); 359*c0dd49bdSEiji Ota conn->c_xmit_hdr_off += tmp; 360*c0dd49bdSEiji Ota ret -= tmp; 361*c0dd49bdSEiji Ota } 362*c0dd49bdSEiji Ota 363*c0dd49bdSEiji Ota sg = &rm->m_sg[conn->c_xmit_sg]; 364*c0dd49bdSEiji Ota while (ret) { 365*c0dd49bdSEiji Ota tmp = min(ret, rdsv3_sg_len(sg) - 366*c0dd49bdSEiji 
Ota conn->c_xmit_data_off); 367*c0dd49bdSEiji Ota conn->c_xmit_data_off += tmp; 368*c0dd49bdSEiji Ota ret -= tmp; 369*c0dd49bdSEiji Ota if (conn->c_xmit_data_off == rdsv3_sg_len(sg)) { 370*c0dd49bdSEiji Ota conn->c_xmit_data_off = 0; 371*c0dd49bdSEiji Ota sg++; 372*c0dd49bdSEiji Ota conn->c_xmit_sg++; 373*c0dd49bdSEiji Ota ASSERT(!(ret != 0 && 374*c0dd49bdSEiji Ota conn->c_xmit_sg == rm->m_nents)); 375*c0dd49bdSEiji Ota } 376*c0dd49bdSEiji Ota } 377*c0dd49bdSEiji Ota } 378*c0dd49bdSEiji Ota } 379*c0dd49bdSEiji Ota 380*c0dd49bdSEiji Ota /* Nuke any messages we decided not to retransmit. */ 381*c0dd49bdSEiji Ota if (!list_is_empty(&to_be_dropped)) 382*c0dd49bdSEiji Ota rdsv3_send_remove_from_sock(&to_be_dropped, RDSV3_RDMA_DROPPED); 383*c0dd49bdSEiji Ota 384*c0dd49bdSEiji Ota if (conn->c_trans->xmit_complete) 385*c0dd49bdSEiji Ota conn->c_trans->xmit_complete(conn); 386*c0dd49bdSEiji Ota 387*c0dd49bdSEiji Ota /* 388*c0dd49bdSEiji Ota * We might be racing with another sender who queued a message but 389*c0dd49bdSEiji Ota * backed off on noticing that we held the c_send_lock. If we check 390*c0dd49bdSEiji Ota * for queued messages after dropping the sem then either we'll 391*c0dd49bdSEiji Ota * see the queued message or the queuer will get the sem. If we 392*c0dd49bdSEiji Ota * notice the queued message then we trigger an immediate retry. 393*c0dd49bdSEiji Ota * 394*c0dd49bdSEiji Ota * We need to be careful only to do this when we stopped processing 395*c0dd49bdSEiji Ota * the send queue because it was empty. It's the only way we 396*c0dd49bdSEiji Ota * stop processing the loop when the transport hasn't taken 397*c0dd49bdSEiji Ota * responsibility for forward progress. 
398*c0dd49bdSEiji Ota */ 399*c0dd49bdSEiji Ota mutex_exit(&conn->c_send_lock); 400*c0dd49bdSEiji Ota 401*c0dd49bdSEiji Ota if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { 402*c0dd49bdSEiji Ota /* 403*c0dd49bdSEiji Ota * We exhausted the send quota, but there's work left to 404*c0dd49bdSEiji Ota * do. Return and (re-)schedule the send worker. 405*c0dd49bdSEiji Ota */ 406*c0dd49bdSEiji Ota ret = -EAGAIN; 407*c0dd49bdSEiji Ota } 408*c0dd49bdSEiji Ota 409*c0dd49bdSEiji Ota if (ret == 0 && was_empty) { 410*c0dd49bdSEiji Ota /* 411*c0dd49bdSEiji Ota * A simple bit test would be way faster than taking the 412*c0dd49bdSEiji Ota * spin lock 413*c0dd49bdSEiji Ota */ 414*c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 415*c0dd49bdSEiji Ota if (!list_is_empty(&conn->c_send_queue)) { 416*c0dd49bdSEiji Ota rdsv3_stats_inc(s_send_sem_queue_raced); 417*c0dd49bdSEiji Ota ret = -EAGAIN; 418*c0dd49bdSEiji Ota } 419*c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 420*c0dd49bdSEiji Ota } 421*c0dd49bdSEiji Ota 422*c0dd49bdSEiji Ota out: 423*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_xmit", "Return(conn: %p, ret: %d)", 424*c0dd49bdSEiji Ota conn, ret); 425*c0dd49bdSEiji Ota return (ret); 426*c0dd49bdSEiji Ota } 427*c0dd49bdSEiji Ota 428*c0dd49bdSEiji Ota static void 429*c0dd49bdSEiji Ota rdsv3_send_sndbuf_remove(struct rdsv3_sock *rs, struct rdsv3_message *rm) 430*c0dd49bdSEiji Ota { 431*c0dd49bdSEiji Ota uint32_t len = ntohl(rm->m_inc.i_hdr.h_len); 432*c0dd49bdSEiji Ota 433*c0dd49bdSEiji Ota ASSERT(mutex_owned(&rs->rs_lock)); 434*c0dd49bdSEiji Ota 435*c0dd49bdSEiji Ota ASSERT(rs->rs_snd_bytes >= len); 436*c0dd49bdSEiji Ota rs->rs_snd_bytes -= len; 437*c0dd49bdSEiji Ota 438*c0dd49bdSEiji Ota if (rs->rs_snd_bytes == 0) 439*c0dd49bdSEiji Ota rdsv3_stats_inc(s_send_queue_empty); 440*c0dd49bdSEiji Ota } 441*c0dd49bdSEiji Ota 442*c0dd49bdSEiji Ota static inline int 443*c0dd49bdSEiji Ota rdsv3_send_is_acked(struct rdsv3_message *rm, uint64_t ack, 444*c0dd49bdSEiji Ota is_acked_func 
is_acked) 445*c0dd49bdSEiji Ota { 446*c0dd49bdSEiji Ota if (is_acked) 447*c0dd49bdSEiji Ota return (is_acked(rm, ack)); 448*c0dd49bdSEiji Ota return (ntohll(rm->m_inc.i_hdr.h_sequence) <= ack); 449*c0dd49bdSEiji Ota } 450*c0dd49bdSEiji Ota 451*c0dd49bdSEiji Ota /* 452*c0dd49bdSEiji Ota * Returns true if there are no messages on the send and retransmit queues 453*c0dd49bdSEiji Ota * which have a sequence number greater than or equal to the given sequence 454*c0dd49bdSEiji Ota * number. 455*c0dd49bdSEiji Ota */ 456*c0dd49bdSEiji Ota int 457*c0dd49bdSEiji Ota rdsv3_send_acked_before(struct rdsv3_connection *conn, uint64_t seq) 458*c0dd49bdSEiji Ota { 459*c0dd49bdSEiji Ota struct rdsv3_message *rm; 460*c0dd49bdSEiji Ota int ret = 1; 461*c0dd49bdSEiji Ota 462*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_acked_before", "Enter(conn: %p)", conn); 463*c0dd49bdSEiji Ota 464*c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 465*c0dd49bdSEiji Ota 466*c0dd49bdSEiji Ota /* XXX - original code spits out warning */ 467*c0dd49bdSEiji Ota rm = list_head(&conn->c_retrans); 468*c0dd49bdSEiji Ota if (ntohll(rm->m_inc.i_hdr.h_sequence) < seq) 469*c0dd49bdSEiji Ota ret = 0; 470*c0dd49bdSEiji Ota 471*c0dd49bdSEiji Ota /* XXX - original code spits out warning */ 472*c0dd49bdSEiji Ota rm = list_head(&conn->c_send_queue); 473*c0dd49bdSEiji Ota if (ntohll(rm->m_inc.i_hdr.h_sequence) < seq) 474*c0dd49bdSEiji Ota ret = 0; 475*c0dd49bdSEiji Ota 476*c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 477*c0dd49bdSEiji Ota 478*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_acked_before", "Return(conn: %p)", conn); 479*c0dd49bdSEiji Ota 480*c0dd49bdSEiji Ota return (ret); 481*c0dd49bdSEiji Ota } 482*c0dd49bdSEiji Ota 483*c0dd49bdSEiji Ota /* 484*c0dd49bdSEiji Ota * This is pretty similar to what happens below in the ACK 485*c0dd49bdSEiji Ota * handling code - except that we call here as soon as we get 486*c0dd49bdSEiji Ota * the IB send completion on the RDMA op and the accompanying 487*c0dd49bdSEiji Ota * 
message.
 */
void
rdsv3_rdma_send_complete(struct rdsv3_message *rm, int status)
{
	struct rdsv3_sock *rs = NULL;
	struct rdsv3_rdma_op *ro;
	struct rdsv3_notifier *notifier;

	RDSV3_DPRINTF4("rdsv3_rdma_send_complete", "Enter(rm: %p)", rm);

	/* m_rs_lock keeps rm->m_rs stable while we read it below. */
	mutex_enter(&rm->m_rs_lock);

	/*
	 * Only queue a notification if the message is still attached to a
	 * socket, the caller asked for notification (r_notify), and nobody
	 * has claimed the notifier yet. Claiming it (NULLing r_notifier)
	 * makes delivery one-shot.
	 */
	ro = rm->m_rdma_op;
	if (test_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags) &&
	    ro && ro->r_notify &&
	    (notifier = ro->r_notifier) != NULL) {
		ro->r_notifier = NULL;
		rs = rm->m_rs;
		/* Hold the socket so it can't go away before the wakeup. */
		rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs));

		notifier->n_status = status;
		mutex_enter(&rs->rs_lock);
		list_insert_tail(&rs->rs_notify_queue, notifier);
		mutex_exit(&rs->rs_lock);
	}

	mutex_exit(&rm->m_rs_lock);

	/* Wake the application outside of m_rs_lock, then drop our hold. */
	if (rs) {
		rdsv3_wake_sk_sleep(rs);
		rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs));
	}

	RDSV3_DPRINTF4("rdsv3_rdma_send_complete", "Return(rm: %p)", rm);
}

/*
 * This is the same as rdsv3_rdma_send_complete except we
 * don't do any locking - we have all the ingredients (message,
 * socket, socket lock) and can just move the notifier.
528*c0dd49bdSEiji Ota */ 529*c0dd49bdSEiji Ota static inline void 530*c0dd49bdSEiji Ota __rdsv3_rdma_send_complete(struct rdsv3_sock *rs, struct rdsv3_message *rm, 531*c0dd49bdSEiji Ota int status) 532*c0dd49bdSEiji Ota { 533*c0dd49bdSEiji Ota struct rdsv3_rdma_op *ro; 534*c0dd49bdSEiji Ota void *ic; 535*c0dd49bdSEiji Ota 536*c0dd49bdSEiji Ota RDSV3_DPRINTF4("__rdsv3_rdma_send_complete", 537*c0dd49bdSEiji Ota "Enter(rs: %p, rm: %p)", rs, rm); 538*c0dd49bdSEiji Ota 539*c0dd49bdSEiji Ota ro = rm->m_rdma_op; 540*c0dd49bdSEiji Ota if (ro && ro->r_notify && ro->r_notifier) { 541*c0dd49bdSEiji Ota ro->r_notifier->n_status = status; 542*c0dd49bdSEiji Ota list_insert_tail(&rs->rs_notify_queue, ro->r_notifier); 543*c0dd49bdSEiji Ota ro->r_notifier = NULL; 544*c0dd49bdSEiji Ota } 545*c0dd49bdSEiji Ota 546*c0dd49bdSEiji Ota /* No need to wake the app - caller does this */ 547*c0dd49bdSEiji Ota } 548*c0dd49bdSEiji Ota 549*c0dd49bdSEiji Ota /* 550*c0dd49bdSEiji Ota * This is called from the IB send completion when we detect 551*c0dd49bdSEiji Ota * a RDMA operation that failed with remote access error. 552*c0dd49bdSEiji Ota * So speed is not an issue here. 
553*c0dd49bdSEiji Ota */ 554*c0dd49bdSEiji Ota struct rdsv3_message * 555*c0dd49bdSEiji Ota rdsv3_send_get_message(struct rdsv3_connection *conn, 556*c0dd49bdSEiji Ota struct rdsv3_rdma_op *op) 557*c0dd49bdSEiji Ota { 558*c0dd49bdSEiji Ota struct rdsv3_message *rm, *tmp, *found = NULL; 559*c0dd49bdSEiji Ota 560*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_get_message", "Enter(conn: %p)", conn); 561*c0dd49bdSEiji Ota 562*c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 563*c0dd49bdSEiji Ota 564*c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_retrans, m_conn_item) { 565*c0dd49bdSEiji Ota if (rm->m_rdma_op == op) { 566*c0dd49bdSEiji Ota atomic_add_32(&rm->m_refcount, 1); 567*c0dd49bdSEiji Ota found = rm; 568*c0dd49bdSEiji Ota goto out; 569*c0dd49bdSEiji Ota } 570*c0dd49bdSEiji Ota } 571*c0dd49bdSEiji Ota 572*c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_send_queue, 573*c0dd49bdSEiji Ota m_conn_item) { 574*c0dd49bdSEiji Ota if (rm->m_rdma_op == op) { 575*c0dd49bdSEiji Ota atomic_add_32(&rm->m_refcount, 1); 576*c0dd49bdSEiji Ota found = rm; 577*c0dd49bdSEiji Ota break; 578*c0dd49bdSEiji Ota } 579*c0dd49bdSEiji Ota } 580*c0dd49bdSEiji Ota 581*c0dd49bdSEiji Ota out: 582*c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 583*c0dd49bdSEiji Ota 584*c0dd49bdSEiji Ota return (found); 585*c0dd49bdSEiji Ota } 586*c0dd49bdSEiji Ota 587*c0dd49bdSEiji Ota /* 588*c0dd49bdSEiji Ota * This removes messages from the socket's list if they're on it. The list 589*c0dd49bdSEiji Ota * argument must be private to the caller, we must be able to modify it 590*c0dd49bdSEiji Ota * without locks. The messages must have a reference held for their 591*c0dd49bdSEiji Ota * position on the list. This function will drop that reference after 592*c0dd49bdSEiji Ota * removing the messages from the 'messages' list regardless of if it found 593*c0dd49bdSEiji Ota * the messages on the socket list or not. 
 */
void
rdsv3_send_remove_from_sock(struct list *messages, int status)
{
	struct rdsv3_sock *rs = NULL;
	struct rdsv3_message *rm;

	RDSV3_DPRINTF4("rdsv3_send_remove_from_sock", "Enter");

	while (!list_is_empty(messages)) {
		rm = list_remove_head(messages);

		/*
		 * If we see this flag cleared then we're *sure* that someone
		 * else beat us to removing it from the sock. If we race
		 * with their flag update we'll get the lock and then really
		 * see that the flag has been cleared.
		 *
		 * The message spinlock makes sure nobody clears rm->m_rs
		 * while we're messing with it. It does not prevent the
		 * message from being removed from the socket, though.
		 */
		mutex_enter(&rm->m_rs_lock);
		if (!test_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags))
			goto unlock_and_drop;

		/*
		 * Messages are processed in socket order; only swap the
		 * held socket (wake + put, then hold) when it changes.
		 */
		if (rs != rm->m_rs) {
			if (rs) {
				rdsv3_wake_sk_sleep(rs);
				rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs));
			}
			rs = rm->m_rs;
			rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs));
		}

		mutex_enter(&rs->rs_lock);
		if (test_and_clear_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags)) {
			struct rdsv3_rdma_op *ro = rm->m_rdma_op;
			struct rdsv3_notifier *notifier;

			list_remove_node(&rm->m_sock_item);
			rdsv3_send_sndbuf_remove(rs, rm);

			/*
			 * Queue the RDMA notifier if a failure status is
			 * being reported or the sender asked for explicit
			 * notification.
			 */
			if (ro &&
			    (notifier = ro->r_notifier) != NULL &&
			    (status || ro->r_notify)) {
				list_insert_tail(&rs->rs_notify_queue,
				    notifier);
				/* Keep the first recorded status only. */
				if (!notifier->n_status)
					notifier->n_status = status;
				rm->m_rdma_op->r_notifier = NULL;
			}
			/* Drop the socket list's reference on the message. */
			rdsv3_message_put(rm);
			rm->m_rs = NULL;
		}
		mutex_exit(&rs->rs_lock);

unlock_and_drop:
		mutex_exit(&rm->m_rs_lock);
		/* Drop the caller's per-list reference (see header comment). */
		rdsv3_message_put(rm);
	}

	/* Wake and release the last socket we were holding, if any. */
	if (rs) {
		rdsv3_wake_sk_sleep(rs);
		rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs));
	}

	RDSV3_DPRINTF4("rdsv3_send_remove_from_sock", "Return");
}

/*
 * Transports call here when they've determined that the receiver queued
 * messages up to, and including, the given sequence number. Messages are
 * moved to the retrans queue when rdsv3_send_xmit picks them off the send
 * queue. This means that in the TCP case, the message may not have been
 * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
 * checks the RDSV3_MSG_HAS_ACK_SEQ bit.
 *
 * XXX It's not clear to me how this is safely serialized with socket
 * destruction. Maybe it should bail if it sees SOCK_DEAD.
 */
void
rdsv3_send_drop_acked(struct rdsv3_connection *conn, uint64_t ack,
    is_acked_func is_acked)
{
	struct rdsv3_message *rm, *tmp;
	list_t list;

	RDSV3_DPRINTF4("rdsv3_send_drop_acked", "Enter(conn: %p)", conn);

	list_create(&list, sizeof (struct rdsv3_message),
	    offsetof(struct rdsv3_message, m_conn_item));

	mutex_enter(&conn->c_lock);

	/*
	 * c_retrans is ordered by sequence, so stop at the first
	 * message that is not yet acked.
	 */
	RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_retrans, m_conn_item) {
		if (!rdsv3_send_is_acked(rm, ack, is_acked))
			break;

		list_remove_node(&rm->m_conn_item);
		list_insert_tail(&list, rm);
		clear_bit(RDSV3_MSG_ON_CONN, &rm->m_flags);
	}

#if 0
	XXX
	/* order flag
updates with spin locks */ 701*c0dd49bdSEiji Ota if (!list_is_empty(&list)) 702*c0dd49bdSEiji Ota smp_mb__after_clear_bit(); 703*c0dd49bdSEiji Ota #endif 704*c0dd49bdSEiji Ota 705*c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 706*c0dd49bdSEiji Ota 707*c0dd49bdSEiji Ota /* now remove the messages from the sock list as needed */ 708*c0dd49bdSEiji Ota rdsv3_send_remove_from_sock(&list, RDSV3_RDMA_SUCCESS); 709*c0dd49bdSEiji Ota 710*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_drop_acked", "Return(conn: %p)", conn); 711*c0dd49bdSEiji Ota } 712*c0dd49bdSEiji Ota 713*c0dd49bdSEiji Ota void 714*c0dd49bdSEiji Ota rdsv3_send_drop_to(struct rdsv3_sock *rs, struct sockaddr_in *dest) 715*c0dd49bdSEiji Ota { 716*c0dd49bdSEiji Ota struct rdsv3_message *rm, *tmp; 717*c0dd49bdSEiji Ota struct rdsv3_connection *conn; 718*c0dd49bdSEiji Ota list_t list; 719*c0dd49bdSEiji Ota int wake = 0; 720*c0dd49bdSEiji Ota 721*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_drop_to", "Enter(rs: %p)", rs); 722*c0dd49bdSEiji Ota 723*c0dd49bdSEiji Ota list_create(&list, sizeof (struct rdsv3_message), 724*c0dd49bdSEiji Ota offsetof(struct rdsv3_message, m_sock_item)); 725*c0dd49bdSEiji Ota 726*c0dd49bdSEiji Ota /* get all the messages we're dropping under the rs lock */ 727*c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock); 728*c0dd49bdSEiji Ota 729*c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &rs->rs_send_queue, 730*c0dd49bdSEiji Ota m_sock_item) { 731*c0dd49bdSEiji Ota if (dest && (dest->sin_addr.s_addr != rm->m_daddr || 732*c0dd49bdSEiji Ota dest->sin_port != rm->m_inc.i_hdr.h_dport)) 733*c0dd49bdSEiji Ota continue; 734*c0dd49bdSEiji Ota 735*c0dd49bdSEiji Ota wake = 1; 736*c0dd49bdSEiji Ota list_remove(&rs->rs_send_queue, rm); 737*c0dd49bdSEiji Ota list_insert_tail(&list, rm); 738*c0dd49bdSEiji Ota rdsv3_send_sndbuf_remove(rs, rm); 739*c0dd49bdSEiji Ota clear_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags); 740*c0dd49bdSEiji Ota } 741*c0dd49bdSEiji Ota 742*c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock); 
743*c0dd49bdSEiji Ota 744*c0dd49bdSEiji Ota conn = NULL; 745*c0dd49bdSEiji Ota 746*c0dd49bdSEiji Ota /* now remove the messages from the conn list as needed */ 747*c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE(rm, &list, m_sock_item) { 748*c0dd49bdSEiji Ota /* 749*c0dd49bdSEiji Ota * We do this here rather than in the loop above, so that 750*c0dd49bdSEiji Ota * we don't have to nest m_rs_lock under rs->rs_lock 751*c0dd49bdSEiji Ota */ 752*c0dd49bdSEiji Ota mutex_enter(&rm->m_rs_lock); 753*c0dd49bdSEiji Ota /* If this is a RDMA operation, notify the app. */ 754*c0dd49bdSEiji Ota __rdsv3_rdma_send_complete(rs, rm, RDSV3_RDMA_CANCELED); 755*c0dd49bdSEiji Ota rm->m_rs = NULL; 756*c0dd49bdSEiji Ota mutex_exit(&rm->m_rs_lock); 757*c0dd49bdSEiji Ota 758*c0dd49bdSEiji Ota /* 759*c0dd49bdSEiji Ota * If we see this flag cleared then we're *sure* that someone 760*c0dd49bdSEiji Ota * else beat us to removing it from the conn. If we race 761*c0dd49bdSEiji Ota * with their flag update we'll get the lock and then really 762*c0dd49bdSEiji Ota * see that the flag has been cleared. 
763*c0dd49bdSEiji Ota */ 764*c0dd49bdSEiji Ota if (!test_bit(RDSV3_MSG_ON_CONN, &rm->m_flags)) 765*c0dd49bdSEiji Ota continue; 766*c0dd49bdSEiji Ota 767*c0dd49bdSEiji Ota if (conn != rm->m_inc.i_conn) { 768*c0dd49bdSEiji Ota if (conn) 769*c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 770*c0dd49bdSEiji Ota conn = rm->m_inc.i_conn; 771*c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 772*c0dd49bdSEiji Ota } 773*c0dd49bdSEiji Ota 774*c0dd49bdSEiji Ota if (test_and_clear_bit(RDSV3_MSG_ON_CONN, &rm->m_flags)) { 775*c0dd49bdSEiji Ota list_remove_node(&rm->m_conn_item); 776*c0dd49bdSEiji Ota rdsv3_message_put(rm); 777*c0dd49bdSEiji Ota } 778*c0dd49bdSEiji Ota } 779*c0dd49bdSEiji Ota 780*c0dd49bdSEiji Ota if (conn) 781*c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 782*c0dd49bdSEiji Ota 783*c0dd49bdSEiji Ota if (wake) 784*c0dd49bdSEiji Ota rdsv3_wake_sk_sleep(rs); 785*c0dd49bdSEiji Ota 786*c0dd49bdSEiji Ota while (!list_is_empty(&list)) { 787*c0dd49bdSEiji Ota rm = list_remove_head(&list); 788*c0dd49bdSEiji Ota 789*c0dd49bdSEiji Ota rdsv3_message_wait(rm); 790*c0dd49bdSEiji Ota rdsv3_message_put(rm); 791*c0dd49bdSEiji Ota } 792*c0dd49bdSEiji Ota 793*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_drop_to", "Return(rs: %p)", rs); 794*c0dd49bdSEiji Ota } 795*c0dd49bdSEiji Ota 796*c0dd49bdSEiji Ota /* 797*c0dd49bdSEiji Ota * we only want this to fire once so we use the callers 'queued'. It's 798*c0dd49bdSEiji Ota * possible that another thread can race with us and remove the 799*c0dd49bdSEiji Ota * message from the flow with RDSV3_CANCEL_SENT_TO. 
800*c0dd49bdSEiji Ota */ 801*c0dd49bdSEiji Ota static int 802*c0dd49bdSEiji Ota rdsv3_send_queue_rm(struct rdsv3_sock *rs, struct rdsv3_connection *conn, 803*c0dd49bdSEiji Ota struct rdsv3_message *rm, uint16_be_t sport, 804*c0dd49bdSEiji Ota uint16_be_t dport, int *queued) 805*c0dd49bdSEiji Ota { 806*c0dd49bdSEiji Ota uint32_t len; 807*c0dd49bdSEiji Ota 808*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_queue_rm", "Enter(rs: %p, rm: %p)", rs, rm); 809*c0dd49bdSEiji Ota 810*c0dd49bdSEiji Ota if (*queued) 811*c0dd49bdSEiji Ota goto out; 812*c0dd49bdSEiji Ota 813*c0dd49bdSEiji Ota len = ntohl(rm->m_inc.i_hdr.h_len); 814*c0dd49bdSEiji Ota 815*c0dd49bdSEiji Ota /* 816*c0dd49bdSEiji Ota * this is the only place which holds both the socket's rs_lock 817*c0dd49bdSEiji Ota * and the connection's c_lock 818*c0dd49bdSEiji Ota */ 819*c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock); 820*c0dd49bdSEiji Ota 821*c0dd49bdSEiji Ota /* 822*c0dd49bdSEiji Ota * If there is a little space in sndbuf, we don't queue anything, 823*c0dd49bdSEiji Ota * and userspace gets -EAGAIN. But poll() indicates there's send 824*c0dd49bdSEiji Ota * room. This can lead to bad behavior (spinning) if snd_bytes isn't 825*c0dd49bdSEiji Ota * freed up by incoming acks. So we check the *old* value of 826*c0dd49bdSEiji Ota * rs_snd_bytes here to allow the last msg to exceed the buffer, 827*c0dd49bdSEiji Ota * and poll() now knows no more data can be sent. 828*c0dd49bdSEiji Ota */ 829*c0dd49bdSEiji Ota if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs)) { 830*c0dd49bdSEiji Ota rs->rs_snd_bytes += len; 831*c0dd49bdSEiji Ota 832*c0dd49bdSEiji Ota /* 833*c0dd49bdSEiji Ota * let recv side know we are close to send space exhaustion. 834*c0dd49bdSEiji Ota * This is probably not the optimal way to do it, as this 835*c0dd49bdSEiji Ota * means we set the flag on *all* messages as soon as our 836*c0dd49bdSEiji Ota * throughput hits a certain threshold. 
837*c0dd49bdSEiji Ota */ 838*c0dd49bdSEiji Ota if (rs->rs_snd_bytes >= rdsv3_sk_sndbuf(rs) / 2) 839*c0dd49bdSEiji Ota set_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags); 840*c0dd49bdSEiji Ota 841*c0dd49bdSEiji Ota list_insert_tail(&rs->rs_send_queue, rm); 842*c0dd49bdSEiji Ota set_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags); 843*c0dd49bdSEiji Ota 844*c0dd49bdSEiji Ota rdsv3_message_addref(rm); 845*c0dd49bdSEiji Ota rm->m_rs = rs; 846*c0dd49bdSEiji Ota 847*c0dd49bdSEiji Ota /* 848*c0dd49bdSEiji Ota * The code ordering is a little weird, but we're 849*c0dd49bdSEiji Ota * trying to minimize the time we hold c_lock 850*c0dd49bdSEiji Ota */ 851*c0dd49bdSEiji Ota rdsv3_message_populate_header(&rm->m_inc.i_hdr, sport, 852*c0dd49bdSEiji Ota dport, 0); 853*c0dd49bdSEiji Ota rm->m_inc.i_conn = conn; 854*c0dd49bdSEiji Ota rdsv3_message_addref(rm); /* XXX - called twice */ 855*c0dd49bdSEiji Ota 856*c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 857*c0dd49bdSEiji Ota rm->m_inc.i_hdr.h_sequence = htonll(conn->c_next_tx_seq++); 858*c0dd49bdSEiji Ota list_insert_tail(&conn->c_send_queue, rm); 859*c0dd49bdSEiji Ota set_bit(RDSV3_MSG_ON_CONN, &rm->m_flags); 860*c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 861*c0dd49bdSEiji Ota 862*c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_send_queue_rm", 863*c0dd49bdSEiji Ota "queued msg %p len %d, rs %p bytes %d seq %llu", 864*c0dd49bdSEiji Ota rm, len, rs, rs->rs_snd_bytes, 865*c0dd49bdSEiji Ota (unsigned long long)ntohll( 866*c0dd49bdSEiji Ota rm->m_inc.i_hdr.h_sequence)); 867*c0dd49bdSEiji Ota 868*c0dd49bdSEiji Ota *queued = 1; 869*c0dd49bdSEiji Ota } 870*c0dd49bdSEiji Ota 871*c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock); 872*c0dd49bdSEiji Ota 873*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_queue_rm", "Return(rs: %p)", rs); 874*c0dd49bdSEiji Ota out: 875*c0dd49bdSEiji Ota return (*queued); 876*c0dd49bdSEiji Ota } 877*c0dd49bdSEiji Ota 878*c0dd49bdSEiji Ota static int 879*c0dd49bdSEiji Ota rdsv3_cmsg_send(struct rdsv3_sock *rs, struct rdsv3_message *rm, 
880*c0dd49bdSEiji Ota struct msghdr *msg, int *allocated_mr) 881*c0dd49bdSEiji Ota { 882*c0dd49bdSEiji Ota struct cmsghdr *cmsg; 883*c0dd49bdSEiji Ota int ret = 0; 884*c0dd49bdSEiji Ota 885*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_cmsg_send", "Enter(rs: %p)", rs); 886*c0dd49bdSEiji Ota 887*c0dd49bdSEiji Ota for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { 888*c0dd49bdSEiji Ota 889*c0dd49bdSEiji Ota if (cmsg->cmsg_level != SOL_RDS) 890*c0dd49bdSEiji Ota continue; 891*c0dd49bdSEiji Ota 892*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_cmsg_send", "cmsg(%p, %p) type %d", 893*c0dd49bdSEiji Ota cmsg, rm, cmsg->cmsg_type); 894*c0dd49bdSEiji Ota /* 895*c0dd49bdSEiji Ota * As a side effect, RDMA_DEST and RDMA_MAP will set 896*c0dd49bdSEiji Ota * rm->m_rdma_cookie and rm->m_rdma_mr. 897*c0dd49bdSEiji Ota */ 898*c0dd49bdSEiji Ota switch (cmsg->cmsg_type) { 899*c0dd49bdSEiji Ota case RDSV3_CMSG_RDMA_ARGS: 900*c0dd49bdSEiji Ota ret = rdsv3_cmsg_rdma_args(rs, rm, cmsg); 901*c0dd49bdSEiji Ota break; 902*c0dd49bdSEiji Ota 903*c0dd49bdSEiji Ota case RDSV3_CMSG_RDMA_DEST: 904*c0dd49bdSEiji Ota ret = rdsv3_cmsg_rdma_dest(rs, rm, cmsg); 905*c0dd49bdSEiji Ota break; 906*c0dd49bdSEiji Ota 907*c0dd49bdSEiji Ota case RDSV3_CMSG_RDMA_MAP: 908*c0dd49bdSEiji Ota ret = rdsv3_cmsg_rdma_map(rs, rm, cmsg); 909*c0dd49bdSEiji Ota if (ret) 910*c0dd49bdSEiji Ota *allocated_mr = 1; 911*c0dd49bdSEiji Ota break; 912*c0dd49bdSEiji Ota 913*c0dd49bdSEiji Ota default: 914*c0dd49bdSEiji Ota return (-EINVAL); 915*c0dd49bdSEiji Ota } 916*c0dd49bdSEiji Ota 917*c0dd49bdSEiji Ota if (ret) 918*c0dd49bdSEiji Ota break; 919*c0dd49bdSEiji Ota } 920*c0dd49bdSEiji Ota 921*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_cmsg_send", "Return(rs: %p)", rs); 922*c0dd49bdSEiji Ota 923*c0dd49bdSEiji Ota return (ret); 924*c0dd49bdSEiji Ota } 925*c0dd49bdSEiji Ota 926*c0dd49bdSEiji Ota int 927*c0dd49bdSEiji Ota rdsv3_sendmsg(struct rdsv3_sock *rs, uio_t *uio, struct nmsghdr *msg, 928*c0dd49bdSEiji Ota size_t 
payload_len) 929*c0dd49bdSEiji Ota { 930*c0dd49bdSEiji Ota struct rsock *sk = rdsv3_rs_to_sk(rs); 931*c0dd49bdSEiji Ota struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; 932*c0dd49bdSEiji Ota uint32_be_t daddr; 933*c0dd49bdSEiji Ota uint16_be_t dport; 934*c0dd49bdSEiji Ota struct rdsv3_message *rm = NULL; 935*c0dd49bdSEiji Ota struct rdsv3_connection *conn; 936*c0dd49bdSEiji Ota int ret = 0; 937*c0dd49bdSEiji Ota int queued = 0, allocated_mr = 0; 938*c0dd49bdSEiji Ota int nonblock = msg->msg_flags & MSG_DONTWAIT; 939*c0dd49bdSEiji Ota long timeo = rdsv3_rcvtimeo(sk, nonblock); 940*c0dd49bdSEiji Ota 941*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_sendmsg", "Enter(rs: %p)", rs); 942*c0dd49bdSEiji Ota 943*c0dd49bdSEiji Ota if (msg->msg_namelen) { 944*c0dd49bdSEiji Ota /* XXX fail non-unicast destination IPs? */ 945*c0dd49bdSEiji Ota if (msg->msg_namelen < sizeof (*usin) || 946*c0dd49bdSEiji Ota usin->sin_family != AF_INET_OFFLOAD) { 947*c0dd49bdSEiji Ota ret = -EINVAL; 948*c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", "returning: %d", -ret); 949*c0dd49bdSEiji Ota goto out; 950*c0dd49bdSEiji Ota } 951*c0dd49bdSEiji Ota daddr = usin->sin_addr.s_addr; 952*c0dd49bdSEiji Ota dport = usin->sin_port; 953*c0dd49bdSEiji Ota } else { 954*c0dd49bdSEiji Ota /* We only care about consistency with ->connect() */ 955*c0dd49bdSEiji Ota mutex_enter(&sk->sk_lock); 956*c0dd49bdSEiji Ota daddr = rs->rs_conn_addr; 957*c0dd49bdSEiji Ota dport = rs->rs_conn_port; 958*c0dd49bdSEiji Ota mutex_exit(&sk->sk_lock); 959*c0dd49bdSEiji Ota } 960*c0dd49bdSEiji Ota 961*c0dd49bdSEiji Ota /* racing with another thread binding seems ok here */ 962*c0dd49bdSEiji Ota if (daddr == 0 || rs->rs_bound_addr == 0) { 963*c0dd49bdSEiji Ota ret = -ENOTCONN; /* XXX not a great errno */ 964*c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", "returning: %d", -ret); 965*c0dd49bdSEiji Ota goto out; 966*c0dd49bdSEiji Ota } 967*c0dd49bdSEiji Ota 968*c0dd49bdSEiji Ota rm = rdsv3_message_copy_from_user(uio, 
payload_len); 969*c0dd49bdSEiji Ota if (IS_ERR(rm)) { 970*c0dd49bdSEiji Ota ret = PTR_ERR(rm); 971*c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 972*c0dd49bdSEiji Ota "rdsv3_message_copy_from_user failed %d", -ret); 973*c0dd49bdSEiji Ota rm = NULL; 974*c0dd49bdSEiji Ota goto out; 975*c0dd49bdSEiji Ota } 976*c0dd49bdSEiji Ota 977*c0dd49bdSEiji Ota rm->m_daddr = daddr; 978*c0dd49bdSEiji Ota 979*c0dd49bdSEiji Ota /* 980*c0dd49bdSEiji Ota * rdsv3_conn_create has a spinlock that runs with IRQ off. 981*c0dd49bdSEiji Ota * Caching the conn in the socket helps a lot. 982*c0dd49bdSEiji Ota */ 983*c0dd49bdSEiji Ota mutex_enter(&rs->rs_conn_lock); 984*c0dd49bdSEiji Ota if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) { 985*c0dd49bdSEiji Ota conn = rs->rs_conn; 986*c0dd49bdSEiji Ota } else { 987*c0dd49bdSEiji Ota conn = rdsv3_conn_create_outgoing(rs->rs_bound_addr, 988*c0dd49bdSEiji Ota daddr, rs->rs_transport, KM_NOSLEEP); 989*c0dd49bdSEiji Ota if (IS_ERR(conn)) { 990*c0dd49bdSEiji Ota mutex_exit(&rs->rs_conn_lock); 991*c0dd49bdSEiji Ota ret = PTR_ERR(conn); 992*c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 993*c0dd49bdSEiji Ota "rdsv3_conn_create_outgoing failed %d", 994*c0dd49bdSEiji Ota -ret); 995*c0dd49bdSEiji Ota goto out; 996*c0dd49bdSEiji Ota } 997*c0dd49bdSEiji Ota rs->rs_conn = conn; 998*c0dd49bdSEiji Ota } 999*c0dd49bdSEiji Ota mutex_exit(&rs->rs_conn_lock); 1000*c0dd49bdSEiji Ota 1001*c0dd49bdSEiji Ota /* Parse any control messages the user may have included. 
*/ 1002*c0dd49bdSEiji Ota ret = rdsv3_cmsg_send(rs, rm, msg, &allocated_mr); 1003*c0dd49bdSEiji Ota if (ret) { 1004*c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 1005*c0dd49bdSEiji Ota "rdsv3_cmsg_send(rs: %p rm: %p msg: %p) returned: %d", 1006*c0dd49bdSEiji Ota rs, rm, msg, ret); 1007*c0dd49bdSEiji Ota goto out; 1008*c0dd49bdSEiji Ota } 1009*c0dd49bdSEiji Ota 1010*c0dd49bdSEiji Ota if ((rm->m_rdma_cookie || rm->m_rdma_op) && 1011*c0dd49bdSEiji Ota conn->c_trans->xmit_rdma == NULL) { 1012*c0dd49bdSEiji Ota RDSV3_DPRINTF0("rdsv3_sendmsg", "rdma_op %p conn xmit_rdma %p", 1013*c0dd49bdSEiji Ota rm->m_rdma_op, conn->c_trans->xmit_rdma); 1014*c0dd49bdSEiji Ota ret = -EOPNOTSUPP; 1015*c0dd49bdSEiji Ota goto out; 1016*c0dd49bdSEiji Ota } 1017*c0dd49bdSEiji Ota 1018*c0dd49bdSEiji Ota /* 1019*c0dd49bdSEiji Ota * If the connection is down, trigger a connect. We may 1020*c0dd49bdSEiji Ota * have scheduled a delayed reconnect however - in this case 1021*c0dd49bdSEiji Ota * we should not interfere. 
1022*c0dd49bdSEiji Ota */ 1023*c0dd49bdSEiji Ota if (rdsv3_conn_state(conn) == RDSV3_CONN_DOWN && 1024*c0dd49bdSEiji Ota !test_and_set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags)) 1025*c0dd49bdSEiji Ota rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0); 1026*c0dd49bdSEiji Ota 1027*c0dd49bdSEiji Ota ret = rdsv3_cong_wait(conn->c_fcong, dport, nonblock, rs); 1028*c0dd49bdSEiji Ota if (ret) { 1029*c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 1030*c0dd49bdSEiji Ota "rdsv3_cong_wait (dport: %d) returned: %d", dport, ret); 1031*c0dd49bdSEiji Ota goto out; 1032*c0dd49bdSEiji Ota } 1033*c0dd49bdSEiji Ota 1034*c0dd49bdSEiji Ota (void) rdsv3_send_queue_rm(rs, conn, rm, rs->rs_bound_port, dport, 1035*c0dd49bdSEiji Ota &queued); 1036*c0dd49bdSEiji Ota if (!queued) { 1037*c0dd49bdSEiji Ota /* rdsv3_stats_inc(s_send_queue_full); */ 1038*c0dd49bdSEiji Ota /* XXX make sure this is reasonable */ 1039*c0dd49bdSEiji Ota if (payload_len > rdsv3_sk_sndbuf(rs)) { 1040*c0dd49bdSEiji Ota ret = -EMSGSIZE; 1041*c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 1042*c0dd49bdSEiji Ota "msgsize(%d) too big, returning: %d", 1043*c0dd49bdSEiji Ota payload_len, -ret); 1044*c0dd49bdSEiji Ota goto out; 1045*c0dd49bdSEiji Ota } 1046*c0dd49bdSEiji Ota if (nonblock) { 1047*c0dd49bdSEiji Ota ret = -EAGAIN; 1048*c0dd49bdSEiji Ota RDSV3_DPRINTF3("rdsv3_sendmsg", 1049*c0dd49bdSEiji Ota "send queue full (%d), returning: %d", 1050*c0dd49bdSEiji Ota payload_len, -ret); 1051*c0dd49bdSEiji Ota goto out; 1052*c0dd49bdSEiji Ota } 1053*c0dd49bdSEiji Ota 1054*c0dd49bdSEiji Ota mutex_enter(&sk->sk_sleep->waitq_mutex); 1055*c0dd49bdSEiji Ota while (!rdsv3_send_queue_rm(rs, conn, rm, rs->rs_bound_port, 1056*c0dd49bdSEiji Ota dport, &queued)) { 1057*c0dd49bdSEiji Ota #if 0 1058*c0dd49bdSEiji Ota ret = cv_timedwait_sig(&sk->sk_sleep->waitq_cv, 1059*c0dd49bdSEiji Ota &sk->sk_sleep->waitq_mutex, 1060*c0dd49bdSEiji Ota timeo * drv_usectohz(1000000) + ddi_get_lbolt()); 1061*c0dd49bdSEiji Ota if (ret <= 0) { 
1062*c0dd49bdSEiji Ota /* signal/timeout pending */ 1063*c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 1064*c0dd49bdSEiji Ota "woke due to signal/timeout: %d", 1065*c0dd49bdSEiji Ota ret); 1066*c0dd49bdSEiji Ota ret = (ret == 0) ? -ERESTART : -ETIMEDOUT; 1067*c0dd49bdSEiji Ota mutex_exit(&sk->sk_sleep->waitq_mutex); 1068*c0dd49bdSEiji Ota goto out; 1069*c0dd49bdSEiji Ota } 1070*c0dd49bdSEiji Ota #else 1071*c0dd49bdSEiji Ota ret = cv_wait_sig(&sk->sk_sleep->waitq_cv, 1072*c0dd49bdSEiji Ota &sk->sk_sleep->waitq_mutex); 1073*c0dd49bdSEiji Ota if (ret == 0) { 1074*c0dd49bdSEiji Ota /* signal/timeout pending */ 1075*c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 1076*c0dd49bdSEiji Ota "woke due to signal: %d", 1077*c0dd49bdSEiji Ota ret); 1078*c0dd49bdSEiji Ota ret = -ERESTART; 1079*c0dd49bdSEiji Ota mutex_exit(&sk->sk_sleep->waitq_mutex); 1080*c0dd49bdSEiji Ota goto out; 1081*c0dd49bdSEiji Ota } 1082*c0dd49bdSEiji Ota #endif 1083*c0dd49bdSEiji Ota } 1084*c0dd49bdSEiji Ota mutex_exit(&sk->sk_sleep->waitq_mutex); 1085*c0dd49bdSEiji Ota 1086*c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_sendmsg", "sendmsg woke queued %d", 1087*c0dd49bdSEiji Ota queued); 1088*c0dd49bdSEiji Ota 1089*c0dd49bdSEiji Ota ASSERT(queued); 1090*c0dd49bdSEiji Ota ret = 0; 1091*c0dd49bdSEiji Ota } 1092*c0dd49bdSEiji Ota 1093*c0dd49bdSEiji Ota /* 1094*c0dd49bdSEiji Ota * By now we've committed to the send. We reuse rdsv3_send_worker() 1095*c0dd49bdSEiji Ota * to retry sends in the rds thread if the transport asks us to. 
1096*c0dd49bdSEiji Ota */ 1097*c0dd49bdSEiji Ota rdsv3_stats_inc(s_send_queued); 1098*c0dd49bdSEiji Ota 1099*c0dd49bdSEiji Ota if (!test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags)) 1100*c0dd49bdSEiji Ota rdsv3_send_worker(&conn->c_send_w.work); 1101*c0dd49bdSEiji Ota 1102*c0dd49bdSEiji Ota rdsv3_message_put(rm); 1103*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_sendmsg", "Return(rs: %p, len: %d)", 1104*c0dd49bdSEiji Ota rs, payload_len); 1105*c0dd49bdSEiji Ota return (payload_len); 1106*c0dd49bdSEiji Ota 1107*c0dd49bdSEiji Ota out: 1108*c0dd49bdSEiji Ota /* 1109*c0dd49bdSEiji Ota * If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. 1110*c0dd49bdSEiji Ota * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN 1111*c0dd49bdSEiji Ota * or in any other way, we need to destroy the MR again 1112*c0dd49bdSEiji Ota */ 1113*c0dd49bdSEiji Ota if (allocated_mr) 1114*c0dd49bdSEiji Ota rdsv3_rdma_unuse(rs, rdsv3_rdma_cookie_key(rm->m_rdma_cookie), 1115*c0dd49bdSEiji Ota 1); 1116*c0dd49bdSEiji Ota 1117*c0dd49bdSEiji Ota if (rm) 1118*c0dd49bdSEiji Ota rdsv3_message_put(rm); 1119*c0dd49bdSEiji Ota return (ret); 1120*c0dd49bdSEiji Ota } 1121*c0dd49bdSEiji Ota 1122*c0dd49bdSEiji Ota /* 1123*c0dd49bdSEiji Ota * Reply to a ping packet. 
1124*c0dd49bdSEiji Ota */ 1125*c0dd49bdSEiji Ota int 1126*c0dd49bdSEiji Ota rdsv3_send_pong(struct rdsv3_connection *conn, uint16_be_t dport) 1127*c0dd49bdSEiji Ota { 1128*c0dd49bdSEiji Ota struct rdsv3_message *rm; 1129*c0dd49bdSEiji Ota int ret = 0; 1130*c0dd49bdSEiji Ota 1131*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_pong", "Enter(conn: %p)", conn); 1132*c0dd49bdSEiji Ota 1133*c0dd49bdSEiji Ota rm = rdsv3_message_alloc(0, KM_NOSLEEP); 1134*c0dd49bdSEiji Ota if (rm == NULL) { 1135*c0dd49bdSEiji Ota ret = -ENOMEM; 1136*c0dd49bdSEiji Ota goto out; 1137*c0dd49bdSEiji Ota } 1138*c0dd49bdSEiji Ota 1139*c0dd49bdSEiji Ota rm->m_daddr = conn->c_faddr; 1140*c0dd49bdSEiji Ota 1141*c0dd49bdSEiji Ota /* 1142*c0dd49bdSEiji Ota * If the connection is down, trigger a connect. We may 1143*c0dd49bdSEiji Ota * have scheduled a delayed reconnect however - in this case 1144*c0dd49bdSEiji Ota * we should not interfere. 1145*c0dd49bdSEiji Ota */ 1146*c0dd49bdSEiji Ota if (rdsv3_conn_state(conn) == RDSV3_CONN_DOWN && 1147*c0dd49bdSEiji Ota !test_and_set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags)) 1148*c0dd49bdSEiji Ota rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0); 1149*c0dd49bdSEiji Ota 1150*c0dd49bdSEiji Ota ret = rdsv3_cong_wait(conn->c_fcong, dport, 1, NULL); 1151*c0dd49bdSEiji Ota if (ret) 1152*c0dd49bdSEiji Ota goto out; 1153*c0dd49bdSEiji Ota 1154*c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 1155*c0dd49bdSEiji Ota list_insert_tail(&conn->c_send_queue, rm); 1156*c0dd49bdSEiji Ota set_bit(RDSV3_MSG_ON_CONN, &rm->m_flags); 1157*c0dd49bdSEiji Ota rdsv3_message_addref(rm); 1158*c0dd49bdSEiji Ota rm->m_inc.i_conn = conn; 1159*c0dd49bdSEiji Ota 1160*c0dd49bdSEiji Ota rdsv3_message_populate_header(&rm->m_inc.i_hdr, 0, dport, 1161*c0dd49bdSEiji Ota conn->c_next_tx_seq); 1162*c0dd49bdSEiji Ota conn->c_next_tx_seq++; 1163*c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 1164*c0dd49bdSEiji Ota 1165*c0dd49bdSEiji Ota rdsv3_stats_inc(s_send_queued); 1166*c0dd49bdSEiji Ota 
rdsv3_stats_inc(s_send_pong); 1167*c0dd49bdSEiji Ota 1168*c0dd49bdSEiji Ota rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); 1169*c0dd49bdSEiji Ota rdsv3_message_put(rm); 1170*c0dd49bdSEiji Ota 1171*c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_pong", "Return(conn: %p)", conn); 1172*c0dd49bdSEiji Ota return (0); 1173*c0dd49bdSEiji Ota 1174*c0dd49bdSEiji Ota out: 1175*c0dd49bdSEiji Ota if (rm) 1176*c0dd49bdSEiji Ota rdsv3_message_put(rm); 1177*c0dd49bdSEiji Ota return (ret); 1178*c0dd49bdSEiji Ota } 1179