1c0dd49bdSEiji Ota /* 2c0dd49bdSEiji Ota * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 3c0dd49bdSEiji Ota */ 4c0dd49bdSEiji Ota 5c0dd49bdSEiji Ota /* 6*16e76cddSagiri * This file contains code imported from the OFED rds source file send.c 7*16e76cddSagiri * Oracle elects to have and use the contents of send.c under and governed 8*16e76cddSagiri * by the OpenIB.org BSD license (see below for full license text). However, 9*16e76cddSagiri * the following notice accompanied the original version of this file: 10*16e76cddSagiri */ 11*16e76cddSagiri 12*16e76cddSagiri /* 13c0dd49bdSEiji Ota * Copyright (c) 2006 Oracle. All rights reserved. 14c0dd49bdSEiji Ota * 15c0dd49bdSEiji Ota * This software is available to you under a choice of one of two 16c0dd49bdSEiji Ota * licenses. You may choose to be licensed under the terms of the GNU 17c0dd49bdSEiji Ota * General Public License (GPL) Version 2, available from the file 18c0dd49bdSEiji Ota * COPYING in the main directory of this source tree, or the 19c0dd49bdSEiji Ota * OpenIB.org BSD license below: 20c0dd49bdSEiji Ota * 21c0dd49bdSEiji Ota * Redistribution and use in source and binary forms, with or 22c0dd49bdSEiji Ota * without modification, are permitted provided that the following 23c0dd49bdSEiji Ota * conditions are met: 24c0dd49bdSEiji Ota * 25c0dd49bdSEiji Ota * - Redistributions of source code must retain the above 26c0dd49bdSEiji Ota * copyright notice, this list of conditions and the following 27c0dd49bdSEiji Ota * disclaimer. 28c0dd49bdSEiji Ota * 29c0dd49bdSEiji Ota * - Redistributions in binary form must reproduce the above 30c0dd49bdSEiji Ota * copyright notice, this list of conditions and the following 31c0dd49bdSEiji Ota * disclaimer in the documentation and/or other materials 32c0dd49bdSEiji Ota * provided with the distribution. 33c0dd49bdSEiji Ota * 34c0dd49bdSEiji Ota * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 35c0dd49bdSEiji Ota * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 36c0dd49bdSEiji Ota * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 37c0dd49bdSEiji Ota * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 38c0dd49bdSEiji Ota * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 39c0dd49bdSEiji Ota * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 40c0dd49bdSEiji Ota * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41c0dd49bdSEiji Ota * SOFTWARE. 42c0dd49bdSEiji Ota * 43c0dd49bdSEiji Ota */ 44c0dd49bdSEiji Ota #include <sys/stropts.h> 45c0dd49bdSEiji Ota #include <sys/systm.h> 46c0dd49bdSEiji Ota 47c0dd49bdSEiji Ota #include <sys/rds.h> 48c0dd49bdSEiji Ota #include <sys/socket.h> 49c0dd49bdSEiji Ota #include <sys/socketvar.h> 50c0dd49bdSEiji Ota 51c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3.h> 52c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdma.h> 53c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 54c0dd49bdSEiji Ota 55c0dd49bdSEiji Ota /* 56c0dd49bdSEiji Ota * When transmitting messages in rdsv3_send_xmit, we need to emerge from 57c0dd49bdSEiji Ota * time to time and briefly release the CPU. Otherwise the softlock watchdog 58c0dd49bdSEiji Ota * will kick our shin. 59c0dd49bdSEiji Ota * Also, it seems fairer to not let one busy connection stall all the 60c0dd49bdSEiji Ota * others. 61c0dd49bdSEiji Ota * 62c0dd49bdSEiji Ota * send_batch_count is the number of times we'll loop in send_xmit. Setting 63c0dd49bdSEiji Ota * it to 0 will restore the old behavior (where we looped until we had 64c0dd49bdSEiji Ota * drained the queue). 65c0dd49bdSEiji Ota */ 66c0dd49bdSEiji Ota static int send_batch_count = 64; 67c0dd49bdSEiji Ota 68c0dd49bdSEiji Ota extern void rdsv3_ib_send_unmap_rdma(void *ic, struct rdsv3_rdma_op *op); 69c0dd49bdSEiji Ota /* 70c0dd49bdSEiji Ota * Reset the send state. Caller must hold c_send_lock when calling here. 71c0dd49bdSEiji Ota */ 72c0dd49bdSEiji Ota void 73c0dd49bdSEiji Ota rdsv3_send_reset(struct rdsv3_connection *conn) 74c0dd49bdSEiji Ota { 75c0dd49bdSEiji Ota struct rdsv3_message *rm, *tmp; 76c0dd49bdSEiji Ota struct rdsv3_rdma_op *ro; 77c0dd49bdSEiji Ota 78c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_reset", "Enter(conn: %p)", conn); 79c0dd49bdSEiji Ota 805d5562f5SEiji Ota ASSERT(MUTEX_HELD(&conn->c_send_lock)); 815d5562f5SEiji Ota 82c0dd49bdSEiji Ota if (conn->c_xmit_rm) { 83c0dd49bdSEiji Ota rm = conn->c_xmit_rm; 84c0dd49bdSEiji Ota ro = rm->m_rdma_op; 85c0dd49bdSEiji Ota if (ro && ro->r_mapped) { 86c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_send_reset", 87c0dd49bdSEiji Ota "rm %p mflg 0x%x map %d mihdl %p sgl %p", 88c0dd49bdSEiji Ota rm, rm->m_flags, ro->r_mapped, 89c0dd49bdSEiji Ota ro->r_rdma_sg[0].mihdl, 90c0dd49bdSEiji Ota ro->r_rdma_sg[0].swr.wr_sgl); 91c0dd49bdSEiji Ota rdsv3_ib_send_unmap_rdma(conn->c_transport_data, ro); 92c0dd49bdSEiji Ota } 93c0dd49bdSEiji Ota /* 94c0dd49bdSEiji Ota * Tell the user the RDMA op is no longer mapped by the 95c0dd49bdSEiji Ota * transport. This isn't entirely true (it's flushed out 96c0dd49bdSEiji Ota * independently) but as the connection is down, there's 97c0dd49bdSEiji Ota * no ongoing RDMA to/from that memory 98c0dd49bdSEiji Ota */ 99c0dd49bdSEiji Ota rdsv3_message_unmapped(conn->c_xmit_rm); 100c0dd49bdSEiji Ota rdsv3_message_put(conn->c_xmit_rm); 101c0dd49bdSEiji Ota conn->c_xmit_rm = NULL; 102c0dd49bdSEiji Ota } 1035d5562f5SEiji Ota 104c0dd49bdSEiji Ota conn->c_xmit_sg = 0; 105c0dd49bdSEiji Ota conn->c_xmit_hdr_off = 0; 106c0dd49bdSEiji Ota conn->c_xmit_data_off = 0; 107c0dd49bdSEiji Ota conn->c_xmit_rdma_sent = 0; 108c0dd49bdSEiji Ota conn->c_map_queued = 0; 109c0dd49bdSEiji Ota 110c0dd49bdSEiji Ota conn->c_unacked_packets = rdsv3_sysctl_max_unacked_packets; 111c0dd49bdSEiji Ota conn->c_unacked_bytes = rdsv3_sysctl_max_unacked_bytes; 112c0dd49bdSEiji Ota 113c0dd49bdSEiji Ota /* Mark messages as retransmissions, and move them to the send q */ 114c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 115c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_retrans, m_conn_item) { 116c0dd49bdSEiji Ota set_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags); 117c0dd49bdSEiji Ota set_bit(RDSV3_MSG_RETRANSMITTED, &rm->m_flags); 118c0dd49bdSEiji Ota if (rm->m_rdma_op && rm->m_rdma_op->r_mapped) { 119c0dd49bdSEiji Ota RDSV3_DPRINTF4("_send_reset", 120c0dd49bdSEiji Ota "RT rm %p mflg 0x%x sgl %p", 121c0dd49bdSEiji Ota rm, rm->m_flags, 122c0dd49bdSEiji Ota rm->m_rdma_op->r_rdma_sg[0].swr.wr_sgl); 123c0dd49bdSEiji Ota } 124c0dd49bdSEiji Ota } 125c0dd49bdSEiji Ota list_move_tail(&conn->c_send_queue, &conn->c_retrans); 126c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 127c0dd49bdSEiji Ota 128c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_reset", "Return(conn: %p)", conn); 129c0dd49bdSEiji Ota } 130c0dd49bdSEiji Ota 131c0dd49bdSEiji Ota /* 132c0dd49bdSEiji Ota * We're making the concious trade-off here to only send one message 133c0dd49bdSEiji Ota * down the connection at a time. 134c0dd49bdSEiji Ota * Pro: 135c0dd49bdSEiji Ota * - tx queueing is a simple fifo list 136c0dd49bdSEiji Ota * - reassembly is optional and easily done by transports per conn 137c0dd49bdSEiji Ota * - no per flow rx lookup at all, straight to the socket 138c0dd49bdSEiji Ota * - less per-frag memory and wire overhead 139c0dd49bdSEiji Ota * Con: 140c0dd49bdSEiji Ota * - queued acks can be delayed behind large messages 141c0dd49bdSEiji Ota * Depends: 142c0dd49bdSEiji Ota * - small message latency is higher behind queued large messages 143c0dd49bdSEiji Ota * - large message latency isn't starved by intervening small sends 144c0dd49bdSEiji Ota */ 145c0dd49bdSEiji Ota int 146c0dd49bdSEiji Ota rdsv3_send_xmit(struct rdsv3_connection *conn) 147c0dd49bdSEiji Ota { 148c0dd49bdSEiji Ota struct rdsv3_message *rm; 149c0dd49bdSEiji Ota unsigned int tmp; 150c0dd49bdSEiji Ota unsigned int send_quota = send_batch_count; 151c0dd49bdSEiji Ota struct rdsv3_scatterlist *sg; 152c0dd49bdSEiji Ota int ret = 0; 153c0dd49bdSEiji Ota int was_empty = 0; 154c0dd49bdSEiji Ota list_t to_be_dropped; 155c0dd49bdSEiji Ota 1565d5562f5SEiji Ota restart: 1575d5562f5SEiji Ota if (!rdsv3_conn_up(conn)) 1585d5562f5SEiji Ota goto out; 1595d5562f5SEiji Ota 160c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_xmit", "Enter(conn: %p)", conn); 161c0dd49bdSEiji Ota 162c0dd49bdSEiji Ota list_create(&to_be_dropped, sizeof (struct rdsv3_message), 163c0dd49bdSEiji Ota offsetof(struct rdsv3_message, m_conn_item)); 164c0dd49bdSEiji Ota 165c0dd49bdSEiji Ota /* 166c0dd49bdSEiji Ota * sendmsg calls here after having queued its message on the send 167c0dd49bdSEiji Ota * queue. We only have one task feeding the connection at a time. If 168c0dd49bdSEiji Ota * another thread is already feeding the queue then we back off. This 169c0dd49bdSEiji Ota * avoids blocking the caller and trading per-connection data between 170c0dd49bdSEiji Ota * caches per message. 171c0dd49bdSEiji Ota */ 172c0dd49bdSEiji Ota if (!mutex_tryenter(&conn->c_send_lock)) { 173c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_xmit", 174c0dd49bdSEiji Ota "Another thread running(conn: %p)", conn); 175c0dd49bdSEiji Ota rdsv3_stats_inc(s_send_sem_contention); 176c0dd49bdSEiji Ota ret = -ENOMEM; 177c0dd49bdSEiji Ota goto out; 178c0dd49bdSEiji Ota } 1795d5562f5SEiji Ota atomic_add_32(&conn->c_senders, 1); 180c0dd49bdSEiji Ota 181c0dd49bdSEiji Ota if (conn->c_trans->xmit_prepare) 182c0dd49bdSEiji Ota conn->c_trans->xmit_prepare(conn); 183c0dd49bdSEiji Ota 184c0dd49bdSEiji Ota /* 185c0dd49bdSEiji Ota * spin trying to push headers and data down the connection until 1865d5562f5SEiji Ota * the connection doesn't make forward progress. 187c0dd49bdSEiji Ota */ 188c0dd49bdSEiji Ota while (--send_quota) { 189c0dd49bdSEiji Ota /* 190c0dd49bdSEiji Ota * See if need to send a congestion map update if we're 191c0dd49bdSEiji Ota * between sending messages. The send_sem protects our sole 192c0dd49bdSEiji Ota * use of c_map_offset and _bytes. 193c0dd49bdSEiji Ota * Note this is used only by transports that define a special 194c0dd49bdSEiji Ota * xmit_cong_map function. For all others, we create allocate 195c0dd49bdSEiji Ota * a cong_map message and treat it just like any other send. 196c0dd49bdSEiji Ota */ 197c0dd49bdSEiji Ota if (conn->c_map_bytes) { 198c0dd49bdSEiji Ota ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong, 199c0dd49bdSEiji Ota conn->c_map_offset); 200c0dd49bdSEiji Ota if (ret <= 0) 201c0dd49bdSEiji Ota break; 202c0dd49bdSEiji Ota 203c0dd49bdSEiji Ota conn->c_map_offset += ret; 204c0dd49bdSEiji Ota conn->c_map_bytes -= ret; 205c0dd49bdSEiji Ota if (conn->c_map_bytes) 206c0dd49bdSEiji Ota continue; 207c0dd49bdSEiji Ota } 208c0dd49bdSEiji Ota 209c0dd49bdSEiji Ota /* 210c0dd49bdSEiji Ota * If we're done sending the current message, clear the 211c0dd49bdSEiji Ota * offset and S/G temporaries. 212c0dd49bdSEiji Ota */ 213c0dd49bdSEiji Ota rm = conn->c_xmit_rm; 214c0dd49bdSEiji Ota if (rm != NULL && 215c0dd49bdSEiji Ota conn->c_xmit_hdr_off == sizeof (struct rdsv3_header) && 216c0dd49bdSEiji Ota conn->c_xmit_sg == rm->m_nents) { 217c0dd49bdSEiji Ota conn->c_xmit_rm = NULL; 218c0dd49bdSEiji Ota conn->c_xmit_sg = 0; 219c0dd49bdSEiji Ota conn->c_xmit_hdr_off = 0; 220c0dd49bdSEiji Ota conn->c_xmit_data_off = 0; 221c0dd49bdSEiji Ota conn->c_xmit_rdma_sent = 0; 222c0dd49bdSEiji Ota 223c0dd49bdSEiji Ota /* Release the reference to the previous message. */ 224c0dd49bdSEiji Ota rdsv3_message_put(rm); 225c0dd49bdSEiji Ota rm = NULL; 226c0dd49bdSEiji Ota } 227c0dd49bdSEiji Ota 228c0dd49bdSEiji Ota /* If we're asked to send a cong map update, do so. */ 229c0dd49bdSEiji Ota if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) { 230c0dd49bdSEiji Ota if (conn->c_trans->xmit_cong_map != NULL) { 231c0dd49bdSEiji Ota conn->c_map_offset = 0; 232c0dd49bdSEiji Ota conn->c_map_bytes = 233c0dd49bdSEiji Ota sizeof (struct rdsv3_header) + 234c0dd49bdSEiji Ota RDSV3_CONG_MAP_BYTES; 235c0dd49bdSEiji Ota continue; 236c0dd49bdSEiji Ota } 237c0dd49bdSEiji Ota 238c0dd49bdSEiji Ota rm = rdsv3_cong_update_alloc(conn); 239c0dd49bdSEiji Ota if (IS_ERR(rm)) { 240c0dd49bdSEiji Ota ret = PTR_ERR(rm); 241c0dd49bdSEiji Ota break; 242c0dd49bdSEiji Ota } 243c0dd49bdSEiji Ota 244c0dd49bdSEiji Ota conn->c_xmit_rm = rm; 245c0dd49bdSEiji Ota } 246c0dd49bdSEiji Ota 247c0dd49bdSEiji Ota /* 248c0dd49bdSEiji Ota * Grab the next message from the send queue, if there is one. 249c0dd49bdSEiji Ota * 250c0dd49bdSEiji Ota * c_xmit_rm holds a ref while we're sending this message down 251c0dd49bdSEiji Ota * the connction. We can use this ref while holding the 252c0dd49bdSEiji Ota * send_sem.. rdsv3_send_reset() is serialized with it. 253c0dd49bdSEiji Ota */ 254c0dd49bdSEiji Ota if (rm == NULL) { 255c0dd49bdSEiji Ota unsigned int len; 256c0dd49bdSEiji Ota 257c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 258c0dd49bdSEiji Ota 259c0dd49bdSEiji Ota if (!list_is_empty(&conn->c_send_queue)) { 260c0dd49bdSEiji Ota rm = list_remove_head(&conn->c_send_queue); 261c0dd49bdSEiji Ota rdsv3_message_addref(rm); 262c0dd49bdSEiji Ota 263c0dd49bdSEiji Ota /* 264c0dd49bdSEiji Ota * Move the message from the send queue to 265c0dd49bdSEiji Ota * the retransmit 266c0dd49bdSEiji Ota * list right away. 267c0dd49bdSEiji Ota */ 268c0dd49bdSEiji Ota list_insert_tail(&conn->c_retrans, rm); 269c0dd49bdSEiji Ota } 270c0dd49bdSEiji Ota 271c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 272c0dd49bdSEiji Ota 273c0dd49bdSEiji Ota if (rm == NULL) { 274c0dd49bdSEiji Ota was_empty = 1; 275c0dd49bdSEiji Ota break; 276c0dd49bdSEiji Ota } 277c0dd49bdSEiji Ota 278c0dd49bdSEiji Ota /* 279c0dd49bdSEiji Ota * Unfortunately, the way Infiniband deals with 280c0dd49bdSEiji Ota * RDMA to a bad MR key is by moving the entire 281c0dd49bdSEiji Ota * queue pair to error state. We cold possibly 282c0dd49bdSEiji Ota * recover from that, but right now we drop the 283c0dd49bdSEiji Ota * connection. 284c0dd49bdSEiji Ota * Therefore, we never retransmit messages with 285c0dd49bdSEiji Ota * RDMA ops. 286c0dd49bdSEiji Ota */ 287c0dd49bdSEiji Ota if (rm->m_rdma_op && 288c0dd49bdSEiji Ota test_bit(RDSV3_MSG_RETRANSMITTED, &rm->m_flags)) { 289c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 290c0dd49bdSEiji Ota if (test_and_clear_bit(RDSV3_MSG_ON_CONN, 291c0dd49bdSEiji Ota &rm->m_flags)) 292c0dd49bdSEiji Ota list_remove_node(&rm->m_conn_item); 293c0dd49bdSEiji Ota list_insert_tail(&to_be_dropped, rm); 294c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 295c0dd49bdSEiji Ota rdsv3_message_put(rm); 296c0dd49bdSEiji Ota continue; 297c0dd49bdSEiji Ota } 298c0dd49bdSEiji Ota 299c0dd49bdSEiji Ota /* Require an ACK every once in a while */ 300c0dd49bdSEiji Ota len = ntohl(rm->m_inc.i_hdr.h_len); 301c0dd49bdSEiji Ota if (conn->c_unacked_packets == 0 || 302c0dd49bdSEiji Ota conn->c_unacked_bytes < len) { 303c0dd49bdSEiji Ota set_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags); 304c0dd49bdSEiji Ota 305c0dd49bdSEiji Ota conn->c_unacked_packets = 306c0dd49bdSEiji Ota rdsv3_sysctl_max_unacked_packets; 307c0dd49bdSEiji Ota conn->c_unacked_bytes = 308c0dd49bdSEiji Ota rdsv3_sysctl_max_unacked_bytes; 309c0dd49bdSEiji Ota rdsv3_stats_inc(s_send_ack_required); 310c0dd49bdSEiji Ota } else { 311c0dd49bdSEiji Ota conn->c_unacked_bytes -= len; 312c0dd49bdSEiji Ota conn->c_unacked_packets--; 313c0dd49bdSEiji Ota } 314c0dd49bdSEiji Ota 315c0dd49bdSEiji Ota conn->c_xmit_rm = rm; 316c0dd49bdSEiji Ota } 317c0dd49bdSEiji Ota 318c0dd49bdSEiji Ota /* 319c0dd49bdSEiji Ota * Try and send an rdma message. Let's see if we can 320c0dd49bdSEiji Ota * keep this simple and require that the transport either 321c0dd49bdSEiji Ota * send the whole rdma or none of it. 322c0dd49bdSEiji Ota */ 323c0dd49bdSEiji Ota if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) { 324c0dd49bdSEiji Ota ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op); 325c0dd49bdSEiji Ota if (ret) 326c0dd49bdSEiji Ota break; 327c0dd49bdSEiji Ota conn->c_xmit_rdma_sent = 1; 328c0dd49bdSEiji Ota /* 329c0dd49bdSEiji Ota * The transport owns the mapped memory for now. 330c0dd49bdSEiji Ota * You can't unmap it while it's on the send queue 331c0dd49bdSEiji Ota */ 332c0dd49bdSEiji Ota set_bit(RDSV3_MSG_MAPPED, &rm->m_flags); 333c0dd49bdSEiji Ota } 334c0dd49bdSEiji Ota 335c0dd49bdSEiji Ota if (conn->c_xmit_hdr_off < sizeof (struct rdsv3_header) || 336c0dd49bdSEiji Ota conn->c_xmit_sg < rm->m_nents) { 337c0dd49bdSEiji Ota ret = conn->c_trans->xmit(conn, rm, 338c0dd49bdSEiji Ota conn->c_xmit_hdr_off, 339c0dd49bdSEiji Ota conn->c_xmit_sg, 340c0dd49bdSEiji Ota conn->c_xmit_data_off); 341c0dd49bdSEiji Ota if (ret <= 0) 342c0dd49bdSEiji Ota break; 343c0dd49bdSEiji Ota 344c0dd49bdSEiji Ota if (conn->c_xmit_hdr_off < 345c0dd49bdSEiji Ota sizeof (struct rdsv3_header)) { 346c0dd49bdSEiji Ota tmp = min(ret, 347c0dd49bdSEiji Ota sizeof (struct rdsv3_header) - 348c0dd49bdSEiji Ota conn->c_xmit_hdr_off); 349c0dd49bdSEiji Ota conn->c_xmit_hdr_off += tmp; 350c0dd49bdSEiji Ota ret -= tmp; 351c0dd49bdSEiji Ota } 352c0dd49bdSEiji Ota 353c0dd49bdSEiji Ota sg = &rm->m_sg[conn->c_xmit_sg]; 354c0dd49bdSEiji Ota while (ret) { 355c0dd49bdSEiji Ota tmp = min(ret, rdsv3_sg_len(sg) - 356c0dd49bdSEiji Ota conn->c_xmit_data_off); 357c0dd49bdSEiji Ota conn->c_xmit_data_off += tmp; 358c0dd49bdSEiji Ota ret -= tmp; 359c0dd49bdSEiji Ota if (conn->c_xmit_data_off == rdsv3_sg_len(sg)) { 360c0dd49bdSEiji Ota conn->c_xmit_data_off = 0; 361c0dd49bdSEiji Ota sg++; 362c0dd49bdSEiji Ota conn->c_xmit_sg++; 363c0dd49bdSEiji Ota ASSERT(!(ret != 0 && 364c0dd49bdSEiji Ota conn->c_xmit_sg == rm->m_nents)); 365c0dd49bdSEiji Ota } 366c0dd49bdSEiji Ota } 367c0dd49bdSEiji Ota } 368c0dd49bdSEiji Ota } 369c0dd49bdSEiji Ota 370c0dd49bdSEiji Ota /* Nuke any messages we decided not to retransmit. */ 371c0dd49bdSEiji Ota if (!list_is_empty(&to_be_dropped)) 372fe817b60SEiji Ota rdsv3_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); 373c0dd49bdSEiji Ota 374c0dd49bdSEiji Ota if (conn->c_trans->xmit_complete) 375c0dd49bdSEiji Ota conn->c_trans->xmit_complete(conn); 376c0dd49bdSEiji Ota 377c0dd49bdSEiji Ota /* 378c0dd49bdSEiji Ota * We might be racing with another sender who queued a message but 379c0dd49bdSEiji Ota * backed off on noticing that we held the c_send_lock. If we check 380c0dd49bdSEiji Ota * for queued messages after dropping the sem then either we'll 381c0dd49bdSEiji Ota * see the queued message or the queuer will get the sem. If we 382c0dd49bdSEiji Ota * notice the queued message then we trigger an immediate retry. 383c0dd49bdSEiji Ota * 384c0dd49bdSEiji Ota * We need to be careful only to do this when we stopped processing 385c0dd49bdSEiji Ota * the send queue because it was empty. It's the only way we 386c0dd49bdSEiji Ota * stop processing the loop when the transport hasn't taken 387c0dd49bdSEiji Ota * responsibility for forward progress. 388c0dd49bdSEiji Ota */ 389c0dd49bdSEiji Ota mutex_exit(&conn->c_send_lock); 390c0dd49bdSEiji Ota 391c0dd49bdSEiji Ota if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { 392c0dd49bdSEiji Ota /* 393c0dd49bdSEiji Ota * We exhausted the send quota, but there's work left to 394c0dd49bdSEiji Ota * do. Return and (re-)schedule the send worker. 395c0dd49bdSEiji Ota */ 396c0dd49bdSEiji Ota ret = -EAGAIN; 397c0dd49bdSEiji Ota } 398c0dd49bdSEiji Ota 3995d5562f5SEiji Ota atomic_dec_32(&conn->c_senders); 4005d5562f5SEiji Ota 401c0dd49bdSEiji Ota if (ret == 0 && was_empty) { 402c0dd49bdSEiji Ota /* 403c0dd49bdSEiji Ota * A simple bit test would be way faster than taking the 404c0dd49bdSEiji Ota * spin lock 405c0dd49bdSEiji Ota */ 406c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 407c0dd49bdSEiji Ota if (!list_is_empty(&conn->c_send_queue)) { 408c0dd49bdSEiji Ota rdsv3_stats_inc(s_send_sem_queue_raced); 409c0dd49bdSEiji Ota ret = -EAGAIN; 410c0dd49bdSEiji Ota } 411c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 412c0dd49bdSEiji Ota } 413c0dd49bdSEiji Ota 414c0dd49bdSEiji Ota out: 415c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_xmit", "Return(conn: %p, ret: %d)", 416c0dd49bdSEiji Ota conn, ret); 417c0dd49bdSEiji Ota return (ret); 418c0dd49bdSEiji Ota } 419c0dd49bdSEiji Ota 420c0dd49bdSEiji Ota static void 421c0dd49bdSEiji Ota rdsv3_send_sndbuf_remove(struct rdsv3_sock *rs, struct rdsv3_message *rm) 422c0dd49bdSEiji Ota { 423c0dd49bdSEiji Ota uint32_t len = ntohl(rm->m_inc.i_hdr.h_len); 424c0dd49bdSEiji Ota 425c0dd49bdSEiji Ota ASSERT(mutex_owned(&rs->rs_lock)); 426c0dd49bdSEiji Ota 427c0dd49bdSEiji Ota ASSERT(rs->rs_snd_bytes >= len); 428c0dd49bdSEiji Ota rs->rs_snd_bytes -= len; 429c0dd49bdSEiji Ota 430c0dd49bdSEiji Ota if (rs->rs_snd_bytes == 0) 431c0dd49bdSEiji Ota rdsv3_stats_inc(s_send_queue_empty); 432c0dd49bdSEiji Ota } 433c0dd49bdSEiji Ota 434c0dd49bdSEiji Ota static inline int 435c0dd49bdSEiji Ota rdsv3_send_is_acked(struct rdsv3_message *rm, uint64_t ack, 436c0dd49bdSEiji Ota is_acked_func is_acked) 437c0dd49bdSEiji Ota { 438c0dd49bdSEiji Ota if (is_acked) 439c0dd49bdSEiji Ota return (is_acked(rm, ack)); 440c0dd49bdSEiji Ota return (ntohll(rm->m_inc.i_hdr.h_sequence) <= ack); 441c0dd49bdSEiji Ota } 442c0dd49bdSEiji Ota 443c0dd49bdSEiji Ota /* 444c0dd49bdSEiji Ota * Returns true if there are no messages on the send and retransmit queues 445c0dd49bdSEiji Ota * which have a sequence number greater than or equal to the given sequence 446c0dd49bdSEiji Ota * number. 447c0dd49bdSEiji Ota */ 448c0dd49bdSEiji Ota int 449c0dd49bdSEiji Ota rdsv3_send_acked_before(struct rdsv3_connection *conn, uint64_t seq) 450c0dd49bdSEiji Ota { 451c0dd49bdSEiji Ota struct rdsv3_message *rm; 452c0dd49bdSEiji Ota int ret = 1; 453c0dd49bdSEiji Ota 454c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_acked_before", "Enter(conn: %p)", conn); 455c0dd49bdSEiji Ota 456c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 457c0dd49bdSEiji Ota 458c0dd49bdSEiji Ota /* XXX - original code spits out warning */ 459c0dd49bdSEiji Ota rm = list_head(&conn->c_retrans); 460c0dd49bdSEiji Ota if (ntohll(rm->m_inc.i_hdr.h_sequence) < seq) 461c0dd49bdSEiji Ota ret = 0; 462c0dd49bdSEiji Ota 463c0dd49bdSEiji Ota /* XXX - original code spits out warning */ 464c0dd49bdSEiji Ota rm = list_head(&conn->c_send_queue); 465c0dd49bdSEiji Ota if (ntohll(rm->m_inc.i_hdr.h_sequence) < seq) 466c0dd49bdSEiji Ota ret = 0; 467c0dd49bdSEiji Ota 468c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 469c0dd49bdSEiji Ota 470c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_acked_before", "Return(conn: %p)", conn); 471c0dd49bdSEiji Ota 472c0dd49bdSEiji Ota return (ret); 473c0dd49bdSEiji Ota } 474c0dd49bdSEiji Ota 475c0dd49bdSEiji Ota /* 476c0dd49bdSEiji Ota * This is pretty similar to what happens below in the ACK 477c0dd49bdSEiji Ota * handling code - except that we call here as soon as we get 478c0dd49bdSEiji Ota * the IB send completion on the RDMA op and the accompanying 479c0dd49bdSEiji Ota * message. 480c0dd49bdSEiji Ota */ 481c0dd49bdSEiji Ota void 482c0dd49bdSEiji Ota rdsv3_rdma_send_complete(struct rdsv3_message *rm, int status) 483c0dd49bdSEiji Ota { 484c0dd49bdSEiji Ota struct rdsv3_sock *rs = NULL; 485c0dd49bdSEiji Ota struct rdsv3_rdma_op *ro; 486c0dd49bdSEiji Ota struct rdsv3_notifier *notifier; 487c0dd49bdSEiji Ota 488c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_rdma_send_complete", "Enter(rm: %p)", rm); 489c0dd49bdSEiji Ota 490c0dd49bdSEiji Ota mutex_enter(&rm->m_rs_lock); 491c0dd49bdSEiji Ota 492c0dd49bdSEiji Ota ro = rm->m_rdma_op; 493c0dd49bdSEiji Ota if (test_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags) && 494cadbfdc3SEiji Ota ro && ro->r_notify && ro->r_notifier) { 495cadbfdc3SEiji Ota notifier = ro->r_notifier; 496c0dd49bdSEiji Ota rs = rm->m_rs; 497c0dd49bdSEiji Ota rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs)); 498c0dd49bdSEiji Ota 499c0dd49bdSEiji Ota notifier->n_status = status; 500c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock); 501c0dd49bdSEiji Ota list_insert_tail(&rs->rs_notify_queue, notifier); 502c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock); 503cadbfdc3SEiji Ota ro->r_notifier = NULL; 504c0dd49bdSEiji Ota } 505c0dd49bdSEiji Ota 506c0dd49bdSEiji Ota mutex_exit(&rm->m_rs_lock); 507c0dd49bdSEiji Ota 508c0dd49bdSEiji Ota if (rs) { 5093f756f37Sagiri struct rsock *sk = rdsv3_rs_to_sk(rs); 5103f756f37Sagiri int error; 5113f756f37Sagiri 512c0dd49bdSEiji Ota rdsv3_wake_sk_sleep(rs); 5133f756f37Sagiri 5143f756f37Sagiri /* wake up anyone waiting in poll */ 5153f756f37Sagiri sk->sk_upcalls->su_recv(sk->sk_upper_handle, NULL, 5163f756f37Sagiri 0, 0, &error, NULL); 5173f756f37Sagiri if (error != 0) { 5183f756f37Sagiri RDSV3_DPRINTF2("rdsv3_recv_incoming", 5193f756f37Sagiri "su_recv returned: %d", error); 5203f756f37Sagiri } 5213f756f37Sagiri 522c0dd49bdSEiji Ota rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs)); 523c0dd49bdSEiji Ota } 524c0dd49bdSEiji Ota 525c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_rdma_send_complete", "Return(rm: %p)", rm); 526c0dd49bdSEiji Ota } 527c0dd49bdSEiji Ota 528c0dd49bdSEiji Ota /* 529c0dd49bdSEiji Ota * This is the same as rdsv3_rdma_send_complete except we 530c0dd49bdSEiji Ota * don't do any locking - we have all the ingredients (message, 531c0dd49bdSEiji Ota * socket, socket lock) and can just move the notifier. 532c0dd49bdSEiji Ota */ 533c0dd49bdSEiji Ota static inline void 534c0dd49bdSEiji Ota __rdsv3_rdma_send_complete(struct rdsv3_sock *rs, struct rdsv3_message *rm, 535c0dd49bdSEiji Ota int status) 536c0dd49bdSEiji Ota { 537c0dd49bdSEiji Ota struct rdsv3_rdma_op *ro; 538c0dd49bdSEiji Ota void *ic; 539c0dd49bdSEiji Ota 540c0dd49bdSEiji Ota RDSV3_DPRINTF4("__rdsv3_rdma_send_complete", 541c0dd49bdSEiji Ota "Enter(rs: %p, rm: %p)", rs, rm); 542c0dd49bdSEiji Ota 543c0dd49bdSEiji Ota ro = rm->m_rdma_op; 544c0dd49bdSEiji Ota if (ro && ro->r_notify && ro->r_notifier) { 545c0dd49bdSEiji Ota ro->r_notifier->n_status = status; 546c0dd49bdSEiji Ota list_insert_tail(&rs->rs_notify_queue, ro->r_notifier); 547c0dd49bdSEiji Ota ro->r_notifier = NULL; 548c0dd49bdSEiji Ota } 549c0dd49bdSEiji Ota 550c0dd49bdSEiji Ota /* No need to wake the app - caller does this */ 551c0dd49bdSEiji Ota } 552c0dd49bdSEiji Ota 553c0dd49bdSEiji Ota /* 554c0dd49bdSEiji Ota * This is called from the IB send completion when we detect 555c0dd49bdSEiji Ota * a RDMA operation that failed with remote access error. 556c0dd49bdSEiji Ota * So speed is not an issue here. 557c0dd49bdSEiji Ota */ 558c0dd49bdSEiji Ota struct rdsv3_message * 559c0dd49bdSEiji Ota rdsv3_send_get_message(struct rdsv3_connection *conn, 560c0dd49bdSEiji Ota struct rdsv3_rdma_op *op) 561c0dd49bdSEiji Ota { 562c0dd49bdSEiji Ota struct rdsv3_message *rm, *tmp, *found = NULL; 563c0dd49bdSEiji Ota 564c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_get_message", "Enter(conn: %p)", conn); 565c0dd49bdSEiji Ota 566c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 567c0dd49bdSEiji Ota 568c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_retrans, m_conn_item) { 569c0dd49bdSEiji Ota if (rm->m_rdma_op == op) { 570c0dd49bdSEiji Ota atomic_add_32(&rm->m_refcount, 1); 571c0dd49bdSEiji Ota found = rm; 572c0dd49bdSEiji Ota goto out; 573c0dd49bdSEiji Ota } 574c0dd49bdSEiji Ota } 575c0dd49bdSEiji Ota 576c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_send_queue, 577c0dd49bdSEiji Ota m_conn_item) { 578c0dd49bdSEiji Ota if (rm->m_rdma_op == op) { 579c0dd49bdSEiji Ota atomic_add_32(&rm->m_refcount, 1); 580c0dd49bdSEiji Ota found = rm; 581c0dd49bdSEiji Ota break; 582c0dd49bdSEiji Ota } 583c0dd49bdSEiji Ota } 584c0dd49bdSEiji Ota 585c0dd49bdSEiji Ota out: 586c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 587c0dd49bdSEiji Ota 588c0dd49bdSEiji Ota return (found); 589c0dd49bdSEiji Ota } 590c0dd49bdSEiji Ota 591c0dd49bdSEiji Ota /* 592c0dd49bdSEiji Ota * This removes messages from the socket's list if they're on it. The list 593c0dd49bdSEiji Ota * argument must be private to the caller, we must be able to modify it 594c0dd49bdSEiji Ota * without locks. The messages must have a reference held for their 595c0dd49bdSEiji Ota * position on the list. This function will drop that reference after 596c0dd49bdSEiji Ota * removing the messages from the 'messages' list regardless of if it found 597c0dd49bdSEiji Ota * the messages on the socket list or not. 598c0dd49bdSEiji Ota */ 599c0dd49bdSEiji Ota void 600c0dd49bdSEiji Ota rdsv3_send_remove_from_sock(struct list *messages, int status) 601c0dd49bdSEiji Ota { 602c0dd49bdSEiji Ota struct rdsv3_sock *rs = NULL; 603c0dd49bdSEiji Ota struct rdsv3_message *rm; 604c0dd49bdSEiji Ota 605c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_remove_from_sock", "Enter"); 606c0dd49bdSEiji Ota 607c0dd49bdSEiji Ota while (!list_is_empty(messages)) { 608cadbfdc3SEiji Ota int was_on_sock = 0; 609c0dd49bdSEiji Ota rm = list_remove_head(messages); 610c0dd49bdSEiji Ota 611c0dd49bdSEiji Ota /* 612c0dd49bdSEiji Ota * If we see this flag cleared then we're *sure* that someone 613c0dd49bdSEiji Ota * else beat us to removing it from the sock. If we race 614c0dd49bdSEiji Ota * with their flag update we'll get the lock and then really 615c0dd49bdSEiji Ota * see that the flag has been cleared. 616c0dd49bdSEiji Ota * 617c0dd49bdSEiji Ota * The message spinlock makes sure nobody clears rm->m_rs 618c0dd49bdSEiji Ota * while we're messing with it. It does not prevent the 619c0dd49bdSEiji Ota * message from being removed from the socket, though. 620c0dd49bdSEiji Ota */ 621c0dd49bdSEiji Ota mutex_enter(&rm->m_rs_lock); 622c0dd49bdSEiji Ota if (!test_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags)) 623c0dd49bdSEiji Ota goto unlock_and_drop; 624c0dd49bdSEiji Ota 625c0dd49bdSEiji Ota if (rs != rm->m_rs) { 626c0dd49bdSEiji Ota if (rs) { 627c0dd49bdSEiji Ota rdsv3_wake_sk_sleep(rs); 628c0dd49bdSEiji Ota rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs)); 629c0dd49bdSEiji Ota } 630c0dd49bdSEiji Ota rs = rm->m_rs; 631c0dd49bdSEiji Ota rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs)); 632c0dd49bdSEiji Ota } 633c0dd49bdSEiji Ota 634c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock); 635c0dd49bdSEiji Ota if (test_and_clear_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags)) { 636c0dd49bdSEiji Ota struct rdsv3_rdma_op *ro = rm->m_rdma_op; 637c0dd49bdSEiji Ota struct rdsv3_notifier *notifier; 638c0dd49bdSEiji Ota 639c0dd49bdSEiji Ota list_remove_node(&rm->m_sock_item); 640c0dd49bdSEiji Ota rdsv3_send_sndbuf_remove(rs, rm); 641cadbfdc3SEiji Ota if (ro && ro->r_notifier && 642c0dd49bdSEiji Ota (status || ro->r_notify)) { 643cadbfdc3SEiji Ota notifier = ro->r_notifier; 644c0dd49bdSEiji Ota list_insert_tail(&rs->rs_notify_queue, 645c0dd49bdSEiji Ota notifier); 646c0dd49bdSEiji Ota if (!notifier->n_status) 647c0dd49bdSEiji Ota notifier->n_status = status; 648c0dd49bdSEiji Ota rm->m_rdma_op->r_notifier = NULL; 649c0dd49bdSEiji Ota } 650cadbfdc3SEiji Ota was_on_sock = 1; 651c0dd49bdSEiji Ota rm->m_rs = NULL; 652c0dd49bdSEiji Ota } 653c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock); 654c0dd49bdSEiji Ota 655c0dd49bdSEiji Ota unlock_and_drop: 656c0dd49bdSEiji Ota mutex_exit(&rm->m_rs_lock); 657c0dd49bdSEiji Ota rdsv3_message_put(rm); 658cadbfdc3SEiji Ota if (was_on_sock) 659cadbfdc3SEiji Ota rdsv3_message_put(rm); 660c0dd49bdSEiji Ota } 661c0dd49bdSEiji Ota 662c0dd49bdSEiji Ota if (rs) { 663c0dd49bdSEiji Ota rdsv3_wake_sk_sleep(rs); 664c0dd49bdSEiji Ota rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs)); 665c0dd49bdSEiji Ota } 666c0dd49bdSEiji Ota 667c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_remove_from_sock", "Return"); 668c0dd49bdSEiji Ota } 669c0dd49bdSEiji Ota 670c0dd49bdSEiji Ota /* 671c0dd49bdSEiji Ota * Transports call here when they've determined that the receiver queued 672c0dd49bdSEiji Ota * messages up to, and including, the given sequence number. Messages are 673c0dd49bdSEiji Ota * moved to the retrans queue when rdsv3_send_xmit picks them off the send 674c0dd49bdSEiji Ota * queue. This means that in the TCP case, the message may not have been 675c0dd49bdSEiji Ota * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked 676c0dd49bdSEiji Ota * checks the RDSV3_MSG_HAS_ACK_SEQ bit. 677c0dd49bdSEiji Ota * 678c0dd49bdSEiji Ota * XXX It's not clear to me how this is safely serialized with socket 679c0dd49bdSEiji Ota * destruction. Maybe it should bail if it sees SOCK_DEAD. 680c0dd49bdSEiji Ota */ 681c0dd49bdSEiji Ota void 682c0dd49bdSEiji Ota rdsv3_send_drop_acked(struct rdsv3_connection *conn, uint64_t ack, 683c0dd49bdSEiji Ota is_acked_func is_acked) 684c0dd49bdSEiji Ota { 685c0dd49bdSEiji Ota struct rdsv3_message *rm, *tmp; 686c0dd49bdSEiji Ota list_t list; 687c0dd49bdSEiji Ota 688c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_drop_acked", "Enter(conn: %p)", conn); 689c0dd49bdSEiji Ota 690c0dd49bdSEiji Ota list_create(&list, sizeof (struct rdsv3_message), 691c0dd49bdSEiji Ota offsetof(struct rdsv3_message, m_conn_item)); 692c0dd49bdSEiji Ota 693c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 694c0dd49bdSEiji Ota 695c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &conn->c_retrans, m_conn_item) { 696c0dd49bdSEiji Ota if (!rdsv3_send_is_acked(rm, ack, is_acked)) 697c0dd49bdSEiji Ota break; 698c0dd49bdSEiji Ota 699c0dd49bdSEiji Ota list_remove_node(&rm->m_conn_item); 700c0dd49bdSEiji Ota list_insert_tail(&list, rm); 701c0dd49bdSEiji Ota clear_bit(RDSV3_MSG_ON_CONN, &rm->m_flags); 702c0dd49bdSEiji Ota } 703c0dd49bdSEiji Ota 704c0dd49bdSEiji Ota #if 0 705c0dd49bdSEiji Ota XXX 706c0dd49bdSEiji Ota /* order flag updates with spin locks */ 707c0dd49bdSEiji Ota if (!list_is_empty(&list)) 708c0dd49bdSEiji Ota smp_mb__after_clear_bit(); 709c0dd49bdSEiji Ota #endif 710c0dd49bdSEiji Ota 711c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 712c0dd49bdSEiji Ota 713c0dd49bdSEiji Ota /* now remove the messages from the sock list as needed */ 714fe817b60SEiji Ota rdsv3_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); 715c0dd49bdSEiji Ota 716c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_drop_acked", "Return(conn: %p)", conn); 717c0dd49bdSEiji Ota } 718c0dd49bdSEiji Ota 719c0dd49bdSEiji Ota void 720c0dd49bdSEiji Ota rdsv3_send_drop_to(struct rdsv3_sock *rs, struct sockaddr_in *dest) 721c0dd49bdSEiji Ota { 722c0dd49bdSEiji Ota struct rdsv3_message *rm, *tmp; 723c0dd49bdSEiji Ota struct rdsv3_connection *conn; 724c0dd49bdSEiji Ota list_t list; 725c0dd49bdSEiji Ota int wake = 0; 726c0dd49bdSEiji Ota 727c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_drop_to", "Enter(rs: %p)", rs); 728c0dd49bdSEiji Ota 729c0dd49bdSEiji Ota list_create(&list, sizeof (struct rdsv3_message), 730c0dd49bdSEiji Ota offsetof(struct rdsv3_message, m_sock_item)); 731c0dd49bdSEiji Ota 732c0dd49bdSEiji Ota /* get all the messages we're dropping under the rs lock */ 733c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock); 734c0dd49bdSEiji Ota 735c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, tmp, &rs->rs_send_queue, 736c0dd49bdSEiji Ota m_sock_item) { 737c0dd49bdSEiji Ota if (dest && (dest->sin_addr.s_addr != rm->m_daddr || 738c0dd49bdSEiji Ota dest->sin_port != rm->m_inc.i_hdr.h_dport)) 739c0dd49bdSEiji Ota continue; 740c0dd49bdSEiji Ota wake = 1; 741c0dd49bdSEiji Ota list_remove(&rs->rs_send_queue, rm); 742c0dd49bdSEiji Ota list_insert_tail(&list, rm); 743c0dd49bdSEiji Ota rdsv3_send_sndbuf_remove(rs, rm); 744c0dd49bdSEiji Ota clear_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags); 745c0dd49bdSEiji Ota } 746c0dd49bdSEiji Ota 747c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock); 748c0dd49bdSEiji Ota 749c0dd49bdSEiji Ota conn = NULL; 750c0dd49bdSEiji Ota 751c0dd49bdSEiji Ota /* now remove the messages from the conn list as needed */ 752c0dd49bdSEiji Ota RDSV3_FOR_EACH_LIST_NODE(rm, &list, m_sock_item) { 753c0dd49bdSEiji Ota /* 754c0dd49bdSEiji Ota * We do this here rather than in the loop above, so that 755c0dd49bdSEiji Ota * we don't have to nest m_rs_lock under rs->rs_lock 756c0dd49bdSEiji Ota */ 757c0dd49bdSEiji Ota mutex_enter(&rm->m_rs_lock); 758c0dd49bdSEiji Ota /* If this is a RDMA operation, notify the app. */ 759fe817b60SEiji Ota __rdsv3_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED); 760c0dd49bdSEiji Ota rm->m_rs = NULL; 761c0dd49bdSEiji Ota mutex_exit(&rm->m_rs_lock); 762c0dd49bdSEiji Ota 763c0dd49bdSEiji Ota /* 764c0dd49bdSEiji Ota * If we see this flag cleared then we're *sure* that someone 765c0dd49bdSEiji Ota * else beat us to removing it from the conn. If we race 766c0dd49bdSEiji Ota * with their flag update we'll get the lock and then really 767c0dd49bdSEiji Ota * see that the flag has been cleared. 768c0dd49bdSEiji Ota */ 769c0dd49bdSEiji Ota if (!test_bit(RDSV3_MSG_ON_CONN, &rm->m_flags)) 770c0dd49bdSEiji Ota continue; 771c0dd49bdSEiji Ota 772c0dd49bdSEiji Ota if (conn != rm->m_inc.i_conn) { 773c0dd49bdSEiji Ota if (conn) 774c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 775c0dd49bdSEiji Ota conn = rm->m_inc.i_conn; 776c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 777c0dd49bdSEiji Ota } 778c0dd49bdSEiji Ota 779c0dd49bdSEiji Ota if (test_and_clear_bit(RDSV3_MSG_ON_CONN, &rm->m_flags)) { 780c0dd49bdSEiji Ota list_remove_node(&rm->m_conn_item); 781c0dd49bdSEiji Ota rdsv3_message_put(rm); 782c0dd49bdSEiji Ota } 783c0dd49bdSEiji Ota } 784c0dd49bdSEiji Ota 785c0dd49bdSEiji Ota if (conn) 786c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 787c0dd49bdSEiji Ota 788c0dd49bdSEiji Ota if (wake) 789c0dd49bdSEiji Ota rdsv3_wake_sk_sleep(rs); 790c0dd49bdSEiji Ota 791c0dd49bdSEiji Ota while (!list_is_empty(&list)) { 792c0dd49bdSEiji Ota rm = list_remove_head(&list); 793c0dd49bdSEiji Ota 794c0dd49bdSEiji Ota rdsv3_message_wait(rm); 795c0dd49bdSEiji Ota rdsv3_message_put(rm); 796c0dd49bdSEiji Ota } 797c0dd49bdSEiji Ota 798c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_drop_to", "Return(rs: %p)", rs); 799c0dd49bdSEiji Ota } 800c0dd49bdSEiji Ota 801c0dd49bdSEiji Ota /* 802c0dd49bdSEiji Ota * we only want this to fire once so we use the callers 'queued'. It's 803c0dd49bdSEiji Ota * possible that another thread can race with us and remove the 804c0dd49bdSEiji Ota * message from the flow with RDSV3_CANCEL_SENT_TO. 805c0dd49bdSEiji Ota */ 806c0dd49bdSEiji Ota static int 807c0dd49bdSEiji Ota rdsv3_send_queue_rm(struct rdsv3_sock *rs, struct rdsv3_connection *conn, 808c0dd49bdSEiji Ota struct rdsv3_message *rm, uint16_be_t sport, 809c0dd49bdSEiji Ota uint16_be_t dport, int *queued) 810c0dd49bdSEiji Ota { 811c0dd49bdSEiji Ota uint32_t len; 812c0dd49bdSEiji Ota 813c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_queue_rm", "Enter(rs: %p, rm: %p)", rs, rm); 814c0dd49bdSEiji Ota 815c0dd49bdSEiji Ota if (*queued) 816c0dd49bdSEiji Ota goto out; 817c0dd49bdSEiji Ota 818c0dd49bdSEiji Ota len = ntohl(rm->m_inc.i_hdr.h_len); 819c0dd49bdSEiji Ota 820c0dd49bdSEiji Ota /* 821c0dd49bdSEiji Ota * this is the only place which holds both the socket's rs_lock 822c0dd49bdSEiji Ota * and the connection's c_lock 823c0dd49bdSEiji Ota */ 824c0dd49bdSEiji Ota mutex_enter(&rs->rs_lock); 825c0dd49bdSEiji Ota 826c0dd49bdSEiji Ota /* 827c0dd49bdSEiji Ota * If there is a little space in sndbuf, we don't queue anything, 828c0dd49bdSEiji Ota * and userspace gets -EAGAIN. But poll() indicates there's send 829c0dd49bdSEiji Ota * room. This can lead to bad behavior (spinning) if snd_bytes isn't 830c0dd49bdSEiji Ota * freed up by incoming acks. So we check the *old* value of 831c0dd49bdSEiji Ota * rs_snd_bytes here to allow the last msg to exceed the buffer, 832c0dd49bdSEiji Ota * and poll() now knows no more data can be sent. 833c0dd49bdSEiji Ota */ 834c0dd49bdSEiji Ota if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs)) { 835c0dd49bdSEiji Ota rs->rs_snd_bytes += len; 836c0dd49bdSEiji Ota 837c0dd49bdSEiji Ota /* 838c0dd49bdSEiji Ota * let recv side know we are close to send space exhaustion. 839c0dd49bdSEiji Ota * This is probably not the optimal way to do it, as this 840c0dd49bdSEiji Ota * means we set the flag on *all* messages as soon as our 841c0dd49bdSEiji Ota * throughput hits a certain threshold. 842c0dd49bdSEiji Ota */ 843c0dd49bdSEiji Ota if (rs->rs_snd_bytes >= rdsv3_sk_sndbuf(rs) / 2) 844c0dd49bdSEiji Ota set_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags); 845c0dd49bdSEiji Ota 846c0dd49bdSEiji Ota list_insert_tail(&rs->rs_send_queue, rm); 847c0dd49bdSEiji Ota set_bit(RDSV3_MSG_ON_SOCK, &rm->m_flags); 848c0dd49bdSEiji Ota 849c0dd49bdSEiji Ota rdsv3_message_addref(rm); 850c0dd49bdSEiji Ota rm->m_rs = rs; 851c0dd49bdSEiji Ota 852c0dd49bdSEiji Ota /* 853c0dd49bdSEiji Ota * The code ordering is a little weird, but we're 854c0dd49bdSEiji Ota * trying to minimize the time we hold c_lock 855c0dd49bdSEiji Ota */ 856c0dd49bdSEiji Ota rdsv3_message_populate_header(&rm->m_inc.i_hdr, sport, 857c0dd49bdSEiji Ota dport, 0); 858c0dd49bdSEiji Ota rm->m_inc.i_conn = conn; 859c0dd49bdSEiji Ota rdsv3_message_addref(rm); /* XXX - called twice */ 860c0dd49bdSEiji Ota 861c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 862c0dd49bdSEiji Ota rm->m_inc.i_hdr.h_sequence = htonll(conn->c_next_tx_seq++); 863c0dd49bdSEiji Ota list_insert_tail(&conn->c_send_queue, rm); 864c0dd49bdSEiji Ota set_bit(RDSV3_MSG_ON_CONN, &rm->m_flags); 865c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 866c0dd49bdSEiji Ota 867c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_send_queue_rm", 868c0dd49bdSEiji Ota "queued msg %p len %d, rs %p bytes %d seq %llu", 869c0dd49bdSEiji Ota rm, len, rs, rs->rs_snd_bytes, 870c0dd49bdSEiji Ota (unsigned long long)ntohll( 871c0dd49bdSEiji Ota rm->m_inc.i_hdr.h_sequence)); 872c0dd49bdSEiji Ota 873c0dd49bdSEiji Ota *queued = 1; 874c0dd49bdSEiji Ota } 875c0dd49bdSEiji Ota 876c0dd49bdSEiji Ota mutex_exit(&rs->rs_lock); 877c0dd49bdSEiji Ota 878c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_queue_rm", "Return(rs: %p)", rs); 879c0dd49bdSEiji Ota out: 880c0dd49bdSEiji Ota return (*queued); 881c0dd49bdSEiji Ota } 882c0dd49bdSEiji Ota 883c0dd49bdSEiji Ota static int 884c0dd49bdSEiji Ota rdsv3_cmsg_send(struct rdsv3_sock *rs, struct rdsv3_message *rm, 885c0dd49bdSEiji Ota struct msghdr *msg, int *allocated_mr) 886c0dd49bdSEiji Ota { 887c0dd49bdSEiji Ota struct cmsghdr *cmsg; 888c0dd49bdSEiji Ota int ret = 0; 889c0dd49bdSEiji Ota 890c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_cmsg_send", "Enter(rs: %p)", rs); 891c0dd49bdSEiji Ota 892c0dd49bdSEiji Ota for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { 893c0dd49bdSEiji Ota 894c0dd49bdSEiji Ota if (cmsg->cmsg_level != SOL_RDS) 895c0dd49bdSEiji Ota continue; 896c0dd49bdSEiji Ota 897c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_cmsg_send", "cmsg(%p, %p) type %d", 898c0dd49bdSEiji Ota cmsg, rm, cmsg->cmsg_type); 899c0dd49bdSEiji Ota /* 900c0dd49bdSEiji Ota * As a side effect, RDMA_DEST and RDMA_MAP will set 901c0dd49bdSEiji Ota * rm->m_rdma_cookie and rm->m_rdma_mr. 902c0dd49bdSEiji Ota */ 903c0dd49bdSEiji Ota switch (cmsg->cmsg_type) { 904fe817b60SEiji Ota case RDS_CMSG_RDMA_ARGS: 905c0dd49bdSEiji Ota ret = rdsv3_cmsg_rdma_args(rs, rm, cmsg); 906c0dd49bdSEiji Ota break; 907c0dd49bdSEiji Ota 908fe817b60SEiji Ota case RDS_CMSG_RDMA_DEST: 909c0dd49bdSEiji Ota ret = rdsv3_cmsg_rdma_dest(rs, rm, cmsg); 910c0dd49bdSEiji Ota break; 911c0dd49bdSEiji Ota 912fe817b60SEiji Ota case RDS_CMSG_RDMA_MAP: 913c0dd49bdSEiji Ota ret = rdsv3_cmsg_rdma_map(rs, rm, cmsg); 914c0dd49bdSEiji Ota if (ret) 915c0dd49bdSEiji Ota *allocated_mr = 1; 916c0dd49bdSEiji Ota break; 917c0dd49bdSEiji Ota 918c0dd49bdSEiji Ota default: 919c0dd49bdSEiji Ota return (-EINVAL); 920c0dd49bdSEiji Ota } 921c0dd49bdSEiji Ota 922c0dd49bdSEiji Ota if (ret) 923c0dd49bdSEiji Ota break; 924c0dd49bdSEiji Ota } 925c0dd49bdSEiji Ota 926c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_cmsg_send", "Return(rs: %p)", rs); 927c0dd49bdSEiji Ota 928c0dd49bdSEiji Ota return (ret); 929c0dd49bdSEiji Ota } 930c0dd49bdSEiji Ota 9313f756f37Sagiri extern unsigned long rdsv3_max_bcopy_size; 9323f756f37Sagiri 933c0dd49bdSEiji Ota int 934c0dd49bdSEiji Ota rdsv3_sendmsg(struct rdsv3_sock *rs, uio_t *uio, struct nmsghdr *msg, 935c0dd49bdSEiji Ota size_t payload_len) 936c0dd49bdSEiji Ota { 937c0dd49bdSEiji Ota struct rsock *sk = rdsv3_rs_to_sk(rs); 938c0dd49bdSEiji Ota struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; 939c0dd49bdSEiji Ota uint32_be_t daddr; 940c0dd49bdSEiji Ota uint16_be_t dport; 941c0dd49bdSEiji Ota struct rdsv3_message *rm = NULL; 942c0dd49bdSEiji Ota struct rdsv3_connection *conn; 943c0dd49bdSEiji Ota int ret = 0; 944c0dd49bdSEiji Ota int queued = 0, allocated_mr = 0; 945c0dd49bdSEiji Ota int nonblock = msg->msg_flags & MSG_DONTWAIT; 946cadbfdc3SEiji Ota long timeo = rdsv3_sndtimeo(sk, nonblock); 947c0dd49bdSEiji Ota 948c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_sendmsg", "Enter(rs: %p)", rs); 949c0dd49bdSEiji Ota 950c0dd49bdSEiji Ota if (msg->msg_namelen) { 951c0dd49bdSEiji Ota /* XXX fail non-unicast destination IPs? */ 952c0dd49bdSEiji Ota if (msg->msg_namelen < sizeof (*usin) || 953c0dd49bdSEiji Ota usin->sin_family != AF_INET_OFFLOAD) { 954c0dd49bdSEiji Ota ret = -EINVAL; 955c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", "returning: %d", -ret); 956c0dd49bdSEiji Ota goto out; 957c0dd49bdSEiji Ota } 958c0dd49bdSEiji Ota daddr = usin->sin_addr.s_addr; 959c0dd49bdSEiji Ota dport = usin->sin_port; 960c0dd49bdSEiji Ota } else { 961c0dd49bdSEiji Ota /* We only care about consistency with ->connect() */ 962c0dd49bdSEiji Ota mutex_enter(&sk->sk_lock); 963c0dd49bdSEiji Ota daddr = rs->rs_conn_addr; 964c0dd49bdSEiji Ota dport = rs->rs_conn_port; 965c0dd49bdSEiji Ota mutex_exit(&sk->sk_lock); 966c0dd49bdSEiji Ota } 967c0dd49bdSEiji Ota 968c0dd49bdSEiji Ota /* racing with another thread binding seems ok here */ 969c0dd49bdSEiji Ota if (daddr == 0 || rs->rs_bound_addr == 0) { 970c0dd49bdSEiji Ota ret = -ENOTCONN; /* XXX not a great errno */ 971c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", "returning: %d", -ret); 972c0dd49bdSEiji Ota goto out; 973c0dd49bdSEiji Ota } 974c0dd49bdSEiji Ota 9753f756f37Sagiri if (payload_len > rdsv3_max_bcopy_size) { 9763f756f37Sagiri RDSV3_DPRINTF2("rdsv3_sendmsg", "Message too large: %d", 9773f756f37Sagiri payload_len); 9783f756f37Sagiri ret = -EMSGSIZE; 9793f756f37Sagiri goto out; 9803f756f37Sagiri } 9813f756f37Sagiri 982c0dd49bdSEiji Ota rm = rdsv3_message_copy_from_user(uio, payload_len); 983c0dd49bdSEiji Ota if (IS_ERR(rm)) { 984c0dd49bdSEiji Ota ret = PTR_ERR(rm); 985c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 986c0dd49bdSEiji Ota "rdsv3_message_copy_from_user failed %d", -ret); 987c0dd49bdSEiji Ota rm = NULL; 988c0dd49bdSEiji Ota goto out; 989c0dd49bdSEiji Ota } 990c0dd49bdSEiji Ota 991c0dd49bdSEiji Ota rm->m_daddr = daddr; 992c0dd49bdSEiji Ota 993cadbfdc3SEiji Ota /* Parse any control messages the user may have included. */ 994cadbfdc3SEiji Ota ret = rdsv3_cmsg_send(rs, rm, msg, &allocated_mr); 995cadbfdc3SEiji Ota if (ret) { 996cadbfdc3SEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 997cadbfdc3SEiji Ota "rdsv3_cmsg_send(rs: %p rm: %p msg: %p) returned: %d", 998cadbfdc3SEiji Ota rs, rm, msg, ret); 999cadbfdc3SEiji Ota goto out; 1000cadbfdc3SEiji Ota } 1001cadbfdc3SEiji Ota 1002c0dd49bdSEiji Ota /* 1003c0dd49bdSEiji Ota * rdsv3_conn_create has a spinlock that runs with IRQ off. 1004c0dd49bdSEiji Ota * Caching the conn in the socket helps a lot. 1005c0dd49bdSEiji Ota */ 1006c0dd49bdSEiji Ota mutex_enter(&rs->rs_conn_lock); 1007c0dd49bdSEiji Ota if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) { 1008c0dd49bdSEiji Ota conn = rs->rs_conn; 1009c0dd49bdSEiji Ota } else { 1010c0dd49bdSEiji Ota conn = rdsv3_conn_create_outgoing(rs->rs_bound_addr, 1011c0dd49bdSEiji Ota daddr, rs->rs_transport, KM_NOSLEEP); 1012c0dd49bdSEiji Ota if (IS_ERR(conn)) { 1013c0dd49bdSEiji Ota mutex_exit(&rs->rs_conn_lock); 1014c0dd49bdSEiji Ota ret = PTR_ERR(conn); 1015c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 1016c0dd49bdSEiji Ota "rdsv3_conn_create_outgoing failed %d", 1017c0dd49bdSEiji Ota -ret); 1018c0dd49bdSEiji Ota goto out; 1019c0dd49bdSEiji Ota } 1020c0dd49bdSEiji Ota rs->rs_conn = conn; 1021c0dd49bdSEiji Ota } 1022c0dd49bdSEiji Ota mutex_exit(&rs->rs_conn_lock); 1023c0dd49bdSEiji Ota 1024c0dd49bdSEiji Ota if ((rm->m_rdma_cookie || rm->m_rdma_op) && 1025c0dd49bdSEiji Ota conn->c_trans->xmit_rdma == NULL) { 10266e18d381Sagiri RDSV3_DPRINTF2("rdsv3_sendmsg", "rdma_op %p conn xmit_rdma %p", 1027c0dd49bdSEiji Ota rm->m_rdma_op, conn->c_trans->xmit_rdma); 1028c0dd49bdSEiji Ota ret = -EOPNOTSUPP; 1029c0dd49bdSEiji Ota goto out; 1030c0dd49bdSEiji Ota } 1031c0dd49bdSEiji Ota 1032c0dd49bdSEiji Ota /* 1033c0dd49bdSEiji Ota * If the connection is down, trigger a connect. We may 1034c0dd49bdSEiji Ota * have scheduled a delayed reconnect however - in this case 1035c0dd49bdSEiji Ota * we should not interfere. 1036c0dd49bdSEiji Ota */ 1037c0dd49bdSEiji Ota if (rdsv3_conn_state(conn) == RDSV3_CONN_DOWN && 1038c0dd49bdSEiji Ota !test_and_set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags)) 1039c0dd49bdSEiji Ota rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0); 1040c0dd49bdSEiji Ota 1041c0dd49bdSEiji Ota ret = rdsv3_cong_wait(conn->c_fcong, dport, nonblock, rs); 1042c0dd49bdSEiji Ota if (ret) { 10435d5562f5SEiji Ota mutex_enter(&rs->rs_congested_lock); 1044cadbfdc3SEiji Ota rs->rs_seen_congestion = 1; 10455d5562f5SEiji Ota cv_signal(&rs->rs_congested_cv); 10465d5562f5SEiji Ota mutex_exit(&rs->rs_congested_lock); 1047cadbfdc3SEiji Ota 1048c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 1049c0dd49bdSEiji Ota "rdsv3_cong_wait (dport: %d) returned: %d", dport, ret); 1050c0dd49bdSEiji Ota goto out; 1051c0dd49bdSEiji Ota } 1052c0dd49bdSEiji Ota 1053c0dd49bdSEiji Ota (void) rdsv3_send_queue_rm(rs, conn, rm, rs->rs_bound_port, dport, 1054c0dd49bdSEiji Ota &queued); 1055c0dd49bdSEiji Ota if (!queued) { 1056c0dd49bdSEiji Ota /* rdsv3_stats_inc(s_send_queue_full); */ 1057c0dd49bdSEiji Ota /* XXX make sure this is reasonable */ 1058c0dd49bdSEiji Ota if (payload_len > rdsv3_sk_sndbuf(rs)) { 1059c0dd49bdSEiji Ota ret = -EMSGSIZE; 1060c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 1061c0dd49bdSEiji Ota "msgsize(%d) too big, returning: %d", 1062c0dd49bdSEiji Ota payload_len, -ret); 1063c0dd49bdSEiji Ota goto out; 1064c0dd49bdSEiji Ota } 1065c0dd49bdSEiji Ota if (nonblock) { 1066c0dd49bdSEiji Ota ret = -EAGAIN; 1067c0dd49bdSEiji Ota RDSV3_DPRINTF3("rdsv3_sendmsg", 1068c0dd49bdSEiji Ota "send queue full (%d), returning: %d", 1069c0dd49bdSEiji Ota payload_len, -ret); 1070c0dd49bdSEiji Ota goto out; 1071c0dd49bdSEiji Ota } 1072c0dd49bdSEiji Ota 1073c0dd49bdSEiji Ota #if 0 10746e18d381Sagiri ret = rdsv3_wait_sig(sk->sk_sleep, 10756e18d381Sagiri (rdsv3_send_queue_rm(rs, conn, rm, rs->rs_bound_port, 10766e18d381Sagiri dport, &queued))); 10776e18d381Sagiri if (ret == 0) { 1078c0dd49bdSEiji Ota /* signal/timeout pending */ 1079c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 10806e18d381Sagiri "woke due to signal: %d", ret); 10816e18d381Sagiri ret = -ERESTART; 1082c0dd49bdSEiji Ota goto out; 1083c0dd49bdSEiji Ota } 1084c0dd49bdSEiji Ota #else 10856e18d381Sagiri mutex_enter(&sk->sk_sleep->waitq_mutex); 10866e18d381Sagiri sk->sk_sleep->waitq_waiters++; 10876e18d381Sagiri while (!rdsv3_send_queue_rm(rs, conn, rm, rs->rs_bound_port, 10886e18d381Sagiri dport, &queued)) { 1089c0dd49bdSEiji Ota ret = cv_wait_sig(&sk->sk_sleep->waitq_cv, 1090c0dd49bdSEiji Ota &sk->sk_sleep->waitq_mutex); 1091c0dd49bdSEiji Ota if (ret == 0) { 1092c0dd49bdSEiji Ota /* signal/timeout pending */ 1093c0dd49bdSEiji Ota RDSV3_DPRINTF2("rdsv3_sendmsg", 10946e18d381Sagiri "woke due to signal: %d", ret); 1095c0dd49bdSEiji Ota ret = -ERESTART; 10966e18d381Sagiri sk->sk_sleep->waitq_waiters--; 1097c0dd49bdSEiji Ota mutex_exit(&sk->sk_sleep->waitq_mutex); 1098c0dd49bdSEiji Ota goto out; 1099c0dd49bdSEiji Ota } 1100c0dd49bdSEiji Ota } 11016e18d381Sagiri sk->sk_sleep->waitq_waiters--; 1102c0dd49bdSEiji Ota mutex_exit(&sk->sk_sleep->waitq_mutex); 11036e18d381Sagiri #endif 1104c0dd49bdSEiji Ota 1105c0dd49bdSEiji Ota RDSV3_DPRINTF5("rdsv3_sendmsg", "sendmsg woke queued %d", 1106c0dd49bdSEiji Ota queued); 1107c0dd49bdSEiji Ota 1108c0dd49bdSEiji Ota ASSERT(queued); 1109c0dd49bdSEiji Ota ret = 0; 1110c0dd49bdSEiji Ota } 1111c0dd49bdSEiji Ota 1112c0dd49bdSEiji Ota /* 1113c0dd49bdSEiji Ota * By now we've committed to the send. We reuse rdsv3_send_worker() 1114c0dd49bdSEiji Ota * to retry sends in the rds thread if the transport asks us to. 1115c0dd49bdSEiji Ota */ 1116c0dd49bdSEiji Ota rdsv3_stats_inc(s_send_queued); 1117c0dd49bdSEiji Ota 1118c0dd49bdSEiji Ota if (!test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags)) 11193f756f37Sagiri (void) rdsv3_send_worker(&conn->c_send_w.work); 1120c0dd49bdSEiji Ota 1121c0dd49bdSEiji Ota rdsv3_message_put(rm); 1122c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_sendmsg", "Return(rs: %p, len: %d)", 1123c0dd49bdSEiji Ota rs, payload_len); 1124c0dd49bdSEiji Ota return (payload_len); 1125c0dd49bdSEiji Ota 1126c0dd49bdSEiji Ota out: 1127c0dd49bdSEiji Ota /* 1128c0dd49bdSEiji Ota * If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. 1129c0dd49bdSEiji Ota * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN 1130c0dd49bdSEiji Ota * or in any other way, we need to destroy the MR again 1131c0dd49bdSEiji Ota */ 1132c0dd49bdSEiji Ota if (allocated_mr) 1133c0dd49bdSEiji Ota rdsv3_rdma_unuse(rs, rdsv3_rdma_cookie_key(rm->m_rdma_cookie), 1134c0dd49bdSEiji Ota 1); 1135c0dd49bdSEiji Ota 1136c0dd49bdSEiji Ota if (rm) 1137c0dd49bdSEiji Ota rdsv3_message_put(rm); 1138c0dd49bdSEiji Ota return (ret); 1139c0dd49bdSEiji Ota } 1140c0dd49bdSEiji Ota 1141c0dd49bdSEiji Ota /* 1142c0dd49bdSEiji Ota * Reply to a ping packet. 1143c0dd49bdSEiji Ota */ 1144c0dd49bdSEiji Ota int 1145c0dd49bdSEiji Ota rdsv3_send_pong(struct rdsv3_connection *conn, uint16_be_t dport) 1146c0dd49bdSEiji Ota { 1147c0dd49bdSEiji Ota struct rdsv3_message *rm; 1148c0dd49bdSEiji Ota int ret = 0; 1149c0dd49bdSEiji Ota 1150c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_pong", "Enter(conn: %p)", conn); 1151c0dd49bdSEiji Ota 1152c0dd49bdSEiji Ota rm = rdsv3_message_alloc(0, KM_NOSLEEP); 11535d5562f5SEiji Ota if (!rm) { 1154c0dd49bdSEiji Ota ret = -ENOMEM; 1155c0dd49bdSEiji Ota goto out; 1156c0dd49bdSEiji Ota } 1157c0dd49bdSEiji Ota 1158c0dd49bdSEiji Ota rm->m_daddr = conn->c_faddr; 1159c0dd49bdSEiji Ota 1160c0dd49bdSEiji Ota /* 1161c0dd49bdSEiji Ota * If the connection is down, trigger a connect. We may 1162c0dd49bdSEiji Ota * have scheduled a delayed reconnect however - in this case 1163c0dd49bdSEiji Ota * we should not interfere. 1164c0dd49bdSEiji Ota */ 1165c0dd49bdSEiji Ota if (rdsv3_conn_state(conn) == RDSV3_CONN_DOWN && 1166c0dd49bdSEiji Ota !test_and_set_bit(RDSV3_RECONNECT_PENDING, &conn->c_flags)) 1167c0dd49bdSEiji Ota rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_conn_w, 0); 1168c0dd49bdSEiji Ota 1169c0dd49bdSEiji Ota ret = rdsv3_cong_wait(conn->c_fcong, dport, 1, NULL); 1170c0dd49bdSEiji Ota if (ret) 1171c0dd49bdSEiji Ota goto out; 1172c0dd49bdSEiji Ota 1173c0dd49bdSEiji Ota mutex_enter(&conn->c_lock); 1174c0dd49bdSEiji Ota list_insert_tail(&conn->c_send_queue, rm); 1175c0dd49bdSEiji Ota set_bit(RDSV3_MSG_ON_CONN, &rm->m_flags); 1176c0dd49bdSEiji Ota rdsv3_message_addref(rm); 1177c0dd49bdSEiji Ota rm->m_inc.i_conn = conn; 1178c0dd49bdSEiji Ota 1179c0dd49bdSEiji Ota rdsv3_message_populate_header(&rm->m_inc.i_hdr, 0, dport, 1180c0dd49bdSEiji Ota conn->c_next_tx_seq); 1181c0dd49bdSEiji Ota conn->c_next_tx_seq++; 1182c0dd49bdSEiji Ota mutex_exit(&conn->c_lock); 1183c0dd49bdSEiji Ota 1184c0dd49bdSEiji Ota rdsv3_stats_inc(s_send_queued); 1185c0dd49bdSEiji Ota rdsv3_stats_inc(s_send_pong); 1186c0dd49bdSEiji Ota 11875d5562f5SEiji Ota if (!test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags)) 11885d5562f5SEiji Ota (void) rdsv3_send_xmit(conn); 11895d5562f5SEiji Ota 1190c0dd49bdSEiji Ota rdsv3_message_put(rm); 1191c0dd49bdSEiji Ota 1192c0dd49bdSEiji Ota RDSV3_DPRINTF4("rdsv3_send_pong", "Return(conn: %p)", conn); 1193c0dd49bdSEiji Ota return (0); 1194c0dd49bdSEiji Ota 1195c0dd49bdSEiji Ota out: 1196c0dd49bdSEiji Ota if (rm) 1197c0dd49bdSEiji Ota rdsv3_message_put(rm); 1198c0dd49bdSEiji Ota return (ret); 1199c0dd49bdSEiji Ota } 1200