1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. 25 * Copyright (c) 2018, Joyent, Inc. 26 */ 27 /* Copyright (c) 1990 Mentat Inc. 
*/ 28 29 #include <sys/types.h> 30 #include <sys/stream.h> 31 #include <sys/stropts.h> 32 #include <sys/strlog.h> 33 #include <sys/strsun.h> 34 #define _SUN_TPI_VERSION 2 35 #include <sys/tihdr.h> 36 #include <sys/timod.h> 37 #include <sys/ddi.h> 38 #include <sys/sunddi.h> 39 #include <sys/strsubr.h> 40 #include <sys/suntpi.h> 41 #include <sys/xti_inet.h> 42 #include <sys/cmn_err.h> 43 #include <sys/kmem.h> 44 #include <sys/cred.h> 45 #include <sys/policy.h> 46 #include <sys/priv.h> 47 #include <sys/ucred.h> 48 #include <sys/zone.h> 49 50 #include <sys/sockio.h> 51 #include <sys/socket.h> 52 #include <sys/socketvar.h> 53 #include <sys/vtrace.h> 54 #include <sys/sdt.h> 55 #include <sys/debug.h> 56 #include <sys/isa_defs.h> 57 #include <sys/random.h> 58 #include <netinet/in.h> 59 #include <netinet/ip6.h> 60 #include <netinet/icmp6.h> 61 #include <netinet/udp.h> 62 63 #include <inet/common.h> 64 #include <inet/ip.h> 65 #include <inet/ip_impl.h> 66 #include <inet/ipsec_impl.h> 67 #include <inet/ip6.h> 68 #include <inet/ip_ire.h> 69 #include <inet/ip_if.h> 70 #include <inet/ip_multi.h> 71 #include <inet/ip_ndp.h> 72 #include <inet/proto_set.h> 73 #include <inet/mib2.h> 74 #include <inet/nd.h> 75 #include <inet/optcom.h> 76 #include <inet/snmpcom.h> 77 #include <inet/kstatcom.h> 78 #include <inet/ipclassifier.h> 79 80 #include <sys/tsol/label.h> 81 #include <sys/tsol/tnet.h> 82 83 #include <inet/rawip_impl.h> 84 85 #include <sys/disp.h> 86 87 /* 88 * Synchronization notes: 89 * 90 * RAWIP is MT and uses the usual kernel synchronization primitives. We use 91 * conn_lock to protect the icmp_t. 92 * 93 * Plumbing notes: 94 * ICMP is always a device driver. For compatibility with mibopen() code 95 * it is possible to I_PUSH "icmp", but that results in pushing a passthrough 96 * dummy module. 
 */

/*
 * Forward declarations.  Everything marked static is private to this file;
 * icmp_opt_set(), icmp_opt_get(), rawip_getsockname() and
 * rawip_getpeername() have external linkage.
 */
static void	icmp_addr_req(queue_t *q, mblk_t *mp);
static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
static void	icmp_bind_proto(icmp_t *icmp);
static int	icmp_build_hdr_template(conn_t *, const in6_addr_t *,
    const in6_addr_t *, uint32_t);
static void	icmp_capability_req(queue_t *q, mblk_t *mp);
static int	icmp_close(queue_t *q, int flags, cred_t *);
static void	icmp_close_free(conn_t *);
static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    int sys_error);
static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
    t_scalar_t tlierr, int sys_error);
static void	icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
    ip_recv_attr_t *);
static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *);
static void	icmp_info_req(queue_t *q, mblk_t *mp);
static void	icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static conn_t	*icmp_open(int family, cred_t *credp, int *err, int flags);
static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp);
static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp);
static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
int		icmp_opt_set(conn_t *connp, uint_t optset_context,
    int level, int name, uint_t inlen,
    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr);
int		icmp_opt_get(conn_t *connp, int level, int name,
    uchar_t *ptr);
static int	icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
static mblk_t	*icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
    const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
static mblk_t	*icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
    mblk_t *, const in6_addr_t *, uint32_t, int *);
static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
    uchar_t *ptr, int len);
static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
static int	icmp_wput(queue_t *q, mblk_t *mp);
static int	icmp_wput_fallback(queue_t *q, mblk_t *mp);
static void	icmp_wput_other(queue_t *q, mblk_t *mp);
static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
static void	icmp_ulp_recv(conn_t *, mblk_t *, uint_t);

/* Per-netstack setup/teardown hooks. */
static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
static void	rawip_stack_fini(netstackid_t stackid, void *arg);

/* Per-netstack kstat support. */
static void	*rawip_kstat_init(netstackid_t stackid);
static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
static int	rawip_kstat_update(kstat_t *kp, int rw);
static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);

/* Common routines for TPI and socket module */
static conn_t	*rawip_do_open(int, cred_t *, int *, int);
static void	rawip_do_close(conn_t *);
static int	rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
static int	rawip_do_unbind(conn_t *);
static int	rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    cred_t *, pid_t);

int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
    socklen_t *, cred_t *);
int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
    socklen_t *, cred_t *);

/*
 * STREAMS module information referenced by every qinit structure in this
 * file.  NOTE(review): positional initializer; per module_info(9S) the
 * fields are id number, name, min/max packet size and hi/lo water marks --
 * confirm against the header before changing any value.
 */
static struct module_info icmp_mod_info =  {
	5707, "icmp", 1, INFPSZ, 512, 128
};

/*
 * Entry points for ICMP as a device.
 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 */
/* Read side for /dev/icmp: only the open and close routines are set. */
static struct qinit icmprinitv4 = {
	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};

/* Read side for /dev/icmp6: identical except for the open routine. */
static struct qinit icmprinitv6 = {
	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};

/* Write side shared by both devices: icmp_wput plus the ip_wsrv service. */
static struct qinit icmpwinit = {
	icmp_wput, ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
};

/* ICMP entry point during fallback */
static struct qinit icmp_fallback_sock_winit = {
	icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};

/* For AF_INET aka /dev/icmp */
struct streamtab icmpinfov4 = {
	&icmprinitv4, &icmpwinit
};

/* For AF_INET6 aka /dev/icmp6 */
struct streamtab icmpinfov6 = {
	&icmprinitv6, &icmpwinit
};

/* Default structure copied into T_INFO_ACK messages */
static struct T_info_ack icmp_g_t_info_ack = {
	T_INFO_ACK,
	IP_MAXPACKET,	/* TSDU_size.  icmp allows maximum size messages. */
	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
	T_INVALID,	/* CDATA_size.  icmp does not support connect data. */
	T_INVALID,	/* DDATA_size.  icmp does not support disconnect data. */
	0,		/* ADDR_size - filled in later. */
	0,		/* OPT_size - not initialized here */
	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
	(XPG4_1|SENDZERO) /* PROVIDER_flag */
};

/*
 * Property "set" callback for the buffer-size tunables ("send_buf" and
 * "recv_buf" in icmp_propinfo_tbl).  Simply forwards to mod_set_buf_prop()
 * with this stack's rawip property table.
 */
static int
icmp_set_buf_prop(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo,
    const char *ifname, const void *pval, uint_t flags)
{
	return (mod_set_buf_prop(stack->netstack_icmp->is_propinfo_tbl,
	    stack, cr, pinfo, ifname, pval, flags));
}

/*
 * Property "get" callback paired with icmp_set_buf_prop(); forwards to
 * mod_get_buf_prop() with this stack's rawip property table.
 */
static int
icmp_get_buf_prop(netstack_t *stack, mod_prop_info_t *pinfo, const char *ifname,
    void *val, uint_t psize, uint_t flags)
{
	return (mod_get_buf_prop(stack->netstack_icmp->is_propinfo_tbl, stack,
	    pinfo, ifname, val, psize, flags));
}

/*
 * All of these are alterable, within the min/max values given, at run time.
 *
 * Note: All those tunables which do not start with "icmp_" are Committed and
 * therefore are public. See PSARC 2010/080.
 *
 * The order of the entries matters: the is_* accessor macros that follow
 * the table index it by position.
 */
static mod_prop_info_t icmp_propinfo_tbl[] = {
	/* tunable - 0 */
	{ "_wroff_extra", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 128, 32}, {32} },

	/* tunable - 1 */
	{ "_ipv4_ttl", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 255, 255}, {255} },

	/* tunable - 2 */
	{ "_ipv6_hoplimit", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
	    {IPV6_DEFAULT_HOPS} },

	/* tunable - 3 */
	{ "_bsd_compat", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_TRUE}, {B_TRUE} },

	/* tunable - 4 */
	{ "send_buf", MOD_PROTO_RAWIP,
	    icmp_set_buf_prop, icmp_get_buf_prop,
	    {4096, 65536, 8192}, {8192} },

	/* tunable - 5 */
	{ "_xmit_lowat", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 65536, 1024}, {1024} },

	/* tunable - 6 */
	{ "recv_buf", MOD_PROTO_RAWIP,
	    icmp_set_buf_prop, icmp_get_buf_prop,
	    {4096, 65536, 8192}, {8192} },

	/* tunable - 7 */
	{ "max_buf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {65536, ULP_MAX_BUF, 256*1024}, {256*1024} },

	/* tunable - 8 */
	{ "_pmtu_discovery", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	/* tunable - 9 */
	{ "_sendto_ignerr", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	/* "?" has no setter; its getter is mod_get_allprop. */
	{ "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} },

	/* Table terminator. */
	{ NULL, 0, NULL, NULL, {0}, {0} }
};

/* Positional accessors into is_propinfo_tbl; keep in sync with the table. */
#define	is_wroff_extra		is_propinfo_tbl[0].prop_cur_uval
#define	is_ipv4_ttl		is_propinfo_tbl[1].prop_cur_uval
#define	is_ipv6_hoplimit	is_propinfo_tbl[2].prop_cur_uval
#define	is_bsd_compat		is_propinfo_tbl[3].prop_cur_bval
#define	is_xmit_hiwat		is_propinfo_tbl[4].prop_cur_uval
#define	is_xmit_lowat		is_propinfo_tbl[5].prop_cur_uval
#define	is_recv_hiwat		is_propinfo_tbl[6].prop_cur_uval
#define	is_max_buf		is_propinfo_tbl[7].prop_cur_uval
#define	is_pmtu_discovery	is_propinfo_tbl[8].prop_cur_bval
#define	is_sendto_ignerr	is_propinfo_tbl[9].prop_cur_bval

typedef	union T_primitives	*t_primp_t;

/*
 * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
 * passed to icmp_wput.
 * It calls IP to verify the local IP address, and calls IP to insert
 * the conn_t in the fanout table.
 * If everything is ok it then sends the T_BIND_ACK back up.
 *
 * The request mblk is reused for the reply: it is grown to hold a
 * T_bind_ack followed by a sin6_t, retyped to M_PCPROTO and, on success,
 * sent back with PRIM_type rewritten to T_BIND_ACK.  On any failure a
 * T_ERROR_ACK is generated through icmp_err_ack() instead.
 */
static void
icmp_tpi_bind(queue_t *q, mblk_t *mp)
{
	int	error;
	struct sockaddr *sa;
	struct T_bind_req *tbr;
	socklen_t	len;
	sin_t	*sin;
	sin6_t	*sin6;
	icmp_t	*icmp;
	conn_t	*connp = Q_TO_CONN(q);
	mblk_t *mp1;
	cred_t *cr;

	/*
	 * All Solaris components should pass a db_credp
	 * for this TPI message, hence we ASSERT.
	 * But in case there is some other M_PROTO that looks
	 * like a TPI message sent by some other kernel
	 * component, we check and return an error.
	 */
	cr = msg_getcred(mp, NULL);
	ASSERT(cr != NULL);
	if (cr == NULL) {
		icmp_err_ack(q, mp, TSYSERR, EINVAL);
		return;
	}

	icmp = connp->conn_icmp;
	/* The message must at least hold a complete T_bind_req. */
	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
		    "icmp_bind: bad req, len %u",
		    (uint_t)(mp->b_wptr - mp->b_rptr));
		icmp_err_ack(q, mp, TPROTO, 0);
		return;
	}

	/*
	 * Unlocked early check; rawip_do_bind() repeats it under conn_lock.
	 */
	if (icmp->icmp_state != TS_UNBND) {
		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
		    "icmp_bind: bad state, %u", icmp->icmp_state);
		icmp_err_ack(q, mp, TOUTSTATE, 0);
		return;
	}

	/*
	 * Reallocate the message to make sure we have enough room for an
	 * address.
	 * The result goes into mp1 so that the original mp is still
	 * available for the error ack if the reallocation fails.
	 */
	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
	if (mp1 == NULL) {
		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
		return;
	}
	mp = mp1;

	/* Reset the message type in preparation for shipping it back. */
	DB_TYPE(mp) = M_PCPROTO;
	tbr = (struct T_bind_req *)mp->b_rptr;
	len = tbr->ADDR_length;
	switch (len) {
	case 0:	/* request for a generic port */
		/*
		 * No address supplied: build a zeroed address of the
		 * connection's family directly after the T_bind_req.
		 */
		tbr->ADDR_offset = sizeof (struct T_bind_req);
		if (connp->conn_family == AF_INET) {
			tbr->ADDR_length = sizeof (sin_t);
			sin = (sin_t *)&tbr[1];
			*sin = sin_null;
			sin->sin_family = AF_INET;
			mp->b_wptr = (uchar_t *)&sin[1];
			sa = (struct sockaddr *)sin;
			len = sizeof (sin_t);
		} else {
			ASSERT(connp->conn_family == AF_INET6);
			tbr->ADDR_length = sizeof (sin6_t);
			sin6 = (sin6_t *)&tbr[1];
			*sin6 = sin6_null;
			sin6->sin6_family = AF_INET6;
			mp->b_wptr = (uchar_t *)&sin6[1];
			sa = (struct sockaddr *)sin6;
			len = sizeof (sin6_t);
		}
		break;

	case sizeof (sin_t):	/* Complete IPv4 address */
		/* A NULL result here is rejected by rawip_do_bind(). */
		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
		    sizeof (sin_t));
		break;

	case sizeof (sin6_t):	/* Complete IPv6 address */
		sa = (struct sockaddr *)mi_offset_param(mp,
		    tbr->ADDR_offset, sizeof (sin6_t));
		break;

	default:
		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
		    "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
		icmp_err_ack(q, mp, TBADADDR, 0);
		return;
	}

	/*
	 * rawip_do_bind() returns a positive errno or a negated TLI error;
	 * map each onto the matching T_ERROR_ACK form.
	 */
	error = rawip_do_bind(connp, sa, len);
	if (error != 0) {
		if (error > 0) {
			icmp_err_ack(q, mp, TSYSERR, error);
		} else {
			icmp_err_ack(q, mp, -error, 0);
		}
	} else {
		tbr->PRIM_type = T_BIND_ACK;
		qreply(q, mp);
	}
}

/*
 * Bind connp to the local address described by sa/len.
 *
 * The address is validated, recorded in the conn_t under conn_lock, a
 * header template is built, and the conn is inserted in the fanout with
 * ip_laddr_fanout_insert().
 *
 * Returns 0 on success, a positive errno (EINVAL, EAFNOSUPPORT,
 * EADDRNOTAVAIL, or whatever the template/fanout routines return), or a
 * negated TLI error (-TOUTSTATE) when the endpoint is not in TS_UNBND.
 * Failures after the conn_t has been modified go through late_error, which
 * restores the unbound state.
 */
static int
rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
{
	sin_t		*sin;
	sin6_t		*sin6;
	icmp_t		*icmp = connp->conn_icmp;
	int		error = 0;
	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
	in_port_t	lport;		/* Network byte order */
	ipaddr_t	v4src;		/* Set if AF_INET */
	in6_addr_t	v6src;
	uint_t		scopeid = 0;
	zoneid_t	zoneid = IPCL_ZONEID(connp);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	if (sa == NULL || !OK_32PTR((char *)sa)) {
		return (EINVAL);
	}

	switch (len) {
	case sizeof (sin_t):    /* Complete IPv4 address */
		sin = (sin_t *)sa;
		if (sin->sin_family != AF_INET ||
		    connp->conn_family != AF_INET) {
			/* TSYSERR, EAFNOSUPPORT */
			return (EAFNOSUPPORT);
		}
		v4src = sin->sin_addr.s_addr;
		IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
		/* INADDR_ANY keeps the IPVL_UNICAST_UP default. */
		if (v4src != INADDR_ANY) {
			laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
			    B_TRUE);
		}
		lport = sin->sin_port;
		break;
	case sizeof (sin6_t): /* Complete IPv6 address */
		sin6 = (sin6_t *)sa;
		if (sin6->sin6_family != AF_INET6 ||
		    connp->conn_family != AF_INET6) {
			/* TSYSERR, EAFNOSUPPORT */
			return (EAFNOSUPPORT);
		}
		/* No support for mapped addresses on raw sockets */
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			/* TSYSERR, EADDRNOTAVAIL */
			return (EADDRNOTAVAIL);
		}
		v6src = sin6->sin6_addr;
		if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
			/* Only link-scope addresses carry a scope id. */
			if (IN6_IS_ADDR_LINKSCOPE(&v6src))
				scopeid = sin6->sin6_scope_id;
			laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
			    B_TRUE, scopeid);
		}
		lport = sin6->sin6_port;
		break;

	default:
		/* TBADADDR */
		return (EADDRNOTAVAIL);
	}

	/* Is the local address a valid unicast, multicast, or broadcast? */
	if (laddr_type == IPVL_BAD)
		return (EADDRNOTAVAIL);

	/*
	 * The state must be TS_UNBND.
	 */
	mutex_enter(&connp->conn_lock);
	if (icmp->icmp_state != TS_UNBND) {
		mutex_exit(&connp->conn_lock);
		return (-TOUTSTATE);
	}

	/*
	 * Copy the source address into our icmp structure.  This address
	 * may still be zero; if so, ip will fill in the correct address
	 * each time an outbound packet is passed to it.
	 * If we are binding to a broadcast or multicast address then
	 * we just set the conn_bound_addr since we don't want to use
	 * that as the source address when sending.
	 */
	connp->conn_bound_addr_v6 = v6src;
	connp->conn_laddr_v6 = v6src;
	if (scopeid != 0) {
		connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
		connp->conn_ixa->ixa_scopeid = scopeid;
		connp->conn_incoming_ifindex = scopeid;
	} else {
		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}

	switch (laddr_type) {
	case IPVL_UNICAST_UP:
	case IPVL_UNICAST_DOWN:
		connp->conn_saddr_v6 = v6src;
		connp->conn_mcbc_bind = B_FALSE;
		break;
	case IPVL_MCAST:
	case IPVL_BCAST:
		/* ip_set_destination will pick a source address later */
		connp->conn_saddr_v6 = ipv6_all_zeros;
		connp->conn_mcbc_bind = B_TRUE;
		break;
	}

	/* Any errors after this point should use late_error */

	/*
	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
	 * with IPPROTO_TCP.
	 */
	connp->conn_lport = lport;
	connp->conn_fport = 0;

	if (connp->conn_family == AF_INET) {
		ASSERT(connp->conn_ipversion == IPV4_VERSION);
	} else {
		ASSERT(connp->conn_ipversion == IPV6_VERSION);
	}

	icmp->icmp_state = TS_IDLE;

	/*
	 * We create an initial header template here to make a subsequent
	 * sendto have a starting point. Since conn_last_dst is zero the
	 * first sendto will always follow the 'dst changed' code path.
	 * Note that we defer massaging options and the related checksum
	 * adjustment until we have a destination address.
	 */
	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
	    &connp->conn_faddr_v6, connp->conn_flowinfo);
	if (error != 0) {
		/* late_error re-acquires conn_lock itself. */
		mutex_exit(&connp->conn_lock);
		goto late_error;
	}
	/* Just in case */
	connp->conn_faddr_v6 = ipv6_all_zeros;
	connp->conn_v6lastdst = ipv6_all_zeros;
	mutex_exit(&connp->conn_lock);

	/* The fanout insert is done without conn_lock held. */
	error = ip_laddr_fanout_insert(connp);
	if (error != 0)
		goto late_error;

	/* Bind succeeded */
	return (0);

late_error:
	/* Undo everything recorded above and return to TS_UNBND. */
	mutex_enter(&connp->conn_lock);
	connp->conn_saddr_v6 = ipv6_all_zeros;
	connp->conn_bound_addr_v6 = ipv6_all_zeros;
	connp->conn_laddr_v6 = ipv6_all_zeros;
	if (scopeid != 0) {
		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}
	icmp->icmp_state = TS_UNBND;
	connp->conn_v6lastdst = ipv6_all_zeros;
	connp->conn_lport = 0;

	/* Restore the header that was built above - different source address */
	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
	    &connp->conn_faddr_v6, connp->conn_flowinfo);
	mutex_exit(&connp->conn_lock);
	return (error);
}

/*
 * Tell IP to just bind to the protocol.
592 */ 593 static void 594 icmp_bind_proto(icmp_t *icmp) 595 { 596 conn_t *connp = icmp->icmp_connp; 597 598 mutex_enter(&connp->conn_lock); 599 connp->conn_saddr_v6 = ipv6_all_zeros; 600 connp->conn_laddr_v6 = ipv6_all_zeros; 601 connp->conn_faddr_v6 = ipv6_all_zeros; 602 connp->conn_v6lastdst = ipv6_all_zeros; 603 mutex_exit(&connp->conn_lock); 604 605 (void) ip_laddr_fanout_insert(connp); 606 } 607 608 /* 609 * This routine handles each T_CONN_REQ message passed to icmp. It 610 * associates a default destination address with the stream. 611 * 612 * After various error checks are completed, icmp_connect() lays 613 * the target address and port into the composite header template. 614 * Then we ask IP for information, including a source address if we didn't 615 * already have one. Finally we send up the T_OK_ACK reply message. 616 */ 617 static void 618 icmp_tpi_connect(queue_t *q, mblk_t *mp) 619 { 620 conn_t *connp = Q_TO_CONN(q); 621 struct T_conn_req *tcr; 622 struct sockaddr *sa; 623 socklen_t len; 624 int error; 625 cred_t *cr; 626 pid_t pid; 627 /* 628 * All Solaris components should pass a db_credp 629 * for this TPI message, hence we ASSERT. 630 * But in case there is some other M_PROTO that looks 631 * like a TPI message sent by some other kernel 632 * component, we check and return an error. 
633 */ 634 cr = msg_getcred(mp, &pid); 635 ASSERT(cr != NULL); 636 if (cr == NULL) { 637 icmp_err_ack(q, mp, TSYSERR, EINVAL); 638 return; 639 } 640 641 tcr = (struct T_conn_req *)mp->b_rptr; 642 /* Sanity checks */ 643 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { 644 icmp_err_ack(q, mp, TPROTO, 0); 645 return; 646 } 647 648 if (tcr->OPT_length != 0) { 649 icmp_err_ack(q, mp, TBADOPT, 0); 650 return; 651 } 652 653 len = tcr->DEST_length; 654 655 switch (len) { 656 default: 657 icmp_err_ack(q, mp, TBADADDR, 0); 658 return; 659 case sizeof (sin_t): 660 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 661 sizeof (sin_t)); 662 break; 663 case sizeof (sin6_t): 664 sa = (struct sockaddr *)mi_offset_param(mp, 665 tcr->DEST_offset, sizeof (sin6_t)); 666 break; 667 } 668 669 error = proto_verify_ip_addr(connp->conn_family, sa, len); 670 if (error != 0) { 671 icmp_err_ack(q, mp, TSYSERR, error); 672 return; 673 } 674 675 error = rawip_do_connect(connp, sa, len, cr, pid); 676 if (error != 0) { 677 if (error < 0) { 678 icmp_err_ack(q, mp, -error, 0); 679 } else { 680 icmp_err_ack(q, mp, 0, error); 681 } 682 } else { 683 mblk_t *mp1; 684 685 /* 686 * We have to send a connection confirmation to 687 * keep TLI happy. 688 */ 689 if (connp->conn_family == AF_INET) { 690 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 691 sizeof (sin_t), NULL, 0); 692 } else { 693 ASSERT(connp->conn_family == AF_INET6); 694 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 695 sizeof (sin6_t), NULL, 0); 696 } 697 if (mp1 == NULL) { 698 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 699 return; 700 } 701 702 /* 703 * Send ok_ack for T_CONN_REQ 704 */ 705 mp = mi_tpi_ok_ack_alloc(mp); 706 if (mp == NULL) { 707 /* Unable to reuse the T_CONN_REQ for the ack. 
*/ 708 icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); 709 return; 710 } 711 putnext(connp->conn_rq, mp); 712 putnext(connp->conn_rq, mp1); 713 } 714 } 715 716 static int 717 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 718 cred_t *cr, pid_t pid) 719 { 720 icmp_t *icmp; 721 sin_t *sin; 722 sin6_t *sin6; 723 int error; 724 uint16_t dstport; 725 ipaddr_t v4dst; 726 in6_addr_t v6dst; 727 uint32_t flowinfo; 728 ip_xmit_attr_t *ixa; 729 ip_xmit_attr_t *oldixa; 730 uint_t scopeid = 0; 731 uint_t srcid = 0; 732 in6_addr_t v6src = connp->conn_saddr_v6; 733 734 icmp = connp->conn_icmp; 735 736 if (sa == NULL || !OK_32PTR((char *)sa)) { 737 return (EINVAL); 738 } 739 740 ASSERT(sa != NULL && len != 0); 741 sin = NULL; 742 sin6 = NULL; 743 dstport = 0; 744 flowinfo = 0; 745 v4dst = INADDR_ANY; 746 747 /* 748 * Determine packet type based on type of address passed in 749 * the request should contain an IPv4 or IPv6 address. 750 * Make sure that address family matches the type of 751 * family of the address passed down. 752 */ 753 switch (len) { 754 case sizeof (sin_t): 755 sin = (sin_t *)sa; 756 757 v4dst = sin->sin_addr.s_addr; 758 dstport = sin->sin_port; 759 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 760 ASSERT(connp->conn_ipversion == IPV4_VERSION); 761 break; 762 763 case sizeof (sin6_t): 764 sin6 = (sin6_t *)sa; 765 766 /* No support for mapped addresses on raw sockets */ 767 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 768 return (EADDRNOTAVAIL); 769 } 770 v6dst = sin6->sin6_addr; 771 dstport = sin6->sin6_port; 772 ASSERT(connp->conn_ipversion == IPV6_VERSION); 773 flowinfo = sin6->sin6_flowinfo; 774 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) 775 scopeid = sin6->sin6_scope_id; 776 srcid = sin6->__sin6_src_id; 777 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 778 /* Due to check above, we know sin6_addr is v6-only. 
*/ 779 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 780 B_FALSE, connp->conn_netstack)) { 781 /* Mismatch - v6src would be v4mapped. */ 782 return (EADDRNOTAVAIL); 783 } 784 } 785 break; 786 } 787 788 /* 789 * If there is a different thread using conn_ixa then we get a new 790 * copy and cut the old one loose from conn_ixa. Otherwise we use 791 * conn_ixa and prevent any other thread from using/changing it. 792 * Once connect() is done other threads can use conn_ixa since the 793 * refcnt will be back at one. 794 * We defer updating conn_ixa until later to handle any concurrent 795 * conn_ixa_cleanup thread. 796 */ 797 ixa = conn_get_ixa(connp, B_FALSE); 798 if (ixa == NULL) 799 return (ENOMEM); 800 801 mutex_enter(&connp->conn_lock); 802 /* 803 * This icmp_t must have bound already before doing a connect. 804 * Reject if a connect is in progress (we drop conn_lock during 805 * rawip_do_connect). 806 */ 807 if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) { 808 mutex_exit(&connp->conn_lock); 809 ixa_refrele(ixa); 810 return (-TOUTSTATE); 811 } 812 813 if (icmp->icmp_state == TS_DATA_XFER) { 814 /* Already connected - clear out state */ 815 if (connp->conn_mcbc_bind) 816 connp->conn_saddr_v6 = ipv6_all_zeros; 817 else 818 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 819 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 820 connp->conn_faddr_v6 = ipv6_all_zeros; 821 icmp->icmp_state = TS_IDLE; 822 } 823 824 /* 825 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 826 * with IPPROTO_TCP. 827 */ 828 connp->conn_fport = dstport; 829 if (connp->conn_ipversion == IPV4_VERSION) { 830 /* 831 * Interpret a zero destination to mean loopback. 832 * Update the T_CONN_REQ (sin/sin6) since it is used to 833 * generate the T_CONN_CON. 
834 */ 835 if (v4dst == INADDR_ANY) { 836 v4dst = htonl(INADDR_LOOPBACK); 837 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 838 ASSERT(connp->conn_family == AF_INET); 839 sin->sin_addr.s_addr = v4dst; 840 } 841 connp->conn_faddr_v6 = v6dst; 842 connp->conn_flowinfo = 0; 843 } else { 844 ASSERT(connp->conn_ipversion == IPV6_VERSION); 845 /* 846 * Interpret a zero destination to mean loopback. 847 * Update the T_CONN_REQ (sin/sin6) since it is used to 848 * generate the T_CONN_CON. 849 */ 850 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { 851 v6dst = ipv6_loopback; 852 sin6->sin6_addr = v6dst; 853 } 854 connp->conn_faddr_v6 = v6dst; 855 connp->conn_flowinfo = flowinfo; 856 } 857 858 /* 859 * We update our cred/cpid based on the caller of connect 860 */ 861 if (connp->conn_cred != cr) { 862 crhold(cr); 863 crfree(connp->conn_cred); 864 connp->conn_cred = cr; 865 } 866 connp->conn_cpid = pid; 867 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 868 ixa->ixa_cred = cr; 869 ixa->ixa_cpid = pid; 870 if (is_system_labeled()) { 871 /* We need to restart with a label based on the cred */ 872 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 873 } 874 875 if (scopeid != 0) { 876 ixa->ixa_flags |= IXAF_SCOPEID_SET; 877 ixa->ixa_scopeid = scopeid; 878 connp->conn_incoming_ifindex = scopeid; 879 } else { 880 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 881 connp->conn_incoming_ifindex = connp->conn_bound_if; 882 } 883 884 /* 885 * conn_connect will drop conn_lock and reacquire it. 886 * To prevent a send* from messing with this icmp_t while the lock 887 * is dropped we set icmp_state and clear conn_v6lastdst. 888 * That will make all send* fail with EISCONN. 889 */ 890 connp->conn_v6lastdst = ipv6_all_zeros; 891 icmp->icmp_state = TS_WCON_CREQ; 892 893 error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); 894 mutex_exit(&connp->conn_lock); 895 if (error != 0) 896 goto connect_failed; 897 898 /* 899 * The addresses have been verified. Time to insert in 900 * the correct fanout list. 
901 */ 902 error = ipcl_conn_insert(connp); 903 if (error != 0) 904 goto connect_failed; 905 906 mutex_enter(&connp->conn_lock); 907 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 908 &connp->conn_faddr_v6, connp->conn_flowinfo); 909 if (error != 0) { 910 mutex_exit(&connp->conn_lock); 911 goto connect_failed; 912 } 913 914 icmp->icmp_state = TS_DATA_XFER; 915 /* Record this as the "last" send even though we haven't sent any */ 916 connp->conn_v6lastdst = connp->conn_faddr_v6; 917 connp->conn_lastipversion = connp->conn_ipversion; 918 connp->conn_lastdstport = connp->conn_fport; 919 connp->conn_lastflowinfo = connp->conn_flowinfo; 920 connp->conn_lastscopeid = scopeid; 921 connp->conn_lastsrcid = srcid; 922 /* Also remember a source to use together with lastdst */ 923 connp->conn_v6lastsrc = v6src; 924 925 oldixa = conn_replace_ixa(connp, ixa); 926 mutex_exit(&connp->conn_lock); 927 ixa_refrele(oldixa); 928 929 ixa_refrele(ixa); 930 return (0); 931 932 connect_failed: 933 if (ixa != NULL) 934 ixa_refrele(ixa); 935 mutex_enter(&connp->conn_lock); 936 icmp->icmp_state = TS_IDLE; 937 /* In case the source address was set above */ 938 if (connp->conn_mcbc_bind) 939 connp->conn_saddr_v6 = ipv6_all_zeros; 940 else 941 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 942 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 943 connp->conn_faddr_v6 = ipv6_all_zeros; 944 connp->conn_v6lastdst = ipv6_all_zeros; 945 connp->conn_flowinfo = 0; 946 947 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 948 &connp->conn_faddr_v6, connp->conn_flowinfo); 949 mutex_exit(&connp->conn_lock); 950 return (error); 951 } 952 953 static void 954 rawip_do_close(conn_t *connp) 955 { 956 ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); 957 958 ip_quiesce_conn(connp); 959 960 if (!IPCL_IS_NONSTR(connp)) { 961 qprocsoff(connp->conn_rq); 962 } 963 964 icmp_close_free(connp); 965 966 /* 967 * Now we are truly single threaded on this stream, and can 968 * delete the things 
hanging off the connp, and finally the connp. 969 * We removed this connp from the fanout list, it cannot be 970 * accessed thru the fanouts, and we already waited for the 971 * conn_ref to drop to 0. We are already in close, so 972 * there cannot be any other thread from the top. qprocsoff 973 * has completed, and service has completed or won't run in 974 * future. 975 */ 976 ASSERT(connp->conn_ref == 1); 977 978 if (!IPCL_IS_NONSTR(connp)) { 979 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 980 } else { 981 ip_free_helper_stream(connp); 982 } 983 984 connp->conn_ref--; 985 ipcl_conn_destroy(connp); 986 } 987 988 /* ARGSUSED */ 989 static int 990 icmp_close(queue_t *q, int flags, cred_t *credp __unused) 991 { 992 conn_t *connp; 993 994 if (flags & SO_FALLBACK) { 995 /* 996 * stream is being closed while in fallback 997 * simply free the resources that were allocated 998 */ 999 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 1000 qprocsoff(q); 1001 goto done; 1002 } 1003 1004 connp = Q_TO_CONN(q); 1005 (void) rawip_do_close(connp); 1006 done: 1007 q->q_ptr = WR(q)->q_ptr = NULL; 1008 return (0); 1009 } 1010 1011 static void 1012 icmp_close_free(conn_t *connp) 1013 { 1014 icmp_t *icmp = connp->conn_icmp; 1015 1016 if (icmp->icmp_filter != NULL) { 1017 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); 1018 icmp->icmp_filter = NULL; 1019 } 1020 1021 /* 1022 * Clear any fields which the kmem_cache constructor clears. 1023 * Only icmp_connp needs to be preserved. 1024 * TBD: We should make this more efficient to avoid clearing 1025 * everything. 1026 */ 1027 ASSERT(icmp->icmp_connp == connp); 1028 bzero(icmp, sizeof (icmp_t)); 1029 icmp->icmp_connp = connp; 1030 } 1031 1032 /* 1033 * This routine handles each T_DISCON_REQ message passed to icmp 1034 * as an indicating that ICMP is no longer connected. This results 1035 * in telling IP to restore the binding to just the local address. 
1036 */ 1037 static int 1038 icmp_do_disconnect(conn_t *connp) 1039 { 1040 icmp_t *icmp = connp->conn_icmp; 1041 int error; 1042 1043 mutex_enter(&connp->conn_lock); 1044 if (icmp->icmp_state != TS_DATA_XFER) { 1045 mutex_exit(&connp->conn_lock); 1046 return (-TOUTSTATE); 1047 } 1048 if (connp->conn_mcbc_bind) 1049 connp->conn_saddr_v6 = ipv6_all_zeros; 1050 else 1051 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 1052 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 1053 connp->conn_faddr_v6 = ipv6_all_zeros; 1054 icmp->icmp_state = TS_IDLE; 1055 1056 connp->conn_v6lastdst = ipv6_all_zeros; 1057 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 1058 &connp->conn_faddr_v6, connp->conn_flowinfo); 1059 mutex_exit(&connp->conn_lock); 1060 if (error != 0) 1061 return (error); 1062 1063 /* 1064 * Tell IP to remove the full binding and revert 1065 * to the local address binding. 1066 */ 1067 return (ip_laddr_fanout_insert(connp)); 1068 } 1069 1070 static void 1071 icmp_tpi_disconnect(queue_t *q, mblk_t *mp) 1072 { 1073 conn_t *connp = Q_TO_CONN(q); 1074 int error; 1075 1076 /* 1077 * Allocate the largest primitive we need to send back 1078 * T_error_ack is > than T_ok_ack 1079 */ 1080 mp = reallocb(mp, sizeof (struct T_error_ack), 1); 1081 if (mp == NULL) { 1082 /* Unable to reuse the T_DISCON_REQ for the ack. 
*/ 1083 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM); 1084 return; 1085 } 1086 1087 error = icmp_do_disconnect(connp); 1088 1089 if (error != 0) { 1090 if (error > 0) { 1091 icmp_err_ack(q, mp, 0, error); 1092 } else { 1093 icmp_err_ack(q, mp, -error, 0); 1094 } 1095 } else { 1096 mp = mi_tpi_ok_ack_alloc(mp); 1097 ASSERT(mp != NULL); 1098 qreply(q, mp); 1099 } 1100 } 1101 1102 static int 1103 icmp_disconnect(conn_t *connp) 1104 { 1105 int error; 1106 1107 connp->conn_dgram_errind = B_FALSE; 1108 1109 error = icmp_do_disconnect(connp); 1110 1111 if (error < 0) 1112 error = proto_tlitosyserr(-error); 1113 return (error); 1114 } 1115 1116 /* This routine creates a T_ERROR_ACK message and passes it upstream. */ 1117 static void 1118 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 1119 { 1120 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 1121 qreply(q, mp); 1122 } 1123 1124 /* Shorthand to generate and send TPI error acks to our client */ 1125 static void 1126 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 1127 t_scalar_t t_error, int sys_error) 1128 { 1129 struct T_error_ack *teackp; 1130 1131 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 1132 M_PCPROTO, T_ERROR_ACK)) != NULL) { 1133 teackp = (struct T_error_ack *)mp->b_rptr; 1134 teackp->ERROR_prim = primitive; 1135 teackp->TLI_error = t_error; 1136 teackp->UNIX_error = sys_error; 1137 qreply(q, mp); 1138 } 1139 } 1140 1141 /* 1142 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages. 1143 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 1144 * Assumes that IP has pulled up everything up to and including the ICMP header. 
1145 */ 1146 /* ARGSUSED2 */ 1147 static void 1148 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 1149 { 1150 conn_t *connp = (conn_t *)arg1; 1151 icmp_t *icmp = connp->conn_icmp; 1152 icmph_t *icmph; 1153 ipha_t *ipha; 1154 int iph_hdr_length; 1155 sin_t sin; 1156 mblk_t *mp1; 1157 int error = 0; 1158 1159 ipha = (ipha_t *)mp->b_rptr; 1160 1161 ASSERT(OK_32PTR(mp->b_rptr)); 1162 1163 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { 1164 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); 1165 icmp_icmp_error_ipv6(connp, mp, ira); 1166 return; 1167 } 1168 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 1169 1170 /* Skip past the outer IP and ICMP headers */ 1171 ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length); 1172 iph_hdr_length = ira->ira_ip_hdr_length; 1173 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1174 ipha = (ipha_t *)&icmph[1]; /* Inner IP header */ 1175 1176 iph_hdr_length = IPH_HDR_LENGTH(ipha); 1177 1178 switch (icmph->icmph_type) { 1179 case ICMP_DEST_UNREACHABLE: 1180 switch (icmph->icmph_code) { 1181 case ICMP_FRAGMENTATION_NEEDED: { 1182 ipha_t *ipha; 1183 ip_xmit_attr_t *ixa; 1184 /* 1185 * IP has already adjusted the path MTU. 1186 * But we need to adjust DF for IPv4. 1187 */ 1188 if (connp->conn_ipversion != IPV4_VERSION) 1189 break; 1190 1191 ixa = conn_get_ixa(connp, B_FALSE); 1192 if (ixa == NULL || ixa->ixa_ire == NULL) { 1193 /* 1194 * Some other thread holds conn_ixa. We will 1195 * redo this on the next ICMP too big. 
1196 */ 1197 if (ixa != NULL) 1198 ixa_refrele(ixa); 1199 break; 1200 } 1201 (void) ip_get_pmtu(ixa); 1202 1203 mutex_enter(&connp->conn_lock); 1204 ipha = (ipha_t *)connp->conn_ht_iphc; 1205 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 1206 ipha->ipha_fragment_offset_and_flags |= 1207 IPH_DF_HTONS; 1208 } else { 1209 ipha->ipha_fragment_offset_and_flags &= 1210 ~IPH_DF_HTONS; 1211 } 1212 mutex_exit(&connp->conn_lock); 1213 ixa_refrele(ixa); 1214 break; 1215 } 1216 case ICMP_PORT_UNREACHABLE: 1217 case ICMP_PROTOCOL_UNREACHABLE: 1218 error = ECONNREFUSED; 1219 break; 1220 default: 1221 /* Transient errors */ 1222 break; 1223 } 1224 break; 1225 default: 1226 /* Transient errors */ 1227 break; 1228 } 1229 if (error == 0) { 1230 freemsg(mp); 1231 return; 1232 } 1233 1234 /* 1235 * Deliver T_UDERROR_IND when the application has asked for it. 1236 * The socket layer enables this automatically when connected. 1237 */ 1238 if (!connp->conn_dgram_errind) { 1239 freemsg(mp); 1240 return; 1241 } 1242 1243 sin = sin_null; 1244 sin.sin_family = AF_INET; 1245 sin.sin_addr.s_addr = ipha->ipha_dst; 1246 1247 if (IPCL_IS_NONSTR(connp)) { 1248 mutex_enter(&connp->conn_lock); 1249 if (icmp->icmp_state == TS_DATA_XFER) { 1250 if (sin.sin_addr.s_addr == connp->conn_faddr_v4) { 1251 mutex_exit(&connp->conn_lock); 1252 (*connp->conn_upcalls->su_set_error) 1253 (connp->conn_upper_handle, error); 1254 goto done; 1255 } 1256 } else { 1257 icmp->icmp_delayed_error = error; 1258 *((sin_t *)&icmp->icmp_delayed_addr) = sin; 1259 } 1260 mutex_exit(&connp->conn_lock); 1261 } else { 1262 mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, 1263 error); 1264 if (mp1 != NULL) 1265 putnext(connp->conn_rq, mp1); 1266 } 1267 done: 1268 freemsg(mp); 1269 } 1270 1271 /* 1272 * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6. 1273 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 
1274 * Assumes that IP has pulled up all the extension headers as well as the 1275 * ICMPv6 header. 1276 */ 1277 static void 1278 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira) 1279 { 1280 icmp6_t *icmp6; 1281 ip6_t *ip6h, *outer_ip6h; 1282 uint16_t iph_hdr_length; 1283 uint8_t *nexthdrp; 1284 sin6_t sin6; 1285 mblk_t *mp1; 1286 int error = 0; 1287 icmp_t *icmp = connp->conn_icmp; 1288 1289 outer_ip6h = (ip6_t *)mp->b_rptr; 1290 #ifdef DEBUG 1291 if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6) 1292 iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h); 1293 else 1294 iph_hdr_length = IPV6_HDR_LEN; 1295 ASSERT(iph_hdr_length == ira->ira_ip_hdr_length); 1296 #endif 1297 /* Skip past the outer IP and ICMP headers */ 1298 iph_hdr_length = ira->ira_ip_hdr_length; 1299 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; 1300 1301 ip6h = (ip6_t *)&icmp6[1]; /* Inner IP header */ 1302 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) { 1303 freemsg(mp); 1304 return; 1305 } 1306 1307 switch (icmp6->icmp6_type) { 1308 case ICMP6_DST_UNREACH: 1309 switch (icmp6->icmp6_code) { 1310 case ICMP6_DST_UNREACH_NOPORT: 1311 error = ECONNREFUSED; 1312 break; 1313 case ICMP6_DST_UNREACH_ADMIN: 1314 case ICMP6_DST_UNREACH_NOROUTE: 1315 case ICMP6_DST_UNREACH_BEYONDSCOPE: 1316 case ICMP6_DST_UNREACH_ADDR: 1317 /* Transient errors */ 1318 break; 1319 default: 1320 break; 1321 } 1322 break; 1323 case ICMP6_PACKET_TOO_BIG: { 1324 struct T_unitdata_ind *tudi; 1325 struct T_opthdr *toh; 1326 size_t udi_size; 1327 mblk_t *newmp; 1328 t_scalar_t opt_length = sizeof (struct T_opthdr) + 1329 sizeof (struct ip6_mtuinfo); 1330 sin6_t *sin6; 1331 struct ip6_mtuinfo *mtuinfo; 1332 1333 /* 1334 * If the application has requested to receive path mtu 1335 * information, send up an empty message containing an 1336 * IPV6_PATHMTU ancillary data item. 
1337 */ 1338 if (!connp->conn_ipv6_recvpathmtu) 1339 break; 1340 1341 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) + 1342 opt_length; 1343 if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) { 1344 BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors); 1345 break; 1346 } 1347 1348 /* 1349 * newmp->b_cont is left to NULL on purpose. This is an 1350 * empty message containing only ancillary data. 1351 */ 1352 newmp->b_datap->db_type = M_PROTO; 1353 tudi = (struct T_unitdata_ind *)newmp->b_rptr; 1354 newmp->b_wptr = (uchar_t *)tudi + udi_size; 1355 tudi->PRIM_type = T_UNITDATA_IND; 1356 tudi->SRC_length = sizeof (sin6_t); 1357 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 1358 tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t); 1359 tudi->OPT_length = opt_length; 1360 1361 sin6 = (sin6_t *)&tudi[1]; 1362 bzero(sin6, sizeof (sin6_t)); 1363 sin6->sin6_family = AF_INET6; 1364 sin6->sin6_addr = connp->conn_faddr_v6; 1365 1366 toh = (struct T_opthdr *)&sin6[1]; 1367 toh->level = IPPROTO_IPV6; 1368 toh->name = IPV6_PATHMTU; 1369 toh->len = opt_length; 1370 toh->status = 0; 1371 1372 mtuinfo = (struct ip6_mtuinfo *)&toh[1]; 1373 bzero(mtuinfo, sizeof (struct ip6_mtuinfo)); 1374 mtuinfo->ip6m_addr.sin6_family = AF_INET6; 1375 mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst; 1376 mtuinfo->ip6m_mtu = icmp6->icmp6_mtu; 1377 /* 1378 * We've consumed everything we need from the original 1379 * message. Free it, then send our empty message. 
1380 */ 1381 freemsg(mp); 1382 icmp_ulp_recv(connp, newmp, msgdsize(newmp)); 1383 return; 1384 } 1385 case ICMP6_TIME_EXCEEDED: 1386 /* Transient errors */ 1387 break; 1388 case ICMP6_PARAM_PROB: 1389 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ 1390 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && 1391 (uchar_t *)ip6h + icmp6->icmp6_pptr == 1392 (uchar_t *)nexthdrp) { 1393 error = ECONNREFUSED; 1394 break; 1395 } 1396 break; 1397 } 1398 if (error == 0) { 1399 freemsg(mp); 1400 return; 1401 } 1402 1403 /* 1404 * Deliver T_UDERROR_IND when the application has asked for it. 1405 * The socket layer enables this automatically when connected. 1406 */ 1407 if (!connp->conn_dgram_errind) { 1408 freemsg(mp); 1409 return; 1410 } 1411 1412 sin6 = sin6_null; 1413 sin6.sin6_family = AF_INET6; 1414 sin6.sin6_addr = ip6h->ip6_dst; 1415 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 1416 if (IPCL_IS_NONSTR(connp)) { 1417 mutex_enter(&connp->conn_lock); 1418 if (icmp->icmp_state == TS_DATA_XFER) { 1419 if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, 1420 &connp->conn_faddr_v6)) { 1421 mutex_exit(&connp->conn_lock); 1422 (*connp->conn_upcalls->su_set_error) 1423 (connp->conn_upper_handle, error); 1424 goto done; 1425 } 1426 } else { 1427 icmp->icmp_delayed_error = error; 1428 *((sin6_t *)&icmp->icmp_delayed_addr) = sin6; 1429 } 1430 mutex_exit(&connp->conn_lock); 1431 } else { 1432 mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), 1433 NULL, 0, error); 1434 if (mp1 != NULL) 1435 putnext(connp->conn_rq, mp1); 1436 } 1437 done: 1438 freemsg(mp); 1439 } 1440 1441 /* 1442 * This routine responds to T_ADDR_REQ messages. It is called by icmp_wput. 1443 * The local address is filled in if endpoint is bound. The remote address 1444 * is filled in if remote address has been precified ("connected endpoint") 1445 * (The concept of connected CLTS sockets is alien to published TPI 1446 * but we support it anyway). 
1447 */ 1448 static void 1449 icmp_addr_req(queue_t *q, mblk_t *mp) 1450 { 1451 struct sockaddr *sa; 1452 mblk_t *ackmp; 1453 struct T_addr_ack *taa; 1454 icmp_t *icmp = Q_TO_ICMP(q); 1455 conn_t *connp = icmp->icmp_connp; 1456 uint_t addrlen; 1457 1458 /* Make it large enough for worst case */ 1459 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 1460 2 * sizeof (sin6_t), 1); 1461 if (ackmp == NULL) { 1462 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 1463 return; 1464 } 1465 taa = (struct T_addr_ack *)ackmp->b_rptr; 1466 1467 bzero(taa, sizeof (struct T_addr_ack)); 1468 ackmp->b_wptr = (uchar_t *)&taa[1]; 1469 1470 taa->PRIM_type = T_ADDR_ACK; 1471 ackmp->b_datap->db_type = M_PCPROTO; 1472 1473 if (connp->conn_family == AF_INET) 1474 addrlen = sizeof (sin_t); 1475 else 1476 addrlen = sizeof (sin6_t); 1477 1478 mutex_enter(&connp->conn_lock); 1479 /* 1480 * Note: Following code assumes 32 bit alignment of basic 1481 * data structures like sin_t and struct T_addr_ack. 1482 */ 1483 if (icmp->icmp_state != TS_UNBND) { 1484 /* 1485 * Fill in local address first 1486 */ 1487 taa->LOCADDR_offset = sizeof (*taa); 1488 taa->LOCADDR_length = addrlen; 1489 sa = (struct sockaddr *)&taa[1]; 1490 (void) conn_getsockname(connp, sa, &addrlen); 1491 ackmp->b_wptr += addrlen; 1492 } 1493 if (icmp->icmp_state == TS_DATA_XFER) { 1494 /* 1495 * connected, fill remote address too 1496 */ 1497 taa->REMADDR_length = addrlen; 1498 /* assumed 32-bit alignment */ 1499 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 1500 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 1501 (void) conn_getpeername(connp, sa, &addrlen); 1502 ackmp->b_wptr += addrlen; 1503 } 1504 mutex_exit(&connp->conn_lock); 1505 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 1506 qreply(q, ackmp); 1507 } 1508 1509 static void 1510 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) 1511 { 1512 conn_t *connp = icmp->icmp_connp; 1513 1514 *tap = icmp_g_t_info_ack; 1515 1516 if (connp->conn_family 
== AF_INET6) 1517 tap->ADDR_size = sizeof (sin6_t); 1518 else 1519 tap->ADDR_size = sizeof (sin_t); 1520 tap->CURRENT_state = icmp->icmp_state; 1521 tap->OPT_size = icmp_max_optsize; 1522 } 1523 1524 static void 1525 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap, 1526 t_uscalar_t cap_bits1) 1527 { 1528 tcap->CAP_bits1 = 0; 1529 1530 if (cap_bits1 & TC1_INFO) { 1531 icmp_copy_info(&tcap->INFO_ack, icmp); 1532 tcap->CAP_bits1 |= TC1_INFO; 1533 } 1534 } 1535 1536 /* 1537 * This routine responds to T_CAPABILITY_REQ messages. It is called by 1538 * icmp_wput. Much of the T_CAPABILITY_ACK information is copied from 1539 * icmp_g_t_info_ack. The current state of the stream is copied from 1540 * icmp_state. 1541 */ 1542 static void 1543 icmp_capability_req(queue_t *q, mblk_t *mp) 1544 { 1545 icmp_t *icmp = Q_TO_ICMP(q); 1546 t_uscalar_t cap_bits1; 1547 struct T_capability_ack *tcap; 1548 1549 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 1550 1551 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 1552 mp->b_datap->db_type, T_CAPABILITY_ACK); 1553 if (!mp) 1554 return; 1555 1556 tcap = (struct T_capability_ack *)mp->b_rptr; 1557 1558 icmp_do_capability_ack(icmp, tcap, cap_bits1); 1559 1560 qreply(q, mp); 1561 } 1562 1563 /* 1564 * This routine responds to T_INFO_REQ messages. It is called by icmp_wput. 1565 * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack. 1566 * The current state of the stream is copied from icmp_state. 1567 */ 1568 static void 1569 icmp_info_req(queue_t *q, mblk_t *mp) 1570 { 1571 icmp_t *icmp = Q_TO_ICMP(q); 1572 1573 /* Create a T_INFO_ACK message. 
*/ 1574 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 1575 T_INFO_ACK); 1576 if (!mp) 1577 return; 1578 icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp); 1579 qreply(q, mp); 1580 } 1581 1582 static int 1583 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 1584 int family) 1585 { 1586 conn_t *connp; 1587 dev_t conn_dev; 1588 int error; 1589 1590 /* If the stream is already open, return immediately. */ 1591 if (q->q_ptr != NULL) 1592 return (0); 1593 1594 if (sflag == MODOPEN) 1595 return (EINVAL); 1596 1597 /* 1598 * Since ICMP is not used so heavily, allocating from the small 1599 * arena should be sufficient. 1600 */ 1601 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 1602 return (EBUSY); 1603 } 1604 1605 if (flag & SO_FALLBACK) { 1606 /* 1607 * Non streams socket needs a stream to fallback to 1608 */ 1609 RD(q)->q_ptr = (void *)conn_dev; 1610 WR(q)->q_qinfo = &icmp_fallback_sock_winit; 1611 WR(q)->q_ptr = (void *)ip_minor_arena_sa; 1612 qprocson(q); 1613 return (0); 1614 } 1615 1616 connp = rawip_do_open(family, credp, &error, KM_SLEEP); 1617 if (connp == NULL) { 1618 ASSERT(error != 0); 1619 inet_minor_free(ip_minor_arena_sa, conn_dev); 1620 return (error); 1621 } 1622 1623 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 1624 connp->conn_dev = conn_dev; 1625 connp->conn_minor_arena = ip_minor_arena_sa; 1626 1627 /* 1628 * Initialize the icmp_t structure for this stream. 1629 */ 1630 q->q_ptr = connp; 1631 WR(q)->q_ptr = connp; 1632 connp->conn_rq = q; 1633 connp->conn_wq = WR(q); 1634 1635 WR(q)->q_hiwat = connp->conn_sndbuf; 1636 WR(q)->q_lowat = connp->conn_sndlowat; 1637 1638 qprocson(q); 1639 1640 /* Set the Stream head write offset. 
*/ 1641 (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); 1642 (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf); 1643 1644 mutex_enter(&connp->conn_lock); 1645 connp->conn_state_flags &= ~CONN_INCIPIENT; 1646 mutex_exit(&connp->conn_lock); 1647 1648 icmp_bind_proto(connp->conn_icmp); 1649 1650 return (0); 1651 } 1652 1653 /* For /dev/icmp aka AF_INET open */ 1654 static int 1655 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1656 { 1657 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET)); 1658 } 1659 1660 /* For /dev/icmp6 aka AF_INET6 open */ 1661 static int 1662 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1663 { 1664 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6)); 1665 } 1666 1667 /* 1668 * This is the open routine for icmp. It allocates a icmp_t structure for 1669 * the stream and, on the first open of the module, creates an ND table. 1670 */ 1671 static conn_t * 1672 rawip_do_open(int family, cred_t *credp, int *err, int flags) 1673 { 1674 icmp_t *icmp; 1675 conn_t *connp; 1676 zoneid_t zoneid; 1677 netstack_t *ns; 1678 icmp_stack_t *is; 1679 int len; 1680 boolean_t isv6 = B_FALSE; 1681 1682 *err = secpolicy_net_icmpaccess(credp); 1683 if (*err != 0) 1684 return (NULL); 1685 1686 if (family == AF_INET6) 1687 isv6 = B_TRUE; 1688 1689 ns = netstack_find_by_cred(credp); 1690 ASSERT(ns != NULL); 1691 is = ns->netstack_icmp; 1692 ASSERT(is != NULL); 1693 1694 /* 1695 * For exclusive stacks we set the zoneid to zero 1696 * to make ICMP operate as if in the global zone. 1697 */ 1698 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 1699 zoneid = GLOBAL_ZONEID; 1700 else 1701 zoneid = crgetzoneid(credp); 1702 1703 ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP); 1704 1705 connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); 1706 icmp = connp->conn_icmp; 1707 1708 /* 1709 * ipcl_conn_create did a netstack_hold. 
Undo the hold that was 1710 * done by netstack_find_by_cred() 1711 */ 1712 netstack_rele(ns); 1713 1714 /* 1715 * Since this conn_t/icmp_t is not yet visible to anybody else we don't 1716 * need to lock anything. 1717 */ 1718 ASSERT(connp->conn_proto == IPPROTO_ICMP); 1719 ASSERT(connp->conn_icmp == icmp); 1720 ASSERT(icmp->icmp_connp == connp); 1721 1722 /* Set the initial state of the stream and the privilege status. */ 1723 icmp->icmp_state = TS_UNBND; 1724 connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 1725 if (isv6) { 1726 connp->conn_family = AF_INET6; 1727 connp->conn_ipversion = IPV6_VERSION; 1728 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; 1729 connp->conn_proto = IPPROTO_ICMPV6; 1730 /* May be changed by a SO_PROTOTYPE socket option. */ 1731 connp->conn_proto = IPPROTO_ICMPV6; 1732 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1733 connp->conn_ixa->ixa_raw_cksum_offset = 2; 1734 connp->conn_default_ttl = is->is_ipv6_hoplimit; 1735 len = sizeof (ip6_t); 1736 } else { 1737 connp->conn_family = AF_INET; 1738 connp->conn_ipversion = IPV4_VERSION; 1739 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; 1740 /* May be changed by a SO_PROTOTYPE socket option. */ 1741 connp->conn_proto = IPPROTO_ICMP; 1742 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1743 connp->conn_default_ttl = is->is_ipv4_ttl; 1744 len = sizeof (ipha_t); 1745 } 1746 connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; 1747 1748 connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1749 1750 /* 1751 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set, 1752 * the checksum is provided in the pre-built packet. We clear 1753 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a 1754 * complete IP header and not to compute the transport checksum. 
1755 */ 1756 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; 1757 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ 1758 connp->conn_ixa->ixa_zoneid = zoneid; 1759 1760 connp->conn_zoneid = zoneid; 1761 1762 /* 1763 * If the caller has the process-wide flag set, then default to MAC 1764 * exempt mode. This allows read-down to unlabeled hosts. 1765 */ 1766 if (getpflags(NET_MAC_AWARE, credp) != 0) 1767 connp->conn_mac_mode = CONN_MAC_AWARE; 1768 1769 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); 1770 1771 icmp->icmp_is = is; 1772 1773 connp->conn_rcvbuf = is->is_recv_hiwat; 1774 connp->conn_sndbuf = is->is_xmit_hiwat; 1775 connp->conn_sndlowat = is->is_xmit_lowat; 1776 connp->conn_rcvlowat = icmp_mod_info.mi_lowat; 1777 1778 connp->conn_wroff = len + is->is_wroff_extra; 1779 connp->conn_so_type = SOCK_RAW; 1780 1781 connp->conn_recv = icmp_input; 1782 connp->conn_recvicmp = icmp_icmp_input; 1783 crhold(credp); 1784 connp->conn_cred = credp; 1785 connp->conn_cpid = curproc->p_pid; 1786 connp->conn_open_time = ddi_get_lbolt64(); 1787 /* Cache things in ixa without an extra refhold */ 1788 ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1789 connp->conn_ixa->ixa_cred = connp->conn_cred; 1790 connp->conn_ixa->ixa_cpid = connp->conn_cpid; 1791 if (is_system_labeled()) 1792 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); 1793 1794 connp->conn_flow_cntrld = B_FALSE; 1795 1796 if (is->is_pmtu_discovery) 1797 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 1798 1799 return (connp); 1800 } 1801 1802 /* 1803 * Which ICMP options OK to set through T_UNITDATA_REQ... 
1804 */ 1805 /* ARGSUSED */ 1806 static boolean_t 1807 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) 1808 { 1809 return (B_TRUE); 1810 } 1811 1812 /* 1813 * This routine gets default values of certain options whose default 1814 * values are maintained by protcol specific code 1815 */ 1816 int 1817 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 1818 { 1819 icmp_t *icmp = Q_TO_ICMP(q); 1820 icmp_stack_t *is = icmp->icmp_is; 1821 int *i1 = (int *)ptr; 1822 1823 switch (level) { 1824 case IPPROTO_IP: 1825 switch (name) { 1826 case IP_MULTICAST_TTL: 1827 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 1828 return (sizeof (uchar_t)); 1829 case IP_MULTICAST_LOOP: 1830 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 1831 return (sizeof (uchar_t)); 1832 } 1833 break; 1834 case IPPROTO_IPV6: 1835 switch (name) { 1836 case IPV6_MULTICAST_HOPS: 1837 *i1 = IP_DEFAULT_MULTICAST_TTL; 1838 return (sizeof (int)); 1839 case IPV6_MULTICAST_LOOP: 1840 *i1 = IP_DEFAULT_MULTICAST_LOOP; 1841 return (sizeof (int)); 1842 case IPV6_UNICAST_HOPS: 1843 *i1 = is->is_ipv6_hoplimit; 1844 return (sizeof (int)); 1845 } 1846 break; 1847 case IPPROTO_ICMPV6: 1848 switch (name) { 1849 case ICMP6_FILTER: 1850 /* Make it look like "pass all" */ 1851 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1852 return (sizeof (icmp6_filter_t)); 1853 } 1854 break; 1855 } 1856 return (-1); 1857 } 1858 1859 /* 1860 * This routine retrieves the current status of socket options. 1861 * It returns the size of the option retrieved, or -1. 
1862 */ 1863 int 1864 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 1865 { 1866 icmp_t *icmp = connp->conn_icmp; 1867 int *i1 = (int *)ptr; 1868 conn_opt_arg_t coas; 1869 int retval; 1870 1871 coas.coa_connp = connp; 1872 coas.coa_ixa = connp->conn_ixa; 1873 coas.coa_ipp = &connp->conn_xmit_ipp; 1874 coas.coa_ancillary = B_FALSE; 1875 coas.coa_changed = 0; 1876 1877 /* 1878 * We assume that the optcom framework has checked for the set 1879 * of levels and names that are supported, hence we don't worry 1880 * about rejecting based on that. 1881 * First check for ICMP specific handling, then pass to common routine. 1882 */ 1883 switch (level) { 1884 case IPPROTO_IP: 1885 /* 1886 * Only allow IPv4 option processing on IPv4 sockets. 1887 */ 1888 if (connp->conn_family != AF_INET) 1889 return (-1); 1890 1891 switch (name) { 1892 case IP_OPTIONS: 1893 case T_IP_OPTIONS: 1894 /* Options are passed up with each packet */ 1895 return (0); 1896 case IP_HDRINCL: 1897 mutex_enter(&connp->conn_lock); 1898 *i1 = (int)icmp->icmp_hdrincl; 1899 mutex_exit(&connp->conn_lock); 1900 return (sizeof (int)); 1901 } 1902 break; 1903 1904 case IPPROTO_IPV6: 1905 /* 1906 * Only allow IPv6 option processing on native IPv6 sockets. 1907 */ 1908 if (connp->conn_family != AF_INET6) 1909 return (-1); 1910 1911 switch (name) { 1912 case IPV6_CHECKSUM: 1913 /* 1914 * Return offset or -1 if no checksum offset. 1915 * Does not apply to IPPROTO_ICMPV6 1916 */ 1917 if (connp->conn_proto == IPPROTO_ICMPV6) 1918 return (-1); 1919 1920 mutex_enter(&connp->conn_lock); 1921 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) 1922 *i1 = connp->conn_ixa->ixa_raw_cksum_offset; 1923 else 1924 *i1 = -1; 1925 mutex_exit(&connp->conn_lock); 1926 return (sizeof (int)); 1927 } 1928 break; 1929 1930 case IPPROTO_ICMPV6: 1931 /* 1932 * Only allow IPv6 option processing on native IPv6 sockets. 
1933 */ 1934 if (connp->conn_family != AF_INET6) 1935 return (-1); 1936 1937 if (connp->conn_proto != IPPROTO_ICMPV6) 1938 return (-1); 1939 1940 switch (name) { 1941 case ICMP6_FILTER: 1942 mutex_enter(&connp->conn_lock); 1943 if (icmp->icmp_filter == NULL) { 1944 /* Make it look like "pass all" */ 1945 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1946 } else { 1947 (void) bcopy(icmp->icmp_filter, ptr, 1948 sizeof (icmp6_filter_t)); 1949 } 1950 mutex_exit(&connp->conn_lock); 1951 return (sizeof (icmp6_filter_t)); 1952 } 1953 } 1954 mutex_enter(&connp->conn_lock); 1955 retval = conn_opt_get(&coas, level, name, ptr); 1956 mutex_exit(&connp->conn_lock); 1957 return (retval); 1958 } 1959 1960 /* 1961 * This routine retrieves the current status of socket options. 1962 * It returns the size of the option retrieved, or -1. 1963 */ 1964 int 1965 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 1966 { 1967 conn_t *connp = Q_TO_CONN(q); 1968 int err; 1969 1970 err = icmp_opt_get(connp, level, name, ptr); 1971 return (err); 1972 } 1973 1974 /* 1975 * This routine sets socket options. 1976 */ 1977 int 1978 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, 1979 uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly) 1980 { 1981 conn_t *connp = coa->coa_connp; 1982 ip_xmit_attr_t *ixa = coa->coa_ixa; 1983 icmp_t *icmp = connp->conn_icmp; 1984 icmp_stack_t *is = icmp->icmp_is; 1985 int *i1 = (int *)invalp; 1986 boolean_t onoff = (*i1 == 0) ? 0 : 1; 1987 int error; 1988 1989 ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); 1990 1991 /* 1992 * For fixed length options, no sanity check 1993 * of passed in length is done. It is assumed *_optcom_req() 1994 * routines do the right thing. 
1995 */ 1996 1997 switch (level) { 1998 case SOL_SOCKET: 1999 switch (name) { 2000 case SO_PROTOTYPE: 2001 if ((*i1 & 0xFF) != IPPROTO_ICMP && 2002 (*i1 & 0xFF) != IPPROTO_ICMPV6 && 2003 secpolicy_net_rawaccess(cr) != 0) { 2004 return (EACCES); 2005 } 2006 if (checkonly) 2007 break; 2008 2009 mutex_enter(&connp->conn_lock); 2010 connp->conn_proto = *i1 & 0xFF; 2011 ixa->ixa_protocol = connp->conn_proto; 2012 if ((connp->conn_proto == IPPROTO_RAW || 2013 connp->conn_proto == IPPROTO_IGMP) && 2014 connp->conn_family == AF_INET) { 2015 icmp->icmp_hdrincl = 1; 2016 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2017 } else if (connp->conn_proto == IPPROTO_UDP || 2018 connp->conn_proto == IPPROTO_TCP || 2019 connp->conn_proto == IPPROTO_SCTP) { 2020 /* Used by test applications like psh */ 2021 icmp->icmp_hdrincl = 0; 2022 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2023 } else { 2024 icmp->icmp_hdrincl = 0; 2025 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2026 } 2027 2028 if (connp->conn_family == AF_INET6 && 2029 connp->conn_proto == IPPROTO_ICMPV6) { 2030 /* Set offset for icmp6_cksum */ 2031 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 2032 ixa->ixa_raw_cksum_offset = 2; 2033 } 2034 if (icmp->icmp_filter != NULL && 2035 connp->conn_proto != IPPROTO_ICMPV6) { 2036 kmem_free(icmp->icmp_filter, 2037 sizeof (icmp6_filter_t)); 2038 icmp->icmp_filter = NULL; 2039 } 2040 mutex_exit(&connp->conn_lock); 2041 2042 coa->coa_changed |= COA_HEADER_CHANGED; 2043 /* 2044 * For SCTP, we don't use icmp_bind_proto() for 2045 * raw socket binding. 2046 */ 2047 if (connp->conn_proto == IPPROTO_SCTP) 2048 return (0); 2049 2050 coa->coa_changed |= COA_ICMP_BIND_NEEDED; 2051 return (0); 2052 2053 case SO_SNDBUF: 2054 if (*i1 > is->is_max_buf) { 2055 return (ENOBUFS); 2056 } 2057 break; 2058 case SO_RCVBUF: 2059 if (*i1 > is->is_max_buf) { 2060 return (ENOBUFS); 2061 } 2062 break; 2063 } 2064 break; 2065 2066 case IPPROTO_IP: 2067 /* 2068 * Only allow IPv4 option processing on IPv4 sockets. 
2069 */ 2070 if (connp->conn_family != AF_INET) 2071 return (EINVAL); 2072 2073 switch (name) { 2074 case IP_HDRINCL: 2075 if (!checkonly) { 2076 mutex_enter(&connp->conn_lock); 2077 icmp->icmp_hdrincl = onoff; 2078 if (onoff) 2079 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2080 else 2081 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2082 mutex_exit(&connp->conn_lock); 2083 } 2084 break; 2085 } 2086 break; 2087 2088 case IPPROTO_IPV6: 2089 if (connp->conn_family != AF_INET6) 2090 return (EINVAL); 2091 2092 switch (name) { 2093 case IPV6_CHECKSUM: 2094 /* 2095 * Integer offset into the user data of where the 2096 * checksum is located. 2097 * Offset of -1 disables option. 2098 * Does not apply to IPPROTO_ICMPV6. 2099 */ 2100 if (connp->conn_proto == IPPROTO_ICMPV6 || 2101 coa->coa_ancillary) { 2102 return (EINVAL); 2103 } 2104 if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) { 2105 /* Negative or not 16 bit aligned offset */ 2106 return (EINVAL); 2107 } 2108 if (checkonly) 2109 break; 2110 2111 mutex_enter(&connp->conn_lock); 2112 if (*i1 == -1) { 2113 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 2114 ixa->ixa_raw_cksum_offset = 0; 2115 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2116 } else { 2117 ixa->ixa_flags |= IXAF_SET_RAW_CKSUM; 2118 ixa->ixa_raw_cksum_offset = *i1; 2119 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2120 } 2121 mutex_exit(&connp->conn_lock); 2122 break; 2123 } 2124 break; 2125 2126 case IPPROTO_ICMPV6: 2127 /* 2128 * Only allow IPv6 option processing on IPv6 sockets. 
2129 */ 2130 if (connp->conn_family != AF_INET6) 2131 return (EINVAL); 2132 if (connp->conn_proto != IPPROTO_ICMPV6) 2133 return (EINVAL); 2134 2135 switch (name) { 2136 case ICMP6_FILTER: 2137 if (checkonly) 2138 break; 2139 2140 if ((inlen != 0) && 2141 (inlen != sizeof (icmp6_filter_t))) 2142 return (EINVAL); 2143 2144 mutex_enter(&connp->conn_lock); 2145 if (inlen == 0) { 2146 if (icmp->icmp_filter != NULL) { 2147 kmem_free(icmp->icmp_filter, 2148 sizeof (icmp6_filter_t)); 2149 icmp->icmp_filter = NULL; 2150 } 2151 } else { 2152 if (icmp->icmp_filter == NULL) { 2153 icmp->icmp_filter = kmem_alloc( 2154 sizeof (icmp6_filter_t), 2155 KM_NOSLEEP); 2156 if (icmp->icmp_filter == NULL) { 2157 mutex_exit(&connp->conn_lock); 2158 return (ENOBUFS); 2159 } 2160 } 2161 (void) bcopy(invalp, icmp->icmp_filter, inlen); 2162 } 2163 mutex_exit(&connp->conn_lock); 2164 break; 2165 } 2166 break; 2167 } 2168 error = conn_opt_set(coa, level, name, inlen, invalp, 2169 checkonly, cr); 2170 return (error); 2171 } 2172 2173 /* 2174 * This routine sets socket options. 2175 */ 2176 int 2177 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, 2178 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 2179 void *thisdg_attrs, cred_t *cr) 2180 { 2181 icmp_t *icmp = connp->conn_icmp; 2182 int err; 2183 conn_opt_arg_t coas, *coa; 2184 boolean_t checkonly; 2185 icmp_stack_t *is = icmp->icmp_is; 2186 2187 switch (optset_context) { 2188 case SETFN_OPTCOM_CHECKONLY: 2189 checkonly = B_TRUE; 2190 /* 2191 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 2192 * inlen != 0 implies value supplied and 2193 * we have to "pretend" to set it. 2194 * inlen == 0 implies that there is no 2195 * value part in T_CHECK request and just validation 2196 * done elsewhere should be enough, we just return here. 
2197 */ 2198 if (inlen == 0) { 2199 *outlenp = 0; 2200 return (0); 2201 } 2202 break; 2203 case SETFN_OPTCOM_NEGOTIATE: 2204 checkonly = B_FALSE; 2205 break; 2206 case SETFN_UD_NEGOTIATE: 2207 case SETFN_CONN_NEGOTIATE: 2208 checkonly = B_FALSE; 2209 /* 2210 * Negotiating local and "association-related" options 2211 * through T_UNITDATA_REQ. 2212 * 2213 * Following routine can filter out ones we do not 2214 * want to be "set" this way. 2215 */ 2216 if (!icmp_opt_allow_udr_set(level, name)) { 2217 *outlenp = 0; 2218 return (EINVAL); 2219 } 2220 break; 2221 default: 2222 /* 2223 * We should never get here 2224 */ 2225 *outlenp = 0; 2226 return (EINVAL); 2227 } 2228 2229 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 2230 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 2231 2232 if (thisdg_attrs != NULL) { 2233 /* Options from T_UNITDATA_REQ */ 2234 coa = (conn_opt_arg_t *)thisdg_attrs; 2235 ASSERT(coa->coa_connp == connp); 2236 ASSERT(coa->coa_ixa != NULL); 2237 ASSERT(coa->coa_ipp != NULL); 2238 ASSERT(coa->coa_ancillary); 2239 } else { 2240 coa = &coas; 2241 coas.coa_connp = connp; 2242 /* Get a reference on conn_ixa to prevent concurrent mods */ 2243 coas.coa_ixa = conn_get_ixa(connp, B_TRUE); 2244 if (coas.coa_ixa == NULL) { 2245 *outlenp = 0; 2246 return (ENOMEM); 2247 } 2248 coas.coa_ipp = &connp->conn_xmit_ipp; 2249 coas.coa_ancillary = B_FALSE; 2250 coas.coa_changed = 0; 2251 } 2252 2253 err = icmp_do_opt_set(coa, level, name, inlen, invalp, 2254 cr, checkonly); 2255 if (err != 0) { 2256 errout: 2257 if (!coa->coa_ancillary) 2258 ixa_refrele(coa->coa_ixa); 2259 *outlenp = 0; 2260 return (err); 2261 } 2262 2263 /* 2264 * Common case of OK return with outval same as inval. 
2265 */ 2266 if (invalp != outvalp) { 2267 /* don't trust bcopy for identical src/dst */ 2268 (void) bcopy(invalp, outvalp, inlen); 2269 } 2270 *outlenp = inlen; 2271 2272 /* 2273 * If this was not ancillary data, then we rebuild the headers, 2274 * update the IRE/NCE, and IPsec as needed. 2275 * Since the label depends on the destination we go through 2276 * ip_set_destination first. 2277 */ 2278 if (coa->coa_ancillary) { 2279 return (0); 2280 } 2281 2282 if (coa->coa_changed & COA_ROUTE_CHANGED) { 2283 in6_addr_t saddr, faddr, nexthop; 2284 in_port_t fport; 2285 2286 /* 2287 * We clear lastdst to make sure we pick up the change 2288 * next time sending. 2289 * If we are connected we re-cache the information. 2290 * We ignore errors to preserve BSD behavior. 2291 * Note that we don't redo IPsec policy lookup here 2292 * since the final destination (or source) didn't change. 2293 */ 2294 mutex_enter(&connp->conn_lock); 2295 connp->conn_v6lastdst = ipv6_all_zeros; 2296 2297 ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa, 2298 &connp->conn_faddr_v6, &nexthop); 2299 saddr = connp->conn_saddr_v6; 2300 faddr = connp->conn_faddr_v6; 2301 fport = connp->conn_fport; 2302 mutex_exit(&connp->conn_lock); 2303 2304 if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) && 2305 !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) { 2306 (void) ip_attr_connect(connp, coa->coa_ixa, 2307 &saddr, &faddr, &nexthop, fport, NULL, NULL, 2308 IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 2309 } 2310 } 2311 2312 ixa_refrele(coa->coa_ixa); 2313 2314 if (coa->coa_changed & COA_HEADER_CHANGED) { 2315 /* 2316 * Rebuild the header template if we are connected. 2317 * Otherwise clear conn_v6lastdst so we rebuild the header 2318 * in the data path. 
2319 */ 2320 mutex_enter(&connp->conn_lock); 2321 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 2322 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 2323 err = icmp_build_hdr_template(connp, 2324 &connp->conn_saddr_v6, &connp->conn_faddr_v6, 2325 connp->conn_flowinfo); 2326 if (err != 0) { 2327 mutex_exit(&connp->conn_lock); 2328 return (err); 2329 } 2330 } else { 2331 connp->conn_v6lastdst = ipv6_all_zeros; 2332 } 2333 mutex_exit(&connp->conn_lock); 2334 } 2335 if (coa->coa_changed & COA_RCVBUF_CHANGED) { 2336 (void) proto_set_rx_hiwat(connp->conn_rq, connp, 2337 connp->conn_rcvbuf); 2338 } 2339 if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 2340 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 2341 } 2342 if (coa->coa_changed & COA_WROFF_CHANGED) { 2343 /* Increase wroff if needed */ 2344 uint_t wroff; 2345 2346 mutex_enter(&connp->conn_lock); 2347 wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra; 2348 if (wroff > connp->conn_wroff) { 2349 connp->conn_wroff = wroff; 2350 mutex_exit(&connp->conn_lock); 2351 (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff); 2352 } else { 2353 mutex_exit(&connp->conn_lock); 2354 } 2355 } 2356 if (coa->coa_changed & COA_ICMP_BIND_NEEDED) { 2357 icmp_bind_proto(icmp); 2358 } 2359 return (err); 2360 } 2361 2362 /* This routine sets socket options. */ 2363 int 2364 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, 2365 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 2366 void *thisdg_attrs, cred_t *cr) 2367 { 2368 conn_t *connp = Q_TO_CONN(q); 2369 int error; 2370 2371 error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp, 2372 outlenp, outvalp, thisdg_attrs, cr); 2373 return (error); 2374 } 2375 2376 /* 2377 * Setup IP headers. 2378 * 2379 * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto, 2380 * but icmp_output_hdrincl restores ipha_protocol once we return. 
2381 */ 2382 mblk_t * 2383 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, 2384 const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo, 2385 mblk_t *data_mp, int *errorp) 2386 { 2387 mblk_t *mp; 2388 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2389 uint_t data_len; 2390 uint32_t cksum; 2391 2392 data_len = msgdsize(data_mp); 2393 mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto, 2394 flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp); 2395 if (mp == NULL) { 2396 ASSERT(*errorp != 0); 2397 return (NULL); 2398 } 2399 2400 ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; 2401 2402 /* 2403 * If there was a routing option/header then conn_prepend_hdr 2404 * has massaged it and placed the pseudo-header checksum difference 2405 * in the cksum argument. 2406 * 2407 * Prepare for ICMPv6 checksum done in IP. 2408 * 2409 * We make it easy for IP to include our pseudo header 2410 * by putting our length (and any routing header adjustment) 2411 * in the ICMPv6 checksum field. 2412 * The IP source, destination, and length have already been set by 2413 * conn_prepend_hdr. 
2414 */ 2415 cksum += data_len; 2416 cksum = (cksum >> 16) + (cksum & 0xFFFF); 2417 ASSERT(cksum < 0x10000); 2418 2419 if (ixa->ixa_flags & IXAF_IS_IPV4) { 2420 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2421 2422 ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); 2423 } else { 2424 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2425 uint_t cksum_offset = 0; 2426 2427 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); 2428 2429 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 2430 if (connp->conn_proto == IPPROTO_ICMPV6) { 2431 cksum_offset = ixa->ixa_ip_hdr_length + 2432 offsetof(icmp6_t, icmp6_cksum); 2433 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2434 cksum_offset = ixa->ixa_ip_hdr_length + 2435 ixa->ixa_raw_cksum_offset; 2436 } 2437 } 2438 if (cksum_offset != 0) { 2439 uint16_t *ptr; 2440 2441 /* Make sure the checksum fits in the first mblk */ 2442 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 2443 mblk_t *mp1; 2444 2445 mp1 = msgpullup(mp, 2446 cksum_offset + sizeof (short)); 2447 freemsg(mp); 2448 if (mp1 == NULL) { 2449 *errorp = ENOMEM; 2450 return (NULL); 2451 } 2452 mp = mp1; 2453 ip6h = (ip6_t *)mp->b_rptr; 2454 } 2455 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 2456 *ptr = htons(cksum); 2457 } 2458 } 2459 2460 /* Note that we don't try to update wroff due to ancillary data */ 2461 return (mp); 2462 } 2463 2464 static int 2465 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, 2466 const in6_addr_t *v6dst, uint32_t flowinfo) 2467 { 2468 int error; 2469 2470 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2471 /* 2472 * We clear lastdst to make sure we don't use the lastdst path 2473 * next time sending since we might not have set v6dst yet. 2474 */ 2475 connp->conn_v6lastdst = ipv6_all_zeros; 2476 2477 error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo); 2478 if (error != 0) 2479 return (error); 2480 2481 /* 2482 * Any routing header/option has been massaged. The checksum difference 2483 * is stored in conn_sum. 
2484 */ 2485 return (0); 2486 } 2487 2488 static mblk_t * 2489 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) 2490 { 2491 ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock)); 2492 if (IPCL_IS_NONSTR(icmp->icmp_connp)) { 2493 /* 2494 * fallback has started but messages have not been moved yet 2495 */ 2496 if (icmp->icmp_fallback_queue_head == NULL) { 2497 ASSERT(icmp->icmp_fallback_queue_tail == NULL); 2498 icmp->icmp_fallback_queue_head = mp; 2499 icmp->icmp_fallback_queue_tail = mp; 2500 } else { 2501 ASSERT(icmp->icmp_fallback_queue_tail != NULL); 2502 icmp->icmp_fallback_queue_tail->b_next = mp; 2503 icmp->icmp_fallback_queue_tail = mp; 2504 } 2505 return (NULL); 2506 } else { 2507 /* 2508 * Fallback completed, let the caller putnext() the mblk. 2509 */ 2510 return (mp); 2511 } 2512 } 2513 2514 /* 2515 * Deliver data to ULP. In case we have a socket, and it's falling back to 2516 * TPI, then we'll queue the mp for later processing. 2517 */ 2518 static void 2519 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len) 2520 { 2521 if (IPCL_IS_NONSTR(connp)) { 2522 icmp_t *icmp = connp->conn_icmp; 2523 int error; 2524 2525 ASSERT(len == msgdsize(mp)); 2526 if ((*connp->conn_upcalls->su_recv) 2527 (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { 2528 mutex_enter(&icmp->icmp_recv_lock); 2529 if (error == ENOSPC) { 2530 /* 2531 * let's confirm while holding the lock 2532 */ 2533 if ((*connp->conn_upcalls->su_recv) 2534 (connp->conn_upper_handle, NULL, 0, 0, 2535 &error, NULL) < 0) { 2536 ASSERT(error == ENOSPC); 2537 if (error == ENOSPC) { 2538 connp->conn_flow_cntrld = 2539 B_TRUE; 2540 } 2541 } 2542 mutex_exit(&icmp->icmp_recv_lock); 2543 } else { 2544 ASSERT(error == EOPNOTSUPP); 2545 mp = icmp_queue_fallback(icmp, mp); 2546 mutex_exit(&icmp->icmp_recv_lock); 2547 if (mp != NULL) 2548 putnext(connp->conn_rq, mp); 2549 } 2550 } 2551 ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock)); 2552 } else { 2553 putnext(connp->conn_rq, mp); 2554 } 2555 } 2556 2557 /* 2558 * This is 
the inbound data path. 2559 * IP has already pulled up the IP headers and verified alignment 2560 * etc. 2561 */ 2562 /* ARGSUSED2 */ 2563 static void 2564 icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 2565 { 2566 conn_t *connp = (conn_t *)arg1; 2567 struct T_unitdata_ind *tudi; 2568 uchar_t *rptr; /* Pointer to IP header */ 2569 int ip_hdr_length; 2570 int udi_size; /* Size of T_unitdata_ind */ 2571 int pkt_len; 2572 icmp_t *icmp; 2573 ip_pkt_t ipps; 2574 ip6_t *ip6h; 2575 mblk_t *mp1; 2576 crb_t recv_ancillary; 2577 icmp_stack_t *is; 2578 sin_t *sin; 2579 sin6_t *sin6; 2580 ipha_t *ipha; 2581 2582 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2583 2584 icmp = connp->conn_icmp; 2585 is = icmp->icmp_is; 2586 rptr = mp->b_rptr; 2587 2588 ASSERT(DB_TYPE(mp) == M_DATA); 2589 ASSERT(OK_32PTR(rptr)); 2590 ASSERT(ira->ira_pktlen == msgdsize(mp)); 2591 pkt_len = ira->ira_pktlen; 2592 2593 /* 2594 * Get a snapshot of these and allow other threads to change 2595 * them after that. We need the same recv_ancillary when determining 2596 * the size as when adding the ancillary data items. 2597 */ 2598 mutex_enter(&connp->conn_lock); 2599 recv_ancillary = connp->conn_recv_ancillary; 2600 mutex_exit(&connp->conn_lock); 2601 2602 ip_hdr_length = ira->ira_ip_hdr_length; 2603 ASSERT(MBLKL(mp) >= ip_hdr_length); /* IP did a pullup */ 2604 2605 /* Initialize regardless of IP version */ 2606 ipps.ipp_fields = 0; 2607 2608 if (ira->ira_flags & IRAF_IS_IPV4) { 2609 ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); 2610 ASSERT(MBLKL(mp) >= sizeof (ipha_t)); 2611 ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr)); 2612 2613 ipha = (ipha_t *)mp->b_rptr; 2614 if (recv_ancillary.crb_all != 0) 2615 (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE); 2616 2617 /* 2618 * BSD for some reason adjusts ipha_length to exclude the 2619 * IP header length. We do the same. 
2620 */ 2621 if (is->is_bsd_compat) { 2622 ushort_t len; 2623 2624 len = ntohs(ipha->ipha_length); 2625 if (mp->b_datap->db_ref > 1) { 2626 /* 2627 * Allocate a new IP header so that we can 2628 * modify ipha_length. 2629 */ 2630 mblk_t *mp1; 2631 2632 mp1 = allocb(ip_hdr_length, BPRI_MED); 2633 if (mp1 == NULL) { 2634 freemsg(mp); 2635 BUMP_MIB(&is->is_rawip_mib, 2636 rawipInErrors); 2637 return; 2638 } 2639 bcopy(rptr, mp1->b_rptr, ip_hdr_length); 2640 mp->b_rptr = rptr + ip_hdr_length; 2641 rptr = mp1->b_rptr; 2642 ipha = (ipha_t *)rptr; 2643 mp1->b_cont = mp; 2644 mp1->b_wptr = rptr + ip_hdr_length; 2645 mp = mp1; 2646 } 2647 len -= ip_hdr_length; 2648 ipha->ipha_length = htons(len); 2649 } 2650 2651 /* 2652 * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6 2653 * sockets. This is ensured by icmp_bind and the IP fanout code. 2654 */ 2655 ASSERT(connp->conn_family == AF_INET); 2656 2657 /* 2658 * This is the inbound data path. Packets are passed upstream 2659 * as T_UNITDATA_IND messages with full IPv4 headers still 2660 * attached. 2661 */ 2662 2663 /* 2664 * Normally only send up the source address. 2665 * If any ancillary data items are wanted we add those. 2666 */ 2667 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); 2668 if (recv_ancillary.crb_all != 0) { 2669 udi_size += conn_recvancillary_size(connp, 2670 recv_ancillary, ira, mp, &ipps); 2671 } 2672 2673 /* Allocate a message block for the T_UNITDATA_IND structure. 
*/ 2674 mp1 = allocb(udi_size, BPRI_MED); 2675 if (mp1 == NULL) { 2676 freemsg(mp); 2677 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2678 return; 2679 } 2680 mp1->b_cont = mp; 2681 tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2682 mp1->b_datap->db_type = M_PROTO; 2683 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2684 tudi->PRIM_type = T_UNITDATA_IND; 2685 tudi->SRC_length = sizeof (sin_t); 2686 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2687 sin = (sin_t *)&tudi[1]; 2688 *sin = sin_null; 2689 sin->sin_family = AF_INET; 2690 sin->sin_addr.s_addr = ipha->ipha_src; 2691 *(uint32_t *)&sin->sin_zero[0] = 0; 2692 *(uint32_t *)&sin->sin_zero[4] = 0; 2693 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + 2694 sizeof (sin_t); 2695 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t)); 2696 tudi->OPT_length = udi_size; 2697 2698 /* 2699 * Add options if IP_RECVIF etc is set 2700 */ 2701 if (udi_size != 0) { 2702 conn_recvancillary_add(connp, recv_ancillary, ira, 2703 &ipps, (uchar_t *)&sin[1], udi_size); 2704 } 2705 goto deliver; 2706 } 2707 2708 ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION); 2709 /* 2710 * IPv6 packets can only be received by applications 2711 * that are prepared to receive IPv6 addresses. 2712 * The IP fanout must ensure this. 2713 */ 2714 ASSERT(connp->conn_family == AF_INET6); 2715 2716 /* 2717 * Handle IPv6 packets. We don't pass up the IP headers with the 2718 * payload for IPv6. 2719 */ 2720 2721 ip6h = (ip6_t *)rptr; 2722 if (recv_ancillary.crb_all != 0) { 2723 /* 2724 * Call on ip_find_hdr_v6 which gets individual lenghts of 2725 * extension headers (and pointers to them). 2726 */ 2727 uint8_t nexthdr; 2728 2729 /* We don't care about the length or nextheader. */ 2730 (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr); 2731 2732 /* 2733 * We do not pass up hop-by-hop options or any other 2734 * extension header as part of the packet. Applications 2735 * that want to see them have to specify IPV6_RECV* socket 2736 * options. 
And conn_recvancillary_size/add explicitly 2737 * drops the TX option from IPV6_HOPOPTS as it does for UDP. 2738 * 2739 * If we had multilevel ICMP sockets, then we'd want to 2740 * modify conn_recvancillary_size/add to 2741 * allow the user to see the label. 2742 */ 2743 } 2744 2745 /* 2746 * Check a filter for ICMPv6 types if needed. 2747 * Verify raw checksums if needed. 2748 */ 2749 mutex_enter(&connp->conn_lock); 2750 if (icmp->icmp_filter != NULL) { 2751 int type; 2752 2753 /* Assumes that IP has done the pullupmsg */ 2754 type = mp->b_rptr[ip_hdr_length]; 2755 2756 ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr); 2757 if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) { 2758 mutex_exit(&connp->conn_lock); 2759 freemsg(mp); 2760 return; 2761 } 2762 } 2763 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2764 /* Checksum */ 2765 uint16_t *up; 2766 uint32_t sum; 2767 int remlen; 2768 2769 up = (uint16_t *)&ip6h->ip6_src; 2770 2771 remlen = msgdsize(mp) - ip_hdr_length; 2772 sum = htons(connp->conn_proto + remlen) 2773 + up[0] + up[1] + up[2] + up[3] 2774 + up[4] + up[5] + up[6] + up[7] 2775 + up[8] + up[9] + up[10] + up[11] 2776 + up[12] + up[13] + up[14] + up[15]; 2777 sum = (sum & 0xffff) + (sum >> 16); 2778 sum = IP_CSUM(mp, ip_hdr_length, sum); 2779 if (sum != 0) { 2780 /* IPv6 RAW checksum failed */ 2781 ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum)); 2782 mutex_exit(&connp->conn_lock); 2783 freemsg(mp); 2784 BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs); 2785 return; 2786 } 2787 } 2788 mutex_exit(&connp->conn_lock); 2789 2790 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2791 2792 if (recv_ancillary.crb_all != 0) { 2793 udi_size += conn_recvancillary_size(connp, 2794 recv_ancillary, ira, mp, &ipps); 2795 } 2796 2797 mp1 = allocb(udi_size, BPRI_MED); 2798 if (mp1 == NULL) { 2799 freemsg(mp); 2800 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2801 return; 2802 } 2803 mp1->b_cont = mp; 2804 mp1->b_datap->db_type = M_PROTO; 2805 
tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2806 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2807 tudi->PRIM_type = T_UNITDATA_IND; 2808 tudi->SRC_length = sizeof (sin6_t); 2809 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2810 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2811 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t)); 2812 tudi->OPT_length = udi_size; 2813 sin6 = (sin6_t *)&tudi[1]; 2814 *sin6 = sin6_null; 2815 sin6->sin6_port = 0; 2816 sin6->sin6_family = AF_INET6; 2817 2818 sin6->sin6_addr = ip6h->ip6_src; 2819 /* No sin6_flowinfo per API */ 2820 sin6->sin6_flowinfo = 0; 2821 /* For link-scope pass up scope id */ 2822 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) 2823 sin6->sin6_scope_id = ira->ira_ruifindex; 2824 else 2825 sin6->sin6_scope_id = 0; 2826 sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, 2827 IPCL_ZONEID(connp), is->is_netstack); 2828 2829 if (udi_size != 0) { 2830 conn_recvancillary_add(connp, recv_ancillary, ira, 2831 &ipps, (uchar_t *)&sin6[1], udi_size); 2832 } 2833 2834 /* Skip all the IPv6 headers per API */ 2835 mp->b_rptr += ip_hdr_length; 2836 pkt_len -= ip_hdr_length; 2837 2838 deliver: 2839 BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); 2840 icmp_ulp_recv(connp, mp1, pkt_len); 2841 } 2842 2843 /* 2844 * return SNMP stuff in buffer in mpdata. We don't hold any lock and report 2845 * information that can be changing beneath us. 
2846 */ 2847 mblk_t * 2848 icmp_snmp_get(queue_t *q, mblk_t *mpctl) 2849 { 2850 mblk_t *mpdata; 2851 struct opthdr *optp; 2852 conn_t *connp = Q_TO_CONN(q); 2853 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2854 mblk_t *mp2ctl; 2855 2856 /* 2857 * make a copy of the original message 2858 */ 2859 mp2ctl = copymsg(mpctl); 2860 2861 if (mpctl == NULL || 2862 (mpdata = mpctl->b_cont) == NULL) { 2863 freemsg(mpctl); 2864 freemsg(mp2ctl); 2865 return (0); 2866 } 2867 2868 /* fixed length structure for IPv4 and IPv6 counters */ 2869 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 2870 optp->level = EXPER_RAWIP; 2871 optp->name = 0; 2872 (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib, 2873 sizeof (is->is_rawip_mib)); 2874 optp->len = msgdsize(mpdata); 2875 qreply(q, mpctl); 2876 2877 return (mp2ctl); 2878 } 2879 2880 /* 2881 * Return 0 if invalid set request, 1 otherwise, including non-rawip requests. 2882 * TODO: If this ever actually tries to set anything, it needs to be 2883 * to do the appropriate locking. 2884 */ 2885 /* ARGSUSED */ 2886 int 2887 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 2888 uchar_t *ptr, int len) 2889 { 2890 switch (level) { 2891 case EXPER_RAWIP: 2892 return (0); 2893 default: 2894 return (1); 2895 } 2896 } 2897 2898 /* 2899 * This routine creates a T_UDERROR_IND message and passes it upstream. 2900 * The address and options are copied from the T_UNITDATA_REQ message 2901 * passed in mp. This message is freed. 
2902 */ 2903 static void 2904 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) 2905 { 2906 struct T_unitdata_req *tudr; 2907 mblk_t *mp1; 2908 uchar_t *destaddr; 2909 t_scalar_t destlen; 2910 uchar_t *optaddr; 2911 t_scalar_t optlen; 2912 2913 if ((mp->b_wptr < mp->b_rptr) || 2914 (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { 2915 goto done; 2916 } 2917 tudr = (struct T_unitdata_req *)mp->b_rptr; 2918 destaddr = mp->b_rptr + tudr->DEST_offset; 2919 if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || 2920 destaddr + tudr->DEST_length < mp->b_rptr || 2921 destaddr + tudr->DEST_length > mp->b_wptr) { 2922 goto done; 2923 } 2924 optaddr = mp->b_rptr + tudr->OPT_offset; 2925 if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || 2926 optaddr + tudr->OPT_length < mp->b_rptr || 2927 optaddr + tudr->OPT_length > mp->b_wptr) { 2928 goto done; 2929 } 2930 destlen = tudr->DEST_length; 2931 optlen = tudr->OPT_length; 2932 2933 mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, 2934 (char *)optaddr, optlen, err); 2935 if (mp1 != NULL) 2936 qreply(q, mp1); 2937 2938 done: 2939 freemsg(mp); 2940 } 2941 2942 static int 2943 rawip_do_unbind(conn_t *connp) 2944 { 2945 icmp_t *icmp = connp->conn_icmp; 2946 2947 mutex_enter(&connp->conn_lock); 2948 /* If a bind has not been done, we can't unbind. 
*/ 2949 if (icmp->icmp_state == TS_UNBND) { 2950 mutex_exit(&connp->conn_lock); 2951 return (-TOUTSTATE); 2952 } 2953 connp->conn_saddr_v6 = ipv6_all_zeros; 2954 connp->conn_bound_addr_v6 = ipv6_all_zeros; 2955 connp->conn_laddr_v6 = ipv6_all_zeros; 2956 connp->conn_mcbc_bind = B_FALSE; 2957 connp->conn_lport = 0; 2958 connp->conn_fport = 0; 2959 /* In case we were also connected */ 2960 connp->conn_faddr_v6 = ipv6_all_zeros; 2961 connp->conn_v6lastdst = ipv6_all_zeros; 2962 2963 icmp->icmp_state = TS_UNBND; 2964 2965 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 2966 &connp->conn_faddr_v6, connp->conn_flowinfo); 2967 mutex_exit(&connp->conn_lock); 2968 2969 ip_unbind(connp); 2970 return (0); 2971 } 2972 2973 /* 2974 * This routine is called by icmp_wput to handle T_UNBIND_REQ messages. 2975 * After some error checking, the message is passed downstream to ip. 2976 */ 2977 static void 2978 icmp_tpi_unbind(queue_t *q, mblk_t *mp) 2979 { 2980 conn_t *connp = Q_TO_CONN(q); 2981 int error; 2982 2983 ASSERT(mp->b_cont == NULL); 2984 error = rawip_do_unbind(connp); 2985 if (error) { 2986 if (error < 0) { 2987 icmp_err_ack(q, mp, -error, 0); 2988 } else { 2989 icmp_err_ack(q, mp, 0, error); 2990 } 2991 return; 2992 } 2993 2994 /* 2995 * Convert mp into a T_OK_ACK 2996 */ 2997 2998 mp = mi_tpi_ok_ack_alloc(mp); 2999 3000 /* 3001 * should not happen in practice... T_OK_ACK is smaller than the 3002 * original message. 3003 */ 3004 ASSERT(mp != NULL); 3005 ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); 3006 qreply(q, mp); 3007 } 3008 3009 /* 3010 * Process IPv4 packets that already include an IP header. 3011 * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and 3012 * IPPROTO_IGMP). 3013 * In this case we ignore the address and any options in the T_UNITDATA_REQ. 3014 * 3015 * The packet is assumed to have a base (20 byte) IP header followed 3016 * by the upper-layer protocol. 
We include any IP_OPTIONS including a 3017 * CIPSO label but otherwise preserve the base IP header. 3018 */ 3019 static int 3020 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3021 { 3022 icmp_t *icmp = connp->conn_icmp; 3023 icmp_stack_t *is = icmp->icmp_is; 3024 ipha_t iphas; 3025 ipha_t *ipha; 3026 int ip_hdr_length; 3027 int tp_hdr_len; 3028 ip_xmit_attr_t *ixa; 3029 ip_pkt_t *ipp; 3030 in6_addr_t v6src; 3031 in6_addr_t v6dst; 3032 in6_addr_t v6nexthop; 3033 int error; 3034 boolean_t do_ipsec; 3035 3036 /* 3037 * We need an exclusive copy of conn_ixa since the included IP 3038 * header could have any destination. 3039 * That copy has no pointers hence we 3040 * need to set them up once we've parsed the ancillary data. 3041 */ 3042 ixa = conn_get_ixa_exclusive(connp); 3043 if (ixa == NULL) { 3044 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3045 freemsg(mp); 3046 return (ENOMEM); 3047 } 3048 ASSERT(cr != NULL); 3049 /* 3050 * Caller has a reference on cr; from db_credp or because we 3051 * are running in process context. 
3052 */ 3053 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3054 ixa->ixa_cred = cr; 3055 ixa->ixa_cpid = pid; 3056 if (is_system_labeled()) { 3057 /* We need to restart with a label based on the cred */ 3058 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3059 } 3060 3061 /* In case previous destination was multicast or multirt */ 3062 ip_attr_newdst(ixa); 3063 3064 /* Get a copy of conn_xmit_ipp since the TX label might change it */ 3065 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3066 if (ipp == NULL) { 3067 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3068 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3069 ixa->ixa_cpid = connp->conn_cpid; 3070 ixa_refrele(ixa); 3071 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3072 freemsg(mp); 3073 return (ENOMEM); 3074 } 3075 mutex_enter(&connp->conn_lock); 3076 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3077 mutex_exit(&connp->conn_lock); 3078 if (error != 0) { 3079 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3080 freemsg(mp); 3081 goto done; 3082 } 3083 3084 /* Sanity check length of packet */ 3085 ipha = (ipha_t *)mp->b_rptr; 3086 3087 ip_hdr_length = IP_SIMPLE_HDR_LENGTH; 3088 if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) { 3089 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 3090 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3091 freemsg(mp); 3092 goto done; 3093 } 3094 ipha = (ipha_t *)mp->b_rptr; 3095 } 3096 ipha->ipha_version_and_hdr_length = 3097 (IP_VERSION<<4) | (ip_hdr_length>>2); 3098 3099 /* 3100 * We set IXAF_DONTFRAG if the application set DF which makes 3101 * IP not fragment. 
3102 */ 3103 ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF); 3104 if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF)) 3105 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3106 else 3107 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3108 3109 /* Even for multicast and broadcast we honor the apps ttl */ 3110 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 3111 3112 /* 3113 * No source verification for non-local addresses 3114 */ 3115 if (ipha->ipha_src != INADDR_ANY && 3116 ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 3117 is->is_netstack->netstack_ip, B_FALSE) 3118 != IPVL_UNICAST_UP) { 3119 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3120 } 3121 3122 if (ipha->ipha_dst == INADDR_ANY) 3123 ipha->ipha_dst = htonl(INADDR_LOOPBACK); 3124 3125 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 3126 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 3127 3128 /* Defer IPsec if it might need to look at ICMP type/code */ 3129 do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP; 3130 ixa->ixa_flags |= IXAF_IS_IPV4; 3131 3132 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3133 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, 3134 connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3135 (do_ipsec ? IPDF_IPSEC : 0)); 3136 switch (error) { 3137 case 0: 3138 break; 3139 case EADDRNOTAVAIL: 3140 /* 3141 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3142 * Don't have the application see that errno 3143 */ 3144 error = ENETUNREACH; 3145 goto failed; 3146 case ENETDOWN: 3147 /* 3148 * Have !ipif_addr_ready address; drop packet silently 3149 * until we can get applications to not send until we 3150 * are ready. 3151 */ 3152 error = 0; 3153 goto failed; 3154 case EHOSTUNREACH: 3155 case ENETUNREACH: 3156 if (ixa->ixa_ire != NULL) { 3157 /* 3158 * Let conn_ip_output/ire_send_noroute return 3159 * the error and send any local ICMP error. 
3160 */ 3161 error = 0; 3162 break; 3163 } 3164 /* FALLTHRU */ 3165 default: 3166 failed: 3167 freemsg(mp); 3168 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3169 goto done; 3170 } 3171 if (ipha->ipha_src == INADDR_ANY) 3172 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); 3173 3174 /* 3175 * We might be going to a different destination than last time, 3176 * thus check that TX allows the communication and compute any 3177 * needed label. 3178 * 3179 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3180 * don't have to worry about concurrent threads. 3181 */ 3182 if (is_system_labeled()) { 3183 /* 3184 * Check whether Trusted Solaris policy allows communication 3185 * with this host, and pretend that the destination is 3186 * unreachable if not. 3187 * Compute any needed label and place it in ipp_label_v4/v6. 3188 * 3189 * Later conn_build_hdr_template/conn_prepend_hdr takes 3190 * ipp_label_v4/v6 to form the packet. 3191 * 3192 * Tsol note: We have ipp structure local to this thread so 3193 * no locking is needed. 3194 */ 3195 error = conn_update_label(connp, ixa, &v6dst, ipp); 3196 if (error != 0) { 3197 freemsg(mp); 3198 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3199 goto done; 3200 } 3201 } 3202 3203 /* 3204 * Save away a copy of the IPv4 header the application passed down 3205 * and then prepend an IPv4 header complete with any IP options 3206 * including label. 3207 * We need a struct copy since icmp_prepend_hdr will reuse the available 3208 * space in the mblk. 
3209 */ 3210 iphas = *ipha; 3211 mp->b_rptr += IP_SIMPLE_HDR_LENGTH; 3212 3213 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error); 3214 if (mp == NULL) { 3215 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3216 ASSERT(error != 0); 3217 goto done; 3218 } 3219 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3220 error = EMSGSIZE; 3221 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3222 freemsg(mp); 3223 goto done; 3224 } 3225 /* Restore key parts of the header that the application passed down */ 3226 ipha = (ipha_t *)mp->b_rptr; 3227 ipha->ipha_type_of_service = iphas.ipha_type_of_service; 3228 ipha->ipha_ident = iphas.ipha_ident; 3229 ipha->ipha_fragment_offset_and_flags = 3230 iphas.ipha_fragment_offset_and_flags; 3231 ipha->ipha_ttl = iphas.ipha_ttl; 3232 ipha->ipha_protocol = iphas.ipha_protocol; 3233 ipha->ipha_src = iphas.ipha_src; 3234 ipha->ipha_dst = iphas.ipha_dst; 3235 3236 ixa->ixa_protocol = ipha->ipha_protocol; 3237 3238 /* 3239 * Make sure that the IP header plus any transport header that is 3240 * checksumed by ip_output is in the first mblk. (ip_output assumes 3241 * that at least the checksum field is in the first mblk.) 
3242 */ 3243 switch (ipha->ipha_protocol) { 3244 case IPPROTO_UDP: 3245 tp_hdr_len = 8; 3246 break; 3247 case IPPROTO_TCP: 3248 tp_hdr_len = 20; 3249 break; 3250 default: 3251 tp_hdr_len = 0; 3252 break; 3253 } 3254 ip_hdr_length = IPH_HDR_LENGTH(ipha); 3255 if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) { 3256 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) { 3257 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3258 if (mp->b_cont == NULL) 3259 error = EINVAL; 3260 else 3261 error = ENOMEM; 3262 freemsg(mp); 3263 goto done; 3264 } 3265 } 3266 3267 if (!do_ipsec) { 3268 /* Policy might differ for different ICMP type/code */ 3269 if (ixa->ixa_ipsec_policy != NULL) { 3270 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3271 ixa->ixa_ipsec_policy = NULL; 3272 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3273 } 3274 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa); 3275 if (mp == NULL) { 3276 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3277 error = EHOSTUNREACH; /* IPsec policy failure */ 3278 goto done; 3279 } 3280 } 3281 3282 /* We're done. Pass the packet to ip. */ 3283 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3284 3285 error = conn_ip_output(mp, ixa); 3286 /* No rawipOutErrors if an error since IP increases its error counter */ 3287 switch (error) { 3288 case 0: 3289 break; 3290 case EWOULDBLOCK: 3291 (void) ixa_check_drain_insert(connp, ixa); 3292 error = 0; 3293 break; 3294 case EADDRNOTAVAIL: 3295 /* 3296 * IXAF_VERIFY_SOURCE tells us to pick a better source. 
3297 * Don't have the application see that errno 3298 */ 3299 error = ENETUNREACH; 3300 break; 3301 } 3302 done: 3303 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3304 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3305 ixa->ixa_cpid = connp->conn_cpid; 3306 ixa_refrele(ixa); 3307 ip_pkt_free(ipp); 3308 kmem_free(ipp, sizeof (*ipp)); 3309 return (error); 3310 } 3311 3312 static mblk_t * 3313 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa) 3314 { 3315 ipha_t *ipha = NULL; 3316 ip6_t *ip6h = NULL; 3317 3318 if (ixa->ixa_flags & IXAF_IS_IPV4) 3319 ipha = (ipha_t *)mp->b_rptr; 3320 else 3321 ip6h = (ip6_t *)mp->b_rptr; 3322 3323 if (ixa->ixa_ipsec_policy != NULL) { 3324 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3325 ixa->ixa_ipsec_policy = NULL; 3326 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3327 } 3328 return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa)); 3329 } 3330 3331 /* 3332 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 3333 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from 3334 * the TPI options, otherwise we take them from msg_control. 3335 * If both sin and sin6 is set it is a connected socket and we use conn_faddr. 3336 * Always consumes mp; never consumes tudr_mp. 3337 */ 3338 static int 3339 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, 3340 mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) 3341 { 3342 icmp_t *icmp = connp->conn_icmp; 3343 icmp_stack_t *is = icmp->icmp_is; 3344 int error; 3345 ip_xmit_attr_t *ixa; 3346 ip_pkt_t *ipp; 3347 in6_addr_t v6src; 3348 in6_addr_t v6dst; 3349 in6_addr_t v6nexthop; 3350 in_port_t dstport; 3351 uint32_t flowinfo; 3352 int is_absreq_failure = 0; 3353 conn_opt_arg_t coas, *coa; 3354 3355 ASSERT(tudr_mp != NULL || msg != NULL); 3356 3357 /* 3358 * Get ixa before checking state to handle a disconnect race. 3359 * 3360 * We need an exclusive copy of conn_ixa since the ancillary data 3361 * options might modify it. 
That copy has no pointers hence we 3362 * need to set them up once we've parsed the ancillary data. 3363 */ 3364 ixa = conn_get_ixa_exclusive(connp); 3365 if (ixa == NULL) { 3366 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3367 freemsg(mp); 3368 return (ENOMEM); 3369 } 3370 ASSERT(cr != NULL); 3371 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3372 ixa->ixa_cred = cr; 3373 ixa->ixa_cpid = pid; 3374 if (is_system_labeled()) { 3375 /* We need to restart with a label based on the cred */ 3376 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3377 } 3378 3379 /* In case previous destination was multicast or multirt */ 3380 ip_attr_newdst(ixa); 3381 3382 /* Get a copy of conn_xmit_ipp since the options might change it */ 3383 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3384 if (ipp == NULL) { 3385 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3386 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3387 ixa->ixa_cpid = connp->conn_cpid; 3388 ixa_refrele(ixa); 3389 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3390 freemsg(mp); 3391 return (ENOMEM); 3392 } 3393 mutex_enter(&connp->conn_lock); 3394 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3395 mutex_exit(&connp->conn_lock); 3396 if (error != 0) { 3397 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3398 freemsg(mp); 3399 goto done; 3400 } 3401 3402 /* 3403 * Parse the options and update ixa and ipp as a result. 
3404 */ 3405 3406 coa = &coas; 3407 coa->coa_connp = connp; 3408 coa->coa_ixa = ixa; 3409 coa->coa_ipp = ipp; 3410 coa->coa_ancillary = B_TRUE; 3411 coa->coa_changed = 0; 3412 3413 if (msg != NULL) { 3414 error = process_auxiliary_options(connp, msg->msg_control, 3415 msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr); 3416 } else { 3417 struct T_unitdata_req *tudr; 3418 3419 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; 3420 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); 3421 error = tpi_optcom_buf(connp->conn_wq, tudr_mp, 3422 &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj, 3423 coa, &is_absreq_failure); 3424 } 3425 if (error != 0) { 3426 /* 3427 * Note: No special action needed in this 3428 * module for "is_absreq_failure" 3429 */ 3430 freemsg(mp); 3431 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3432 goto done; 3433 } 3434 ASSERT(is_absreq_failure == 0); 3435 3436 mutex_enter(&connp->conn_lock); 3437 /* 3438 * If laddr is unspecified then we look at sin6_src_id. 3439 * We will give precedence to a source address set with IPV6_PKTINFO 3440 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 3441 * want ip_attr_connect to select a source (since it can fail) when 3442 * IPV6_PKTINFO is specified. 3443 * If this doesn't result in a source address then we get a source 3444 * from ip_attr_connect() below. 
3445 */ 3446 v6src = connp->conn_saddr_v6; 3447 if (sin != NULL) { 3448 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 3449 dstport = sin->sin_port; 3450 flowinfo = 0; 3451 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3452 ixa->ixa_flags |= IXAF_IS_IPV4; 3453 } else if (sin6 != NULL) { 3454 boolean_t v4mapped; 3455 uint_t srcid; 3456 3457 v6dst = sin6->sin6_addr; 3458 dstport = sin6->sin6_port; 3459 flowinfo = sin6->sin6_flowinfo; 3460 srcid = sin6->__sin6_src_id; 3461 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 3462 ixa->ixa_scopeid = sin6->sin6_scope_id; 3463 ixa->ixa_flags |= IXAF_SCOPEID_SET; 3464 } else { 3465 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3466 } 3467 v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst); 3468 if (v4mapped) 3469 ixa->ixa_flags |= IXAF_IS_IPV4; 3470 else 3471 ixa->ixa_flags &= ~IXAF_IS_IPV4; 3472 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 3473 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 3474 v4mapped, connp->conn_netstack)) { 3475 /* Mismatched v4mapped/v6 specified by srcid. */ 3476 mutex_exit(&connp->conn_lock); 3477 error = EADDRNOTAVAIL; 3478 goto failed; /* Does freemsg() and mib. */ 3479 } 3480 } 3481 } else { 3482 /* Connected case */ 3483 dstport = connp->conn_fport; 3484 v6dst = connp->conn_faddr_v6; 3485 flowinfo = connp->conn_flowinfo; 3486 } 3487 mutex_exit(&connp->conn_lock); 3488 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. 
*/ 3489 if (ipp->ipp_fields & IPPF_ADDR) { 3490 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3491 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3492 v6src = ipp->ipp_addr; 3493 } else { 3494 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3495 v6src = ipp->ipp_addr; 3496 } 3497 } 3498 /* 3499 * Allow source not assigned to the system 3500 * only if it is not a local addresses 3501 */ 3502 if (!V6_OR_V4_INADDR_ANY(v6src)) { 3503 ip_laddr_t laddr_type; 3504 3505 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3506 ipaddr_t v4src; 3507 3508 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 3509 laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid, 3510 is->is_netstack->netstack_ip, B_FALSE); 3511 } else { 3512 laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid, 3513 is->is_netstack->netstack_ip, B_FALSE, B_FALSE); 3514 } 3515 if (laddr_type != IPVL_UNICAST_UP) 3516 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3517 } 3518 3519 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3520 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 3521 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 3522 3523 switch (error) { 3524 case 0: 3525 break; 3526 case EADDRNOTAVAIL: 3527 /* 3528 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3529 * Don't have the application see that errno 3530 */ 3531 error = ENETUNREACH; 3532 goto failed; 3533 case ENETDOWN: 3534 /* 3535 * Have !ipif_addr_ready address; drop packet silently 3536 * until we can get applications to not send until we 3537 * are ready. 3538 */ 3539 error = 0; 3540 goto failed; 3541 case EHOSTUNREACH: 3542 case ENETUNREACH: 3543 if (ixa->ixa_ire != NULL) { 3544 /* 3545 * Let conn_ip_output/ire_send_noroute return 3546 * the error and send any local ICMP error. 
3547 */ 3548 error = 0; 3549 break; 3550 } 3551 /* FALLTHRU */ 3552 default: 3553 failed: 3554 freemsg(mp); 3555 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3556 goto done; 3557 } 3558 3559 /* 3560 * We might be going to a different destination than last time, 3561 * thus check that TX allows the communication and compute any 3562 * needed label. 3563 * 3564 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3565 * don't have to worry about concurrent threads. 3566 */ 3567 if (is_system_labeled()) { 3568 /* 3569 * Check whether Trusted Solaris policy allows communication 3570 * with this host, and pretend that the destination is 3571 * unreachable if not. 3572 * Compute any needed label and place it in ipp_label_v4/v6. 3573 * 3574 * Later conn_build_hdr_template/conn_prepend_hdr takes 3575 * ipp_label_v4/v6 to form the packet. 3576 * 3577 * Tsol note: We have ipp structure local to this thread so 3578 * no locking is needed. 3579 */ 3580 error = conn_update_label(connp, ixa, &v6dst, ipp); 3581 if (error != 0) { 3582 freemsg(mp); 3583 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3584 goto done; 3585 } 3586 } 3587 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp, 3588 &error); 3589 if (mp == NULL) { 3590 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3591 ASSERT(error != 0); 3592 goto done; 3593 } 3594 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3595 error = EMSGSIZE; 3596 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3597 freemsg(mp); 3598 goto done; 3599 } 3600 3601 /* Policy might differ for different ICMP type/code */ 3602 mp = icmp_output_attach_policy(mp, connp, ixa); 3603 if (mp == NULL) { 3604 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3605 error = EHOSTUNREACH; /* IPsec policy failure */ 3606 goto done; 3607 } 3608 3609 /* We're done. Pass the packet to ip. 
*/ 3610 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3611 3612 error = conn_ip_output(mp, ixa); 3613 if (!connp->conn_unspec_src) 3614 ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 3615 /* No rawipOutErrors if an error since IP increases its error counter */ 3616 switch (error) { 3617 case 0: 3618 break; 3619 case EWOULDBLOCK: 3620 (void) ixa_check_drain_insert(connp, ixa); 3621 error = 0; 3622 break; 3623 case EADDRNOTAVAIL: 3624 /* 3625 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3626 * Don't have the application see that errno 3627 */ 3628 error = ENETUNREACH; 3629 /* FALLTHRU */ 3630 default: 3631 mutex_enter(&connp->conn_lock); 3632 /* 3633 * Clear the source and v6lastdst so we call ip_attr_connect 3634 * for the next packet and try to pick a better source. 3635 */ 3636 if (connp->conn_mcbc_bind) 3637 connp->conn_saddr_v6 = ipv6_all_zeros; 3638 else 3639 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3640 connp->conn_v6lastdst = ipv6_all_zeros; 3641 mutex_exit(&connp->conn_lock); 3642 break; 3643 } 3644 done: 3645 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3646 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3647 ixa->ixa_cpid = connp->conn_cpid; 3648 ixa_refrele(ixa); 3649 ip_pkt_free(ipp); 3650 kmem_free(ipp, sizeof (*ipp)); 3651 return (error); 3652 } 3653 3654 /* 3655 * Handle sending an M_DATA for a connected socket. 3656 * Handles both IPv4 and IPv6. 3657 */ 3658 int 3659 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3660 { 3661 icmp_t *icmp = connp->conn_icmp; 3662 icmp_stack_t *is = icmp->icmp_is; 3663 int error; 3664 ip_xmit_attr_t *ixa; 3665 boolean_t do_ipsec; 3666 3667 /* 3668 * If no other thread is using conn_ixa this just gets a reference to 3669 * conn_ixa. Otherwise we get a safe copy of conn_ixa. 
3670 */ 3671 ixa = conn_get_ixa(connp, B_FALSE); 3672 if (ixa == NULL) { 3673 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3674 freemsg(mp); 3675 return (ENOMEM); 3676 } 3677 3678 ASSERT(cr != NULL); 3679 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3680 ixa->ixa_cred = cr; 3681 ixa->ixa_cpid = pid; 3682 3683 /* Defer IPsec if it might need to look at ICMP type/code */ 3684 switch (ixa->ixa_protocol) { 3685 case IPPROTO_ICMP: 3686 case IPPROTO_ICMPV6: 3687 do_ipsec = B_FALSE; 3688 break; 3689 default: 3690 do_ipsec = B_TRUE; 3691 } 3692 3693 mutex_enter(&connp->conn_lock); 3694 mp = icmp_prepend_header_template(connp, ixa, mp, 3695 &connp->conn_saddr_v6, connp->conn_flowinfo, &error); 3696 3697 if (mp == NULL) { 3698 ASSERT(error != 0); 3699 mutex_exit(&connp->conn_lock); 3700 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3701 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3702 ixa->ixa_cpid = connp->conn_cpid; 3703 ixa_refrele(ixa); 3704 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3705 freemsg(mp); 3706 return (error); 3707 } 3708 3709 if (!do_ipsec) { 3710 /* Policy might differ for different ICMP type/code */ 3711 mp = icmp_output_attach_policy(mp, connp, ixa); 3712 if (mp == NULL) { 3713 mutex_exit(&connp->conn_lock); 3714 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3715 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3716 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3717 ixa->ixa_cpid = connp->conn_cpid; 3718 ixa_refrele(ixa); 3719 return (EHOSTUNREACH); /* IPsec policy failure */ 3720 } 3721 } 3722 3723 /* 3724 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3725 * safe copy, then we need to fill in any pointers in it. 
3726 */ 3727 if (ixa->ixa_ire == NULL) { 3728 in6_addr_t faddr, saddr; 3729 in6_addr_t nexthop; 3730 in_port_t fport; 3731 3732 saddr = connp->conn_saddr_v6; 3733 faddr = connp->conn_faddr_v6; 3734 fport = connp->conn_fport; 3735 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); 3736 mutex_exit(&connp->conn_lock); 3737 3738 error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, 3739 fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3740 (do_ipsec ? IPDF_IPSEC : 0)); 3741 switch (error) { 3742 case 0: 3743 break; 3744 case EADDRNOTAVAIL: 3745 /* 3746 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3747 * Don't have the application see that errno 3748 */ 3749 error = ENETUNREACH; 3750 goto failed; 3751 case ENETDOWN: 3752 /* 3753 * Have !ipif_addr_ready address; drop packet silently 3754 * until we can get applications to not send until we 3755 * are ready. 3756 */ 3757 error = 0; 3758 goto failed; 3759 case EHOSTUNREACH: 3760 case ENETUNREACH: 3761 if (ixa->ixa_ire != NULL) { 3762 /* 3763 * Let conn_ip_output/ire_send_noroute return 3764 * the error and send any local ICMP error. 3765 */ 3766 error = 0; 3767 break; 3768 } 3769 /* FALLTHRU */ 3770 default: 3771 failed: 3772 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3773 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3774 ixa->ixa_cpid = connp->conn_cpid; 3775 ixa_refrele(ixa); 3776 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3777 freemsg(mp); 3778 return (error); 3779 } 3780 } else { 3781 /* Done with conn_t */ 3782 mutex_exit(&connp->conn_lock); 3783 } 3784 3785 /* We're done. Pass the packet to ip. 
*/ 3786 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3787 3788 error = conn_ip_output(mp, ixa); 3789 /* No rawipOutErrors if an error since IP increases its error counter */ 3790 switch (error) { 3791 case 0: 3792 break; 3793 case EWOULDBLOCK: 3794 (void) ixa_check_drain_insert(connp, ixa); 3795 error = 0; 3796 break; 3797 case EADDRNOTAVAIL: 3798 /* 3799 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3800 * Don't have the application see that errno 3801 */ 3802 error = ENETUNREACH; 3803 break; 3804 } 3805 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3806 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3807 ixa->ixa_cpid = connp->conn_cpid; 3808 ixa_refrele(ixa); 3809 return (error); 3810 } 3811 3812 /* 3813 * Handle sending an M_DATA to the last destination. 3814 * Handles both IPv4 and IPv6. 3815 * 3816 * NOTE: The caller must hold conn_lock and we drop it here. 3817 */ 3818 int 3819 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, 3820 ip_xmit_attr_t *ixa) 3821 { 3822 icmp_t *icmp = connp->conn_icmp; 3823 icmp_stack_t *is = icmp->icmp_is; 3824 int error; 3825 boolean_t do_ipsec; 3826 3827 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3828 ASSERT(ixa != NULL); 3829 3830 ASSERT(cr != NULL); 3831 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3832 ixa->ixa_cred = cr; 3833 ixa->ixa_cpid = pid; 3834 3835 /* Defer IPsec if it might need to look at ICMP type/code */ 3836 switch (ixa->ixa_protocol) { 3837 case IPPROTO_ICMP: 3838 case IPPROTO_ICMPV6: 3839 do_ipsec = B_FALSE; 3840 break; 3841 default: 3842 do_ipsec = B_TRUE; 3843 } 3844 3845 3846 mp = icmp_prepend_header_template(connp, ixa, mp, 3847 &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error); 3848 3849 if (mp == NULL) { 3850 ASSERT(error != 0); 3851 mutex_exit(&connp->conn_lock); 3852 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3853 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3854 ixa->ixa_cpid = connp->conn_cpid; 3855 ixa_refrele(ixa); 3856 BUMP_MIB(&is->is_rawip_mib, 
rawipOutErrors); 3857 freemsg(mp); 3858 return (error); 3859 } 3860 3861 if (!do_ipsec) { 3862 /* Policy might differ for different ICMP type/code */ 3863 mp = icmp_output_attach_policy(mp, connp, ixa); 3864 if (mp == NULL) { 3865 mutex_exit(&connp->conn_lock); 3866 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3867 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3868 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3869 ixa->ixa_cpid = connp->conn_cpid; 3870 ixa_refrele(ixa); 3871 return (EHOSTUNREACH); /* IPsec policy failure */ 3872 } 3873 } 3874 3875 /* 3876 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3877 * safe copy, then we need to fill in any pointers in it. 3878 */ 3879 if (ixa->ixa_ire == NULL) { 3880 in6_addr_t lastdst, lastsrc; 3881 in6_addr_t nexthop; 3882 in_port_t lastport; 3883 3884 lastsrc = connp->conn_v6lastsrc; 3885 lastdst = connp->conn_v6lastdst; 3886 lastport = connp->conn_lastdstport; 3887 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); 3888 mutex_exit(&connp->conn_lock); 3889 3890 error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, 3891 &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | 3892 IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0)); 3893 switch (error) { 3894 case 0: 3895 break; 3896 case EADDRNOTAVAIL: 3897 /* 3898 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3899 * Don't have the application see that errno 3900 */ 3901 error = ENETUNREACH; 3902 goto failed; 3903 case ENETDOWN: 3904 /* 3905 * Have !ipif_addr_ready address; drop packet silently 3906 * until we can get applications to not send until we 3907 * are ready. 3908 */ 3909 error = 0; 3910 goto failed; 3911 case EHOSTUNREACH: 3912 case ENETUNREACH: 3913 if (ixa->ixa_ire != NULL) { 3914 /* 3915 * Let conn_ip_output/ire_send_noroute return 3916 * the error and send any local ICMP error. 
3917 */ 3918 error = 0; 3919 break; 3920 } 3921 /* FALLTHRU */ 3922 default: 3923 failed: 3924 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3925 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3926 ixa->ixa_cpid = connp->conn_cpid; 3927 ixa_refrele(ixa); 3928 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3929 freemsg(mp); 3930 return (error); 3931 } 3932 } else { 3933 /* Done with conn_t */ 3934 mutex_exit(&connp->conn_lock); 3935 } 3936 3937 /* We're done. Pass the packet to ip. */ 3938 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3939 error = conn_ip_output(mp, ixa); 3940 /* No rawipOutErrors if an error since IP increases its error counter */ 3941 switch (error) { 3942 case 0: 3943 break; 3944 case EWOULDBLOCK: 3945 (void) ixa_check_drain_insert(connp, ixa); 3946 error = 0; 3947 break; 3948 case EADDRNOTAVAIL: 3949 /* 3950 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3951 * Don't have the application see that errno 3952 */ 3953 error = ENETUNREACH; 3954 /* FALLTHRU */ 3955 default: 3956 mutex_enter(&connp->conn_lock); 3957 /* 3958 * Clear the source and v6lastdst so we call ip_attr_connect 3959 * for the next packet and try to pick a better source. 3960 */ 3961 if (connp->conn_mcbc_bind) 3962 connp->conn_saddr_v6 = ipv6_all_zeros; 3963 else 3964 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3965 connp->conn_v6lastdst = ipv6_all_zeros; 3966 mutex_exit(&connp->conn_lock); 3967 break; 3968 } 3969 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3970 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3971 ixa->ixa_cpid = connp->conn_cpid; 3972 ixa_refrele(ixa); 3973 return (error); 3974 } 3975 3976 3977 /* 3978 * Prepend the header template and then fill in the source and 3979 * flowinfo. The caller needs to handle the destination address since 3980 * it's setting is different if rthdr or source route. 3981 * 3982 * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET. 3983 * When it returns NULL it sets errorp. 
3984 */ 3985 static mblk_t * 3986 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, 3987 const in6_addr_t *v6src, uint32_t flowinfo, int *errorp) 3988 { 3989 icmp_t *icmp = connp->conn_icmp; 3990 icmp_stack_t *is = icmp->icmp_is; 3991 uint_t pktlen; 3992 uint_t copylen; 3993 uint8_t *iph; 3994 uint_t ip_hdr_length; 3995 uint32_t cksum; 3996 ip_pkt_t *ipp; 3997 3998 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3999 4000 /* 4001 * Copy the header template. 4002 */ 4003 copylen = connp->conn_ht_iphc_len; 4004 pktlen = copylen + msgdsize(mp); 4005 if (pktlen > IP_MAXPACKET) { 4006 freemsg(mp); 4007 *errorp = EMSGSIZE; 4008 return (NULL); 4009 } 4010 ixa->ixa_pktlen = pktlen; 4011 4012 /* check/fix buffer config, setup pointers into it */ 4013 iph = mp->b_rptr - copylen; 4014 if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) { 4015 mblk_t *mp1; 4016 4017 mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED); 4018 if (mp1 == NULL) { 4019 freemsg(mp); 4020 *errorp = ENOMEM; 4021 return (NULL); 4022 } 4023 mp1->b_wptr = DB_LIM(mp1); 4024 mp1->b_cont = mp; 4025 mp = mp1; 4026 iph = (mp->b_wptr - copylen); 4027 } 4028 mp->b_rptr = iph; 4029 bcopy(connp->conn_ht_iphc, iph, copylen); 4030 ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc); 4031 4032 ixa->ixa_ip_hdr_length = ip_hdr_length; 4033 4034 /* 4035 * Prepare for ICMPv6 checksum done in IP. 4036 * 4037 * icmp_build_hdr_template has already massaged any routing header 4038 * and placed the result in conn_sum. 4039 * 4040 * We make it easy for IP to include our pseudo header 4041 * by putting our length (and any routing header adjustment) 4042 * in the ICMPv6 checksum field. 
4043 */ 4044 cksum = pktlen - ip_hdr_length; 4045 4046 cksum += connp->conn_sum; 4047 cksum = (cksum >> 16) + (cksum & 0xFFFF); 4048 ASSERT(cksum < 0x10000); 4049 4050 ipp = &connp->conn_xmit_ipp; 4051 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4052 ipha_t *ipha = (ipha_t *)iph; 4053 4054 ipha->ipha_length = htons((uint16_t)pktlen); 4055 4056 /* if IP_PKTINFO specified an addres it wins over bind() */ 4057 if ((ipp->ipp_fields & IPPF_ADDR) && 4058 IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4059 ASSERT(ipp->ipp_addr_v4 != INADDR_ANY); 4060 ipha->ipha_src = ipp->ipp_addr_v4; 4061 } else { 4062 IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); 4063 } 4064 } else { 4065 ip6_t *ip6h = (ip6_t *)iph; 4066 uint_t cksum_offset = 0; 4067 4068 ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN)); 4069 4070 /* if IP_PKTINFO specified an addres it wins over bind() */ 4071 if ((ipp->ipp_fields & IPPF_ADDR) && 4072 !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4073 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)); 4074 ip6h->ip6_src = ipp->ipp_addr; 4075 } else { 4076 ip6h->ip6_src = *v6src; 4077 } 4078 ip6h->ip6_vcf = 4079 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | 4080 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); 4081 if (ipp->ipp_fields & IPPF_TCLASS) { 4082 /* Overrides the class part of flowinfo */ 4083 ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, 4084 ipp->ipp_tclass); 4085 } 4086 4087 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 4088 if (connp->conn_proto == IPPROTO_ICMPV6) { 4089 cksum_offset = ixa->ixa_ip_hdr_length + 4090 offsetof(icmp6_t, icmp6_cksum); 4091 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 4092 cksum_offset = ixa->ixa_ip_hdr_length + 4093 ixa->ixa_raw_cksum_offset; 4094 } 4095 } 4096 if (cksum_offset != 0) { 4097 uint16_t *ptr; 4098 4099 /* Make sure the checksum fits in the first mblk */ 4100 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 4101 mblk_t *mp1; 4102 4103 mp1 = msgpullup(mp, 4104 cksum_offset + sizeof (short)); 4105 freemsg(mp); 4106 if (mp1 
== NULL) { 4107 *errorp = ENOMEM; 4108 return (NULL); 4109 } 4110 mp = mp1; 4111 iph = mp->b_rptr; 4112 ip6h = (ip6_t *)iph; 4113 } 4114 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 4115 *ptr = htons(cksum); 4116 } 4117 } 4118 4119 return (mp); 4120 } 4121 4122 /* 4123 * This routine handles all messages passed downstream. It either 4124 * consumes the message or passes it downstream; it never queues a 4125 * a message. 4126 */ 4127 int 4128 icmp_wput(queue_t *q, mblk_t *mp) 4129 { 4130 sin6_t *sin6; 4131 sin_t *sin = NULL; 4132 uint_t srcid; 4133 conn_t *connp = Q_TO_CONN(q); 4134 icmp_t *icmp = connp->conn_icmp; 4135 int error = 0; 4136 struct sockaddr *addr = NULL; 4137 socklen_t addrlen; 4138 icmp_stack_t *is = icmp->icmp_is; 4139 struct T_unitdata_req *tudr; 4140 mblk_t *data_mp; 4141 cred_t *cr; 4142 pid_t pid; 4143 4144 /* 4145 * We directly handle several cases here: T_UNITDATA_REQ message 4146 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected 4147 * socket. 4148 */ 4149 switch (DB_TYPE(mp)) { 4150 case M_DATA: 4151 /* sockfs never sends down M_DATA */ 4152 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4153 freemsg(mp); 4154 return (0); 4155 4156 case M_PROTO: 4157 case M_PCPROTO: 4158 tudr = (struct T_unitdata_req *)mp->b_rptr; 4159 if (MBLKL(mp) < sizeof (*tudr) || 4160 ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) { 4161 icmp_wput_other(q, mp); 4162 return (0); 4163 } 4164 break; 4165 4166 default: 4167 icmp_wput_other(q, mp); 4168 return (0); 4169 } 4170 4171 /* Handle valid T_UNITDATA_REQ here */ 4172 data_mp = mp->b_cont; 4173 if (data_mp == NULL) { 4174 error = EPROTO; 4175 goto ud_error2; 4176 } 4177 mp->b_cont = NULL; 4178 4179 if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) { 4180 error = EADDRNOTAVAIL; 4181 goto ud_error2; 4182 } 4183 4184 /* 4185 * All Solaris components should pass a db_credp 4186 * for this message, hence we ASSERT. 
4187 * On production kernels we return an error to be robust against 4188 * random streams modules sitting on top of us. 4189 */ 4190 cr = msg_getcred(mp, &pid); 4191 ASSERT(cr != NULL); 4192 if (cr == NULL) { 4193 error = EINVAL; 4194 goto ud_error2; 4195 } 4196 4197 /* 4198 * If a port has not been bound to the stream, fail. 4199 * This is not a problem when sockfs is directly 4200 * above us, because it will ensure that the socket 4201 * is first bound before allowing data to be sent. 4202 */ 4203 if (icmp->icmp_state == TS_UNBND) { 4204 error = EPROTO; 4205 goto ud_error2; 4206 } 4207 addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset]; 4208 addrlen = tudr->DEST_length; 4209 4210 switch (connp->conn_family) { 4211 case AF_INET6: 4212 sin6 = (sin6_t *)addr; 4213 if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || 4214 (sin6->sin6_family != AF_INET6)) { 4215 error = EADDRNOTAVAIL; 4216 goto ud_error2; 4217 } 4218 4219 /* No support for mapped addresses on raw sockets */ 4220 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 4221 error = EADDRNOTAVAIL; 4222 goto ud_error2; 4223 } 4224 srcid = sin6->__sin6_src_id; 4225 4226 /* 4227 * If the local address is a mapped address return 4228 * an error. 4229 * It would be possible to send an IPv6 packet but the 4230 * response would never make it back to the application 4231 * since it is bound to a mapped address. 4232 */ 4233 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 4234 error = EADDRNOTAVAIL; 4235 goto ud_error2; 4236 } 4237 4238 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 4239 sin6->sin6_addr = ipv6_loopback; 4240 4241 if (tudr->OPT_length != 0) { 4242 /* 4243 * If we are connected then the destination needs to be 4244 * the same as the connected one. 
4245 */ 4246 if (icmp->icmp_state == TS_DATA_XFER && 4247 !conn_same_as_last_v6(connp, sin6)) { 4248 error = EISCONN; 4249 goto ud_error2; 4250 } 4251 error = icmp_output_ancillary(connp, NULL, sin6, 4252 data_mp, mp, NULL, cr, pid); 4253 } else { 4254 ip_xmit_attr_t *ixa; 4255 4256 /* 4257 * We have to allocate an ip_xmit_attr_t before we grab 4258 * conn_lock and we need to hold conn_lock once we've 4259 * checked conn_same_as_last_v6 to handle concurrent 4260 * send* calls on a socket. 4261 */ 4262 ixa = conn_get_ixa(connp, B_FALSE); 4263 if (ixa == NULL) { 4264 error = ENOMEM; 4265 goto ud_error2; 4266 } 4267 mutex_enter(&connp->conn_lock); 4268 4269 if (conn_same_as_last_v6(connp, sin6) && 4270 connp->conn_lastsrcid == srcid && 4271 ipsec_outbound_policy_current(ixa)) { 4272 /* icmp_output_lastdst drops conn_lock */ 4273 error = icmp_output_lastdst(connp, data_mp, cr, 4274 pid, ixa); 4275 } else { 4276 /* icmp_output_newdst drops conn_lock */ 4277 error = icmp_output_newdst(connp, data_mp, NULL, 4278 sin6, cr, pid, ixa); 4279 } 4280 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4281 } 4282 if (error == 0) { 4283 freeb(mp); 4284 return (0); 4285 } 4286 break; 4287 4288 case AF_INET: 4289 sin = (sin_t *)addr; 4290 if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || 4291 (sin->sin_family != AF_INET)) { 4292 error = EADDRNOTAVAIL; 4293 goto ud_error2; 4294 } 4295 if (sin->sin_addr.s_addr == INADDR_ANY) 4296 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 4297 4298 /* Protocol 255 contains full IP headers */ 4299 /* Read without holding lock */ 4300 if (icmp->icmp_hdrincl) { 4301 if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) { 4302 if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) { 4303 error = EINVAL; 4304 goto ud_error2; 4305 } 4306 } 4307 error = icmp_output_hdrincl(connp, data_mp, cr, pid); 4308 if (error == 0) { 4309 freeb(mp); 4310 return (0); 4311 } 4312 /* data_mp consumed above */ 4313 data_mp = NULL; 4314 goto ud_error2; 4315 } 4316 4317 if 
(tudr->OPT_length != 0) { 4318 /* 4319 * If we are connected then the destination needs to be 4320 * the same as the connected one. 4321 */ 4322 if (icmp->icmp_state == TS_DATA_XFER && 4323 !conn_same_as_last_v4(connp, sin)) { 4324 error = EISCONN; 4325 goto ud_error2; 4326 } 4327 error = icmp_output_ancillary(connp, sin, NULL, 4328 data_mp, mp, NULL, cr, pid); 4329 } else { 4330 ip_xmit_attr_t *ixa; 4331 4332 /* 4333 * We have to allocate an ip_xmit_attr_t before we grab 4334 * conn_lock and we need to hold conn_lock once we've 4335 * checked conn_same_as_last_v4 to handle concurrent 4336 * send* calls on a socket. 4337 */ 4338 ixa = conn_get_ixa(connp, B_FALSE); 4339 if (ixa == NULL) { 4340 error = ENOMEM; 4341 goto ud_error2; 4342 } 4343 mutex_enter(&connp->conn_lock); 4344 4345 if (conn_same_as_last_v4(connp, sin) && 4346 ipsec_outbound_policy_current(ixa)) { 4347 /* icmp_output_lastdst drops conn_lock */ 4348 error = icmp_output_lastdst(connp, data_mp, cr, 4349 pid, ixa); 4350 } else { 4351 /* icmp_output_newdst drops conn_lock */ 4352 error = icmp_output_newdst(connp, data_mp, sin, 4353 NULL, cr, pid, ixa); 4354 } 4355 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4356 } 4357 if (error == 0) { 4358 freeb(mp); 4359 return (0); 4360 } 4361 break; 4362 } 4363 ASSERT(mp != NULL); 4364 /* mp is freed by the following routine */ 4365 icmp_ud_err(q, mp, (t_scalar_t)error); 4366 return (0); 4367 4368 ud_error2: 4369 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4370 freemsg(data_mp); 4371 ASSERT(mp != NULL); 4372 /* mp is freed by the following routine */ 4373 icmp_ud_err(q, mp, (t_scalar_t)error); 4374 return (0); 4375 } 4376 4377 /* 4378 * Handle the case of the IP address or flow label being different 4379 * for both IPv4 and IPv6. 4380 * 4381 * NOTE: The caller must hold conn_lock and we drop it here. 
4382 */ 4383 static int 4384 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, 4385 cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) 4386 { 4387 icmp_t *icmp = connp->conn_icmp; 4388 icmp_stack_t *is = icmp->icmp_is; 4389 int error; 4390 ip_xmit_attr_t *oldixa; 4391 boolean_t do_ipsec; 4392 uint_t srcid; 4393 uint32_t flowinfo; 4394 in6_addr_t v6src; 4395 in6_addr_t v6dst; 4396 in6_addr_t v6nexthop; 4397 in_port_t dstport; 4398 4399 ASSERT(MUTEX_HELD(&connp->conn_lock)); 4400 ASSERT(ixa != NULL); 4401 4402 /* 4403 * We hold conn_lock across all the use and modifications of 4404 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they 4405 * stay consistent. 4406 */ 4407 4408 ASSERT(cr != NULL); 4409 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4410 ixa->ixa_cred = cr; 4411 ixa->ixa_cpid = pid; 4412 if (is_system_labeled()) { 4413 /* We need to restart with a label based on the cred */ 4414 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 4415 } 4416 /* 4417 * If we are connected then the destination needs to be the 4418 * same as the connected one, which is not the case here since we 4419 * checked for that above. 4420 */ 4421 if (icmp->icmp_state == TS_DATA_XFER) { 4422 mutex_exit(&connp->conn_lock); 4423 error = EISCONN; 4424 goto ud_error; 4425 } 4426 4427 /* In case previous destination was multicast or multirt */ 4428 ip_attr_newdst(ixa); 4429 4430 /* 4431 * If laddr is unspecified then we look at sin6_src_id. 4432 * We will give precedence to a source address set with IPV6_PKTINFO 4433 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 4434 * want ip_attr_connect to select a source (since it can fail) when 4435 * IPV6_PKTINFO is specified. 4436 * If this doesn't result in a source address then we get a source 4437 * from ip_attr_connect() below. 
4438 */ 4439 v6src = connp->conn_saddr_v6; 4440 if (sin != NULL) { 4441 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 4442 dstport = sin->sin_port; 4443 flowinfo = 0; 4444 /* Don't bother with ip_srcid_find_id(), but indicate anyway. */ 4445 srcid = 0; 4446 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4447 ixa->ixa_flags |= IXAF_IS_IPV4; 4448 } else { 4449 boolean_t v4mapped; 4450 4451 v6dst = sin6->sin6_addr; 4452 dstport = sin6->sin6_port; 4453 flowinfo = sin6->sin6_flowinfo; 4454 srcid = sin6->__sin6_src_id; 4455 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 4456 ixa->ixa_scopeid = sin6->sin6_scope_id; 4457 ixa->ixa_flags |= IXAF_SCOPEID_SET; 4458 } else { 4459 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4460 } 4461 v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst); 4462 if (v4mapped) 4463 ixa->ixa_flags |= IXAF_IS_IPV4; 4464 else 4465 ixa->ixa_flags &= ~IXAF_IS_IPV4; 4466 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 4467 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4468 v4mapped, connp->conn_netstack)) { 4469 /* Mismatched v4mapped/v6 specified by srcid. */ 4470 mutex_exit(&connp->conn_lock); 4471 error = EADDRNOTAVAIL; 4472 goto ud_error; 4473 } 4474 } 4475 } 4476 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. 
*/ 4477 if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) { 4478 ip_pkt_t *ipp = &connp->conn_xmit_ipp; 4479 4480 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4481 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4482 v6src = ipp->ipp_addr; 4483 } else { 4484 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4485 v6src = ipp->ipp_addr; 4486 } 4487 } 4488 4489 /* Defer IPsec if it might need to look at ICMP type/code */ 4490 switch (ixa->ixa_protocol) { 4491 case IPPROTO_ICMP: 4492 case IPPROTO_ICMPV6: 4493 do_ipsec = B_FALSE; 4494 break; 4495 default: 4496 do_ipsec = B_TRUE; 4497 } 4498 4499 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); 4500 mutex_exit(&connp->conn_lock); 4501 4502 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 4503 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 4504 (do_ipsec ? IPDF_IPSEC : 0)); 4505 switch (error) { 4506 case 0: 4507 break; 4508 case EADDRNOTAVAIL: 4509 /* 4510 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4511 * Don't have the application see that errno 4512 */ 4513 error = ENETUNREACH; 4514 goto failed; 4515 case ENETDOWN: 4516 /* 4517 * Have !ipif_addr_ready address; drop packet silently 4518 * until we can get applications to not send until we 4519 * are ready. 4520 */ 4521 error = 0; 4522 goto failed; 4523 case EHOSTUNREACH: 4524 case ENETUNREACH: 4525 if (ixa->ixa_ire != NULL) { 4526 /* 4527 * Let conn_ip_output/ire_send_noroute return 4528 * the error and send any local ICMP error. 4529 */ 4530 error = 0; 4531 break; 4532 } 4533 /* FALLTHRU */ 4534 default: 4535 failed: 4536 goto ud_error; 4537 } 4538 4539 mutex_enter(&connp->conn_lock); 4540 /* 4541 * While we dropped the lock some other thread might have connected 4542 * this socket. If so we bail out with EISCONN to ensure that the 4543 * connecting thread is the one that updates conn_ixa, conn_ht_* 4544 * and conn_*last*. 
4545 */ 4546 if (icmp->icmp_state == TS_DATA_XFER) { 4547 mutex_exit(&connp->conn_lock); 4548 error = EISCONN; 4549 goto ud_error; 4550 } 4551 4552 /* 4553 * We need to rebuild the headers if 4554 * - we are labeling packets (could be different for different 4555 * destinations) 4556 * - we have a source route (or routing header) since we need to 4557 * massage that to get the pseudo-header checksum 4558 * - a socket option with COA_HEADER_CHANGED has been set which 4559 * set conn_v6lastdst to zero. 4560 * 4561 * Otherwise the prepend function will just update the src, dst, 4562 * and flow label. 4563 */ 4564 if (is_system_labeled()) { 4565 /* TX MLP requires SCM_UCRED and don't have that here */ 4566 if (connp->conn_mlp_type != mlptSingle) { 4567 mutex_exit(&connp->conn_lock); 4568 error = ECONNREFUSED; 4569 goto ud_error; 4570 } 4571 /* 4572 * Check whether Trusted Solaris policy allows communication 4573 * with this host, and pretend that the destination is 4574 * unreachable if not. 4575 * Compute any needed label and place it in ipp_label_v4/v6. 4576 * 4577 * Later conn_build_hdr_template/conn_prepend_hdr takes 4578 * ipp_label_v4/v6 to form the packet. 4579 * 4580 * Tsol note: Since we hold conn_lock we know no other 4581 * thread manipulates conn_xmit_ipp. 
4582 */ 4583 error = conn_update_label(connp, ixa, &v6dst, 4584 &connp->conn_xmit_ipp); 4585 if (error != 0) { 4586 mutex_exit(&connp->conn_lock); 4587 goto ud_error; 4588 } 4589 /* Rebuild the header template */ 4590 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4591 flowinfo); 4592 if (error != 0) { 4593 mutex_exit(&connp->conn_lock); 4594 goto ud_error; 4595 } 4596 } else if (connp->conn_xmit_ipp.ipp_fields & 4597 (IPPF_IPV4_OPTIONS|IPPF_RTHDR) || 4598 IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { 4599 /* Rebuild the header template */ 4600 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4601 flowinfo); 4602 if (error != 0) { 4603 mutex_exit(&connp->conn_lock); 4604 goto ud_error; 4605 } 4606 } else { 4607 /* Simply update the destination address if no source route */ 4608 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4609 ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; 4610 4611 IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); 4612 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 4613 ipha->ipha_fragment_offset_and_flags |= 4614 IPH_DF_HTONS; 4615 } else { 4616 ipha->ipha_fragment_offset_and_flags &= 4617 ~IPH_DF_HTONS; 4618 } 4619 } else { 4620 ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; 4621 ip6h->ip6_dst = v6dst; 4622 } 4623 } 4624 4625 /* 4626 * Remember the dst etc which corresponds to the built header 4627 * template and conn_ixa. 
4628 */ 4629 oldixa = conn_replace_ixa(connp, ixa); 4630 connp->conn_v6lastdst = v6dst; 4631 connp->conn_lastflowinfo = flowinfo; 4632 connp->conn_lastscopeid = ixa->ixa_scopeid; 4633 connp->conn_lastsrcid = srcid; 4634 /* Also remember a source to use together with lastdst */ 4635 connp->conn_v6lastsrc = v6src; 4636 4637 data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src, 4638 flowinfo, &error); 4639 4640 /* Done with conn_t */ 4641 mutex_exit(&connp->conn_lock); 4642 ixa_refrele(oldixa); 4643 4644 if (data_mp == NULL) { 4645 ASSERT(error != 0); 4646 goto ud_error; 4647 } 4648 4649 if (!do_ipsec) { 4650 /* Policy might differ for different ICMP type/code */ 4651 data_mp = icmp_output_attach_policy(data_mp, connp, ixa); 4652 if (data_mp == NULL) { 4653 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4654 error = EHOSTUNREACH; /* IPsec policy failure */ 4655 goto done; 4656 } 4657 } 4658 4659 /* We're done. Pass the packet to ip. */ 4660 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 4661 4662 error = conn_ip_output(data_mp, ixa); 4663 /* No rawipOutErrors if an error since IP increases its error counter */ 4664 switch (error) { 4665 case 0: 4666 break; 4667 case EWOULDBLOCK: 4668 (void) ixa_check_drain_insert(connp, ixa); 4669 error = 0; 4670 break; 4671 case EADDRNOTAVAIL: 4672 /* 4673 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4674 * Don't have the application see that errno 4675 */ 4676 error = ENETUNREACH; 4677 /* FALLTHRU */ 4678 default: 4679 mutex_enter(&connp->conn_lock); 4680 /* 4681 * Clear the source and v6lastdst so we call ip_attr_connect 4682 * for the next packet and try to pick a better source. 
4683 */ 4684 if (connp->conn_mcbc_bind) 4685 connp->conn_saddr_v6 = ipv6_all_zeros; 4686 else 4687 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 4688 connp->conn_v6lastdst = ipv6_all_zeros; 4689 mutex_exit(&connp->conn_lock); 4690 break; 4691 } 4692 done: 4693 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4694 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4695 ixa->ixa_cpid = connp->conn_cpid; 4696 ixa_refrele(ixa); 4697 return (error); 4698 4699 ud_error: 4700 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4701 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4702 ixa->ixa_cpid = connp->conn_cpid; 4703 ixa_refrele(ixa); 4704 4705 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4706 freemsg(data_mp); 4707 return (error); 4708 } 4709 4710 /* ARGSUSED */ 4711 static int 4712 icmp_wput_fallback(queue_t *q, mblk_t *mp) 4713 { 4714 #ifdef DEBUG 4715 cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); 4716 #endif 4717 freemsg(mp); 4718 return (0); 4719 } 4720 4721 static void 4722 icmp_wput_other(queue_t *q, mblk_t *mp) 4723 { 4724 uchar_t *rptr = mp->b_rptr; 4725 struct iocblk *iocp; 4726 conn_t *connp = Q_TO_CONN(q); 4727 icmp_t *icmp = connp->conn_icmp; 4728 cred_t *cr; 4729 4730 switch (mp->b_datap->db_type) { 4731 case M_PROTO: 4732 case M_PCPROTO: 4733 if (mp->b_wptr - rptr < sizeof (t_scalar_t)) { 4734 /* 4735 * If the message does not contain a PRIM_type, 4736 * throw it away. 4737 */ 4738 freemsg(mp); 4739 return; 4740 } 4741 switch (((t_primp_t)rptr)->type) { 4742 case T_ADDR_REQ: 4743 icmp_addr_req(q, mp); 4744 return; 4745 case O_T_BIND_REQ: 4746 case T_BIND_REQ: 4747 icmp_tpi_bind(q, mp); 4748 return; 4749 case T_CONN_REQ: 4750 icmp_tpi_connect(q, mp); 4751 return; 4752 case T_CAPABILITY_REQ: 4753 icmp_capability_req(q, mp); 4754 return; 4755 case T_INFO_REQ: 4756 icmp_info_req(q, mp); 4757 return; 4758 case T_UNITDATA_REQ: 4759 /* 4760 * If a T_UNITDATA_REQ gets here, the address must 4761 * be bad. 
Valid T_UNITDATA_REQs are handled 4762 * in icmp_wput. 4763 */ 4764 icmp_ud_err(q, mp, EADDRNOTAVAIL); 4765 return; 4766 case T_UNBIND_REQ: 4767 icmp_tpi_unbind(q, mp); 4768 return; 4769 case T_SVR4_OPTMGMT_REQ: 4770 /* 4771 * All Solaris components should pass a db_credp 4772 * for this TPI message, hence we ASSERT. 4773 * But in case there is some other M_PROTO that looks 4774 * like a TPI message sent by some other kernel 4775 * component, we check and return an error. 4776 */ 4777 cr = msg_getcred(mp, NULL); 4778 ASSERT(cr != NULL); 4779 if (cr == NULL) { 4780 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4781 return; 4782 } 4783 4784 if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get, 4785 cr)) { 4786 svr4_optcom_req(q, mp, cr, &icmp_opt_obj); 4787 } 4788 return; 4789 4790 case T_OPTMGMT_REQ: 4791 /* 4792 * All Solaris components should pass a db_credp 4793 * for this TPI message, hence we ASSERT. 4794 * But in case there is some other M_PROTO that looks 4795 * like a TPI message sent by some other kernel 4796 * component, we check and return an error. 4797 */ 4798 cr = msg_getcred(mp, NULL); 4799 ASSERT(cr != NULL); 4800 if (cr == NULL) { 4801 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4802 return; 4803 } 4804 tpi_optcom_req(q, mp, cr, &icmp_opt_obj); 4805 return; 4806 4807 case T_DISCON_REQ: 4808 icmp_tpi_disconnect(q, mp); 4809 return; 4810 4811 /* The following TPI message is not supported by icmp. */ 4812 case O_T_CONN_RES: 4813 case T_CONN_RES: 4814 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4815 return; 4816 4817 /* The following 3 TPI requests are illegal for icmp. 
*/ 4818 case T_DATA_REQ: 4819 case T_EXDATA_REQ: 4820 case T_ORDREL_REQ: 4821 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4822 return; 4823 default: 4824 break; 4825 } 4826 break; 4827 case M_FLUSH: 4828 if (*rptr & FLUSHW) 4829 flushq(q, FLUSHDATA); 4830 break; 4831 case M_IOCTL: 4832 iocp = (struct iocblk *)mp->b_rptr; 4833 switch (iocp->ioc_cmd) { 4834 case TI_GETPEERNAME: 4835 if (icmp->icmp_state != TS_DATA_XFER) { 4836 /* 4837 * If a default destination address has not 4838 * been associated with the stream, then we 4839 * don't know the peer's name. 4840 */ 4841 iocp->ioc_error = ENOTCONN; 4842 iocp->ioc_count = 0; 4843 mp->b_datap->db_type = M_IOCACK; 4844 qreply(q, mp); 4845 return; 4846 } 4847 /* FALLTHRU */ 4848 case TI_GETMYNAME: 4849 /* 4850 * For TI_GETPEERNAME and TI_GETMYNAME, we first 4851 * need to copyin the user's strbuf structure. 4852 * Processing will continue in the M_IOCDATA case 4853 * below. 4854 */ 4855 mi_copyin(q, mp, NULL, 4856 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 4857 return; 4858 default: 4859 break; 4860 } 4861 break; 4862 case M_IOCDATA: 4863 icmp_wput_iocdata(q, mp); 4864 return; 4865 default: 4866 /* Unrecognized messages are passed through without change. */ 4867 break; 4868 } 4869 ip_wput_nondata(q, mp); 4870 } 4871 4872 /* 4873 * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA 4874 * messages. 4875 */ 4876 static void 4877 icmp_wput_iocdata(queue_t *q, mblk_t *mp) 4878 { 4879 mblk_t *mp1; 4880 STRUCT_HANDLE(strbuf, sb); 4881 uint_t addrlen; 4882 conn_t *connp = Q_TO_CONN(q); 4883 icmp_t *icmp = connp->conn_icmp; 4884 4885 /* Make sure it is one of ours. 
*/ 4886 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4887 case TI_GETMYNAME: 4888 case TI_GETPEERNAME: 4889 break; 4890 default: 4891 ip_wput_nondata(q, mp); 4892 return; 4893 } 4894 4895 switch (mi_copy_state(q, mp, &mp1)) { 4896 case -1: 4897 return; 4898 case MI_COPY_CASE(MI_COPY_IN, 1): 4899 break; 4900 case MI_COPY_CASE(MI_COPY_OUT, 1): 4901 /* 4902 * The address has been copied out, so now 4903 * copyout the strbuf. 4904 */ 4905 mi_copyout(q, mp); 4906 return; 4907 case MI_COPY_CASE(MI_COPY_OUT, 2): 4908 /* 4909 * The address and strbuf have been copied out. 4910 * We're done, so just acknowledge the original 4911 * M_IOCTL. 4912 */ 4913 mi_copy_done(q, mp, 0); 4914 return; 4915 default: 4916 /* 4917 * Something strange has happened, so acknowledge 4918 * the original M_IOCTL with an EPROTO error. 4919 */ 4920 mi_copy_done(q, mp, EPROTO); 4921 return; 4922 } 4923 4924 /* 4925 * Now we have the strbuf structure for TI_GETMYNAME 4926 * and TI_GETPEERNAME. Next we copyout the requested 4927 * address and then we'll copyout the strbuf. 
4928 */ 4929 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 4930 (void *)mp1->b_rptr); 4931 4932 if (connp->conn_family == AF_INET) 4933 addrlen = sizeof (sin_t); 4934 else 4935 addrlen = sizeof (sin6_t); 4936 4937 if (STRUCT_FGET(sb, maxlen) < addrlen) { 4938 mi_copy_done(q, mp, EINVAL); 4939 return; 4940 } 4941 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4942 case TI_GETMYNAME: 4943 break; 4944 case TI_GETPEERNAME: 4945 if (icmp->icmp_state != TS_DATA_XFER) { 4946 mi_copy_done(q, mp, ENOTCONN); 4947 return; 4948 } 4949 break; 4950 default: 4951 mi_copy_done(q, mp, EPROTO); 4952 return; 4953 } 4954 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 4955 if (!mp1) 4956 return; 4957 4958 STRUCT_FSET(sb, len, addrlen); 4959 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4960 case TI_GETMYNAME: 4961 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, 4962 &addrlen); 4963 break; 4964 case TI_GETPEERNAME: 4965 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, 4966 &addrlen); 4967 break; 4968 } 4969 mp1->b_wptr += addrlen; 4970 /* Copy out the address */ 4971 mi_copyout(q, mp); 4972 } 4973 4974 void 4975 icmp_ddi_g_init(void) 4976 { 4977 icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr, 4978 icmp_opt_obj.odb_opt_arr_cnt); 4979 4980 /* 4981 * We want to be informed each time a stack is created or 4982 * destroyed in the kernel, so we can maintain the 4983 * set of icmp_stack_t's. 4984 */ 4985 netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini); 4986 } 4987 4988 void 4989 icmp_ddi_g_destroy(void) 4990 { 4991 netstack_unregister(NS_ICMP); 4992 } 4993 4994 #define INET_NAME "ip" 4995 4996 /* 4997 * Initialize the ICMP stack instance. 
4998 */ 4999 static void * 5000 rawip_stack_init(netstackid_t stackid, netstack_t *ns) 5001 { 5002 icmp_stack_t *is; 5003 int error = 0; 5004 size_t arrsz; 5005 major_t major; 5006 5007 is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP); 5008 is->is_netstack = ns; 5009 5010 arrsz = sizeof (icmp_propinfo_tbl); 5011 is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP); 5012 bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz); 5013 5014 is->is_ksp = rawip_kstat_init(stackid); 5015 5016 major = mod_name_to_major(INET_NAME); 5017 error = ldi_ident_from_major(major, &is->is_ldi_ident); 5018 ASSERT(error == 0); 5019 return (is); 5020 } 5021 5022 /* 5023 * Free the ICMP stack instance. 5024 */ 5025 static void 5026 rawip_stack_fini(netstackid_t stackid, void *arg) 5027 { 5028 icmp_stack_t *is = (icmp_stack_t *)arg; 5029 5030 kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl)); 5031 is->is_propinfo_tbl = NULL; 5032 5033 rawip_kstat_fini(stackid, is->is_ksp); 5034 is->is_ksp = NULL; 5035 ldi_ident_release(is->is_ldi_ident); 5036 kmem_free(is, sizeof (*is)); 5037 } 5038 5039 static void * 5040 rawip_kstat_init(netstackid_t stackid) 5041 { 5042 kstat_t *ksp; 5043 5044 rawip_named_kstat_t template = { 5045 { "inDatagrams", KSTAT_DATA_UINT32, 0 }, 5046 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 5047 { "inErrors", KSTAT_DATA_UINT32, 0 }, 5048 { "outDatagrams", KSTAT_DATA_UINT32, 0 }, 5049 { "outErrors", KSTAT_DATA_UINT32, 0 }, 5050 }; 5051 5052 ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2", 5053 KSTAT_TYPE_NAMED, NUM_OF_FIELDS(rawip_named_kstat_t), 0, stackid); 5054 if (ksp == NULL || ksp->ks_data == NULL) 5055 return (NULL); 5056 5057 bcopy(&template, ksp->ks_data, sizeof (template)); 5058 ksp->ks_update = rawip_kstat_update; 5059 ksp->ks_private = (void *)(uintptr_t)stackid; 5060 5061 kstat_install(ksp); 5062 return (ksp); 5063 } 5064 5065 static void 5066 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp) 5067 { 5068 if (ksp != NULL) { 
5069 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 5070 kstat_delete_netstack(ksp, stackid); 5071 } 5072 } 5073 5074 static int 5075 rawip_kstat_update(kstat_t *ksp, int rw) 5076 { 5077 rawip_named_kstat_t *rawipkp; 5078 netstackid_t stackid = (netstackid_t)(uintptr_t)ksp->ks_private; 5079 netstack_t *ns; 5080 icmp_stack_t *is; 5081 5082 if (ksp->ks_data == NULL) 5083 return (EIO); 5084 5085 if (rw == KSTAT_WRITE) 5086 return (EACCES); 5087 5088 rawipkp = (rawip_named_kstat_t *)ksp->ks_data; 5089 5090 ns = netstack_find_by_stackid(stackid); 5091 if (ns == NULL) 5092 return (-1); 5093 is = ns->netstack_icmp; 5094 if (is == NULL) { 5095 netstack_rele(ns); 5096 return (-1); 5097 } 5098 rawipkp->inDatagrams.value.ui32 = is->is_rawip_mib.rawipInDatagrams; 5099 rawipkp->inCksumErrs.value.ui32 = is->is_rawip_mib.rawipInCksumErrs; 5100 rawipkp->inErrors.value.ui32 = is->is_rawip_mib.rawipInErrors; 5101 rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams; 5102 rawipkp->outErrors.value.ui32 = is->is_rawip_mib.rawipOutErrors; 5103 netstack_rele(ns); 5104 return (0); 5105 } 5106 5107 /* ARGSUSED */ 5108 int 5109 rawip_accept(sock_lower_handle_t lproto_handle, 5110 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 5111 cred_t *cr) 5112 { 5113 return (EOPNOTSUPP); 5114 } 5115 5116 /* ARGSUSED */ 5117 int 5118 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5119 socklen_t len, cred_t *cr) 5120 { 5121 conn_t *connp = (conn_t *)proto_handle; 5122 int error; 5123 5124 /* All Solaris components should pass a cred for this operation. 
*/ 5125 ASSERT(cr != NULL); 5126 5127 /* Binding to a NULL address really means unbind */ 5128 if (sa == NULL) 5129 error = rawip_do_unbind(connp); 5130 else 5131 error = rawip_do_bind(connp, sa, len); 5132 5133 if (error < 0) { 5134 if (error == -TOUTSTATE) 5135 error = EINVAL; 5136 else 5137 error = proto_tlitosyserr(-error); 5138 } 5139 return (error); 5140 } 5141 5142 static int 5143 rawip_implicit_bind(conn_t *connp) 5144 { 5145 sin6_t sin6addr; 5146 sin_t *sin; 5147 sin6_t *sin6; 5148 socklen_t len; 5149 int error; 5150 5151 if (connp->conn_family == AF_INET) { 5152 len = sizeof (struct sockaddr_in); 5153 sin = (sin_t *)&sin6addr; 5154 *sin = sin_null; 5155 sin->sin_family = AF_INET; 5156 sin->sin_addr.s_addr = INADDR_ANY; 5157 } else { 5158 ASSERT(connp->conn_family == AF_INET6); 5159 len = sizeof (sin6_t); 5160 sin6 = (sin6_t *)&sin6addr; 5161 *sin6 = sin6_null; 5162 sin6->sin6_family = AF_INET6; 5163 V6_SET_ZERO(sin6->sin6_addr); 5164 } 5165 5166 error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len); 5167 5168 return ((error < 0) ? proto_tlitosyserr(-error) : error); 5169 } 5170 5171 static int 5172 rawip_unbind(conn_t *connp) 5173 { 5174 int error; 5175 5176 error = rawip_do_unbind(connp); 5177 if (error < 0) { 5178 error = proto_tlitosyserr(-error); 5179 } 5180 return (error); 5181 } 5182 5183 /* ARGSUSED */ 5184 int 5185 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 5186 { 5187 return (EOPNOTSUPP); 5188 } 5189 5190 int 5191 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 5192 socklen_t len, sock_connid_t *id, cred_t *cr) 5193 { 5194 conn_t *connp = (conn_t *)proto_handle; 5195 icmp_t *icmp = connp->conn_icmp; 5196 int error; 5197 boolean_t did_bind = B_FALSE; 5198 pid_t pid = curproc->p_pid; 5199 5200 /* All Solaris components should pass a cred for this operation. 
*/ 5201 ASSERT(cr != NULL); 5202 5203 if (sa == NULL) { 5204 /* 5205 * Disconnect 5206 * Make sure we are connected 5207 */ 5208 if (icmp->icmp_state != TS_DATA_XFER) 5209 return (EINVAL); 5210 5211 error = icmp_disconnect(connp); 5212 return (error); 5213 } 5214 5215 error = proto_verify_ip_addr(connp->conn_family, sa, len); 5216 if (error != 0) 5217 return (error); 5218 5219 /* do an implicit bind if necessary */ 5220 if (icmp->icmp_state == TS_UNBND) { 5221 error = rawip_implicit_bind(connp); 5222 /* 5223 * We could be racing with an actual bind, in which case 5224 * we would see EPROTO. We cross our fingers and try 5225 * to connect. 5226 */ 5227 if (!(error == 0 || error == EPROTO)) 5228 return (error); 5229 did_bind = B_TRUE; 5230 } 5231 5232 /* 5233 * set SO_DGRAM_ERRIND 5234 */ 5235 connp->conn_dgram_errind = B_TRUE; 5236 5237 error = rawip_do_connect(connp, sa, len, cr, pid); 5238 if (error != 0 && did_bind) { 5239 int unbind_err; 5240 5241 unbind_err = rawip_unbind(connp); 5242 ASSERT(unbind_err == 0); 5243 } 5244 5245 if (error == 0) { 5246 *id = 0; 5247 (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle, 5248 0, NULL, -1); 5249 } else if (error < 0) { 5250 error = proto_tlitosyserr(-error); 5251 } 5252 return (error); 5253 } 5254 5255 /* ARGSUSED2 */ 5256 int 5257 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, 5258 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, 5259 sock_quiesce_arg_t *arg) 5260 { 5261 conn_t *connp = (conn_t *)proto_handle; 5262 icmp_t *icmp; 5263 struct T_capability_ack tca; 5264 struct sockaddr_in6 laddr, faddr; 5265 socklen_t laddrlen, faddrlen; 5266 short opts; 5267 struct stroptions *stropt; 5268 mblk_t *mp, *stropt_mp; 5269 int error; 5270 5271 icmp = connp->conn_icmp; 5272 5273 stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL); 5274 5275 /* 5276 * setup the fallback stream that was allocated 5277 */ 5278 connp->conn_dev = (dev_t)RD(q)->q_ptr; 5279 connp->conn_minor_arena = 
WR(q)->q_ptr; 5280 5281 RD(q)->q_ptr = WR(q)->q_ptr = connp; 5282 5283 WR(q)->q_qinfo = &icmpwinit; 5284 5285 connp->conn_rq = RD(q); 5286 connp->conn_wq = WR(q); 5287 5288 /* Notify stream head about options before sending up data */ 5289 stropt_mp->b_datap->db_type = M_SETOPTS; 5290 stropt_mp->b_wptr += sizeof (*stropt); 5291 stropt = (struct stroptions *)stropt_mp->b_rptr; 5292 stropt->so_flags = SO_WROFF | SO_HIWAT; 5293 stropt->so_wroff = connp->conn_wroff; 5294 stropt->so_hiwat = connp->conn_rcvbuf; 5295 putnext(RD(q), stropt_mp); 5296 5297 /* 5298 * free helper stream 5299 */ 5300 ip_free_helper_stream(connp); 5301 5302 /* 5303 * Collect the information needed to sync with the sonode 5304 */ 5305 icmp_do_capability_ack(icmp, &tca, TC1_INFO); 5306 5307 laddrlen = faddrlen = sizeof (sin6_t); 5308 (void) rawip_getsockname((sock_lower_handle_t)connp, 5309 (struct sockaddr *)&laddr, &laddrlen, CRED()); 5310 error = rawip_getpeername((sock_lower_handle_t)connp, 5311 (struct sockaddr *)&faddr, &faddrlen, CRED()); 5312 if (error != 0) 5313 faddrlen = 0; 5314 opts = 0; 5315 if (connp->conn_dgram_errind) 5316 opts |= SO_DGRAM_ERRIND; 5317 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 5318 opts |= SO_DONTROUTE; 5319 5320 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, 5321 (struct sockaddr *)&laddr, laddrlen, 5322 (struct sockaddr *)&faddr, faddrlen, opts); 5323 5324 /* 5325 * Attempts to send data up during fallback will result in it being 5326 * queued in icmp_t. Now we push up any queued packets. 
5327 */ 5328 mutex_enter(&icmp->icmp_recv_lock); 5329 if (mp != NULL) { 5330 mp->b_next = icmp->icmp_fallback_queue_head; 5331 icmp->icmp_fallback_queue_head = mp; 5332 } 5333 while (icmp->icmp_fallback_queue_head != NULL) { 5334 mp = icmp->icmp_fallback_queue_head; 5335 icmp->icmp_fallback_queue_head = mp->b_next; 5336 mp->b_next = NULL; 5337 mutex_exit(&icmp->icmp_recv_lock); 5338 putnext(RD(q), mp); 5339 mutex_enter(&icmp->icmp_recv_lock); 5340 } 5341 icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head; 5342 5343 /* 5344 * No longer a streams less socket 5345 */ 5346 mutex_enter(&connp->conn_lock); 5347 connp->conn_flags &= ~IPCL_NONSTR; 5348 mutex_exit(&connp->conn_lock); 5349 5350 mutex_exit(&icmp->icmp_recv_lock); 5351 5352 ASSERT(icmp->icmp_fallback_queue_head == NULL && 5353 icmp->icmp_fallback_queue_tail == NULL); 5354 5355 ASSERT(connp->conn_ref >= 1); 5356 5357 return (0); 5358 } 5359 5360 /* ARGSUSED2 */ 5361 sock_lower_handle_t 5362 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 5363 uint_t *smodep, int *errorp, int flags, cred_t *credp) 5364 { 5365 conn_t *connp; 5366 5367 if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) { 5368 *errorp = EPROTONOSUPPORT; 5369 return (NULL); 5370 } 5371 5372 connp = rawip_do_open(family, credp, errorp, flags); 5373 if (connp != NULL) { 5374 connp->conn_flags |= IPCL_NONSTR; 5375 5376 mutex_enter(&connp->conn_lock); 5377 connp->conn_state_flags &= ~CONN_INCIPIENT; 5378 mutex_exit(&connp->conn_lock); 5379 *sock_downcalls = &sock_rawip_downcalls; 5380 *smodep = SM_ATOMIC; 5381 } else { 5382 ASSERT(*errorp != 0); 5383 } 5384 5385 return ((sock_lower_handle_t)connp); 5386 } 5387 5388 /* ARGSUSED3 */ 5389 void 5390 rawip_activate(sock_lower_handle_t proto_handle, 5391 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, 5392 cred_t *cr) 5393 { 5394 conn_t *connp = (conn_t *)proto_handle; 5395 struct sock_proto_props sopp; 5396 5397 /* All 
Solaris components should pass a cred for this operation. */ 5398 ASSERT(cr != NULL); 5399 5400 connp->conn_upcalls = sock_upcalls; 5401 connp->conn_upper_handle = sock_handle; 5402 5403 sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 5404 SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; 5405 sopp.sopp_wroff = connp->conn_wroff; 5406 sopp.sopp_rxhiwat = connp->conn_rcvbuf; 5407 sopp.sopp_rxlowat = connp->conn_rcvlowat; 5408 sopp.sopp_maxblk = INFPSZ; 5409 sopp.sopp_maxpsz = IP_MAXPACKET; 5410 sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 : 5411 icmp_mod_info.mi_minpsz; 5412 5413 (*connp->conn_upcalls->su_set_proto_props) 5414 (connp->conn_upper_handle, &sopp); 5415 5416 icmp_bind_proto(connp->conn_icmp); 5417 } 5418 5419 /* ARGSUSED3 */ 5420 int 5421 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5422 socklen_t *salenp, cred_t *cr) 5423 { 5424 conn_t *connp = (conn_t *)proto_handle; 5425 icmp_t *icmp = connp->conn_icmp; 5426 int error; 5427 5428 /* All Solaris components should pass a cred for this operation. */ 5429 ASSERT(cr != NULL); 5430 5431 mutex_enter(&connp->conn_lock); 5432 if (icmp->icmp_state != TS_DATA_XFER) 5433 error = ENOTCONN; 5434 else 5435 error = conn_getpeername(connp, sa, salenp); 5436 mutex_exit(&connp->conn_lock); 5437 return (error); 5438 } 5439 5440 /* ARGSUSED3 */ 5441 int 5442 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5443 socklen_t *salenp, cred_t *cr) 5444 { 5445 conn_t *connp = (conn_t *)proto_handle; 5446 int error; 5447 5448 /* All Solaris components should pass a cred for this operation. 
*/ 5449 ASSERT(cr != NULL); 5450 5451 mutex_enter(&connp->conn_lock); 5452 error = conn_getsockname(connp, sa, salenp); 5453 mutex_exit(&connp->conn_lock); 5454 return (error); 5455 } 5456 5457 int 5458 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5459 const void *optvalp, socklen_t optlen, cred_t *cr) 5460 { 5461 conn_t *connp = (conn_t *)proto_handle; 5462 int error; 5463 5464 /* All Solaris components should pass a cred for this operation. */ 5465 ASSERT(cr != NULL); 5466 5467 error = proto_opt_check(level, option_name, optlen, NULL, 5468 icmp_opt_obj.odb_opt_des_arr, 5469 icmp_opt_obj.odb_opt_arr_cnt, 5470 B_TRUE, B_FALSE, cr); 5471 5472 if (error != 0) { 5473 /* 5474 * option not recognized 5475 */ 5476 if (error < 0) { 5477 error = proto_tlitosyserr(-error); 5478 } 5479 return (error); 5480 } 5481 5482 error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, 5483 option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, 5484 (uchar_t *)optvalp, NULL, cr); 5485 5486 ASSERT(error >= 0); 5487 5488 return (error); 5489 } 5490 5491 int 5492 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5493 void *optvalp, socklen_t *optlen, cred_t *cr) 5494 { 5495 int error; 5496 conn_t *connp = (conn_t *)proto_handle; 5497 t_uscalar_t max_optbuf_len; 5498 void *optvalp_buf; 5499 int len; 5500 5501 /* All Solaris components should pass a cred for this operation. 
*/ 5502 ASSERT(cr != NULL); 5503 5504 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 5505 icmp_opt_obj.odb_opt_des_arr, 5506 icmp_opt_obj.odb_opt_arr_cnt, 5507 B_FALSE, B_TRUE, cr); 5508 5509 if (error != 0) { 5510 if (error < 0) { 5511 error = proto_tlitosyserr(-error); 5512 } 5513 return (error); 5514 } 5515 5516 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 5517 len = icmp_opt_get(connp, level, option_name, optvalp_buf); 5518 if (len == -1) { 5519 kmem_free(optvalp_buf, max_optbuf_len); 5520 return (EINVAL); 5521 } 5522 5523 /* 5524 * update optlen and copy option value 5525 */ 5526 t_uscalar_t size = MIN(len, *optlen); 5527 5528 bcopy(optvalp_buf, optvalp, size); 5529 bcopy(&size, optlen, sizeof (size)); 5530 5531 kmem_free(optvalp_buf, max_optbuf_len); 5532 return (0); 5533 } 5534 5535 /* ARGSUSED1 */ 5536 int 5537 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 5538 { 5539 conn_t *connp = (conn_t *)proto_handle; 5540 5541 /* All Solaris components should pass a cred for this operation. */ 5542 ASSERT(cr != NULL); 5543 5544 (void) rawip_do_close(connp); 5545 return (0); 5546 } 5547 5548 /* ARGSUSED2 */ 5549 int 5550 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 5551 { 5552 conn_t *connp = (conn_t *)proto_handle; 5553 5554 /* All Solaris components should pass a cred for this operation. 
*/ 5555 ASSERT(cr != NULL); 5556 5557 /* shut down the send side */ 5558 if (how != SHUT_RD) 5559 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5560 SOCK_OPCTL_SHUT_SEND, 0); 5561 /* shut down the recv side */ 5562 if (how != SHUT_WR) 5563 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5564 SOCK_OPCTL_SHUT_RECV, 0); 5565 return (0); 5566 } 5567 5568 void 5569 rawip_clr_flowctrl(sock_lower_handle_t proto_handle) 5570 { 5571 conn_t *connp = (conn_t *)proto_handle; 5572 icmp_t *icmp = connp->conn_icmp; 5573 5574 mutex_enter(&icmp->icmp_recv_lock); 5575 connp->conn_flow_cntrld = B_FALSE; 5576 mutex_exit(&icmp->icmp_recv_lock); 5577 } 5578 5579 int 5580 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 5581 int mode, int32_t *rvalp, cred_t *cr) 5582 { 5583 conn_t *connp = (conn_t *)proto_handle; 5584 int error; 5585 5586 /* All Solaris components should pass a cred for this operation. */ 5587 ASSERT(cr != NULL); 5588 5589 /* 5590 * If we don't have a helper stream then create one. 5591 * ip_create_helper_stream takes care of locking the conn_t, 5592 * so this check for NULL is just a performance optimization. 5593 */ 5594 if (connp->conn_helper_info == NULL) { 5595 icmp_stack_t *is = connp->conn_icmp->icmp_is; 5596 5597 ASSERT(is->is_ldi_ident != NULL); 5598 5599 /* 5600 * Create a helper stream for non-STREAMS socket. 
5601 */ 5602 error = ip_create_helper_stream(connp, is->is_ldi_ident); 5603 if (error != 0) { 5604 ip0dbg(("rawip_ioctl: create of IP helper stream " 5605 "failed %d\n", error)); 5606 return (error); 5607 } 5608 } 5609 5610 switch (cmd) { 5611 case _SIOCSOCKFALLBACK: 5612 case TI_GETPEERNAME: 5613 case TI_GETMYNAME: 5614 #ifdef DEBUG 5615 cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams" 5616 " socket", cmd); 5617 #endif 5618 error = EINVAL; 5619 break; 5620 default: 5621 /* 5622 * Pass on to IP using helper stream 5623 */ 5624 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 5625 cmd, arg, mode, cr, rvalp); 5626 break; 5627 } 5628 return (error); 5629 } 5630 5631 int 5632 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 5633 cred_t *cr) 5634 { 5635 sin6_t *sin6; 5636 sin_t *sin = NULL; 5637 uint_t srcid; 5638 conn_t *connp = (conn_t *)proto_handle; 5639 icmp_t *icmp = connp->conn_icmp; 5640 int error = 0; 5641 icmp_stack_t *is = icmp->icmp_is; 5642 pid_t pid = curproc->p_pid; 5643 ip_xmit_attr_t *ixa; 5644 5645 ASSERT(DB_TYPE(mp) == M_DATA); 5646 5647 /* All Solaris components should pass a cred for this operation. */ 5648 ASSERT(cr != NULL); 5649 5650 /* do an implicit bind if necessary */ 5651 if (icmp->icmp_state == TS_UNBND) { 5652 error = rawip_implicit_bind(connp); 5653 /* 5654 * We could be racing with an actual bind, in which case 5655 * we would see EPROTO. We cross our fingers and try 5656 * to connect. 
5657 */ 5658 if (!(error == 0 || error == EPROTO)) { 5659 freemsg(mp); 5660 return (error); 5661 } 5662 } 5663 5664 /* Protocol 255 contains full IP headers */ 5665 /* Read without holding lock */ 5666 if (icmp->icmp_hdrincl) { 5667 ASSERT(connp->conn_ipversion == IPV4_VERSION); 5668 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) { 5669 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 5670 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5671 freemsg(mp); 5672 return (EINVAL); 5673 } 5674 } 5675 error = icmp_output_hdrincl(connp, mp, cr, pid); 5676 if (is->is_sendto_ignerr) 5677 return (0); 5678 else 5679 return (error); 5680 } 5681 5682 /* Connected? */ 5683 if (msg->msg_name == NULL) { 5684 if (icmp->icmp_state != TS_DATA_XFER) { 5685 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5686 return (EDESTADDRREQ); 5687 } 5688 if (msg->msg_controllen != 0) { 5689 error = icmp_output_ancillary(connp, NULL, NULL, mp, 5690 NULL, msg, cr, pid); 5691 } else { 5692 error = icmp_output_connected(connp, mp, cr, pid); 5693 } 5694 if (is->is_sendto_ignerr) 5695 return (0); 5696 else 5697 return (error); 5698 } 5699 if (icmp->icmp_state == TS_DATA_XFER) { 5700 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5701 return (EISCONN); 5702 } 5703 error = proto_verify_ip_addr(connp->conn_family, 5704 (struct sockaddr *)msg->msg_name, msg->msg_namelen); 5705 if (error != 0) { 5706 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5707 return (error); 5708 } 5709 switch (connp->conn_family) { 5710 case AF_INET6: 5711 sin6 = (sin6_t *)msg->msg_name; 5712 5713 /* No support for mapped addresses on raw sockets */ 5714 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 5715 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5716 return (EADDRNOTAVAIL); 5717 } 5718 srcid = sin6->__sin6_src_id; 5719 5720 /* 5721 * If the local address is a mapped address return 5722 * an error. 
5723 * It would be possible to send an IPv6 packet but the 5724 * response would never make it back to the application 5725 * since it is bound to a mapped address. 5726 */ 5727 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 5728 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5729 return (EADDRNOTAVAIL); 5730 } 5731 5732 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 5733 sin6->sin6_addr = ipv6_loopback; 5734 5735 /* 5736 * We have to allocate an ip_xmit_attr_t before we grab 5737 * conn_lock and we need to hold conn_lock once we've check 5738 * conn_same_as_last_v6 to handle concurrent send* calls on a 5739 * socket. 5740 */ 5741 if (msg->msg_controllen == 0) { 5742 ixa = conn_get_ixa(connp, B_FALSE); 5743 if (ixa == NULL) { 5744 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5745 return (ENOMEM); 5746 } 5747 } else { 5748 ixa = NULL; 5749 } 5750 mutex_enter(&connp->conn_lock); 5751 if (icmp->icmp_delayed_error != 0) { 5752 sin6_t *sin2 = (sin6_t *)&icmp->icmp_delayed_addr; 5753 5754 error = icmp->icmp_delayed_error; 5755 icmp->icmp_delayed_error = 0; 5756 5757 /* Compare IP address and family */ 5758 5759 if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, 5760 &sin2->sin6_addr) && 5761 sin6->sin6_family == sin2->sin6_family) { 5762 mutex_exit(&connp->conn_lock); 5763 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5764 if (ixa != NULL) 5765 ixa_refrele(ixa); 5766 return (error); 5767 } 5768 } 5769 if (msg->msg_controllen != 0) { 5770 mutex_exit(&connp->conn_lock); 5771 ASSERT(ixa == NULL); 5772 error = icmp_output_ancillary(connp, NULL, sin6, mp, 5773 NULL, msg, cr, pid); 5774 } else if (conn_same_as_last_v6(connp, sin6) && 5775 connp->conn_lastsrcid == srcid && 5776 ipsec_outbound_policy_current(ixa)) { 5777 /* icmp_output_lastdst drops conn_lock */ 5778 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5779 } else { 5780 /* icmp_output_newdst drops conn_lock */ 5781 error = icmp_output_newdst(connp, mp, NULL, sin6, cr, 5782 pid, ixa); 5783 } 5784 
ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5785 if (is->is_sendto_ignerr) 5786 return (0); 5787 else 5788 return (error); 5789 case AF_INET: 5790 sin = (sin_t *)msg->msg_name; 5791 5792 if (sin->sin_addr.s_addr == INADDR_ANY) 5793 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 5794 5795 /* 5796 * We have to allocate an ip_xmit_attr_t before we grab 5797 * conn_lock and we need to hold conn_lock once we've check 5798 * conn_same_as_last_v6 to handle concurrent send* on a socket. 5799 */ 5800 if (msg->msg_controllen == 0) { 5801 ixa = conn_get_ixa(connp, B_FALSE); 5802 if (ixa == NULL) { 5803 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5804 return (ENOMEM); 5805 } 5806 } else { 5807 ixa = NULL; 5808 } 5809 mutex_enter(&connp->conn_lock); 5810 if (icmp->icmp_delayed_error != 0) { 5811 sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; 5812 5813 error = icmp->icmp_delayed_error; 5814 icmp->icmp_delayed_error = 0; 5815 5816 /* Compare IP address */ 5817 5818 if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { 5819 mutex_exit(&connp->conn_lock); 5820 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5821 if (ixa != NULL) 5822 ixa_refrele(ixa); 5823 return (error); 5824 } 5825 } 5826 5827 if (msg->msg_controllen != 0) { 5828 mutex_exit(&connp->conn_lock); 5829 ASSERT(ixa == NULL); 5830 error = icmp_output_ancillary(connp, sin, NULL, mp, 5831 NULL, msg, cr, pid); 5832 } else if (conn_same_as_last_v4(connp, sin) && 5833 ipsec_outbound_policy_current(ixa)) { 5834 /* icmp_output_lastdst drops conn_lock */ 5835 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5836 } else { 5837 /* icmp_output_newdst drops conn_lock */ 5838 error = icmp_output_newdst(connp, mp, sin, NULL, cr, 5839 pid, ixa); 5840 } 5841 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5842 if (is->is_sendto_ignerr) 5843 return (0); 5844 else 5845 return (error); 5846 default: 5847 return (EINVAL); 5848 } 5849 } 5850 5851 sock_downcalls_t sock_rawip_downcalls = { 5852 rawip_activate, 5853 rawip_accept, 5854 
rawip_bind, 5855 rawip_listen, 5856 rawip_connect, 5857 rawip_getpeername, 5858 rawip_getsockname, 5859 rawip_getsockopt, 5860 rawip_setsockopt, 5861 rawip_send, 5862 NULL, 5863 NULL, 5864 NULL, 5865 rawip_shutdown, 5866 rawip_clr_flowctrl, 5867 rawip_ioctl, 5868 rawip_close 5869 }; 5870