1 /* 2 * Copyright (C) 2016-2018 Red Hat, Inc. 3 * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws> 4 * 5 * Network Block Device Server Side 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; under version 2 of the License. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "qapi/error.h" 22 #include "qemu/queue.h" 23 #include "trace.h" 24 #include "nbd-internal.h" 25 #include "qemu/units.h" 26 27 #define NBD_META_ID_BASE_ALLOCATION 0 28 #define NBD_META_ID_DIRTY_BITMAP 1 29 30 /* 31 * NBD_MAX_BLOCK_STATUS_EXTENTS: 1 MiB of extents data. An empirical 32 * constant. If an increase is needed, note that the NBD protocol 33 * recommends no larger than 32 mb, so that the client won't consider 34 * the reply as a denial of service attack. 
35 */ 36 #define NBD_MAX_BLOCK_STATUS_EXTENTS (1 * MiB / 8) 37 38 static int system_errno_to_nbd_errno(int err) 39 { 40 switch (err) { 41 case 0: 42 return NBD_SUCCESS; 43 case EPERM: 44 case EROFS: 45 return NBD_EPERM; 46 case EIO: 47 return NBD_EIO; 48 case ENOMEM: 49 return NBD_ENOMEM; 50 #ifdef EDQUOT 51 case EDQUOT: 52 #endif 53 case EFBIG: 54 case ENOSPC: 55 return NBD_ENOSPC; 56 case EOVERFLOW: 57 return NBD_EOVERFLOW; 58 case ESHUTDOWN: 59 return NBD_ESHUTDOWN; 60 case EINVAL: 61 default: 62 return NBD_EINVAL; 63 } 64 } 65 66 /* Definitions for opaque data types */ 67 68 typedef struct NBDRequestData NBDRequestData; 69 70 struct NBDRequestData { 71 QSIMPLEQ_ENTRY(NBDRequestData) entry; 72 NBDClient *client; 73 uint8_t *data; 74 bool complete; 75 }; 76 77 struct NBDExport { 78 int refcount; 79 void (*close)(NBDExport *exp); 80 81 BlockBackend *blk; 82 char *name; 83 char *description; 84 uint64_t dev_offset; 85 uint64_t size; 86 uint16_t nbdflags; 87 QTAILQ_HEAD(, NBDClient) clients; 88 QTAILQ_ENTRY(NBDExport) next; 89 90 AioContext *ctx; 91 92 BlockBackend *eject_notifier_blk; 93 Notifier eject_notifier; 94 95 BdrvDirtyBitmap *export_bitmap; 96 char *export_bitmap_context; 97 }; 98 99 static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); 100 101 /* NBDExportMetaContexts represents a list of contexts to be exported, 102 * as selected by NBD_OPT_SET_META_CONTEXT. Also used for 103 * NBD_OPT_LIST_META_CONTEXT. 
*/ 104 typedef struct NBDExportMetaContexts { 105 NBDExport *exp; 106 bool valid; /* means that negotiation of the option finished without 107 errors */ 108 bool base_allocation; /* export base:allocation context (block status) */ 109 bool bitmap; /* export qemu:dirty-bitmap:<export bitmap name> */ 110 } NBDExportMetaContexts; 111 112 struct NBDClient { 113 int refcount; 114 void (*close_fn)(NBDClient *client, bool negotiated); 115 116 NBDExport *exp; 117 QCryptoTLSCreds *tlscreds; 118 char *tlsauthz; 119 QIOChannelSocket *sioc; /* The underlying data channel */ 120 QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ 121 122 Coroutine *recv_coroutine; 123 124 CoMutex send_lock; 125 Coroutine *send_coroutine; 126 127 QTAILQ_ENTRY(NBDClient) next; 128 int nb_requests; 129 bool closing; 130 131 uint32_t check_align; /* If non-zero, check for aligned client requests */ 132 133 bool structured_reply; 134 NBDExportMetaContexts export_meta; 135 136 uint32_t opt; /* Current option being negotiated */ 137 uint32_t optlen; /* remaining length of data in ioc for the option being 138 negotiated now */ 139 }; 140 141 static void nbd_client_receive_next_request(NBDClient *client); 142 143 /* Basic flow for negotiation 144 145 Server Client 146 Negotiate 147 148 or 149 150 Server Client 151 Negotiate #1 152 Option 153 Negotiate #2 154 155 ---- 156 157 followed by 158 159 Server Client 160 Request 161 Response 162 Request 163 Response 164 ... 165 ... 166 Request (type == 2) 167 168 */ 169 170 static inline void set_be_option_rep(NBDOptionReply *rep, uint32_t option, 171 uint32_t type, uint32_t length) 172 { 173 stq_be_p(&rep->magic, NBD_REP_MAGIC); 174 stl_be_p(&rep->option, option); 175 stl_be_p(&rep->type, type); 176 stl_be_p(&rep->length, length); 177 } 178 179 /* Send a reply header, including length, but no payload. 180 * Return -errno on error, 0 on success. 
*/ 181 static int nbd_negotiate_send_rep_len(NBDClient *client, uint32_t type, 182 uint32_t len, Error **errp) 183 { 184 NBDOptionReply rep; 185 186 trace_nbd_negotiate_send_rep_len(client->opt, nbd_opt_lookup(client->opt), 187 type, nbd_rep_lookup(type), len); 188 189 assert(len < NBD_MAX_BUFFER_SIZE); 190 191 set_be_option_rep(&rep, client->opt, type, len); 192 return nbd_write(client->ioc, &rep, sizeof(rep), errp); 193 } 194 195 /* Send a reply header with default 0 length. 196 * Return -errno on error, 0 on success. */ 197 static int nbd_negotiate_send_rep(NBDClient *client, uint32_t type, 198 Error **errp) 199 { 200 return nbd_negotiate_send_rep_len(client, type, 0, errp); 201 } 202 203 /* Send an error reply. 204 * Return -errno on error, 0 on success. */ 205 static int GCC_FMT_ATTR(4, 0) 206 nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type, 207 Error **errp, const char *fmt, va_list va) 208 { 209 g_autofree char *msg = NULL; 210 int ret; 211 size_t len; 212 213 msg = g_strdup_vprintf(fmt, va); 214 len = strlen(msg); 215 assert(len < 4096); 216 trace_nbd_negotiate_send_rep_err(msg); 217 ret = nbd_negotiate_send_rep_len(client, type, len, errp); 218 if (ret < 0) { 219 return ret; 220 } 221 if (nbd_write(client->ioc, msg, len, errp) < 0) { 222 error_prepend(errp, "write failed (error message): "); 223 return -EIO; 224 } 225 226 return 0; 227 } 228 229 /* Send an error reply. 230 * Return -errno on error, 0 on success. */ 231 static int GCC_FMT_ATTR(4, 5) 232 nbd_negotiate_send_rep_err(NBDClient *client, uint32_t type, 233 Error **errp, const char *fmt, ...) 234 { 235 va_list va; 236 int ret; 237 238 va_start(va, fmt); 239 ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va); 240 va_end(va); 241 return ret; 242 } 243 244 /* Drop remainder of the current option, and send a reply with the 245 * given error type and message. Return -errno on read or write 246 * failure; or 0 if connection is still live. 
*/ 247 static int GCC_FMT_ATTR(4, 0) 248 nbd_opt_vdrop(NBDClient *client, uint32_t type, Error **errp, 249 const char *fmt, va_list va) 250 { 251 int ret = nbd_drop(client->ioc, client->optlen, errp); 252 253 client->optlen = 0; 254 if (!ret) { 255 ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va); 256 } 257 return ret; 258 } 259 260 static int GCC_FMT_ATTR(4, 5) 261 nbd_opt_drop(NBDClient *client, uint32_t type, Error **errp, 262 const char *fmt, ...) 263 { 264 int ret; 265 va_list va; 266 267 va_start(va, fmt); 268 ret = nbd_opt_vdrop(client, type, errp, fmt, va); 269 va_end(va); 270 271 return ret; 272 } 273 274 static int GCC_FMT_ATTR(3, 4) 275 nbd_opt_invalid(NBDClient *client, Error **errp, const char *fmt, ...) 276 { 277 int ret; 278 va_list va; 279 280 va_start(va, fmt); 281 ret = nbd_opt_vdrop(client, NBD_REP_ERR_INVALID, errp, fmt, va); 282 va_end(va); 283 284 return ret; 285 } 286 287 /* Read size bytes from the unparsed payload of the current option. 288 * Return -errno on I/O error, 0 if option was completely handled by 289 * sending a reply about inconsistent lengths, or 1 on success. */ 290 static int nbd_opt_read(NBDClient *client, void *buffer, size_t size, 291 Error **errp) 292 { 293 if (size > client->optlen) { 294 return nbd_opt_invalid(client, errp, 295 "Inconsistent lengths in option %s", 296 nbd_opt_lookup(client->opt)); 297 } 298 client->optlen -= size; 299 return qio_channel_read_all(client->ioc, buffer, size, errp) < 0 ? -EIO : 1; 300 } 301 302 /* Drop size bytes from the unparsed payload of the current option. 303 * Return -errno on I/O error, 0 if option was completely handled by 304 * sending a reply about inconsistent lengths, or 1 on success. 
*/ 305 static int nbd_opt_skip(NBDClient *client, size_t size, Error **errp) 306 { 307 if (size > client->optlen) { 308 return nbd_opt_invalid(client, errp, 309 "Inconsistent lengths in option %s", 310 nbd_opt_lookup(client->opt)); 311 } 312 client->optlen -= size; 313 return nbd_drop(client->ioc, size, errp) < 0 ? -EIO : 1; 314 } 315 316 /* nbd_opt_read_name 317 * 318 * Read a string with the format: 319 * uint32_t len (<= NBD_MAX_NAME_SIZE) 320 * len bytes string (not 0-terminated) 321 * 322 * @name should be enough to store NBD_MAX_NAME_SIZE+1. 323 * If @length is non-null, it will be set to the actual string length. 324 * 325 * Return -errno on I/O error, 0 if option was completely handled by 326 * sending a reply about inconsistent lengths, or 1 on success. 327 */ 328 static int nbd_opt_read_name(NBDClient *client, char *name, uint32_t *length, 329 Error **errp) 330 { 331 int ret; 332 uint32_t len; 333 334 ret = nbd_opt_read(client, &len, sizeof(len), errp); 335 if (ret <= 0) { 336 return ret; 337 } 338 len = cpu_to_be32(len); 339 340 if (len > NBD_MAX_NAME_SIZE) { 341 return nbd_opt_invalid(client, errp, 342 "Invalid name length: %" PRIu32, len); 343 } 344 345 ret = nbd_opt_read(client, name, len, errp); 346 if (ret <= 0) { 347 return ret; 348 } 349 name[len] = '\0'; 350 351 if (length) { 352 *length = len; 353 } 354 355 return 1; 356 } 357 358 /* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload. 359 * Return -errno on error, 0 on success. */ 360 static int nbd_negotiate_send_rep_list(NBDClient *client, NBDExport *exp, 361 Error **errp) 362 { 363 size_t name_len, desc_len; 364 uint32_t len; 365 const char *name = exp->name ? exp->name : ""; 366 const char *desc = exp->description ? 
exp->description : ""; 367 QIOChannel *ioc = client->ioc; 368 int ret; 369 370 trace_nbd_negotiate_send_rep_list(name, desc); 371 name_len = strlen(name); 372 desc_len = strlen(desc); 373 len = name_len + desc_len + sizeof(len); 374 ret = nbd_negotiate_send_rep_len(client, NBD_REP_SERVER, len, errp); 375 if (ret < 0) { 376 return ret; 377 } 378 379 len = cpu_to_be32(name_len); 380 if (nbd_write(ioc, &len, sizeof(len), errp) < 0) { 381 error_prepend(errp, "write failed (name length): "); 382 return -EINVAL; 383 } 384 385 if (nbd_write(ioc, name, name_len, errp) < 0) { 386 error_prepend(errp, "write failed (name buffer): "); 387 return -EINVAL; 388 } 389 390 if (nbd_write(ioc, desc, desc_len, errp) < 0) { 391 error_prepend(errp, "write failed (description buffer): "); 392 return -EINVAL; 393 } 394 395 return 0; 396 } 397 398 /* Process the NBD_OPT_LIST command, with a potential series of replies. 399 * Return -errno on error, 0 on success. */ 400 static int nbd_negotiate_handle_list(NBDClient *client, Error **errp) 401 { 402 NBDExport *exp; 403 assert(client->opt == NBD_OPT_LIST); 404 405 /* For each export, send a NBD_REP_SERVER reply. */ 406 QTAILQ_FOREACH(exp, &exports, next) { 407 if (nbd_negotiate_send_rep_list(client, exp, errp)) { 408 return -EINVAL; 409 } 410 } 411 /* Finish with a NBD_REP_ACK. */ 412 return nbd_negotiate_send_rep(client, NBD_REP_ACK, errp); 413 } 414 415 static void nbd_check_meta_export(NBDClient *client) 416 { 417 client->export_meta.valid &= client->exp == client->export_meta.exp; 418 } 419 420 /* Send a reply to NBD_OPT_EXPORT_NAME. 421 * Return -errno on error, 0 on success. */ 422 static int nbd_negotiate_handle_export_name(NBDClient *client, bool no_zeroes, 423 Error **errp) 424 { 425 char name[NBD_MAX_NAME_SIZE + 1]; 426 char buf[NBD_REPLY_EXPORT_NAME_SIZE] = ""; 427 size_t len; 428 int ret; 429 uint16_t myflags; 430 431 /* Client sends: 432 [20 .. xx] export name (length bytes) 433 Server replies: 434 [ 0 .. 7] size 435 [ 8 .. 
9] export flags 436 [10 .. 133] reserved (0) [unless no_zeroes] 437 */ 438 trace_nbd_negotiate_handle_export_name(); 439 if (client->optlen >= sizeof(name)) { 440 error_setg(errp, "Bad length received"); 441 return -EINVAL; 442 } 443 if (nbd_read(client->ioc, name, client->optlen, "export name", errp) < 0) { 444 return -EIO; 445 } 446 name[client->optlen] = '\0'; 447 client->optlen = 0; 448 449 trace_nbd_negotiate_handle_export_name_request(name); 450 451 client->exp = nbd_export_find(name); 452 if (!client->exp) { 453 error_setg(errp, "export not found"); 454 return -EINVAL; 455 } 456 457 myflags = client->exp->nbdflags; 458 if (client->structured_reply) { 459 myflags |= NBD_FLAG_SEND_DF; 460 } 461 trace_nbd_negotiate_new_style_size_flags(client->exp->size, myflags); 462 stq_be_p(buf, client->exp->size); 463 stw_be_p(buf + 8, myflags); 464 len = no_zeroes ? 10 : sizeof(buf); 465 ret = nbd_write(client->ioc, buf, len, errp); 466 if (ret < 0) { 467 error_prepend(errp, "write failed: "); 468 return ret; 469 } 470 471 QTAILQ_INSERT_TAIL(&client->exp->clients, client, next); 472 nbd_export_get(client->exp); 473 nbd_check_meta_export(client); 474 475 return 0; 476 } 477 478 /* Send a single NBD_REP_INFO, with a buffer @buf of @length bytes. 479 * The buffer does NOT include the info type prefix. 480 * Return -errno on error, 0 if ready to send more. */ 481 static int nbd_negotiate_send_info(NBDClient *client, 482 uint16_t info, uint32_t length, void *buf, 483 Error **errp) 484 { 485 int rc; 486 487 trace_nbd_negotiate_send_info(info, nbd_info_lookup(info), length); 488 rc = nbd_negotiate_send_rep_len(client, NBD_REP_INFO, 489 sizeof(info) + length, errp); 490 if (rc < 0) { 491 return rc; 492 } 493 info = cpu_to_be16(info); 494 if (nbd_write(client->ioc, &info, sizeof(info), errp) < 0) { 495 return -EIO; 496 } 497 if (nbd_write(client->ioc, buf, length, errp) < 0) { 498 return -EIO; 499 } 500 return 0; 501 } 502 503 /* nbd_reject_length: Handle any unexpected payload. 
504 * @fatal requests that we quit talking to the client, even if we are able 505 * to successfully send an error reply. 506 * Return: 507 * -errno transmission error occurred or @fatal was requested, errp is set 508 * 0 error message successfully sent to client, errp is not set 509 */ 510 static int nbd_reject_length(NBDClient *client, bool fatal, Error **errp) 511 { 512 int ret; 513 514 assert(client->optlen); 515 ret = nbd_opt_invalid(client, errp, "option '%s' has unexpected length", 516 nbd_opt_lookup(client->opt)); 517 if (fatal && !ret) { 518 error_setg(errp, "option '%s' has unexpected length", 519 nbd_opt_lookup(client->opt)); 520 return -EINVAL; 521 } 522 return ret; 523 } 524 525 /* Handle NBD_OPT_INFO and NBD_OPT_GO. 526 * Return -errno on error, 0 if ready for next option, and 1 to move 527 * into transmission phase. */ 528 static int nbd_negotiate_handle_info(NBDClient *client, Error **errp) 529 { 530 int rc; 531 char name[NBD_MAX_NAME_SIZE + 1]; 532 NBDExport *exp; 533 uint16_t requests; 534 uint16_t request; 535 uint32_t namelen; 536 bool sendname = false; 537 bool blocksize = false; 538 uint32_t sizes[3]; 539 char buf[sizeof(uint64_t) + sizeof(uint16_t)]; 540 uint32_t check_align = 0; 541 uint16_t myflags; 542 543 /* Client sends: 544 4 bytes: L, name length (can be 0) 545 L bytes: export name 546 2 bytes: N, number of requests (can be 0) 547 N * 2 bytes: N requests 548 */ 549 rc = nbd_opt_read_name(client, name, &namelen, errp); 550 if (rc <= 0) { 551 return rc; 552 } 553 trace_nbd_negotiate_handle_export_name_request(name); 554 555 rc = nbd_opt_read(client, &requests, sizeof(requests), errp); 556 if (rc <= 0) { 557 return rc; 558 } 559 requests = be16_to_cpu(requests); 560 trace_nbd_negotiate_handle_info_requests(requests); 561 while (requests--) { 562 rc = nbd_opt_read(client, &request, sizeof(request), errp); 563 if (rc <= 0) { 564 return rc; 565 } 566 request = be16_to_cpu(request); 567 trace_nbd_negotiate_handle_info_request(request, 568 
nbd_info_lookup(request)); 569 /* We care about NBD_INFO_NAME and NBD_INFO_BLOCK_SIZE; 570 * everything else is either a request we don't know or 571 * something we send regardless of request */ 572 switch (request) { 573 case NBD_INFO_NAME: 574 sendname = true; 575 break; 576 case NBD_INFO_BLOCK_SIZE: 577 blocksize = true; 578 break; 579 } 580 } 581 if (client->optlen) { 582 return nbd_reject_length(client, false, errp); 583 } 584 585 exp = nbd_export_find(name); 586 if (!exp) { 587 return nbd_negotiate_send_rep_err(client, NBD_REP_ERR_UNKNOWN, 588 errp, "export '%s' not present", 589 name); 590 } 591 592 /* Don't bother sending NBD_INFO_NAME unless client requested it */ 593 if (sendname) { 594 rc = nbd_negotiate_send_info(client, NBD_INFO_NAME, namelen, name, 595 errp); 596 if (rc < 0) { 597 return rc; 598 } 599 } 600 601 /* Send NBD_INFO_DESCRIPTION only if available, regardless of 602 * client request */ 603 if (exp->description) { 604 size_t len = strlen(exp->description); 605 606 rc = nbd_negotiate_send_info(client, NBD_INFO_DESCRIPTION, 607 len, exp->description, errp); 608 if (rc < 0) { 609 return rc; 610 } 611 } 612 613 /* Send NBD_INFO_BLOCK_SIZE always, but tweak the minimum size 614 * according to whether the client requested it, and according to 615 * whether this is OPT_INFO or OPT_GO. */ 616 /* minimum - 1 for back-compat, or actual if client will obey it. */ 617 if (client->opt == NBD_OPT_INFO || blocksize) { 618 check_align = sizes[0] = blk_get_request_alignment(exp->blk); 619 } else { 620 sizes[0] = 1; 621 } 622 assert(sizes[0] <= NBD_MAX_BUFFER_SIZE); 623 /* preferred - Hard-code to 4096 for now. 624 * TODO: is blk_bs(blk)->bl.opt_transfer appropriate? */ 625 sizes[1] = MAX(4096, sizes[0]); 626 /* maximum - At most 32M, but smaller as appropriate. 
*/ 627 sizes[2] = MIN(blk_get_max_transfer(exp->blk), NBD_MAX_BUFFER_SIZE); 628 trace_nbd_negotiate_handle_info_block_size(sizes[0], sizes[1], sizes[2]); 629 sizes[0] = cpu_to_be32(sizes[0]); 630 sizes[1] = cpu_to_be32(sizes[1]); 631 sizes[2] = cpu_to_be32(sizes[2]); 632 rc = nbd_negotiate_send_info(client, NBD_INFO_BLOCK_SIZE, 633 sizeof(sizes), sizes, errp); 634 if (rc < 0) { 635 return rc; 636 } 637 638 /* Send NBD_INFO_EXPORT always */ 639 myflags = exp->nbdflags; 640 if (client->structured_reply) { 641 myflags |= NBD_FLAG_SEND_DF; 642 } 643 trace_nbd_negotiate_new_style_size_flags(exp->size, myflags); 644 stq_be_p(buf, exp->size); 645 stw_be_p(buf + 8, myflags); 646 rc = nbd_negotiate_send_info(client, NBD_INFO_EXPORT, 647 sizeof(buf), buf, errp); 648 if (rc < 0) { 649 return rc; 650 } 651 652 /* 653 * If the client is just asking for NBD_OPT_INFO, but forgot to 654 * request block sizes in a situation that would impact 655 * performance, then return an error. But for NBD_OPT_GO, we 656 * tolerate all clients, regardless of alignments. 657 */ 658 if (client->opt == NBD_OPT_INFO && !blocksize && 659 blk_get_request_alignment(exp->blk) > 1) { 660 return nbd_negotiate_send_rep_err(client, 661 NBD_REP_ERR_BLOCK_SIZE_REQD, 662 errp, 663 "request NBD_INFO_BLOCK_SIZE to " 664 "use this export"); 665 } 666 667 /* Final reply */ 668 rc = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp); 669 if (rc < 0) { 670 return rc; 671 } 672 673 if (client->opt == NBD_OPT_GO) { 674 client->exp = exp; 675 client->check_align = check_align; 676 QTAILQ_INSERT_TAIL(&client->exp->clients, client, next); 677 nbd_export_get(client->exp); 678 nbd_check_meta_export(client); 679 rc = 1; 680 } 681 return rc; 682 } 683 684 685 /* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the 686 * new channel for all further (now-encrypted) communication. 
*/ 687 static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client, 688 Error **errp) 689 { 690 QIOChannel *ioc; 691 QIOChannelTLS *tioc; 692 struct NBDTLSHandshakeData data = { 0 }; 693 694 assert(client->opt == NBD_OPT_STARTTLS); 695 696 trace_nbd_negotiate_handle_starttls(); 697 ioc = client->ioc; 698 699 if (nbd_negotiate_send_rep(client, NBD_REP_ACK, errp) < 0) { 700 return NULL; 701 } 702 703 tioc = qio_channel_tls_new_server(ioc, 704 client->tlscreds, 705 client->tlsauthz, 706 errp); 707 if (!tioc) { 708 return NULL; 709 } 710 711 qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls"); 712 trace_nbd_negotiate_handle_starttls_handshake(); 713 data.loop = g_main_loop_new(g_main_context_default(), FALSE); 714 qio_channel_tls_handshake(tioc, 715 nbd_tls_handshake, 716 &data, 717 NULL, 718 NULL); 719 720 if (!data.complete) { 721 g_main_loop_run(data.loop); 722 } 723 g_main_loop_unref(data.loop); 724 if (data.error) { 725 object_unref(OBJECT(tioc)); 726 error_propagate(errp, data.error); 727 return NULL; 728 } 729 730 return QIO_CHANNEL(tioc); 731 } 732 733 /* nbd_negotiate_send_meta_context 734 * 735 * Send one chunk of reply to NBD_OPT_{LIST,SET}_META_CONTEXT 736 * 737 * For NBD_OPT_LIST_META_CONTEXT @context_id is ignored, 0 is used instead. 738 */ 739 static int nbd_negotiate_send_meta_context(NBDClient *client, 740 const char *context, 741 uint32_t context_id, 742 Error **errp) 743 { 744 NBDOptionReplyMetaContext opt; 745 struct iovec iov[] = { 746 {.iov_base = &opt, .iov_len = sizeof(opt)}, 747 {.iov_base = (void *)context, .iov_len = strlen(context)} 748 }; 749 750 if (client->opt == NBD_OPT_LIST_META_CONTEXT) { 751 context_id = 0; 752 } 753 754 trace_nbd_negotiate_meta_query_reply(context, context_id); 755 set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT, 756 sizeof(opt) - sizeof(opt.h) + iov[1].iov_len); 757 stl_be_p(&opt.context_id, context_id); 758 759 return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? 
-EIO : 0; 760 } 761 762 /* Read strlen(@pattern) bytes, and set @match to true if they match @pattern. 763 * @match is never set to false. 764 * 765 * Return -errno on I/O error, 0 if option was completely handled by 766 * sending a reply about inconsistent lengths, or 1 on success. 767 * 768 * Note: return code = 1 doesn't mean that we've read exactly @pattern. 769 * It only means that there are no errors. 770 */ 771 static int nbd_meta_pattern(NBDClient *client, const char *pattern, bool *match, 772 Error **errp) 773 { 774 int ret; 775 char *query; 776 size_t len = strlen(pattern); 777 778 assert(len); 779 780 query = g_malloc(len); 781 ret = nbd_opt_read(client, query, len, errp); 782 if (ret <= 0) { 783 g_free(query); 784 return ret; 785 } 786 787 if (strncmp(query, pattern, len) == 0) { 788 trace_nbd_negotiate_meta_query_parse(pattern); 789 *match = true; 790 } else { 791 trace_nbd_negotiate_meta_query_skip("pattern not matched"); 792 } 793 g_free(query); 794 795 return 1; 796 } 797 798 /* 799 * Read @len bytes, and set @match to true if they match @pattern, or if @len 800 * is 0 and the client is performing _LIST_. @match is never set to false. 801 * 802 * Return -errno on I/O error, 0 if option was completely handled by 803 * sending a reply about inconsistent lengths, or 1 on success. 804 * 805 * Note: return code = 1 doesn't mean that we've read exactly @pattern. 806 * It only means that there are no errors. 
807 */ 808 static int nbd_meta_empty_or_pattern(NBDClient *client, const char *pattern, 809 uint32_t len, bool *match, Error **errp) 810 { 811 if (len == 0) { 812 if (client->opt == NBD_OPT_LIST_META_CONTEXT) { 813 *match = true; 814 } 815 trace_nbd_negotiate_meta_query_parse("empty"); 816 return 1; 817 } 818 819 if (len != strlen(pattern)) { 820 trace_nbd_negotiate_meta_query_skip("different lengths"); 821 return nbd_opt_skip(client, len, errp); 822 } 823 824 return nbd_meta_pattern(client, pattern, match, errp); 825 } 826 827 /* nbd_meta_base_query 828 * 829 * Handle queries to 'base' namespace. For now, only the base:allocation 830 * context is available. 'len' is the amount of text remaining to be read from 831 * the current name, after the 'base:' portion has been stripped. 832 * 833 * Return -errno on I/O error, 0 if option was completely handled by 834 * sending a reply about inconsistent lengths, or 1 on success. 835 */ 836 static int nbd_meta_base_query(NBDClient *client, NBDExportMetaContexts *meta, 837 uint32_t len, Error **errp) 838 { 839 return nbd_meta_empty_or_pattern(client, "allocation", len, 840 &meta->base_allocation, errp); 841 } 842 843 /* nbd_meta_bitmap_query 844 * 845 * Handle query to 'qemu:' namespace. 846 * @len is the amount of text remaining to be read from the current name, after 847 * the 'qemu:' portion has been stripped. 848 * 849 * Return -errno on I/O error, 0 if option was completely handled by 850 * sending a reply about inconsistent lengths, or 1 on success. 
*/ 851 static int nbd_meta_qemu_query(NBDClient *client, NBDExportMetaContexts *meta, 852 uint32_t len, Error **errp) 853 { 854 bool dirty_bitmap = false; 855 size_t dirty_bitmap_len = strlen("dirty-bitmap:"); 856 int ret; 857 858 if (!meta->exp->export_bitmap) { 859 trace_nbd_negotiate_meta_query_skip("no dirty-bitmap exported"); 860 return nbd_opt_skip(client, len, errp); 861 } 862 863 if (len == 0) { 864 if (client->opt == NBD_OPT_LIST_META_CONTEXT) { 865 meta->bitmap = true; 866 } 867 trace_nbd_negotiate_meta_query_parse("empty"); 868 return 1; 869 } 870 871 if (len < dirty_bitmap_len) { 872 trace_nbd_negotiate_meta_query_skip("not dirty-bitmap:"); 873 return nbd_opt_skip(client, len, errp); 874 } 875 876 len -= dirty_bitmap_len; 877 ret = nbd_meta_pattern(client, "dirty-bitmap:", &dirty_bitmap, errp); 878 if (ret <= 0) { 879 return ret; 880 } 881 if (!dirty_bitmap) { 882 trace_nbd_negotiate_meta_query_skip("not dirty-bitmap:"); 883 return nbd_opt_skip(client, len, errp); 884 } 885 886 trace_nbd_negotiate_meta_query_parse("dirty-bitmap:"); 887 888 return nbd_meta_empty_or_pattern( 889 client, meta->exp->export_bitmap_context + 890 strlen("qemu:dirty_bitmap:"), len, &meta->bitmap, errp); 891 } 892 893 /* nbd_negotiate_meta_query 894 * 895 * Parse namespace name and call corresponding function to parse body of the 896 * query. 897 * 898 * The only supported namespace now is 'base'. 899 * 900 * The function aims not wasting time and memory to read long unknown namespace 901 * names. 902 * 903 * Return -errno on I/O error, 0 if option was completely handled by 904 * sending a reply about inconsistent lengths, or 1 on success. */ 905 static int nbd_negotiate_meta_query(NBDClient *client, 906 NBDExportMetaContexts *meta, Error **errp) 907 { 908 /* 909 * Both 'qemu' and 'base' namespaces have length = 5 including a 910 * colon. If another length namespace is later introduced, this 911 * should certainly be refactored. 
912 */ 913 int ret; 914 size_t ns_len = 5; 915 char ns[5]; 916 uint32_t len; 917 918 ret = nbd_opt_read(client, &len, sizeof(len), errp); 919 if (ret <= 0) { 920 return ret; 921 } 922 len = cpu_to_be32(len); 923 924 if (len < ns_len) { 925 trace_nbd_negotiate_meta_query_skip("length too short"); 926 return nbd_opt_skip(client, len, errp); 927 } 928 929 len -= ns_len; 930 ret = nbd_opt_read(client, ns, ns_len, errp); 931 if (ret <= 0) { 932 return ret; 933 } 934 935 if (!strncmp(ns, "base:", ns_len)) { 936 trace_nbd_negotiate_meta_query_parse("base:"); 937 return nbd_meta_base_query(client, meta, len, errp); 938 } else if (!strncmp(ns, "qemu:", ns_len)) { 939 trace_nbd_negotiate_meta_query_parse("qemu:"); 940 return nbd_meta_qemu_query(client, meta, len, errp); 941 } 942 943 trace_nbd_negotiate_meta_query_skip("unknown namespace"); 944 return nbd_opt_skip(client, len, errp); 945 } 946 947 /* nbd_negotiate_meta_queries 948 * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT 949 * 950 * Return -errno on I/O error, or 0 if option was completely handled. */ 951 static int nbd_negotiate_meta_queries(NBDClient *client, 952 NBDExportMetaContexts *meta, Error **errp) 953 { 954 int ret; 955 char export_name[NBD_MAX_NAME_SIZE + 1]; 956 NBDExportMetaContexts local_meta; 957 uint32_t nb_queries; 958 int i; 959 960 if (!client->structured_reply) { 961 return nbd_opt_invalid(client, errp, 962 "request option '%s' when structured reply " 963 "is not negotiated", 964 nbd_opt_lookup(client->opt)); 965 } 966 967 if (client->opt == NBD_OPT_LIST_META_CONTEXT) { 968 /* Only change the caller's meta on SET. 
*/ 969 meta = &local_meta; 970 } 971 972 memset(meta, 0, sizeof(*meta)); 973 974 ret = nbd_opt_read_name(client, export_name, NULL, errp); 975 if (ret <= 0) { 976 return ret; 977 } 978 979 meta->exp = nbd_export_find(export_name); 980 if (meta->exp == NULL) { 981 return nbd_opt_drop(client, NBD_REP_ERR_UNKNOWN, errp, 982 "export '%s' not present", export_name); 983 } 984 985 ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), errp); 986 if (ret <= 0) { 987 return ret; 988 } 989 nb_queries = cpu_to_be32(nb_queries); 990 trace_nbd_negotiate_meta_context(nbd_opt_lookup(client->opt), 991 export_name, nb_queries); 992 993 if (client->opt == NBD_OPT_LIST_META_CONTEXT && !nb_queries) { 994 /* enable all known contexts */ 995 meta->base_allocation = true; 996 meta->bitmap = !!meta->exp->export_bitmap; 997 } else { 998 for (i = 0; i < nb_queries; ++i) { 999 ret = nbd_negotiate_meta_query(client, meta, errp); 1000 if (ret <= 0) { 1001 return ret; 1002 } 1003 } 1004 } 1005 1006 if (meta->base_allocation) { 1007 ret = nbd_negotiate_send_meta_context(client, "base:allocation", 1008 NBD_META_ID_BASE_ALLOCATION, 1009 errp); 1010 if (ret < 0) { 1011 return ret; 1012 } 1013 } 1014 1015 if (meta->bitmap) { 1016 ret = nbd_negotiate_send_meta_context(client, 1017 meta->exp->export_bitmap_context, 1018 NBD_META_ID_DIRTY_BITMAP, 1019 errp); 1020 if (ret < 0) { 1021 return ret; 1022 } 1023 } 1024 1025 ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp); 1026 if (ret == 0) { 1027 meta->valid = true; 1028 } 1029 1030 return ret; 1031 } 1032 1033 /* nbd_negotiate_options 1034 * Process all NBD_OPT_* client option commands, during fixed newstyle 1035 * negotiation. 1036 * Return: 1037 * -errno on error, errp is set 1038 * 0 on successful negotiation, errp is not set 1039 * 1 if client sent NBD_OPT_ABORT, i.e. 
on valid disconnect,
 *      errp is not set
 */
static int nbd_negotiate_options(NBDClient *client, Error **errp)
{
    uint32_t flags;
    bool fixedNewstyle = false;
    bool no_zeroes = false;

    /* Client sends:
        [ 0 ..   3]   client flags

       Then we loop until NBD_OPT_EXPORT_NAME or NBD_OPT_GO:
        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   NBD option
        [12 ..  15]   Data length
        ...           Rest of request

        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   Second NBD option
        [12 ..  15]   Data length
        ...           Rest of request
    */

    if (nbd_read32(client->ioc, &flags, "flags", errp) < 0) {
        return -EIO;
    }
    trace_nbd_negotiate_options_flags(flags);

    /* Strip every flag we understand; anything left over is unknown. */
    if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
        fixedNewstyle = true;
        flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
    }
    if (flags & NBD_FLAG_C_NO_ZEROES) {
        no_zeroes = true;
        flags &= ~NBD_FLAG_C_NO_ZEROES;
    }
    if (flags != 0) {
        error_setg(errp, "Unknown client flags 0x%" PRIx32 " received", flags);
        return -EINVAL;
    }

    while (1) {
        int ret;
        uint32_t option, length;
        uint64_t magic;

        /* Option header: magic, option code, payload length. */
        if (nbd_read64(client->ioc, &magic, "opts magic", errp) < 0) {
            return -EINVAL;
        }
        trace_nbd_negotiate_options_check_magic(magic);
        if (magic != NBD_OPTS_MAGIC) {
            error_setg(errp, "Bad magic received");
            return -EINVAL;
        }

        if (nbd_read32(client->ioc, &option, "option", errp) < 0) {
            return -EINVAL;
        }
        client->opt = option;

        if (nbd_read32(client->ioc, &length, "option length", errp) < 0) {
            return -EINVAL;
        }
        /*
         * optlen must be back at zero before the next option starts
         * (presumably the option helpers count it down as they consume
         * the payload -- they are not visible from here).
         */
        assert(!client->optlen);
        client->optlen = length;

        if (length > NBD_MAX_BUFFER_SIZE) {
            error_setg(errp, "len (%" PRIu32 ") is larger than max len (%u)",
                       length, NBD_MAX_BUFFER_SIZE);
            return -EINVAL;
        }

        trace_nbd_negotiate_options_check_option(option,
                                                 nbd_opt_lookup(option));
        if (client->tlscreds &&
            client->ioc == (QIOChannel *)client->sioc) {
            /*
             * TLS credentials are configured but we are still talking
             * over the raw socket channel: only STARTTLS is acceptable.
             */
            QIOChannel *tioc;
            if (!fixedNewstyle) {
                error_setg(errp, "Unsupported option 0x%" PRIx32, option);
                return -EINVAL;
            }
            switch (option) {
            case NBD_OPT_STARTTLS:
                if (length) {
                    /* Unconditionally drop the connection if the client
                     * can't start a TLS negotiation correctly */
                    return nbd_reject_length(client, true, errp);
                }
                tioc = nbd_negotiate_handle_starttls(client, errp);
                if (!tioc) {
                    return -EIO;
                }
                ret = 0;
                /* From here on all traffic goes through the TLS channel. */
                object_unref(OBJECT(client->ioc));
                client->ioc = QIO_CHANNEL(tioc);
                break;

            case NBD_OPT_EXPORT_NAME:
                /* No way to return an error to client, so drop connection */
                error_setg(errp, "Option 0x%" PRIx32
                           " not permitted before TLS", option);
                return -EINVAL;

            default:
                /* Let the client keep trying, unless they asked to
                 * quit. Always try to give an error back to the
                 * client; but when replying to OPT_ABORT, be aware
                 * that the client may hang up before receiving the
                 * error, in which case we are fine ignoring the
                 * resulting EPIPE. */
                ret = nbd_opt_drop(client, NBD_REP_ERR_TLS_REQD,
                                   option == NBD_OPT_ABORT ? NULL : errp,
                                   "Option 0x%" PRIx32
                                   " not permitted before TLS", option);
                if (option == NBD_OPT_ABORT) {
                    return 1;
                }
                break;
            }
        } else if (fixedNewstyle) {
            switch (option) {
            case NBD_OPT_LIST:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else {
                    ret = nbd_negotiate_handle_list(client, errp);
                }
                break;

            case NBD_OPT_ABORT:
                /* NBD spec says we must try to reply before
                 * disconnecting, but that we must also tolerate
                 * guests that don't wait for our reply. */
                nbd_negotiate_send_rep(client, NBD_REP_ACK, NULL);
                return 1;

            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, no_zeroes,
                                                        errp);

            case NBD_OPT_INFO:
            case NBD_OPT_GO:
                ret = nbd_negotiate_handle_info(client, errp);
                if (ret == 1) {
                    /* A return of 1 ends negotiation; only GO may do so. */
                    assert(option == NBD_OPT_GO);
                    return 0;
                }
                break;

            case NBD_OPT_STARTTLS:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->tlscreds) {
                    ret = nbd_negotiate_send_rep_err(client,
                                                     NBD_REP_ERR_INVALID, errp,
                                                     "TLS already enabled");
                } else {
                    ret = nbd_negotiate_send_rep_err(client,
                                                     NBD_REP_ERR_POLICY, errp,
                                                     "TLS not configured");
                }
                break;

            case NBD_OPT_STRUCTURED_REPLY:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->structured_reply) {
                    ret = nbd_negotiate_send_rep_err(
                        client, NBD_REP_ERR_INVALID, errp,
                        "structured reply already negotiated");
                } else {
                    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
                    client->structured_reply = true;
                }
                break;

            case NBD_OPT_LIST_META_CONTEXT:
            case NBD_OPT_SET_META_CONTEXT:
                ret = nbd_negotiate_meta_queries(client, &client->export_meta,
                                                 errp);
                break;

            default:
                ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
                                   "Unsupported option %" PRIu32 " (%s)",
                                   option, nbd_opt_lookup(option));
                break;
            }
        } else {
            /*
             * If broken new-style we should drop the connection
             * for anything except NBD_OPT_EXPORT_NAME
             */
            switch (option) {
            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, no_zeroes,
                                                        errp);

            default:
                error_setg(errp, "Unsupported option %" PRIu32 " (%s)",
                           option, nbd_opt_lookup(option));
                return -EINVAL;
            }
        }
        /* Non-negative results keep the option loop going. */
        if (ret < 0) {
            return ret;
        }
    }
}
/* nbd_negotiate
 * Send the newstyle greeting and run option negotiation.
 * Return:
 * -errno  on error, errp is set
 * 0       on successful negotiation, errp is not set
 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
 *         errp is not set
 */
static coroutine_fn int nbd_negotiate(NBDClient *client, Error **errp)
{
    char buf[NBD_OLDSTYLE_NEGOTIATE_SIZE] = "";
    int ret;

    /* Old style negotiation header, no room for options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
        [16 ..  23]   size
        [24 ..  27]   export flags (zero-extended)
        [28 .. 151]   reserved     (0)

       New style negotiation header, client can send options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
        [16 ..  17]   server flags (0)
        ....options sent, ending in NBD_OPT_EXPORT_NAME or NBD_OPT_GO....
     */

    qio_channel_set_blocking(client->ioc, false, NULL);

    trace_nbd_negotiate_begin();
    memcpy(buf, "NBDMAGIC", 8);

    stq_be_p(buf + 8, NBD_OPTS_MAGIC);
    stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES);

    /* Only the 18-byte newstyle header is written. */
    if (nbd_write(client->ioc, buf, 18, errp) < 0) {
        error_prepend(errp, "write failed: ");
        return -EINVAL;
    }
    ret = nbd_negotiate_options(client, errp);
    if (ret != 0) {
        if (ret < 0) {
            error_prepend(errp, "option negotiation failed: ");
        }
        return ret;
    }

    assert(!client->optlen);
    trace_nbd_negotiate_success();

    return 0;
}

/*
 * Read one fixed-size request header from @ioc and decode it into @request.
 * Returns 0 on success, a negative value if the read fails, or -EINVAL
 * (with errp set) if the magic does not match NBD_REQUEST_MAGIC.
 */
static int nbd_receive_request(QIOChannel *ioc, NBDRequest *request,
                               Error **errp)
{
    uint8_t buf[NBD_REQUEST_SIZE];
    uint32_t magic;
    int ret;

    ret = nbd_read(ioc, buf, sizeof(buf), "request", errp);
    if (ret < 0) {
        return ret;
    }

    /* Request
       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
       [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, ...)
       [ 6 ..  7]   type    (NBD_CMD_READ, ...)
       [ 8 .. 15]   handle
       [16 .. 23]   from
       [24 .. 27]   len
     */

    magic = ldl_be_p(buf);
    request->flags  = lduw_be_p(buf + 4);
    request->type   = lduw_be_p(buf + 6);
    request->handle = ldq_be_p(buf + 8);
    request->from   = ldq_be_p(buf + 16);
    request->len    = ldl_be_p(buf + 24);

    trace_nbd_receive_request(magic, request->flags, request->type,
                              request->from, request->len);

    if (magic != NBD_REQUEST_MAGIC) {
        error_setg(errp, "invalid magic (got 0x%" PRIx32 ")", magic);
        return -EINVAL;
    }
    return 0;
}

/* Upper bound on client->nb_requests, asserted in nbd_request_get(). */
#define MAX_NBD_REQUESTS 16

/* Take a reference on @client. */
void nbd_client_get(NBDClient *client)
{
    client->refcount++;
}

/*
 * Drop a reference on @client.  When the count reaches zero the channels
 * and TLS objects are released, the client is unlinked from its export
 * (dropping that export reference) and the structure is freed.
 */
void nbd_client_put(NBDClient *client)
{
    if (--client->refcount == 0) {
        /* The last reference should be dropped by client->close,
         * which is called by client_close.
         */
        assert(client->closing);

        qio_channel_detach_aio_context(client->ioc);
        object_unref(OBJECT(client->sioc));
        object_unref(OBJECT(client->ioc));
        if (client->tlscreds) {
            object_unref(OBJECT(client->tlscreds));
        }
        g_free(client->tlsauthz);
        if (client->exp) {
            QTAILQ_REMOVE(&client->exp->clients, client, next);
            nbd_export_put(client->exp);
        }
        g_free(client);
    }
}

/*
 * Begin closing @client.  Does nothing if the client is already closing;
 * otherwise marks it closing, shuts the channel down in both directions
 * and invokes the close_fn callback (if set) with @negotiated.
 */
static void client_close(NBDClient *client, bool negotiated)
{
    if (client->closing) {
        return;
    }

    client->closing = true;

    /* Force requests to finish.  They will drop their own references,
     * then we'll close the socket and free the NBDClient.
     */
    qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
                         NULL);

    /* Also tell the client, so that they release their reference.  */
    if (client->close_fn) {
        client->close_fn(client, negotiated);
    }
}

/*
 * Allocate a zeroed NBDRequestData bound to @client.  Bumps the client's
 * in-flight counter (asserted to stay within MAX_NBD_REQUESTS) and takes
 * a client reference, both undone by nbd_request_put().
 */
static NBDRequestData *nbd_request_get(NBDClient *client)
{
    NBDRequestData *req;

    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
    client->nb_requests++;

    req = g_new0(NBDRequestData, 1);
    nbd_client_get(client);
    req->client = client;
    return req;
}

/*
 * Release @req and its data buffer, decrement the in-flight counter, call
 * nbd_client_receive_next_request() and finally drop the client reference
 * taken by nbd_request_get().
 */
static void nbd_request_put(NBDRequestData *req)
{
    NBDClient *client = req->client;

    if (req->data) {
        qemu_vfree(req->data);
    }
    g_free(req);

    client->nb_requests--;
    nbd_client_receive_next_request(client);

    /* Last use of @client: this may free it. */
    nbd_client_put(client);
}

/*
 * AioContext-attached notifier for the export's BlockBackend: record the
 * new context, attach every client's channel to it and schedule any
 * recorded receive/send coroutine in it.
 */
static void blk_aio_attached(AioContext *ctx, void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    trace_nbd_blk_aio_attached(exp->name, ctx);

    exp->ctx = ctx;

    QTAILQ_FOREACH(client, &exp->clients, next) {
        qio_channel_attach_aio_context(client->ioc, ctx);
        if (client->recv_coroutine) {
            aio_co_schedule(ctx, client->recv_coroutine);
        }
        if (client->send_coroutine) {
            aio_co_schedule(ctx, client->send_coroutine);
        }
    }
}

/*
 * AioContext-detach notifier: detach every client's channel and clear the
 * export's recorded context.
 */
static void blk_aio_detach(void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    trace_nbd_blk_aio_detach(exp->name, exp->ctx);

    QTAILQ_FOREACH(client, &exp->clients, next) {
        qio_channel_detach_aio_context(client->ioc);
    }

    exp->ctx = NULL;
}

/* Notifier registered on on_eject_blk: close the owning export. */
static void nbd_eject_notifier(Notifier *n, void *data)
{
    NBDExport *exp = container_of(n, NBDExport, eject_notifier);
    nbd_export_close(exp);
}

/*
 * Create an export named @name for @bs and add it to the global export
 * list.
 *
 * @dev_offset/@size: window of the device that is exported; @size is
 *                    rounded down to a multiple of BDRV_SECTOR_SIZE
 * @desc:             description string, copied with g_strdup()
 * @bitmap:           if non-NULL, name of a dirty bitmap to export; it is
 *                    looked up on @bs and then down the backing chain
 * @readonly/@shared: select the advertised NBD flags
 * @close:            stored in exp->close, called when the export is freed
 * @writethrough:     disables the BlockBackend write cache when true
 * @on_eject_blk:     if non-NULL, a notifier on this BlockBackend closes
 *                    the export
 *
 * Returns the new export, or NULL with @errp set on failure.  On success
 * the export starts with two references: the initial one and one taken
 * just before returning.
 */
NBDExport *nbd_export_new(BlockDriverState *bs, uint64_t dev_offset,
                          uint64_t size, const char *name, const char *desc,
                          const char *bitmap, bool readonly, bool shared,
                          void (*close)(NBDExport *), bool writethrough,
                          BlockBackend *on_eject_blk, Error **errp)
{
    AioContext *ctx;
    BlockBackend *blk;
    NBDExport *exp = g_new0(NBDExport, 1);
    uint64_t perm;
    int ret;

    /*
     * NBD exports are used for non-shared storage migration.  Make sure
     * that BDRV_O_INACTIVE is cleared and the image is ready for write
     * access since the export could be available before migration handover.
     */
    assert(name);
    ctx = bdrv_get_aio_context(bs);
    aio_context_acquire(ctx);
    bdrv_invalidate_cache(bs, NULL);
    aio_context_release(ctx);

    /* Don't allow resize while the NBD server is running, otherwise we don't
     * care what happens with the node. */
    perm = BLK_PERM_CONSISTENT_READ;
    if (!readonly) {
        perm |= BLK_PERM_WRITE;
    }
    blk = blk_new(bdrv_get_aio_context(bs), perm,
                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
    ret = blk_insert_bs(blk, bs, errp);
    if (ret < 0) {
        goto fail;
    }
    blk_set_enable_write_cache(blk, !writethrough);
    blk_set_allow_aio_context_change(blk, true);

    exp->refcount = 1;
    QTAILQ_INIT(&exp->clients);
    exp->blk = blk;
    assert(dev_offset <= INT64_MAX);
    exp->dev_offset = dev_offset;
    exp->name = g_strdup(name);
    exp->description = g_strdup(desc);
    exp->nbdflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_FLUSH |
                     NBD_FLAG_SEND_FUA | NBD_FLAG_SEND_CACHE);
    if (readonly) {
        exp->nbdflags |= NBD_FLAG_READ_ONLY;
        if (shared) {
            exp->nbdflags |= NBD_FLAG_CAN_MULTI_CONN;
        }
    } else {
        exp->nbdflags |= NBD_FLAG_SEND_TRIM | NBD_FLAG_SEND_WRITE_ZEROES;
    }
    assert(size <= INT64_MAX - dev_offset);
    exp->size = QEMU_ALIGN_DOWN(size, BDRV_SECTOR_SIZE);

    if (bitmap) {
        BdrvDirtyBitmap *bm = NULL;

        /* Search @bs first, then each backing node in turn. */
        while (true) {
            bm = bdrv_find_dirty_bitmap(bs, bitmap);
            if (bm != NULL || bs->backing == NULL) {
                break;
            }

            bs = bs->backing->bs;
        }

        if (bm == NULL) {
            error_setg(errp, "Bitmap '%s' is not found", bitmap);
            goto fail;
        }

        if (bdrv_dirty_bitmap_check(bm, BDRV_BITMAP_ALLOW_RO, errp)) {
            goto fail;
        }

        /* Here @bs is the node on which the bitmap was found. */
        if (readonly && bdrv_is_writable(bs) &&
            bdrv_dirty_bitmap_enabled(bm)) {
            error_setg(errp,
                       "Enabled bitmap '%s' incompatible with readonly export",
                       bitmap);
            goto fail;
        }

        bdrv_dirty_bitmap_set_busy(bm, true);
        exp->export_bitmap = bm;
        exp->export_bitmap_context = g_strdup_printf("qemu:dirty-bitmap:%s",
                                                     bitmap);
    }

    exp->close = close;
    exp->ctx = blk_get_aio_context(blk);
    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);

    if (on_eject_blk) {
        blk_ref(on_eject_blk);
        exp->eject_notifier_blk = on_eject_blk;
        exp->eject_notifier.notify = nbd_eject_notifier;
        blk_add_remove_bs_notifier(on_eject_blk, &exp->eject_notifier);
    }
    QTAILQ_INSERT_TAIL(&exports, exp, next);
    nbd_export_get(exp);
    return exp;

fail:
    blk_unref(blk);
    g_free(exp->name);
    g_free(exp->description);
    g_free(exp);
    return NULL;
}

/*
 * Look up an export by exact name in the global export list.
 * Returns NULL if no export matches.
 */
NBDExport *nbd_export_find(const char *name)
{
    NBDExport *exp;
    QTAILQ_FOREACH(exp, &exports, next) {
        if (strcmp(name, exp->name) == 0) {
            return exp;
        }
    }

    return NULL;
}

/*
 * Close every client of @exp and, if the export is still named, remove it
 * from the global list and drop one reference.  A temporary reference is
 * held across the whole operation.
 */
void nbd_export_close(NBDExport *exp)
{
    NBDClient *client, *next;

    nbd_export_get(exp);
    /*
     * TODO: Should we expand QMP NbdServerRemoveNode enum to allow a
     * close mode that stops advertising the export to new clients but
     * still permits existing clients to run to completion? Because of
     * that possibility, nbd_export_close() can be called more than
     * once on an export.
     */
    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
        client_close(client, true);
    }
    /* A non-NULL name marks an export that is still in the global list. */
    if (exp->name) {
        nbd_export_put(exp);
        g_free(exp->name);
        exp->name = NULL;
        QTAILQ_REMOVE(&exports, exp, next);
    }
    g_free(exp->description);
    exp->description = NULL;
    nbd_export_put(exp);
}

/*
 * Remove @exp.  In hard mode, or when no client is connected, the export
 * is closed; otherwise (safe mode) @errp is set and nothing is changed.
 */
void nbd_export_remove(NBDExport *exp, NbdServerRemoveMode mode, Error **errp)
{
    if (mode == NBD_SERVER_REMOVE_MODE_HARD || QTAILQ_EMPTY(&exp->clients)) {
        nbd_export_close(exp);
        return;
    }

    assert(mode == NBD_SERVER_REMOVE_MODE_SAFE);

    error_setg(errp, "export '%s' still in use", exp->name);
    error_append_hint(errp, "Use mode='hard' to force client disconnect\n");
}

/* Take a reference on @exp; the export must already be referenced. */
void nbd_export_get(NBDExport *exp)
{
    assert(exp->refcount > 0);
    exp->refcount++;
}

/*
 * Drop a reference on @exp.  If this is the last one, the export is closed
 * first; when the count reaches zero the close callback runs, the
 * notifiers and BlockBackend are released, the exported bitmap is marked
 * not busy, and the export is freed.
 */
void nbd_export_put(NBDExport *exp)
{
    assert(exp->refcount > 0);
    if (exp->refcount == 1) {
        nbd_export_close(exp);
    }

    /* nbd_export_close() may theoretically reduce refcount to 0. It may happen
     * if someone calls nbd_export_put() on named export not through
     * nbd_export_set_name() when refcount is 1. So, let's assert that
     * it is > 0.
     */
    assert(exp->refcount > 0);
    if (--exp->refcount == 0) {
        assert(exp->name == NULL);
        assert(exp->description == NULL);

        if (exp->close) {
            exp->close(exp);
        }

        if (exp->blk) {
            if (exp->eject_notifier_blk) {
                notifier_remove(&exp->eject_notifier);
                blk_unref(exp->eject_notifier_blk);
            }
            blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
                                            blk_aio_detach, exp);
            blk_unref(exp->blk);
            exp->blk = NULL;
        }

        if (exp->export_bitmap) {
            bdrv_dirty_bitmap_set_busy(exp->export_bitmap, false);
            g_free(exp->export_bitmap_context);
        }

        g_free(exp);
    }
}

/* Return the BlockBackend behind @exp. */
BlockBackend *nbd_export_get_blockdev(NBDExport *exp)
{
    return exp->blk;
}

/* Close every export in the global list. */
void nbd_export_close_all(void)
{
    NBDExport *exp, *next;

    QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
        nbd_export_close(exp);
    }
}

/*
 * Write @niov buffers from @iov to the client while holding the client's
 * send lock.  The running coroutine is recorded in client->send_coroutine
 * for the duration of the write.
 * Returns 0 on success, -EIO on write failure.
 */
static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
                                        unsigned niov, Error **errp)
{
    int ret;

    g_assert(qemu_in_coroutine());
    qemu_co_mutex_lock(&client->send_lock);
    client->send_coroutine = qemu_coroutine_self();

    ret = qio_channel_writev_all(client->ioc, iov, niov, errp) < 0 ? -EIO : 0;

    client->send_coroutine = NULL;
    qemu_co_mutex_unlock(&client->send_lock);

    return ret;
}

/* Fill @reply with big-endian magic, @error and @handle. */
static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error,
                                       uint64_t handle)
{
    stl_be_p(&reply->magic, NBD_SIMPLE_REPLY_MAGIC);
    stl_be_p(&reply->error, error);
    stq_be_p(&reply->handle, handle);
}

/*
 * Send a simple reply for @handle.  @error is a system errno value (0 for
 * success) and is translated to its NBD equivalent.  The @len bytes at
 * @data are appended as payload when @len is non-zero.
 */
static int nbd_co_send_simple_reply(NBDClient *client,
                                    uint64_t handle,
                                    uint32_t error,
                                    void *data,
                                    size_t len,
                                    Error **errp)
{
    NBDSimpleReply reply;
    int nbd_err = system_errno_to_nbd_errno(error);
    struct iovec iov[] = {
        {.iov_base = &reply, .iov_len = sizeof(reply)},
        {.iov_base = data, .iov_len = len}
    };

    trace_nbd_co_send_simple_reply(handle, nbd_err, nbd_err_lookup(nbd_err),
                                   len);
    set_be_simple_reply(&reply, nbd_err, handle);

    return nbd_co_send_iov(client, iov, len ? 2 : 1, errp);
}

/* Fill the structured-reply chunk header @chunk with big-endian fields. */
static inline void set_be_chunk(NBDStructuredReplyChunk *chunk, uint16_t flags,
                                uint16_t type, uint64_t handle, uint32_t length)
{
    stl_be_p(&chunk->magic, NBD_STRUCTURED_REPLY_MAGIC);
    stw_be_p(&chunk->flags, flags);
    stw_be_p(&chunk->type, type);
    stq_be_p(&chunk->handle, handle);
    stl_be_p(&chunk->length, length);
}

/* Send an empty NBD_REPLY_TYPE_NONE chunk carrying NBD_REPLY_FLAG_DONE. */
static int coroutine_fn nbd_co_send_structured_done(NBDClient *client,
                                                    uint64_t handle,
                                                    Error **errp)
{
    NBDStructuredReplyChunk chunk;
    struct iovec iov[] = {
        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
    };

    trace_nbd_co_send_structured_done(handle);
    set_be_chunk(&chunk, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_NONE, handle, 0);

    return nbd_co_send_iov(client, iov, 1, errp);
}

/*
 * Send an NBD_REPLY_TYPE_OFFSET_DATA chunk holding @size bytes of @data
 * for @offset.  @size must be non-zero.  @final selects whether
 * NBD_REPLY_FLAG_DONE is set.
 */
static int coroutine_fn nbd_co_send_structured_read(NBDClient *client,
                                                    uint64_t handle,
                                                    uint64_t offset,
                                                    void *data,
                                                    size_t size,
                                                    bool final,
                                                    Error **errp)
{
    NBDStructuredReadData chunk;
    struct iovec iov[] = {
        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
        {.iov_base = data, .iov_len = size}
    };

    assert(size);
    trace_nbd_co_send_structured_read(handle, offset, data, size);
    /* The chunk length excludes the generic header chunk.h. */
    set_be_chunk(&chunk.h, final ? NBD_REPLY_FLAG_DONE : 0,
                 NBD_REPLY_TYPE_OFFSET_DATA, handle,
                 sizeof(chunk) - sizeof(chunk.h) + size);
    stq_be_p(&chunk.offset, offset);

    return nbd_co_send_iov(client, iov, 2, errp);
}

/*
 * Send a final NBD_REPLY_TYPE_ERROR chunk.  @error is a non-zero system
 * errno value, translated to its NBD equivalent; @msg is an optional
 * message (may be NULL) appended after the chunk.
 */
static int coroutine_fn nbd_co_send_structured_error(NBDClient *client,
                                                     uint64_t handle,
                                                     uint32_t error,
                                                     const char *msg,
                                                     Error **errp)
{
    NBDStructuredError chunk;
    int nbd_err = system_errno_to_nbd_errno(error);
    struct iovec iov[] = {
        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
        {.iov_base = (char *)msg, .iov_len = msg ? strlen(msg) : 0},
    };

    assert(nbd_err);
    trace_nbd_co_send_structured_error(handle, nbd_err,
                                       nbd_err_lookup(nbd_err), msg ? msg : "");
    set_be_chunk(&chunk.h, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_ERROR, handle,
                 sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len);
    stl_be_p(&chunk.error, nbd_err);
    stw_be_p(&chunk.message_length, iov[1].iov_len);

    /* The message iovec is only sent when it is non-empty. */
    return nbd_co_send_iov(client, iov, 1 + !!iov[1].iov_len, errp);
}

/* Do a sparse read and send the structured reply to the client.
 * Returns -errno if sending fails. bdrv_block_status_above() failure is
 * reported to the client, at which point this function succeeds.
1807 */ 1808 static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client, 1809 uint64_t handle, 1810 uint64_t offset, 1811 uint8_t *data, 1812 size_t size, 1813 Error **errp) 1814 { 1815 int ret = 0; 1816 NBDExport *exp = client->exp; 1817 size_t progress = 0; 1818 1819 while (progress < size) { 1820 int64_t pnum; 1821 int status = bdrv_block_status_above(blk_bs(exp->blk), NULL, 1822 offset + progress, 1823 size - progress, &pnum, NULL, 1824 NULL); 1825 bool final; 1826 1827 if (status < 0) { 1828 char *msg = g_strdup_printf("unable to check for holes: %s", 1829 strerror(-status)); 1830 1831 ret = nbd_co_send_structured_error(client, handle, -status, msg, 1832 errp); 1833 g_free(msg); 1834 return ret; 1835 } 1836 assert(pnum && pnum <= size - progress); 1837 final = progress + pnum == size; 1838 if (status & BDRV_BLOCK_ZERO) { 1839 NBDStructuredReadHole chunk; 1840 struct iovec iov[] = { 1841 {.iov_base = &chunk, .iov_len = sizeof(chunk)}, 1842 }; 1843 1844 trace_nbd_co_send_structured_read_hole(handle, offset + progress, 1845 pnum); 1846 set_be_chunk(&chunk.h, final ? NBD_REPLY_FLAG_DONE : 0, 1847 NBD_REPLY_TYPE_OFFSET_HOLE, 1848 handle, sizeof(chunk) - sizeof(chunk.h)); 1849 stq_be_p(&chunk.offset, offset + progress); 1850 stl_be_p(&chunk.length, pnum); 1851 ret = nbd_co_send_iov(client, iov, 1, errp); 1852 } else { 1853 ret = blk_pread(exp->blk, offset + progress + exp->dev_offset, 1854 data + progress, pnum); 1855 if (ret < 0) { 1856 error_setg_errno(errp, -ret, "reading from file failed"); 1857 break; 1858 } 1859 ret = nbd_co_send_structured_read(client, handle, offset + progress, 1860 data + progress, pnum, final, 1861 errp); 1862 } 1863 1864 if (ret < 0) { 1865 break; 1866 } 1867 progress += pnum; 1868 } 1869 return ret; 1870 } 1871 1872 /* 1873 * Populate @extents from block status. Update @bytes to be the actual 1874 * length encoded (which may be smaller than the original), and update 1875 * @nb_extents to the number of extents used. 
1876 * 1877 * Returns zero on success and -errno on bdrv_block_status_above failure. 1878 */ 1879 static int blockstatus_to_extents(BlockDriverState *bs, uint64_t offset, 1880 uint64_t *bytes, NBDExtent *extents, 1881 unsigned int *nb_extents) 1882 { 1883 uint64_t remaining_bytes = *bytes; 1884 NBDExtent *extent = extents, *extents_end = extents + *nb_extents; 1885 bool first_extent = true; 1886 1887 assert(*nb_extents); 1888 while (remaining_bytes) { 1889 uint32_t flags; 1890 int64_t num; 1891 int ret = bdrv_block_status_above(bs, NULL, offset, remaining_bytes, 1892 &num, NULL, NULL); 1893 1894 if (ret < 0) { 1895 return ret; 1896 } 1897 1898 flags = (ret & BDRV_BLOCK_ALLOCATED ? 0 : NBD_STATE_HOLE) | 1899 (ret & BDRV_BLOCK_ZERO ? NBD_STATE_ZERO : 0); 1900 1901 if (first_extent) { 1902 extent->flags = flags; 1903 extent->length = num; 1904 first_extent = false; 1905 } else if (flags == extent->flags) { 1906 /* extend current extent */ 1907 extent->length += num; 1908 } else { 1909 if (extent + 1 == extents_end) { 1910 break; 1911 } 1912 1913 /* start new extent */ 1914 extent++; 1915 extent->flags = flags; 1916 extent->length = num; 1917 } 1918 offset += num; 1919 remaining_bytes -= num; 1920 } 1921 1922 extents_end = extent + 1; 1923 1924 for (extent = extents; extent < extents_end; extent++) { 1925 extent->flags = cpu_to_be32(extent->flags); 1926 extent->length = cpu_to_be32(extent->length); 1927 } 1928 1929 *bytes -= remaining_bytes; 1930 *nb_extents = extents_end - extents; 1931 1932 return 0; 1933 } 1934 1935 /* nbd_co_send_extents 1936 * 1937 * @length is only for tracing purposes (and may be smaller or larger 1938 * than the client's original request). @last controls whether 1939 * NBD_REPLY_FLAG_DONE is sent. @extents should already be in 1940 * big-endian format. 
1941 */ 1942 static int nbd_co_send_extents(NBDClient *client, uint64_t handle, 1943 NBDExtent *extents, unsigned int nb_extents, 1944 uint64_t length, bool last, 1945 uint32_t context_id, Error **errp) 1946 { 1947 NBDStructuredMeta chunk; 1948 1949 struct iovec iov[] = { 1950 {.iov_base = &chunk, .iov_len = sizeof(chunk)}, 1951 {.iov_base = extents, .iov_len = nb_extents * sizeof(extents[0])} 1952 }; 1953 1954 trace_nbd_co_send_extents(handle, nb_extents, context_id, length, last); 1955 set_be_chunk(&chunk.h, last ? NBD_REPLY_FLAG_DONE : 0, 1956 NBD_REPLY_TYPE_BLOCK_STATUS, 1957 handle, sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len); 1958 stl_be_p(&chunk.context_id, context_id); 1959 1960 return nbd_co_send_iov(client, iov, 2, errp); 1961 } 1962 1963 /* Get block status from the exported device and send it to the client */ 1964 static int nbd_co_send_block_status(NBDClient *client, uint64_t handle, 1965 BlockDriverState *bs, uint64_t offset, 1966 uint32_t length, bool dont_fragment, 1967 bool last, uint32_t context_id, 1968 Error **errp) 1969 { 1970 int ret; 1971 unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS; 1972 NBDExtent *extents = g_new(NBDExtent, nb_extents); 1973 uint64_t final_length = length; 1974 1975 ret = blockstatus_to_extents(bs, offset, &final_length, extents, 1976 &nb_extents); 1977 if (ret < 0) { 1978 g_free(extents); 1979 return nbd_co_send_structured_error( 1980 client, handle, -ret, "can't get block status", errp); 1981 } 1982 1983 ret = nbd_co_send_extents(client, handle, extents, nb_extents, 1984 final_length, last, context_id, errp); 1985 1986 g_free(extents); 1987 1988 return ret; 1989 } 1990 1991 /* 1992 * Populate @extents from a dirty bitmap. Unless @dont_fragment, the 1993 * final extent may exceed the original @length. Store in @length the 1994 * byte length encoded (which may be smaller or larger than the 1995 * original), and return the number of extents used. 
 */
static unsigned int bitmap_to_extents(BdrvDirtyBitmap *bitmap, uint64_t offset,
                                      uint64_t *length, NBDExtent *extents,
                                      unsigned int nb_extents,
                                      bool dont_fragment)
{
    uint64_t begin = offset, end = offset;
    uint64_t overall_end = offset + *length;
    unsigned int i = 0;
    BdrvDirtyBitmapIter *it;
    bool dirty;

    bdrv_dirty_bitmap_lock(bitmap);

    it = bdrv_dirty_iter_new(bitmap);
    dirty = bdrv_dirty_bitmap_get_locked(bitmap, offset);

    assert(begin < overall_end && nb_extents);
    while (begin < overall_end && i < nb_extents) {
        /* By default the run after this one has the opposite state. */
        bool next_dirty = !dirty;

        /* Find where the current run (dirty or clean) ends. */
        if (dirty) {
            end = bdrv_dirty_bitmap_next_zero(bitmap, begin, UINT64_MAX);
        } else {
            bdrv_set_dirty_iter(it, begin);
            end = bdrv_dirty_iter_next(it);
        }
        if (end == -1 || end - begin > UINT32_MAX) {
            /* Cap to an aligned value < 4G beyond begin. */
            end = MIN(bdrv_dirty_bitmap_size(bitmap),
                      begin + UINT32_MAX + 1 -
                      bdrv_dirty_bitmap_granularity(bitmap));
            /* The run was cut short, so the next extent keeps the state. */
            next_dirty = dirty;
        }
        if (dont_fragment && end > overall_end) {
            end = overall_end;
        }

        /* Extents are stored directly in big-endian form. */
        extents[i].length = cpu_to_be32(end - begin);
        extents[i].flags = cpu_to_be32(dirty ? NBD_STATE_DIRTY : 0);
        i++;
        begin = end;
        dirty = next_dirty;
    }

    bdrv_dirty_iter_free(it);

    bdrv_dirty_bitmap_unlock(bitmap);

    assert(offset < end);
    *length = end - offset;
    return i;
}

/*
 * Describe @length bytes of @bitmap starting at @offset as extents and
 * send them to the client for @context_id.  With @dont_fragment a single
 * extent is produced.  @last controls whether this is the final chunk.
 */
static int nbd_co_send_bitmap(NBDClient *client, uint64_t handle,
                              BdrvDirtyBitmap *bitmap, uint64_t offset,
                              uint32_t length, bool dont_fragment, bool last,
                              uint32_t context_id, Error **errp)
{
    int ret;
    unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
    NBDExtent *extents = g_new(NBDExtent, nb_extents);
    uint64_t final_length = length;

    nb_extents = bitmap_to_extents(bitmap, offset, &final_length, extents,
                                   nb_extents, dont_fragment);

    ret = nbd_co_send_extents(client, handle, extents, nb_extents,
                              final_length, last, context_id, errp);

    g_free(extents);

    return ret;
}

/* nbd_co_receive_request
 * Collect a client request. Return 0 if request looks valid, -EIO to drop
 * connection right away, and any other negative value to report an error to
 * the client (although the caller may still need to disconnect after reporting
 * the error).
 */
static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
                                  Error **errp)
{
    NBDClient *client = req->client;
    int valid_flags;

    g_assert(qemu_in_coroutine());
    assert(client->recv_coroutine == qemu_coroutine_self());
    if (nbd_receive_request(client->ioc, request, errp) < 0) {
        return -EIO;
    }

    trace_nbd_co_receive_request_decode_type(request->handle, request->type,
                                             nbd_cmd_lookup(request->type));

    if (request->type != NBD_CMD_WRITE) {
        /* No payload, we are ready to read the next request.  */
        req->complete = true;
    }

    if (request->type == NBD_CMD_DISC) {
        /* Special case: we're going to disconnect without a reply,
         * whether or not flags, from, or len are bogus */
        return -EIO;
    }

    if (request->type == NBD_CMD_READ || request->type == NBD_CMD_WRITE ||
        request->type == NBD_CMD_CACHE)
    {
        /* NOTE(review): the message below prints "len (N )" with a stray
         * space before the closing parenthesis. */
        if (request->len > NBD_MAX_BUFFER_SIZE) {
            error_setg(errp, "len (%" PRIu32" ) is larger than max len (%u)",
                       request->len, NBD_MAX_BUFFER_SIZE);
            return -EINVAL;
        }

        /* NBD_CMD_CACHE carries no data, so it needs no buffer. */
        if (request->type != NBD_CMD_CACHE) {
            req->data = blk_try_blockalign(client->exp->blk, request->len);
            if (req->data == NULL) {
                error_setg(errp, "No memory");
                return -ENOMEM;
            }
        }
    }

    if (request->type == NBD_CMD_WRITE) {
        if (nbd_read(client->ioc, req->data, request->len, "CMD_WRITE data",
                     errp) < 0)
        {
            return -EIO;
        }
        /* The payload has been consumed; the request is now complete. */
        req->complete = true;

        trace_nbd_co_receive_request_payload_received(request->handle,
                                                      request->len);
    }

    /* Sanity checks. */
    if (client->exp->nbdflags & NBD_FLAG_READ_ONLY &&
        (request->type == NBD_CMD_WRITE ||
         request->type == NBD_CMD_WRITE_ZEROES ||
         request->type == NBD_CMD_TRIM)) {
        error_setg(errp, "Export is read-only");
        return -EROFS;
    }
    /* Written so that from + len cannot overflow before the comparison. */
    if (request->from > client->exp->size ||
        request->len > client->exp->size - request->from) {
        error_setg(errp, "operation past EOF; From: %" PRIu64 ", Len: %" PRIu32
                   ", Size: %" PRIu64, request->from, request->len,
                   client->exp->size);
        return (request->type == NBD_CMD_WRITE ||
                request->type == NBD_CMD_WRITE_ZEROES) ? -ENOSPC : -EINVAL;
    }
    if (client->check_align && !QEMU_IS_ALIGNED(request->from | request->len,
                                                client->check_align)) {
        /*
         * The block layer gracefully handles unaligned requests, but
         * it's still worth tracing client non-compliance
         */
        trace_nbd_co_receive_align_compliance(nbd_cmd_lookup(request->type),
                                              request->from,
                                              request->len,
                                              client->check_align);
    }
    /* FUA is accepted for every command; the rest depend on the type. */
    valid_flags = NBD_CMD_FLAG_FUA;
    if (request->type == NBD_CMD_READ && client->structured_reply) {
        valid_flags |= NBD_CMD_FLAG_DF;
    } else if (request->type == NBD_CMD_WRITE_ZEROES) {
        valid_flags |= NBD_CMD_FLAG_NO_HOLE;
    } else if (request->type == NBD_CMD_BLOCK_STATUS) {
        valid_flags |= NBD_CMD_FLAG_REQ_ONE;
    }
    if (request->flags & ~valid_flags) {
        error_setg(errp, "unsupported flags for command %s (got 0x%x)",
                   nbd_cmd_lookup(request->type), request->flags);
        return -EINVAL;
    }

    return 0;
}

/* Send simple reply without a payload, or a structured error
 * @error_msg is ignored if @ret >= 0
 * Returns 0 if connection is still live, -errno on failure to talk to client
 */
static coroutine_fn int nbd_send_generic_reply(NBDClient *client,
                                               uint64_t handle,
                                               int ret,
                                               const char *error_msg,
                                               Error **errp)
{
    /* Structured errors are only used once structured replies are on. */
    if (client->structured_reply && ret < 0) {
        return nbd_co_send_structured_error(client, handle, -ret, error_msg,
                                            errp);
    } else {
        return nbd_co_send_simple_reply(client, handle, ret < 0 ? -ret : 0,
                                        NULL, 0, errp);
    }
}

/* Handle NBD_CMD_READ request.
 * Return -errno if sending fails. Other errors are reported directly to the
 * client as an error reply.
*/
static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request,
                                        uint8_t *data, Error **errp)
{
    NBDExport *exp = client->exp;
    int rc;

    assert(request->type == NBD_CMD_READ);

    /* XXX: NBD Protocol only documents use of FUA with WRITE */
    if (request->flags & NBD_CMD_FLAG_FUA) {
        rc = blk_co_flush(exp->blk);
        if (rc < 0) {
            return nbd_send_generic_reply(client, request->handle, rc,
                                          "flush failed", errp);
        }
    }

    /*
     * A non-empty structured read that may be fragmented goes through the
     * sparse path, which does its own reading.
     */
    if (client->structured_reply && request->len &&
        !(request->flags & NBD_CMD_FLAG_DF))
    {
        return nbd_co_send_sparse_read(client, request->handle, request->from,
                                       data, request->len, errp);
    }

    rc = blk_pread(exp->blk, request->from + exp->dev_offset, data,
                   request->len);
    if (rc < 0) {
        return nbd_send_generic_reply(client, request->handle, rc,
                                      "reading from file failed", errp);
    }

    if (!client->structured_reply) {
        return nbd_co_send_simple_reply(client, request->handle, 0,
                                        data, request->len, errp);
    }

    if (!request->len) {
        /* Nothing to carry: an empty "done" chunk completes the reply. */
        return nbd_co_send_structured_done(client, request->handle, errp);
    }

    return nbd_co_send_structured_read(client, request->handle,
                                       request->from, data,
                                       request->len, true, errp);
}

/*
 * nbd_do_cmd_cache
 *
 * Handle NBD_CMD_CACHE request.
 * Return -errno if sending fails. Other errors are reported directly to the
 * client as an error reply.
2250 */ 2251 static coroutine_fn int nbd_do_cmd_cache(NBDClient *client, NBDRequest *request, 2252 Error **errp) 2253 { 2254 int ret; 2255 NBDExport *exp = client->exp; 2256 2257 assert(request->type == NBD_CMD_CACHE); 2258 2259 ret = blk_co_preadv(exp->blk, request->from + exp->dev_offset, request->len, 2260 NULL, BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH); 2261 2262 return nbd_send_generic_reply(client, request->handle, ret, 2263 "caching data failed", errp); 2264 } 2265 2266 /* Handle NBD request. 2267 * Return -errno if sending fails. Other errors are reported directly to the 2268 * client as an error reply. */ 2269 static coroutine_fn int nbd_handle_request(NBDClient *client, 2270 NBDRequest *request, 2271 uint8_t *data, Error **errp) 2272 { 2273 int ret; 2274 int flags; 2275 NBDExport *exp = client->exp; 2276 char *msg; 2277 2278 switch (request->type) { 2279 case NBD_CMD_CACHE: 2280 return nbd_do_cmd_cache(client, request, errp); 2281 2282 case NBD_CMD_READ: 2283 return nbd_do_cmd_read(client, request, data, errp); 2284 2285 case NBD_CMD_WRITE: 2286 flags = 0; 2287 if (request->flags & NBD_CMD_FLAG_FUA) { 2288 flags |= BDRV_REQ_FUA; 2289 } 2290 ret = blk_pwrite(exp->blk, request->from + exp->dev_offset, 2291 data, request->len, flags); 2292 return nbd_send_generic_reply(client, request->handle, ret, 2293 "writing to file failed", errp); 2294 2295 case NBD_CMD_WRITE_ZEROES: 2296 flags = 0; 2297 if (request->flags & NBD_CMD_FLAG_FUA) { 2298 flags |= BDRV_REQ_FUA; 2299 } 2300 if (!(request->flags & NBD_CMD_FLAG_NO_HOLE)) { 2301 flags |= BDRV_REQ_MAY_UNMAP; 2302 } 2303 ret = blk_pwrite_zeroes(exp->blk, request->from + exp->dev_offset, 2304 request->len, flags); 2305 return nbd_send_generic_reply(client, request->handle, ret, 2306 "writing to file failed", errp); 2307 2308 case NBD_CMD_DISC: 2309 /* unreachable, thanks to special case in nbd_co_receive_request() */ 2310 abort(); 2311 2312 case NBD_CMD_FLUSH: 2313 ret = blk_co_flush(exp->blk); 2314 return 
nbd_send_generic_reply(client, request->handle, ret, 2315 "flush failed", errp); 2316 2317 case NBD_CMD_TRIM: 2318 ret = blk_co_pdiscard(exp->blk, request->from + exp->dev_offset, 2319 request->len); 2320 if (ret == 0 && request->flags & NBD_CMD_FLAG_FUA) { 2321 ret = blk_co_flush(exp->blk); 2322 } 2323 return nbd_send_generic_reply(client, request->handle, ret, 2324 "discard failed", errp); 2325 2326 case NBD_CMD_BLOCK_STATUS: 2327 if (!request->len) { 2328 return nbd_send_generic_reply(client, request->handle, -EINVAL, 2329 "need non-zero length", errp); 2330 } 2331 if (client->export_meta.valid && 2332 (client->export_meta.base_allocation || 2333 client->export_meta.bitmap)) 2334 { 2335 bool dont_fragment = request->flags & NBD_CMD_FLAG_REQ_ONE; 2336 2337 if (client->export_meta.base_allocation) { 2338 ret = nbd_co_send_block_status(client, request->handle, 2339 blk_bs(exp->blk), request->from, 2340 request->len, dont_fragment, 2341 !client->export_meta.bitmap, 2342 NBD_META_ID_BASE_ALLOCATION, 2343 errp); 2344 if (ret < 0) { 2345 return ret; 2346 } 2347 } 2348 2349 if (client->export_meta.bitmap) { 2350 ret = nbd_co_send_bitmap(client, request->handle, 2351 client->exp->export_bitmap, 2352 request->from, request->len, 2353 dont_fragment, 2354 true, NBD_META_ID_DIRTY_BITMAP, errp); 2355 if (ret < 0) { 2356 return ret; 2357 } 2358 } 2359 2360 return ret; 2361 } else { 2362 return nbd_send_generic_reply(client, request->handle, -EINVAL, 2363 "CMD_BLOCK_STATUS not negotiated", 2364 errp); 2365 } 2366 2367 default: 2368 msg = g_strdup_printf("invalid request type (%" PRIu32 ") received", 2369 request->type); 2370 ret = nbd_send_generic_reply(client, request->handle, -EINVAL, msg, 2371 errp); 2372 g_free(msg); 2373 return ret; 2374 } 2375 } 2376 2377 /* Owns a reference to the NBDClient passed as opaque. 
*/ 2378 static coroutine_fn void nbd_trip(void *opaque) 2379 { 2380 NBDClient *client = opaque; 2381 NBDRequestData *req; 2382 NBDRequest request = { 0 }; /* GCC thinks it can be used uninitialized */ 2383 int ret; 2384 Error *local_err = NULL; 2385 2386 trace_nbd_trip(); 2387 if (client->closing) { 2388 nbd_client_put(client); 2389 return; 2390 } 2391 2392 req = nbd_request_get(client); 2393 ret = nbd_co_receive_request(req, &request, &local_err); 2394 client->recv_coroutine = NULL; 2395 2396 if (client->closing) { 2397 /* 2398 * The client may be closed when we are blocked in 2399 * nbd_co_receive_request() 2400 */ 2401 goto done; 2402 } 2403 2404 nbd_client_receive_next_request(client); 2405 if (ret == -EIO) { 2406 goto disconnect; 2407 } 2408 2409 if (ret < 0) { 2410 /* It wans't -EIO, so, according to nbd_co_receive_request() 2411 * semantics, we should return the error to the client. */ 2412 Error *export_err = local_err; 2413 2414 local_err = NULL; 2415 ret = nbd_send_generic_reply(client, request.handle, -EINVAL, 2416 error_get_pretty(export_err), &local_err); 2417 error_free(export_err); 2418 } else { 2419 ret = nbd_handle_request(client, &request, req->data, &local_err); 2420 } 2421 if (ret < 0) { 2422 error_prepend(&local_err, "Failed to send reply: "); 2423 goto disconnect; 2424 } 2425 2426 /* We must disconnect after NBD_CMD_WRITE if we did not 2427 * read the payload. 
2428 */ 2429 if (!req->complete) { 2430 error_setg(&local_err, "Request handling failed in intermediate state"); 2431 goto disconnect; 2432 } 2433 2434 done: 2435 nbd_request_put(req); 2436 nbd_client_put(client); 2437 return; 2438 2439 disconnect: 2440 if (local_err) { 2441 error_reportf_err(local_err, "Disconnect client, due to: "); 2442 } 2443 nbd_request_put(req); 2444 client_close(client, true); 2445 nbd_client_put(client); 2446 } 2447 2448 static void nbd_client_receive_next_request(NBDClient *client) 2449 { 2450 if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS) { 2451 nbd_client_get(client); 2452 client->recv_coroutine = qemu_coroutine_create(nbd_trip, client); 2453 aio_co_schedule(client->exp->ctx, client->recv_coroutine); 2454 } 2455 } 2456 2457 static coroutine_fn void nbd_co_client_start(void *opaque) 2458 { 2459 NBDClient *client = opaque; 2460 Error *local_err = NULL; 2461 2462 qemu_co_mutex_init(&client->send_lock); 2463 2464 if (nbd_negotiate(client, &local_err)) { 2465 if (local_err) { 2466 error_report_err(local_err); 2467 } 2468 client_close(client, false); 2469 return; 2470 } 2471 2472 nbd_client_receive_next_request(client); 2473 } 2474 2475 /* 2476 * Create a new client listener using the given channel @sioc. 2477 * Begin servicing it in a coroutine. When the connection closes, call 2478 * @close_fn with an indication of whether the client completed negotiation. 
2479 */ 2480 void nbd_client_new(QIOChannelSocket *sioc, 2481 QCryptoTLSCreds *tlscreds, 2482 const char *tlsauthz, 2483 void (*close_fn)(NBDClient *, bool)) 2484 { 2485 NBDClient *client; 2486 Coroutine *co; 2487 2488 client = g_new0(NBDClient, 1); 2489 client->refcount = 1; 2490 client->tlscreds = tlscreds; 2491 if (tlscreds) { 2492 object_ref(OBJECT(client->tlscreds)); 2493 } 2494 client->tlsauthz = g_strdup(tlsauthz); 2495 client->sioc = sioc; 2496 object_ref(OBJECT(client->sioc)); 2497 client->ioc = QIO_CHANNEL(sioc); 2498 object_ref(OBJECT(client->ioc)); 2499 client->close_fn = close_fn; 2500 2501 co = qemu_coroutine_create(nbd_co_client_start, client); 2502 qemu_coroutine_enter(co); 2503 } 2504