|
|
1.1 ! root 1: /* ! 2: * QEMU Block driver for RADOS (Ceph) ! 3: * ! 4: * Copyright (C) 2010 Christian Brunner <[email protected]> ! 5: * ! 6: * This work is licensed under the terms of the GNU GPL, version 2. See ! 7: * the COPYING file in the top-level directory. ! 8: * ! 9: */ ! 10: ! 11: #include "qemu-common.h" ! 12: #include "qemu-error.h" ! 13: ! 14: #include "rbd_types.h" ! 15: #include "block_int.h" ! 16: ! 17: #include <rados/librados.h> ! 18: ! 19: ! 20: ! 21: /* ! 22: * When specifying the image filename use: ! 23: * ! 24: * rbd:poolname/devicename ! 25: * ! 26: * poolname must be the name of an existing rados pool ! 27: * ! 28: * devicename is the basename for all objects used to ! 29: * emulate the raw device. ! 30: * ! 31: * Metadata information (image size, ...) is stored in an ! 32: * object with the name "devicename.rbd". ! 33: * ! 34: * The raw device is split into 4MB sized objects by default. ! 35: * The sequencenumber is encoded in a 12 byte long hex-string, ! 36: * and is attached to the devicename, separated by a dot. ! 37: * e.g. "devicename.1234567890ab" ! 38: * ! 39: */ ! 40: ! 41: #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER) ! 42: ! 43: typedef struct RBDAIOCB { ! 44: BlockDriverAIOCB common; ! 45: QEMUBH *bh; ! 46: int ret; ! 47: QEMUIOVector *qiov; ! 48: char *bounce; ! 49: int write; ! 50: int64_t sector_num; ! 51: int aiocnt; ! 52: int error; ! 53: struct BDRVRBDState *s; ! 54: int cancelled; ! 55: } RBDAIOCB; ! 56: ! 57: typedef struct RADOSCB { ! 58: int rcbid; ! 59: RBDAIOCB *acb; ! 60: struct BDRVRBDState *s; ! 61: int done; ! 62: int64_t segsize; ! 63: char *buf; ! 64: int ret; ! 65: } RADOSCB; ! 66: ! 67: #define RBD_FD_READ 0 ! 68: #define RBD_FD_WRITE 1 ! 69: ! 70: typedef struct BDRVRBDState { ! 71: int fds[2]; ! 72: rados_pool_t pool; ! 73: rados_pool_t header_pool; ! 74: char name[RBD_MAX_OBJ_NAME_SIZE]; ! 75: char block_name[RBD_MAX_BLOCK_NAME_SIZE]; ! 76: uint64_t size; ! 77: uint64_t objsize; ! 78: int qemu_aio_count; ! 79: int event_reader_pos; ! 80: RADOSCB *event_rcb; ! 81: } BDRVRBDState; ! 82: ! 83: typedef struct rbd_obj_header_ondisk RbdHeader1; ! 84: ! 85: static void rbd_aio_bh_cb(void *opaque); ! 86: ! 87: static int rbd_next_tok(char *dst, int dst_len, ! 88: char *src, char delim, ! 89: const char *name, ! 90: char **p) ! 91: { ! 92: int l; ! 93: char *end; ! 94: ! 95: *p = NULL; ! 96: ! 97: if (delim != '\0') { ! 98: end = strchr(src, delim); ! 99: if (end) { ! 100: *p = end + 1; ! 101: *end = '\0'; ! 102: } ! 103: } ! 104: l = strlen(src); ! 105: if (l >= dst_len) { ! 106: error_report("%s too long", name); ! 107: return -EINVAL; ! 108: } else if (l == 0) { ! 109: error_report("%s too short", name); ! 110: return -EINVAL; ! 111: } ! 112: ! 113: pstrcpy(dst, dst_len, src); ! 114: ! 115: return 0; ! 116: } ! 117: ! 118: static int rbd_parsename(const char *filename, ! 119: char *pool, int pool_len, ! 120: char *snap, int snap_len, ! 121: char *name, int name_len) ! 122: { ! 123: const char *start; ! 124: char *p, *buf; ! 125: int ret; ! 126: ! 127: if (!strstart(filename, "rbd:", &start)) { ! 128: return -EINVAL; ! 129: } ! 130: ! 131: buf = qemu_strdup(start); ! 132: p = buf; ! 133: ! 134: ret = rbd_next_tok(pool, pool_len, p, '/', "pool name", &p); ! 135: if (ret < 0 || !p) { ! 136: ret = -EINVAL; ! 137: goto done; ! 138: } ! 139: ret = rbd_next_tok(name, name_len, p, '@', "object name", &p); ! 140: if (ret < 0) { ! 141: goto done; ! 142: } ! 143: if (!p) { ! 144: *snap = '\0'; ! 145: goto done; ! 146: } ! 147: ! 148: ret = rbd_next_tok(snap, snap_len, p, '\0', "snap name", &p); ! 149: ! 150: done: ! 151: qemu_free(buf); ! 152: return ret; ! 153: } ! 154: ! 155: static int create_tmap_op(uint8_t op, const char *name, char **tmap_desc) ! 156: { ! 157: uint32_t len = strlen(name); ! 158: uint32_t len_le = cpu_to_le32(len); ! 159: /* total_len = encoding op + name + empty buffer */ ! 160: uint32_t total_len = 1 + (sizeof(uint32_t) + len) + sizeof(uint32_t); ! 161: uint8_t *desc = NULL; ! 162: ! 163: desc = qemu_malloc(total_len); ! 164: ! 165: *tmap_desc = (char *)desc; ! 166: ! 167: *desc = op; ! 168: desc++; ! 169: memcpy(desc, &len_le, sizeof(len_le)); ! 170: desc += sizeof(len_le); ! 171: memcpy(desc, name, len); ! 172: desc += len; ! 173: len = 0; /* no need for endian conversion for 0 */ ! 174: memcpy(desc, &len, sizeof(len)); ! 175: desc += sizeof(len); ! 176: ! 177: return (char *)desc - *tmap_desc; ! 178: } ! 179: ! 180: static void free_tmap_op(char *tmap_desc) ! 181: { ! 182: qemu_free(tmap_desc); ! 183: } ! 184: ! 185: static int rbd_register_image(rados_pool_t pool, const char *name) ! 186: { ! 187: char *tmap_desc; ! 188: const char *dir = RBD_DIRECTORY; ! 189: int ret; ! 190: ! 191: ret = create_tmap_op(CEPH_OSD_TMAP_SET, name, &tmap_desc); ! 192: if (ret < 0) { ! 193: return ret; ! 194: } ! 195: ! 196: ret = rados_tmap_update(pool, dir, tmap_desc, ret); ! 197: free_tmap_op(tmap_desc); ! 198: ! 199: return ret; ! 200: } ! 201: ! 202: static int touch_rbd_info(rados_pool_t pool, const char *info_oid) ! 203: { ! 204: int r = rados_write(pool, info_oid, 0, NULL, 0); ! 205: if (r < 0) { ! 206: return r; ! 207: } ! 208: return 0; ! 209: } ! 210: ! 211: static int rbd_assign_bid(rados_pool_t pool, uint64_t *id) ! 212: { ! 213: uint64_t out[1]; ! 214: const char *info_oid = RBD_INFO; ! 215: ! 216: *id = 0; ! 217: ! 218: int r = touch_rbd_info(pool, info_oid); ! 219: if (r < 0) { ! 220: return r; ! 221: } ! 222: ! 223: r = rados_exec(pool, info_oid, "rbd", "assign_bid", NULL, ! 224: 0, (char *)out, sizeof(out)); ! 225: if (r < 0) { ! 226: return r; ! 227: } ! 228: ! 229: le64_to_cpus(out); ! 230: *id = out[0]; ! 231: ! 232: return 0; ! 233: } ! 234: ! 235: static int rbd_create(const char *filename, QEMUOptionParameter *options) ! 236: { ! 237: int64_t bytes = 0; ! 238: int64_t objsize; ! 239: uint64_t size; ! 240: time_t mtime; ! 241: uint8_t obj_order = RBD_DEFAULT_OBJ_ORDER; ! 242: char pool[RBD_MAX_SEG_NAME_SIZE]; ! 243: char n[RBD_MAX_SEG_NAME_SIZE]; ! 244: char name[RBD_MAX_OBJ_NAME_SIZE]; ! 245: char snap_buf[RBD_MAX_SEG_NAME_SIZE]; ! 246: char *snap = NULL; ! 247: RbdHeader1 header; ! 248: rados_pool_t p; ! 249: uint64_t bid; ! 250: uint32_t hi, lo; ! 251: int ret; ! 252: ! 253: if (rbd_parsename(filename, ! 254: pool, sizeof(pool), ! 255: snap_buf, sizeof(snap_buf), ! 256: name, sizeof(name)) < 0) { ! 257: return -EINVAL; ! 258: } ! 259: if (snap_buf[0] != '\0') { ! 260: snap = snap_buf; ! 261: } ! 262: ! 263: snprintf(n, sizeof(n), "%s%s", name, RBD_SUFFIX); ! 264: ! 265: /* Read out options */ ! 266: while (options && options->name) { ! 267: if (!strcmp(options->name, BLOCK_OPT_SIZE)) { ! 268: bytes = options->value.n; ! 269: } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { ! 270: if (options->value.n) { ! 271: objsize = options->value.n; ! 272: if ((objsize - 1) & objsize) { /* not a power of 2? */ ! 273: error_report("obj size needs to be power of 2"); ! 274: return -EINVAL; ! 275: } ! 276: if (objsize < 4096) { ! 277: error_report("obj size too small"); ! 278: return -EINVAL; ! 279: } ! 280: obj_order = ffs(objsize) - 1; ! 281: } ! 282: } ! 283: options++; ! 284: } ! 285: ! 286: memset(&header, 0, sizeof(header)); ! 287: pstrcpy(header.text, sizeof(header.text), RBD_HEADER_TEXT); ! 288: pstrcpy(header.signature, sizeof(header.signature), RBD_HEADER_SIGNATURE); ! 289: pstrcpy(header.version, sizeof(header.version), RBD_HEADER_VERSION); ! 290: header.image_size = cpu_to_le64(bytes); ! 291: header.options.order = obj_order; ! 292: header.options.crypt_type = RBD_CRYPT_NONE; ! 293: header.options.comp_type = RBD_COMP_NONE; ! 294: header.snap_seq = 0; ! 295: header.snap_count = 0; ! 296: ! 297: if (rados_initialize(0, NULL) < 0) { ! 298: error_report("error initializing"); ! 299: return -EIO; ! 300: } ! 301: ! 302: if (rados_open_pool(pool, &p)) { ! 303: error_report("error opening pool %s", pool); ! 304: rados_deinitialize(); ! 305: return -EIO; ! 306: } ! 307: ! 308: /* check for existing rbd header file */ ! 309: ret = rados_stat(p, n, &size, &mtime); ! 310: if (ret == 0) { ! 311: ret=-EEXIST; ! 312: goto done; ! 313: } ! 314: ! 315: ret = rbd_assign_bid(p, &bid); ! 316: if (ret < 0) { ! 317: error_report("failed assigning block id"); ! 318: rados_deinitialize(); ! 319: return -EIO; ! 320: } ! 321: hi = bid >> 32; ! 322: lo = bid & 0xFFFFFFFF; ! 323: snprintf(header.block_name, sizeof(header.block_name), "rb.%x.%x", hi, lo); ! 324: ! 325: /* create header file */ ! 326: ret = rados_write(p, n, 0, (const char *)&header, sizeof(header)); ! 327: if (ret < 0) { ! 328: goto done; ! 329: } ! 330: ! 331: ret = rbd_register_image(p, name); ! 332: done: ! 333: rados_close_pool(p); ! 334: rados_deinitialize(); ! 335: ! 336: return ret; ! 337: } ! 338: ! 339: /* ! 340: * This aio completion is being called from rbd_aio_event_reader() and ! 341: * runs in qemu context. It schedules a bh, but just in case the aio ! 342: * was not cancelled before. ! 343: */ ! 344: static void rbd_complete_aio(RADOSCB *rcb) ! 345: { ! 346: RBDAIOCB *acb = rcb->acb; ! 347: int64_t r; ! 348: ! 349: acb->aiocnt--; ! 350: ! 351: if (acb->cancelled) { ! 352: if (!acb->aiocnt) { ! 353: qemu_vfree(acb->bounce); ! 354: qemu_aio_release(acb); ! 355: } ! 356: goto done; ! 357: } ! 358: ! 359: r = rcb->ret; ! 360: ! 361: if (acb->write) { ! 362: if (r < 0) { ! 363: acb->ret = r; ! 364: acb->error = 1; ! 365: } else if (!acb->error) { ! 366: acb->ret += rcb->segsize; ! 367: } ! 368: } else { ! 369: if (r == -ENOENT) { ! 370: memset(rcb->buf, 0, rcb->segsize); ! 371: if (!acb->error) { ! 372: acb->ret += rcb->segsize; ! 373: } ! 374: } else if (r < 0) { ! 375: memset(rcb->buf, 0, rcb->segsize); ! 376: acb->ret = r; ! 377: acb->error = 1; ! 378: } else if (r < rcb->segsize) { ! 379: memset(rcb->buf + r, 0, rcb->segsize - r); ! 380: if (!acb->error) { ! 381: acb->ret += rcb->segsize; ! 382: } ! 383: } else if (!acb->error) { ! 384: acb->ret += r; ! 385: } ! 386: } ! 387: /* Note that acb->bh can be NULL in case where the aio was cancelled */ ! 388: if (!acb->aiocnt) { ! 389: acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb); ! 390: qemu_bh_schedule(acb->bh); ! 391: } ! 392: done: ! 393: qemu_free(rcb); ! 394: } ! 395: ! 396: /* ! 397: * aio fd read handler. It runs in the qemu context and calls the ! 398: * completion handling of completed rados aio operations. ! 399: */ ! 400: static void rbd_aio_event_reader(void *opaque) ! 401: { ! 402: BDRVRBDState *s = opaque; ! 403: ! 404: ssize_t ret; ! 405: ! 406: do { ! 407: char *p = (char *)&s->event_rcb; ! 408: ! 409: /* now read the rcb pointer that was sent from a non qemu thread */ ! 410: if ((ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos, ! 411: sizeof(s->event_rcb) - s->event_reader_pos)) > 0) { ! 412: if (ret > 0) { ! 413: s->event_reader_pos += ret; ! 414: if (s->event_reader_pos == sizeof(s->event_rcb)) { ! 415: s->event_reader_pos = 0; ! 416: rbd_complete_aio(s->event_rcb); ! 417: s->qemu_aio_count --; ! 418: } ! 419: } ! 420: } ! 421: } while (ret < 0 && errno == EINTR); ! 422: } ! 423: ! 424: static int rbd_aio_flush_cb(void *opaque) ! 425: { ! 426: BDRVRBDState *s = opaque; ! 427: ! 428: return (s->qemu_aio_count > 0); ! 429: } ! 430: ! 431: ! 432: static int rbd_set_snapc(rados_pool_t pool, const char *snap, RbdHeader1 *header) ! 433: { ! 434: uint32_t snap_count = le32_to_cpu(header->snap_count); ! 435: rados_snap_t *snaps = NULL; ! 436: rados_snap_t seq; ! 437: uint32_t i; ! 438: uint64_t snap_names_len = le64_to_cpu(header->snap_names_len); ! 439: int r; ! 440: rados_snap_t snapid = 0; ! 441: ! 442: if (snap_count) { ! 443: const char *header_snap = (const char *)&header->snaps[snap_count]; ! 444: const char *end = header_snap + snap_names_len; ! 445: snaps = qemu_malloc(sizeof(rados_snap_t) * header->snap_count); ! 446: ! 447: for (i=0; i < snap_count; i++) { ! 448: snaps[i] = le64_to_cpu(header->snaps[i].id); ! 449: ! 450: if (snap && strcmp(snap, header_snap) == 0) { ! 451: snapid = snaps[i]; ! 452: } ! 453: ! 454: header_snap += strlen(header_snap) + 1; ! 455: if (header_snap > end) { ! 456: error_report("bad header, snapshot list broken"); ! 457: } ! 458: } ! 459: } ! 460: ! 461: if (snap && !snapid) { ! 462: error_report("snapshot not found"); ! 463: qemu_free(snaps); ! 464: return -ENOENT; ! 465: } ! 466: seq = le32_to_cpu(header->snap_seq); ! 467: ! 468: r = rados_set_snap_context(pool, seq, snaps, snap_count); ! 469: ! 470: rados_set_snap(pool, snapid); ! 471: ! 472: qemu_free(snaps); ! 473: ! 474: return r; ! 475: } ! 476: ! 477: #define BUF_READ_START_LEN 4096 ! 478: ! 479: static int rbd_read_header(BDRVRBDState *s, char **hbuf) ! 480: { ! 481: char *buf = NULL; ! 482: char n[RBD_MAX_SEG_NAME_SIZE]; ! 483: uint64_t len = BUF_READ_START_LEN; ! 484: int r; ! 485: ! 486: snprintf(n, sizeof(n), "%s%s", s->name, RBD_SUFFIX); ! 487: ! 488: buf = qemu_malloc(len); ! 489: ! 490: r = rados_read(s->header_pool, n, 0, buf, len); ! 491: if (r < 0) { ! 492: goto failed; ! 493: } ! 494: ! 495: if (r < len) { ! 496: goto done; ! 497: } ! 498: ! 499: qemu_free(buf); ! 500: buf = qemu_malloc(len); ! 501: ! 502: r = rados_stat(s->header_pool, n, &len, NULL); ! 503: if (r < 0) { ! 504: goto failed; ! 505: } ! 506: ! 507: r = rados_read(s->header_pool, n, 0, buf, len); ! 508: if (r < 0) { ! 509: goto failed; ! 510: } ! 511: ! 512: done: ! 513: *hbuf = buf; ! 514: return 0; ! 515: ! 516: failed: ! 517: qemu_free(buf); ! 518: return r; ! 519: } ! 520: ! 521: static int rbd_open(BlockDriverState *bs, const char *filename, int flags) ! 522: { ! 523: BDRVRBDState *s = bs->opaque; ! 524: RbdHeader1 *header; ! 525: char pool[RBD_MAX_SEG_NAME_SIZE]; ! 526: char snap_buf[RBD_MAX_SEG_NAME_SIZE]; ! 527: char *snap = NULL; ! 528: char *hbuf = NULL; ! 529: int r; ! 530: ! 531: if (rbd_parsename(filename, pool, sizeof(pool), ! 532: snap_buf, sizeof(snap_buf), ! 533: s->name, sizeof(s->name)) < 0) { ! 534: return -EINVAL; ! 535: } ! 536: if (snap_buf[0] != '\0') { ! 537: snap = snap_buf; ! 538: } ! 539: ! 540: if ((r = rados_initialize(0, NULL)) < 0) { ! 541: error_report("error initializing"); ! 542: return r; ! 543: } ! 544: ! 545: if ((r = rados_open_pool(pool, &s->pool))) { ! 546: error_report("error opening pool %s", pool); ! 547: rados_deinitialize(); ! 548: return r; ! 549: } ! 550: ! 551: if ((r = rados_open_pool(pool, &s->header_pool))) { ! 552: error_report("error opening pool %s", pool); ! 553: rados_deinitialize(); ! 554: return r; ! 555: } ! 556: ! 557: if ((r = rbd_read_header(s, &hbuf)) < 0) { ! 558: error_report("error reading header from %s", s->name); ! 559: goto failed; ! 560: } ! 561: ! 562: if (memcmp(hbuf + 64, RBD_HEADER_SIGNATURE, 4)) { ! 563: error_report("Invalid header signature"); ! 564: r = -EMEDIUMTYPE; ! 565: goto failed; ! 566: } ! 567: ! 568: if (memcmp(hbuf + 68, RBD_HEADER_VERSION, 8)) { ! 569: error_report("Unknown image version"); ! 570: r = -EMEDIUMTYPE; ! 571: goto failed; ! 572: } ! 573: ! 574: header = (RbdHeader1 *) hbuf; ! 575: s->size = le64_to_cpu(header->image_size); ! 576: s->objsize = 1ULL << header->options.order; ! 577: memcpy(s->block_name, header->block_name, sizeof(header->block_name)); ! 578: ! 579: r = rbd_set_snapc(s->pool, snap, header); ! 580: if (r < 0) { ! 581: error_report("failed setting snap context: %s", strerror(-r)); ! 582: goto failed; ! 583: } ! 584: ! 585: bs->read_only = (snap != NULL); ! 586: ! 587: s->event_reader_pos = 0; ! 588: r = qemu_pipe(s->fds); ! 589: if (r < 0) { ! 590: error_report("error opening eventfd"); ! 591: goto failed; ! 592: } ! 593: fcntl(s->fds[0], F_SETFL, O_NONBLOCK); ! 594: fcntl(s->fds[1], F_SETFL, O_NONBLOCK); ! 595: qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], rbd_aio_event_reader, NULL, ! 596: rbd_aio_flush_cb, NULL, s); ! 597: ! 598: qemu_free(hbuf); ! 599: ! 600: return 0; ! 601: ! 602: failed: ! 603: qemu_free(hbuf); ! 604: ! 605: rados_close_pool(s->header_pool); ! 606: rados_close_pool(s->pool); ! 607: rados_deinitialize(); ! 608: return r; ! 609: } ! 610: ! 611: static void rbd_close(BlockDriverState *bs) ! 612: { ! 613: BDRVRBDState *s = bs->opaque; ! 614: ! 615: close(s->fds[0]); ! 616: close(s->fds[1]); ! 617: qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL , NULL, NULL, NULL, ! 618: NULL); ! 619: ! 620: rados_close_pool(s->header_pool); ! 621: rados_close_pool(s->pool); ! 622: rados_deinitialize(); ! 623: } ! 624: ! 625: /* ! 626: * Cancel aio. Since we don't reference acb in a non qemu threads, ! 627: * it is safe to access it here. ! 628: */ ! 629: static void rbd_aio_cancel(BlockDriverAIOCB *blockacb) ! 630: { ! 631: RBDAIOCB *acb = (RBDAIOCB *) blockacb; ! 632: acb->cancelled = 1; ! 633: } ! 634: ! 635: static AIOPool rbd_aio_pool = { ! 636: .aiocb_size = sizeof(RBDAIOCB), ! 637: .cancel = rbd_aio_cancel, ! 638: }; ! 639: ! 640: /* ! 641: * This is the callback function for rados_aio_read and _write ! 642: * ! 643: * Note: this function is being called from a non qemu thread so ! 644: * we need to be careful about what we do here. Generally we only ! 645: * write to the block notification pipe, and do the rest of the ! 646: * io completion handling from rbd_aio_event_reader() which ! 647: * runs in a qemu context. ! 648: */ ! 649: static void rbd_finish_aiocb(rados_completion_t c, RADOSCB *rcb) ! 650: { ! 651: int ret; ! 652: rcb->ret = rados_aio_get_return_value(c); ! 653: rados_aio_release(c); ! 654: while (1) { ! 655: fd_set wfd; ! 656: int fd = rcb->s->fds[RBD_FD_WRITE]; ! 657: ! 658: /* send the rcb pointer to the qemu thread that is responsible ! 659: for the aio completion. Must do it in a qemu thread context */ ! 660: ret = write(fd, (void *)&rcb, sizeof(rcb)); ! 661: if (ret >= 0) { ! 662: break; ! 663: } ! 664: if (errno == EINTR) { ! 665: continue; ! 666: } ! 667: if (errno != EAGAIN) { ! 668: break; ! 669: } ! 670: ! 671: FD_ZERO(&wfd); ! 672: FD_SET(fd, &wfd); ! 673: do { ! 674: ret = select(fd + 1, NULL, &wfd, NULL, NULL); ! 675: } while (ret < 0 && errno == EINTR); ! 676: } ! 677: ! 678: if (ret < 0) { ! 679: error_report("failed writing to acb->s->fds\n"); ! 680: qemu_free(rcb); ! 681: } ! 682: } ! 683: ! 684: /* Callback when all queued rados_aio requests are complete */ ! 685: ! 686: static void rbd_aio_bh_cb(void *opaque) ! 687: { ! 688: RBDAIOCB *acb = opaque; ! 689: ! 690: if (!acb->write) { ! 691: qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size); ! 692: } ! 693: qemu_vfree(acb->bounce); ! 694: acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); ! 695: qemu_bh_delete(acb->bh); ! 696: acb->bh = NULL; ! 697: ! 698: qemu_aio_release(acb); ! 699: } ! 700: ! 701: static BlockDriverAIOCB *rbd_aio_rw_vector(BlockDriverState *bs, ! 702: int64_t sector_num, ! 703: QEMUIOVector *qiov, ! 704: int nb_sectors, ! 705: BlockDriverCompletionFunc *cb, ! 706: void *opaque, int write) ! 707: { ! 708: RBDAIOCB *acb; ! 709: RADOSCB *rcb; ! 710: rados_completion_t c; ! 711: char n[RBD_MAX_SEG_NAME_SIZE]; ! 712: int64_t segnr, segoffs, segsize, last_segnr; ! 713: int64_t off, size; ! 714: char *buf; ! 715: ! 716: BDRVRBDState *s = bs->opaque; ! 717: ! 718: acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque); ! 719: acb->write = write; ! 720: acb->qiov = qiov; ! 721: acb->bounce = qemu_blockalign(bs, qiov->size); ! 722: acb->aiocnt = 0; ! 723: acb->ret = 0; ! 724: acb->error = 0; ! 725: acb->s = s; ! 726: acb->cancelled = 0; ! 727: acb->bh = NULL; ! 728: ! 729: if (write) { ! 730: qemu_iovec_to_buffer(acb->qiov, acb->bounce); ! 731: } ! 732: ! 733: buf = acb->bounce; ! 734: ! 735: off = sector_num * BDRV_SECTOR_SIZE; ! 736: size = nb_sectors * BDRV_SECTOR_SIZE; ! 737: segnr = off / s->objsize; ! 738: segoffs = off % s->objsize; ! 739: segsize = s->objsize - segoffs; ! 740: ! 741: last_segnr = ((off + size - 1) / s->objsize); ! 742: acb->aiocnt = (last_segnr - segnr) + 1; ! 743: ! 744: s->qemu_aio_count += acb->aiocnt; /* All the RADOSCB */ ! 745: ! 746: while (size > 0) { ! 747: if (size < segsize) { ! 748: segsize = size; ! 749: } ! 750: ! 751: snprintf(n, sizeof(n), "%s.%012" PRIx64, s->block_name, ! 752: segnr); ! 753: ! 754: rcb = qemu_malloc(sizeof(RADOSCB)); ! 755: rcb->done = 0; ! 756: rcb->acb = acb; ! 757: rcb->segsize = segsize; ! 758: rcb->buf = buf; ! 759: rcb->s = acb->s; ! 760: ! 761: if (write) { ! 762: rados_aio_create_completion(rcb, NULL, ! 763: (rados_callback_t) rbd_finish_aiocb, ! 764: &c); ! 765: rados_aio_write(s->pool, n, segoffs, buf, segsize, c); ! 766: } else { ! 767: rados_aio_create_completion(rcb, ! 768: (rados_callback_t) rbd_finish_aiocb, ! 769: NULL, &c); ! 770: rados_aio_read(s->pool, n, segoffs, buf, segsize, c); ! 771: } ! 772: ! 773: buf += segsize; ! 774: size -= segsize; ! 775: segoffs = 0; ! 776: segsize = s->objsize; ! 777: segnr++; ! 778: } ! 779: ! 780: return &acb->common; ! 781: } ! 782: ! 783: static BlockDriverAIOCB *rbd_aio_readv(BlockDriverState * bs, ! 784: int64_t sector_num, QEMUIOVector * qiov, ! 785: int nb_sectors, ! 786: BlockDriverCompletionFunc * cb, ! 787: void *opaque) ! 788: { ! 789: return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); ! 790: } ! 791: ! 792: static BlockDriverAIOCB *rbd_aio_writev(BlockDriverState * bs, ! 793: int64_t sector_num, QEMUIOVector * qiov, ! 794: int nb_sectors, ! 795: BlockDriverCompletionFunc * cb, ! 796: void *opaque) ! 797: { ! 798: return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); ! 799: } ! 800: ! 801: static int rbd_getinfo(BlockDriverState * bs, BlockDriverInfo * bdi) ! 802: { ! 803: BDRVRBDState *s = bs->opaque; ! 804: bdi->cluster_size = s->objsize; ! 805: return 0; ! 806: } ! 807: ! 808: static int64_t rbd_getlength(BlockDriverState * bs) ! 809: { ! 810: BDRVRBDState *s = bs->opaque; ! 811: ! 812: return s->size; ! 813: } ! 814: ! 815: static int rbd_snap_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) ! 816: { ! 817: BDRVRBDState *s = bs->opaque; ! 818: char inbuf[512], outbuf[128]; ! 819: uint64_t snap_id; ! 820: int r; ! 821: char *p = inbuf; ! 822: char *end = inbuf + sizeof(inbuf); ! 823: char n[RBD_MAX_SEG_NAME_SIZE]; ! 824: char *hbuf = NULL; ! 825: RbdHeader1 *header; ! 826: ! 827: if (sn_info->name[0] == '\0') { ! 828: return -EINVAL; /* we need a name for rbd snapshots */ ! 829: } ! 830: ! 831: /* ! 832: * rbd snapshots are using the name as the user controlled unique identifier ! 833: * we can't use the rbd snapid for that purpose, as it can't be set ! 834: */ ! 835: if (sn_info->id_str[0] != '\0' && ! 836: strcmp(sn_info->id_str, sn_info->name) != 0) { ! 837: return -EINVAL; ! 838: } ! 839: ! 840: if (strlen(sn_info->name) >= sizeof(sn_info->id_str)) { ! 841: return -ERANGE; ! 842: } ! 843: ! 844: r = rados_selfmanaged_snap_create(s->header_pool, &snap_id); ! 845: if (r < 0) { ! 846: error_report("failed to create snap id: %s", strerror(-r)); ! 847: return r; ! 848: } ! 849: ! 850: *(uint32_t *)p = strlen(sn_info->name); ! 851: cpu_to_le32s((uint32_t *)p); ! 852: p += sizeof(uint32_t); ! 853: strncpy(p, sn_info->name, end - p); ! 854: p += strlen(p); ! 855: if (p + sizeof(snap_id) > end) { ! 856: error_report("invalid input parameter"); ! 857: return -EINVAL; ! 858: } ! 859: ! 860: *(uint64_t *)p = snap_id; ! 861: cpu_to_le64s((uint64_t *)p); ! 862: ! 863: snprintf(n, sizeof(n), "%s%s", s->name, RBD_SUFFIX); ! 864: ! 865: r = rados_exec(s->header_pool, n, "rbd", "snap_add", inbuf, ! 866: sizeof(inbuf), outbuf, sizeof(outbuf)); ! 867: if (r < 0) { ! 868: error_report("rbd.snap_add execution failed failed: %s", strerror(-r)); ! 869: return r; ! 870: } ! 871: ! 872: sprintf(sn_info->id_str, "%s", sn_info->name); ! 873: ! 874: r = rbd_read_header(s, &hbuf); ! 875: if (r < 0) { ! 876: error_report("failed reading header: %s", strerror(-r)); ! 877: return r; ! 878: } ! 879: ! 880: header = (RbdHeader1 *) hbuf; ! 881: r = rbd_set_snapc(s->pool, sn_info->name, header); ! 882: if (r < 0) { ! 883: error_report("failed setting snap context: %s", strerror(-r)); ! 884: goto failed; ! 885: } ! 886: ! 887: return 0; ! 888: ! 889: failed: ! 890: qemu_free(header); ! 891: return r; ! 892: } ! 893: ! 894: static int decode32(char **p, const char *end, uint32_t *v) ! 895: { ! 896: if (*p + 4 > end) { ! 897: return -ERANGE; ! 898: } ! 899: ! 900: *v = *(uint32_t *)(*p); ! 901: le32_to_cpus(v); ! 902: *p += 4; ! 903: return 0; ! 904: } ! 905: ! 906: static int decode64(char **p, const char *end, uint64_t *v) ! 907: { ! 908: if (*p + 8 > end) { ! 909: return -ERANGE; ! 910: } ! 911: ! 912: *v = *(uint64_t *)(*p); ! 913: le64_to_cpus(v); ! 914: *p += 8; ! 915: return 0; ! 916: } ! 917: ! 918: static int decode_str(char **p, const char *end, char **s) ! 919: { ! 920: uint32_t len; ! 921: int r; ! 922: ! 923: if ((r = decode32(p, end, &len)) < 0) { ! 924: return r; ! 925: } ! 926: ! 927: *s = qemu_malloc(len + 1); ! 928: memcpy(*s, *p, len); ! 929: *p += len; ! 930: (*s)[len] = '\0'; ! 931: ! 932: return len; ! 933: } ! 934: ! 935: static int rbd_snap_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) ! 936: { ! 937: BDRVRBDState *s = bs->opaque; ! 938: char n[RBD_MAX_SEG_NAME_SIZE]; ! 939: QEMUSnapshotInfo *sn_info, *sn_tab = NULL; ! 940: RbdHeader1 *header; ! 941: char *hbuf = NULL; ! 942: char *outbuf = NULL, *end, *buf; ! 943: uint64_t len; ! 944: uint64_t snap_seq; ! 945: uint32_t snap_count; ! 946: int r, i; ! 947: ! 948: /* read header to estimate how much space we need to read the snap ! 949: * list */ ! 950: if ((r = rbd_read_header(s, &hbuf)) < 0) { ! 951: goto done_err; ! 952: } ! 953: header = (RbdHeader1 *)hbuf; ! 954: len = le64_to_cpu(header->snap_names_len); ! 955: len += 1024; /* should have already been enough, but new snapshots might ! 956: already been created since we read the header. just allocate ! 957: a bit more, so that in most cases it'll suffice anyway */ ! 958: qemu_free(hbuf); ! 959: ! 960: snprintf(n, sizeof(n), "%s%s", s->name, RBD_SUFFIX); ! 961: while (1) { ! 962: qemu_free(outbuf); ! 963: outbuf = qemu_malloc(len); ! 964: ! 965: r = rados_exec(s->header_pool, n, "rbd", "snap_list", NULL, 0, ! 966: outbuf, len); ! 967: if (r < 0) { ! 968: error_report("rbd.snap_list execution failed failed: %s", strerror(-r)); ! 969: goto done_err; ! 970: } ! 971: if (r != len) { ! 972: break; ! 973: } ! 974: ! 975: /* if we're here, we probably raced with some snaps creation */ ! 976: len *= 2; ! 977: } ! 978: buf = outbuf; ! 979: end = buf + len; ! 980: ! 981: if ((r = decode64(&buf, end, &snap_seq)) < 0) { ! 982: goto done_err; ! 983: } ! 984: if ((r = decode32(&buf, end, &snap_count)) < 0) { ! 985: goto done_err; ! 986: } ! 987: ! 988: sn_tab = qemu_mallocz(snap_count * sizeof(QEMUSnapshotInfo)); ! 989: for (i = 0; i < snap_count; i++) { ! 990: uint64_t id, image_size; ! 991: char *snap_name; ! 992: ! 993: if ((r = decode64(&buf, end, &id)) < 0) { ! 994: goto done_err; ! 995: } ! 996: if ((r = decode64(&buf, end, &image_size)) < 0) { ! 997: goto done_err; ! 998: } ! 999: if ((r = decode_str(&buf, end, &snap_name)) < 0) { ! 1000: goto done_err; ! 1001: } ! 1002: ! 1003: sn_info = sn_tab + i; ! 1004: pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), snap_name); ! 1005: pstrcpy(sn_info->name, sizeof(sn_info->name), snap_name); ! 1006: qemu_free(snap_name); ! 1007: ! 1008: sn_info->vm_state_size = image_size; ! 1009: sn_info->date_sec = 0; ! 1010: sn_info->date_nsec = 0; ! 1011: sn_info->vm_clock_nsec = 0; ! 1012: } ! 1013: *psn_tab = sn_tab; ! 1014: qemu_free(outbuf); ! 1015: return snap_count; ! 1016: done_err: ! 1017: qemu_free(sn_tab); ! 1018: qemu_free(outbuf); ! 1019: return r; ! 1020: } ! 1021: ! 1022: static QEMUOptionParameter rbd_create_options[] = { ! 1023: { ! 1024: .name = BLOCK_OPT_SIZE, ! 1025: .type = OPT_SIZE, ! 1026: .help = "Virtual disk size" ! 1027: }, ! 1028: { ! 1029: .name = BLOCK_OPT_CLUSTER_SIZE, ! 1030: .type = OPT_SIZE, ! 1031: .help = "RBD object size" ! 1032: }, ! 1033: {NULL} ! 1034: }; ! 1035: ! 1036: static BlockDriver bdrv_rbd = { ! 1037: .format_name = "rbd", ! 1038: .instance_size = sizeof(BDRVRBDState), ! 1039: .bdrv_file_open = rbd_open, ! 1040: .bdrv_close = rbd_close, ! 1041: .bdrv_create = rbd_create, ! 1042: .bdrv_get_info = rbd_getinfo, ! 1043: .create_options = rbd_create_options, ! 1044: .bdrv_getlength = rbd_getlength, ! 1045: .protocol_name = "rbd", ! 1046: ! 1047: .bdrv_aio_readv = rbd_aio_readv, ! 1048: .bdrv_aio_writev = rbd_aio_writev, ! 1049: ! 1050: .bdrv_snapshot_create = rbd_snap_create, ! 1051: .bdrv_snapshot_list = rbd_snap_list, ! 1052: }; ! 1053: ! 1054: static void bdrv_rbd_init(void) ! 1055: { ! 1056: bdrv_register(&bdrv_rbd); ! 1057: } ! 1058: ! 1059: block_init(bdrv_rbd_init);
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.