|
|
1.1 root 1: /*
2: * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3: *
4: * This program is free software; you can redistribute it and/or
5: * modify it under the terms of the GNU General Public License version
6: * 2 as published by the Free Software Foundation.
7: *
8: * You should have received a copy of the GNU General Public License
9: * along with this program. If not, see <http://www.gnu.org/licenses/>.
10: */
11:
12: #include "qemu-common.h"
13: #include "qemu-error.h"
14: #include "qemu_socket.h"
15: #include "block_int.h"
1.1.1.3 root 16: #include "bitops.h"
1.1 root 17:
18: #define SD_PROTO_VER 0x01
19:
20: #define SD_DEFAULT_ADDR "localhost"
21: #define SD_DEFAULT_PORT "7000"
22:
23: #define SD_OP_CREATE_AND_WRITE_OBJ 0x01
24: #define SD_OP_READ_OBJ 0x02
25: #define SD_OP_WRITE_OBJ 0x03
26:
27: #define SD_OP_NEW_VDI 0x11
28: #define SD_OP_LOCK_VDI 0x12
29: #define SD_OP_RELEASE_VDI 0x13
30: #define SD_OP_GET_VDI_INFO 0x14
31: #define SD_OP_READ_VDIS 0x15
32:
33: #define SD_FLAG_CMD_WRITE 0x01
34: #define SD_FLAG_CMD_COW 0x02
35:
36: #define SD_RES_SUCCESS 0x00 /* Success */
37: #define SD_RES_UNKNOWN 0x01 /* Unknown error */
38: #define SD_RES_NO_OBJ 0x02 /* No object found */
39: #define SD_RES_EIO 0x03 /* I/O error */
40: #define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */
41: #define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
42: #define SD_RES_SYSTEM_ERROR 0x06 /* System error */
43: #define SD_RES_VDI_LOCKED 0x07 /* Vdi is locked */
44: #define SD_RES_NO_VDI 0x08 /* No vdi found */
45: #define SD_RES_NO_BASE_VDI 0x09 /* No base vdi found */
46: #define SD_RES_VDI_READ 0x0A /* Cannot read requested vdi */
47: #define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */
48: #define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
49: #define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base vdi */
50: #define SD_RES_NO_TAG 0x0E /* Requested tag is not found */
51: #define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */
52: #define SD_RES_VDI_NOT_LOCKED 0x10 /* Vdi is not locked */
53: #define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */
54: #define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */
55: #define SD_RES_FULL_VDI 0x13 /* we already have the maximum vdis */
56: #define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */
57: #define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */
58: #define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation */
59: #define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */
60: #define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */
61:
62: /*
63: * Object ID rules
64: *
65: * 0 - 19 (20 bits): data object space
66: * 20 - 31 (12 bits): reserved data object space
67: * 32 - 55 (24 bits): vdi object space
68: * 56 - 59 ( 4 bits): reserved vdi object space
1.1.1.4 ! root 69: * 60 - 63 ( 4 bits): object type identifier space
1.1 root 70: */
71:
72: #define VDI_SPACE_SHIFT 32
73: #define VDI_BIT (UINT64_C(1) << 63)
74: #define VMSTATE_BIT (UINT64_C(1) << 62)
75: #define MAX_DATA_OBJS (UINT64_C(1) << 20)
76: #define MAX_CHILDREN 1024
77: #define SD_MAX_VDI_LEN 256
78: #define SD_MAX_VDI_TAG_LEN 256
79: #define SD_NR_VDIS (1U << 24)
80: #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
81: #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
82: #define SECTOR_SIZE 512
83:
84: #define SD_INODE_SIZE (sizeof(SheepdogInode))
85: #define CURRENT_VDI_ID 0
86:
/*
 * Generic request header.  The opcode specific request structures
 * (SheepdogObjReq, SheepdogVdiReq) are cast to this type when they are
 * handed to do_req().
 */
typedef struct SheepdogReq {
    uint8_t proto_ver;              /* protocol version, e.g. SD_PROTO_VER */
    uint8_t opcode;                 /* one of the SD_OP_* values */
    uint16_t flags;                 /* SD_FLAG_CMD_* bits */
    uint32_t epoch;
    uint32_t id;                    /* request id */
    uint32_t data_length;           /* length of the data that accompanies the header */
    uint32_t opcode_specific[8];    /* room for the per-opcode fields */
} SheepdogReq;
96:
/*
 * Generic response header.  It starts with the same fields as SheepdogReq
 * and adds the result code.
 */
typedef struct SheepdogRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;                /* one of the SD_RES_* values */
    uint32_t opcode_specific[7];    /* room for the per-opcode fields */
} SheepdogRsp;
107:
/*
 * Request header for object operations (SD_OP_READ_OBJ, SD_OP_WRITE_OBJ,
 * SD_OP_CREATE_AND_WRITE_OBJ).
 */
typedef struct SheepdogObjReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;            /* add_aio_request() stores the AIOReq id here */
    uint32_t data_length;
    uint64_t oid;           /* id of the object the request operates on */
    uint64_t cow_oid;       /* add_aio_request() fills this from AIOReq.base_oid */
    uint32_t copies;
    uint32_t rsvd;
    uint64_t offset;        /* offset passed by the caller for this object */
} SheepdogObjReq;
121:
/*
 * Response header for object operations.  aio_read_response() reads one of
 * these per reply and uses `id' to find the matching AIOReq.
 */
typedef struct SheepdogObjRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;   /* for reads: number of data bytes that follow */
    uint32_t result;        /* one of the SD_RES_* values */
    uint32_t copies;
    uint32_t pad[6];
} SheepdogObjRsp;
133:
/*
 * Request header for vdi operations; find_vdi_name() uses it with
 * SD_OP_LOCK_VDI and SD_OP_GET_VDI_INFO.
 */
typedef struct SheepdogVdiReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t vdi_size;
    uint32_t base_vdi_id;
    uint32_t copies;
    uint32_t snapid;        /* snapshot id; find_vdi_name() passes its snapid argument */
    uint32_t pad[3];
} SheepdogVdiReq;
147:
/*
 * Response header for vdi operations.  find_vdi_name() reads `result' and,
 * on success, `vdi_id' from it.
 */
typedef struct SheepdogVdiRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;        /* one of the SD_RES_* values */
    uint32_t rsvd;
    uint32_t vdi_id;        /* id of the vdi the request resolved to */
    uint32_t pad[5];
} SheepdogVdiRsp;
160:
/*
 * Inode of a vdi.  BDRVSheepdogState embeds one copy of it and
 * SD_INODE_SIZE is defined as the size of this structure.
 */
typedef struct SheepdogInode {
    char name[SD_MAX_VDI_LEN];
    char tag[SD_MAX_VDI_TAG_LEN];
    uint64_t ctime;
    uint64_t snap_ctime;            /* non-zero for a snapshot, see is_snapshot() */
    uint64_t vm_clock_nsec;
    uint64_t vdi_size;
    uint64_t vm_state_size;
    uint16_t copy_policy;
    uint8_t nr_copies;              /* sent as `copies' in object requests */
    uint8_t block_size_shift;
    uint32_t snap_id;
    uint32_t vdi_id;
    uint32_t parent_vdi_id;
    uint32_t child_vdi_id[MAX_CHILDREN];
    /*
     * One vdi id per data object index.  An entry equal to vdi_id marks the
     * object as writable, see is_data_obj_writable().
     */
    uint32_t data_vdi_id[MAX_DATA_OBJS];
} SheepdogInode;
178:
179: /*
180: * 64 bit FNV-1a non-zero initial basis
181: */
182: #define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
183:
/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
 *
 * Folds the `len' bytes starting at `buf' into the running hash `hval'
 * and returns the updated hash value.
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    const unsigned char *bytes = buf;
    size_t pos;

    for (pos = 0; pos < len; pos++) {
        /* xor the bottom with the current octet */
        hval ^= (uint64_t) bytes[pos];
        /* multiply by the 64 bit FNV prime: 2^40 + 2^8 + 0xb3 */
        hval *= UINT64_C(0x100000001b3);
    }
    return hval;
}
198:
1.1.1.3 root 199: static inline int is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
1.1 root 200: {
201: return inode->vdi_id == inode->data_vdi_id[idx];
202: }
203:
204: static inline int is_data_obj(uint64_t oid)
205: {
206: return !(VDI_BIT & oid);
207: }
208:
209: static inline uint64_t data_oid_to_idx(uint64_t oid)
210: {
211: return oid & (MAX_DATA_OBJS - 1);
212: }
213:
214: static inline uint64_t vid_to_vdi_oid(uint32_t vid)
215: {
216: return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
217: }
218:
219: static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
220: {
221: return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
222: }
223:
224: static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
225: {
226: return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
227: }
228:
229: static inline int is_snapshot(struct SheepdogInode *inode)
230: {
231: return !!inode->snap_ctime;
232: }
233:
234: #undef dprintf
235: #ifdef DEBUG_SDOG
236: #define dprintf(fmt, args...) \
237: do { \
238: fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
239: } while (0)
240: #else
241: #define dprintf(fmt, args...)
242: #endif
243:
244: typedef struct SheepdogAIOCB SheepdogAIOCB;
245:
/*
 * One request to a single object.  A SheepdogAIOCB owns a list of these;
 * each one is also linked into the per-device list of outstanding requests.
 */
typedef struct AIOReq {
    SheepdogAIOCB *aiocb;       /* AIOCB this request belongs to */
    unsigned int iov_offset;    /* offset into the AIOCB's qiov used for the data */

    uint64_t oid;               /* object the request operates on */
    uint64_t base_oid;          /* sent as cow_oid in the request header */
    uint64_t offset;            /* sent as offset in the request header */
    unsigned int data_len;      /* sent as data_length in the request header */
    uint8_t flags;              /* extra SD_FLAG_CMD_* bits for the header */
    uint32_t id;                /* taken from aioreq_seq_num; matched against rsp.id */

    QLIST_ENTRY(AIOReq) outstanding_aio_siblings;   /* link in BDRVSheepdogState */
    QLIST_ENTRY(AIOReq) aioreq_siblings;            /* link in SheepdogAIOCB */
} AIOReq;
260:
/* Kind of I/O an AIOCB performs; selects the opcode in add_aio_request(). */
enum AIOCBState {
    AIOCB_WRITE_UDATA,      /* write request */
    AIOCB_READ_UDATA,       /* read request (sent as SD_OP_READ_OBJ) */
};
265:
/*
 * State of one guest I/O request.  It is split into one AIOReq per object,
 * all of which are linked into aioreq_head.
 */
struct SheepdogAIOCB {
    BlockDriverAIOCB common;

    QEMUIOVector *qiov;         /* caller's scatter/gather buffer */

    int64_t sector_num;
    int nb_sectors;

    int ret;                    /* 0, or -EIO once a request failed or was canceled */
    enum AIOCBState aiocb_type;

    Coroutine *coroutine;       /* coroutine that set up the AIOCB; re-entered on completion */
    void (*aio_done_func)(SheepdogAIOCB *);     /* called when the last AIOReq is freed */

    int canceled;               /* set by sd_aio_cancel() */

    QLIST_HEAD(aioreq_head, AIOReq) aioreq_head;
};
284:
/* Per-device state of the sheepdog block driver. */
typedef struct BDRVSheepdogState {
    SheepdogInode inode;

    /* range of data_vdi_id[] entries changed by aio_read_response() */
    uint32_t min_dirty_data_idx;
    uint32_t max_dirty_data_idx;

    char name[SD_MAX_VDI_LEN];
    int is_snapshot;

    char *addr;     /* server host, NULL for the default (see parse_vdiname) */
    char *port;     /* server port, NULL for the default */
    int fd;         /* connection used for the asynchronous object I/O */

    CoMutex lock;           /* held while add_aio_request() sends on fd */
    Coroutine *co_send;     /* coroutine entered by co_write_request() */
    Coroutine *co_recv;     /* coroutine running aio_read_response(), or NULL */

    uint32_t aioreq_seq_num;    /* source of AIOReq ids */
    QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
} BDRVSheepdogState;
305:
306: static const char * sd_strerror(int err)
307: {
308: int i;
309:
310: static const struct {
311: int err;
312: const char *desc;
313: } errors[] = {
314: {SD_RES_SUCCESS, "Success"},
315: {SD_RES_UNKNOWN, "Unknown error"},
316: {SD_RES_NO_OBJ, "No object found"},
317: {SD_RES_EIO, "I/O error"},
318: {SD_RES_VDI_EXIST, "VDI exists already"},
319: {SD_RES_INVALID_PARMS, "Invalid parameters"},
320: {SD_RES_SYSTEM_ERROR, "System error"},
321: {SD_RES_VDI_LOCKED, "VDI is already locked"},
322: {SD_RES_NO_VDI, "No vdi found"},
323: {SD_RES_NO_BASE_VDI, "No base VDI found"},
324: {SD_RES_VDI_READ, "Failed read the requested VDI"},
325: {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
326: {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
327: {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
328: {SD_RES_NO_TAG, "Failed to find the requested tag"},
329: {SD_RES_STARTUP, "The system is still booting"},
330: {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
331: {SD_RES_SHUTDOWN, "The system is shutting down"},
332: {SD_RES_NO_MEM, "Out of memory on the server"},
333: {SD_RES_FULL_VDI, "We already have the maximum vdis"},
334: {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
335: {SD_RES_NO_SPACE, "Server has no space for new objects"},
336: {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
337: {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
338: {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
339: };
340:
341: for (i = 0; i < ARRAY_SIZE(errors); ++i) {
342: if (errors[i].err == err) {
343: return errors[i].desc;
344: }
345: }
346:
347: return "Invalid error code";
348: }
349:
350: /*
351: * Sheepdog I/O handling:
352: *
1.1.1.4 ! root 353: * 1. In sd_co_rw_vector, we send the I/O requests to the server and
! 354: * link the requests to the outstanding_list in the
! 355: * BDRVSheepdogState. The function exits without waiting for
! 356: * receiving the response.
1.1 root 357: *
1.1.1.4 ! root 358: * 2. We receive the response in aio_read_response, the fd handler to
1.1 root 359: * the sheepdog connection. If metadata update is needed, we send
360: * the write request to the vdi object in sd_write_done, the write
1.1.1.4 ! root 361: * completion function. We switch back to sd_co_readv/writev after
! 362: * all the requests belonging to the AIOCB are finished.
1.1 root 363: */
364:
365: static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
366: uint64_t oid, unsigned int data_len,
367: uint64_t offset, uint8_t flags,
368: uint64_t base_oid, unsigned int iov_offset)
369: {
370: AIOReq *aio_req;
371:
1.1.1.4 ! root 372: aio_req = g_malloc(sizeof(*aio_req));
1.1 root 373: aio_req->aiocb = acb;
374: aio_req->iov_offset = iov_offset;
375: aio_req->oid = oid;
376: aio_req->base_oid = base_oid;
377: aio_req->offset = offset;
378: aio_req->data_len = data_len;
379: aio_req->flags = flags;
380: aio_req->id = s->aioreq_seq_num++;
381:
382: QLIST_INSERT_HEAD(&s->outstanding_aio_head, aio_req,
383: outstanding_aio_siblings);
384: QLIST_INSERT_HEAD(&acb->aioreq_head, aio_req, aioreq_siblings);
385:
386: return aio_req;
387: }
388:
389: static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
390: {
391: SheepdogAIOCB *acb = aio_req->aiocb;
392: QLIST_REMOVE(aio_req, outstanding_aio_siblings);
393: QLIST_REMOVE(aio_req, aioreq_siblings);
1.1.1.4 ! root 394: g_free(aio_req);
1.1 root 395:
396: return !QLIST_EMPTY(&acb->aioreq_head);
397: }
398:
1.1.1.4 ! root 399: static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
1.1 root 400: {
401: if (!acb->canceled) {
1.1.1.4 ! root 402: qemu_coroutine_enter(acb->coroutine, NULL);
1.1 root 403: }
404: qemu_aio_release(acb);
405: }
406:
/*
 * Cancel callback of sd_aio_pool.
 *
 * Marks the request as failed, re-enters the coroutine that issued it and
 * then flags the AIOCB as canceled so that sd_finish_aiocb() will not enter
 * the coroutine a second time.
 */
static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
{
    SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;

    /*
     * Sheepdog cannot cancel the requests which are already sent to
     * the servers, so we just complete the request with -EIO here.
     */
    acb->ret = -EIO;
    qemu_coroutine_enter(acb->coroutine, NULL);
    acb->canceled = 1;
}
419:
/* AIOCB pool that sd_aio_setup() takes SheepdogAIOCB objects from. */
static AIOPool sd_aio_pool = {
    .aiocb_size = sizeof(SheepdogAIOCB),
    .cancel = sd_aio_cancel,
};
424:
425: static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
426: int64_t sector_num, int nb_sectors,
427: BlockDriverCompletionFunc *cb, void *opaque)
428: {
429: SheepdogAIOCB *acb;
430:
431: acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque);
432:
433: acb->qiov = qiov;
434:
435: acb->sector_num = sector_num;
436: acb->nb_sectors = nb_sectors;
437:
438: acb->aio_done_func = NULL;
439: acb->canceled = 0;
1.1.1.4 ! root 440: acb->coroutine = qemu_coroutine_self();
1.1 root 441: acb->ret = 0;
442: QLIST_INIT(&acb->aioreq_head);
443: return acb;
444: }
445:
446: #ifdef _WIN32
447:
448: struct msghdr {
449: struct iovec *msg_iov;
450: size_t msg_iovlen;
451: };
452:
453: static ssize_t sendmsg(int s, const struct msghdr *msg, int flags)
454: {
455: size_t size = 0;
456: char *buf, *p;
457: int i, ret;
458:
459: /* count the msg size */
460: for (i = 0; i < msg->msg_iovlen; i++) {
461: size += msg->msg_iov[i].iov_len;
462: }
1.1.1.4 ! root 463: buf = g_malloc(size);
1.1 root 464:
465: p = buf;
466: for (i = 0; i < msg->msg_iovlen; i++) {
467: memcpy(p, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len);
468: p += msg->msg_iov[i].iov_len;
469: }
470:
471: ret = send(s, buf, size, flags);
472:
1.1.1.4 ! root 473: g_free(buf);
1.1 root 474: return ret;
475: }
476:
477: static ssize_t recvmsg(int s, struct msghdr *msg, int flags)
478: {
479: size_t size = 0;
480: char *buf, *p;
481: int i, ret;
482:
483: /* count the msg size */
484: for (i = 0; i < msg->msg_iovlen; i++) {
485: size += msg->msg_iov[i].iov_len;
486: }
1.1.1.4 ! root 487: buf = g_malloc(size);
1.1 root 488:
1.1.1.3 root 489: ret = qemu_recv(s, buf, size, flags);
1.1 root 490: if (ret < 0) {
491: goto out;
492: }
493:
494: p = buf;
495: for (i = 0; i < msg->msg_iovlen; i++) {
496: memcpy(msg->msg_iov[i].iov_base, p, msg->msg_iov[i].iov_len);
497: p += msg->msg_iov[i].iov_len;
498: }
499: out:
1.1.1.4 ! root 500: g_free(buf);
1.1 root 501: return ret;
502: }
503:
504: #endif
505:
/*
 * Send/recv data with iovec buffers
 *
 * This function sends/receives data from/to the iovec buffer directly.
 * The first `offset' bytes in the iovec buffer are skipped and the next
 * `len' bytes are used.
 *
 * For example,
 *
 *   do_send_recv(sockfd, iov, len, offset, 1);
 *
 * is equivalent to
 *
 *   char *buf = malloc(len);
 *   iov_to_buf(iov, iovcnt, buf, offset, len);
 *   send(sockfd, buf, len, 0);
 *   free(buf);
 *
 * The iovec array is modified temporarily to describe the requested window
 * and is restored before the function returns.
 *
 * Returns the result of sendmsg()/recvmsg(): the number of bytes
 * transferred, which may be less than `len', or a negative value on error.
 */
static int do_send_recv(int sockfd, struct iovec *iov, int len, int offset,
                        int write)
{
    struct msghdr msg;
    int ret, diff;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;

    /* from here on `len' is the end of the window, counted from iov[0] */
    len += offset;

    /* advance `iov' to the element that contains the end of the window */
    while (iov->iov_len < len) {
        len -= iov->iov_len;

        iov++;
        msg.msg_iovlen++;
    }

    /* cut the last element so that it ends where the window ends */
    diff = iov->iov_len - len;
    iov->iov_len -= diff;

    /* drop the leading elements that lie completely before `offset' */
    while (msg.msg_iov->iov_len <= offset) {
        offset -= msg.msg_iov->iov_len;

        msg.msg_iov++;
        msg.msg_iovlen--;
    }

    /* skip the rest of `offset' inside the first element that is used */
    msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base + offset;
    msg.msg_iov->iov_len -= offset;

    if (write) {
        ret = sendmsg(sockfd, &msg, 0);
    } else {
        ret = recvmsg(sockfd, &msg, 0);
    }

    /* undo the adjustment of the first element ... */
    msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base - offset;
    msg.msg_iov->iov_len += offset;

    /* ... and of the last one */
    iov->iov_len += diff;
    return ret;
}
568:
/*
 * Open a TCP connection to a sheepdog server.
 *
 * @addr: host name or address; when NULL, SD_DEFAULT_ADDR and
 *        SD_DEFAULT_PORT are used and @port is ignored
 * @port: service name or port number
 *
 * Every address returned by getaddrinfo() is tried in turn until one
 * connection succeeds.
 *
 * Returns the connected socket, or -1 on failure.  After a failed
 * connect() errno still holds the error of the last attempt.
 */
static int connect_to_sdog(const char *addr, const char *port)
{
    char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
    int fd, ret, saved_errno;
    struct addrinfo hints, *res, *res0;

    if (!addr) {
        addr = SD_DEFAULT_ADDR;
        port = SD_DEFAULT_PORT;
    }

    memset(&hints, 0, sizeof(hints));
    hints.ai_socktype = SOCK_STREAM;

    ret = getaddrinfo(addr, port, &hints, &res0);
    if (ret) {
        /* getaddrinfo() reports errors through its return value, not errno */
        error_report("unable to get address info %s, %s",
                     addr, gai_strerror(ret));
        return -1;
    }

    for (res = res0; res; res = res->ai_next) {
        ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
                          sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
        if (ret) {
            continue;
        }

        fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
        if (fd < 0) {
            continue;
        }

        do {
            ret = connect(fd, res->ai_addr, res->ai_addrlen);
        } while (ret < 0 && errno == EINTR);

        if (ret < 0) {
            /*
             * Do not leak the socket, and give the remaining addresses a
             * chance.  Keep errno of the failed connect() for the caller.
             */
            saved_errno = errno;
            closesocket(fd);
            errno = saved_errno;
            continue;
        }

        dprintf("connected to %s:%s\n", addr, port);
        goto success;
    }
    fd = -1;
    error_report("failed connect to %s:%s", addr, port);
success:
    freeaddrinfo(res0);
    return fd;
}
620:
/*
 * Transfer exactly `len' bytes between the socket and the iovec buffer,
 * starting `iov_offset' bytes into the buffer.
 *
 * Short transfers are continued and EINTR is retried.  On EAGAIN the
 * function yields when it runs inside a coroutine and then retries.
 * Nothing is transferred when `len' is 0.
 *
 * @write: non-zero to send, zero to receive
 *
 * Returns 0 on success and 1 on failure.  When the peer closes the
 * connection before `len' bytes were received, errno is set to EIO.
 */
static int do_readv_writev(int sockfd, struct iovec *iov, int len,
                           int iov_offset, int write)
{
    int ret;

    while (len > 0) {
        ret = do_send_recv(sockfd, iov, len, iov_offset, write);
        if (ret < 0) {
            if (errno == EINTR) {
                continue;
            }
            if (errno == EAGAIN) {
                if (qemu_in_coroutine()) {
                    qemu_coroutine_yield();
                }
                continue;
            }
            if (write) {
                error_report("failed to send a req, %s", strerror(errno));
            } else {
                error_report("failed to recv a rsp, %s", strerror(errno));
            }
            return 1;
        }

        if (ret == 0 && !write) {
            /*
             * A receive of 0 bytes means the peer closed the connection.
             * Retrying would spin forever, so fail instead.
             */
            errno = EIO;
            error_report("failed to recv a rsp, connection closed by peer");
            return 1;
        }

        iov_offset += ret;
        len -= ret;
    }

    return 0;
}
649:
/*
 * Receive `len' bytes into the iovec buffer, starting `iov_offset' bytes
 * into it.  Returns the result of do_readv_writev().
 */
static int do_readv(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 0;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
654:
/*
 * Send `len' bytes from the iovec buffer, starting `iov_offset' bytes into
 * it.  Returns the result of do_readv_writev().
 */
static int do_writev(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 1;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
659:
/*
 * Send (`write' != 0) or receive (`write' == 0) `len' bytes using the flat
 * buffer `buf'.  Returns the result of do_readv_writev().
 */
static int do_read_write(int sockfd, void *buf, int len, int write)
{
    /* wrap the flat buffer in a single-element iovec */
    struct iovec single = {
        .iov_base = buf,
        .iov_len = len,
    };

    return do_readv_writev(sockfd, &single, len, 0, write);
}
669:
/* Receive `len' bytes into `buf'.  Returns the result of do_read_write(). */
static int do_read(int sockfd, void *buf, int len)
{
    const int is_write = 0;

    return do_read_write(sockfd, buf, len, is_write);
}
674:
/* Send `len' bytes from `buf'.  Returns the result of do_read_write(). */
static int do_write(int sockfd, void *buf, int len)
{
    const int is_write = 1;

    return do_read_write(sockfd, buf, len, is_write);
}
679:
680: static int send_req(int sockfd, SheepdogReq *hdr, void *data,
681: unsigned int *wlen)
682: {
683: int ret;
684: struct iovec iov[2];
685:
686: iov[0].iov_base = hdr;
687: iov[0].iov_len = sizeof(*hdr);
688:
689: if (*wlen) {
690: iov[1].iov_base = data;
691: iov[1].iov_len = *wlen;
692: }
693:
694: ret = do_writev(sockfd, iov, sizeof(*hdr) + *wlen, 0);
695: if (ret) {
1.1.1.3 root 696: error_report("failed to send a req, %s", strerror(errno));
1.1 root 697: ret = -1;
698: }
699:
700: return ret;
701: }
702:
703: static int do_req(int sockfd, SheepdogReq *hdr, void *data,
704: unsigned int *wlen, unsigned int *rlen)
705: {
706: int ret;
707:
708: ret = send_req(sockfd, hdr, data, wlen);
709: if (ret) {
710: ret = -1;
711: goto out;
712: }
713:
714: ret = do_read(sockfd, hdr, sizeof(*hdr));
715: if (ret) {
1.1.1.3 root 716: error_report("failed to get a rsp, %s", strerror(errno));
1.1 root 717: ret = -1;
718: goto out;
719: }
720:
721: if (*rlen > hdr->data_length) {
722: *rlen = hdr->data_length;
723: }
724:
725: if (*rlen) {
726: ret = do_read(sockfd, data, *rlen);
727: if (ret) {
1.1.1.3 root 728: error_report("failed to get the data, %s", strerror(errno));
1.1 root 729: ret = -1;
730: goto out;
731: }
732: }
733: ret = 0;
734: out:
735: return ret;
736: }
737:
1.1.1.4 ! root 738: static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
1.1 root 739: struct iovec *iov, int niov, int create,
740: enum AIOCBState aiocb_type);
741:
742: /*
743: * This function searchs pending requests to the object `oid', and
744: * sends them.
745: */
1.1.1.4 ! root 746: static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid, uint32_t id)
1.1 root 747: {
748: AIOReq *aio_req, *next;
749: SheepdogAIOCB *acb;
750: int ret;
751:
752: QLIST_FOREACH_SAFE(aio_req, &s->outstanding_aio_head,
753: outstanding_aio_siblings, next) {
754: if (id == aio_req->id) {
755: continue;
756: }
757: if (aio_req->oid != oid) {
758: continue;
759: }
760:
761: acb = aio_req->aiocb;
762: ret = add_aio_request(s, aio_req, acb->qiov->iov,
763: acb->qiov->niov, 0, acb->aiocb_type);
764: if (ret < 0) {
1.1.1.3 root 765: error_report("add_aio_request is failed");
1.1 root 766: free_aio_req(s, aio_req);
767: if (QLIST_EMPTY(&acb->aioreq_head)) {
768: sd_finish_aiocb(acb);
769: }
770: }
771: }
772: }
773:
/*
 * Receive responses of the I/O requests.
 *
 * This function is the body of the s->co_recv coroutine.  The coroutine is
 * created and entered by co_read_response(), the read handler registered
 * for s->fd.
 *
 * One response is processed per invocation: the header is read, the
 * matching AIOReq is looked up by id, the data of a read request is
 * received, and the AIOReq is freed.  When it was the last request of its
 * AIOCB, the AIOCB's aio_done_func is called.
 */
static void coroutine_fn aio_read_response(void *opaque)
{
    SheepdogObjRsp rsp;
    BDRVSheepdogState *s = opaque;
    int fd = s->fd;
    int ret;
    AIOReq *aio_req = NULL;
    SheepdogAIOCB *acb;
    int rest;
    unsigned long idx;

    if (QLIST_EMPTY(&s->outstanding_aio_head)) {
        goto out;
    }

    /* read a header */
    ret = do_read(fd, &rsp, sizeof(rsp));
    if (ret) {
        error_report("failed to get the header, %s", strerror(errno));
        goto out;
    }

    /* find the right aio_req from the outstanding_aio list */
    QLIST_FOREACH(aio_req, &s->outstanding_aio_head, outstanding_aio_siblings) {
        if (aio_req->id == rsp.id) {
            break;
        }
    }
    if (!aio_req) {
        error_report("cannot find aio_req %x", rsp.id);
        goto out;
    }

    acb = aio_req->aiocb;

    switch (acb->aiocb_type) {
    case AIOCB_WRITE_UDATA:
        /* only writes to data objects can change the inode's object table */
        if (!is_data_obj(aio_req->oid)) {
            break;
        }
        idx = data_oid_to_idx(aio_req->oid);

        if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) {
            /*
             * If the object is newly created one, we need to update
             * the vdi object (metadata object).  min_dirty_data_idx
             * and max_dirty_data_idx are changed to include updated
             * index between them.
             */
            s->inode.data_vdi_id[idx] = s->inode.vdi_id;
            s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
            s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);

            /*
             * Some requests may be blocked because simultaneous
             * create requests are not allowed, so we search the
             * pending requests here.
             */
            send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx), rsp.id);
        }
        break;
    case AIOCB_READ_UDATA:
        /*
         * The data follows the header; receive it into the caller's buffer.
         * NOTE(review): rsp.data_length is not compared with
         * aio_req->data_len before it is used as the receive length.
         */
        ret = do_readv(fd, acb->qiov->iov, rsp.data_length,
                       aio_req->iov_offset);
        if (ret) {
            error_report("failed to get the data, %s", strerror(errno));
            goto out;
        }
        break;
    }

    if (rsp.result != SD_RES_SUCCESS) {
        acb->ret = -EIO;
        error_report("%s", sd_strerror(rsp.result));
    }

    rest = free_aio_req(s, aio_req);
    if (!rest) {
        /*
         * We've finished all requests which belong to the AIOCB, so
         * we can switch back to sd_co_readv/writev now.
         */
        acb->aio_done_func(acb);
    }
out:
    /* let co_read_response() create a fresh coroutine for the next response */
    s->co_recv = NULL;
}
! 867:
! 868: static void co_read_response(void *opaque)
! 869: {
! 870: BDRVSheepdogState *s = opaque;
! 871:
! 872: if (!s->co_recv) {
! 873: s->co_recv = qemu_coroutine_create(aio_read_response);
! 874: }
! 875:
! 876: qemu_coroutine_enter(s->co_recv, opaque);
! 877: }
! 878:
! 879: static void co_write_request(void *opaque)
! 880: {
! 881: BDRVSheepdogState *s = opaque;
! 882:
! 883: qemu_coroutine_enter(s->co_send, NULL);
1.1 root 884: }
885:
886: static int aio_flush_request(void *opaque)
887: {
888: BDRVSheepdogState *s = opaque;
889:
890: return !QLIST_EMPTY(&s->outstanding_aio_head);
891: }
892:
/*
 * Set the TCP_CORK option of `fd' to `v'.  Returns the result of
 * setsockopt().  On platforms that lack SOL_TCP or TCP_CORK this does
 * nothing and returns 0.
 */
static int set_cork(int fd, int v)
{
#if defined(SOL_TCP) && defined(TCP_CORK)
    return setsockopt(fd, SOL_TCP, TCP_CORK, &v, sizeof(v));
#else
    return 0;
#endif
}
908:
/* Enable TCP_NODELAY on `fd'.  Returns the result of setsockopt(). */
static int set_nodelay(int fd)
{
    int enable = 1;

    return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&enable,
                      sizeof(enable));
}
917:
918: /*
919: * Return a socket discriptor to read/write objects.
920: *
921: * We cannot use this discriptor for other operations because
922: * the block driver may be on waiting response from the server.
923: */
924: static int get_sheep_fd(BDRVSheepdogState *s)
925: {
926: int ret, fd;
927:
928: fd = connect_to_sdog(s->addr, s->port);
929: if (fd < 0) {
1.1.1.3 root 930: error_report("%s", strerror(errno));
1.1 root 931: return -1;
932: }
933:
934: socket_set_nonblock(fd);
935:
936: ret = set_nodelay(fd);
937: if (ret) {
1.1.1.3 root 938: error_report("%s", strerror(errno));
1.1 root 939: closesocket(fd);
940: return -1;
941: }
942:
1.1.1.4 ! root 943: qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request,
1.1 root 944: NULL, s);
945: return fd;
946: }
947:
948: /*
949: * Parse a filename
950: *
951: * filename must be one of the following formats:
952: * 1. [vdiname]
953: * 2. [vdiname]:[snapid]
954: * 3. [vdiname]:[tag]
955: * 4. [hostname]:[port]:[vdiname]
956: * 5. [hostname]:[port]:[vdiname]:[snapid]
957: * 6. [hostname]:[port]:[vdiname]:[tag]
958: *
959: * You can boot from the snapshot images by specifying `snapid` or
960: * `tag'.
961: *
962: * You can run VMs outside the Sheepdog cluster by specifying
963: * `hostname' and `port' (experimental).
964: */
965: static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
966: char *vdi, uint32_t *snapid, char *tag)
967: {
968: char *p, *q;
969: int nr_sep;
970:
1.1.1.4 ! root 971: p = q = g_strdup(filename);
1.1 root 972:
973: /* count the number of separators */
974: nr_sep = 0;
975: while (*p) {
976: if (*p == ':') {
977: nr_sep++;
978: }
979: p++;
980: }
981: p = q;
982:
983: /* use the first two tokens as hostname and port number. */
984: if (nr_sep >= 2) {
985: s->addr = p;
986: p = strchr(p, ':');
987: *p++ = '\0';
988:
989: s->port = p;
990: p = strchr(p, ':');
991: *p++ = '\0';
992: } else {
993: s->addr = NULL;
994: s->port = 0;
995: }
996:
997: strncpy(vdi, p, SD_MAX_VDI_LEN);
998:
999: p = strchr(vdi, ':');
1000: if (p) {
1001: *p++ = '\0';
1002: *snapid = strtoul(p, NULL, 10);
1003: if (*snapid == 0) {
1004: strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
1005: }
1006: } else {
1007: *snapid = CURRENT_VDI_ID; /* search current vdi */
1008: }
1009:
1010: if (s->addr == NULL) {
1.1.1.4 ! root 1011: g_free(q);
1.1 root 1012: }
1013:
1014: return 0;
1015: }
1016:
1017: static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
1018: char *tag, uint32_t *vid, int for_snapshot)
1019: {
1020: int ret, fd;
1021: SheepdogVdiReq hdr;
1022: SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1023: unsigned int wlen, rlen = 0;
1024: char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
1025:
1026: fd = connect_to_sdog(s->addr, s->port);
1027: if (fd < 0) {
1028: return -1;
1029: }
1030:
1031: memset(buf, 0, sizeof(buf));
1032: strncpy(buf, filename, SD_MAX_VDI_LEN);
1033: strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
1034:
1035: memset(&hdr, 0, sizeof(hdr));
1036: if (for_snapshot) {
1037: hdr.opcode = SD_OP_GET_VDI_INFO;
1038: } else {
1039: hdr.opcode = SD_OP_LOCK_VDI;
1040: }
1041: wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
1042: hdr.proto_ver = SD_PROTO_VER;
1043: hdr.data_length = wlen;
1044: hdr.snapid = snapid;
1045: hdr.flags = SD_FLAG_CMD_WRITE;
1046:
1047: ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1048: if (ret) {
1049: ret = -1;
1050: goto out;
1051: }
1052:
1053: if (rsp->result != SD_RES_SUCCESS) {
1.1.1.3 root 1054: error_report("cannot get vdi info, %s, %s %d %s",
1.1 root 1055: sd_strerror(rsp->result), filename, snapid, tag);
1056: ret = -1;
1057: goto out;
1058: }
1059: *vid = rsp->vdi_id;
1060:
1061: ret = 0;
1062: out:
1063: closesocket(fd);
1064: return ret;
1065: }
1066:
1.1.1.4 ! root 1067: static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
1.1 root 1068: struct iovec *iov, int niov, int create,
1069: enum AIOCBState aiocb_type)
1070: {
1071: int nr_copies = s->inode.nr_copies;
1072: SheepdogObjReq hdr;
1073: unsigned int wlen;
1074: int ret;
1075: uint64_t oid = aio_req->oid;
1076: unsigned int datalen = aio_req->data_len;
1077: uint64_t offset = aio_req->offset;
1078: uint8_t flags = aio_req->flags;
1079: uint64_t old_oid = aio_req->base_oid;
1080:
1081: if (!nr_copies) {
1.1.1.3 root 1082: error_report("bug");
1.1 root 1083: }
1084:
1085: memset(&hdr, 0, sizeof(hdr));
1086:
1087: if (aiocb_type == AIOCB_READ_UDATA) {
1088: wlen = 0;
1089: hdr.opcode = SD_OP_READ_OBJ;
1090: hdr.flags = flags;
1091: } else if (create) {
1092: wlen = datalen;
1093: hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1094: hdr.flags = SD_FLAG_CMD_WRITE | flags;
1095: } else {
1096: wlen = datalen;
1097: hdr.opcode = SD_OP_WRITE_OBJ;
1098: hdr.flags = SD_FLAG_CMD_WRITE | flags;
1099: }
1100:
1101: hdr.oid = oid;
1102: hdr.cow_oid = old_oid;
1103: hdr.copies = s->inode.nr_copies;
1104:
1105: hdr.data_length = datalen;
1106: hdr.offset = offset;
1107:
1108: hdr.id = aio_req->id;
1109:
1.1.1.4 ! root 1110: qemu_co_mutex_lock(&s->lock);
! 1111: s->co_send = qemu_coroutine_self();
! 1112: qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request,
! 1113: aio_flush_request, NULL, s);
1.1 root 1114: set_cork(s->fd, 1);
1115:
1116: /* send a header */
1117: ret = do_write(s->fd, &hdr, sizeof(hdr));
1118: if (ret) {
1.1.1.4 ! root 1119: qemu_co_mutex_unlock(&s->lock);
1.1.1.3 root 1120: error_report("failed to send a req, %s", strerror(errno));
1.1 root 1121: return -EIO;
1122: }
1123:
1124: if (wlen) {
1125: ret = do_writev(s->fd, iov, wlen, aio_req->iov_offset);
1126: if (ret) {
1.1.1.4 ! root 1127: qemu_co_mutex_unlock(&s->lock);
1.1.1.3 root 1128: error_report("failed to send a data, %s", strerror(errno));
1.1 root 1129: return -EIO;
1130: }
1131: }
1132:
1133: set_cork(s->fd, 0);
1.1.1.4 ! root 1134: qemu_aio_set_fd_handler(s->fd, co_read_response, NULL,
! 1135: aio_flush_request, NULL, s);
! 1136: qemu_co_mutex_unlock(&s->lock);
1.1 root 1137:
1138: return 0;
1139: }
1140:
1141: static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
1142: unsigned int datalen, uint64_t offset,
1143: int write, int create)
1144: {
1145: SheepdogObjReq hdr;
1146: SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1147: unsigned int wlen, rlen;
1148: int ret;
1149:
1150: memset(&hdr, 0, sizeof(hdr));
1151:
1152: if (write) {
1153: wlen = datalen;
1154: rlen = 0;
1155: hdr.flags = SD_FLAG_CMD_WRITE;
1156: if (create) {
1157: hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1158: } else {
1159: hdr.opcode = SD_OP_WRITE_OBJ;
1160: }
1161: } else {
1162: wlen = 0;
1163: rlen = datalen;
1164: hdr.opcode = SD_OP_READ_OBJ;
1165: }
1166: hdr.oid = oid;
1167: hdr.data_length = datalen;
1168: hdr.offset = offset;
1169: hdr.copies = copies;
1170:
1171: ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1172: if (ret) {
1.1.1.3 root 1173: error_report("failed to send a request to the sheep");
1.1 root 1174: return -1;
1175: }
1176:
1177: switch (rsp->result) {
1178: case SD_RES_SUCCESS:
1179: return 0;
1180: default:
1.1.1.3 root 1181: error_report("%s", sd_strerror(rsp->result));
1.1 root 1182: return -1;
1183: }
1184: }
1185:
/*
 * Read @datalen bytes at @offset of object @oid into @buf.
 *
 * Thin wrapper around read_write_object() with both the write and the
 * create flag cleared; its return value is passed through unchanged.
 */
static int read_object(int fd, char *buf, uint64_t oid, int copies,
                       unsigned int datalen, uint64_t offset)
{
    const int is_write = 0;
    const int do_create = 0;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, do_create);
}
1191:
/*
 * Write @datalen bytes from @buf at @offset of object @oid.
 *
 * Thin wrapper around read_write_object() with the write flag set; @create
 * is forwarded as given and the helper's return value is passed through.
 */
static int write_object(int fd, char *buf, uint64_t oid, int copies,
                        unsigned int datalen, uint64_t offset, int create)
{
    const int is_write = 1;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, create);
}
1197:
1198: static int sd_open(BlockDriverState *bs, const char *filename, int flags)
1199: {
1200: int ret, fd;
1201: uint32_t vid = 0;
1202: BDRVSheepdogState *s = bs->opaque;
1203: char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1204: uint32_t snapid;
1205: char *buf = NULL;
1206:
1207: strstart(filename, "sheepdog:", (const char **)&filename);
1208:
1209: QLIST_INIT(&s->outstanding_aio_head);
1210: s->fd = -1;
1211:
1212: memset(vdi, 0, sizeof(vdi));
1213: memset(tag, 0, sizeof(tag));
1214: if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) {
1215: goto out;
1216: }
1217: s->fd = get_sheep_fd(s);
1218: if (s->fd < 0) {
1219: goto out;
1220: }
1221:
1222: ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
1223: if (ret) {
1224: goto out;
1225: }
1226:
1227: if (snapid) {
1228: dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
1229: s->is_snapshot = 1;
1230: }
1231:
1232: fd = connect_to_sdog(s->addr, s->port);
1233: if (fd < 0) {
1.1.1.3 root 1234: error_report("failed to connect");
1.1 root 1235: goto out;
1236: }
1237:
1.1.1.4 ! root 1238: buf = g_malloc(SD_INODE_SIZE);
1.1 root 1239: ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0);
1240:
1241: closesocket(fd);
1242:
1243: if (ret) {
1244: goto out;
1245: }
1246:
1247: memcpy(&s->inode, buf, sizeof(s->inode));
1248: s->min_dirty_data_idx = UINT32_MAX;
1249: s->max_dirty_data_idx = 0;
1250:
1251: bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
1252: strncpy(s->name, vdi, sizeof(s->name));
1.1.1.4 ! root 1253: qemu_co_mutex_init(&s->lock);
! 1254: g_free(buf);
1.1 root 1255: return 0;
1256: out:
1257: qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
1258: if (s->fd >= 0) {
1259: closesocket(s->fd);
1260: }
1.1.1.4 ! root 1261: g_free(buf);
1.1 root 1262: return -1;
1263: }
1264:
1265: static int do_sd_create(char *filename, int64_t vdi_size,
1266: uint32_t base_vid, uint32_t *vdi_id, int snapshot,
1267: const char *addr, const char *port)
1268: {
1269: SheepdogVdiReq hdr;
1270: SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1271: int fd, ret;
1272: unsigned int wlen, rlen = 0;
1273: char buf[SD_MAX_VDI_LEN];
1274:
1275: fd = connect_to_sdog(addr, port);
1276: if (fd < 0) {
1277: return -EIO;
1278: }
1279:
1280: memset(buf, 0, sizeof(buf));
1281: strncpy(buf, filename, SD_MAX_VDI_LEN);
1282:
1283: memset(&hdr, 0, sizeof(hdr));
1284: hdr.opcode = SD_OP_NEW_VDI;
1285: hdr.base_vdi_id = base_vid;
1286:
1287: wlen = SD_MAX_VDI_LEN;
1288:
1289: hdr.flags = SD_FLAG_CMD_WRITE;
1290: hdr.snapid = snapshot;
1291:
1292: hdr.data_length = wlen;
1293: hdr.vdi_size = vdi_size;
1294:
1295: ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1296:
1297: closesocket(fd);
1298:
1299: if (ret) {
1300: return -EIO;
1301: }
1302:
1303: if (rsp->result != SD_RES_SUCCESS) {
1.1.1.3 root 1304: error_report("%s, %s", sd_strerror(rsp->result), filename);
1.1 root 1305: return -EIO;
1306: }
1307:
1308: if (vdi_id) {
1309: *vdi_id = rsp->vdi_id;
1310: }
1311:
1312: return 0;
1313: }
1314:
1.1.1.3 root 1315: static int sd_prealloc(const char *filename)
1316: {
1317: BlockDriverState *bs = NULL;
1318: uint32_t idx, max_idx;
1319: int64_t vdi_size;
1.1.1.4 ! root 1320: void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
1.1.1.3 root 1321: int ret;
1322:
1323: ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
1324: if (ret < 0) {
1325: goto out;
1326: }
1327:
1328: vdi_size = bdrv_getlength(bs);
1329: if (vdi_size < 0) {
1330: ret = vdi_size;
1331: goto out;
1332: }
1333: max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
1334:
1335: for (idx = 0; idx < max_idx; idx++) {
1336: /*
1337: * The created image can be a cloned image, so we need to read
1338: * a data from the source image.
1339: */
1340: ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
1341: if (ret < 0) {
1342: goto out;
1343: }
1344: ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
1345: if (ret < 0) {
1346: goto out;
1347: }
1348: }
1349: out:
1350: if (bs) {
1351: bdrv_delete(bs);
1352: }
1.1.1.4 ! root 1353: g_free(buf);
1.1.1.3 root 1354:
1355: return ret;
1356: }
1357:
1.1 root 1358: static int sd_create(const char *filename, QEMUOptionParameter *options)
1359: {
1360: int ret;
1.1.1.2 root 1361: uint32_t vid = 0, base_vid = 0;
1.1 root 1362: int64_t vdi_size = 0;
1363: char *backing_file = NULL;
1.1.1.2 root 1364: BDRVSheepdogState s;
1365: char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1366: uint32_t snapid;
1.1.1.3 root 1367: int prealloc = 0;
1368: const char *vdiname;
1.1 root 1369:
1.1.1.3 root 1370: strstart(filename, "sheepdog:", &vdiname);
1.1 root 1371:
1.1.1.2 root 1372: memset(&s, 0, sizeof(s));
1373: memset(vdi, 0, sizeof(vdi));
1374: memset(tag, 0, sizeof(tag));
1.1.1.3 root 1375: if (parse_vdiname(&s, vdiname, vdi, &snapid, tag) < 0) {
1376: error_report("invalid filename");
1.1.1.2 root 1377: return -EINVAL;
1378: }
1379:
1.1 root 1380: while (options && options->name) {
1381: if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1382: vdi_size = options->value.n;
1383: } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1384: backing_file = options->value.s;
1.1.1.3 root 1385: } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
1386: if (!options->value.s || !strcmp(options->value.s, "off")) {
1387: prealloc = 0;
1388: } else if (!strcmp(options->value.s, "full")) {
1389: prealloc = 1;
1390: } else {
1391: error_report("Invalid preallocation mode: '%s'",
1392: options->value.s);
1393: return -EINVAL;
1394: }
1.1 root 1395: }
1396: options++;
1397: }
1398:
1399: if (vdi_size > SD_MAX_VDI_SIZE) {
1.1.1.3 root 1400: error_report("too big image size");
1.1 root 1401: return -EINVAL;
1402: }
1403:
1404: if (backing_file) {
1405: BlockDriverState *bs;
1406: BDRVSheepdogState *s;
1407: BlockDriver *drv;
1408:
1409: /* Currently, only Sheepdog backing image is supported. */
1410: drv = bdrv_find_protocol(backing_file);
1411: if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
1.1.1.3 root 1412: error_report("backing_file must be a sheepdog image");
1.1 root 1413: return -EINVAL;
1414: }
1415:
1416: ret = bdrv_file_open(&bs, backing_file, 0);
1417: if (ret < 0)
1418: return -EIO;
1419:
1420: s = bs->opaque;
1421:
1422: if (!is_snapshot(&s->inode)) {
1.1.1.3 root 1423: error_report("cannot clone from a non snapshot vdi");
1.1 root 1424: bdrv_delete(bs);
1425: return -EINVAL;
1426: }
1427:
1.1.1.2 root 1428: base_vid = s->inode.vdi_id;
1.1 root 1429: bdrv_delete(bs);
1430: }
1431:
1.1.1.3 root 1432: ret = do_sd_create(vdi, vdi_size, base_vid, &vid, 0, s.addr, s.port);
1433: if (!prealloc || ret) {
1434: return ret;
1435: }
1436:
1437: return sd_prealloc(filename);
1.1 root 1438: }
1439:
1440: static void sd_close(BlockDriverState *bs)
1441: {
1442: BDRVSheepdogState *s = bs->opaque;
1443: SheepdogVdiReq hdr;
1444: SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1445: unsigned int wlen, rlen = 0;
1446: int fd, ret;
1447:
1448: dprintf("%s\n", s->name);
1449:
1450: fd = connect_to_sdog(s->addr, s->port);
1451: if (fd < 0) {
1452: return;
1453: }
1454:
1455: memset(&hdr, 0, sizeof(hdr));
1456:
1457: hdr.opcode = SD_OP_RELEASE_VDI;
1458: wlen = strlen(s->name) + 1;
1459: hdr.data_length = wlen;
1460: hdr.flags = SD_FLAG_CMD_WRITE;
1461:
1462: ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);
1463:
1464: closesocket(fd);
1465:
1466: if (!ret && rsp->result != SD_RES_SUCCESS &&
1467: rsp->result != SD_RES_VDI_NOT_LOCKED) {
1.1.1.3 root 1468: error_report("%s, %s", sd_strerror(rsp->result), s->name);
1.1 root 1469: }
1470:
1471: qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
1472: closesocket(s->fd);
1.1.1.4 ! root 1473: g_free(s->addr);
1.1 root 1474: }
1475:
1476: static int64_t sd_getlength(BlockDriverState *bs)
1477: {
1478: BDRVSheepdogState *s = bs->opaque;
1479:
1480: return s->inode.vdi_size;
1481: }
1482:
1483: static int sd_truncate(BlockDriverState *bs, int64_t offset)
1484: {
1485: BDRVSheepdogState *s = bs->opaque;
1486: int ret, fd;
1487: unsigned int datalen;
1488:
1489: if (offset < s->inode.vdi_size) {
1.1.1.3 root 1490: error_report("shrinking is not supported");
1.1 root 1491: return -EINVAL;
1492: } else if (offset > SD_MAX_VDI_SIZE) {
1.1.1.3 root 1493: error_report("too big image size");
1.1 root 1494: return -EINVAL;
1495: }
1496:
1497: fd = connect_to_sdog(s->addr, s->port);
1498: if (fd < 0) {
1499: return -EIO;
1500: }
1501:
1502: /* we don't need to update entire object */
1503: datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1504: s->inode.vdi_size = offset;
1505: ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1506: s->inode.nr_copies, datalen, 0, 0);
1507: close(fd);
1508:
1509: if (ret < 0) {
1.1.1.3 root 1510: error_report("failed to update an inode.");
1.1 root 1511: return -EIO;
1512: }
1513:
1514: return 0;
1515: }
1516:
1517: /*
1518: * This function is called after writing data objects. If we need to
1519: * update metadata, this sends a write request to the vdi object.
1.1.1.4 ! root 1520: * Otherwise, this switches back to sd_co_readv/writev.
1.1 root 1521: */
1.1.1.4 ! root 1522: static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
1.1 root 1523: {
1524: int ret;
1525: BDRVSheepdogState *s = acb->common.bs->opaque;
1526: struct iovec iov;
1527: AIOReq *aio_req;
1528: uint32_t offset, data_len, mn, mx;
1529:
1530: mn = s->min_dirty_data_idx;
1531: mx = s->max_dirty_data_idx;
1532: if (mn <= mx) {
1533: /* we need to update the vdi object. */
1534: offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
1535: mn * sizeof(s->inode.data_vdi_id[0]);
1536: data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
1537:
1538: s->min_dirty_data_idx = UINT32_MAX;
1539: s->max_dirty_data_idx = 0;
1540:
1541: iov.iov_base = &s->inode;
1542: iov.iov_len = sizeof(s->inode);
1543: aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
1544: data_len, offset, 0, 0, offset);
1545: ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA);
1546: if (ret) {
1547: free_aio_req(s, aio_req);
1548: acb->ret = -EIO;
1549: goto out;
1550: }
1551:
1552: acb->aio_done_func = sd_finish_aiocb;
1553: acb->aiocb_type = AIOCB_WRITE_UDATA;
1554: return;
1555: }
1556: out:
1557: sd_finish_aiocb(acb);
1558: }
1559:
1560: /*
1561: * Create a writable VDI from a snapshot
1562: */
1563: static int sd_create_branch(BDRVSheepdogState *s)
1564: {
1565: int ret, fd;
1566: uint32_t vid;
1567: char *buf;
1568:
1569: dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
1570:
1.1.1.4 ! root 1571: buf = g_malloc(SD_INODE_SIZE);
1.1 root 1572:
1573: ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1,
1574: s->addr, s->port);
1575: if (ret) {
1576: goto out;
1577: }
1578:
1579: dprintf("%" PRIx32 " is created.\n", vid);
1580:
1581: fd = connect_to_sdog(s->addr, s->port);
1582: if (fd < 0) {
1.1.1.3 root 1583: error_report("failed to connect");
1.1 root 1584: goto out;
1585: }
1586:
1587: ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1588: SD_INODE_SIZE, 0);
1589:
1590: closesocket(fd);
1591:
1592: if (ret < 0) {
1593: goto out;
1594: }
1595:
1596: memcpy(&s->inode, buf, sizeof(s->inode));
1597:
1598: s->is_snapshot = 0;
1599: ret = 0;
1600: dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
1601:
1602: out:
1.1.1.4 ! root 1603: g_free(buf);
1.1 root 1604:
1605: return ret;
1606: }
1607:
1608: /*
1609: * Send I/O requests to the server.
1610: *
1611: * This function sends requests to the server, links the requests to
1612: * the outstanding_list in BDRVSheepdogState, and exits without
1613: * waiting the response. The responses are received in the
1614: * `aio_read_response' function which is called from the main loop as
1615: * a fd handler.
1.1.1.4 ! root 1616: *
! 1617: * Returns 1 when we need to wait a response, 0 when there is no sent
! 1618: * request and -errno in error cases.
1.1 root 1619: */
1.1.1.4 ! root 1620: static int coroutine_fn sd_co_rw_vector(void *p)
1.1 root 1621: {
1622: SheepdogAIOCB *acb = p;
1623: int ret = 0;
1624: unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE;
1625: unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE;
1626: uint64_t oid;
1627: uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
1628: BDRVSheepdogState *s = acb->common.bs->opaque;
1629: SheepdogInode *inode = &s->inode;
1630: AIOReq *aio_req;
1631:
1632: if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
1633: /*
1634: * In the case we open the snapshot VDI, Sheepdog creates the
1635: * writable VDI when we do a write operation first.
1636: */
1637: ret = sd_create_branch(s);
1638: if (ret) {
1639: acb->ret = -EIO;
1640: goto out;
1641: }
1642: }
1643:
1644: while (done != total) {
1645: uint8_t flags = 0;
1646: uint64_t old_oid = 0;
1647: int create = 0;
1648:
1649: oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
1650:
1651: len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
1652:
1653: if (!inode->data_vdi_id[idx]) {
1654: if (acb->aiocb_type == AIOCB_READ_UDATA) {
1655: goto done;
1656: }
1657:
1658: create = 1;
1659: } else if (acb->aiocb_type == AIOCB_WRITE_UDATA
1.1.1.3 root 1660: && !is_data_obj_writable(inode, idx)) {
1.1 root 1661: /* Copy-On-Write */
1662: create = 1;
1663: old_oid = oid;
1664: flags = SD_FLAG_CMD_COW;
1665: }
1666:
1667: if (create) {
1668: dprintf("update ino (%" PRIu32") %" PRIu64 " %" PRIu64
1669: " %" PRIu64 "\n", inode->vdi_id, oid,
1670: vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
1671: oid = vid_to_data_oid(inode->vdi_id, idx);
1672: dprintf("new oid %lx\n", oid);
1673: }
1674:
1675: aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
1676:
1677: if (create) {
1678: AIOReq *areq;
1679: QLIST_FOREACH(areq, &s->outstanding_aio_head,
1680: outstanding_aio_siblings) {
1681: if (areq == aio_req) {
1682: continue;
1683: }
1684: if (areq->oid == oid) {
1685: /*
1686: * Sheepdog cannot handle simultaneous create
1687: * requests to the same object. So we cannot send
1688: * the request until the previous request
1689: * finishes.
1690: */
1691: aio_req->flags = 0;
1692: aio_req->base_oid = 0;
1693: goto done;
1694: }
1695: }
1696: }
1697:
1698: ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
1699: create, acb->aiocb_type);
1700: if (ret < 0) {
1.1.1.3 root 1701: error_report("add_aio_request is failed");
1.1 root 1702: free_aio_req(s, aio_req);
1703: acb->ret = -EIO;
1704: goto out;
1705: }
1706: done:
1707: offset = 0;
1708: idx++;
1709: done += len;
1710: }
1711: out:
1712: if (QLIST_EMPTY(&acb->aioreq_head)) {
1.1.1.4 ! root 1713: return acb->ret;
1.1 root 1714: }
1.1.1.4 ! root 1715: return 1;
1.1 root 1716: }
1717:
1.1.1.4 ! root 1718: static int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
! 1719: int nb_sectors, QEMUIOVector *qiov)
1.1 root 1720: {
1721: SheepdogAIOCB *acb;
1.1.1.4 ! root 1722: int ret;
1.1 root 1723:
1724: if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
1725: /* TODO: shouldn't block here */
1726: if (sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE) < 0) {
1.1.1.4 ! root 1727: return -EIO;
1.1 root 1728: }
1729: bs->total_sectors = sector_num + nb_sectors;
1730: }
1731:
1.1.1.4 ! root 1732: acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
1.1 root 1733: acb->aio_done_func = sd_write_done;
1734: acb->aiocb_type = AIOCB_WRITE_UDATA;
1735:
1.1.1.4 ! root 1736: ret = sd_co_rw_vector(acb);
! 1737: if (ret <= 0) {
! 1738: qemu_aio_release(acb);
! 1739: return ret;
! 1740: }
! 1741:
! 1742: qemu_coroutine_yield();
! 1743:
! 1744: return acb->ret;
1.1 root 1745: }
1746:
1.1.1.4 ! root 1747: static int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
! 1748: int nb_sectors, QEMUIOVector *qiov)
1.1 root 1749: {
1750: SheepdogAIOCB *acb;
1.1.1.4 ! root 1751: int i, ret;
1.1 root 1752:
1.1.1.4 ! root 1753: acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
1.1 root 1754: acb->aiocb_type = AIOCB_READ_UDATA;
1755: acb->aio_done_func = sd_finish_aiocb;
1756:
1757: /*
1758: * TODO: we can do better; we don't need to initialize
1759: * blindly.
1760: */
1761: for (i = 0; i < qiov->niov; i++) {
1762: memset(qiov->iov[i].iov_base, 0, qiov->iov[i].iov_len);
1763: }
1764:
1.1.1.4 ! root 1765: ret = sd_co_rw_vector(acb);
! 1766: if (ret <= 0) {
! 1767: qemu_aio_release(acb);
! 1768: return ret;
! 1769: }
! 1770:
! 1771: qemu_coroutine_yield();
! 1772:
! 1773: return acb->ret;
1.1 root 1774: }
1775:
1776: static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
1777: {
1778: BDRVSheepdogState *s = bs->opaque;
1779: int ret, fd;
1780: uint32_t new_vid;
1781: SheepdogInode *inode;
1782: unsigned int datalen;
1783:
1784: dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %d "
1785: "is_snapshot %d\n", sn_info->name, sn_info->id_str,
1786: s->name, sn_info->vm_state_size, s->is_snapshot);
1787:
1788: if (s->is_snapshot) {
1789: error_report("You can't create a snapshot of a snapshot VDI, "
1.1.1.3 root 1790: "%s (%" PRIu32 ").", s->name, s->inode.vdi_id);
1.1 root 1791:
1792: return -EINVAL;
1793: }
1794:
1795: dprintf("%s %s\n", sn_info->name, sn_info->id_str);
1796:
1797: s->inode.vm_state_size = sn_info->vm_state_size;
1798: s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
1799: strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
1800: /* we don't need to update entire object */
1801: datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1802:
1803: /* refresh inode. */
1804: fd = connect_to_sdog(s->addr, s->port);
1805: if (fd < 0) {
1806: ret = -EIO;
1807: goto cleanup;
1808: }
1809:
1810: ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1811: s->inode.nr_copies, datalen, 0, 0);
1812: if (ret < 0) {
1.1.1.3 root 1813: error_report("failed to write snapshot's inode.");
1.1 root 1814: ret = -EIO;
1815: goto cleanup;
1816: }
1817:
1818: ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1,
1819: s->addr, s->port);
1820: if (ret < 0) {
1.1.1.3 root 1821: error_report("failed to create inode for snapshot. %s",
1.1 root 1822: strerror(errno));
1823: ret = -EIO;
1824: goto cleanup;
1825: }
1826:
1.1.1.4 ! root 1827: inode = (SheepdogInode *)g_malloc(datalen);
1.1 root 1828:
1829: ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
1830: s->inode.nr_copies, datalen, 0);
1831:
1832: if (ret < 0) {
1.1.1.3 root 1833: error_report("failed to read new inode info. %s", strerror(errno));
1.1 root 1834: ret = -EIO;
1835: goto cleanup;
1836: }
1837:
1838: memcpy(&s->inode, inode, datalen);
1839: dprintf("s->inode: name %s snap_id %x oid %x\n",
1840: s->inode.name, s->inode.snap_id, s->inode.vdi_id);
1841:
1842: cleanup:
1843: closesocket(fd);
1844: return ret;
1845: }
1846:
1847: static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
1848: {
1849: BDRVSheepdogState *s = bs->opaque;
1850: BDRVSheepdogState *old_s;
1851: char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1852: char *buf = NULL;
1853: uint32_t vid;
1854: uint32_t snapid = 0;
1855: int ret = -ENOENT, fd;
1856:
1.1.1.4 ! root 1857: old_s = g_malloc(sizeof(BDRVSheepdogState));
1.1 root 1858:
1859: memcpy(old_s, s, sizeof(BDRVSheepdogState));
1860:
1861: memset(vdi, 0, sizeof(vdi));
1862: strncpy(vdi, s->name, sizeof(vdi));
1863:
1864: memset(tag, 0, sizeof(tag));
1865: snapid = strtoul(snapshot_id, NULL, 10);
1866: if (!snapid) {
1867: strncpy(tag, s->name, sizeof(tag));
1868: }
1869:
1870: ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
1871: if (ret) {
1.1.1.3 root 1872: error_report("Failed to find_vdi_name");
1.1 root 1873: ret = -ENOENT;
1874: goto out;
1875: }
1876:
1877: fd = connect_to_sdog(s->addr, s->port);
1878: if (fd < 0) {
1.1.1.3 root 1879: error_report("failed to connect");
1.1 root 1880: goto out;
1881: }
1882:
1.1.1.4 ! root 1883: buf = g_malloc(SD_INODE_SIZE);
1.1 root 1884: ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1885: SD_INODE_SIZE, 0);
1886:
1887: closesocket(fd);
1888:
1889: if (ret) {
1890: ret = -ENOENT;
1891: goto out;
1892: }
1893:
1894: memcpy(&s->inode, buf, sizeof(s->inode));
1895:
1896: if (!s->inode.vm_state_size) {
1.1.1.3 root 1897: error_report("Invalid snapshot");
1.1 root 1898: ret = -ENOENT;
1899: goto out;
1900: }
1901:
1902: s->is_snapshot = 1;
1903:
1.1.1.4 ! root 1904: g_free(buf);
! 1905: g_free(old_s);
1.1 root 1906:
1907: return 0;
1908: out:
1909: /* recover bdrv_sd_state */
1910: memcpy(s, old_s, sizeof(BDRVSheepdogState));
1.1.1.4 ! root 1911: g_free(buf);
! 1912: g_free(old_s);
1.1 root 1913:
1.1.1.3 root 1914: error_report("failed to open. recover old bdrv_sd_state.");
1.1 root 1915:
1916: return ret;
1917: }
1918:
1919: static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
1920: {
1921: /* FIXME: Delete specified snapshot id. */
1922: return 0;
1923: }
1924:
1925: static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
1926: {
1927: BDRVSheepdogState *s = bs->opaque;
1928: SheepdogReq req;
1929: int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
1930: QEMUSnapshotInfo *sn_tab = NULL;
1931: unsigned wlen, rlen;
1932: int found = 0;
1933: static SheepdogInode inode;
1934: unsigned long *vdi_inuse;
1935: unsigned int start_nr;
1936: uint64_t hval;
1937: uint32_t vid;
1938:
1.1.1.4 ! root 1939: vdi_inuse = g_malloc(max);
1.1 root 1940:
1941: fd = connect_to_sdog(s->addr, s->port);
1942: if (fd < 0) {
1943: goto out;
1944: }
1945:
1946: rlen = max;
1947: wlen = 0;
1948:
1949: memset(&req, 0, sizeof(req));
1950:
1951: req.opcode = SD_OP_READ_VDIS;
1952: req.data_length = max;
1953:
1954: ret = do_req(fd, (SheepdogReq *)&req, vdi_inuse, &wlen, &rlen);
1955:
1956: closesocket(fd);
1957: if (ret) {
1958: goto out;
1959: }
1960:
1.1.1.4 ! root 1961: sn_tab = g_malloc0(nr * sizeof(*sn_tab));
1.1 root 1962:
1963: /* calculate a vdi id with hash function */
1964: hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
1965: start_nr = hval & (SD_NR_VDIS - 1);
1966:
1967: fd = connect_to_sdog(s->addr, s->port);
1968: if (fd < 0) {
1.1.1.3 root 1969: error_report("failed to connect");
1.1 root 1970: goto out;
1971: }
1972:
1973: for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
1974: if (!test_bit(vid, vdi_inuse)) {
1975: break;
1976: }
1977:
1978: /* we don't need to read entire object */
1979: ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
1980: 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0);
1981:
1982: if (ret) {
1983: continue;
1984: }
1985:
1986: if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
1987: sn_tab[found].date_sec = inode.snap_ctime >> 32;
1988: sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
1989: sn_tab[found].vm_state_size = inode.vm_state_size;
1990: sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
1991:
1992: snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
1993: inode.snap_id);
1994: strncpy(sn_tab[found].name, inode.tag,
1995: MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)));
1996: found++;
1997: }
1998: }
1999:
2000: closesocket(fd);
2001: out:
2002: *psn_tab = sn_tab;
2003:
1.1.1.4 ! root 2004: g_free(vdi_inuse);
1.1 root 2005:
2006: return found;
2007: }
2008:
2009: static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
2010: int64_t pos, int size, int load)
2011: {
2012: int fd, create;
2013: int ret = 0;
2014: unsigned int data_len;
2015: uint64_t vmstate_oid;
2016: uint32_t vdi_index;
2017: uint64_t offset;
2018:
2019: fd = connect_to_sdog(s->addr, s->port);
2020: if (fd < 0) {
2021: ret = -EIO;
2022: goto cleanup;
2023: }
2024:
2025: while (size) {
2026: vdi_index = pos / SD_DATA_OBJ_SIZE;
2027: offset = pos % SD_DATA_OBJ_SIZE;
2028:
2029: data_len = MIN(size, SD_DATA_OBJ_SIZE);
2030:
2031: vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
2032:
2033: create = (offset == 0);
2034: if (load) {
2035: ret = read_object(fd, (char *)data, vmstate_oid,
2036: s->inode.nr_copies, data_len, offset);
2037: } else {
2038: ret = write_object(fd, (char *)data, vmstate_oid,
2039: s->inode.nr_copies, data_len, offset, create);
2040: }
2041:
2042: if (ret < 0) {
1.1.1.3 root 2043: error_report("failed to save vmstate %s", strerror(errno));
1.1 root 2044: ret = -EIO;
2045: goto cleanup;
2046: }
2047:
2048: pos += data_len;
2049: size -= data_len;
2050: ret += data_len;
2051: }
2052: cleanup:
2053: closesocket(fd);
2054: return ret;
2055: }
2056:
2057: static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data,
2058: int64_t pos, int size)
2059: {
2060: BDRVSheepdogState *s = bs->opaque;
2061:
2062: return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0);
2063: }
2064:
2065: static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
2066: int64_t pos, int size)
2067: {
2068: BDRVSheepdogState *s = bs->opaque;
2069:
2070: return do_load_save_vmstate(s, data, pos, size, 1);
2071: }
2072:
2073:
2074: static QEMUOptionParameter sd_create_options[] = {
2075: {
2076: .name = BLOCK_OPT_SIZE,
2077: .type = OPT_SIZE,
2078: .help = "Virtual disk size"
2079: },
2080: {
2081: .name = BLOCK_OPT_BACKING_FILE,
2082: .type = OPT_STRING,
2083: .help = "File name of a base image"
2084: },
1.1.1.3 root 2085: {
2086: .name = BLOCK_OPT_PREALLOC,
2087: .type = OPT_STRING,
2088: .help = "Preallocation mode (allowed values: off, full)"
2089: },
1.1 root 2090: { NULL }
2091: };
2092:
2093: BlockDriver bdrv_sheepdog = {
2094: .format_name = "sheepdog",
2095: .protocol_name = "sheepdog",
2096: .instance_size = sizeof(BDRVSheepdogState),
2097: .bdrv_file_open = sd_open,
2098: .bdrv_close = sd_close,
2099: .bdrv_create = sd_create,
2100: .bdrv_getlength = sd_getlength,
2101: .bdrv_truncate = sd_truncate,
2102:
1.1.1.4 ! root 2103: .bdrv_co_readv = sd_co_readv,
! 2104: .bdrv_co_writev = sd_co_writev,
1.1 root 2105:
2106: .bdrv_snapshot_create = sd_snapshot_create,
2107: .bdrv_snapshot_goto = sd_snapshot_goto,
2108: .bdrv_snapshot_delete = sd_snapshot_delete,
2109: .bdrv_snapshot_list = sd_snapshot_list,
2110:
2111: .bdrv_save_vmstate = sd_save_vmstate,
2112: .bdrv_load_vmstate = sd_load_vmstate,
2113:
2114: .create_options = sd_create_options,
2115: };
2116:
2117: static void bdrv_sheepdog_init(void)
2118: {
2119: bdrv_register(&bdrv_sheepdog);
2120: }
2121: block_init(bdrv_sheepdog_init);
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.