|
|
1.1 root 1: /*
2: * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3: *
4: * This program is free software; you can redistribute it and/or
5: * modify it under the terms of the GNU General Public License version
6: * 2 as published by the Free Software Foundation.
7: *
8: * You should have received a copy of the GNU General Public License
9: * along with this program. If not, see <http://www.gnu.org/licenses/>.
10: */
11:
12: #include "qemu-common.h"
13: #include "qemu-error.h"
14: #include "qemu_socket.h"
15: #include "block_int.h"
16:
/* Protocol version stored in the proto_ver field of request headers. */
17: #define SD_PROTO_VER 0x01
18:
/* Server used by connect_to_sdog() when the caller passes no address. */
19: #define SD_DEFAULT_ADDR "localhost"
20: #define SD_DEFAULT_PORT "7000"
21:
/* Opcodes for object requests (see add_aio_request(), read_write_object()). */
22: #define SD_OP_CREATE_AND_WRITE_OBJ 0x01
23: #define SD_OP_READ_OBJ 0x02
24: #define SD_OP_WRITE_OBJ 0x03
25:
/* Opcodes for vdi requests (see find_vdi_name(), do_sd_create()). */
26: #define SD_OP_NEW_VDI 0x11
27: #define SD_OP_LOCK_VDI 0x12
28: #define SD_OP_RELEASE_VDI 0x13
29: #define SD_OP_GET_VDI_INFO 0x14
30: #define SD_OP_READ_VDIS 0x15
31:
/*
 * Bits for the flags field of a request header.  SD_FLAG_CMD_WRITE is set
 * by every request in this file that sends a payload after the header.
 */
32: #define SD_FLAG_CMD_WRITE 0x01
33: #define SD_FLAG_CMD_COW 0x02
34:
/*
 * Values of the result field of a response header.  sd_strerror() maps
 * each of them to a human readable message.
 */
35: #define SD_RES_SUCCESS 0x00 /* Success */
36: #define SD_RES_UNKNOWN 0x01 /* Unknown error */
37: #define SD_RES_NO_OBJ 0x02 /* No object found */
38: #define SD_RES_EIO 0x03 /* I/O error */
39: #define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */
40: #define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
41: #define SD_RES_SYSTEM_ERROR 0x06 /* System error */
42: #define SD_RES_VDI_LOCKED 0x07 /* Vdi is locked */
43: #define SD_RES_NO_VDI 0x08 /* No vdi found */
44: #define SD_RES_NO_BASE_VDI 0x09 /* No base vdi found */
45: #define SD_RES_VDI_READ 0x0A /* Cannot read requested vdi */
46: #define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */
47: #define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
48: #define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base vdi */
49: #define SD_RES_NO_TAG 0x0E /* Requested tag is not found */
50: #define SD_RES_STARTUP 0x0F /* Sheepdog is starting up */
51: #define SD_RES_VDI_NOT_LOCKED 0x10 /* Vdi is not locked */
52: #define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */
53: #define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */
54: #define SD_RES_FULL_VDI 0x13 /* We already have the maximum number of vdis */
55: #define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */
56: #define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */
57: #define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation */
58: #define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */
59: #define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */
60:
61: /*
62: * Object ID rules
63: *
64: * 0 - 19 (20 bits): data object space
65: * 20 - 31 (12 bits): reserved data object space
66: * 32 - 55 (24 bits): vdi object space
67: * 56 - 59 ( 4 bits): reserved vdi object space
68: * 60 - 63 ( 4 bits): object type identifier space
69: */
70:
/* Bit position of the vdi id inside a 64 bit object id. */
71: #define VDI_SPACE_SHIFT 32
/* Set in the object id of a vdi (inode) object; is_data_obj() tests it. */
72: #define VDI_BIT (UINT64_C(1) << 63)
/* Set in the object ids built by vid_to_vmstate_oid(). */
73: #define VMSTATE_BIT (UINT64_C(1) << 62)
/* Number of entries in SheepdogInode.data_vdi_id[] (2^20). */
74: #define MAX_DATA_OBJS (UINT64_C(1) << 20)
/* Number of entries in SheepdogInode.child_vdi_id[]. */
75: #define MAX_CHILDREN 1024
/* Sizes of the fixed length name and tag fields. */
76: #define SD_MAX_VDI_LEN 256
77: #define SD_MAX_VDI_TAG_LEN 256
78: #define SD_NR_VDIS (1U << 24)
/* 2^22 bytes; presumably the size of one data object -- TODO confirm. */
79: #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
80: #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
/* sd_open() divides the vdi size by this to obtain the sector count. */
81: #define SECTOR_SIZE 512
82:
83: #define SD_INODE_SIZE (sizeof(SheepdogInode))
/* Snapshot id parse_vdiname() reports when the filename names no snapshot. */
84: #define CURRENT_VDI_ID 0
85:
/*
 * Generic request header.  send_req() writes it to the socket ahead of any
 * payload and do_req() reads the response header back into the same
 * buffer.  Callers cast the typed request structures to this type.
 */
86: typedef struct SheepdogReq {
87: uint8_t proto_ver;
88: uint8_t opcode; /* one of the SD_OP_* values */
89: uint16_t flags; /* SD_FLAG_CMD_* bits */
90: uint32_t epoch;
91: uint32_t id; /* request id */
92: uint32_t data_length; /* length of the data following the header */
93: uint32_t opcode_specific[8];
94: } SheepdogReq;
95:
/*
 * Generic response header.  do_req() limits the amount of response data it
 * reads to data_length.
 */
96: typedef struct SheepdogRsp {
97: uint8_t proto_ver;
98: uint8_t opcode;
99: uint16_t flags;
100: uint32_t epoch;
101: uint32_t id;
102: uint32_t data_length;
103: uint32_t result; /* one of the SD_RES_* values */
104: uint32_t opcode_specific[7];
105: } SheepdogRsp;
106:
/*
 * Request header for object reads and writes; filled in by
 * add_aio_request() and read_write_object().
 */
107: typedef struct SheepdogObjReq {
108: uint8_t proto_ver;
109: uint8_t opcode;
110: uint16_t flags;
111: uint32_t epoch;
112: uint32_t id; /* AIOReq id, echoed back in the response */
113: uint32_t data_length;
114: uint64_t oid; /* object the request operates on */
115: uint64_t cow_oid; /* add_aio_request() stores the AIOReq base_oid here */
116: uint32_t copies;
117: uint32_t rsvd;
118: uint64_t offset; /* offset handed to the server for this request */
119: } SheepdogObjReq;
120:
/*
 * Response header for object requests.  aio_read_response() reads one of
 * these from the socket and uses id to look up the matching AIOReq.
 */
121: typedef struct SheepdogObjRsp {
122: uint8_t proto_ver;
123: uint8_t opcode;
124: uint16_t flags;
125: uint32_t epoch;
126: uint32_t id;
127: uint32_t data_length; /* length of the data following the header */
128: uint32_t result; /* one of the SD_RES_* values */
129: uint32_t copies;
130: uint32_t pad[6];
131: } SheepdogObjRsp;
132:
/*
 * Request header for vdi operations; filled in by find_vdi_name() and
 * do_sd_create().
 */
133: typedef struct SheepdogVdiReq {
134: uint8_t proto_ver;
135: uint8_t opcode;
136: uint16_t flags;
137: uint32_t epoch;
138: uint32_t id;
139: uint32_t data_length;
140: uint64_t vdi_size; /* size requested by do_sd_create() */
141: uint32_t base_vdi_id;
142: uint32_t copies;
143: uint32_t snapid;
144: uint32_t pad[3];
145: } SheepdogVdiReq;
146:
/*
 * Response header for vdi operations.  The callers in this file read the
 * response in place over the SheepdogVdiReq they sent.
 */
147: typedef struct SheepdogVdiRsp {
148: uint8_t proto_ver;
149: uint8_t opcode;
150: uint16_t flags;
151: uint32_t epoch;
152: uint32_t id;
153: uint32_t data_length;
154: uint32_t result; /* one of the SD_RES_* values */
155: uint32_t rsvd;
156: uint32_t vdi_id; /* id of the vdi that was found or created */
157: uint32_t pad[5];
158: } SheepdogVdiRsp;
159:
/*
 * Contents of a vdi object.  sd_open() reads SD_INODE_SIZE bytes of the
 * object vid_to_vdi_oid(vid) and copies them into BDRVSheepdogState.inode.
 */
160: typedef struct SheepdogInode {
161: char name[SD_MAX_VDI_LEN];
162: char tag[SD_MAX_VDI_TAG_LEN];
163: uint64_t ctime;
164: uint64_t snap_ctime; /* non-zero for a snapshot, see is_snapshot() */
165: uint64_t vm_clock_nsec;
166: uint64_t vdi_size; /* in bytes; sd_open() divides it by SECTOR_SIZE */
167: uint64_t vm_state_size;
168: uint16_t copy_policy;
169: uint8_t nr_copies; /* sent in the copies field of object requests */
170: uint8_t block_size_shift;
171: uint32_t snap_id;
172: uint32_t vdi_id;
173: uint32_t parent_vdi_id;
174: uint32_t child_vdi_id[MAX_CHILDREN];
/* vdi id recorded for each data object index; compared with vdi_id */
175: uint32_t data_vdi_id[MAX_DATA_OBJS];
176: } SheepdogInode;
177:
/*
 * Non-zero initial basis of the 64 bit FNV-1a hash
 */
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)

/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
 *
 * Folds the `len' bytes at `buf' into the running hash `hval' and returns
 * the new hash.  Pass FNV1A_64_INIT for the first buffer, or the value
 * returned by a previous call to continue hashing.
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    const unsigned char *bytes = buf;
    size_t i;

    for (i = 0; i < len; i++) {
        hval ^= (uint64_t)bytes[i];
        /*
         * Multiply by the 64 bit FNV prime:
         * 0x100000001b3 == 2^40 + 2^8 + 2^7 + 2^5 + 2^4 + 2^1 + 2^0
         */
        hval *= UINT64_C(0x100000001b3);
    }

    return hval;
}
197:
198: static inline int is_data_obj_writeable(SheepdogInode *inode, unsigned int idx)
199: {
200: return inode->vdi_id == inode->data_vdi_id[idx];
201: }
202:
203: static inline int is_data_obj(uint64_t oid)
204: {
205: return !(VDI_BIT & oid);
206: }
207:
208: static inline uint64_t data_oid_to_idx(uint64_t oid)
209: {
210: return oid & (MAX_DATA_OBJS - 1);
211: }
212:
213: static inline uint64_t vid_to_vdi_oid(uint32_t vid)
214: {
215: return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
216: }
217:
218: static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
219: {
220: return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
221: }
222:
223: static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
224: {
225: return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
226: }
227:
228: static inline int is_snapshot(struct SheepdogInode *inode)
229: {
230: return !!inode->snap_ctime;
231: }
232:
/*
 * Debug trace macro.  It expands to nothing unless DEBUG_SDOG is defined,
 * in which case it prints the calling function and line number followed by
 * the formatted message to stdout.
 */
#undef dprintf
#ifndef DEBUG_SDOG
#define dprintf(fmt, args...)
#else
#define dprintf(fmt, args...)                                       \
    do {                                                            \
        fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
    } while (0)
#endif
242:
/* Forward declaration: AIOReq points back at the AIOCB it belongs to. */
243: typedef struct SheepdogAIOCB SheepdogAIOCB;
244:
/*
 * One request sent to the server on behalf of a SheepdogAIOCB.  Created by
 * alloc_aio_req() and released by free_aio_req().
 */
245: typedef struct AIOReq {
246: SheepdogAIOCB *aiocb; /* AIOCB this request belongs to */
247: unsigned int iov_offset; /* offset into the AIOCB's qiov for the data */
248:
249: uint64_t oid; /* object the request operates on */
250: uint64_t base_oid; /* sent as cow_oid by add_aio_request() */
251: uint64_t offset;
252: unsigned int data_len;
253: uint8_t flags;
254: uint32_t id; /* taken from BDRVSheepdogState.aioreq_seq_num */
255:
/* link in BDRVSheepdogState.outstanding_aio_head */
256: QLIST_ENTRY(AIOReq) outstanding_aio_siblings;
/* link in SheepdogAIOCB.aioreq_head */
257: QLIST_ENTRY(AIOReq) aioreq_siblings;
258: } AIOReq;
259:
/* Kind of I/O an AIOCB performs; selects the handling of its responses. */
enum AIOCBState {
    AIOCB_WRITE_UDATA = 0,
    AIOCB_READ_UDATA = 1,
};
264:
/*
 * Per I/O state, allocated from sd_aio_pool by sd_aio_setup().  An AIOCB
 * owns a list of AIOReqs.
 */
265: struct SheepdogAIOCB {
/* must stay first: sd_aio_cancel() casts BlockDriverAIOCB * to this type */
266: BlockDriverAIOCB common;
267:
268: QEMUIOVector *qiov; /* caller's data buffers */
269:
270: int64_t sector_num;
271: int nb_sectors;
272:
273: int ret; /* value passed to the completion callback */
274: enum AIOCBState aiocb_type;
275:
276: QEMUBH *bh; /* set by sd_schedule_bh() */
/* called by aio_read_response() when the last AIOReq has completed */
277: void (*aio_done_func)(SheepdogAIOCB *);
278:
279: int canceled; /* set by sd_aio_cancel(); suppresses the callback */
280:
281: QLIST_HEAD(aioreq_head, AIOReq) aioreq_head;
282: };
283:
/* Per device driver state (bs->opaque), initialised by sd_open(). */
284: typedef struct BDRVSheepdogState {
285: SheepdogInode inode; /* copy of the vdi object read by sd_open() */
286:
/*
 * Range of data_vdi_id[] indexes changed by aio_read_response();
 * sd_open() starts with the empty range (UINT32_MAX, 0).
 */
287: uint32_t min_dirty_data_idx;
288: uint32_t max_dirty_data_idx;
289:
290: char name[SD_MAX_VDI_LEN];
291: int is_snapshot; /* set by sd_open() when a snapshot id was given */
292:
/* server address and port as set by parse_vdiname(); addr may be NULL */
293: char *addr;
294: char *port;
295: int fd; /* socket returned by get_sheep_fd() */
296:
297: uint32_t aioreq_seq_num; /* source of AIOReq ids */
298: QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
299: } BDRVSheepdogState;
300:
301: static const char * sd_strerror(int err)
302: {
303: int i;
304:
305: static const struct {
306: int err;
307: const char *desc;
308: } errors[] = {
309: {SD_RES_SUCCESS, "Success"},
310: {SD_RES_UNKNOWN, "Unknown error"},
311: {SD_RES_NO_OBJ, "No object found"},
312: {SD_RES_EIO, "I/O error"},
313: {SD_RES_VDI_EXIST, "VDI exists already"},
314: {SD_RES_INVALID_PARMS, "Invalid parameters"},
315: {SD_RES_SYSTEM_ERROR, "System error"},
316: {SD_RES_VDI_LOCKED, "VDI is already locked"},
317: {SD_RES_NO_VDI, "No vdi found"},
318: {SD_RES_NO_BASE_VDI, "No base VDI found"},
319: {SD_RES_VDI_READ, "Failed read the requested VDI"},
320: {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
321: {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
322: {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
323: {SD_RES_NO_TAG, "Failed to find the requested tag"},
324: {SD_RES_STARTUP, "The system is still booting"},
325: {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
326: {SD_RES_SHUTDOWN, "The system is shutting down"},
327: {SD_RES_NO_MEM, "Out of memory on the server"},
328: {SD_RES_FULL_VDI, "We already have the maximum vdis"},
329: {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
330: {SD_RES_NO_SPACE, "Server has no space for new objects"},
331: {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
332: {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
333: {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
334: };
335:
336: for (i = 0; i < ARRAY_SIZE(errors); ++i) {
337: if (errors[i].err == err) {
338: return errors[i].desc;
339: }
340: }
341:
342: return "Invalid error code";
343: }
344:
345: /*
346: * Sheepdog I/O handling:
347: *
348: * 1. In the sd_aio_readv/writev, read/write requests are added to the
349: * QEMU Bottom Halves.
350: *
351: * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we send the I/O
352: * requests to the server and link the requests to the
353: * outstanding_list in the BDRVSheepdogState. We exit the
354: * function without waiting for the response.
355: *
356: * 3. We receive the response in aio_read_response, the fd handler to
357: * the sheepdog connection. If metadata update is needed, we send
358: * the write request to the vdi object in sd_write_done, the write
359: * completion function. The AIOCB callback is not called until all
360: * the requests belonging to the AIOCB are finished.
361: */
362:
363: static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
364: uint64_t oid, unsigned int data_len,
365: uint64_t offset, uint8_t flags,
366: uint64_t base_oid, unsigned int iov_offset)
367: {
368: AIOReq *aio_req;
369:
370: aio_req = qemu_malloc(sizeof(*aio_req));
371: aio_req->aiocb = acb;
372: aio_req->iov_offset = iov_offset;
373: aio_req->oid = oid;
374: aio_req->base_oid = base_oid;
375: aio_req->offset = offset;
376: aio_req->data_len = data_len;
377: aio_req->flags = flags;
378: aio_req->id = s->aioreq_seq_num++;
379:
380: QLIST_INSERT_HEAD(&s->outstanding_aio_head, aio_req,
381: outstanding_aio_siblings);
382: QLIST_INSERT_HEAD(&acb->aioreq_head, aio_req, aioreq_siblings);
383:
384: return aio_req;
385: }
386:
387: static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
388: {
389: SheepdogAIOCB *acb = aio_req->aiocb;
390: QLIST_REMOVE(aio_req, outstanding_aio_siblings);
391: QLIST_REMOVE(aio_req, aioreq_siblings);
392: qemu_free(aio_req);
393:
394: return !QLIST_EMPTY(&acb->aioreq_head);
395: }
396:
397: static void sd_finish_aiocb(SheepdogAIOCB *acb)
398: {
399: if (!acb->canceled) {
400: acb->common.cb(acb->common.opaque, acb->ret);
401: }
402: qemu_aio_release(acb);
403: }
404:
405: static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
406: {
407: SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
408:
409: /*
410: * Sheepdog cannot cancel the requests which are already sent to
411: * the servers, so we just complete the request with -EIO here.
412: */
413: acb->common.cb(acb->common.opaque, -EIO);
414: acb->canceled = 1;
415: }
416:
417: static AIOPool sd_aio_pool = {
418: .aiocb_size = sizeof(SheepdogAIOCB),
419: .cancel = sd_aio_cancel,
420: };
421:
422: static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
423: int64_t sector_num, int nb_sectors,
424: BlockDriverCompletionFunc *cb, void *opaque)
425: {
426: SheepdogAIOCB *acb;
427:
428: acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque);
429:
430: acb->qiov = qiov;
431:
432: acb->sector_num = sector_num;
433: acb->nb_sectors = nb_sectors;
434:
435: acb->aio_done_func = NULL;
436: acb->canceled = 0;
437: acb->bh = NULL;
438: acb->ret = 0;
439: QLIST_INIT(&acb->aioreq_head);
440: return acb;
441: }
442:
443: static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
444: {
445: if (acb->bh) {
446: error_report("bug: %d %d\n", acb->aiocb_type, acb->aiocb_type);
447: return -EIO;
448: }
449:
450: acb->bh = qemu_bh_new(cb, acb);
451: if (!acb->bh) {
452: error_report("oom: %d %d\n", acb->aiocb_type, acb->aiocb_type);
453: return -EIO;
454: }
455:
456: qemu_bh_schedule(acb->bh);
457:
458: return 0;
459: }
460:
461: #ifdef _WIN32
462:
/*
 * Stand-in for struct msghdr, compiled only when _WIN32 is defined.  It
 * has just the two members that do_send_recv() fills in.
 */
463: struct msghdr {
464: struct iovec *msg_iov; /* array of buffers */
465: size_t msg_iovlen; /* number of entries in msg_iov */
466: };
467:
468: static ssize_t sendmsg(int s, const struct msghdr *msg, int flags)
469: {
470: size_t size = 0;
471: char *buf, *p;
472: int i, ret;
473:
474: /* count the msg size */
475: for (i = 0; i < msg->msg_iovlen; i++) {
476: size += msg->msg_iov[i].iov_len;
477: }
478: buf = qemu_malloc(size);
479:
480: p = buf;
481: for (i = 0; i < msg->msg_iovlen; i++) {
482: memcpy(p, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len);
483: p += msg->msg_iov[i].iov_len;
484: }
485:
486: ret = send(s, buf, size, flags);
487:
488: qemu_free(buf);
489: return ret;
490: }
491:
492: static ssize_t recvmsg(int s, struct msghdr *msg, int flags)
493: {
494: size_t size = 0;
495: char *buf, *p;
496: int i, ret;
497:
498: /* count the msg size */
499: for (i = 0; i < msg->msg_iovlen; i++) {
500: size += msg->msg_iov[i].iov_len;
501: }
502: buf = qemu_malloc(size);
503:
504: ret = recv(s, buf, size, flags);
505: if (ret < 0) {
506: goto out;
507: }
508:
509: p = buf;
510: for (i = 0; i < msg->msg_iovlen; i++) {
511: memcpy(msg->msg_iov[i].iov_base, p, msg->msg_iov[i].iov_len);
512: p += msg->msg_iov[i].iov_len;
513: }
514: out:
515: qemu_free(buf);
516: return ret;
517: }
518:
519: #endif
520:
521: /*
522: * Send/recv data with iovec buffers
523: *
524: * This function sends/receives data from/to the iovec buffer directly.
525: * The first `offset' bytes in the iovec buffer are skipped and the next
526: * `len' bytes are used.
527: *
528: * For example,
529: *
530: * do_send_recv(sockfd, iov, len, offset, 1);
531: *
532: * is equivalent to
533: *
534: * char *buf = malloc(len);
535: * iov_to_buf(iov, iovcnt, buf, offset, len);
536: * send(sockfd, buf, len, 0);
537: * free(buf);
538: */
/*
 * Perform one sendmsg() (write != 0) or recvmsg() (write == 0) on the
 * `len' bytes that start `offset' bytes into the iovec array `iov'.
 *
 * The iovec entries at both ends of the window are trimmed in place for
 * the duration of the call and restored before returning.  Returns the
 * value of sendmsg()/recvmsg(); the transfer may be shorter than `len'.
 */
539: static int do_send_recv(int sockfd, struct iovec *iov, int len, int offset,
540: int write)
541: {
542: struct msghdr msg;
543: int ret, diff;
544:
545: memset(&msg, 0, sizeof(msg));
546: msg.msg_iov = iov;
547: msg.msg_iovlen = 1;
548:
/* from here on `len' is the end of the window, counted from iov[0] */
549: len += offset;
550:
/* advance `iov' to the entry that contains the end of the window */
551: while (iov->iov_len < len) {
552: len -= iov->iov_len;
553:
554: iov++;
555: msg.msg_iovlen++;
556: }
557:
/* cut the last entry at the end of the window; `diff' is restored below */
558: diff = iov->iov_len - len;
559: iov->iov_len -= diff;
560:
/* drop the leading entries that lie entirely before `offset' */
561: while (msg.msg_iov->iov_len <= offset) {
562: offset -= msg.msg_iov->iov_len;
563:
564: msg.msg_iov++;
565: msg.msg_iovlen--;
566: }
567:
/* skip the remaining `offset' bytes inside the first entry */
568: msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base + offset;
569: msg.msg_iov->iov_len -= offset;
570:
571: if (write) {
572: ret = sendmsg(sockfd, &msg, 0);
573: } else {
574: ret = recvmsg(sockfd, &msg, 0);
575: }
576:
/* undo the adjustments so that the caller's iovec array is unchanged */
577: msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base - offset;
578: msg.msg_iov->iov_len += offset;
579:
580: iov->iov_len += diff;
581: return ret;
582: }
583:
/*
 * Open a TCP connection to a sheepdog server.
 *
 * When `addr' is NULL, SD_DEFAULT_ADDR and SD_DEFAULT_PORT are used.
 * Every address returned by getaddrinfo() is tried in turn until one
 * connection succeeds; a socket whose connect() failed is closed before
 * the next address is tried.
 *
 * Returns the connected socket, or -1 on failure.
 */
static int connect_to_sdog(const char *addr, const char *port)
{
    char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
    int fd, ret;
    struct addrinfo hints, *res, *res0;

    if (!addr) {
        addr = SD_DEFAULT_ADDR;
        port = SD_DEFAULT_PORT;
    }

    memset(&hints, 0, sizeof(hints));
    hints.ai_socktype = SOCK_STREAM;

    ret = getaddrinfo(addr, port, &hints, &res0);
    if (ret) {
        /* getaddrinfo() reports errors through its return value, not errno */
        error_report("unable to get address info %s, %s\n",
                     addr, gai_strerror(ret));
        return -1;
    }

    for (res = res0; res; res = res->ai_next) {
        ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
                          sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
        if (ret) {
            continue;
        }

        fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
        if (fd < 0) {
            continue;
        }

        do {
            ret = connect(fd, res->ai_addr, res->ai_addrlen);
        } while (ret < 0 && errno == EINTR);

        if (ret < 0) {
            /* keep the connect() error for callers that print errno */
            int saved_errno = errno;

            closesocket(fd);
            errno = saved_errno;
            continue;
        }

        dprintf("connected to %s:%s\n", addr, port);
        goto success;
    }
    fd = -1;
    error_report("failed connect to %s:%s\n", addr, port);
success:
    freeaddrinfo(res0);
    return fd;
}
635:
/*
 * Transfer exactly `len' bytes between the socket and the iovec array
 * `iov', starting `iov_offset' bytes into the array.  `write' selects the
 * direction (non-zero sends, zero receives).
 *
 * Short transfers are continued and EINTR/EAGAIN are retried.  A transfer
 * of zero bytes completes immediately.  When the transfer makes no
 * progress because the peer closed the connection, the function fails
 * with errno set to EIO instead of retrying forever.
 *
 * Returns 0 on success and 1 on failure.
 */
static int do_readv_writev(int sockfd, struct iovec *iov, int len,
                           int iov_offset, int write)
{
    int ret;

    while (len) {
        ret = do_send_recv(sockfd, iov, len, iov_offset, write);
        if (ret < 0) {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            if (write) {
                error_report("failed to send data, %s\n", strerror(errno));
            } else {
                error_report("failed to recv a rsp, %s\n", strerror(errno));
            }
            return 1;
        }
        if (ret == 0) {
            /* no progress: the peer has closed the connection */
            error_report("connection closed by the server\n");
            errno = EIO;
            return 1;
        }

        iov_offset += ret;
        len -= ret;
    }

    return 0;
}
658:
/* Receive `len' bytes into `iov', starting `iov_offset' bytes into it. */
static int do_readv(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 0;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
663:
/* Send `len' bytes from `iov', starting `iov_offset' bytes into it. */
static int do_writev(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 1;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
668:
/*
 * Transfer `len' bytes between the socket and the flat buffer `buf' by
 * wrapping the buffer in a single element iovec.
 */
static int do_read_write(int sockfd, void *buf, int len, int write)
{
    struct iovec single;

    single.iov_len = len;
    single.iov_base = buf;

    return do_readv_writev(sockfd, &single, len, 0, write);
}
678:
/* Receive `len' bytes from the socket into `buf'. */
static int do_read(int sockfd, void *buf, int len)
{
    const int is_write = 0;

    return do_read_write(sockfd, buf, len, is_write);
}
683:
/* Send `len' bytes from `buf' to the socket. */
static int do_write(int sockfd, void *buf, int len)
{
    const int is_write = 1;

    return do_read_write(sockfd, buf, len, is_write);
}
688:
689: static int send_req(int sockfd, SheepdogReq *hdr, void *data,
690: unsigned int *wlen)
691: {
692: int ret;
693: struct iovec iov[2];
694:
695: iov[0].iov_base = hdr;
696: iov[0].iov_len = sizeof(*hdr);
697:
698: if (*wlen) {
699: iov[1].iov_base = data;
700: iov[1].iov_len = *wlen;
701: }
702:
703: ret = do_writev(sockfd, iov, sizeof(*hdr) + *wlen, 0);
704: if (ret) {
705: error_report("failed to send a req, %s\n", strerror(errno));
706: ret = -1;
707: }
708:
709: return ret;
710: }
711:
712: static int do_req(int sockfd, SheepdogReq *hdr, void *data,
713: unsigned int *wlen, unsigned int *rlen)
714: {
715: int ret;
716:
717: ret = send_req(sockfd, hdr, data, wlen);
718: if (ret) {
719: ret = -1;
720: goto out;
721: }
722:
723: ret = do_read(sockfd, hdr, sizeof(*hdr));
724: if (ret) {
725: error_report("failed to get a rsp, %s\n", strerror(errno));
726: ret = -1;
727: goto out;
728: }
729:
730: if (*rlen > hdr->data_length) {
731: *rlen = hdr->data_length;
732: }
733:
734: if (*rlen) {
735: ret = do_read(sockfd, data, *rlen);
736: if (ret) {
737: error_report("failed to get the data, %s\n", strerror(errno));
738: ret = -1;
739: goto out;
740: }
741: }
742: ret = 0;
743: out:
744: return ret;
745: }
746:
/* Forward declaration: send_pending_req() below calls it before its definition. */
747: static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
748: struct iovec *iov, int niov, int create,
749: enum AIOCBState aiocb_type);
750:
751: /*
752: * This function searchs pending requests to the object `oid', and
753: * sends them.
754: */
755: static void send_pending_req(BDRVSheepdogState *s, uint64_t oid, uint32_t id)
756: {
757: AIOReq *aio_req, *next;
758: SheepdogAIOCB *acb;
759: int ret;
760:
761: QLIST_FOREACH_SAFE(aio_req, &s->outstanding_aio_head,
762: outstanding_aio_siblings, next) {
763: if (id == aio_req->id) {
764: continue;
765: }
766: if (aio_req->oid != oid) {
767: continue;
768: }
769:
770: acb = aio_req->aiocb;
771: ret = add_aio_request(s, aio_req, acb->qiov->iov,
772: acb->qiov->niov, 0, acb->aiocb_type);
773: if (ret < 0) {
774: error_report("add_aio_request is failed\n");
775: free_aio_req(s, aio_req);
776: if (QLIST_EMPTY(&acb->aioreq_head)) {
777: sd_finish_aiocb(acb);
778: }
779: }
780: }
781: }
782:
783: /*
784: * Receive responses of the I/O requests.
785: *
786: * This function is registered as a fd handler, and called from the
787: * main loop when s->fd is ready for reading responses.
788: */
/*
 * Read one response header from s->fd, look up the outstanding AIOReq with
 * the same id, perform the work that depends on the kind of AIOCB, then
 * free the AIOReq.  When it was the last request of its AIOCB, the AIOCB's
 * aio_done_func is called.
 *
 * NOTE(review): the early returns after a failed read or a failed lookup
 * leave the AIOReq (if any) on the outstanding list.
 */
789: static void aio_read_response(void *opaque)
790: {
791: SheepdogObjRsp rsp;
792: BDRVSheepdogState *s = opaque;
793: int fd = s->fd;
794: int ret;
795: AIOReq *aio_req = NULL;
796: SheepdogAIOCB *acb;
797: int rest;
798: unsigned long idx;
799:
/* nothing is waiting for a response */
800: if (QLIST_EMPTY(&s->outstanding_aio_head)) {
801: return;
802: }
803:
804: /* read a header */
805: ret = do_read(fd, &rsp, sizeof(rsp));
806: if (ret) {
807: error_report("failed to get the header, %s\n", strerror(errno));
808: return;
809: }
810:
811: /* find the right aio_req from the outstanding_aio list */
812: QLIST_FOREACH(aio_req, &s->outstanding_aio_head, outstanding_aio_siblings) {
813: if (aio_req->id == rsp.id) {
814: break;
815: }
816: }
817: if (!aio_req) {
818: error_report("cannot find aio_req %x\n", rsp.id);
819: return;
820: }
821:
822: acb = aio_req->aiocb;
823:
824: switch (acb->aiocb_type) {
825: case AIOCB_WRITE_UDATA:
/* only writes to data objects can change the inode's object table */
826: if (!is_data_obj(aio_req->oid)) {
827: break;
828: }
829: idx = data_oid_to_idx(aio_req->oid);
830:
831: if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) {
832: /*
833: * If the object is newly created one, we need to update
834: * the vdi object (metadata object). min_dirty_data_idx
835: * and max_dirty_data_idx are changed to include updated
836: * index between them.
837: */
838: s->inode.data_vdi_id[idx] = s->inode.vdi_id;
839: s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
840: s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
841:
842: /*
843: * Some requests may be blocked because simultaneous
844: * create requests are not allowed, so we search the
845: * pending requests here.
846: */
847: send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx), rsp.id);
848: }
849: break;
850: case AIOCB_READ_UDATA:
/*
 * Read the data that follows the header into the caller's qiov.
 * NOTE(review): rsp.data_length is not compared with aio_req->data_len
 * here -- confirm that a larger value cannot overrun the qiov.
 */
851: ret = do_readv(fd, acb->qiov->iov, rsp.data_length,
852: aio_req->iov_offset);
853: if (ret) {
854: error_report("failed to get the data, %s\n", strerror(errno));
855: return;
856: }
857: break;
858: }
859:
/* a failed request makes the whole AIOCB complete with -EIO */
860: if (rsp.result != SD_RES_SUCCESS) {
861: acb->ret = -EIO;
862: error_report("%s\n", sd_strerror(rsp.result));
863: }
864:
865: rest = free_aio_req(s, aio_req);
866: if (!rest) {
867: /*
868: * We've finished all requests which belong to the AIOCB, so
869: * we can call the callback now.
870: */
871: acb->aio_done_func(acb);
872: }
873: }
874:
875: static int aio_flush_request(void *opaque)
876: {
877: BDRVSheepdogState *s = opaque;
878:
879: return !QLIST_EMPTY(&s->outstanding_aio_head);
880: }
881:
/*
 * Set the TCP_CORK option of `fd' to `v'.  Where SOL_TCP or TCP_CORK is
 * not available the function does nothing and returns 0.
 */
#if defined(SOL_TCP) && defined(TCP_CORK)

static int set_cork(int fd, int v)
{
    return setsockopt(fd, SOL_TCP, TCP_CORK, &v, sizeof(v));
}

#else

static int set_cork(int fd, int v)
{
    return 0;
}

#endif
897:
/* Enable TCP_NODELAY on `fd'; returns the result of setsockopt(). */
static int set_nodelay(int fd)
{
    int enable = 1;

    return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&enable,
                      sizeof(enable));
}
906:
907: /*
908: * Return a socket discriptor to read/write objects.
909: *
910: * We cannot use this discriptor for other operations because
911: * the block driver may be on waiting response from the server.
912: */
913: static int get_sheep_fd(BDRVSheepdogState *s)
914: {
915: int ret, fd;
916:
917: fd = connect_to_sdog(s->addr, s->port);
918: if (fd < 0) {
919: error_report("%s\n", strerror(errno));
920: return -1;
921: }
922:
923: socket_set_nonblock(fd);
924:
925: ret = set_nodelay(fd);
926: if (ret) {
927: error_report("%s\n", strerror(errno));
928: closesocket(fd);
929: return -1;
930: }
931:
932: qemu_aio_set_fd_handler(fd, aio_read_response, NULL, aio_flush_request,
933: NULL, s);
934: return fd;
935: }
936:
937: /*
938: * Parse a filename
939: *
940: * filename must be one of the following formats:
941: * 1. [vdiname]
942: * 2. [vdiname]:[snapid]
943: * 3. [vdiname]:[tag]
944: * 4. [hostname]:[port]:[vdiname]
945: * 5. [hostname]:[port]:[vdiname]:[snapid]
946: * 6. [hostname]:[port]:[vdiname]:[tag]
947: *
948: * You can boot from the snapshot images by specifying `snapid` or
949: * `tag'.
950: *
951: * You can run VMs outside the Sheepdog cluster by specifying
952: * `hostname' and `port' (experimental).
953: */
954: static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
955: char *vdi, uint32_t *snapid, char *tag)
956: {
957: char *p, *q;
958: int nr_sep;
959:
960: p = q = qemu_strdup(filename);
961:
962: /* count the number of separators */
963: nr_sep = 0;
964: while (*p) {
965: if (*p == ':') {
966: nr_sep++;
967: }
968: p++;
969: }
970: p = q;
971:
972: /* use the first two tokens as hostname and port number. */
973: if (nr_sep >= 2) {
974: s->addr = p;
975: p = strchr(p, ':');
976: *p++ = '\0';
977:
978: s->port = p;
979: p = strchr(p, ':');
980: *p++ = '\0';
981: } else {
982: s->addr = NULL;
983: s->port = 0;
984: }
985:
986: strncpy(vdi, p, SD_MAX_VDI_LEN);
987:
988: p = strchr(vdi, ':');
989: if (p) {
990: *p++ = '\0';
991: *snapid = strtoul(p, NULL, 10);
992: if (*snapid == 0) {
993: strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
994: }
995: } else {
996: *snapid = CURRENT_VDI_ID; /* search current vdi */
997: }
998:
999: if (s->addr == NULL) {
1000: qemu_free(q);
1001: }
1002:
1003: return 0;
1004: }
1005:
1006: static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
1007: char *tag, uint32_t *vid, int for_snapshot)
1008: {
1009: int ret, fd;
1010: SheepdogVdiReq hdr;
1011: SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1012: unsigned int wlen, rlen = 0;
1013: char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
1014:
1015: fd = connect_to_sdog(s->addr, s->port);
1016: if (fd < 0) {
1017: return -1;
1018: }
1019:
1020: memset(buf, 0, sizeof(buf));
1021: strncpy(buf, filename, SD_MAX_VDI_LEN);
1022: strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
1023:
1024: memset(&hdr, 0, sizeof(hdr));
1025: if (for_snapshot) {
1026: hdr.opcode = SD_OP_GET_VDI_INFO;
1027: } else {
1028: hdr.opcode = SD_OP_LOCK_VDI;
1029: }
1030: wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
1031: hdr.proto_ver = SD_PROTO_VER;
1032: hdr.data_length = wlen;
1033: hdr.snapid = snapid;
1034: hdr.flags = SD_FLAG_CMD_WRITE;
1035:
1036: ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1037: if (ret) {
1038: ret = -1;
1039: goto out;
1040: }
1041:
1042: if (rsp->result != SD_RES_SUCCESS) {
1043: error_report("cannot get vdi info, %s, %s %d %s\n",
1044: sd_strerror(rsp->result), filename, snapid, tag);
1045: ret = -1;
1046: goto out;
1047: }
1048: *vid = rsp->vdi_id;
1049:
1050: ret = 0;
1051: out:
1052: closesocket(fd);
1053: return ret;
1054: }
1055:
1056: static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
1057: struct iovec *iov, int niov, int create,
1058: enum AIOCBState aiocb_type)
1059: {
1060: int nr_copies = s->inode.nr_copies;
1061: SheepdogObjReq hdr;
1062: unsigned int wlen;
1063: int ret;
1064: uint64_t oid = aio_req->oid;
1065: unsigned int datalen = aio_req->data_len;
1066: uint64_t offset = aio_req->offset;
1067: uint8_t flags = aio_req->flags;
1068: uint64_t old_oid = aio_req->base_oid;
1069:
1070: if (!nr_copies) {
1071: error_report("bug\n");
1072: }
1073:
1074: memset(&hdr, 0, sizeof(hdr));
1075:
1076: if (aiocb_type == AIOCB_READ_UDATA) {
1077: wlen = 0;
1078: hdr.opcode = SD_OP_READ_OBJ;
1079: hdr.flags = flags;
1080: } else if (create) {
1081: wlen = datalen;
1082: hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1083: hdr.flags = SD_FLAG_CMD_WRITE | flags;
1084: } else {
1085: wlen = datalen;
1086: hdr.opcode = SD_OP_WRITE_OBJ;
1087: hdr.flags = SD_FLAG_CMD_WRITE | flags;
1088: }
1089:
1090: hdr.oid = oid;
1091: hdr.cow_oid = old_oid;
1092: hdr.copies = s->inode.nr_copies;
1093:
1094: hdr.data_length = datalen;
1095: hdr.offset = offset;
1096:
1097: hdr.id = aio_req->id;
1098:
1099: set_cork(s->fd, 1);
1100:
1101: /* send a header */
1102: ret = do_write(s->fd, &hdr, sizeof(hdr));
1103: if (ret) {
1104: error_report("failed to send a req, %s\n", strerror(errno));
1105: return -EIO;
1106: }
1107:
1108: if (wlen) {
1109: ret = do_writev(s->fd, iov, wlen, aio_req->iov_offset);
1110: if (ret) {
1111: error_report("failed to send a data, %s\n", strerror(errno));
1112: return -EIO;
1113: }
1114: }
1115:
1116: set_cork(s->fd, 0);
1117:
1118: return 0;
1119: }
1120:
1121: static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
1122: unsigned int datalen, uint64_t offset,
1123: int write, int create)
1124: {
1125: SheepdogObjReq hdr;
1126: SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1127: unsigned int wlen, rlen;
1128: int ret;
1129:
1130: memset(&hdr, 0, sizeof(hdr));
1131:
1132: if (write) {
1133: wlen = datalen;
1134: rlen = 0;
1135: hdr.flags = SD_FLAG_CMD_WRITE;
1136: if (create) {
1137: hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1138: } else {
1139: hdr.opcode = SD_OP_WRITE_OBJ;
1140: }
1141: } else {
1142: wlen = 0;
1143: rlen = datalen;
1144: hdr.opcode = SD_OP_READ_OBJ;
1145: }
1146: hdr.oid = oid;
1147: hdr.data_length = datalen;
1148: hdr.offset = offset;
1149: hdr.copies = copies;
1150:
1151: ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1152: if (ret) {
1153: error_report("failed to send a request to the sheep\n");
1154: return -1;
1155: }
1156:
1157: switch (rsp->result) {
1158: case SD_RES_SUCCESS:
1159: return 0;
1160: default:
1161: error_report("%s\n", sd_strerror(rsp->result));
1162: return -1;
1163: }
1164: }
1165:
/* Synchronously read `datalen' bytes of object `oid' at `offset' into `buf'. */
static int read_object(int fd, char *buf, uint64_t oid, int copies,
                       unsigned int datalen, uint64_t offset)
{
    const int is_write = 0;
    const int create = 0;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, create);
}
1171:
/*
 * Synchronously write `datalen' bytes from `buf' to object `oid' at
 * `offset'; a non-zero `create' requests SD_OP_CREATE_AND_WRITE_OBJ.
 */
static int write_object(int fd, char *buf, uint64_t oid, int copies,
                        unsigned int datalen, uint64_t offset, int create)
{
    const int is_write = 1;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, create);
}
1177:
1178: static int sd_open(BlockDriverState *bs, const char *filename, int flags)
1179: {
1180: int ret, fd;
1181: uint32_t vid = 0;
1182: BDRVSheepdogState *s = bs->opaque;
1183: char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1184: uint32_t snapid;
1185: char *buf = NULL;
1186:
1187: strstart(filename, "sheepdog:", (const char **)&filename);
1188:
1189: QLIST_INIT(&s->outstanding_aio_head);
1190: s->fd = -1;
1191:
1192: memset(vdi, 0, sizeof(vdi));
1193: memset(tag, 0, sizeof(tag));
1194: if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) {
1195: goto out;
1196: }
1197: s->fd = get_sheep_fd(s);
1198: if (s->fd < 0) {
1199: goto out;
1200: }
1201:
1202: ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
1203: if (ret) {
1204: goto out;
1205: }
1206:
1207: if (snapid) {
1208: dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
1209: s->is_snapshot = 1;
1210: }
1211:
1212: fd = connect_to_sdog(s->addr, s->port);
1213: if (fd < 0) {
1214: error_report("failed to connect\n");
1215: goto out;
1216: }
1217:
1218: buf = qemu_malloc(SD_INODE_SIZE);
1219: ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0);
1220:
1221: closesocket(fd);
1222:
1223: if (ret) {
1224: goto out;
1225: }
1226:
1227: memcpy(&s->inode, buf, sizeof(s->inode));
1228: s->min_dirty_data_idx = UINT32_MAX;
1229: s->max_dirty_data_idx = 0;
1230:
1231: bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
1232: strncpy(s->name, vdi, sizeof(s->name));
1233: qemu_free(buf);
1234: return 0;
1235: out:
1236: qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
1237: if (s->fd >= 0) {
1238: closesocket(s->fd);
1239: }
1240: qemu_free(buf);
1241: return -1;
1242: }
1243:
1244: static int do_sd_create(char *filename, int64_t vdi_size,
1245: uint32_t base_vid, uint32_t *vdi_id, int snapshot,
1246: const char *addr, const char *port)
1247: {
1248: SheepdogVdiReq hdr;
1249: SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1250: int fd, ret;
1251: unsigned int wlen, rlen = 0;
1252: char buf[SD_MAX_VDI_LEN];
1253:
1254: fd = connect_to_sdog(addr, port);
1255: if (fd < 0) {
1256: return -EIO;
1257: }
1258:
1259: memset(buf, 0, sizeof(buf));
1260: strncpy(buf, filename, SD_MAX_VDI_LEN);
1261:
1262: memset(&hdr, 0, sizeof(hdr));
1263: hdr.opcode = SD_OP_NEW_VDI;
1264: hdr.base_vdi_id = base_vid;
1265:
1266: wlen = SD_MAX_VDI_LEN;
1267:
1268: hdr.flags = SD_FLAG_CMD_WRITE;
1269: hdr.snapid = snapshot;
1270:
1271: hdr.data_length = wlen;
1272: hdr.vdi_size = vdi_size;
1273:
1274: ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1275:
1276: closesocket(fd);
1277:
1278: if (ret) {
1279: return -EIO;
1280: }
1281:
1282: if (rsp->result != SD_RES_SUCCESS) {
1283: error_report("%s, %s\n", sd_strerror(rsp->result), filename);
1284: return -EIO;
1285: }
1286:
1287: if (vdi_id) {
1288: *vdi_id = rsp->vdi_id;
1289: }
1290:
1291: return 0;
1292: }
1293:
1294: static int sd_create(const char *filename, QEMUOptionParameter *options)
1295: {
1296: int ret;
1.1.1.2 ! root 1297: uint32_t vid = 0, base_vid = 0;
1.1 root 1298: int64_t vdi_size = 0;
1299: char *backing_file = NULL;
1.1.1.2 ! root 1300: BDRVSheepdogState s;
! 1301: char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
! 1302: uint32_t snapid;
1.1 root 1303:
1304: strstart(filename, "sheepdog:", (const char **)&filename);
1305:
1.1.1.2 ! root 1306: memset(&s, 0, sizeof(s));
! 1307: memset(vdi, 0, sizeof(vdi));
! 1308: memset(tag, 0, sizeof(tag));
! 1309: if (parse_vdiname(&s, filename, vdi, &snapid, tag) < 0) {
! 1310: error_report("invalid filename\n");
! 1311: return -EINVAL;
! 1312: }
! 1313:
1.1 root 1314: while (options && options->name) {
1315: if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1316: vdi_size = options->value.n;
1317: } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1318: backing_file = options->value.s;
1319: }
1320: options++;
1321: }
1322:
1323: if (vdi_size > SD_MAX_VDI_SIZE) {
1324: error_report("too big image size\n");
1325: return -EINVAL;
1326: }
1327:
1328: if (backing_file) {
1329: BlockDriverState *bs;
1330: BDRVSheepdogState *s;
1331: BlockDriver *drv;
1332:
1333: /* Currently, only Sheepdog backing image is supported. */
1334: drv = bdrv_find_protocol(backing_file);
1335: if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
1336: error_report("backing_file must be a sheepdog image\n");
1337: return -EINVAL;
1338: }
1339:
1340: ret = bdrv_file_open(&bs, backing_file, 0);
1341: if (ret < 0)
1342: return -EIO;
1343:
1344: s = bs->opaque;
1345:
1346: if (!is_snapshot(&s->inode)) {
1347: error_report("cannot clone from a non snapshot vdi\n");
1348: bdrv_delete(bs);
1349: return -EINVAL;
1350: }
1351:
1.1.1.2 ! root 1352: base_vid = s->inode.vdi_id;
1.1 root 1353: bdrv_delete(bs);
1354: }
1355:
1.1.1.2 ! root 1356: return do_sd_create((char *)vdi, vdi_size, base_vid, &vid, 0, s.addr, s.port);
1.1 root 1357: }
1358:
1359: static void sd_close(BlockDriverState *bs)
1360: {
1361: BDRVSheepdogState *s = bs->opaque;
1362: SheepdogVdiReq hdr;
1363: SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1364: unsigned int wlen, rlen = 0;
1365: int fd, ret;
1366:
1367: dprintf("%s\n", s->name);
1368:
1369: fd = connect_to_sdog(s->addr, s->port);
1370: if (fd < 0) {
1371: return;
1372: }
1373:
1374: memset(&hdr, 0, sizeof(hdr));
1375:
1376: hdr.opcode = SD_OP_RELEASE_VDI;
1377: wlen = strlen(s->name) + 1;
1378: hdr.data_length = wlen;
1379: hdr.flags = SD_FLAG_CMD_WRITE;
1380:
1381: ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);
1382:
1383: closesocket(fd);
1384:
1385: if (!ret && rsp->result != SD_RES_SUCCESS &&
1386: rsp->result != SD_RES_VDI_NOT_LOCKED) {
1387: error_report("%s, %s\n", sd_strerror(rsp->result), s->name);
1388: }
1389:
1390: qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
1391: closesocket(s->fd);
1392: qemu_free(s->addr);
1393: }
1394:
1395: static int64_t sd_getlength(BlockDriverState *bs)
1396: {
1397: BDRVSheepdogState *s = bs->opaque;
1398:
1399: return s->inode.vdi_size;
1400: }
1401:
1402: static int sd_truncate(BlockDriverState *bs, int64_t offset)
1403: {
1404: BDRVSheepdogState *s = bs->opaque;
1405: int ret, fd;
1406: unsigned int datalen;
1407:
1408: if (offset < s->inode.vdi_size) {
1409: error_report("shrinking is not supported\n");
1410: return -EINVAL;
1411: } else if (offset > SD_MAX_VDI_SIZE) {
1412: error_report("too big image size\n");
1413: return -EINVAL;
1414: }
1415:
1416: fd = connect_to_sdog(s->addr, s->port);
1417: if (fd < 0) {
1418: return -EIO;
1419: }
1420:
1421: /* we don't need to update entire object */
1422: datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1423: s->inode.vdi_size = offset;
1424: ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1425: s->inode.nr_copies, datalen, 0, 0);
1426: close(fd);
1427:
1428: if (ret < 0) {
1429: error_report("failed to update an inode.\n");
1430: return -EIO;
1431: }
1432:
1433: return 0;
1434: }
1435:
1436: /*
1437: * This function is called after writing data objects. If we need to
1438: * update metadata, this sends a write request to the vdi object.
1439: * Otherwise, this calls the AIOCB callback.
1440: */
1441: static void sd_write_done(SheepdogAIOCB *acb)
1442: {
1443: int ret;
1444: BDRVSheepdogState *s = acb->common.bs->opaque;
1445: struct iovec iov;
1446: AIOReq *aio_req;
1447: uint32_t offset, data_len, mn, mx;
1448:
1449: mn = s->min_dirty_data_idx;
1450: mx = s->max_dirty_data_idx;
1451: if (mn <= mx) {
1452: /* we need to update the vdi object. */
1453: offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
1454: mn * sizeof(s->inode.data_vdi_id[0]);
1455: data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
1456:
1457: s->min_dirty_data_idx = UINT32_MAX;
1458: s->max_dirty_data_idx = 0;
1459:
1460: iov.iov_base = &s->inode;
1461: iov.iov_len = sizeof(s->inode);
1462: aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
1463: data_len, offset, 0, 0, offset);
1464: ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA);
1465: if (ret) {
1466: free_aio_req(s, aio_req);
1467: acb->ret = -EIO;
1468: goto out;
1469: }
1470:
1471: acb->aio_done_func = sd_finish_aiocb;
1472: acb->aiocb_type = AIOCB_WRITE_UDATA;
1473: return;
1474: }
1475: out:
1476: sd_finish_aiocb(acb);
1477: }
1478:
1479: /*
1480: * Create a writable VDI from a snapshot
1481: */
1482: static int sd_create_branch(BDRVSheepdogState *s)
1483: {
1484: int ret, fd;
1485: uint32_t vid;
1486: char *buf;
1487:
1488: dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
1489:
1490: buf = qemu_malloc(SD_INODE_SIZE);
1491:
1492: ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1,
1493: s->addr, s->port);
1494: if (ret) {
1495: goto out;
1496: }
1497:
1498: dprintf("%" PRIx32 " is created.\n", vid);
1499:
1500: fd = connect_to_sdog(s->addr, s->port);
1501: if (fd < 0) {
1502: error_report("failed to connect\n");
1503: goto out;
1504: }
1505:
1506: ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1507: SD_INODE_SIZE, 0);
1508:
1509: closesocket(fd);
1510:
1511: if (ret < 0) {
1512: goto out;
1513: }
1514:
1515: memcpy(&s->inode, buf, sizeof(s->inode));
1516:
1517: s->is_snapshot = 0;
1518: ret = 0;
1519: dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
1520:
1521: out:
1522: qemu_free(buf);
1523:
1524: return ret;
1525: }
1526:
1527: /*
1528: * Send I/O requests to the server.
1529: *
1530: * This function sends requests to the server, links the requests to
1531: * the outstanding_list in BDRVSheepdogState, and exits without
1532: * waiting the response. The responses are received in the
1533: * `aio_read_response' function which is called from the main loop as
1534: * a fd handler.
1535: */
1536: static void sd_readv_writev_bh_cb(void *p)
1537: {
1538: SheepdogAIOCB *acb = p;
1539: int ret = 0;
1540: unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE;
1541: unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE;
1542: uint64_t oid;
1543: uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
1544: BDRVSheepdogState *s = acb->common.bs->opaque;
1545: SheepdogInode *inode = &s->inode;
1546: AIOReq *aio_req;
1547:
1548: qemu_bh_delete(acb->bh);
1549: acb->bh = NULL;
1550:
1551: if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
1552: /*
1553: * In the case we open the snapshot VDI, Sheepdog creates the
1554: * writable VDI when we do a write operation first.
1555: */
1556: ret = sd_create_branch(s);
1557: if (ret) {
1558: acb->ret = -EIO;
1559: goto out;
1560: }
1561: }
1562:
1563: while (done != total) {
1564: uint8_t flags = 0;
1565: uint64_t old_oid = 0;
1566: int create = 0;
1567:
1568: oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
1569:
1570: len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
1571:
1572: if (!inode->data_vdi_id[idx]) {
1573: if (acb->aiocb_type == AIOCB_READ_UDATA) {
1574: goto done;
1575: }
1576:
1577: create = 1;
1578: } else if (acb->aiocb_type == AIOCB_WRITE_UDATA
1579: && !is_data_obj_writeable(inode, idx)) {
1580: /* Copy-On-Write */
1581: create = 1;
1582: old_oid = oid;
1583: flags = SD_FLAG_CMD_COW;
1584: }
1585:
1586: if (create) {
1587: dprintf("update ino (%" PRIu32") %" PRIu64 " %" PRIu64
1588: " %" PRIu64 "\n", inode->vdi_id, oid,
1589: vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
1590: oid = vid_to_data_oid(inode->vdi_id, idx);
1591: dprintf("new oid %lx\n", oid);
1592: }
1593:
1594: aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
1595:
1596: if (create) {
1597: AIOReq *areq;
1598: QLIST_FOREACH(areq, &s->outstanding_aio_head,
1599: outstanding_aio_siblings) {
1600: if (areq == aio_req) {
1601: continue;
1602: }
1603: if (areq->oid == oid) {
1604: /*
1605: * Sheepdog cannot handle simultaneous create
1606: * requests to the same object. So we cannot send
1607: * the request until the previous request
1608: * finishes.
1609: */
1610: aio_req->flags = 0;
1611: aio_req->base_oid = 0;
1612: goto done;
1613: }
1614: }
1615: }
1616:
1617: ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
1618: create, acb->aiocb_type);
1619: if (ret < 0) {
1620: error_report("add_aio_request is failed\n");
1621: free_aio_req(s, aio_req);
1622: acb->ret = -EIO;
1623: goto out;
1624: }
1625: done:
1626: offset = 0;
1627: idx++;
1628: done += len;
1629: }
1630: out:
1631: if (QLIST_EMPTY(&acb->aioreq_head)) {
1632: sd_finish_aiocb(acb);
1633: }
1634: }
1635:
1636: static BlockDriverAIOCB *sd_aio_writev(BlockDriverState *bs, int64_t sector_num,
1637: QEMUIOVector *qiov, int nb_sectors,
1638: BlockDriverCompletionFunc *cb,
1639: void *opaque)
1640: {
1641: SheepdogAIOCB *acb;
1642:
1643: if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
1644: /* TODO: shouldn't block here */
1645: if (sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE) < 0) {
1646: return NULL;
1647: }
1648: bs->total_sectors = sector_num + nb_sectors;
1649: }
1650:
1651: acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
1652: acb->aio_done_func = sd_write_done;
1653: acb->aiocb_type = AIOCB_WRITE_UDATA;
1654:
1655: sd_schedule_bh(sd_readv_writev_bh_cb, acb);
1656: return &acb->common;
1657: }
1658:
1659: static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs, int64_t sector_num,
1660: QEMUIOVector *qiov, int nb_sectors,
1661: BlockDriverCompletionFunc *cb,
1662: void *opaque)
1663: {
1664: SheepdogAIOCB *acb;
1665: int i;
1666:
1667: acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
1668: acb->aiocb_type = AIOCB_READ_UDATA;
1669: acb->aio_done_func = sd_finish_aiocb;
1670:
1671: /*
1672: * TODO: we can do better; we don't need to initialize
1673: * blindly.
1674: */
1675: for (i = 0; i < qiov->niov; i++) {
1676: memset(qiov->iov[i].iov_base, 0, qiov->iov[i].iov_len);
1677: }
1678:
1679: sd_schedule_bh(sd_readv_writev_bh_cb, acb);
1680: return &acb->common;
1681: }
1682:
1683: static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
1684: {
1685: BDRVSheepdogState *s = bs->opaque;
1686: int ret, fd;
1687: uint32_t new_vid;
1688: SheepdogInode *inode;
1689: unsigned int datalen;
1690:
1691: dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %d "
1692: "is_snapshot %d\n", sn_info->name, sn_info->id_str,
1693: s->name, sn_info->vm_state_size, s->is_snapshot);
1694:
1695: if (s->is_snapshot) {
1696: error_report("You can't create a snapshot of a snapshot VDI, "
1697: "%s (%" PRIu32 ").\n", s->name, s->inode.vdi_id);
1698:
1699: return -EINVAL;
1700: }
1701:
1702: dprintf("%s %s\n", sn_info->name, sn_info->id_str);
1703:
1704: s->inode.vm_state_size = sn_info->vm_state_size;
1705: s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
1706: strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
1707: /* we don't need to update entire object */
1708: datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1709:
1710: /* refresh inode. */
1711: fd = connect_to_sdog(s->addr, s->port);
1712: if (fd < 0) {
1713: ret = -EIO;
1714: goto cleanup;
1715: }
1716:
1717: ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1718: s->inode.nr_copies, datalen, 0, 0);
1719: if (ret < 0) {
1720: error_report("failed to write snapshot's inode.\n");
1721: ret = -EIO;
1722: goto cleanup;
1723: }
1724:
1725: ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1,
1726: s->addr, s->port);
1727: if (ret < 0) {
1728: error_report("failed to create inode for snapshot. %s\n",
1729: strerror(errno));
1730: ret = -EIO;
1731: goto cleanup;
1732: }
1733:
1734: inode = (SheepdogInode *)qemu_malloc(datalen);
1735:
1736: ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
1737: s->inode.nr_copies, datalen, 0);
1738:
1739: if (ret < 0) {
1740: error_report("failed to read new inode info. %s\n", strerror(errno));
1741: ret = -EIO;
1742: goto cleanup;
1743: }
1744:
1745: memcpy(&s->inode, inode, datalen);
1746: dprintf("s->inode: name %s snap_id %x oid %x\n",
1747: s->inode.name, s->inode.snap_id, s->inode.vdi_id);
1748:
1749: cleanup:
1750: closesocket(fd);
1751: return ret;
1752: }
1753:
1754: static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
1755: {
1756: BDRVSheepdogState *s = bs->opaque;
1757: BDRVSheepdogState *old_s;
1758: char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1759: char *buf = NULL;
1760: uint32_t vid;
1761: uint32_t snapid = 0;
1762: int ret = -ENOENT, fd;
1763:
1764: old_s = qemu_malloc(sizeof(BDRVSheepdogState));
1765:
1766: memcpy(old_s, s, sizeof(BDRVSheepdogState));
1767:
1768: memset(vdi, 0, sizeof(vdi));
1769: strncpy(vdi, s->name, sizeof(vdi));
1770:
1771: memset(tag, 0, sizeof(tag));
1772: snapid = strtoul(snapshot_id, NULL, 10);
1773: if (!snapid) {
1774: strncpy(tag, s->name, sizeof(tag));
1775: }
1776:
1777: ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
1778: if (ret) {
1779: error_report("Failed to find_vdi_name\n");
1780: ret = -ENOENT;
1781: goto out;
1782: }
1783:
1784: fd = connect_to_sdog(s->addr, s->port);
1785: if (fd < 0) {
1786: error_report("failed to connect\n");
1787: goto out;
1788: }
1789:
1790: buf = qemu_malloc(SD_INODE_SIZE);
1791: ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1792: SD_INODE_SIZE, 0);
1793:
1794: closesocket(fd);
1795:
1796: if (ret) {
1797: ret = -ENOENT;
1798: goto out;
1799: }
1800:
1801: memcpy(&s->inode, buf, sizeof(s->inode));
1802:
1803: if (!s->inode.vm_state_size) {
1804: error_report("Invalid snapshot\n");
1805: ret = -ENOENT;
1806: goto out;
1807: }
1808:
1809: s->is_snapshot = 1;
1810:
1811: qemu_free(buf);
1812: qemu_free(old_s);
1813:
1814: return 0;
1815: out:
1816: /* recover bdrv_sd_state */
1817: memcpy(s, old_s, sizeof(BDRVSheepdogState));
1818: qemu_free(buf);
1819: qemu_free(old_s);
1820:
1821: error_report("failed to open. recover old bdrv_sd_state.\n");
1822:
1823: return ret;
1824: }
1825:
1826: static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
1827: {
1828: /* FIXME: Delete specified snapshot id. */
1829: return 0;
1830: }
1831:
/* Minimal bitmap helpers operating on arrays of unsigned long. */
#define BITS_PER_BYTE 8
#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long))

/* Integer division of n by d, rounded up. */
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
/* Number of longs needed to hold `nr' bits. */
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
#define DECLARE_BITMAP(name,bits) \
    unsigned long name[BITS_TO_LONGS(bits)]

/* Return 1 if bit `nr' of the bitmap at `addr' is set, 0 otherwise. */
static inline int test_bit(unsigned int nr, const unsigned long *addr)
{
    const unsigned long word = addr[nr / BITS_PER_LONG];
    const unsigned long mask = 1UL << (nr % BITS_PER_LONG);

    return (word & mask) != 0;
}
1845:
1846: static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
1847: {
1848: BDRVSheepdogState *s = bs->opaque;
1849: SheepdogReq req;
1850: int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
1851: QEMUSnapshotInfo *sn_tab = NULL;
1852: unsigned wlen, rlen;
1853: int found = 0;
1854: static SheepdogInode inode;
1855: unsigned long *vdi_inuse;
1856: unsigned int start_nr;
1857: uint64_t hval;
1858: uint32_t vid;
1859:
1860: vdi_inuse = qemu_malloc(max);
1861:
1862: fd = connect_to_sdog(s->addr, s->port);
1863: if (fd < 0) {
1864: goto out;
1865: }
1866:
1867: rlen = max;
1868: wlen = 0;
1869:
1870: memset(&req, 0, sizeof(req));
1871:
1872: req.opcode = SD_OP_READ_VDIS;
1873: req.data_length = max;
1874:
1875: ret = do_req(fd, (SheepdogReq *)&req, vdi_inuse, &wlen, &rlen);
1876:
1877: closesocket(fd);
1878: if (ret) {
1879: goto out;
1880: }
1881:
1882: sn_tab = qemu_mallocz(nr * sizeof(*sn_tab));
1883:
1884: /* calculate a vdi id with hash function */
1885: hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
1886: start_nr = hval & (SD_NR_VDIS - 1);
1887:
1888: fd = connect_to_sdog(s->addr, s->port);
1889: if (fd < 0) {
1890: error_report("failed to connect\n");
1891: goto out;
1892: }
1893:
1894: for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
1895: if (!test_bit(vid, vdi_inuse)) {
1896: break;
1897: }
1898:
1899: /* we don't need to read entire object */
1900: ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
1901: 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0);
1902:
1903: if (ret) {
1904: continue;
1905: }
1906:
1907: if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
1908: sn_tab[found].date_sec = inode.snap_ctime >> 32;
1909: sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
1910: sn_tab[found].vm_state_size = inode.vm_state_size;
1911: sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
1912:
1913: snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
1914: inode.snap_id);
1915: strncpy(sn_tab[found].name, inode.tag,
1916: MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)));
1917: found++;
1918: }
1919: }
1920:
1921: closesocket(fd);
1922: out:
1923: *psn_tab = sn_tab;
1924:
1925: qemu_free(vdi_inuse);
1926:
1927: return found;
1928: }
1929:
1930: static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
1931: int64_t pos, int size, int load)
1932: {
1933: int fd, create;
1934: int ret = 0;
1935: unsigned int data_len;
1936: uint64_t vmstate_oid;
1937: uint32_t vdi_index;
1938: uint64_t offset;
1939:
1940: fd = connect_to_sdog(s->addr, s->port);
1941: if (fd < 0) {
1942: ret = -EIO;
1943: goto cleanup;
1944: }
1945:
1946: while (size) {
1947: vdi_index = pos / SD_DATA_OBJ_SIZE;
1948: offset = pos % SD_DATA_OBJ_SIZE;
1949:
1950: data_len = MIN(size, SD_DATA_OBJ_SIZE);
1951:
1952: vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
1953:
1954: create = (offset == 0);
1955: if (load) {
1956: ret = read_object(fd, (char *)data, vmstate_oid,
1957: s->inode.nr_copies, data_len, offset);
1958: } else {
1959: ret = write_object(fd, (char *)data, vmstate_oid,
1960: s->inode.nr_copies, data_len, offset, create);
1961: }
1962:
1963: if (ret < 0) {
1964: error_report("failed to save vmstate %s\n", strerror(errno));
1965: ret = -EIO;
1966: goto cleanup;
1967: }
1968:
1969: pos += data_len;
1970: size -= data_len;
1971: ret += data_len;
1972: }
1973: cleanup:
1974: closesocket(fd);
1975: return ret;
1976: }
1977:
1978: static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data,
1979: int64_t pos, int size)
1980: {
1981: BDRVSheepdogState *s = bs->opaque;
1982:
1983: return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0);
1984: }
1985:
1986: static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
1987: int64_t pos, int size)
1988: {
1989: BDRVSheepdogState *s = bs->opaque;
1990:
1991: return do_load_save_vmstate(s, data, pos, size, 1);
1992: }
1993:
1994:
1995: static QEMUOptionParameter sd_create_options[] = {
1996: {
1997: .name = BLOCK_OPT_SIZE,
1998: .type = OPT_SIZE,
1999: .help = "Virtual disk size"
2000: },
2001: {
2002: .name = BLOCK_OPT_BACKING_FILE,
2003: .type = OPT_STRING,
2004: .help = "File name of a base image"
2005: },
2006: { NULL }
2007: };
2008:
2009: BlockDriver bdrv_sheepdog = {
2010: .format_name = "sheepdog",
2011: .protocol_name = "sheepdog",
2012: .instance_size = sizeof(BDRVSheepdogState),
2013: .bdrv_file_open = sd_open,
2014: .bdrv_close = sd_close,
2015: .bdrv_create = sd_create,
2016: .bdrv_getlength = sd_getlength,
2017: .bdrv_truncate = sd_truncate,
2018:
2019: .bdrv_aio_readv = sd_aio_readv,
2020: .bdrv_aio_writev = sd_aio_writev,
2021:
2022: .bdrv_snapshot_create = sd_snapshot_create,
2023: .bdrv_snapshot_goto = sd_snapshot_goto,
2024: .bdrv_snapshot_delete = sd_snapshot_delete,
2025: .bdrv_snapshot_list = sd_snapshot_list,
2026:
2027: .bdrv_save_vmstate = sd_save_vmstate,
2028: .bdrv_load_vmstate = sd_load_vmstate,
2029:
2030: .create_options = sd_create_options,
2031: };
2032:
2033: static void bdrv_sheepdog_init(void)
2034: {
2035: bdrv_register(&bdrv_sheepdog);
2036: }
2037: block_init(bdrv_sheepdog_init);
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.