|
|
1.1 root 1: /*
2: * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3: *
4: * This program is free software; you can redistribute it and/or
5: * modify it under the terms of the GNU General Public License version
6: * 2 as published by the Free Software Foundation.
7: *
8: * You should have received a copy of the GNU General Public License
9: * along with this program. If not, see <http://www.gnu.org/licenses/>.
10: */
11: #ifdef _WIN32
12: #include <windows.h>
13: #include <winsock2.h>
14: #include <ws2tcpip.h>
15: #else
16: #include <netdb.h>
17: #include <netinet/tcp.h>
18:
19: #define closesocket(s) close(s)
20: #endif
21:
22: #include "qemu-common.h"
23: #include "qemu-error.h"
24: #include "qemu_socket.h"
25: #include "block_int.h"
26:
27: #define SD_PROTO_VER 0x01
28:
29: #define SD_DEFAULT_ADDR "localhost"
30: #define SD_DEFAULT_PORT "7000"
31:
32: #define SD_OP_CREATE_AND_WRITE_OBJ 0x01
33: #define SD_OP_READ_OBJ 0x02
34: #define SD_OP_WRITE_OBJ 0x03
35:
36: #define SD_OP_NEW_VDI 0x11
37: #define SD_OP_LOCK_VDI 0x12
38: #define SD_OP_RELEASE_VDI 0x13
39: #define SD_OP_GET_VDI_INFO 0x14
40: #define SD_OP_READ_VDIS 0x15
41:
42: #define SD_FLAG_CMD_WRITE 0x01
43: #define SD_FLAG_CMD_COW 0x02
44:
45: #define SD_RES_SUCCESS 0x00 /* Success */
46: #define SD_RES_UNKNOWN 0x01 /* Unknown error */
47: #define SD_RES_NO_OBJ 0x02 /* No object found */
48: #define SD_RES_EIO 0x03 /* I/O error */
49: #define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */
50: #define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
51: #define SD_RES_SYSTEM_ERROR 0x06 /* System error */
52: #define SD_RES_VDI_LOCKED 0x07 /* Vdi is locked */
53: #define SD_RES_NO_VDI 0x08 /* No vdi found */
54: #define SD_RES_NO_BASE_VDI 0x09 /* No base vdi found */
55: #define SD_RES_VDI_READ 0x0A /* Cannot read requested vdi */
56: #define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */
57: #define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
58: #define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base vdi */
59: #define SD_RES_NO_TAG 0x0E /* Requested tag is not found */
60: #define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */
61: #define SD_RES_VDI_NOT_LOCKED 0x10 /* Vdi is not locked */
62: #define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */
63: #define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */
64: #define SD_RES_FULL_VDI 0x13 /* we already have the maximum vdis */
65: #define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */
66: #define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */
67: #define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation */
68: #define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */
69: #define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */
70:
71: /*
72: * Object ID rules
73: *
74: * 0 - 19 (20 bits): data object space
75: * 20 - 31 (12 bits): reserved data object space
76: * 32 - 55 (24 bits): vdi object space
77: * 56 - 59 ( 4 bits): reserved vdi object space
78: * 60 - 63 ( 4 bits): object type identifier space
79: */
80:
81: #define VDI_SPACE_SHIFT 32
82: #define VDI_BIT (UINT64_C(1) << 63)
83: #define VMSTATE_BIT (UINT64_C(1) << 62)
84: #define MAX_DATA_OBJS (UINT64_C(1) << 20)
85: #define MAX_CHILDREN 1024
86: #define SD_MAX_VDI_LEN 256
87: #define SD_MAX_VDI_TAG_LEN 256
88: #define SD_NR_VDIS (1U << 24)
89: #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
90: #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
91: #define SECTOR_SIZE 512
92:
93: #define SD_INODE_SIZE (sizeof(SheepdogInode))
94: #define CURRENT_VDI_ID 0
95:
/*
 * Generic request header exchanged with the Sheepdog server.
 *
 * SheepdogObjReq and SheepdogVdiReq have the same size and the same first
 * six fields; they are cast to this type before being passed to do_req().
 */
typedef struct SheepdogReq SheepdogReq;

struct SheepdogReq {
    uint8_t proto_ver;              /* protocol version, e.g. SD_PROTO_VER */
    uint8_t opcode;                 /* one of the SD_OP_* codes */
    uint16_t flags;                 /* SD_FLAG_CMD_* bits */
    uint32_t epoch;
    uint32_t id;                    /* request id, matched against rsp.id */
    uint32_t data_length;           /* length of the payload */
    uint32_t opcode_specific[8];    /* reinterpreted per request type */
};
105:
/*
 * Generic response header received from the Sheepdog server.
 *
 * It has the same size as SheepdogReq; do_req() reads the response into
 * the buffer that held the request.
 */
typedef struct SheepdogRsp SheepdogRsp;

struct SheepdogRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;                    /* id of the request being answered */
    uint32_t data_length;           /* length of the payload that follows */
    uint32_t result;                /* one of the SD_RES_* codes */
    uint32_t opcode_specific[7];    /* reinterpreted per response type */
};
116:
/*
 * Request header for object operations (SD_OP_READ_OBJ, SD_OP_WRITE_OBJ,
 * SD_OP_CREATE_AND_WRITE_OBJ).
 */
typedef struct SheepdogObjReq SheepdogObjReq;

struct SheepdogObjReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t oid;           /* object the request operates on */
    uint64_t cow_oid;       /* filled from AIOReq.base_oid by add_aio_request */
    uint32_t copies;        /* filled from the inode's nr_copies */
    uint32_t rsvd;
    uint64_t offset;        /* offset of the access */
};
130:
/*
 * Response header for object operations.
 */
typedef struct SheepdogObjRsp SheepdogObjRsp;

struct SheepdogObjRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;            /* id of the AIOReq this response belongs to */
    uint32_t data_length;   /* length of the data following the header */
    uint32_t result;        /* one of the SD_RES_* codes */
    uint32_t copies;
    uint32_t pad[6];
};
142:
/*
 * Request header for vdi operations (SD_OP_NEW_VDI, SD_OP_LOCK_VDI,
 * SD_OP_GET_VDI_INFO, ...).
 */
typedef struct SheepdogVdiReq SheepdogVdiReq;

struct SheepdogVdiReq {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint64_t vdi_size;      /* size of the vdi to create */
    uint32_t base_vdi_id;   /* id of the vdi the new one is based on */
    uint32_t copies;
    uint32_t snapid;        /* snapshot id to look up or create */
    uint32_t pad[3];
};
156:
/*
 * Response header for vdi operations.
 */
typedef struct SheepdogVdiRsp SheepdogVdiRsp;

struct SheepdogVdiRsp {
    uint8_t proto_ver;
    uint8_t opcode;
    uint16_t flags;
    uint32_t epoch;
    uint32_t id;
    uint32_t data_length;
    uint32_t result;        /* one of the SD_RES_* codes */
    uint32_t rsvd;
    uint32_t vdi_id;        /* id of the vdi the request resolved to */
    uint32_t pad[5];
};
169:
170: typedef struct SheepdogInode {
171: char name[SD_MAX_VDI_LEN];
172: char tag[SD_MAX_VDI_TAG_LEN];
173: uint64_t ctime;
174: uint64_t snap_ctime;
175: uint64_t vm_clock_nsec;
176: uint64_t vdi_size;
177: uint64_t vm_state_size;
178: uint16_t copy_policy;
179: uint8_t nr_copies;
180: uint8_t block_size_shift;
181: uint32_t snap_id;
182: uint32_t vdi_id;
183: uint32_t parent_vdi_id;
184: uint32_t child_vdi_id[MAX_CHILDREN];
185: uint32_t data_vdi_id[MAX_DATA_OBJS];
186: } SheepdogInode;
187:
188: /*
189: * 64 bit FNV-1a non-zero initial basis
190: */
191: #define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
192:
/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash.
 *
 * Folds the `len' bytes starting at `buf' into the running hash `hval' and
 * returns the updated hash.  Use FNV1A_64_INIT as `hval' to start a new
 * hash.
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    const unsigned char *bytes = buf;
    size_t i;

    for (i = 0; i < len; i++) {
        /* xor the low 8 bits with the next input byte ... */
        hval ^= (uint64_t)bytes[i];
        /* ... then multiply by the 64 bit FNV prime, modulo 2^64 */
        hval *= UINT64_C(0x100000001b3);
    }

    return hval;
}
207:
208: static inline int is_data_obj_writeable(SheepdogInode *inode, unsigned int idx)
209: {
210: return inode->vdi_id == inode->data_vdi_id[idx];
211: }
212:
213: static inline int is_data_obj(uint64_t oid)
214: {
215: return !(VDI_BIT & oid);
216: }
217:
218: static inline uint64_t data_oid_to_idx(uint64_t oid)
219: {
220: return oid & (MAX_DATA_OBJS - 1);
221: }
222:
223: static inline uint64_t vid_to_vdi_oid(uint32_t vid)
224: {
225: return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
226: }
227:
228: static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
229: {
230: return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
231: }
232:
233: static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
234: {
235: return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
236: }
237:
238: static inline int is_snapshot(struct SheepdogInode *inode)
239: {
240: return !!inode->snap_ctime;
241: }
242:
243: #undef dprintf
244: #ifdef DEBUG_SDOG
245: #define dprintf(fmt, args...) \
246: do { \
247: fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
248: } while (0)
249: #else
250: #define dprintf(fmt, args...)
251: #endif
252:
253: typedef struct SheepdogAIOCB SheepdogAIOCB;
254:
255: typedef struct AIOReq {
256: SheepdogAIOCB *aiocb;
257: unsigned int iov_offset;
258:
259: uint64_t oid;
260: uint64_t base_oid;
261: uint64_t offset;
262: unsigned int data_len;
263: uint8_t flags;
264: uint32_t id;
265:
266: QLIST_ENTRY(AIOReq) outstanding_aio_siblings;
267: QLIST_ENTRY(AIOReq) aioreq_siblings;
268: } AIOReq;
269:
/* Kind of I/O a SheepdogAIOCB performs. */
enum AIOCBState {
    AIOCB_WRITE_UDATA = 0,  /* write of user data */
    AIOCB_READ_UDATA = 1,   /* read of user data */
};
274:
275: struct SheepdogAIOCB {
276: BlockDriverAIOCB common;
277:
278: QEMUIOVector *qiov;
279:
280: int64_t sector_num;
281: int nb_sectors;
282:
283: int ret;
284: enum AIOCBState aiocb_type;
285:
286: QEMUBH *bh;
287: void (*aio_done_func)(SheepdogAIOCB *);
288:
289: int canceled;
290:
291: QLIST_HEAD(aioreq_head, AIOReq) aioreq_head;
292: };
293:
294: typedef struct BDRVSheepdogState {
295: SheepdogInode inode;
296:
297: uint32_t min_dirty_data_idx;
298: uint32_t max_dirty_data_idx;
299:
300: char name[SD_MAX_VDI_LEN];
301: int is_snapshot;
302:
303: char *addr;
304: char *port;
305: int fd;
306:
307: uint32_t aioreq_seq_num;
308: QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
309: } BDRVSheepdogState;
310:
311: static const char * sd_strerror(int err)
312: {
313: int i;
314:
315: static const struct {
316: int err;
317: const char *desc;
318: } errors[] = {
319: {SD_RES_SUCCESS, "Success"},
320: {SD_RES_UNKNOWN, "Unknown error"},
321: {SD_RES_NO_OBJ, "No object found"},
322: {SD_RES_EIO, "I/O error"},
323: {SD_RES_VDI_EXIST, "VDI exists already"},
324: {SD_RES_INVALID_PARMS, "Invalid parameters"},
325: {SD_RES_SYSTEM_ERROR, "System error"},
326: {SD_RES_VDI_LOCKED, "VDI is already locked"},
327: {SD_RES_NO_VDI, "No vdi found"},
328: {SD_RES_NO_BASE_VDI, "No base VDI found"},
329: {SD_RES_VDI_READ, "Failed read the requested VDI"},
330: {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
331: {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
332: {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
333: {SD_RES_NO_TAG, "Failed to find the requested tag"},
334: {SD_RES_STARTUP, "The system is still booting"},
335: {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
336: {SD_RES_SHUTDOWN, "The system is shutting down"},
337: {SD_RES_NO_MEM, "Out of memory on the server"},
338: {SD_RES_FULL_VDI, "We already have the maximum vdis"},
339: {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
340: {SD_RES_NO_SPACE, "Server has no space for new objects"},
341: {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
342: {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
343: {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
344: };
345:
346: for (i = 0; i < ARRAY_SIZE(errors); ++i) {
347: if (errors[i].err == err) {
348: return errors[i].desc;
349: }
350: }
351:
352: return "Invalid error code";
353: }
354:
355: /*
356: * Sheepdog I/O handling:
357: *
358: * 1. In sd_aio_readv/writev, read/write requests are added to the
359: * QEMU Bottom Halves.
360: *
361: * 2. In sd_readv_writev_bh_cb, the callback of the BHs, we send the I/O
362: * requests to the server and link the requests to the
363: * outstanding_list in the BDRVSheepdogState. We exit the
364: * function without waiting for the response.
365: *
366: * 3. We receive the response in aio_read_response, the fd handler for
367: * the sheepdog connection. If a metadata update is needed, we send
368: * the write request to the vdi object in sd_write_done, the write
369: * completion function. The AIOCB callback is not called until all
370: * the requests belonging to the AIOCB are finished.
371: */
372:
373: static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
374: uint64_t oid, unsigned int data_len,
375: uint64_t offset, uint8_t flags,
376: uint64_t base_oid, unsigned int iov_offset)
377: {
378: AIOReq *aio_req;
379:
380: aio_req = qemu_malloc(sizeof(*aio_req));
381: aio_req->aiocb = acb;
382: aio_req->iov_offset = iov_offset;
383: aio_req->oid = oid;
384: aio_req->base_oid = base_oid;
385: aio_req->offset = offset;
386: aio_req->data_len = data_len;
387: aio_req->flags = flags;
388: aio_req->id = s->aioreq_seq_num++;
389:
390: QLIST_INSERT_HEAD(&s->outstanding_aio_head, aio_req,
391: outstanding_aio_siblings);
392: QLIST_INSERT_HEAD(&acb->aioreq_head, aio_req, aioreq_siblings);
393:
394: return aio_req;
395: }
396:
397: static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
398: {
399: SheepdogAIOCB *acb = aio_req->aiocb;
400: QLIST_REMOVE(aio_req, outstanding_aio_siblings);
401: QLIST_REMOVE(aio_req, aioreq_siblings);
402: qemu_free(aio_req);
403:
404: return !QLIST_EMPTY(&acb->aioreq_head);
405: }
406:
407: static void sd_finish_aiocb(SheepdogAIOCB *acb)
408: {
409: if (!acb->canceled) {
410: acb->common.cb(acb->common.opaque, acb->ret);
411: }
412: qemu_aio_release(acb);
413: }
414:
415: static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
416: {
417: SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
418:
419: /*
420: * Sheepdog cannot cancel the requests which are already sent to
421: * the servers, so we just complete the request with -EIO here.
422: */
423: acb->common.cb(acb->common.opaque, -EIO);
424: acb->canceled = 1;
425: }
426:
427: static AIOPool sd_aio_pool = {
428: .aiocb_size = sizeof(SheepdogAIOCB),
429: .cancel = sd_aio_cancel,
430: };
431:
432: static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
433: int64_t sector_num, int nb_sectors,
434: BlockDriverCompletionFunc *cb, void *opaque)
435: {
436: SheepdogAIOCB *acb;
437:
438: acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque);
439:
440: acb->qiov = qiov;
441:
442: acb->sector_num = sector_num;
443: acb->nb_sectors = nb_sectors;
444:
445: acb->aio_done_func = NULL;
446: acb->canceled = 0;
447: acb->bh = NULL;
448: acb->ret = 0;
449: QLIST_INIT(&acb->aioreq_head);
450: return acb;
451: }
452:
453: static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
454: {
455: if (acb->bh) {
456: error_report("bug: %d %d\n", acb->aiocb_type, acb->aiocb_type);
457: return -EIO;
458: }
459:
460: acb->bh = qemu_bh_new(cb, acb);
461: if (!acb->bh) {
462: error_report("oom: %d %d\n", acb->aiocb_type, acb->aiocb_type);
463: return -EIO;
464: }
465:
466: qemu_bh_schedule(acb->bh);
467:
468: return 0;
469: }
470:
471: #ifdef _WIN32
472:
/*
 * Minimal stand-in for the POSIX message header, used on _WIN32 builds
 * only: just the scatter/gather array and its element count.
 */
struct msghdr {
    struct iovec *msg_iov;      /* array of buffers */
    size_t msg_iovlen;          /* number of elements in msg_iov */
};
477:
478: static ssize_t sendmsg(int s, const struct msghdr *msg, int flags)
479: {
480: size_t size = 0;
481: char *buf, *p;
482: int i, ret;
483:
484: /* count the msg size */
485: for (i = 0; i < msg->msg_iovlen; i++) {
486: size += msg->msg_iov[i].iov_len;
487: }
488: buf = qemu_malloc(size);
489:
490: p = buf;
491: for (i = 0; i < msg->msg_iovlen; i++) {
492: memcpy(p, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len);
493: p += msg->msg_iov[i].iov_len;
494: }
495:
496: ret = send(s, buf, size, flags);
497:
498: qemu_free(buf);
499: return ret;
500: }
501:
502: static ssize_t recvmsg(int s, struct msghdr *msg, int flags)
503: {
504: size_t size = 0;
505: char *buf, *p;
506: int i, ret;
507:
508: /* count the msg size */
509: for (i = 0; i < msg->msg_iovlen; i++) {
510: size += msg->msg_iov[i].iov_len;
511: }
512: buf = qemu_malloc(size);
513:
514: ret = recv(s, buf, size, flags);
515: if (ret < 0) {
516: goto out;
517: }
518:
519: p = buf;
520: for (i = 0; i < msg->msg_iovlen; i++) {
521: memcpy(msg->msg_iov[i].iov_base, p, msg->msg_iov[i].iov_len);
522: p += msg->msg_iov[i].iov_len;
523: }
524: out:
525: qemu_free(buf);
526: return ret;
527: }
528:
529: #endif
530:
/*
 * Send/recv data with iovec buffers
 *
 * Data is transferred directly from/to the iovec: the first `offset' bytes
 * described by `iov' are skipped and the next `len' bytes are used.
 * `write' selects sendmsg() (non-zero) or recvmsg() (zero).
 *
 * For example,
 *
 *   do_send_recv(sockfd, iov, len, offset, 1);
 *
 * is equivalent to
 *
 *   char *buf = malloc(len);
 *   iov_to_buf(iov, iovcnt, buf, offset, len);
 *   send(sockfd, buf, len, 0);
 *   free(buf);
 *
 * The iovec elements at both ends of the window are trimmed temporarily
 * and restored before returning.  Returns the result of sendmsg()/recvmsg().
 */
static int do_send_recv(int sockfd, struct iovec *iov, int len, int offset,
                        int write)
{
    struct msghdr msg;
    struct iovec *tail = iov;   /* element holding the last byte to use */
    int ret, trimmed;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = iov;
    msg.msg_iovlen = 1;

    /* walk to the element in which the window ends */
    for (len += offset; tail->iov_len < len; tail++) {
        len -= tail->iov_len;
        msg.msg_iovlen++;
    }

    /* cut off the part of that element which lies behind the window */
    trimmed = tail->iov_len - len;
    tail->iov_len -= trimmed;

    /* drop the leading elements that are skipped completely */
    for (; msg.msg_iov->iov_len <= offset; msg.msg_iov++) {
        offset -= msg.msg_iov->iov_len;
        msg.msg_iovlen--;
    }

    /* skip the rest of `offset' inside the first element used */
    msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base + offset;
    msg.msg_iov->iov_len -= offset;

    ret = write ? sendmsg(sockfd, &msg, 0) : recvmsg(sockfd, &msg, 0);

    /* undo both adjustments so that the caller's iovec is unchanged */
    msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base - offset;
    msg.msg_iov->iov_len += offset;
    tail->iov_len += trimmed;

    return ret;
}
593:
/*
 * Open a TCP connection to the Sheepdog server at `addr':`port'.
 *
 * When `addr' is NULL, SD_DEFAULT_ADDR and SD_DEFAULT_PORT are used.  Every
 * address returned by getaddrinfo() is tried in turn; a socket whose
 * connect() fails is closed before the next address is tried.
 *
 * Returns the connected socket, or -1 on failure.  When no address could
 * be connected, errno is that of the last failing call.
 */
static int connect_to_sdog(const char *addr, const char *port)
{
    char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
    int fd, ret, saved_errno;
    struct addrinfo hints, *res, *res0;

    if (!addr) {
        addr = SD_DEFAULT_ADDR;
        port = SD_DEFAULT_PORT;
    }

    memset(&hints, 0, sizeof(hints));
    hints.ai_socktype = SOCK_STREAM;

    ret = getaddrinfo(addr, port, &hints, &res0);
    if (ret) {
        /* getaddrinfo() reports errors through its return value, not errno */
        error_report("unable to get address info %s, %s\n",
                     addr, gai_strerror(ret));
        return -1;
    }

    for (res = res0; res; res = res->ai_next) {
        ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
                          sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
        if (ret) {
            continue;
        }

        fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
        if (fd < 0) {
            continue;
        }

        /* retry when the call is interrupted by a signal */
        do {
            ret = connect(fd, res->ai_addr, res->ai_addrlen);
        } while (ret < 0 && errno == EINTR);

        if (ret < 0) {
            /* do not leak the socket; keep connect()'s errno for callers */
            saved_errno = errno;
            closesocket(fd);
            errno = saved_errno;
            continue;
        }

        dprintf("connected to %s:%s\n", addr, port);
        freeaddrinfo(res0);
        return fd;
    }

    error_report("failed connect to %s:%s\n", addr, port);
    freeaddrinfo(res0);
    return -1;
}
645:
/*
 * Transfer exactly `len' bytes between `sockfd' and the iovec `iov',
 * starting `iov_offset' bytes into the iovec.  `write' selects sending
 * (non-zero) or receiving (zero).
 *
 * Short transfers are continued and EINTR/EAGAIN are retried until
 * everything has been transferred.  A transfer of zero bytes while data is
 * still outstanding (e.g. the peer closed the connection) is treated as an
 * error with errno set to EIO; retrying it would never make progress.
 *
 * Returns 0 on success and 1 on failure.
 */
static int do_readv_writev(int sockfd, struct iovec *iov, int len,
                           int iov_offset, int write)
{
    int ret;

    while (len > 0) {
        ret = do_send_recv(sockfd, iov, len, iov_offset, write);
        if (ret < 0) {
            if (errno == EINTR || errno == EAGAIN) {
                continue;
            }
            error_report("failed to %s data, %s\n",
                         write ? "send" : "recv", strerror(errno));
            return 1;
        }
        if (ret == 0) {
            error_report("connection closed while %s data\n",
                         write ? "sending" : "receiving");
            errno = EIO;
            return 1;
        }

        iov_offset += ret;
        len -= ret;
    }

    return 0;
}
668:
/*
 * Receive `len' bytes from `sockfd' into `iov', starting `iov_offset'
 * bytes into the iovec.  Returns the result of do_readv_writev().
 */
static int do_readv(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 0;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
673:
/*
 * Send `len' bytes from `iov' to `sockfd', starting `iov_offset' bytes
 * into the iovec.  Returns the result of do_readv_writev().
 */
static int do_writev(int sockfd, struct iovec *iov, int len, int iov_offset)
{
    const int is_write = 1;

    return do_readv_writev(sockfd, iov, len, iov_offset, is_write);
}
678:
/*
 * Transfer the `len' bytes of the flat buffer `buf' over `sockfd' by
 * wrapping it in a single-element iovec.  `write' selects the direction.
 * Returns the result of do_readv_writev().
 */
static int do_read_write(int sockfd, void *buf, int len, int write)
{
    struct iovec single = {
        .iov_base = buf,
        .iov_len = len,
    };

    return do_readv_writev(sockfd, &single, len, 0, write);
}
688:
/* Receive `len' bytes from `sockfd' into `buf'; see do_read_write(). */
static int do_read(int sockfd, void *buf, int len)
{
    const int is_write = 0;

    return do_read_write(sockfd, buf, len, is_write);
}
693:
/* Send the `len' bytes at `buf' to `sockfd'; see do_read_write(). */
static int do_write(int sockfd, void *buf, int len)
{
    const int is_write = 1;

    return do_read_write(sockfd, buf, len, is_write);
}
698:
699: static int send_req(int sockfd, SheepdogReq *hdr, void *data,
700: unsigned int *wlen)
701: {
702: int ret;
703: struct iovec iov[2];
704:
705: iov[0].iov_base = hdr;
706: iov[0].iov_len = sizeof(*hdr);
707:
708: if (*wlen) {
709: iov[1].iov_base = data;
710: iov[1].iov_len = *wlen;
711: }
712:
713: ret = do_writev(sockfd, iov, sizeof(*hdr) + *wlen, 0);
714: if (ret) {
715: error_report("failed to send a req, %s\n", strerror(errno));
716: ret = -1;
717: }
718:
719: return ret;
720: }
721:
722: static int do_req(int sockfd, SheepdogReq *hdr, void *data,
723: unsigned int *wlen, unsigned int *rlen)
724: {
725: int ret;
726:
727: ret = send_req(sockfd, hdr, data, wlen);
728: if (ret) {
729: ret = -1;
730: goto out;
731: }
732:
733: ret = do_read(sockfd, hdr, sizeof(*hdr));
734: if (ret) {
735: error_report("failed to get a rsp, %s\n", strerror(errno));
736: ret = -1;
737: goto out;
738: }
739:
740: if (*rlen > hdr->data_length) {
741: *rlen = hdr->data_length;
742: }
743:
744: if (*rlen) {
745: ret = do_read(sockfd, data, *rlen);
746: if (ret) {
747: error_report("failed to get the data, %s\n", strerror(errno));
748: ret = -1;
749: goto out;
750: }
751: }
752: ret = 0;
753: out:
754: return ret;
755: }
756:
757: static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
758: struct iovec *iov, int niov, int create,
759: enum AIOCBState aiocb_type);
760:
761: /*
762: * This function searchs pending requests to the object `oid', and
763: * sends them.
764: */
765: static void send_pending_req(BDRVSheepdogState *s, uint64_t oid, uint32_t id)
766: {
767: AIOReq *aio_req, *next;
768: SheepdogAIOCB *acb;
769: int ret;
770:
771: QLIST_FOREACH_SAFE(aio_req, &s->outstanding_aio_head,
772: outstanding_aio_siblings, next) {
773: if (id == aio_req->id) {
774: continue;
775: }
776: if (aio_req->oid != oid) {
777: continue;
778: }
779:
780: acb = aio_req->aiocb;
781: ret = add_aio_request(s, aio_req, acb->qiov->iov,
782: acb->qiov->niov, 0, acb->aiocb_type);
783: if (ret < 0) {
784: error_report("add_aio_request is failed\n");
785: free_aio_req(s, aio_req);
786: if (QLIST_EMPTY(&acb->aioreq_head)) {
787: sd_finish_aiocb(acb);
788: }
789: }
790: }
791: }
792:
793: /*
794: * Receive responses of the I/O requests.
795: *
796: * This function is registered as a fd handler, and called from the
797: * main loop when s->fd is ready for reading responses.
798: */
799: static void aio_read_response(void *opaque)
800: {
801: SheepdogObjRsp rsp;
802: BDRVSheepdogState *s = opaque;
803: int fd = s->fd;
804: int ret;
805: AIOReq *aio_req = NULL;
806: SheepdogAIOCB *acb;
807: int rest;
808: unsigned long idx;
809:
810: if (QLIST_EMPTY(&s->outstanding_aio_head)) {
811: return;
812: }
813:
814: /* read a header */
815: ret = do_read(fd, &rsp, sizeof(rsp));
816: if (ret) {
817: error_report("failed to get the header, %s\n", strerror(errno));
818: return;
819: }
820:
821: /* find the right aio_req from the outstanding_aio list */
822: QLIST_FOREACH(aio_req, &s->outstanding_aio_head, outstanding_aio_siblings) {
823: if (aio_req->id == rsp.id) {
824: break;
825: }
826: }
827: if (!aio_req) {
828: error_report("cannot find aio_req %x\n", rsp.id);
829: return;
830: }
831:
832: acb = aio_req->aiocb;
833:
834: switch (acb->aiocb_type) {
835: case AIOCB_WRITE_UDATA:
836: if (!is_data_obj(aio_req->oid)) {
837: break;
838: }
839: idx = data_oid_to_idx(aio_req->oid);
840:
841: if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) {
842: /*
843: * If the object is newly created one, we need to update
844: * the vdi object (metadata object). min_dirty_data_idx
845: * and max_dirty_data_idx are changed to include updated
846: * index between them.
847: */
848: s->inode.data_vdi_id[idx] = s->inode.vdi_id;
849: s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
850: s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
851:
852: /*
853: * Some requests may be blocked because simultaneous
854: * create requests are not allowed, so we search the
855: * pending requests here.
856: */
857: send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx), rsp.id);
858: }
859: break;
860: case AIOCB_READ_UDATA:
861: ret = do_readv(fd, acb->qiov->iov, rsp.data_length,
862: aio_req->iov_offset);
863: if (ret) {
864: error_report("failed to get the data, %s\n", strerror(errno));
865: return;
866: }
867: break;
868: }
869:
870: if (rsp.result != SD_RES_SUCCESS) {
871: acb->ret = -EIO;
872: error_report("%s\n", sd_strerror(rsp.result));
873: }
874:
875: rest = free_aio_req(s, aio_req);
876: if (!rest) {
877: /*
878: * We've finished all requests which belong to the AIOCB, so
879: * we can call the callback now.
880: */
881: acb->aio_done_func(acb);
882: }
883: }
884:
885: static int aio_flush_request(void *opaque)
886: {
887: BDRVSheepdogState *s = opaque;
888:
889: return !QLIST_EMPTY(&s->outstanding_aio_head);
890: }
891:
#if defined(SOL_TCP) && defined(TCP_CORK)

/* Set the TCP_CORK option of `fd' to `v'; returns what setsockopt() returns. */
static int set_cork(int fd, int v)
{
    return setsockopt(fd, SOL_TCP, TCP_CORK, &v, sizeof(v));
}

#else

/* TCP_CORK is not available on this platform: do nothing and return 0. */
static int set_cork(int fd, int v)
{
    return 0;
}

#endif
907:
/* Enable TCP_NODELAY on `fd'; returns what setsockopt() returns. */
static int set_nodelay(int fd)
{
    int enable = 1;

    return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&enable,
                      sizeof(enable));
}
916:
917: /*
918: * Return a socket discriptor to read/write objects.
919: *
920: * We cannot use this discriptor for other operations because
921: * the block driver may be on waiting response from the server.
922: */
923: static int get_sheep_fd(BDRVSheepdogState *s)
924: {
925: int ret, fd;
926:
927: fd = connect_to_sdog(s->addr, s->port);
928: if (fd < 0) {
929: error_report("%s\n", strerror(errno));
930: return -1;
931: }
932:
933: socket_set_nonblock(fd);
934:
935: ret = set_nodelay(fd);
936: if (ret) {
937: error_report("%s\n", strerror(errno));
938: closesocket(fd);
939: return -1;
940: }
941:
942: qemu_aio_set_fd_handler(fd, aio_read_response, NULL, aio_flush_request,
943: NULL, s);
944: return fd;
945: }
946:
947: /*
948: * Parse a filename
949: *
950: * filename must be one of the following formats:
951: * 1. [vdiname]
952: * 2. [vdiname]:[snapid]
953: * 3. [vdiname]:[tag]
954: * 4. [hostname]:[port]:[vdiname]
955: * 5. [hostname]:[port]:[vdiname]:[snapid]
956: * 6. [hostname]:[port]:[vdiname]:[tag]
957: *
958: * You can boot from the snapshot images by specifying `snapid` or
959: * `tag'.
960: *
961: * You can run VMs outside the Sheepdog cluster by specifying
962: * `hostname' and `port' (experimental).
963: */
964: static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
965: char *vdi, uint32_t *snapid, char *tag)
966: {
967: char *p, *q;
968: int nr_sep;
969:
970: p = q = qemu_strdup(filename);
971:
972: /* count the number of separators */
973: nr_sep = 0;
974: while (*p) {
975: if (*p == ':') {
976: nr_sep++;
977: }
978: p++;
979: }
980: p = q;
981:
982: /* use the first two tokens as hostname and port number. */
983: if (nr_sep >= 2) {
984: s->addr = p;
985: p = strchr(p, ':');
986: *p++ = '\0';
987:
988: s->port = p;
989: p = strchr(p, ':');
990: *p++ = '\0';
991: } else {
992: s->addr = NULL;
993: s->port = 0;
994: }
995:
996: strncpy(vdi, p, SD_MAX_VDI_LEN);
997:
998: p = strchr(vdi, ':');
999: if (p) {
1000: *p++ = '\0';
1001: *snapid = strtoul(p, NULL, 10);
1002: if (*snapid == 0) {
1003: strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
1004: }
1005: } else {
1006: *snapid = CURRENT_VDI_ID; /* search current vdi */
1007: }
1008:
1009: if (s->addr == NULL) {
1010: qemu_free(q);
1011: }
1012:
1013: return 0;
1014: }
1015:
1016: static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
1017: char *tag, uint32_t *vid, int for_snapshot)
1018: {
1019: int ret, fd;
1020: SheepdogVdiReq hdr;
1021: SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1022: unsigned int wlen, rlen = 0;
1023: char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
1024:
1025: fd = connect_to_sdog(s->addr, s->port);
1026: if (fd < 0) {
1027: return -1;
1028: }
1029:
1030: memset(buf, 0, sizeof(buf));
1031: strncpy(buf, filename, SD_MAX_VDI_LEN);
1032: strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
1033:
1034: memset(&hdr, 0, sizeof(hdr));
1035: if (for_snapshot) {
1036: hdr.opcode = SD_OP_GET_VDI_INFO;
1037: } else {
1038: hdr.opcode = SD_OP_LOCK_VDI;
1039: }
1040: wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
1041: hdr.proto_ver = SD_PROTO_VER;
1042: hdr.data_length = wlen;
1043: hdr.snapid = snapid;
1044: hdr.flags = SD_FLAG_CMD_WRITE;
1045:
1046: ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1047: if (ret) {
1048: ret = -1;
1049: goto out;
1050: }
1051:
1052: if (rsp->result != SD_RES_SUCCESS) {
1053: error_report("cannot get vdi info, %s, %s %d %s\n",
1054: sd_strerror(rsp->result), filename, snapid, tag);
1055: ret = -1;
1056: goto out;
1057: }
1058: *vid = rsp->vdi_id;
1059:
1060: ret = 0;
1061: out:
1062: closesocket(fd);
1063: return ret;
1064: }
1065:
1066: static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
1067: struct iovec *iov, int niov, int create,
1068: enum AIOCBState aiocb_type)
1069: {
1070: int nr_copies = s->inode.nr_copies;
1071: SheepdogObjReq hdr;
1072: unsigned int wlen;
1073: int ret;
1074: uint64_t oid = aio_req->oid;
1075: unsigned int datalen = aio_req->data_len;
1076: uint64_t offset = aio_req->offset;
1077: uint8_t flags = aio_req->flags;
1078: uint64_t old_oid = aio_req->base_oid;
1079:
1080: if (!nr_copies) {
1081: error_report("bug\n");
1082: }
1083:
1084: memset(&hdr, 0, sizeof(hdr));
1085:
1086: if (aiocb_type == AIOCB_READ_UDATA) {
1087: wlen = 0;
1088: hdr.opcode = SD_OP_READ_OBJ;
1089: hdr.flags = flags;
1090: } else if (create) {
1091: wlen = datalen;
1092: hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1093: hdr.flags = SD_FLAG_CMD_WRITE | flags;
1094: } else {
1095: wlen = datalen;
1096: hdr.opcode = SD_OP_WRITE_OBJ;
1097: hdr.flags = SD_FLAG_CMD_WRITE | flags;
1098: }
1099:
1100: hdr.oid = oid;
1101: hdr.cow_oid = old_oid;
1102: hdr.copies = s->inode.nr_copies;
1103:
1104: hdr.data_length = datalen;
1105: hdr.offset = offset;
1106:
1107: hdr.id = aio_req->id;
1108:
1109: set_cork(s->fd, 1);
1110:
1111: /* send a header */
1112: ret = do_write(s->fd, &hdr, sizeof(hdr));
1113: if (ret) {
1114: error_report("failed to send a req, %s\n", strerror(errno));
1115: return -EIO;
1116: }
1117:
1118: if (wlen) {
1119: ret = do_writev(s->fd, iov, wlen, aio_req->iov_offset);
1120: if (ret) {
1121: error_report("failed to send a data, %s\n", strerror(errno));
1122: return -EIO;
1123: }
1124: }
1125:
1126: set_cork(s->fd, 0);
1127:
1128: return 0;
1129: }
1130:
1131: static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
1132: unsigned int datalen, uint64_t offset,
1133: int write, int create)
1134: {
1135: SheepdogObjReq hdr;
1136: SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1137: unsigned int wlen, rlen;
1138: int ret;
1139:
1140: memset(&hdr, 0, sizeof(hdr));
1141:
1142: if (write) {
1143: wlen = datalen;
1144: rlen = 0;
1145: hdr.flags = SD_FLAG_CMD_WRITE;
1146: if (create) {
1147: hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1148: } else {
1149: hdr.opcode = SD_OP_WRITE_OBJ;
1150: }
1151: } else {
1152: wlen = 0;
1153: rlen = datalen;
1154: hdr.opcode = SD_OP_READ_OBJ;
1155: }
1156: hdr.oid = oid;
1157: hdr.data_length = datalen;
1158: hdr.offset = offset;
1159: hdr.copies = copies;
1160:
1161: ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1162: if (ret) {
1163: error_report("failed to send a request to the sheep\n");
1164: return -1;
1165: }
1166:
1167: switch (rsp->result) {
1168: case SD_RES_SUCCESS:
1169: return 0;
1170: default:
1171: error_report("%s\n", sd_strerror(rsp->result));
1172: return -1;
1173: }
1174: }
1175:
/*
 * Read `datalen' bytes at `offset' of the object `oid' into `buf'.
 * Returns the result of read_write_object().
 */
static int read_object(int fd, char *buf, uint64_t oid, int copies,
                       unsigned int datalen, uint64_t offset)
{
    const int is_write = 0, is_create = 0;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, is_create);
}
1181:
/*
 * Write `datalen' bytes from `buf' at `offset' of the object `oid',
 * creating the object when `create' is non-zero.
 * Returns the result of read_write_object().
 */
static int write_object(int fd, char *buf, uint64_t oid, int copies,
                        unsigned int datalen, uint64_t offset, int create)
{
    const int is_write = 1;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, create);
}
1187:
1188: static int sd_open(BlockDriverState *bs, const char *filename, int flags)
1189: {
1190: int ret, fd;
1191: uint32_t vid = 0;
1192: BDRVSheepdogState *s = bs->opaque;
1193: char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1194: uint32_t snapid;
1195: char *buf = NULL;
1196:
1197: strstart(filename, "sheepdog:", (const char **)&filename);
1198:
1199: QLIST_INIT(&s->outstanding_aio_head);
1200: s->fd = -1;
1201:
1202: memset(vdi, 0, sizeof(vdi));
1203: memset(tag, 0, sizeof(tag));
1204: if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) {
1205: goto out;
1206: }
1207: s->fd = get_sheep_fd(s);
1208: if (s->fd < 0) {
1209: goto out;
1210: }
1211:
1212: ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
1213: if (ret) {
1214: goto out;
1215: }
1216:
1217: if (snapid) {
1218: dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
1219: s->is_snapshot = 1;
1220: }
1221:
1222: fd = connect_to_sdog(s->addr, s->port);
1223: if (fd < 0) {
1224: error_report("failed to connect\n");
1225: goto out;
1226: }
1227:
1228: buf = qemu_malloc(SD_INODE_SIZE);
1229: ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0);
1230:
1231: closesocket(fd);
1232:
1233: if (ret) {
1234: goto out;
1235: }
1236:
1237: memcpy(&s->inode, buf, sizeof(s->inode));
1238: s->min_dirty_data_idx = UINT32_MAX;
1239: s->max_dirty_data_idx = 0;
1240:
1241: bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
1242: strncpy(s->name, vdi, sizeof(s->name));
1243: qemu_free(buf);
1244: return 0;
1245: out:
1246: qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
1247: if (s->fd >= 0) {
1248: closesocket(s->fd);
1249: }
1250: qemu_free(buf);
1251: return -1;
1252: }
1253:
1254: static int do_sd_create(char *filename, int64_t vdi_size,
1255: uint32_t base_vid, uint32_t *vdi_id, int snapshot,
1256: const char *addr, const char *port)
1257: {
1258: SheepdogVdiReq hdr;
1259: SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1260: int fd, ret;
1261: unsigned int wlen, rlen = 0;
1262: char buf[SD_MAX_VDI_LEN];
1263:
1264: fd = connect_to_sdog(addr, port);
1265: if (fd < 0) {
1266: return -EIO;
1267: }
1268:
1269: memset(buf, 0, sizeof(buf));
1270: strncpy(buf, filename, SD_MAX_VDI_LEN);
1271:
1272: memset(&hdr, 0, sizeof(hdr));
1273: hdr.opcode = SD_OP_NEW_VDI;
1274: hdr.base_vdi_id = base_vid;
1275:
1276: wlen = SD_MAX_VDI_LEN;
1277:
1278: hdr.flags = SD_FLAG_CMD_WRITE;
1279: hdr.snapid = snapshot;
1280:
1281: hdr.data_length = wlen;
1282: hdr.vdi_size = vdi_size;
1283:
1284: ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1285:
1286: closesocket(fd);
1287:
1288: if (ret) {
1289: return -EIO;
1290: }
1291:
1292: if (rsp->result != SD_RES_SUCCESS) {
1293: error_report("%s, %s\n", sd_strerror(rsp->result), filename);
1294: return -EIO;
1295: }
1296:
1297: if (vdi_id) {
1298: *vdi_id = rsp->vdi_id;
1299: }
1300:
1301: return 0;
1302: }
1303:
1304: static int sd_create(const char *filename, QEMUOptionParameter *options)
1305: {
1306: int ret;
1307: uint32_t vid = 0;
1308: int64_t vdi_size = 0;
1309: char *backing_file = NULL;
1310:
1311: strstart(filename, "sheepdog:", (const char **)&filename);
1312:
1313: while (options && options->name) {
1314: if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1315: vdi_size = options->value.n;
1316: } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1317: backing_file = options->value.s;
1318: }
1319: options++;
1320: }
1321:
1322: if (vdi_size > SD_MAX_VDI_SIZE) {
1323: error_report("too big image size\n");
1324: return -EINVAL;
1325: }
1326:
1327: if (backing_file) {
1328: BlockDriverState *bs;
1329: BDRVSheepdogState *s;
1330: BlockDriver *drv;
1331:
1332: /* Currently, only Sheepdog backing image is supported. */
1333: drv = bdrv_find_protocol(backing_file);
1334: if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
1335: error_report("backing_file must be a sheepdog image\n");
1336: return -EINVAL;
1337: }
1338:
1339: ret = bdrv_file_open(&bs, backing_file, 0);
1340: if (ret < 0)
1341: return -EIO;
1342:
1343: s = bs->opaque;
1344:
1345: if (!is_snapshot(&s->inode)) {
1346: error_report("cannot clone from a non snapshot vdi\n");
1347: bdrv_delete(bs);
1348: return -EINVAL;
1349: }
1350:
1351: vid = s->inode.vdi_id;
1352: bdrv_delete(bs);
1353: }
1354:
1355: return do_sd_create((char *)filename, vdi_size, vid, NULL, 0, NULL, NULL);
1356: }
1357:
1358: static void sd_close(BlockDriverState *bs)
1359: {
1360: BDRVSheepdogState *s = bs->opaque;
1361: SheepdogVdiReq hdr;
1362: SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1363: unsigned int wlen, rlen = 0;
1364: int fd, ret;
1365:
1366: dprintf("%s\n", s->name);
1367:
1368: fd = connect_to_sdog(s->addr, s->port);
1369: if (fd < 0) {
1370: return;
1371: }
1372:
1373: memset(&hdr, 0, sizeof(hdr));
1374:
1375: hdr.opcode = SD_OP_RELEASE_VDI;
1376: wlen = strlen(s->name) + 1;
1377: hdr.data_length = wlen;
1378: hdr.flags = SD_FLAG_CMD_WRITE;
1379:
1380: ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);
1381:
1382: closesocket(fd);
1383:
1384: if (!ret && rsp->result != SD_RES_SUCCESS &&
1385: rsp->result != SD_RES_VDI_NOT_LOCKED) {
1386: error_report("%s, %s\n", sd_strerror(rsp->result), s->name);
1387: }
1388:
1389: qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
1390: closesocket(s->fd);
1391: qemu_free(s->addr);
1392: }
1393:
1394: static int64_t sd_getlength(BlockDriverState *bs)
1395: {
1396: BDRVSheepdogState *s = bs->opaque;
1397:
1398: return s->inode.vdi_size;
1399: }
1400:
1401: static int sd_truncate(BlockDriverState *bs, int64_t offset)
1402: {
1403: BDRVSheepdogState *s = bs->opaque;
1404: int ret, fd;
1405: unsigned int datalen;
1406:
1407: if (offset < s->inode.vdi_size) {
1408: error_report("shrinking is not supported\n");
1409: return -EINVAL;
1410: } else if (offset > SD_MAX_VDI_SIZE) {
1411: error_report("too big image size\n");
1412: return -EINVAL;
1413: }
1414:
1415: fd = connect_to_sdog(s->addr, s->port);
1416: if (fd < 0) {
1417: return -EIO;
1418: }
1419:
1420: /* we don't need to update entire object */
1421: datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1422: s->inode.vdi_size = offset;
1423: ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1424: s->inode.nr_copies, datalen, 0, 0);
1425: close(fd);
1426:
1427: if (ret < 0) {
1428: error_report("failed to update an inode.\n");
1429: return -EIO;
1430: }
1431:
1432: return 0;
1433: }
1434:
1435: /*
1436: * This function is called after writing data objects. If we need to
1437: * update metadata, this sends a write request to the vdi object.
1438: * Otherwise, this calls the AIOCB callback.
1439: */
1440: static void sd_write_done(SheepdogAIOCB *acb)
1441: {
1442: int ret;
1443: BDRVSheepdogState *s = acb->common.bs->opaque;
1444: struct iovec iov;
1445: AIOReq *aio_req;
1446: uint32_t offset, data_len, mn, mx;
1447:
1448: mn = s->min_dirty_data_idx;
1449: mx = s->max_dirty_data_idx;
1450: if (mn <= mx) {
1451: /* we need to update the vdi object. */
1452: offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
1453: mn * sizeof(s->inode.data_vdi_id[0]);
1454: data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
1455:
1456: s->min_dirty_data_idx = UINT32_MAX;
1457: s->max_dirty_data_idx = 0;
1458:
1459: iov.iov_base = &s->inode;
1460: iov.iov_len = sizeof(s->inode);
1461: aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
1462: data_len, offset, 0, 0, offset);
1463: ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA);
1464: if (ret) {
1465: free_aio_req(s, aio_req);
1466: acb->ret = -EIO;
1467: goto out;
1468: }
1469:
1470: acb->aio_done_func = sd_finish_aiocb;
1471: acb->aiocb_type = AIOCB_WRITE_UDATA;
1472: return;
1473: }
1474: out:
1475: sd_finish_aiocb(acb);
1476: }
1477:
1478: /*
1479: * Create a writable VDI from a snapshot
1480: */
1481: static int sd_create_branch(BDRVSheepdogState *s)
1482: {
1483: int ret, fd;
1484: uint32_t vid;
1485: char *buf;
1486:
1487: dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
1488:
1489: buf = qemu_malloc(SD_INODE_SIZE);
1490:
1491: ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1,
1492: s->addr, s->port);
1493: if (ret) {
1494: goto out;
1495: }
1496:
1497: dprintf("%" PRIx32 " is created.\n", vid);
1498:
1499: fd = connect_to_sdog(s->addr, s->port);
1500: if (fd < 0) {
1501: error_report("failed to connect\n");
1502: goto out;
1503: }
1504:
1505: ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1506: SD_INODE_SIZE, 0);
1507:
1508: closesocket(fd);
1509:
1510: if (ret < 0) {
1511: goto out;
1512: }
1513:
1514: memcpy(&s->inode, buf, sizeof(s->inode));
1515:
1516: s->is_snapshot = 0;
1517: ret = 0;
1518: dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
1519:
1520: out:
1521: qemu_free(buf);
1522:
1523: return ret;
1524: }
1525:
1526: /*
1527: * Send I/O requests to the server.
1528: *
1529: * This function sends requests to the server, links the requests to
1530: * the outstanding_list in BDRVSheepdogState, and exits without
1531: * waiting the response. The responses are received in the
1532: * `aio_read_response' function which is called from the main loop as
1533: * a fd handler.
1534: */
1535: static void sd_readv_writev_bh_cb(void *p)
1536: {
1537: SheepdogAIOCB *acb = p;
1538: int ret = 0;
1539: unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE;
1540: unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE;
1541: uint64_t oid;
1542: uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
1543: BDRVSheepdogState *s = acb->common.bs->opaque;
1544: SheepdogInode *inode = &s->inode;
1545: AIOReq *aio_req;
1546:
1547: qemu_bh_delete(acb->bh);
1548: acb->bh = NULL;
1549:
1550: if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
1551: /*
1552: * In the case we open the snapshot VDI, Sheepdog creates the
1553: * writable VDI when we do a write operation first.
1554: */
1555: ret = sd_create_branch(s);
1556: if (ret) {
1557: acb->ret = -EIO;
1558: goto out;
1559: }
1560: }
1561:
1562: while (done != total) {
1563: uint8_t flags = 0;
1564: uint64_t old_oid = 0;
1565: int create = 0;
1566:
1567: oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
1568:
1569: len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
1570:
1571: if (!inode->data_vdi_id[idx]) {
1572: if (acb->aiocb_type == AIOCB_READ_UDATA) {
1573: goto done;
1574: }
1575:
1576: create = 1;
1577: } else if (acb->aiocb_type == AIOCB_WRITE_UDATA
1578: && !is_data_obj_writeable(inode, idx)) {
1579: /* Copy-On-Write */
1580: create = 1;
1581: old_oid = oid;
1582: flags = SD_FLAG_CMD_COW;
1583: }
1584:
1585: if (create) {
1586: dprintf("update ino (%" PRIu32") %" PRIu64 " %" PRIu64
1587: " %" PRIu64 "\n", inode->vdi_id, oid,
1588: vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
1589: oid = vid_to_data_oid(inode->vdi_id, idx);
1590: dprintf("new oid %lx\n", oid);
1591: }
1592:
1593: aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
1594:
1595: if (create) {
1596: AIOReq *areq;
1597: QLIST_FOREACH(areq, &s->outstanding_aio_head,
1598: outstanding_aio_siblings) {
1599: if (areq == aio_req) {
1600: continue;
1601: }
1602: if (areq->oid == oid) {
1603: /*
1604: * Sheepdog cannot handle simultaneous create
1605: * requests to the same object. So we cannot send
1606: * the request until the previous request
1607: * finishes.
1608: */
1609: aio_req->flags = 0;
1610: aio_req->base_oid = 0;
1611: goto done;
1612: }
1613: }
1614: }
1615:
1616: ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
1617: create, acb->aiocb_type);
1618: if (ret < 0) {
1619: error_report("add_aio_request is failed\n");
1620: free_aio_req(s, aio_req);
1621: acb->ret = -EIO;
1622: goto out;
1623: }
1624: done:
1625: offset = 0;
1626: idx++;
1627: done += len;
1628: }
1629: out:
1630: if (QLIST_EMPTY(&acb->aioreq_head)) {
1631: sd_finish_aiocb(acb);
1632: }
1633: }
1634:
1635: static BlockDriverAIOCB *sd_aio_writev(BlockDriverState *bs, int64_t sector_num,
1636: QEMUIOVector *qiov, int nb_sectors,
1637: BlockDriverCompletionFunc *cb,
1638: void *opaque)
1639: {
1640: SheepdogAIOCB *acb;
1641:
1642: if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
1643: /* TODO: shouldn't block here */
1644: if (sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE) < 0) {
1645: return NULL;
1646: }
1647: bs->total_sectors = sector_num + nb_sectors;
1648: }
1649:
1650: acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
1651: acb->aio_done_func = sd_write_done;
1652: acb->aiocb_type = AIOCB_WRITE_UDATA;
1653:
1654: sd_schedule_bh(sd_readv_writev_bh_cb, acb);
1655: return &acb->common;
1656: }
1657:
1658: static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs, int64_t sector_num,
1659: QEMUIOVector *qiov, int nb_sectors,
1660: BlockDriverCompletionFunc *cb,
1661: void *opaque)
1662: {
1663: SheepdogAIOCB *acb;
1664: int i;
1665:
1666: acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
1667: acb->aiocb_type = AIOCB_READ_UDATA;
1668: acb->aio_done_func = sd_finish_aiocb;
1669:
1670: /*
1671: * TODO: we can do better; we don't need to initialize
1672: * blindly.
1673: */
1674: for (i = 0; i < qiov->niov; i++) {
1675: memset(qiov->iov[i].iov_base, 0, qiov->iov[i].iov_len);
1676: }
1677:
1678: sd_schedule_bh(sd_readv_writev_bh_cb, acb);
1679: return &acb->common;
1680: }
1681:
1682: static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
1683: {
1684: BDRVSheepdogState *s = bs->opaque;
1685: int ret, fd;
1686: uint32_t new_vid;
1687: SheepdogInode *inode;
1688: unsigned int datalen;
1689:
1690: dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %d "
1691: "is_snapshot %d\n", sn_info->name, sn_info->id_str,
1692: s->name, sn_info->vm_state_size, s->is_snapshot);
1693:
1694: if (s->is_snapshot) {
1695: error_report("You can't create a snapshot of a snapshot VDI, "
1696: "%s (%" PRIu32 ").\n", s->name, s->inode.vdi_id);
1697:
1698: return -EINVAL;
1699: }
1700:
1701: dprintf("%s %s\n", sn_info->name, sn_info->id_str);
1702:
1703: s->inode.vm_state_size = sn_info->vm_state_size;
1704: s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
1705: strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
1706: /* we don't need to update entire object */
1707: datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1708:
1709: /* refresh inode. */
1710: fd = connect_to_sdog(s->addr, s->port);
1711: if (fd < 0) {
1712: ret = -EIO;
1713: goto cleanup;
1714: }
1715:
1716: ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1717: s->inode.nr_copies, datalen, 0, 0);
1718: if (ret < 0) {
1719: error_report("failed to write snapshot's inode.\n");
1720: ret = -EIO;
1721: goto cleanup;
1722: }
1723:
1724: ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1,
1725: s->addr, s->port);
1726: if (ret < 0) {
1727: error_report("failed to create inode for snapshot. %s\n",
1728: strerror(errno));
1729: ret = -EIO;
1730: goto cleanup;
1731: }
1732:
1733: inode = (SheepdogInode *)qemu_malloc(datalen);
1734:
1735: ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
1736: s->inode.nr_copies, datalen, 0);
1737:
1738: if (ret < 0) {
1739: error_report("failed to read new inode info. %s\n", strerror(errno));
1740: ret = -EIO;
1741: goto cleanup;
1742: }
1743:
1744: memcpy(&s->inode, inode, datalen);
1745: dprintf("s->inode: name %s snap_id %x oid %x\n",
1746: s->inode.name, s->inode.snap_id, s->inode.vdi_id);
1747:
1748: cleanup:
1749: closesocket(fd);
1750: return ret;
1751: }
1752:
1753: static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
1754: {
1755: BDRVSheepdogState *s = bs->opaque;
1756: BDRVSheepdogState *old_s;
1757: char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1758: char *buf = NULL;
1759: uint32_t vid;
1760: uint32_t snapid = 0;
1761: int ret = -ENOENT, fd;
1762:
1763: old_s = qemu_malloc(sizeof(BDRVSheepdogState));
1764:
1765: memcpy(old_s, s, sizeof(BDRVSheepdogState));
1766:
1767: memset(vdi, 0, sizeof(vdi));
1768: strncpy(vdi, s->name, sizeof(vdi));
1769:
1770: memset(tag, 0, sizeof(tag));
1771: snapid = strtoul(snapshot_id, NULL, 10);
1772: if (!snapid) {
1773: strncpy(tag, s->name, sizeof(tag));
1774: }
1775:
1776: ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
1777: if (ret) {
1778: error_report("Failed to find_vdi_name\n");
1779: ret = -ENOENT;
1780: goto out;
1781: }
1782:
1783: fd = connect_to_sdog(s->addr, s->port);
1784: if (fd < 0) {
1785: error_report("failed to connect\n");
1786: goto out;
1787: }
1788:
1789: buf = qemu_malloc(SD_INODE_SIZE);
1790: ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1791: SD_INODE_SIZE, 0);
1792:
1793: closesocket(fd);
1794:
1795: if (ret) {
1796: ret = -ENOENT;
1797: goto out;
1798: }
1799:
1800: memcpy(&s->inode, buf, sizeof(s->inode));
1801:
1802: if (!s->inode.vm_state_size) {
1803: error_report("Invalid snapshot\n");
1804: ret = -ENOENT;
1805: goto out;
1806: }
1807:
1808: s->is_snapshot = 1;
1809:
1810: qemu_free(buf);
1811: qemu_free(old_s);
1812:
1813: return 0;
1814: out:
1815: /* recover bdrv_sd_state */
1816: memcpy(s, old_s, sizeof(BDRVSheepdogState));
1817: qemu_free(buf);
1818: qemu_free(old_s);
1819:
1820: error_report("failed to open. recover old bdrv_sd_state.\n");
1821:
1822: return ret;
1823: }
1824:
1825: static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
1826: {
1827: /* FIXME: Delete specified snapshot id. */
1828: return 0;
1829: }
1830:
/* Small bitmap helpers, with an array of unsigned long as storage. */
#define BITS_PER_BYTE 8
#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long))

/* Integer division of n by d, rounding up. */
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
/* Number of longs needed to hold `nr' bits. */
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
/* Declare an array named `name' large enough for `bits' bits. */
#define DECLARE_BITMAP(name,bits) \
    unsigned long name[BITS_TO_LONGS(bits)]

/*
 * Return 1 if bit number `nr' is set in the bitmap at `addr', and 0
 * otherwise.  Bit 0 is the least significant bit of addr[0].
 */
static inline int test_bit(unsigned int nr, const unsigned long *addr)
{
    unsigned long word = addr[nr / BITS_PER_LONG];
    unsigned long mask = 1UL << (nr % BITS_PER_LONG);

    return (word & mask) != 0;
}
1844:
1845: static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
1846: {
1847: BDRVSheepdogState *s = bs->opaque;
1848: SheepdogReq req;
1849: int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
1850: QEMUSnapshotInfo *sn_tab = NULL;
1851: unsigned wlen, rlen;
1852: int found = 0;
1853: static SheepdogInode inode;
1854: unsigned long *vdi_inuse;
1855: unsigned int start_nr;
1856: uint64_t hval;
1857: uint32_t vid;
1858:
1859: vdi_inuse = qemu_malloc(max);
1860:
1861: fd = connect_to_sdog(s->addr, s->port);
1862: if (fd < 0) {
1863: goto out;
1864: }
1865:
1866: rlen = max;
1867: wlen = 0;
1868:
1869: memset(&req, 0, sizeof(req));
1870:
1871: req.opcode = SD_OP_READ_VDIS;
1872: req.data_length = max;
1873:
1874: ret = do_req(fd, (SheepdogReq *)&req, vdi_inuse, &wlen, &rlen);
1875:
1876: closesocket(fd);
1877: if (ret) {
1878: goto out;
1879: }
1880:
1881: sn_tab = qemu_mallocz(nr * sizeof(*sn_tab));
1882:
1883: /* calculate a vdi id with hash function */
1884: hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
1885: start_nr = hval & (SD_NR_VDIS - 1);
1886:
1887: fd = connect_to_sdog(s->addr, s->port);
1888: if (fd < 0) {
1889: error_report("failed to connect\n");
1890: goto out;
1891: }
1892:
1893: for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
1894: if (!test_bit(vid, vdi_inuse)) {
1895: break;
1896: }
1897:
1898: /* we don't need to read entire object */
1899: ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
1900: 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0);
1901:
1902: if (ret) {
1903: continue;
1904: }
1905:
1906: if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
1907: sn_tab[found].date_sec = inode.snap_ctime >> 32;
1908: sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
1909: sn_tab[found].vm_state_size = inode.vm_state_size;
1910: sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
1911:
1912: snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
1913: inode.snap_id);
1914: strncpy(sn_tab[found].name, inode.tag,
1915: MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)));
1916: found++;
1917: }
1918: }
1919:
1920: closesocket(fd);
1921: out:
1922: *psn_tab = sn_tab;
1923:
1924: qemu_free(vdi_inuse);
1925:
1926: return found;
1927: }
1928:
1929: static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
1930: int64_t pos, int size, int load)
1931: {
1932: int fd, create;
1933: int ret = 0;
1934: unsigned int data_len;
1935: uint64_t vmstate_oid;
1936: uint32_t vdi_index;
1937: uint64_t offset;
1938:
1939: fd = connect_to_sdog(s->addr, s->port);
1940: if (fd < 0) {
1941: ret = -EIO;
1942: goto cleanup;
1943: }
1944:
1945: while (size) {
1946: vdi_index = pos / SD_DATA_OBJ_SIZE;
1947: offset = pos % SD_DATA_OBJ_SIZE;
1948:
1949: data_len = MIN(size, SD_DATA_OBJ_SIZE);
1950:
1951: vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
1952:
1953: create = (offset == 0);
1954: if (load) {
1955: ret = read_object(fd, (char *)data, vmstate_oid,
1956: s->inode.nr_copies, data_len, offset);
1957: } else {
1958: ret = write_object(fd, (char *)data, vmstate_oid,
1959: s->inode.nr_copies, data_len, offset, create);
1960: }
1961:
1962: if (ret < 0) {
1963: error_report("failed to save vmstate %s\n", strerror(errno));
1964: ret = -EIO;
1965: goto cleanup;
1966: }
1967:
1968: pos += data_len;
1969: size -= data_len;
1970: ret += data_len;
1971: }
1972: cleanup:
1973: closesocket(fd);
1974: return ret;
1975: }
1976:
1977: static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data,
1978: int64_t pos, int size)
1979: {
1980: BDRVSheepdogState *s = bs->opaque;
1981:
1982: return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0);
1983: }
1984:
1985: static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
1986: int64_t pos, int size)
1987: {
1988: BDRVSheepdogState *s = bs->opaque;
1989:
1990: return do_load_save_vmstate(s, data, pos, size, 1);
1991: }
1992:
1993:
1994: static QEMUOptionParameter sd_create_options[] = {
1995: {
1996: .name = BLOCK_OPT_SIZE,
1997: .type = OPT_SIZE,
1998: .help = "Virtual disk size"
1999: },
2000: {
2001: .name = BLOCK_OPT_BACKING_FILE,
2002: .type = OPT_STRING,
2003: .help = "File name of a base image"
2004: },
2005: { NULL }
2006: };
2007:
2008: BlockDriver bdrv_sheepdog = {
2009: .format_name = "sheepdog",
2010: .protocol_name = "sheepdog",
2011: .instance_size = sizeof(BDRVSheepdogState),
2012: .bdrv_file_open = sd_open,
2013: .bdrv_close = sd_close,
2014: .bdrv_create = sd_create,
2015: .bdrv_getlength = sd_getlength,
2016: .bdrv_truncate = sd_truncate,
2017:
2018: .bdrv_aio_readv = sd_aio_readv,
2019: .bdrv_aio_writev = sd_aio_writev,
2020:
2021: .bdrv_snapshot_create = sd_snapshot_create,
2022: .bdrv_snapshot_goto = sd_snapshot_goto,
2023: .bdrv_snapshot_delete = sd_snapshot_delete,
2024: .bdrv_snapshot_list = sd_snapshot_list,
2025:
2026: .bdrv_save_vmstate = sd_save_vmstate,
2027: .bdrv_load_vmstate = sd_load_vmstate,
2028:
2029: .create_options = sd_create_options,
2030: };
2031:
2032: static void bdrv_sheepdog_init(void)
2033: {
2034: bdrv_register(&bdrv_sheepdog);
2035: }
2036: block_init(bdrv_sheepdog_init);
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.