|
|
1.1 root 1: /*
2: * QEMU Enhanced Disk Format
3: *
4: * Copyright IBM, Corp. 2010
5: *
6: * Authors:
7: * Stefan Hajnoczi <[email protected]>
8: * Anthony Liguori <[email protected]>
9: *
10: * This work is licensed under the terms of the GNU LGPL, version 2 or later.
11: * See the COPYING.LIB file in the top-level directory.
12: *
13: */
14:
1.1.1.2 root 15: #include "qemu-timer.h"
1.1 root 16: #include "trace.h"
17: #include "qed.h"
18: #include "qerror.h"
1.1.1.3 root 19: #include "migration.h"
1.1 root 20:
21: static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
22: {
23: QEDAIOCB *acb = (QEDAIOCB *)blockacb;
24: bool finished = false;
25:
26: /* Wait for the request to finish */
27: acb->finished = &finished;
28: while (!finished) {
29: qemu_aio_wait();
30: }
31: }
32:
33: static AIOPool qed_aio_pool = {
34: .aiocb_size = sizeof(QEDAIOCB),
35: .cancel = qed_aio_cancel,
36: };
37:
38: static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
39: const char *filename)
40: {
41: const QEDHeader *header = (const QEDHeader *)buf;
42:
43: if (buf_size < sizeof(*header)) {
44: return 0;
45: }
46: if (le32_to_cpu(header->magic) != QED_MAGIC) {
47: return 0;
48: }
49: return 100;
50: }
51:
/**
 * Tell whether a backing file format name denotes the raw format
 *
 * @fmt:        Backing file format name, may be NULL
 * @ret:        true only if @fmt is exactly "raw"
 */
static bool qed_fmt_is_raw(const char *fmt)
{
    if (fmt == NULL) {
        return false;
    }
    return !strcmp(fmt, "raw");
}
61:
62: static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
63: {
64: cpu->magic = le32_to_cpu(le->magic);
65: cpu->cluster_size = le32_to_cpu(le->cluster_size);
66: cpu->table_size = le32_to_cpu(le->table_size);
67: cpu->header_size = le32_to_cpu(le->header_size);
68: cpu->features = le64_to_cpu(le->features);
69: cpu->compat_features = le64_to_cpu(le->compat_features);
70: cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
71: cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
72: cpu->image_size = le64_to_cpu(le->image_size);
73: cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
74: cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
75: }
76:
77: static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
78: {
79: le->magic = cpu_to_le32(cpu->magic);
80: le->cluster_size = cpu_to_le32(cpu->cluster_size);
81: le->table_size = cpu_to_le32(cpu->table_size);
82: le->header_size = cpu_to_le32(cpu->header_size);
83: le->features = cpu_to_le64(cpu->features);
84: le->compat_features = cpu_to_le64(cpu->compat_features);
85: le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
86: le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
87: le->image_size = cpu_to_le64(cpu->image_size);
88: le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
89: le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
90: }
91:
92: static int qed_write_header_sync(BDRVQEDState *s)
93: {
94: QEDHeader le;
95: int ret;
96:
97: qed_header_cpu_to_le(&s->header, &le);
98: ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le));
99: if (ret != sizeof(le)) {
100: return ret;
101: }
102: return 0;
103: }
104:
105: typedef struct {
106: GenericCB gencb;
107: BDRVQEDState *s;
108: struct iovec iov;
109: QEMUIOVector qiov;
110: int nsectors;
111: uint8_t *buf;
112: } QEDWriteHeaderCB;
113:
114: static void qed_write_header_cb(void *opaque, int ret)
115: {
116: QEDWriteHeaderCB *write_header_cb = opaque;
117:
118: qemu_vfree(write_header_cb->buf);
119: gencb_complete(write_header_cb, ret);
120: }
121:
122: static void qed_write_header_read_cb(void *opaque, int ret)
123: {
124: QEDWriteHeaderCB *write_header_cb = opaque;
125: BDRVQEDState *s = write_header_cb->s;
126:
127: if (ret) {
128: qed_write_header_cb(write_header_cb, ret);
129: return;
130: }
131:
132: /* Update header */
133: qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf);
134:
1.1.1.4 ! root 135: bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov,
! 136: write_header_cb->nsectors, qed_write_header_cb,
! 137: write_header_cb);
1.1 root 138: }
139:
140: /**
141: * Update header in-place (does not rewrite backing filename or other strings)
142: *
143: * This function only updates known header fields in-place and does not affect
144: * extra data after the QED header.
145: */
146: static void qed_write_header(BDRVQEDState *s, BlockDriverCompletionFunc cb,
147: void *opaque)
148: {
149: /* We must write full sectors for O_DIRECT but cannot necessarily generate
150: * the data following the header if an unrecognized compat feature is
151: * active. Therefore, first read the sectors containing the header, update
152: * them, and write back.
153: */
154:
155: int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) /
156: BDRV_SECTOR_SIZE;
157: size_t len = nsectors * BDRV_SECTOR_SIZE;
158: QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb),
159: cb, opaque);
160:
161: write_header_cb->s = s;
162: write_header_cb->nsectors = nsectors;
163: write_header_cb->buf = qemu_blockalign(s->bs, len);
164: write_header_cb->iov.iov_base = write_header_cb->buf;
165: write_header_cb->iov.iov_len = len;
166: qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1);
167:
1.1.1.4 ! root 168: bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors,
! 169: qed_write_header_read_cb, write_header_cb);
1.1 root 170: }
171:
/**
 * Compute the largest image size a table geometry can address
 *
 * @cluster_size:   Cluster size in bytes
 * @table_size:     Table size in clusters
 * @ret:            Maximum image size in bytes, saturated at UINT64_MAX
 *
 * A table holds (table_size * cluster_size) / 8 entries.  One L2 table maps
 * that many clusters, and the L1 table references that many L2 tables.  All
 * products are formed in 64 bits and clamp to UINT64_MAX instead of wrapping,
 * so a very large geometry never yields a misleadingly small limit.
 */
static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
{
    uint64_t table_entries;
    uint64_t l2_size;

    /* Widen before multiplying: both operands are only 32 bits wide */
    table_entries = ((uint64_t)table_size * cluster_size) / sizeof(uint64_t);
    if (table_entries == 0) {
        return 0;
    }

    /* table_entries != 0 implies cluster_size != 0, so division is safe */
    if (table_entries > UINT64_MAX / cluster_size) {
        return UINT64_MAX;
    }
    l2_size = table_entries * cluster_size;

    if (l2_size > UINT64_MAX / table_entries) {
        return UINT64_MAX;
    }
    return l2_size * table_entries;
}
182:
183: static bool qed_is_cluster_size_valid(uint32_t cluster_size)
184: {
185: if (cluster_size < QED_MIN_CLUSTER_SIZE ||
186: cluster_size > QED_MAX_CLUSTER_SIZE) {
187: return false;
188: }
189: if (cluster_size & (cluster_size - 1)) {
190: return false; /* not power of 2 */
191: }
192: return true;
193: }
194:
195: static bool qed_is_table_size_valid(uint32_t table_size)
196: {
197: if (table_size < QED_MIN_TABLE_SIZE ||
198: table_size > QED_MAX_TABLE_SIZE) {
199: return false;
200: }
201: if (table_size & (table_size - 1)) {
202: return false; /* not power of 2 */
203: }
204: return true;
205: }
206:
207: static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
208: uint32_t table_size)
209: {
210: if (image_size % BDRV_SECTOR_SIZE != 0) {
211: return false; /* not multiple of sector size */
212: }
213: if (image_size > qed_max_image_size(cluster_size, table_size)) {
214: return false; /* image is too large */
215: }
216: return true;
217: }
218:
219: /**
220: * Read a string of known length from the image file
221: *
222: * @file: Image file
223: * @offset: File offset to start of string, in bytes
224: * @n: String length in bytes
225: * @buf: Destination buffer
226: * @buflen: Destination buffer length in bytes
227: * @ret: 0 on success, -errno on failure
228: *
229: * The string is NUL-terminated.
230: */
231: static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n,
232: char *buf, size_t buflen)
233: {
234: int ret;
235: if (n >= buflen) {
236: return -EINVAL;
237: }
238: ret = bdrv_pread(file, offset, buf, n);
239: if (ret < 0) {
240: return ret;
241: }
242: buf[n] = '\0';
243: return 0;
244: }
245:
246: /**
247: * Allocate new clusters
248: *
249: * @s: QED state
250: * @n: Number of contiguous clusters to allocate
251: * @ret: Offset of first allocated cluster
252: *
253: * This function only produces the offset where the new clusters should be
254: * written. It updates BDRVQEDState but does not make any changes to the image
255: * file.
256: */
257: static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
258: {
259: uint64_t offset = s->file_size;
260: s->file_size += n * s->header.cluster_size;
261: return offset;
262: }
263:
264: QEDTable *qed_alloc_table(BDRVQEDState *s)
265: {
266: /* Honor O_DIRECT memory alignment requirements */
267: return qemu_blockalign(s->bs,
268: s->header.cluster_size * s->header.table_size);
269: }
270:
271: /**
272: * Allocate a new zeroed L2 table
273: */
274: static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
275: {
276: CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
277:
278: l2_table->table = qed_alloc_table(s);
279: l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
280:
281: memset(l2_table->table->offsets, 0,
282: s->header.cluster_size * s->header.table_size);
283: return l2_table;
284: }
285:
286: static void qed_aio_next_io(void *opaque, int ret);
287:
1.1.1.2 root 288: static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
289: {
290: assert(!s->allocating_write_reqs_plugged);
291:
292: s->allocating_write_reqs_plugged = true;
293: }
294:
295: static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
296: {
297: QEDAIOCB *acb;
298:
299: assert(s->allocating_write_reqs_plugged);
300:
301: s->allocating_write_reqs_plugged = false;
302:
303: acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
304: if (acb) {
305: qed_aio_next_io(acb, 0);
306: }
307: }
308:
/* Completion of the last flush in the need-check clearing sequence */
static void qed_finish_clear_need_check(void *opaque, int ret)
{
    /* Intentionally empty: neither argument is used */
}
313:
314: static void qed_flush_after_clear_need_check(void *opaque, int ret)
315: {
316: BDRVQEDState *s = opaque;
317:
318: bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s);
319:
320: /* No need to wait until flush completes */
321: qed_unplug_allocating_write_reqs(s);
322: }
323:
324: static void qed_clear_need_check(void *opaque, int ret)
325: {
326: BDRVQEDState *s = opaque;
327:
328: if (ret) {
329: qed_unplug_allocating_write_reqs(s);
330: return;
331: }
332:
333: s->header.features &= ~QED_F_NEED_CHECK;
334: qed_write_header(s, qed_flush_after_clear_need_check, s);
335: }
336:
337: static void qed_need_check_timer_cb(void *opaque)
338: {
339: BDRVQEDState *s = opaque;
340:
341: /* The timer should only fire when allocating writes have drained */
342: assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs));
343:
344: trace_qed_need_check_timer_cb(s);
345:
346: qed_plug_allocating_write_reqs(s);
347:
348: /* Ensure writes are on disk before clearing flag */
349: bdrv_aio_flush(s->bs, qed_clear_need_check, s);
350: }
351:
352: static void qed_start_need_check_timer(BDRVQEDState *s)
353: {
354: trace_qed_start_need_check_timer(s);
355:
356: /* Use vm_clock so we don't alter the image file while suspended for
357: * migration.
358: */
359: qemu_mod_timer(s->need_check_timer, qemu_get_clock_ns(vm_clock) +
360: get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT);
361: }
362:
363: /* It's okay to call this multiple times or when no timer is started */
364: static void qed_cancel_need_check_timer(BDRVQEDState *s)
365: {
366: trace_qed_cancel_need_check_timer(s);
367: qemu_del_timer(s->need_check_timer);
368: }
369:
1.1.1.4 ! root 370: static void bdrv_qed_rebind(BlockDriverState *bs)
! 371: {
! 372: BDRVQEDState *s = bs->opaque;
! 373: s->bs = bs;
! 374: }
! 375:
1.1 root 376: static int bdrv_qed_open(BlockDriverState *bs, int flags)
377: {
378: BDRVQEDState *s = bs->opaque;
379: QEDHeader le_header;
380: int64_t file_size;
381: int ret;
382:
383: s->bs = bs;
384: QSIMPLEQ_INIT(&s->allocating_write_reqs);
385:
386: ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
387: if (ret < 0) {
388: return ret;
389: }
390: qed_header_le_to_cpu(&le_header, &s->header);
391:
392: if (s->header.magic != QED_MAGIC) {
393: return -EINVAL;
394: }
395: if (s->header.features & ~QED_FEATURE_MASK) {
396: /* image uses unsupported feature bits */
397: char buf[64];
398: snprintf(buf, sizeof(buf), "%" PRIx64,
399: s->header.features & ~QED_FEATURE_MASK);
400: qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
401: bs->device_name, "QED", buf);
402: return -ENOTSUP;
403: }
404: if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
405: return -EINVAL;
406: }
407:
408: /* Round down file size to the last cluster */
409: file_size = bdrv_getlength(bs->file);
410: if (file_size < 0) {
411: return file_size;
412: }
413: s->file_size = qed_start_of_cluster(s, file_size);
414:
415: if (!qed_is_table_size_valid(s->header.table_size)) {
416: return -EINVAL;
417: }
418: if (!qed_is_image_size_valid(s->header.image_size,
419: s->header.cluster_size,
420: s->header.table_size)) {
421: return -EINVAL;
422: }
423: if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
424: return -EINVAL;
425: }
426:
427: s->table_nelems = (s->header.cluster_size * s->header.table_size) /
428: sizeof(uint64_t);
429: s->l2_shift = ffs(s->header.cluster_size) - 1;
430: s->l2_mask = s->table_nelems - 1;
431: s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1;
432:
433: if ((s->header.features & QED_F_BACKING_FILE)) {
434: if ((uint64_t)s->header.backing_filename_offset +
435: s->header.backing_filename_size >
436: s->header.cluster_size * s->header.header_size) {
437: return -EINVAL;
438: }
439:
440: ret = qed_read_string(bs->file, s->header.backing_filename_offset,
441: s->header.backing_filename_size, bs->backing_file,
442: sizeof(bs->backing_file));
443: if (ret < 0) {
444: return ret;
445: }
446:
447: if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
448: pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
449: }
450: }
451:
452: /* Reset unknown autoclear feature bits. This is a backwards
453: * compatibility mechanism that allows images to be opened by older
454: * programs, which "knock out" unknown feature bits. When an image is
455: * opened by a newer program again it can detect that the autoclear
456: * feature is no longer valid.
457: */
458: if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
1.1.1.4 ! root 459: !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) {
1.1 root 460: s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
461:
462: ret = qed_write_header_sync(s);
463: if (ret) {
464: return ret;
465: }
466:
467: /* From here on only known autoclear feature bits are valid */
468: bdrv_flush(bs->file);
469: }
470:
471: s->l1_table = qed_alloc_table(s);
472: qed_init_l2_cache(&s->l2_cache);
473:
474: ret = qed_read_l1_table_sync(s);
475: if (ret) {
476: goto out;
477: }
478:
479: /* If image was not closed cleanly, check consistency */
480: if (s->header.features & QED_F_NEED_CHECK) {
481: /* Read-only images cannot be fixed. There is no risk of corruption
482: * since write operations are not possible. Therefore, allow
483: * potentially inconsistent images to be opened read-only. This can
484: * aid data recovery from an otherwise inconsistent image.
485: */
1.1.1.4 ! root 486: if (!bdrv_is_read_only(bs->file) &&
! 487: !(flags & BDRV_O_INCOMING)) {
1.1 root 488: BdrvCheckResult result = {0};
489:
490: ret = qed_check(s, &result, true);
1.1.1.2 root 491: if (ret) {
492: goto out;
493: }
494: if (!result.corruptions && !result.check_errors) {
1.1 root 495: /* Ensure fixes reach storage before clearing check bit */
496: bdrv_flush(s->bs);
497:
498: s->header.features &= ~QED_F_NEED_CHECK;
499: qed_write_header_sync(s);
500: }
501: }
502: }
503:
1.1.1.2 root 504: s->need_check_timer = qemu_new_timer_ns(vm_clock,
505: qed_need_check_timer_cb, s);
506:
1.1 root 507: out:
508: if (ret) {
509: qed_free_l2_cache(&s->l2_cache);
510: qemu_vfree(s->l1_table);
511: }
512: return ret;
513: }
514:
515: static void bdrv_qed_close(BlockDriverState *bs)
516: {
517: BDRVQEDState *s = bs->opaque;
518:
1.1.1.2 root 519: qed_cancel_need_check_timer(s);
520: qemu_free_timer(s->need_check_timer);
521:
1.1 root 522: /* Ensure writes reach stable storage */
523: bdrv_flush(bs->file);
524:
525: /* Clean shutdown, no check required on next open */
526: if (s->header.features & QED_F_NEED_CHECK) {
527: s->header.features &= ~QED_F_NEED_CHECK;
528: qed_write_header_sync(s);
529: }
530:
531: qed_free_l2_cache(&s->l2_cache);
532: qemu_vfree(s->l1_table);
533: }
534:
535: static int qed_create(const char *filename, uint32_t cluster_size,
536: uint64_t image_size, uint32_t table_size,
537: const char *backing_file, const char *backing_fmt)
538: {
539: QEDHeader header = {
540: .magic = QED_MAGIC,
541: .cluster_size = cluster_size,
542: .table_size = table_size,
543: .header_size = 1,
544: .features = 0,
545: .compat_features = 0,
546: .l1_table_offset = cluster_size,
547: .image_size = image_size,
548: };
549: QEDHeader le_header;
550: uint8_t *l1_table = NULL;
551: size_t l1_size = header.cluster_size * header.table_size;
552: int ret = 0;
553: BlockDriverState *bs = NULL;
554:
555: ret = bdrv_create_file(filename, NULL);
556: if (ret < 0) {
557: return ret;
558: }
559:
560: ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR | BDRV_O_CACHE_WB);
561: if (ret < 0) {
562: return ret;
563: }
564:
565: /* File must start empty and grow, check truncate is supported */
566: ret = bdrv_truncate(bs, 0);
567: if (ret < 0) {
568: goto out;
569: }
570:
571: if (backing_file) {
572: header.features |= QED_F_BACKING_FILE;
573: header.backing_filename_offset = sizeof(le_header);
574: header.backing_filename_size = strlen(backing_file);
575:
576: if (qed_fmt_is_raw(backing_fmt)) {
577: header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
578: }
579: }
580:
581: qed_header_cpu_to_le(&header, &le_header);
582: ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header));
583: if (ret < 0) {
584: goto out;
585: }
586: ret = bdrv_pwrite(bs, sizeof(le_header), backing_file,
587: header.backing_filename_size);
588: if (ret < 0) {
589: goto out;
590: }
591:
1.1.1.3 root 592: l1_table = g_malloc0(l1_size);
1.1 root 593: ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size);
594: if (ret < 0) {
595: goto out;
596: }
597:
598: ret = 0; /* success */
599: out:
1.1.1.3 root 600: g_free(l1_table);
1.1 root 601: bdrv_delete(bs);
602: return ret;
603: }
604:
605: static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options)
606: {
607: uint64_t image_size = 0;
608: uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE;
609: uint32_t table_size = QED_DEFAULT_TABLE_SIZE;
610: const char *backing_file = NULL;
611: const char *backing_fmt = NULL;
612:
613: while (options && options->name) {
614: if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
615: image_size = options->value.n;
616: } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
617: backing_file = options->value.s;
618: } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
619: backing_fmt = options->value.s;
620: } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
621: if (options->value.n) {
622: cluster_size = options->value.n;
623: }
624: } else if (!strcmp(options->name, BLOCK_OPT_TABLE_SIZE)) {
625: if (options->value.n) {
626: table_size = options->value.n;
627: }
628: }
629: options++;
630: }
631:
632: if (!qed_is_cluster_size_valid(cluster_size)) {
633: fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n",
634: QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
635: return -EINVAL;
636: }
637: if (!qed_is_table_size_valid(table_size)) {
638: fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n",
639: QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
640: return -EINVAL;
641: }
642: if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) {
643: fprintf(stderr, "QED image size must be a non-zero multiple of "
644: "cluster size and less than %" PRIu64 " bytes\n",
645: qed_max_image_size(cluster_size, table_size));
646: return -EINVAL;
647: }
648:
649: return qed_create(filename, cluster_size, image_size, table_size,
650: backing_file, backing_fmt);
651: }
652:
653: typedef struct {
1.1.1.4 ! root 654: Coroutine *co;
1.1 root 655: int is_allocated;
656: int *pnum;
657: } QEDIsAllocatedCB;
658:
659: static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
660: {
661: QEDIsAllocatedCB *cb = opaque;
662: *cb->pnum = len / BDRV_SECTOR_SIZE;
1.1.1.2 root 663: cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO);
1.1.1.4 ! root 664: if (cb->co) {
! 665: qemu_coroutine_enter(cb->co, NULL);
! 666: }
1.1 root 667: }
668:
1.1.1.4 ! root 669: static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs,
! 670: int64_t sector_num,
! 671: int nb_sectors, int *pnum)
1.1 root 672: {
673: BDRVQEDState *s = bs->opaque;
674: uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
675: size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
676: QEDIsAllocatedCB cb = {
677: .is_allocated = -1,
678: .pnum = pnum,
679: };
680: QEDRequest request = { .l2_table = NULL };
681:
682: qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb);
683:
1.1.1.4 ! root 684: /* Now sleep if the callback wasn't invoked immediately */
1.1 root 685: while (cb.is_allocated == -1) {
1.1.1.4 ! root 686: cb.co = qemu_coroutine_self();
! 687: qemu_coroutine_yield();
1.1 root 688: }
689:
690: qed_unref_l2_cache_entry(request.l2_table);
691:
692: return cb.is_allocated;
693: }
694:
695: static int bdrv_qed_make_empty(BlockDriverState *bs)
696: {
697: return -ENOTSUP;
698: }
699:
700: static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
701: {
702: return acb->common.bs->opaque;
703: }
704:
705: /**
706: * Read from the backing file or zero-fill if no backing file
707: *
708: * @s: QED state
709: * @pos: Byte position in device
710: * @qiov: Destination I/O vector
711: * @cb: Completion function
712: * @opaque: User data for completion function
713: *
714: * This function reads qiov->size bytes starting at pos from the backing file.
715: * If there is no backing file then zeroes are read.
716: */
717: static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
718: QEMUIOVector *qiov,
719: BlockDriverCompletionFunc *cb, void *opaque)
720: {
721: uint64_t backing_length = 0;
722: size_t size;
723:
724: /* If there is a backing file, get its length. Treat the absence of a
725: * backing file like a zero length backing file.
726: */
727: if (s->bs->backing_hd) {
728: int64_t l = bdrv_getlength(s->bs->backing_hd);
729: if (l < 0) {
730: cb(opaque, l);
731: return;
732: }
733: backing_length = l;
734: }
735:
736: /* Zero all sectors if reading beyond the end of the backing file */
737: if (pos >= backing_length ||
738: pos + qiov->size > backing_length) {
739: qemu_iovec_memset(qiov, 0, qiov->size);
740: }
741:
742: /* Complete now if there are no backing file sectors to read */
743: if (pos >= backing_length) {
744: cb(opaque, 0);
745: return;
746: }
747:
748: /* If the read straddles the end of the backing file, shorten it */
749: size = MIN((uint64_t)backing_length - pos, qiov->size);
750:
751: BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING);
1.1.1.4 ! root 752: bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE,
! 753: qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
1.1 root 754: }
755:
756: typedef struct {
757: GenericCB gencb;
758: BDRVQEDState *s;
759: QEMUIOVector qiov;
760: struct iovec iov;
761: uint64_t offset;
762: } CopyFromBackingFileCB;
763:
764: static void qed_copy_from_backing_file_cb(void *opaque, int ret)
765: {
766: CopyFromBackingFileCB *copy_cb = opaque;
767: qemu_vfree(copy_cb->iov.iov_base);
768: gencb_complete(©_cb->gencb, ret);
769: }
770:
771: static void qed_copy_from_backing_file_write(void *opaque, int ret)
772: {
773: CopyFromBackingFileCB *copy_cb = opaque;
774: BDRVQEDState *s = copy_cb->s;
775:
776: if (ret) {
777: qed_copy_from_backing_file_cb(copy_cb, ret);
778: return;
779: }
780:
781: BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
1.1.1.4 ! root 782: bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE,
! 783: ©_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE,
! 784: qed_copy_from_backing_file_cb, copy_cb);
1.1 root 785: }
786:
787: /**
788: * Copy data from backing file into the image
789: *
790: * @s: QED state
791: * @pos: Byte position in device
792: * @len: Number of bytes
793: * @offset: Byte offset in image file
794: * @cb: Completion function
795: * @opaque: User data for completion function
796: */
797: static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
798: uint64_t len, uint64_t offset,
799: BlockDriverCompletionFunc *cb,
800: void *opaque)
801: {
802: CopyFromBackingFileCB *copy_cb;
803:
804: /* Skip copy entirely if there is no work to do */
805: if (len == 0) {
806: cb(opaque, 0);
807: return;
808: }
809:
810: copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque);
811: copy_cb->s = s;
812: copy_cb->offset = offset;
813: copy_cb->iov.iov_base = qemu_blockalign(s->bs, len);
814: copy_cb->iov.iov_len = len;
815: qemu_iovec_init_external(©_cb->qiov, ©_cb->iov, 1);
816:
817: qed_read_backing_file(s, pos, ©_cb->qiov,
818: qed_copy_from_backing_file_write, copy_cb);
819: }
820:
821: /**
822: * Link one or more contiguous clusters into a table
823: *
824: * @s: QED state
825: * @table: L2 table
826: * @index: First cluster index
827: * @n: Number of contiguous clusters
1.1.1.2 root 828: * @cluster: First cluster offset
829: *
830: * The cluster offset may be an allocated byte offset in the image file, the
831: * zero cluster marker, or the unallocated cluster marker.
1.1 root 832: */
833: static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
834: unsigned int n, uint64_t cluster)
835: {
836: int i;
837: for (i = index; i < index + n; i++) {
838: table->offsets[i] = cluster;
1.1.1.2 root 839: if (!qed_offset_is_unalloc_cluster(cluster) &&
840: !qed_offset_is_zero_cluster(cluster)) {
841: cluster += s->header.cluster_size;
842: }
1.1 root 843: }
844: }
845:
846: static void qed_aio_complete_bh(void *opaque)
847: {
848: QEDAIOCB *acb = opaque;
849: BlockDriverCompletionFunc *cb = acb->common.cb;
850: void *user_opaque = acb->common.opaque;
851: int ret = acb->bh_ret;
852: bool *finished = acb->finished;
853:
854: qemu_bh_delete(acb->bh);
855: qemu_aio_release(acb);
856:
857: /* Invoke callback */
858: cb(user_opaque, ret);
859:
860: /* Signal cancel completion */
861: if (finished) {
862: *finished = true;
863: }
864: }
865:
866: static void qed_aio_complete(QEDAIOCB *acb, int ret)
867: {
868: BDRVQEDState *s = acb_to_s(acb);
869:
870: trace_qed_aio_complete(s, acb, ret);
871:
872: /* Free resources */
873: qemu_iovec_destroy(&acb->cur_qiov);
874: qed_unref_l2_cache_entry(acb->request.l2_table);
875:
1.1.1.4 ! root 876: /* Free the buffer we may have allocated for zero writes */
! 877: if (acb->flags & QED_AIOCB_ZERO) {
! 878: qemu_vfree(acb->qiov->iov[0].iov_base);
! 879: acb->qiov->iov[0].iov_base = NULL;
! 880: }
! 881:
1.1 root 882: /* Arrange for a bh to invoke the completion function */
883: acb->bh_ret = ret;
884: acb->bh = qemu_bh_new(qed_aio_complete_bh, acb);
885: qemu_bh_schedule(acb->bh);
886:
887: /* Start next allocating write request waiting behind this one. Note that
888: * requests enqueue themselves when they first hit an unallocated cluster
889: * but they wait until the entire request is finished before waking up the
890: * next request in the queue. This ensures that we don't cycle through
891: * requests multiple times but rather finish one at a time completely.
892: */
893: if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
894: QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
895: acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
896: if (acb) {
897: qed_aio_next_io(acb, 0);
1.1.1.2 root 898: } else if (s->header.features & QED_F_NEED_CHECK) {
899: qed_start_need_check_timer(s);
1.1 root 900: }
901: }
902: }
903:
904: /**
905: * Commit the current L2 table to the cache
906: */
907: static void qed_commit_l2_update(void *opaque, int ret)
908: {
909: QEDAIOCB *acb = opaque;
910: BDRVQEDState *s = acb_to_s(acb);
911: CachedL2Table *l2_table = acb->request.l2_table;
1.1.1.2 root 912: uint64_t l2_offset = l2_table->offset;
1.1 root 913:
914: qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
915:
916: /* This is guaranteed to succeed because we just committed the entry to the
917: * cache.
918: */
1.1.1.2 root 919: acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
1.1 root 920: assert(acb->request.l2_table != NULL);
921:
922: qed_aio_next_io(opaque, ret);
923: }
924:
925: /**
926: * Update L1 table with new L2 table offset and write it out
927: */
928: static void qed_aio_write_l1_update(void *opaque, int ret)
929: {
930: QEDAIOCB *acb = opaque;
931: BDRVQEDState *s = acb_to_s(acb);
932: int index;
933:
934: if (ret) {
935: qed_aio_complete(acb, ret);
936: return;
937: }
938:
939: index = qed_l1_index(s, acb->cur_pos);
940: s->l1_table->offsets[index] = acb->request.l2_table->offset;
941:
942: qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb);
943: }
944:
945: /**
946: * Update L2 table with new cluster offsets and write them out
947: */
1.1.1.4 ! root 948: static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
1.1 root 949: {
950: BDRVQEDState *s = acb_to_s(acb);
951: bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
952: int index;
953:
954: if (ret) {
955: goto err;
956: }
957:
958: if (need_alloc) {
959: qed_unref_l2_cache_entry(acb->request.l2_table);
960: acb->request.l2_table = qed_new_l2_table(s);
961: }
962:
963: index = qed_l2_index(s, acb->cur_pos);
964: qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
1.1.1.4 ! root 965: offset);
1.1 root 966:
967: if (need_alloc) {
968: /* Write out the whole new L2 table */
969: qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
970: qed_aio_write_l1_update, acb);
971: } else {
972: /* Write out only the updated part of the L2 table */
973: qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
974: qed_aio_next_io, acb);
975: }
976: return;
977:
978: err:
979: qed_aio_complete(acb, ret);
980: }
981:
1.1.1.4 ! root 982: static void qed_aio_write_l2_update_cb(void *opaque, int ret)
! 983: {
! 984: QEDAIOCB *acb = opaque;
! 985: qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
! 986: }
! 987:
1.1 root 988: /**
989: * Flush new data clusters before updating the L2 table
990: *
991: * This flush is necessary when a backing file is in use. A crash during an
992: * allocating write could result in empty clusters in the image. If the write
993: * only touched a subregion of the cluster, then backing image sectors have
994: * been lost in the untouched region. The solution is to flush after writing a
995: * new data cluster and before updating the L2 table.
996: */
997: static void qed_aio_write_flush_before_l2_update(void *opaque, int ret)
998: {
999: QEDAIOCB *acb = opaque;
1000: BDRVQEDState *s = acb_to_s(acb);
1001:
1.1.1.4 ! root 1002: if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) {
1.1 root 1003: qed_aio_complete(acb, -EIO);
1004: }
1005: }
1006:
1007: /**
1008: * Write data to the image file
1009: */
1010: static void qed_aio_write_main(void *opaque, int ret)
1011: {
1012: QEDAIOCB *acb = opaque;
1013: BDRVQEDState *s = acb_to_s(acb);
1014: uint64_t offset = acb->cur_cluster +
1015: qed_offset_into_cluster(s, acb->cur_pos);
1016: BlockDriverCompletionFunc *next_fn;
1017:
1018: trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);
1019:
1020: if (ret) {
1021: qed_aio_complete(acb, ret);
1022: return;
1023: }
1024:
1025: if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
1026: next_fn = qed_aio_next_io;
1027: } else {
1028: if (s->bs->backing_hd) {
1029: next_fn = qed_aio_write_flush_before_l2_update;
1030: } else {
1.1.1.4 ! root 1031: next_fn = qed_aio_write_l2_update_cb;
1.1 root 1032: }
1033: }
1034:
1035: BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
1.1.1.4 ! root 1036: bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
! 1037: &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
! 1038: next_fn, acb);
1.1 root 1039: }
1040:
1041: /**
1042: * Populate back untouched region of new data cluster
1043: */
1044: static void qed_aio_write_postfill(void *opaque, int ret)
1045: {
1046: QEDAIOCB *acb = opaque;
1047: BDRVQEDState *s = acb_to_s(acb);
1048: uint64_t start = acb->cur_pos + acb->cur_qiov.size;
1049: uint64_t len =
1050: qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
1051: uint64_t offset = acb->cur_cluster +
1052: qed_offset_into_cluster(s, acb->cur_pos) +
1053: acb->cur_qiov.size;
1054:
1055: if (ret) {
1056: qed_aio_complete(acb, ret);
1057: return;
1058: }
1059:
1060: trace_qed_aio_write_postfill(s, acb, start, len, offset);
1061: qed_copy_from_backing_file(s, start, len, offset,
1062: qed_aio_write_main, acb);
1063: }
1064:
1065: /**
1066: * Populate front untouched region of new data cluster
1067: */
1068: static void qed_aio_write_prefill(void *opaque, int ret)
1069: {
1070: QEDAIOCB *acb = opaque;
1071: BDRVQEDState *s = acb_to_s(acb);
1072: uint64_t start = qed_start_of_cluster(s, acb->cur_pos);
1073: uint64_t len = qed_offset_into_cluster(s, acb->cur_pos);
1074:
1075: trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
1076: qed_copy_from_backing_file(s, start, len, acb->cur_cluster,
1077: qed_aio_write_postfill, acb);
1078: }
1079:
1080: /**
1081: * Check if the QED_F_NEED_CHECK bit should be set during allocating write
1082: */
1083: static bool qed_should_set_need_check(BDRVQEDState *s)
1084: {
1085: /* The flush before L2 update path ensures consistency */
1086: if (s->bs->backing_hd) {
1087: return false;
1088: }
1089:
1090: return !(s->header.features & QED_F_NEED_CHECK);
1091: }
1092:
1.1.1.4 ! root 1093: static void qed_aio_write_zero_cluster(void *opaque, int ret)
! 1094: {
! 1095: QEDAIOCB *acb = opaque;
! 1096:
! 1097: if (ret) {
! 1098: qed_aio_complete(acb, ret);
! 1099: return;
! 1100: }
! 1101:
! 1102: qed_aio_write_l2_update(acb, 0, 1);
! 1103: }
! 1104:
1.1 root 1105: /**
1106: * Write new data cluster
1107: *
1108: * @acb: Write request
1109: * @len: Length in bytes
1110: *
1111: * This path is taken when writing to previously unallocated clusters.
1112: */
1113: static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
1114: {
1115: BDRVQEDState *s = acb_to_s(acb);
1.1.1.4 ! root 1116: BlockDriverCompletionFunc *cb;
1.1 root 1117:
1.1.1.2 root 1118: /* Cancel timer when the first allocating request comes in */
1119: if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
1120: qed_cancel_need_check_timer(s);
1121: }
1122:
1.1 root 1123: /* Freeze this request if another allocating write is in progress */
1124: if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
1125: QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next);
1126: }
1.1.1.2 root 1127: if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
1128: s->allocating_write_reqs_plugged) {
1.1 root 1129: return; /* wait for existing request to finish */
1130: }
1131:
1132: acb->cur_nclusters = qed_bytes_to_clusters(s,
1133: qed_offset_into_cluster(s, acb->cur_pos) + len);
1134: qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1135:
1.1.1.4 ! root 1136: if (acb->flags & QED_AIOCB_ZERO) {
! 1137: /* Skip ahead if the clusters are already zero */
! 1138: if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
! 1139: qed_aio_next_io(acb, 0);
! 1140: return;
! 1141: }
! 1142:
! 1143: cb = qed_aio_write_zero_cluster;
! 1144: } else {
! 1145: cb = qed_aio_write_prefill;
! 1146: acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
! 1147: }
! 1148:
1.1 root 1149: if (qed_should_set_need_check(s)) {
1150: s->header.features |= QED_F_NEED_CHECK;
1.1.1.4 ! root 1151: qed_write_header(s, cb, acb);
1.1 root 1152: } else {
1.1.1.4 ! root 1153: cb(acb, 0);
1.1 root 1154: }
1155: }
1156:
1157: /**
1158: * Write data cluster in place
1159: *
1160: * @acb: Write request
1161: * @offset: Cluster offset in bytes
1162: * @len: Length in bytes
1163: *
1164: * This path is taken when writing to already allocated clusters.
1165: */
1166: static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
1167: {
1.1.1.4 ! root 1168: /* Allocate buffer for zero writes */
! 1169: if (acb->flags & QED_AIOCB_ZERO) {
! 1170: struct iovec *iov = acb->qiov->iov;
! 1171:
! 1172: if (!iov->iov_base) {
! 1173: iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len);
! 1174: memset(iov->iov_base, 0, iov->iov_len);
! 1175: }
! 1176: }
! 1177:
1.1 root 1178: /* Calculate the I/O vector */
1179: acb->cur_cluster = offset;
1180: qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1181:
1182: /* Do the actual write */
1183: qed_aio_write_main(acb, 0);
1184: }
1185:
1186: /**
1187: * Write data cluster
1188: *
1189: * @opaque: Write request
1190: * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
1191: * or -errno
1192: * @offset: Cluster offset in bytes
1193: * @len: Length in bytes
1194: *
1195: * Callback from qed_find_cluster().
1196: */
1197: static void qed_aio_write_data(void *opaque, int ret,
1198: uint64_t offset, size_t len)
1199: {
1200: QEDAIOCB *acb = opaque;
1201:
1202: trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
1203:
1204: acb->find_cluster_ret = ret;
1205:
1206: switch (ret) {
1207: case QED_CLUSTER_FOUND:
1208: qed_aio_write_inplace(acb, offset, len);
1209: break;
1210:
1211: case QED_CLUSTER_L2:
1212: case QED_CLUSTER_L1:
1.1.1.2 root 1213: case QED_CLUSTER_ZERO:
1.1 root 1214: qed_aio_write_alloc(acb, len);
1215: break;
1216:
1217: default:
1218: qed_aio_complete(acb, ret);
1219: break;
1220: }
1221: }
1222:
1223: /**
1224: * Read data cluster
1225: *
1226: * @opaque: Read request
1227: * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
1228: * or -errno
1229: * @offset: Cluster offset in bytes
1230: * @len: Length in bytes
1231: *
1232: * Callback from qed_find_cluster().
1233: */
1234: static void qed_aio_read_data(void *opaque, int ret,
1235: uint64_t offset, size_t len)
1236: {
1237: QEDAIOCB *acb = opaque;
1238: BDRVQEDState *s = acb_to_s(acb);
1239: BlockDriverState *bs = acb->common.bs;
1240:
1241: /* Adjust offset into cluster */
1242: offset += qed_offset_into_cluster(s, acb->cur_pos);
1243:
1244: trace_qed_aio_read_data(s, acb, ret, offset, len);
1245:
1246: if (ret < 0) {
1247: goto err;
1248: }
1249:
1250: qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1251:
1.1.1.2 root 1252: /* Handle zero cluster and backing file reads */
1253: if (ret == QED_CLUSTER_ZERO) {
1254: qemu_iovec_memset(&acb->cur_qiov, 0, acb->cur_qiov.size);
1255: qed_aio_next_io(acb, 0);
1256: return;
1257: } else if (ret != QED_CLUSTER_FOUND) {
1.1 root 1258: qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
1259: qed_aio_next_io, acb);
1260: return;
1261: }
1262:
1263: BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
1.1.1.4 ! root 1264: bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
! 1265: &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
! 1266: qed_aio_next_io, acb);
1.1 root 1267: return;
1268:
1269: err:
1270: qed_aio_complete(acb, ret);
1271: }
1272:
1273: /**
1274: * Begin next I/O or complete the request
1275: */
1276: static void qed_aio_next_io(void *opaque, int ret)
1277: {
1278: QEDAIOCB *acb = opaque;
1279: BDRVQEDState *s = acb_to_s(acb);
1.1.1.4 ! root 1280: QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
! 1281: qed_aio_write_data : qed_aio_read_data;
1.1 root 1282:
1283: trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);
1284:
1285: /* Handle I/O error */
1286: if (ret) {
1287: qed_aio_complete(acb, ret);
1288: return;
1289: }
1290:
1291: acb->qiov_offset += acb->cur_qiov.size;
1292: acb->cur_pos += acb->cur_qiov.size;
1293: qemu_iovec_reset(&acb->cur_qiov);
1294:
1295: /* Complete request */
1296: if (acb->cur_pos >= acb->end_pos) {
1297: qed_aio_complete(acb, 0);
1298: return;
1299: }
1300:
1301: /* Find next cluster and start I/O */
1302: qed_find_cluster(s, &acb->request,
1303: acb->cur_pos, acb->end_pos - acb->cur_pos,
1304: io_fn, acb);
1305: }
1306:
1307: static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
1308: int64_t sector_num,
1309: QEMUIOVector *qiov, int nb_sectors,
1310: BlockDriverCompletionFunc *cb,
1.1.1.4 ! root 1311: void *opaque, int flags)
1.1 root 1312: {
1313: QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque);
1314:
1315: trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
1.1.1.4 ! root 1316: opaque, flags);
1.1 root 1317:
1.1.1.4 ! root 1318: acb->flags = flags;
1.1 root 1319: acb->finished = NULL;
1320: acb->qiov = qiov;
1321: acb->qiov_offset = 0;
1322: acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
1323: acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE;
1324: acb->request.l2_table = NULL;
1325: qemu_iovec_init(&acb->cur_qiov, qiov->niov);
1326:
1327: /* Start request */
1328: qed_aio_next_io(acb, 0);
1329: return &acb->common;
1330: }
1331:
1332: static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
1333: int64_t sector_num,
1334: QEMUIOVector *qiov, int nb_sectors,
1335: BlockDriverCompletionFunc *cb,
1336: void *opaque)
1337: {
1.1.1.4 ! root 1338: return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
1.1 root 1339: }
1340:
1341: static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
1342: int64_t sector_num,
1343: QEMUIOVector *qiov, int nb_sectors,
1344: BlockDriverCompletionFunc *cb,
1345: void *opaque)
1346: {
1.1.1.4 ! root 1347: return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
! 1348: opaque, QED_AIOCB_WRITE);
1.1 root 1349: }
1350:
1.1.1.4 ! root 1351: typedef struct {
! 1352: Coroutine *co;
! 1353: int ret;
! 1354: bool done;
! 1355: } QEDWriteZeroesCB;
! 1356:
! 1357: static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
1.1 root 1358: {
1.1.1.4 ! root 1359: QEDWriteZeroesCB *cb = opaque;
! 1360:
! 1361: cb->done = true;
! 1362: cb->ret = ret;
! 1363: if (cb->co) {
! 1364: qemu_coroutine_enter(cb->co, NULL);
! 1365: }
! 1366: }
! 1367:
! 1368: static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
! 1369: int64_t sector_num,
! 1370: int nb_sectors)
! 1371: {
! 1372: BlockDriverAIOCB *blockacb;
! 1373: QEDWriteZeroesCB cb = { .done = false };
! 1374: QEMUIOVector qiov;
! 1375: struct iovec iov;
! 1376:
! 1377: /* Zero writes start without an I/O buffer. If a buffer becomes necessary
! 1378: * then it will be allocated during request processing.
! 1379: */
! 1380: iov.iov_base = NULL,
! 1381: iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE,
! 1382:
! 1383: qemu_iovec_init_external(&qiov, &iov, 1);
! 1384: blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
! 1385: qed_co_write_zeroes_cb, &cb,
! 1386: QED_AIOCB_WRITE | QED_AIOCB_ZERO);
! 1387: if (!blockacb) {
! 1388: return -EIO;
! 1389: }
! 1390: if (!cb.done) {
! 1391: cb.co = qemu_coroutine_self();
! 1392: qemu_coroutine_yield();
! 1393: }
! 1394: assert(cb.done);
! 1395: return cb.ret;
1.1 root 1396: }
1397:
1398: static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset)
1399: {
1.1.1.2 root 1400: BDRVQEDState *s = bs->opaque;
1401: uint64_t old_image_size;
1402: int ret;
1403:
1404: if (!qed_is_image_size_valid(offset, s->header.cluster_size,
1405: s->header.table_size)) {
1406: return -EINVAL;
1407: }
1408:
1409: /* Shrinking is currently not supported */
1410: if ((uint64_t)offset < s->header.image_size) {
1411: return -ENOTSUP;
1412: }
1413:
1414: old_image_size = s->header.image_size;
1415: s->header.image_size = offset;
1416: ret = qed_write_header_sync(s);
1417: if (ret < 0) {
1418: s->header.image_size = old_image_size;
1419: }
1420: return ret;
1.1 root 1421: }
1422:
1423: static int64_t bdrv_qed_getlength(BlockDriverState *bs)
1424: {
1425: BDRVQEDState *s = bs->opaque;
1426: return s->header.image_size;
1427: }
1428:
1429: static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1430: {
1431: BDRVQEDState *s = bs->opaque;
1432:
1433: memset(bdi, 0, sizeof(*bdi));
1434: bdi->cluster_size = s->header.cluster_size;
1.1.1.4 ! root 1435: bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
1.1 root 1436: return 0;
1437: }
1438:
1439: static int bdrv_qed_change_backing_file(BlockDriverState *bs,
1440: const char *backing_file,
1441: const char *backing_fmt)
1442: {
1443: BDRVQEDState *s = bs->opaque;
1444: QEDHeader new_header, le_header;
1445: void *buffer;
1446: size_t buffer_len, backing_file_len;
1447: int ret;
1448:
1449: /* Refuse to set backing filename if unknown compat feature bits are
1450: * active. If the image uses an unknown compat feature then we may not
1451: * know the layout of data following the header structure and cannot safely
1452: * add a new string.
1453: */
1454: if (backing_file && (s->header.compat_features &
1455: ~QED_COMPAT_FEATURE_MASK)) {
1456: return -ENOTSUP;
1457: }
1458:
1459: memcpy(&new_header, &s->header, sizeof(new_header));
1460:
1461: new_header.features &= ~(QED_F_BACKING_FILE |
1462: QED_F_BACKING_FORMAT_NO_PROBE);
1463:
1464: /* Adjust feature flags */
1465: if (backing_file) {
1466: new_header.features |= QED_F_BACKING_FILE;
1467:
1468: if (qed_fmt_is_raw(backing_fmt)) {
1469: new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
1470: }
1471: }
1472:
1473: /* Calculate new header size */
1474: backing_file_len = 0;
1475:
1476: if (backing_file) {
1477: backing_file_len = strlen(backing_file);
1478: }
1479:
1480: buffer_len = sizeof(new_header);
1481: new_header.backing_filename_offset = buffer_len;
1482: new_header.backing_filename_size = backing_file_len;
1483: buffer_len += backing_file_len;
1484:
1485: /* Make sure we can rewrite header without failing */
1486: if (buffer_len > new_header.header_size * new_header.cluster_size) {
1487: return -ENOSPC;
1488: }
1489:
1490: /* Prepare new header */
1.1.1.3 root 1491: buffer = g_malloc(buffer_len);
1.1 root 1492:
1493: qed_header_cpu_to_le(&new_header, &le_header);
1494: memcpy(buffer, &le_header, sizeof(le_header));
1495: buffer_len = sizeof(le_header);
1496:
1.1.1.3 root 1497: if (backing_file) {
1498: memcpy(buffer + buffer_len, backing_file, backing_file_len);
1499: buffer_len += backing_file_len;
1500: }
1.1 root 1501:
1502: /* Write new header */
1503: ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len);
1.1.1.3 root 1504: g_free(buffer);
1.1 root 1505: if (ret == 0) {
1506: memcpy(&s->header, &new_header, sizeof(new_header));
1507: }
1508: return ret;
1509: }
1510:
1.1.1.4 ! root 1511: static void bdrv_qed_invalidate_cache(BlockDriverState *bs)
! 1512: {
! 1513: BDRVQEDState *s = bs->opaque;
! 1514:
! 1515: bdrv_qed_close(bs);
! 1516: memset(s, 0, sizeof(BDRVQEDState));
! 1517: bdrv_qed_open(bs, bs->open_flags);
! 1518: }
! 1519:
1.1 root 1520: static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result)
1521: {
1522: BDRVQEDState *s = bs->opaque;
1523:
1524: return qed_check(s, result, false);
1525: }
1526:
1527: static QEMUOptionParameter qed_create_options[] = {
1528: {
1529: .name = BLOCK_OPT_SIZE,
1530: .type = OPT_SIZE,
1531: .help = "Virtual disk size (in bytes)"
1532: }, {
1533: .name = BLOCK_OPT_BACKING_FILE,
1534: .type = OPT_STRING,
1535: .help = "File name of a base image"
1536: }, {
1537: .name = BLOCK_OPT_BACKING_FMT,
1538: .type = OPT_STRING,
1539: .help = "Image format of the base image"
1540: }, {
1541: .name = BLOCK_OPT_CLUSTER_SIZE,
1542: .type = OPT_SIZE,
1.1.1.2 root 1543: .help = "Cluster size (in bytes)",
1544: .value = { .n = QED_DEFAULT_CLUSTER_SIZE },
1.1 root 1545: }, {
1546: .name = BLOCK_OPT_TABLE_SIZE,
1547: .type = OPT_SIZE,
1548: .help = "L1/L2 table size (in clusters)"
1549: },
1550: { /* end of list */ }
1551: };
1552:
1553: static BlockDriver bdrv_qed = {
1554: .format_name = "qed",
1555: .instance_size = sizeof(BDRVQEDState),
1556: .create_options = qed_create_options,
1557:
1558: .bdrv_probe = bdrv_qed_probe,
1.1.1.4 ! root 1559: .bdrv_rebind = bdrv_qed_rebind,
1.1 root 1560: .bdrv_open = bdrv_qed_open,
1561: .bdrv_close = bdrv_qed_close,
1562: .bdrv_create = bdrv_qed_create,
1.1.1.4 ! root 1563: .bdrv_co_is_allocated = bdrv_qed_co_is_allocated,
1.1 root 1564: .bdrv_make_empty = bdrv_qed_make_empty,
1565: .bdrv_aio_readv = bdrv_qed_aio_readv,
1566: .bdrv_aio_writev = bdrv_qed_aio_writev,
1.1.1.4 ! root 1567: .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes,
1.1 root 1568: .bdrv_truncate = bdrv_qed_truncate,
1569: .bdrv_getlength = bdrv_qed_getlength,
1570: .bdrv_get_info = bdrv_qed_get_info,
1571: .bdrv_change_backing_file = bdrv_qed_change_backing_file,
1.1.1.4 ! root 1572: .bdrv_invalidate_cache = bdrv_qed_invalidate_cache,
1.1 root 1573: .bdrv_check = bdrv_qed_check,
1574: };
1575:
1576: static void bdrv_qed_init(void)
1577: {
1578: bdrv_register(&bdrv_qed);
1579: }
1580:
1581: block_init(bdrv_qed_init);
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.