File:  [Qemu by Fabrice Bellard] / qemu / block-migration.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs
Tue Apr 24 17:35:11 2018 UTC (3 years, 3 months ago) by root
Branches: qemu, MAIN
CVS tags: qemu0125, qemu0124, qemu0123, qemu0122, qemu0121, qemu0120, HEAD
qemu 0.12.0

    1: /*
    2:  * QEMU live block migration
    3:  *
    4:  * Copyright IBM, Corp. 2009
    5:  *
    6:  * Authors:
    7:  *  Liran Schour   <lirans@il.ibm.com>
    8:  *
    9:  * This work is licensed under the terms of the GNU GPL, version 2.  See
   10:  * the COPYING file in the top-level directory.
   11:  *
   12:  */
   13: 
   14: #include "qemu-common.h"
   15: #include "block_int.h"
   16: #include "hw/hw.h"
   17: #include "qemu-queue.h"
   18: #include "monitor.h"
   19: #include "block-migration.h"
   20: #include <assert.h>
   21: 
/* Number of bytes carried by one migrated block: one dirty-tracking chunk. */
#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)

/*
 * Flags stored in the low bits of every 64-bit record header written to the
 * migration stream.  The remaining upper bits hold either a sector number or
 * a percentage, shifted left by BDRV_SECTOR_BITS.
 *
 *  DEVICE_BLOCK - header is followed by a device name and BLOCK_SIZE bytes
 *  EOS          - ends the record sequence read by block_load()
 *  PROGRESS     - upper bits carry a completion percentage
 */
#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04

/* Upper bound on the sectors probed by a single bdrv_is_allocated() call. */
#define MAX_IS_ALLOCATED_SEARCH 65536
/* NOTE(review): the three constants below are not referenced in the visible
 * part of this file -- confirm whether they are still needed. */
#define MAX_BLOCKS_READ 10000
#define BLOCKS_READ_CHANGE 100
#define INITIAL_BLOCKS_READ 100
   32: 
/* Uncomment to enable the debug traces emitted through dprintf() below. */
//#define DEBUG_BLK_MIGRATION

/*
 * dprintf(): printf-style debug trace prefixed with "blk_migration: ".
 * Expands to an empty statement unless DEBUG_BLK_MIGRATION is defined.
 */
#ifdef DEBUG_BLK_MIGRATION
#define dprintf(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define dprintf(fmt, ...) \
    do { } while (0)
#endif
   42: 
/*
 * Per-device migration state; one instance is queued on
 * block_mig_state.bmds_list for every block device being migrated.
 */
typedef struct BlkMigDevState {
    BlockDriverState *bs;           /* device being migrated */
    int bulk_completed;             /* set to 1 once the bulk pass is done */
    int shared_base;                /* copied from block_mig_state.shared_base;
                                       when set, the bulk pass skips sectors
                                       reported as not allocated */
    int64_t cur_sector;             /* next sector the bulk pass will read */
    int64_t completed_sectors;      /* sectors counted as done for progress */
    int64_t total_sectors;          /* device length in sectors */
    int64_t dirty;                  /* NOTE(review): not referenced in the
                                       visible part of this file */
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;   /* link in bmds_list */
} BlkMigDevState;
   53: 
/*
 * One block (BLOCK_SIZE bytes) read from a device and waiting to be sent.
 * Completed asynchronous reads are queued on block_mig_state.blk_list.
 */
typedef struct BlkMigBlock {
    uint8_t *buf;                   /* BLOCK_SIZE-byte data buffer */
    BlkMigDevState *bmds;           /* device the block was read from */
    int64_t sector;                 /* first sector of the block */
    struct iovec iov;               /* describes buf for the async read */
    QEMUIOVector qiov;              /* I/O vector wrapping iov */
    BlockDriverAIOCB *aiocb;        /* handle returned by bdrv_aio_readv() */
    int ret;                        /* completion status from the read callback */
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;      /* link in blk_list */
} BlkMigBlock;
   64: 
/* Global block-migration state; a single instance exists (block_mig_state). */
typedef struct BlkMigState {
    int blk_enable;                 /* set by block_set_params(); block data is
                                       migrated only when this equals 1 */
    int shared_base;                /* set by block_set_params() */
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list; /* devices to migrate */
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;      /* blocks read, unsent */
    int submitted;                  /* async reads issued and not yet completed */
    int read_done;                  /* completed reads still queued on blk_list */
    int transferred;                /* blocks sent by flush_blks() */
    int64_t total_sector_sum;       /* sum of total_sectors over all devices */
    int prev_progress;              /* last percentage reported; -1 at start */
} BlkMigState;

/* The one and only block-migration state instance. */
static BlkMigState block_mig_state;
   78: 
   79: static void blk_send(QEMUFile *f, BlkMigBlock * blk)
   80: {
   81:     int len;
   82: 
   83:     /* sector number and flags */
   84:     qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
   85:                      | BLK_MIG_FLAG_DEVICE_BLOCK);
   86: 
   87:     /* device name */
   88:     len = strlen(blk->bmds->bs->device_name);
   89:     qemu_put_byte(f, len);
   90:     qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);
   91: 
   92:     qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
   93: }
   94: 
   95: int blk_mig_active(void)
   96: {
   97:     return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
   98: }
   99: 
  100: uint64_t blk_mig_bytes_transferred(void)
  101: {
  102:     BlkMigDevState *bmds;
  103:     uint64_t sum = 0;
  104: 
  105:     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
  106:         sum += bmds->completed_sectors;
  107:     }
  108:     return sum << BDRV_SECTOR_BITS;
  109: }
  110: 
  111: uint64_t blk_mig_bytes_remaining(void)
  112: {
  113:     return blk_mig_bytes_total() - blk_mig_bytes_transferred();
  114: }
  115: 
  116: uint64_t blk_mig_bytes_total(void)
  117: {
  118:     BlkMigDevState *bmds;
  119:     uint64_t sum = 0;
  120: 
  121:     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
  122:         sum += bmds->total_sectors;
  123:     }
  124:     return sum << BDRV_SECTOR_BITS;
  125: }
  126: 
  127: static void blk_mig_read_cb(void *opaque, int ret)
  128: {
  129:     BlkMigBlock *blk = opaque;
  130: 
  131:     blk->ret = ret;
  132: 
  133:     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
  134: 
  135:     block_mig_state.submitted--;
  136:     block_mig_state.read_done++;
  137:     assert(block_mig_state.submitted >= 0);
  138: }
  139: 
  140: static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
  141:                                 BlkMigDevState *bmds, int is_async)
  142: {
  143:     int64_t total_sectors = bmds->total_sectors;
  144:     int64_t cur_sector = bmds->cur_sector;
  145:     BlockDriverState *bs = bmds->bs;
  146:     BlkMigBlock *blk;
  147:     int nr_sectors;
  148: 
  149:     if (bmds->shared_base) {
  150:         while (cur_sector < total_sectors &&
  151:                !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
  152:                                   &nr_sectors)) {
  153:             cur_sector += nr_sectors;
  154:         }
  155:     }
  156: 
  157:     if (cur_sector >= total_sectors) {
  158:         bmds->cur_sector = bmds->completed_sectors = total_sectors;
  159:         return 1;
  160:     }
  161: 
  162:     bmds->completed_sectors = cur_sector;
  163: 
  164:     cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
  165: 
  166:     /* we are going to transfer a full block even if it is not allocated */
  167:     nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
  168: 
  169:     if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
  170:         nr_sectors = total_sectors - cur_sector;
  171:     }
  172: 
  173:     blk = qemu_malloc(sizeof(BlkMigBlock));
  174:     blk->buf = qemu_malloc(BLOCK_SIZE);
  175:     blk->bmds = bmds;
  176:     blk->sector = cur_sector;
  177: 
  178:     if (is_async) {
  179:         blk->iov.iov_base = blk->buf;
  180:         blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
  181:         qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
  182: 
  183:         blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
  184:                                     nr_sectors, blk_mig_read_cb, blk);
  185:         if (!blk->aiocb) {
  186:             goto error;
  187:         }
  188:         block_mig_state.submitted++;
  189:     } else {
  190:         if (bdrv_read(bs, cur_sector, blk->buf, nr_sectors) < 0) {
  191:             goto error;
  192:         }
  193:         blk_send(f, blk);
  194: 
  195:         qemu_free(blk->buf);
  196:         qemu_free(blk);
  197:     }
  198: 
  199:     bdrv_reset_dirty(bs, cur_sector, nr_sectors);
  200:     bmds->cur_sector = cur_sector + nr_sectors;
  201: 
  202:     return (bmds->cur_sector >= total_sectors);
  203: 
  204: error:
  205:     monitor_printf(mon, "Error reading sector %" PRId64 "\n", cur_sector);
  206:     qemu_file_set_error(f);
  207:     qemu_free(blk->buf);
  208:     qemu_free(blk);
  209:     return 0;
  210: }
  211: 
  212: static void set_dirty_tracking(int enable)
  213: {
  214:     BlkMigDevState *bmds;
  215: 
  216:     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
  217:         bdrv_set_dirty_tracking(bmds->bs, enable);
  218:     }
  219: }
  220: 
  221: static void init_blk_migration(Monitor *mon, QEMUFile *f)
  222: {
  223:     BlkMigDevState *bmds;
  224:     BlockDriverState *bs;
  225:     int64_t sectors;
  226: 
  227:     block_mig_state.submitted = 0;
  228:     block_mig_state.read_done = 0;
  229:     block_mig_state.transferred = 0;
  230:     block_mig_state.total_sector_sum = 0;
  231:     block_mig_state.prev_progress = -1;
  232: 
  233:     for (bs = bdrv_first; bs != NULL; bs = bs->next) {
  234:         if (bs->type == BDRV_TYPE_HD) {
  235:             sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
  236:             if (sectors == 0) {
  237:                 continue;
  238:             }
  239: 
  240:             bmds = qemu_mallocz(sizeof(BlkMigDevState));
  241:             bmds->bs = bs;
  242:             bmds->bulk_completed = 0;
  243:             bmds->total_sectors = sectors;
  244:             bmds->completed_sectors = 0;
  245:             bmds->shared_base = block_mig_state.shared_base;
  246: 
  247:             block_mig_state.total_sector_sum += sectors;
  248: 
  249:             if (bmds->shared_base) {
  250:                 monitor_printf(mon, "Start migration for %s with shared base "
  251:                                     "image\n",
  252:                                bs->device_name);
  253:             } else {
  254:                 monitor_printf(mon, "Start full migration for %s\n",
  255:                                bs->device_name);
  256:             }
  257: 
  258:             QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
  259:         }
  260:     }
  261: }
  262: 
  263: static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async)
  264: {
  265:     int64_t completed_sector_sum = 0;
  266:     BlkMigDevState *bmds;
  267:     int progress;
  268:     int ret = 0;
  269: 
  270:     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
  271:         if (bmds->bulk_completed == 0) {
  272:             if (mig_save_device_bulk(mon, f, bmds, is_async) == 1) {
  273:                 /* completed bulk section for this device */
  274:                 bmds->bulk_completed = 1;
  275:             }
  276:             completed_sector_sum += bmds->completed_sectors;
  277:             ret = 1;
  278:             break;
  279:         } else {
  280:             completed_sector_sum += bmds->completed_sectors;
  281:         }
  282:     }
  283: 
  284:     progress = completed_sector_sum * 100 / block_mig_state.total_sector_sum;
  285:     if (progress != block_mig_state.prev_progress) {
  286:         block_mig_state.prev_progress = progress;
  287:         qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
  288:                          | BLK_MIG_FLAG_PROGRESS);
  289:         monitor_printf(mon, "Completed %d %%\r", progress);
  290:         monitor_flush(mon);
  291:     }
  292: 
  293:     return ret;
  294: }
  295: 
/* NOTE(review): not referenced in the visible part of this file -- confirm
 * whether this constant is still needed. */
#define MAX_NUM_BLOCKS 4
  297: 
  298: static void blk_mig_save_dirty_blocks(Monitor *mon, QEMUFile *f)
  299: {
  300:     BlkMigDevState *bmds;
  301:     BlkMigBlock blk;
  302:     int64_t sector;
  303: 
  304:     blk.buf = qemu_malloc(BLOCK_SIZE);
  305: 
  306:     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
  307:         for (sector = 0; sector < bmds->cur_sector;) {
  308:             if (bdrv_get_dirty(bmds->bs, sector)) {
  309:                 if (bdrv_read(bmds->bs, sector, blk.buf,
  310:                               BDRV_SECTORS_PER_DIRTY_CHUNK) < 0) {
  311:                     monitor_printf(mon, "Error reading sector %" PRId64 "\n",
  312:                                    sector);
  313:                     qemu_file_set_error(f);
  314:                     qemu_free(blk.buf);
  315:                     return;
  316:                 }
  317:                 blk.bmds = bmds;
  318:                 blk.sector = sector;
  319:                 blk_send(f, &blk);
  320: 
  321:                 bdrv_reset_dirty(bmds->bs, sector,
  322:                                  BDRV_SECTORS_PER_DIRTY_CHUNK);
  323:             }
  324:             sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
  325:         }
  326:     }
  327: 
  328:     qemu_free(blk.buf);
  329: }
  330: 
  331: static void flush_blks(QEMUFile* f)
  332: {
  333:     BlkMigBlock *blk;
  334: 
  335:     dprintf("%s Enter submitted %d read_done %d transferred %d\n",
  336:             __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
  337:             block_mig_state.transferred);
  338: 
  339:     while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
  340:         if (qemu_file_rate_limit(f)) {
  341:             break;
  342:         }
  343:         if (blk->ret < 0) {
  344:             qemu_file_set_error(f);
  345:             break;
  346:         }
  347:         blk_send(f, blk);
  348: 
  349:         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
  350:         qemu_free(blk->buf);
  351:         qemu_free(blk);
  352: 
  353:         block_mig_state.read_done--;
  354:         block_mig_state.transferred++;
  355:         assert(block_mig_state.read_done >= 0);
  356:     }
  357: 
  358:     dprintf("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
  359:             block_mig_state.submitted, block_mig_state.read_done,
  360:             block_mig_state.transferred);
  361: }
  362: 
  363: static int is_stage2_completed(void)
  364: {
  365:     BlkMigDevState *bmds;
  366: 
  367:     if (block_mig_state.submitted > 0) {
  368:         return 0;
  369:     }
  370: 
  371:     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
  372:         if (bmds->bulk_completed == 0) {
  373:             return 0;
  374:         }
  375:     }
  376: 
  377:     return 1;
  378: }
  379: 
  380: static void blk_mig_cleanup(Monitor *mon)
  381: {
  382:     BlkMigDevState *bmds;
  383:     BlkMigBlock *blk;
  384: 
  385:     while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
  386:         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
  387:         qemu_free(bmds);
  388:     }
  389: 
  390:     while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
  391:         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
  392:         qemu_free(blk->buf);
  393:         qemu_free(blk);
  394:     }
  395: 
  396:     set_dirty_tracking(0);
  397: 
  398:     monitor_printf(mon, "\n");
  399: }
  400: 
/*
 * Live-save handler registered by blk_mig_init().
 *
 * mon:    monitor for progress and error output
 * f:      migration stream
 * stage:  negative = cancel/cleanup; 1 = set up and start; 3 = final
 *         synchronous pass; other values run the iterative part only
 * opaque: unused here
 *
 * Returns 0 after a cleanup request or a stream error, 1 when block
 * migration is disabled (only an end-of-section marker is written), and
 * otherwise 1 only in stage 2 once is_stage2_completed() holds.
 */
static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
{
    dprintf("Enter save live stage %d submitted %d transferred %d\n",
            stage, block_mig_state.submitted, block_mig_state.transferred);

    /* negative stage: release all state and stop */
    if (stage < 0) {
        blk_mig_cleanup(mon);
        return 0;
    }

    if (block_mig_state.blk_enable != 1) {
        /* no need to migrate storage */
        qemu_put_be64(f, BLK_MIG_FLAG_EOS);
        return 1;
    }

    if (stage == 1) {
        init_blk_migration(mon, f);

        /* start track dirty blocks */
        set_dirty_tracking(1);
    }

    /* send whatever completed reads are already queued */
    flush_blks(f);

    if (qemu_file_has_error(f)) {
        blk_mig_cleanup(mon);
        return 0;
    }

    /* control the rate of transfer */
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        if (blk_mig_save_bulked_block(mon, f, 1) == 0) {
            /* no more bulk blocks for now */
            break;
        }
    }

    flush_blks(f);

    if (qemu_file_has_error(f)) {
        blk_mig_cleanup(mon);
        return 0;
    }

    if (stage == 3) {
        /* finish any remaining bulk blocks with synchronous reads */
        while (blk_mig_save_bulked_block(mon, f, 0) != 0) {
            /* empty */
        }

        /* then send every chunk dirtied since it was last transferred */
        blk_mig_save_dirty_blocks(mon, f);
        blk_mig_cleanup(mon);

        /* report completion */
        qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

        if (qemu_file_has_error(f)) {
            return 0;
        }

        monitor_printf(mon, "Block migration completed\n");
    }

    /* end-of-section marker; block_load() stops reading at this record */
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ((stage == 2) && is_stage2_completed());
}
  470: 
  471: static int block_load(QEMUFile *f, void *opaque, int version_id)
  472: {
  473:     static int banner_printed;
  474:     int len, flags;
  475:     char device_name[256];
  476:     int64_t addr;
  477:     BlockDriverState *bs;
  478:     uint8_t *buf;
  479: 
  480:     do {
  481:         addr = qemu_get_be64(f);
  482: 
  483:         flags = addr & ~BDRV_SECTOR_MASK;
  484:         addr >>= BDRV_SECTOR_BITS;
  485: 
  486:         if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
  487:             /* get device name */
  488:             len = qemu_get_byte(f);
  489:             qemu_get_buffer(f, (uint8_t *)device_name, len);
  490:             device_name[len] = '\0';
  491: 
  492:             bs = bdrv_find(device_name);
  493:             if (!bs) {
  494:                 fprintf(stderr, "Error unknown block device %s\n",
  495:                         device_name);
  496:                 return -EINVAL;
  497:             }
  498: 
  499:             buf = qemu_malloc(BLOCK_SIZE);
  500: 
  501:             qemu_get_buffer(f, buf, BLOCK_SIZE);
  502:             bdrv_write(bs, addr, buf, BDRV_SECTORS_PER_DIRTY_CHUNK);
  503: 
  504:             qemu_free(buf);
  505:         } else if (flags & BLK_MIG_FLAG_PROGRESS) {
  506:             if (!banner_printed) {
  507:                 printf("Receiving block device images\n");
  508:                 banner_printed = 1;
  509:             }
  510:             printf("Completed %d %%%c", (int)addr,
  511:                    (addr == 100) ? '\n' : '\r');
  512:             fflush(stdout);
  513:         } else if (!(flags & BLK_MIG_FLAG_EOS)) {
  514:             fprintf(stderr, "Unknown flags\n");
  515:             return -EINVAL;
  516:         }
  517:         if (qemu_file_has_error(f)) {
  518:             return -EIO;
  519:         }
  520:     } while (!(flags & BLK_MIG_FLAG_EOS));
  521: 
  522:     return 0;
  523: }
  524: 
  525: static void block_set_params(int blk_enable, int shared_base, void *opaque)
  526: {
  527:     block_mig_state.blk_enable = blk_enable;
  528:     block_mig_state.shared_base = shared_base;
  529: 
  530:     /* shared base means that blk_enable = 1 */
  531:     block_mig_state.blk_enable |= shared_base;
  532: }
  533: 
  534: void blk_mig_init(void)
  535: {
  536:     QSIMPLEQ_INIT(&block_mig_state.bmds_list);
  537:     QSIMPLEQ_INIT(&block_mig_state.blk_list);
  538: 
  539:     register_savevm_live("block", 0, 1, block_set_params, block_save_live,
  540:                          NULL, block_load, &block_mig_state);
  541: }

unix.superglobalmegacorp.com