前言

工作中经常涉及到libvirt下qemu虚拟机的热迁移场景,这里简单介绍一下热迁移的整体流程

热迁移由virsh的migrate命令发起,最终调用virDomainMigrateVersion3Full()进行热迁移

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
//#0  virDomainMigrateVersion3Full
// (domain=0x55555567a690, dconn=0x5555556731d0, xmlin=0x0, dname=0x0, uri=0x0, bandwidth=0, params=0x0, nparams=0, useParams=true, flags=1)
// at ../src/libvirt-domain.c:3224
//#1 0x00007ffff7c6eb9b in virDomainMigrateVersion3Params
// (flags=1, nparams=0, params=0x0, dconn=0x5555556731d0, domain=0x55555567a690) at ../src/libvirt-domain.c:3532
//#2 virDomainMigrate3
// (domain=domain@entry=0x55555567a690, dconn=dconn@entry=0x5555556731d0, params=<optimized out>, nparams=0, flags=flags@entry=1)
// at ../src/libvirt-domain.c:4311
//#3 0x00005555555a36f1 in doMigrate (opaque=0x7fffffffd8b0)
// at ../tools/virsh-domain.c:11097
//#4 0x00007ffff7b2f5b9 in virThreadHelper (data=<optimized out>)
// at ../src/util/virthread.c:256
//#5 0x00007ffff7494ac3 in start_thread (arg=<optimized out>)
// at ./nptl/pthread_create.c:442
//#6 0x00007ffff7526850 in clone3 ()
// at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

/*
* Sequence v3:
*
* Src: Begin
* - Generate XML to pass to dst
* - Generate optional cookie to pass to dst
*
* Dst: Prepare
* - Get ready to accept incoming VM
* - Generate optional cookie to pass to src
*
* Src: Perform
* - Start migration and wait for send completion
* - Generate optional cookie to pass to dst
*
* Dst: Finish
* - Wait for recv completion and check status
* - Kill off VM if failed, resume if success
* - Generate optional cookie to pass to src
*
* Src: Confirm
* - Kill off VM if success, resume if failed
*
* If useParams is true, params and nparams contain migration parameters and
* we know it's safe to call the API which supports extensible parameters.
* Otherwise, we have to use xmlin, dname, uri, and bandwidth and pass them
* to the old-style APIs.
*/
/* Outline of the v3 migration driver loop. Lines consisting only of "..."
* mark code that this excerpt leaves out; only the debug statement that
* opens each of the five phases is kept, in the order in which the phases
* appear in the function. */
static virDomainPtr
virDomainMigrateVersion3Full(virDomainPtr domain,
virConnectPtr dconn,
const char *xmlin,
const char *dname,
const char *uri,
unsigned long long bandwidth,
virTypedParameterPtr params,
int nparams,
bool useParams,
unsigned int flags)
{
...
/* Phase 1: Begin, logged against the source connection (domain->conn). */
VIR_DEBUG("Begin3 %p", domain->conn);
...
/* Phase 2: Prepare, logged against the destination connection (dconn). */
VIR_DEBUG("Prepare3 %p flags=0x%x", dconn, destflags);
...

/* Phase 3: Perform, logged against the source connection.
*
* Perform the migration. The driver isn't supposed to return
* until the migration is complete. The src VM should remain
* running, but in paused state until the destination can
* confirm migration completion.
*/
VIR_DEBUG("Perform3 %p uri=%s", domain->conn, uri);
...

/*
* Phase 4: Finish, logged against the destination connection.
*
* The status code from the source is passed to the destination.
* The dest can cleanup if the source indicated it failed to
* send all migration data. Returns NULL for ddomain if
* the dest was unable to complete migration.
*/
VIR_DEBUG("Finish3 %p ret=%d", dconn, ret);
...

/*
* Phase 5: Confirm, logged against the source connection and only
* entered when notify_source is set.
*
* If cancelled, then src VM will be restarted, else it will be killed.
* Don't do this if migration failed on source and thus it was already
* cancelled there.
*/
if (notify_source) {
VIR_DEBUG("Confirm3 %p ret=%d domain=%p", domain->conn, ret, domain);
...
}
...
}

根据注释信息,实际上整个热迁移可以分为begin、prepare、perform、finish和confirm等五个步骤

begin阶段

概述

begin阶段主要是源端生成目的端所需要的子机xml和包含源端热迁移额外配置信息的cookie,流程如下所示

热迁移begin阶段流程图

begin阶段的代码如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
/* Begin-phase excerpt of virDomainMigrateVersion3Full(): argument and
* driver-capability checks followed by the source driver's Begin callback.
* The trailing "..." marks the remainder of the function, which is not
* shown here (including the "done" label targeted by the gotos below). */
static virDomainPtr
virDomainMigrateVersion3Full(virDomainPtr domain,
virConnectPtr dconn,
const char *xmlin,
const char *dname,
const char *uri,
unsigned long long bandwidth,
virTypedParameterPtr params,
int nparams,
bool useParams,
unsigned int flags)
{
virDomainPtr ddomain = NULL;
g_autofree char *uri_out = NULL;
g_autofree char *cookiein = NULL;
g_autofree char *cookieout = NULL;
g_autofree char *dom_xml = NULL;
int cookieinlen = 0;
int cookieoutlen = 0;
int ret;
virDomainInfo info;
virErrorPtr orig_err = NULL;
int cancelled = 1;
unsigned long protection = 0;
bool notify_source = true;
unsigned int destflags;
int state;
virTypedParameterPtr tmp;

VIR_DOMAIN_DEBUG(domain,
"dconn=%p, xmlin=%s, dname=%s, uri=%s, bandwidth=%llu, "
"params=%p, nparams=%d, useParams=%d, flags=0x%x",
dconn, NULLSTR(xmlin), NULLSTR(dname), NULLSTR(uri),
bandwidth, params, nparams, useParams, flags);
VIR_TYPED_PARAMS_DEBUG(params, nparams);

/* NOTE(review): presumably returns NULL when dname is set but empty;
* confirm against the macro definition. */
virCheckNonEmptyOptStringArgReturn(dname, NULL);

/* The source driver must provide Begin/Perform/Confirm and the destination
* driver Prepare/Finish, in the flavour selected by useParams (the
* "*3Params" callbacks when true, the plain "*3" callbacks otherwise).
* If any of the five is missing, report "unsupported" and bail out. */
if ((!useParams &&
(!domain->conn->driver->domainMigrateBegin3 ||
!domain->conn->driver->domainMigratePerform3 ||
!domain->conn->driver->domainMigrateConfirm3 ||
!dconn->driver->domainMigratePrepare3 ||
!dconn->driver->domainMigrateFinish3)) ||
(useParams &&
(!domain->conn->driver->domainMigrateBegin3Params ||
!domain->conn->driver->domainMigratePerform3Params ||
!domain->conn->driver->domainMigrateConfirm3Params ||
!dconn->driver->domainMigratePrepare3Params ||
!dconn->driver->domainMigrateFinish3Params))) {
virReportUnsupportedError();
return NULL;
}

/* From here on params refers to the copy made into tmp rather than to
* the array passed in by the caller. */
virTypedParamsCopy(&tmp, params, nparams);
params = tmp;

/* Probe the source driver for change protection: a negative result is an
* error, a non-zero result adds VIR_MIGRATE_CHANGE_PROTECTION to the flags
* handed to Begin below. */
ret = VIR_DRV_SUPPORTS_FEATURE(domain->conn->driver, domain->conn,
VIR_DRV_FEATURE_MIGRATE_CHANGE_PROTECTION);
if (ret < 0)
goto done;
if (ret)
protection = VIR_MIGRATE_CHANGE_PROTECTION;

/* Begin runs on the source connection. It returns the domain XML in
* dom_xml and fills cookieout/cookieoutlen; a NULL dom_xml is treated
* as failure. */
VIR_DEBUG("Begin3 %p", domain->conn);
if (useParams) {
dom_xml = domain->conn->driver->domainMigrateBegin3Params
(domain, params, nparams, &cookieout, &cookieoutlen,
flags | protection);
} else {
dom_xml = domain->conn->driver->domainMigrateBegin3
(domain, xmlin, &cookieout, &cookieoutlen,
flags | protection, dname, bandwidth);
}
if (!dom_xml)
goto done;
...
}

可以看到,其调用driver的domainMigrateBegin3Params函数指针,即qemuDomainMigrateBegin3Params(),完成xml和cookie的生成,其核心逻辑在qemuMigrationSrcBeginXML()中

qemuMigrationSrcBeginXML

其代码如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
//#0  qemuMigrationSrcBeginXML (vm=0x7f9570011a20, xmlin=0x0, cookieout=cookieout@entry=0x7f959fc9a978, 
// cookieoutlen=cookieoutlen@entry=0x7f959fc9a96c, cookieFlags=2, migrate_disks=0x0, flags=257) at ../src/qemu/qemu_migration.c:2612
//#1 0x00007f959c38c411 in qemuMigrationSrcBeginPhase (driver=<optimized out>, vm=<optimized out>, xmlin=<optimized out>,
// dname=<optimized out>, cookieout=0x7f959fc9a978, cookieoutlen=0x7f959fc9a96c, migrate_disks=<optimized out>,
// migrate_disks_detect_zeroes=0x0, flags=257) at ../src/qemu/qemu_migration.c:2805
//#2 0x00007f959c38d929 in qemuMigrationSrcBegin (conn=0x7f95900029d0, vm=<optimized out>, xmlin=0x0, dname=0x0,
// cookieout=cookieout@entry=0x7f959fc9a978, cookieoutlen=cookieoutlen@entry=0x7f959fc9a96c, migrate_disks=0x0,
// migrate_disks_detect_zeroes=0x0, flags=257) at ../src/qemu/qemu_migration.c:2991
//#3 0x00007f959c35218a in qemuDomainMigrateBegin3Params (domain=0x7f9590005290, params=0x0, nparams=0, cookieout=0x7f959fc9a978,
// cookieoutlen=0x7f959fc9a96c, flags=257) at ../src/qemu/qemu_driver.c:10905
//#4 0x00007f95a434cb54 in virDomainMigrateBegin3Params (domain=domain@entry=0x7f9590005290, params=<optimized out>, nparams=0,
// cookieout=cookieout@entry=0x7f959fc9a978, cookieoutlen=cookieoutlen@entry=0x7f959fc9a96c, flags=257) at ../src/libvirt-domain.c:5266
//#5 0x000055b31f9f6d8b in remoteDispatchDomainMigrateBegin3Params (server=0x55b33b067080, msg=0x55b33b071e00, ret=0x7f9580003180,
// args=0x7f95800030f0, rerr=0x7f959fc9aa60, client=<optimized out>) at ../src/remote/remote_daemon_dispatch.c:5576
//#6 remoteDispatchDomainMigrateBegin3ParamsHelper (server=0x55b33b067080, client=<optimized out>, msg=0x55b33b071e00,
// rerr=0x7f959fc9aa60, args=0x7f95800030f0, ret=0x7f9580003180) at src/remote/remote_daemon_dispatch_stubs.h:8436
//#7 0x00007f95a42c3644 in virNetServerProgramDispatchCall (msg=0x55b33b071e00, client=0x55b33b09c4f0, server=0x55b33b067080,
// prog=0x55b33b06b410) at ../src/rpc/virnetserverprogram.c:423
//#8 virNetServerProgramDispatch (prog=0x55b33b06b410, server=server@entry=0x55b33b067080, client=0x55b33b09c4f0, msg=0x55b33b071e00)
// at ../src/rpc/virnetserverprogram.c:299
//#9 0x00007f95a42c93a8 in virNetServerProcessMsg (msg=<optimized out>, prog=<optimized out>, client=<optimized out>, srv=0x55b33b067080)
// at ../src/rpc/virnetserver.c:135
//#10 virNetServerHandleJob (jobOpaque=0x55b33b065c90, opaque=0x55b33b067080) at ../src/rpc/virnetserver.c:155
//#11 0x00007f95a420af63 in virThreadPoolWorker (opaque=<optimized out>) at ../src/util/virthreadpool.c:164
//#12 0x00007f95a420a5b9 in virThreadHelper (data=<optimized out>) at ../src/util/virthread.c:256
//#13 0x00007f95a3bae1f5 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
//#14 0x00007f95a3c2e89c in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

/*
 * qemuMigrationSrcBeginXML:
 *
 * Source-side helper for the Begin phase. It builds the migration cookie,
 * serialises it into @cookieout / @cookieoutlen, and then returns the domain
 * XML produced by qemuDomainDefFormatLive().
 *
 * If @xmlin is given, it is parsed and checked against @vm with
 * qemuDomainCheckABIStability(); the parsed definition is then formatted
 * (without an original CPU) instead of vm->def.
 *
 * Returns the XML string, or NULL if any step fails.
 */
static char *
qemuMigrationSrcBeginXML(virDomainObj *vm,
                         const char *xmlin,
                         char **cookieout,
                         int *cookieoutlen,
                         unsigned int cookieFlags,
                         const char **migrate_disks,
                         unsigned int flags)
{
    qemuDomainObjPrivate *priv = vm->privateData;
    virQEMUDriver *driver = priv->driver;
    g_autoptr(qemuMigrationCookie) mig = NULL;
    g_autoptr(virDomainDef) parsedDef = NULL;
    virDomainDef *xmlDef = NULL;
    virCPUDef *xmlCPU = NULL;

    /* Extend the requested cookie contents based on domain state / flags. */
    if (priv->origCPU)
        cookieFlags |= QEMU_MIGRATION_COOKIE_CPU;

    if (!(flags & VIR_MIGRATE_OFFLINE))
        cookieFlags |= QEMU_MIGRATION_COOKIE_CAPS;

    mig = qemuMigrationCookieNew(vm->def, priv->origname);
    if (!mig)
        return NULL;

    /* Dirty bitmaps are only collected when NBD is part of the cookie and
     * QEMU supports the block-bitmap-mapping migration parameter. */
    if (cookieFlags & QEMU_MIGRATION_COOKIE_NBD) {
        if (virQEMUCapsGet(priv->qemuCaps,
                           QEMU_CAPS_MIGRATION_PARAM_BLOCK_BITMAP_MAPPING)) {
            if (qemuMigrationSrcBeginPhaseBlockDirtyBitmaps(mig, vm,
                                                            migrate_disks) < 0)
                return NULL;
        }
    }

    if (qemuMigrationCookieFormat(mig, driver, vm,
                                  QEMU_MIGRATION_SOURCE,
                                  cookieout, cookieoutlen,
                                  cookieFlags) < 0) {
        return NULL;
    }

    if (xmlin) {
        /* Caller-supplied XML: parse it and verify it against the domain. */
        parsedDef = virDomainDefParseString(xmlin, driver->xmlopt,
                                            priv->qemuCaps,
                                            VIR_DOMAIN_DEF_PARSE_INACTIVE);
        if (!parsedDef)
            return NULL;

        if (!qemuDomainCheckABIStability(driver, vm, parsedDef))
            return NULL;

        xmlDef = parsedDef;
        xmlCPU = NULL;
    } else {
        xmlDef = vm->def;
        xmlCPU = priv->origCPU;
    }

    return qemuDomainDefFormatLive(driver, priv->qemuCaps, xmlDef, xmlCPU,
                                   false, true);
}

其逻辑比较清晰,可以分为两部分:即qemuMigrationCookieNew()和qemuMigrationCookieFormat()生成cookie信息;qemuDomainDefFormatLive()生成子机xml

生成cookie

libvirt使用struct _qemuMigrationCookie结构来表示热迁移cookie结构,其会包含源端/目的端热迁移的额外配置信息,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/* In-memory form of the migration cookie. The optional sections below are
* each tied to a QEMU_MIGRATION_COOKIE_* bit, as noted on the field. */
typedef struct _qemuMigrationCookie qemuMigrationCookie;
struct _qemuMigrationCookie {
/* Bitmask of QEMU_MIGRATION_COOKIE_* values (see per-field notes below). */
unsigned int flags;
/* Bitmask of QEMU_MIGRATION_COOKIE_* values.
* NOTE(review): presumably the features the receiving side must
* understand; confirm against the cookie parser. */
unsigned int flagsMandatory;

/* Host properties */
unsigned char localHostuuid[VIR_UUID_BUFLEN];
unsigned char remoteHostuuid[VIR_UUID_BUFLEN];
char *localHostname;
char *remoteHostname;

/* Guest properties */
unsigned char uuid[VIR_UUID_BUFLEN];
char *name;

/* If (flags & QEMU_MIGRATION_COOKIE_LOCKSTATE) */
char *lockState;
char *lockDriver;

/* If (flags & QEMU_MIGRATION_COOKIE_GRAPHICS) */
qemuMigrationCookieGraphics *graphics;

/* If (flags & QEMU_MIGRATION_COOKIE_PERSISTENT) */
virDomainDef *persistent;

/* If (flags & QEMU_MIGRATION_COOKIE_NETWORK) */
qemuMigrationCookieNetwork *network;

/* If (flags & QEMU_MIGRATION_COOKIE_NBD) */
qemuMigrationCookieNBD *nbd;

/* If (flags & QEMU_MIGRATION_COOKIE_STATS) */
virDomainJobData *jobData;

/* If (flags & QEMU_MIGRATION_COOKIE_CPU) */
virCPUDef *cpu;

/* If (flags & QEMU_MIGRATION_COOKIE_CAPS) */
qemuMigrationCookieCaps *caps;

/* If (flags & QEMU_MIGRATION_COOKIE_BLOCK_DIRTY_BITMAPS) */
GSList *blockDirtyBitmaps;
};

其会在qemuMigrationCookieFormat()中填充struct _qemuMigrationCookie结构体中flags指定的配置信息所对应的字段,并序列化为cookie,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/*
 * qemuMigrationCookieFormat:
 *
 * Populates the sections of @mig selected by @flags and serialises the
 * result as XML into a newly allocated string stored in @cookieout, with
 * its length (including the trailing NUL byte) stored in @cookieoutlen.
 *
 * If either output pointer is NULL there is nowhere to store the cookie and
 * the function succeeds without doing anything.
 *
 * Returns 0 on success, -1 if adding a section or the XML formatting fails.
 */
int
qemuMigrationCookieFormat(qemuMigrationCookie *mig,
                          virQEMUDriver *driver,
                          virDomainObj *dom,
                          qemuMigrationParty party,
                          char **cookieout,
                          int *cookieoutlen,
                          unsigned int flags)
{
    qemuDomainObjPrivate *domPriv = dom->privateData;
    g_auto(virBuffer) xml = VIR_BUFFER_INITIALIZER;

    if (cookieout == NULL || cookieoutlen == NULL)
        return 0;

    *cookieoutlen = 0;

    /* Optional sections, added in a fixed order. */
    if (flags & QEMU_MIGRATION_COOKIE_GRAPHICS) {
        if (qemuMigrationCookieAddGraphics(mig, driver, dom) < 0)
            return -1;
    }

    if (flags & QEMU_MIGRATION_COOKIE_LOCKSTATE) {
        if (qemuMigrationCookieAddLockstate(mig, driver, dom) < 0)
            return -1;
    }

    if (flags & QEMU_MIGRATION_COOKIE_NETWORK) {
        if (qemuMigrationCookieAddNetwork(mig, driver, dom) < 0)
            return -1;
    }

    if (flags & QEMU_MIGRATION_COOKIE_NBD) {
        if (qemuMigrationCookieAddNBD(mig, dom) < 0)
            return -1;
    }

    if (flags & QEMU_MIGRATION_COOKIE_STATS) {
        if (qemuMigrationCookieAddStatistics(mig, dom) < 0)
            return -1;
    }

    /* The hotplug bits carry no payload; they are only recorded as
     * mandatory features. */
    if (flags & QEMU_MIGRATION_COOKIE_MEMORY_HOTPLUG) {
        mig->flagsMandatory |= QEMU_MIGRATION_COOKIE_MEMORY_HOTPLUG;
    }

    if (flags & QEMU_MIGRATION_COOKIE_CPU_HOTPLUG) {
        mig->flagsMandatory |= QEMU_MIGRATION_COOKIE_CPU_HOTPLUG;
    }

    if (flags & QEMU_MIGRATION_COOKIE_CPU) {
        if (qemuMigrationCookieAddCPU(mig, dom) < 0)
            return -1;
    }

    if (flags & QEMU_MIGRATION_COOKIE_CAPS) {
        if (qemuMigrationCookieAddCaps(mig, dom, party) < 0)
            return -1;
    }

    if (qemuMigrationCookieXMLFormat(driver, domPriv->qemuCaps, &xml, mig) < 0)
        return -1;

    /* Take the length before the buffer is emptied; +1 covers the NUL. */
    *cookieoutlen = virBufferUse(&xml) + 1;
    *cookieout = virBufferContentAndReset(&xml);

    VIR_DEBUG("cookielen=%d cookie=%s", *cookieoutlen, *cookieout);

    return 0;
}

生成子机xml

libvirt会使用qemuDomainDefFormatLive()来生成子机xml字符串,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
/*
 * qemuDomainDefFormatLive:
 *
 * Formats @def as an XML string via qemuDomainDefFormatXMLInternal().
 * The format flags start from QEMU_DOMAIN_FORMAT_LIVE_FLAGS; @inactive adds
 * VIR_DOMAIN_XML_INACTIVE and @compatible adds VIR_DOMAIN_XML_MIGRATABLE.
 *
 * Returns whatever qemuDomainDefFormatXMLInternal() returns.
 */
char *
qemuDomainDefFormatLive(virQEMUDriver *driver,
                        virQEMUCaps *qemuCaps,
                        virDomainDef *def,
                        virCPUDef *origCPU,
                        bool inactive,
                        bool compatible)
{
    unsigned int fmtFlags = QEMU_DOMAIN_FORMAT_LIVE_FLAGS;

    fmtFlags |= inactive ? VIR_DOMAIN_XML_INACTIVE : 0;
    fmtFlags |= compatible ? VIR_DOMAIN_XML_MIGRATABLE : 0;

    return qemuDomainDefFormatXMLInternal(driver, qemuCaps, def, origCPU,
                                          fmtFlags);
}

/*
 * qemuDomainDefFormatXMLInternal:
 *
 * Formats @def into a temporary buffer with
 * qemuDomainDefFormatBufInternal() and returns the buffer content as a
 * string, or NULL if formatting failed.
 */
static char *
qemuDomainDefFormatXMLInternal(virQEMUDriver *driver,
                               virQEMUCaps *qemuCaps,
                               virDomainDef *def,
                               virCPUDef *origCPU,
                               unsigned int flags)
{
    g_auto(virBuffer) xml = VIR_BUFFER_INITIALIZER;
    int rc;

    rc = qemuDomainDefFormatBufInternal(driver, qemuCaps, def, origCPU,
                                        flags, &xml);
    if (rc < 0)
        return NULL;

    return virBufferContentAndReset(&xml);
}

/* Formats @def into @buf. Everything between the opening brace and the
* "format" label is left out of this excerpt (the "..." line). The visible
* tail converts @flags with virDomainDefFormatConvertXMLFlags() and hands
* @def, driver->xmlopt and @buf to virDomainDefFormatInternal(), returning
* that function's result. */
static int
qemuDomainDefFormatBufInternal(virQEMUDriver *driver,
virQEMUCaps *qemuCaps,
virDomainDef *def,
virCPUDef *origCPU,
unsigned int flags,
virBuffer *buf)
{
...
format:
return virDomainDefFormatInternal(def, driver->xmlopt, buf,
virDomainDefFormatConvertXMLFlags(flags));
}

/*
 * virDomainDefFormatInternal:
 *
 * Convenience wrapper around virDomainDefFormatInternalSetRootName() that
 * fixes the root element name to "domain". Returns that function's result.
 */
int
virDomainDefFormatInternal(virDomainDef *def,
                           virDomainXMLOption *xmlopt,
                           virBuffer *buf,
                           unsigned int flags)
{
    const char *rootname = "domain";

    return virDomainDefFormatInternalSetRootName(def, xmlopt, buf,
                                                 rootname, flags);
}


/* Format @def as XML into @buf, using @rootname as the name of the
* root element.
*
* This internal version appends to an existing buffer
* (possibly with auto-indent), rather than flattening
* to string.
* Return 0 on success, -1 on failure. */
int
virDomainDefFormatInternalSetRootName(virDomainDef *def,
virDomainXMLOption *xmlopt,
virBuffer *buf,
const char *rootname,
unsigned int flags)
{
unsigned char *uuid;
char uuidstr[VIR_UUID_STRING_BUFLEN];
const char *type = NULL;
int n;
size_t i;
/* When set, the firmware attribute and <firmware> features are left out
* of the <os> element below. */
bool migratable = !!(flags & VIR_DOMAIN_DEF_FORMAT_MIGRATABLE);

/* NOTE(review): presumably returns -1 when flags contains a bit outside
* this set; confirm against the virCheckFlags definition. */
virCheckFlags(VIR_DOMAIN_DEF_FORMAT_COMMON_FLAGS |
VIR_DOMAIN_DEF_FORMAT_STATUS |
VIR_DOMAIN_DEF_FORMAT_ACTUAL_NET |
VIR_DOMAIN_DEF_FORMAT_PCI_ORIG_STATES |
VIR_DOMAIN_DEF_FORMAT_CLOCK_ADJUST |
VIR_DOMAIN_DEF_FORMAT_VOLUME_TRANSLATED,
-1);

if (!(type = virDomainVirtTypeToString(def->virtType))) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("unexpected domain type %1$d"), def->virtType);
return -1;
}

/* When changing this condition, beware that tests such as qemuxml*test
* were optimized based on this predicate and may need to be fixed. */
if (def->id == -1)
flags |= VIR_DOMAIN_DEF_FORMAT_INACTIVE;

/* Opening root tag; the id attribute is only written for definitions
* that are not being formatted as inactive. */
virBufferAsprintf(buf, "<%s type='%s'", rootname, type);
if (!(flags & VIR_DOMAIN_DEF_FORMAT_INACTIVE))
virBufferAsprintf(buf, " id='%d'", def->id);
if (def->namespaceData && def->ns.format)
virXMLNamespaceFormatNS(buf, &def->ns);
virBufferAddLit(buf, ">\n");
virBufferAdjustIndent(buf, 2);

/* General metadata: name, uuid, optional genid, title, description. */
virBufferEscapeString(buf, "<name>%s</name>\n", def->name);

uuid = def->uuid;
virUUIDFormat(uuid, uuidstr);
virBufferAsprintf(buf, "<uuid>%s</uuid>\n", uuidstr);

if (def->genidRequested) {
char genidstr[VIR_UUID_STRING_BUFLEN];

virUUIDFormat(def->genid, genidstr);
virBufferAsprintf(buf, "<genid>%s</genid>\n", genidstr);
}

virBufferEscapeString(buf, "<title>%s</title>\n", def->title);

virBufferEscapeString(buf, "<description>%s</description>\n",
def->description);

if (virXMLFormatMetadata(buf, def->metadata) < 0)
return -1;

/* Memory: <maxMemory> only when virDomainDefHasMemoryHotplug() says so,
* then <memory> and <currentMemory>, all in KiB. */
if (virDomainDefHasMemoryHotplug(def)) {
g_auto(virBuffer) attrBuf = VIR_BUFFER_INITIALIZER;
g_auto(virBuffer) contentBuf = VIR_BUFFER_INITIALIZER;

if (def->mem.memory_slots > 0)
virBufferAsprintf(&attrBuf, " slots='%u'", def->mem.memory_slots);

virBufferAddLit(&attrBuf, " unit='KiB'");
virBufferAsprintf(&contentBuf, "%llu", def->mem.max_memory);

virXMLFormatElementInternal(buf, "maxMemory", &attrBuf, &contentBuf, false, false);
}

virBufferAddLit(buf, "<memory");
if (def->mem.dump_core)
virBufferAsprintf(buf, " dumpCore='%s'",
virTristateSwitchTypeToString(def->mem.dump_core));
virBufferAsprintf(buf, " unit='KiB'>%llu</memory>\n",
virDomainDefGetMemoryTotal(def));

virBufferAsprintf(buf, "<currentMemory unit='KiB'>%llu</currentMemory>\n",
def->mem.cur_balloon);

/* Tuning and resource sections, each delegated to its own formatter. */
virDomainDefFormatBlkiotune(buf, def);

virDomainMemtuneFormat(buf, &def->mem);
virDomainMemorybackingFormat(buf, &def->mem);

if (virDomainCpuDefFormat(buf, def) < 0)
return -1;

virDomainDefIOThreadsFormat(buf, def);

if (virDomainCputuneDefFormat(buf, def, flags) < 0)
return -1;

if (virDomainNumatuneFormatXML(buf, def->numa) < 0)
return -1;

virDomainResourceDefFormat(buf, def->resource);

for (i = 0; i < def->nsysinfo; i++) {
if (virSysinfoFormat(buf, def->sysinfo[i]) < 0)
return -1;
}

/* <bootloader> and <bootloader_args> precede the <os> element. */
if (def->os.bootloader) {
virBufferEscapeString(buf, "<bootloader>%s</bootloader>\n",
def->os.bootloader);
virBufferEscapeString(buf,
"<bootloader_args>%s</bootloader_args>\n",
def->os.bootloaderArgs);
}

/* <os> element. */
virBufferAddLit(buf, "<os");
if (def->os.firmware && !migratable)
virBufferAsprintf(buf, " firmware='%s'",
virDomainOsDefFirmwareTypeToString(def->os.firmware));
virBufferAddLit(buf, ">\n");
virBufferAdjustIndent(buf, 2);
virBufferAddLit(buf, "<type");
if (def->os.arch)
virBufferAsprintf(buf, " arch='%s'", virArchToString(def->os.arch));
if (def->os.machine)
virBufferAsprintf(buf, " machine='%s'", def->os.machine);
/*
* HACK: For xen driver we previously used bogus 'linux' as the
* os type for paravirt, whereas capabilities declare it to
* be 'xen'. So we convert to the former for backcompat
*/
if (def->virtType == VIR_DOMAIN_VIRT_XEN &&
def->os.type == VIR_DOMAIN_OSTYPE_XEN)
virBufferAsprintf(buf, ">%s</type>\n",
virDomainOSTypeToString(VIR_DOMAIN_OSTYPE_LINUX));
else
virBufferAsprintf(buf, ">%s</type>\n",
virDomainOSTypeToString(def->os.type));

/* Firmware features: entries left at VIR_TRISTATE_BOOL_ABSENT are skipped. */
if (def->os.firmwareFeatures && !migratable) {
virBufferAddLit(buf, "<firmware>\n");
virBufferAdjustIndent(buf, 2);

for (i = 0; i < VIR_DOMAIN_OS_DEF_FIRMWARE_FEATURE_LAST; i++) {
if (def->os.firmwareFeatures[i] == VIR_TRISTATE_BOOL_ABSENT)
continue;

virBufferAsprintf(buf, "<feature enabled='%s' name='%s'/>\n",
virTristateBoolTypeToString(def->os.firmwareFeatures[i]),
virDomainOsDefFirmwareFeatureTypeToString(i));
}

virBufferAdjustIndent(buf, -2);

virBufferAddLit(buf, "</firmware>\n");
}

/* Init process settings; initargv and initenv are NULL-terminated arrays. */
virBufferEscapeString(buf, "<init>%s</init>\n",
def->os.init);
for (i = 0; def->os.initargv && def->os.initargv[i]; i++)
virBufferEscapeString(buf, "<initarg>%s</initarg>\n",
def->os.initargv[i]);
for (i = 0; def->os.initenv && def->os.initenv[i]; i++)
virBufferAsprintf(buf, "<initenv name='%s'>%s</initenv>\n",
def->os.initenv[i]->name, def->os.initenv[i]->value);
virBufferEscapeString(buf, "<initdir>%s</initdir>\n",
def->os.initdir);
if (def->os.inituser)
virBufferAsprintf(buf, "<inituser>%s</inituser>\n", def->os.inituser);
if (def->os.initgroup)
virBufferAsprintf(buf, "<initgroup>%s</initgroup>\n", def->os.initgroup);

/* Loader and direct kernel boot settings. */
if (def->os.loader &&
virDomainLoaderDefFormat(buf, def->os.loader, xmlopt, flags) < 0)
return -1;
virBufferEscapeString(buf, "<kernel>%s</kernel>\n",
def->os.kernel);
virBufferEscapeString(buf, "<initrd>%s</initrd>\n",
def->os.initrd);
virBufferEscapeString(buf, "<cmdline>%s</cmdline>\n",
def->os.cmdline);
virBufferEscapeString(buf, "<dtb>%s</dtb>\n",
def->os.dtb);
virBufferEscapeString(buf, "<root>%s</root>\n",
def->os.root);
if (def->os.slic_table) {
virBufferAddLit(buf, "<acpi>\n");
virBufferAdjustIndent(buf, 2);
virBufferEscapeString(buf, "<table type='slic'>%s</table>\n",
def->os.slic_table);
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</acpi>\n");
}

/* Boot devices, boot menu and BIOS settings are written only when no
* bootloader is configured. */
if (!def->os.bootloader) {
for (n = 0; n < def->os.nBootDevs; n++) {
const char *boottype =
virDomainBootTypeToString(def->os.bootDevs[n]);
if (!boottype) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("unexpected boot device type %1$d"),
def->os.bootDevs[n]);
return -1;
}
virBufferAsprintf(buf, "<boot dev='%s'/>\n", boottype);
}

if (def->os.bootmenu) {
virBufferAsprintf(buf, "<bootmenu enable='%s'",
virTristateBoolTypeToString(def->os.bootmenu));
if (def->os.bm_timeout_set)
virBufferAsprintf(buf, " timeout='%u'", def->os.bm_timeout);
virBufferAddLit(buf, "/>\n");
}

if (def->os.bios.useserial || def->os.bios.rt_set) {
virBufferAddLit(buf, "<bios");
if (def->os.bios.useserial)
virBufferAsprintf(buf, " useserial='%s'",
virTristateBoolTypeToString(def->os.bios.useserial));
if (def->os.bios.rt_set)
virBufferAsprintf(buf, " rebootTimeout='%d'", def->os.bios.rt_delay);

virBufferAddLit(buf, "/>\n");
}
}

if (def->os.smbios_mode) {
const char *mode;

mode = virDomainSmbiosModeTypeToString(def->os.smbios_mode);
if (mode == NULL) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("unexpected smbios mode %1$d"), def->os.smbios_mode);
return -1;
}
virBufferAsprintf(buf, "<smbios mode='%s'/>\n", mode);
}

virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</os>\n");


/* <idmap>: the whole element, gid entries included, is gated on the
* uidmap pointer alone. */
if (def->idmap.uidmap) {
virBufferAddLit(buf, "<idmap>\n");
virBufferAdjustIndent(buf, 2);
for (i = 0; i < def->idmap.nuidmap; i++) {
virBufferAsprintf(buf,
"<uid start='%u' target='%u' count='%u'/>\n",
def->idmap.uidmap[i].start,
def->idmap.uidmap[i].target,
def->idmap.uidmap[i].count);
}
for (i = 0; i < def->idmap.ngidmap; i++) {
virBufferAsprintf(buf,
"<gid start='%u' target='%u' count='%u'/>\n",
def->idmap.gidmap[i].start,
def->idmap.gidmap[i].target,
def->idmap.gidmap[i].count);
}
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</idmap>\n");
}

/* Features, CPU, clock and lifecycle actions. */
if (virDomainDefFormatFeatures(buf, def) < 0)
return -1;

if (virCPUDefFormatBufFull(buf, def->cpu, def->numa) < 0)
return -1;

virDomainClockDefFormat(buf, &def->clock, flags);

if (virDomainEventActionDefFormat(buf, def->onPoweroff,
"on_poweroff",
virDomainLifecycleActionTypeToString) < 0)
return -1;
if (virDomainEventActionDefFormat(buf, def->onReboot,
"on_reboot",
virDomainLifecycleActionTypeToString) < 0)
return -1;
if (virDomainEventActionDefFormat(buf, def->onCrash,
"on_crash",
virDomainLifecycleActionTypeToString) < 0)
return -1;
/* on_lockfailure is only written when it differs from the default. */
if (def->onLockFailure != VIR_DOMAIN_LOCK_FAILURE_DEFAULT &&
virDomainEventActionDefFormat(buf, def->onLockFailure,
"on_lockfailure",
virDomainLockFailureTypeToString) < 0)
return -1;

if (def->pm.s3 || def->pm.s4) {
virBufferAddLit(buf, "<pm>\n");
virBufferAdjustIndent(buf, 2);
if (def->pm.s3) {
virBufferAsprintf(buf, "<suspend-to-mem enabled='%s'/>\n",
virTristateBoolTypeToString(def->pm.s3));
}
if (def->pm.s4) {
virBufferAsprintf(buf, "<suspend-to-disk enabled='%s'/>\n",
virTristateBoolTypeToString(def->pm.s4));
}
virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</pm>\n");
}

virDomainPerfDefFormat(buf, &def->perf);

/* <devices>: the emulator path followed by each device class in a fixed
* order, one formatter call per device. */
virBufferAddLit(buf, "<devices>\n");
virBufferAdjustIndent(buf, 2);

virBufferEscapeString(buf, "<emulator>%s</emulator>\n",
def->emulator);

for (n = 0; n < def->ndisks; n++)
if (virDomainDiskDefFormat(buf, def->disks[n], flags, xmlopt) < 0)
return -1;

for (n = 0; n < def->ncontrollers; n++)
if (virDomainControllerDefFormat(buf, def->controllers[n], flags) < 0)
return -1;

for (n = 0; n < def->nleases; n++)
virDomainLeaseDefFormat(buf, def->leases[n]);

for (n = 0; n < def->nfss; n++)
if (virDomainFSDefFormat(buf, def->fss[n], flags) < 0)
return -1;

for (n = 0; n < def->nnets; n++)
if (virDomainNetDefFormat(buf, def->nets[n], xmlopt, flags) < 0)
return -1;

for (n = 0; n < def->nsmartcards; n++)
if (virDomainSmartcardDefFormat(buf, def->smartcards[n], flags) < 0)
return -1;

for (n = 0; n < def->nserials; n++)
if (virDomainChrDefFormat(buf, def->serials[n], flags) < 0)
return -1;

for (n = 0; n < def->nparallels; n++)
if (virDomainChrDefFormat(buf, def->parallels[n], flags) < 0)
return -1;

/* Consoles are formatted from a by-value struct copy so that the serial
* back-compat case can override deviceType/targetType without touching
* the definition itself. */
for (n = 0; n < def->nconsoles; n++) {
virDomainChrDef console;
/* Back compat, ignore the console element for hvm guests
* if it is type == serial
*/
if (def->os.type == VIR_DOMAIN_OSTYPE_HVM &&
(def->consoles[n]->targetType == VIR_DOMAIN_CHR_CONSOLE_TARGET_TYPE_SERIAL ||
def->consoles[n]->targetType == VIR_DOMAIN_CHR_CONSOLE_TARGET_TYPE_NONE) &&
(n < def->nserials)) {
memcpy(&console, def->serials[n], sizeof(console));
console.deviceType = VIR_DOMAIN_CHR_DEVICE_TYPE_CONSOLE;
console.targetType = VIR_DOMAIN_CHR_CONSOLE_TARGET_TYPE_SERIAL;
} else {
memcpy(&console, def->consoles[n], sizeof(console));
}
if (virDomainChrDefFormat(buf, &console, flags) < 0)
return -1;
}

for (n = 0; n < def->nchannels; n++)
if (virDomainChrDefFormat(buf, def->channels[n], flags) < 0)
return -1;

for (n = 0; n < def->ninputs; n++) {
if (virDomainInputDefFormat(buf, def->inputs[n], flags) < 0)
return -1;
}

for (n = 0; n < def->ntpms; n++) {
if (virDomainTPMDefFormat(buf, def->tpms[n], flags, xmlopt) < 0)
return -1;
}

for (n = 0; n < def->ngraphics; n++) {
if (virDomainGraphicsDefFormat(buf, def->graphics[n], flags) < 0)
return -1;
}

for (n = 0; n < def->nsounds; n++) {
if (virDomainSoundDefFormat(buf, def->sounds[n], flags) < 0)
return -1;
}

for (n = 0; n < def->naudios; n++) {
if (virDomainAudioDefFormat(buf, def->audios[n]) < 0)
return -1;
}

for (n = 0; n < def->nvideos; n++) {
if (virDomainVideoDefFormat(buf, def->videos[n], flags) < 0)
return -1;
}

for (n = 0; n < def->nhostdevs; n++) {
/* If parentnet != NONE, this is just a pointer to the
* hostdev in a higher-level device (e.g. virDomainNetDef),
* and will have already been formatted there.
*/
if (!def->hostdevs[n]->parentnet &&
virDomainHostdevDefFormat(buf, def->hostdevs[n], flags, xmlopt) < 0) {
return -1;
}
}

for (n = 0; n < def->nredirdevs; n++) {
if (virDomainRedirdevDefFormat(buf, def->redirdevs[n], flags) < 0)
return -1;
}

if (def->redirfilter)
virDomainRedirFilterDefFormat(buf, def->redirfilter);

for (n = 0; n < def->nhubs; n++) {
if (virDomainHubDefFormat(buf, def->hubs[n], flags) < 0)
return -1;
}

for (n = 0; n < def->nwatchdogs; n++)
virDomainWatchdogDefFormat(buf, def->watchdogs[n], flags);

if (def->memballoon)
virDomainMemballoonDefFormat(buf, def->memballoon, flags);

/* Unlike the "< 0" checks elsewhere, any non-zero result from the RNG
* formatter is treated as failure. */
for (n = 0; n < def->nrngs; n++) {
if (virDomainRNGDefFormat(buf, def->rngs[n], flags))
return -1;
}

if (def->nvram)
virDomainNVRAMDefFormat(buf, def->nvram, flags);

for (n = 0; n < def->npanics; n++)
virDomainPanicDefFormat(buf, def->panics[n]);

for (n = 0; n < def->nshmems; n++)
virDomainShmemDefFormat(buf, def->shmems[n], flags);

for (n = 0; n < def->nmems; n++) {
if (virDomainMemoryDefFormat(buf, def->mems[n], flags) < 0)
return -1;
}

for (n = 0; n < def->ncryptos; n++) {
virDomainCryptoDefFormat(buf, def->cryptos[n], flags);
}
if (def->iommu)
virDomainIOMMUDefFormat(buf, def->iommu);

if (def->vsock)
virDomainVsockDefFormat(buf, def->vsock);

if (def->pstore)
virDomainPstoreDefFormat(buf, def->pstore, flags);

virBufferAdjustIndent(buf, -2);
virBufferAddLit(buf, "</devices>\n");

/* Trailing sections: security labels, keywrap, launch security and any
* driver-specific namespace data. */
for (n = 0; n < def->nseclabels; n++)
virSecurityLabelDefFormat(buf, def->seclabels[n], flags);

if (def->keywrap)
virDomainKeyWrapDefFormat(buf, def->keywrap);

virDomainSecDefFormat(buf, def->sec);

if (def->namespaceData && def->ns.format) {
if ((def->ns.format)(buf, def->namespaceData) < 0)
return -1;
}

/* Close the root element opened at the top. */
virBufferAdjustIndent(buf, -2);
virBufferAsprintf(buf, "</%s>\n", rootname);

return 0;
}

可以看到,其基于源端子机的struct _virDomainDef,生成对应的xml字符串

prepare阶段

概述

prepare阶段则是接收源端发送的cookie,在目的端基于前面生成的xml创建一个等待源端输入的子机,并再生成一个包含目的端prepare阶段热迁移额外配置信息的cookie返回给源端,流程如下所示

热迁移prepare阶段流程图

prepare阶段的代码如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
/* Prepare-phase excerpt of virDomainMigrateVersion3Full(). The "..." lines
* mark code that is left out before and after this part, including the
* "done", "confirm" and "finish" labels targeted by the gotos below. */
static virDomainPtr
virDomainMigrateVersion3Full(virDomainPtr domain,
virConnectPtr dconn,
const char *xmlin,
const char *dname,
const char *uri,
unsigned long long bandwidth,
virTypedParameterPtr params,
int nparams,
bool useParams,
unsigned int flags)
{
...
/* Query the current domain state on the source. */
if (useParams) {
/* If source is new enough to support extensible migration parameters,
* it's certainly new enough to support virDomainGetState. */
ret = virDomainGetState(domain, &state, NULL, 0);
} else {
ret = virDomainGetInfo(domain, &info);
state = info.state;
}
/* A paused source domain adds VIR_MIGRATE_PAUSED to the flags, unless this
* is a post-copy resume. A failed state query is silently ignored here. */
if (ret == 0 &&
state == VIR_DOMAIN_PAUSED &&
!(flags & VIR_MIGRATE_POSTCOPY_RESUME))
flags |= VIR_MIGRATE_PAUSED;

/* ABORT_ON_ERROR and AUTO_CONVERGE are stripped from the flags that are
* passed to the destination. */
destflags = flags & ~(VIR_MIGRATE_ABORT_ON_ERROR |
VIR_MIGRATE_AUTO_CONVERGE);

VIR_DEBUG("Prepare3 %p flags=0x%x", dconn, destflags);
/* The cookie produced by Begin becomes Prepare's input cookie; cookieout
* is emptied so it can receive the cookie generated by the destination. */
cookiein = g_steal_pointer(&cookieout);
cookieinlen = cookieoutlen;
cookieoutlen = 0;
if (useParams) {
/* With typed parameters the XML from Begin travels as
* VIR_MIGRATE_PARAM_DEST_XML; the legacy call takes it as an argument. */
if (virTypedParamsReplaceString(&params, &nparams,
VIR_MIGRATE_PARAM_DEST_XML,
dom_xml) < 0)
goto done;
ret = dconn->driver->domainMigratePrepare3Params
(dconn, params, nparams, cookiein, cookieinlen,
&cookieout, &cookieoutlen, &uri_out, destflags);
} else {
ret = dconn->driver->domainMigratePrepare3
(dconn, cookiein, cookieinlen, &cookieout, &cookieoutlen,
uri, &uri_out, destflags, dname, bandwidth, dom_xml);
}
if (ret == -1) {
if (protection) {
/* Begin already started a migration job so we need to cancel it by
* calling Confirm while making sure it doesn't overwrite the error
*/
virErrorPreserveLast(&orig_err);
goto confirm;
} else {
goto done;
}
}

/* Did domainMigratePrepare3 change URI? */
if (uri_out) {
/* Use the URI returned by Prepare, and mirror it into the typed
* parameters when those are in use. */
uri = uri_out;
if (useParams &&
virTypedParamsReplaceString(&params, &nparams,
VIR_MIGRATE_PARAM_URI,
uri_out) < 0) {
virErrorPreserveLast(&orig_err);
goto finish;
}
} else if (!uri &&
virTypedParamsGetString(params, nparams,
VIR_MIGRATE_PARAM_URI, &uri) <= 0) {
/* No URI from the caller, from Prepare, or from the parameters. */
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("domainMigratePrepare3 did not set uri"));
virErrorPreserveLast(&orig_err);
goto finish;
}

/* Offline migration has nothing to transfer: drop the cookie returned by
* Prepare, mark the migration as not cancelled and jump to Finish. */
if (flags & VIR_MIGRATE_OFFLINE) {
VIR_DEBUG("Offline migration, skipping Perform phase");
VIR_FREE(cookieout);
cookieoutlen = 0;
cancelled = 0;
goto finish;
}
...
}

可以看到,其调用dconn->driver->domainMigratePrepare3Params函数指针,即qemuDomainMigratePrepare3Params(),其基于前面begin阶段生成的子机xml,处理源端传入的cookie信息,创建一个等待源端热迁移输入的子机,并重新生成一个给源端处理的包含目的端prepare阶段热迁移额外配置信息的cookie,其中核心逻辑在qemuMigrationDstPrepareFresh(),如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
/*
 * Excerpt of qemuMigrationDstPrepareFresh(); declarations, job handling and
 * the cleanup/stopjob labels are elided ("...").
 *
 * Visible flow: parse the incoming cookie, add the domain to the driver's
 * domain list, prepare the destination for a non-offline migration and
 * format the outgoing cookie.
 */
static int
qemuMigrationDstPrepareFresh(virQEMUDriver *driver,
virConnectPtr dconn,
const char *cookiein,
int cookieinlen,
char **cookieout,
int *cookieoutlen,
virDomainDef **def,
const char *origname,
virStreamPtr st,
const char *protocol,
unsigned short port,
bool autoPort,
const char *listenAddress,
const char **migrate_disks,
int nbdPort,
const char *nbdURI,
qemuMigrationParams *migParams,
unsigned int flags)
{
...
/* Parse cookie earlier than adding the domain onto the
* domain list. Parsing/validation may fail and there's no
* point in having the domain in the list at that point. */
if (!(mig = qemuMigrationCookieParse(driver, NULL, *def, origname, NULL,
cookiein, cookieinlen,
QEMU_MIGRATION_COOKIE_LOCKSTATE |
QEMU_MIGRATION_COOKIE_NBD |
QEMU_MIGRATION_COOKIE_MEMORY_HOTPLUG |
QEMU_MIGRATION_COOKIE_CPU_HOTPLUG |
QEMU_MIGRATION_COOKIE_CPU |
QEMU_MIGRATION_COOKIE_CAPS |
QEMU_MIGRATION_COOKIE_BLOCK_DIRTY_BITMAPS)))
goto cleanup;

if (!(vm = virDomainObjListAdd(driver->domains, def,
driver->xmlopt,
VIR_DOMAIN_OBJ_LIST_ADD_LIVE |
VIR_DOMAIN_OBJ_LIST_ADD_CHECK_LIVE,
NULL)))
goto cleanup;
...
/* Offline migration skips the active-destination preparation entirely. */
if (!(flags & VIR_MIGRATE_OFFLINE)) {
if (qemuMigrationDstPrepareActive(driver, vm, dconn, mig, st,
protocol, port, listenAddress,
migrate_disks,
nbdPort, nbdURI,
migParams, flags) < 0) {
goto stopjob;
}

/* NBD data goes into the outgoing cookie only when the source sent NBD
 * data and one of the non-shared storage flags is set. */
if (mig->nbd &&
flags & (VIR_MIGRATE_NON_SHARED_DISK | VIR_MIGRATE_NON_SHARED_INC))
cookieFlags |= QEMU_MIGRATION_COOKIE_NBD;
}

if (qemuMigrationCookieFormat(mig, driver, vm,
QEMU_MIGRATION_DESTINATION,
cookieout, cookieoutlen, cookieFlags) < 0) {
/* We could tear down the whole guest here, but
* cookie data is (so far) non-critical, so that
* seems a little harsh. We'll just warn for now.
*/
VIR_WARN("Unable to encode migration cookie");
}
...
}

其会调用qemuMigrationCookieParse()处理源端的cookie信息,获取源端热迁移的能力信息;然后调用qemuMigrationDstPrepareActive()创建热迁移目的端的子机;最后调用qemuMigrationCookieFormat()生成包含目的端prepare阶段额外热迁移能力信息的cookie

处理cookie

即根据flags反序列化源端传入的序列化的cookie数据中对应的结构,从而完成源端begin阶段额外热迁移配置解析,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
/*
 * Excerpt of qemuMigrationCookieParse(); parts of the body are elided
 * ("...").
 *
 * Creates a cookie object from @def and @origname and, when @cookiein is
 * non-NULL and @cookieinlen is non-zero, parses the cookie XML into it
 * according to @flags.
 *
 * Returns the new cookie (handed to the caller via g_steal_pointer), or NULL
 * when creation or parsing fails.
 */
qemuMigrationCookie *
qemuMigrationCookieParse(virQEMUDriver *driver,
virDomainObj *vm,
const virDomainDef *def,
const char *origname,
virQEMUCaps *qemuCaps,
const char *cookiein,
int cookieinlen,
unsigned int flags)
{
g_autoptr(qemuMigrationCookie) mig = NULL;
...
if (!(mig = qemuMigrationCookieNew(def, origname)))
return NULL;

/* An absent or empty cookie is not an error; parsing is simply skipped. */
if (cookiein && cookieinlen &&
qemuMigrationCookieXMLParseStr(mig, driver, qemuCaps, cookiein, flags) < 0)
return NULL;
...
return g_steal_pointer(&mig);
}

/*
 * Excerpt of qemuMigrationCookieXMLParseStr(); local declarations are elided
 * ("...").
 *
 * Parses the cookie string @xml into an XML document with an XPath context
 * and forwards that context to qemuMigrationCookieXMLParse().
 *
 * Returns -1 when the string cannot be parsed, otherwise the result of
 * qemuMigrationCookieXMLParse().
 */
static int
qemuMigrationCookieXMLParseStr(qemuMigrationCookie *mig,
virQEMUDriver *driver,
virQEMUCaps *qemuCaps,
const char *xml,
unsigned int flags)
{
...
if (!(doc = virXMLParseStringCtxt(xml, _("(qemu_migration_cookie)"), &ctxt)))
return -1;

return qemuMigrationCookieXMLParse(mig, driver, qemuCaps, ctxt, flags);
}

/*
 * Validate a migration cookie and load its optional sections into @mig.
 *
 * @mig:      cookie object to fill; its name, uuid and localHostuuid are
 *            compared against the incoming data
 * @driver:   QEMU driver; its xmlopt is used when parsing a persistent
 *            domain definition
 * @qemuCaps: passed through to virDomainDefParseNode()
 * @ctxt:     XPath context of the parsed cookie document
 * @flags:    QEMU_MIGRATION_COOKIE_* bits selecting which optional sections
 *            are parsed
 *
 * The name, uuid, hostname and hostuuid elements are mandatory. A cookie
 * whose hostuuid equals the local host UUID is rejected.
 *
 * Returns 0 on success, -1 on any missing, mismatching or unparsable data.
 */
static int
qemuMigrationCookieXMLParse(qemuMigrationCookie *mig,
virQEMUDriver *driver,
virQEMUCaps *qemuCaps,
xmlXPathContextPtr ctxt,
unsigned int flags)
{
g_autofree char *name = NULL;
g_autofree char *uuid = NULL;
g_autofree char *hostuuid = NULL;
char localdomuuid[VIR_UUID_STRING_BUFLEN];

/* We don't store the uuid, name, hostname, or hostuuid
* values. We just compare them to local data to do some
* sanity checking on migration operation
*/
/* NOTE(review): despite the comment above, the hostname and the parsed host
 * UUID are stored below in mig->remoteHostname and mig->remoteHostuuid. */

/* Extract domain name */
if (!(name = virXPathString("string(./name[1])", ctxt))) {
virReportError(VIR_ERR_INTERNAL_ERROR,
"%s", _("missing name element in migration data"));
return -1;
}
if (STRNEQ(name, mig->name)) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("Incoming cookie data had unexpected name %1$s vs %2$s"),
name, mig->name);
return -1;
}

/* Extract domain uuid */
if (!(uuid = virXPathString("string(./uuid[1])", ctxt))) {
virReportError(VIR_ERR_INTERNAL_ERROR,
"%s", _("missing uuid element in migration data"));
return -1;
}
/* The comparison is done on the string form of the local UUID. */
virUUIDFormat(mig->uuid, localdomuuid);
if (STRNEQ(uuid, localdomuuid)) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("Incoming cookie data had unexpected UUID %1$s vs %2$s"),
uuid, localdomuuid);
return -1;
}

if (!(mig->remoteHostname = virXPathString("string(./hostname[1])", ctxt))) {
virReportError(VIR_ERR_INTERNAL_ERROR,
"%s", _("missing hostname element in migration data"));
return -1;
}
/* Historically, this is the place where we checked whether remoteHostname
* and localHostname are the same. But even if they were, it doesn't mean
* the domain is migrating onto the same host. Rely on UUID which can tell
* for sure. */

/* Check & forbid localhost migration */
if (!(hostuuid = virXPathString("string(./hostuuid[1])", ctxt))) {
virReportError(VIR_ERR_INTERNAL_ERROR,
"%s", _("missing hostuuid element in migration data"));
return -1;
}
if (virUUIDParse(hostuuid, mig->remoteHostuuid) < 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
"%s", _("malformed hostuuid element in migration data"));
return -1;
}
if (memcmp(mig->remoteHostuuid, mig->localHostuuid, VIR_UUID_BUFLEN) == 0) {
virReportError(VIR_ERR_INTERNAL_ERROR,
_("Attempt to migrate guest to the same host %1$s"),
hostuuid);
return -1;
}

if (qemuMigrationCookieXMLParseMandatoryFeatures(ctxt, flags) < 0)
return -1;

/* Most optional sections below are parsed only when both the matching flag
 * is set and the element is present in the cookie. */
if ((flags & QEMU_MIGRATION_COOKIE_GRAPHICS) &&
virXPathBoolean("count(./graphics) > 0", ctxt) &&
(!(mig->graphics = qemuMigrationCookieGraphicsXMLParse(ctxt))))
return -1;

if ((flags & QEMU_MIGRATION_COOKIE_LOCKSTATE) &&
virXPathBoolean("count(./lockstate) > 0", ctxt)) {
g_autofree char *lockState = NULL;

mig->lockDriver = virXPathString("string(./lockstate[1]/@driver)", ctxt);
if (!mig->lockDriver) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("Missing lock driver name in migration cookie"));
return -1;
}

/* An empty leases string is not stored; mig->lockState is left untouched. */
lockState = virXPathString("string(./lockstate[1]/leases[1])", ctxt);
if (STRNEQ_NULLABLE(lockState, ""))
mig->lockState = g_steal_pointer(&lockState);
}

if ((flags & QEMU_MIGRATION_COOKIE_PERSISTENT) &&
virXPathBoolean("count(./domain) > 0", ctxt)) {
VIR_XPATH_NODE_AUTORESTORE(ctxt)
g_autofree xmlNodePtr *nodes = NULL;

/* Exactly one <domain> element is accepted. */
if ((virXPathNodeSet("./domain", ctxt, &nodes)) != 1) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("Too many domain elements in migration cookie"));
return -1;
}

ctxt->node = nodes[0];

mig->persistent = virDomainDefParseNode(ctxt, driver->xmlopt, qemuCaps,
VIR_DOMAIN_DEF_PARSE_INACTIVE |
VIR_DOMAIN_DEF_PARSE_ABI_UPDATE_MIGRATION |
VIR_DOMAIN_DEF_PARSE_SKIP_VALIDATE);
if (!mig->persistent)
return -1;
}

if ((flags & QEMU_MIGRATION_COOKIE_NETWORK) &&
virXPathBoolean("count(./network) > 0", ctxt) &&
(!(mig->network = qemuMigrationCookieNetworkXMLParse(ctxt))))
return -1;

if (flags & QEMU_MIGRATION_COOKIE_NBD &&
virXPathBoolean("boolean(./nbd)", ctxt) &&
(!(mig->nbd = qemuMigrationCookieNBDXMLParse(ctxt))))
return -1;

if (flags & QEMU_MIGRATION_COOKIE_STATS &&
virXPathBoolean("boolean(./statistics)", ctxt) &&
(!(mig->jobData = qemuMigrationCookieStatisticsXMLParse(ctxt))))
return -1;

/* Unlike the sections above, the CPU and capabilities parsers are invoked
 * whenever their flag is set, without a prior element-presence check here. */
if (flags & QEMU_MIGRATION_COOKIE_CPU &&
virCPUDefParseXML(ctxt, "./cpu[1]", VIR_CPU_TYPE_GUEST, &mig->cpu,
false) < 0)
return -1;

if (flags & QEMU_MIGRATION_COOKIE_CAPS &&
!(mig->caps = qemuMigrationCookieCapsXMLParse(ctxt)))
return -1;

if (flags & QEMU_MIGRATION_COOKIE_BLOCK_DIRTY_BITMAPS &&
virXPathBoolean("boolean(./blockDirtyBitmaps)", ctxt) &&
qemuMigrationCookieBlockDirtyBitmapsParse(ctxt, mig) < 0)
return -1;

return 0;
}

可以看到,其基本就是qemuMigrationCookieFormat的逆操作

生成子机

即生成一个等待后续源端输入的子机,因此除了libvirt外还涉及qemu部分的逻辑

libvirt

libvirt的核心逻辑在qemuMigrationDstPrepareActive()中,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
/*
 * Excerpt of qemuMigrationDstPrepareActive(); everything after "ret = 0",
 * including the error label and its cleanup, is elided ("...").
 *
 * Visible flow: prepare storage and (for tunnelled migration) a pipe,
 * initialize and launch the QEMU process for incoming migration, apply
 * migration parameters and TLS settings, optionally start the NBD server,
 * start the incoming migration and finish domain startup.
 *
 * Several locals declared here (for example origErr, stopProcess, relabel,
 * event and jobPriv) are not consumed in the visible part; presumably the
 * elided cleanup code uses them.
 */
static int
qemuMigrationDstPrepareActive(virQEMUDriver *driver,
virDomainObj *vm,
virConnectPtr dconn,
qemuMigrationCookie *mig,
virStreamPtr st,
const char *protocol,
unsigned short port,
const char *listenAddress,
const char **migrate_disks,
int nbdPort,
const char *nbdURI,
qemuMigrationParams *migParams,
unsigned int flags)
{
qemuDomainObjPrivate *priv = vm->privateData;
qemuDomainJobPrivate *jobPriv = vm->job->privateData;
qemuProcessIncomingDef *incoming = NULL;
g_autofree char *tlsAlias = NULL;
virObjectEvent *event = NULL;
virErrorPtr origErr = NULL;
int dataFD[2] = { -1, -1 };
bool stopProcess = false;
unsigned int startFlags;
bool relabel = false;
/* A non-NULL stream means the migration is tunnelled. */
bool tunnel = !!st;
int ret = -1;
int rv;

/* RDMA migration is refused when no memory hard limit is configured. */
if (STREQ_NULLABLE(protocol, "rdma") &&
!virMemoryLimitIsSet(vm->def->mem.hard_limit)) {
virReportError(VIR_ERR_OPERATION_INVALID, "%s",
_("cannot start RDMA migration with no memory hard limit set"));
goto error;
}

if (qemuMigrationDstPrepareStorage(vm, mig->nbd, migrate_disks,
!!(flags & VIR_MIGRATE_NON_SHARED_INC)) < 0)
goto error;

if (tunnel &&
virPipe(dataFD) < 0)
goto error;

startFlags = VIR_QEMU_PROCESS_START_AUTODESTROY;

if (qemuProcessInit(driver, vm, mig->cpu, VIR_ASYNC_JOB_MIGRATION_IN,
true, startFlags) < 0)
goto error;
stopProcess = true;

/* dataFD[0] is handed to the incoming definition; dataFD[1] is later given
 * to the stream when tunnelling. */
if (!(incoming = qemuMigrationDstPrepare(vm, tunnel, protocol,
listenAddress, port,
dataFD[0])))
goto error;

qemuMigrationDstPrepareDiskSeclabels(vm, migrate_disks, flags);

if (qemuProcessPrepareDomain(driver, vm, startFlags) < 0)
goto error;

if (qemuProcessPrepareHost(driver, vm, startFlags) < 0)
goto error;

rv = qemuProcessLaunch(dconn, driver, vm, VIR_ASYNC_JOB_MIGRATION_IN,
incoming, NULL,
VIR_NETDEV_VPORT_PROFILE_OP_MIGRATE_IN_START,
startFlags);
if (rv < 0) {
/* -2 from qemuProcessLaunch() is recorded as a need to relabel. */
if (rv == -2)
relabel = true;
goto error;
}
relabel = true;

if (tunnel) {
if (virFDStreamOpen(st, dataFD[1]) < 0) {
virReportSystemError(errno, "%s",
_("cannot pass pipe for tunnelled migration"));
goto error;
}
dataFD[1] = -1; /* 'st' owns the FD now & will close it */
}

/* For RDMA the memory hard limit is shifted left by 10 bits before being
 * passed as the memlock limit; the previous limit is kept in
 * priv->preMigrationMemlock. */
if (STREQ_NULLABLE(protocol, "rdma") &&
virMemoryLimitIsSet(vm->def->mem.hard_limit) &&
qemuDomainSetMaxMemLock(vm, vm->def->mem.hard_limit << 10,
&priv->preMigrationMemlock) < 0) {
goto error;
}

if (qemuMigrationDstPrepareAnyBlockDirtyBitmaps(vm, mig, migParams, flags) < 0)
goto error;

/* mig->caps is dereferenced unconditionally here. */
if (qemuMigrationParamsCheck(vm, VIR_ASYNC_JOB_MIGRATION_IN, migParams,
mig->caps->supported, mig->caps->automatic) < 0)
goto error;

/* Save original migration parameters */
qemuDomainSaveStatus(vm);

/* Migrations using TLS need to add the "tls-creds-x509" object and
* set the migration TLS parameters */
if (flags & VIR_MIGRATE_TLS) {
if (qemuMigrationParamsEnableTLS(driver, vm, true,
VIR_ASYNC_JOB_MIGRATION_IN,
&tlsAlias, NULL,
migParams) < 0)
goto error;
} else {
if (qemuMigrationParamsDisableTLS(vm, migParams) < 0)
goto error;
}

if (qemuMigrationParamsApply(vm, VIR_ASYNC_JOB_MIGRATION_IN,
migParams, flags) < 0)
goto error;

/* The NBD server is started only when the source sent NBD data and one of
 * the non-shared storage flags is set. */
if (mig->nbd &&
flags & (VIR_MIGRATE_NON_SHARED_DISK | VIR_MIGRATE_NON_SHARED_INC)) {
const char *nbdTLSAlias = NULL;

if (flags & VIR_MIGRATE_TLS) {
if (!virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_NBD_TLS)) {
virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s",
_("QEMU NBD server does not support TLS transport"));
goto error;
}

nbdTLSAlias = tlsAlias;
}

if (qemuMigrationDstStartNBDServer(driver, vm, incoming->address,
migrate_disks,
nbdPort, nbdURI,
nbdTLSAlias) < 0) {
goto error;
}
}

/* Ownership of the lock state string moves from the cookie to the domain's
 * private data. */
if (mig->lockState) {
VIR_DEBUG("Received lockstate %s", mig->lockState);
VIR_FREE(priv->lockState);
priv->lockState = g_steal_pointer(&mig->lockState);
} else {
VIR_DEBUG("Received no lockstate");
}

if (qemuMigrationDstRun(vm, incoming->uri,
VIR_ASYNC_JOB_MIGRATION_IN) < 0)
goto error;

if (qemuProcessFinishStartup(driver, vm, VIR_ASYNC_JOB_MIGRATION_IN,
false, VIR_DOMAIN_PAUSED_MIGRATION) < 0)
goto error;

virDomainAuditStart(vm, "migrated", true);
event = virDomainEventLifecycleNewFromObj(vm,
VIR_DOMAIN_EVENT_STARTED,
VIR_DOMAIN_EVENT_STARTED_MIGRATED);

ret = 0;
...
}

其整体逻辑比较清晰,在qemuProcessLaunch()前调用相关函数准备好qemu运行的环境;然后调用qemuProcessLaunch()完成子机的创建;接着调用qemuMigrationParamsApply()和qemuMigrationDstRun()完成目的端子机的热迁移相关qmp命令设置;最后调用qemuProcessFinishStartup()设置libvirt中子机的状态

创建子机

libvirt调用qemuProcessLaunch()完成子机的创建,其具体逻辑如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
/**
* qemuProcessLaunch:
*
* Launch a new QEMU process with stopped virtual CPUs.
*
* The caller is supposed to call qemuProcessStop with appropriate
* flags in case of failure.
*
* Returns 0 on success,
* -1 on error which happened before devices were labeled and thus
* there is no need to restore them,
* -2 on error requesting security labels to be restored.
*
* NOTE: this is an excerpt; local declarations and several steps are
* elided ("...").
*/
int
qemuProcessLaunch(virConnectPtr conn,
virQEMUDriver *driver,
virDomainObj *vm,
virDomainAsyncJob asyncJob,
qemuProcessIncomingDef *incoming,
virDomainMomentObj *snapshot,
virNetDevVPortProfileOp vmop,
unsigned int flags)
{
...
/* With an incoming migration the literal "defer" is passed to the command
 * line builder (presumably the value of QEMU's -incoming option, verify
 * against qemuBuildCommandLine). */
if (!(cmd = qemuBuildCommandLine(vm,
incoming ? "defer" : NULL,
snapshot, vmop,
&nnicindexes, &nicindexes)))
goto cleanup;

/* Let the child inherit the migration fd, if there is one. */
if (incoming && incoming->fd != -1)
virCommandPassFD(cmd, incoming->fd, 0);
...

VIR_DEBUG("Setting up raw IO");
if (qemuProcessSetupRawIO(vm, cmd) < 0)
goto cleanup;

...
VIR_DEBUG("Setting up process limits");

/* In some situations, eg. VFIO passthrough, QEMU might need to lock a
* significant amount of memory, so we need to set the limit accordingly */
maxMemLock = qemuDomainGetMemLockLimitBytes(vm->def);

/* For all these settings, zero indicates that the limit should
* not be set explicitly and the default/inherited limit should
* be applied instead */
if (maxMemLock > 0)
virCommandSetMaxMemLock(cmd, maxMemLock);
if (cfg->maxProcesses > 0)
virCommandSetMaxProcesses(cmd, cfg->maxProcesses);
if (cfg->maxFiles > 0)
virCommandSetMaxFiles(cmd, cfg->maxFiles);
if (cfg->schedCore == QEMU_SCHED_CORE_EMULATOR ||
cfg->schedCore == QEMU_SCHED_CORE_FULL)
virCommandSetRunAmong(cmd, priv->schedCoreChildPID);

/* In this case, however, zero means that core dumps should be
* disabled, and so we always need to set the limit explicitly */
virCommandSetMaxCoreSize(cmd, cfg->maxCore);

VIR_DEBUG("Setting up security labelling");
if (qemuSecuritySetChildProcessLabel(driver->securityManager,
vm->def, false, cmd) < 0)
goto cleanup;

/* Both stdout and stderr of the child go to the same logfile; the command
 * is daemonized, writes a pidfile and must handshake with the parent. */
virCommandSetOutputFD(cmd, &logfile);
virCommandSetErrorFD(cmd, &logfile);
virCommandNonblockingFDs(cmd);
virCommandSetPidFile(cmd, priv->pidfile);
virCommandDaemonize(cmd);
virCommandRequireHandshake(cmd);

if (qemuSecurityPreFork(driver->securityManager) < 0)
goto cleanup;
rv = virCommandRun(cmd, NULL);
qemuSecurityPostFork(driver->securityManager);

/* wait for qemu process to show up */
if (rv == 0) {
/* The PID is taken from the pidfile rather than from virCommandRun(). */
if ((rv = virPidFileReadPath(priv->pidfile, &vm->pid)) < 0) {
virReportSystemError(-rv,
_("Domain %1$s didn't show up"),
vm->def->name);
goto cleanup;
}
VIR_DEBUG("QEMU vm=%p name=%s running with pid=%lld",
vm, vm->def->name, (long long)vm->pid);
} else {
VIR_DEBUG("QEMU vm=%p name=%s failed to spawn",
vm, vm->def->name);
goto cleanup;
}

VIR_DEBUG("Writing early domain status to disk");
if (virDomainObjSave(vm, driver->xmlopt, cfg->stateDir) < 0)
goto cleanup;

VIR_DEBUG("Waiting for handshake from child");
if (virCommandHandshakeWait(cmd) < 0) {
/* Read errors from child that occurred between fork and exec. */
qemuProcessReportLogError(logCtxt,
_("Process exited prior to exec"));
goto cleanup;
}

/* Everything between the handshake wait above and the handshake notify
 * below is set up before the parent tells the child to continue. */
VIR_DEBUG("Building domain mount namespace (if required)");
if (qemuDomainBuildNamespace(cfg, vm) < 0)
goto cleanup;

VIR_DEBUG("Setting up domain cgroup (if required)");
if (qemuSetupCgroup(vm, nnicindexes, nicindexes) < 0)
goto cleanup;

VIR_DEBUG("Setting up domain perf (if required)");
if (qemuProcessEnablePerf(vm) < 0)
goto cleanup;

/* This must be done after cgroup placement to avoid resetting CPU
* affinity */
if (qemuProcessInitCpuAffinity(vm) < 0)
goto cleanup;

VIR_DEBUG("Setting emulator tuning/settings");
if (qemuProcessSetupEmulator(vm) < 0)
goto cleanup;

VIR_DEBUG("Setting cgroup for external devices (if required)");
if (qemuSetupCgroupForExtDevices(vm, driver) < 0)
goto cleanup;

VIR_DEBUG("Setting up resctrl");
if (qemuProcessResctrlCreate(driver, vm) < 0)
goto cleanup;

VIR_DEBUG("Setting up managed PR daemon");
if (virDomainDefHasManagedPR(vm->def) &&
qemuProcessStartManagedPRDaemon(vm) < 0)
goto cleanup;

VIR_DEBUG("Setting up permissions to allow post-copy migration");
if (qemuProcessAllowPostCopyMigration(vm) < 0)
goto cleanup;

VIR_DEBUG("Setting domain security labels");
if (qemuSecuritySetAllLabel(driver,
vm,
incoming ? incoming->path : NULL,
incoming != NULL) < 0)
goto cleanup;

/* Security manager labeled all devices, therefore
* if any operation from now on fails, we need to ask the caller to
* restore labels.
*/
ret = -2;

if (incoming && incoming->fd != -1) {
/* if there's an fd to migrate from, and it's a pipe, put the
* proper security label on it
*/
struct stat stdin_sb;

VIR_DEBUG("setting security label on pipe used for migration");

if (fstat(incoming->fd, &stdin_sb) < 0) {
virReportSystemError(errno,
_("cannot stat fd %1$d"), incoming->fd);
goto cleanup;
}
if (S_ISFIFO(stdin_sb.st_mode) &&
qemuSecuritySetImageFDLabel(driver->securityManager,
vm->def, incoming->fd) < 0)
goto cleanup;
}

VIR_DEBUG("Labelling done, completing handshake to child");
if (virCommandHandshakeNotify(cmd) < 0)
goto cleanup;
VIR_DEBUG("Handshake complete, child running");

if (qemuDomainObjStartWorker(vm) < 0)
goto cleanup;

VIR_DEBUG("Waiting for monitor to show up");
if (qemuProcessWaitForMonitor(driver, vm, asyncJob, logCtxt) < 0)
goto cleanup;

if (qemuConnectAgent(driver, vm) < 0)
goto cleanup;

VIR_DEBUG("setting up hotpluggable cpus");
if (qemuDomainHasHotpluggableStartupVcpus(vm->def)) {
if (qemuDomainRefreshVcpuInfo(vm, asyncJob, false) < 0)
goto cleanup;

if (qemuProcessValidateHotpluggableVcpus(vm->def) < 0)
goto cleanup;

if (qemuProcessSetupHotpluggableVcpus(vm, asyncJob) < 0)
goto cleanup;
}

VIR_DEBUG("Refreshing VCPU info");
if (qemuDomainRefreshVcpuInfo(vm, asyncJob, false) < 0)
goto cleanup;

if (qemuDomainValidateVcpuInfo(vm) < 0)
goto cleanup;

qemuDomainVcpuPersistOrder(vm->def);

VIR_DEBUG("Verifying and updating provided guest CPU");
if (qemuProcessUpdateAndVerifyCPU(vm, asyncJob) < 0)
goto cleanup;

VIR_DEBUG("Detecting IOThread PIDs");
if (qemuProcessDetectIOThreadPIDs(vm, asyncJob) < 0)
goto cleanup;

VIR_DEBUG("Setting global CPU cgroup (if required)");
if (virDomainCgroupSetupGlobalCpuCgroup(vm, priv->cgroup) < 0)
goto cleanup;

VIR_DEBUG("Setting vCPU tuning/settings");
if (qemuProcessSetupVcpus(vm) < 0)
goto cleanup;

VIR_DEBUG("Setting IOThread tuning/settings");
if (qemuProcessSetupIOThreads(vm) < 0)
goto cleanup;

VIR_DEBUG("Setting emulator scheduler");
if (vm->def->cputune.emulatorsched &&
virProcessSetScheduler(vm->pid,
vm->def->cputune.emulatorsched->policy,
vm->def->cputune.emulatorsched->priority) < 0)
goto cleanup;

VIR_DEBUG("Setting any required VM passwords");
if (qemuProcessInitPasswords(driver, vm, asyncJob) < 0)
goto cleanup;

/* set default link states */
/* qemu doesn't support setting this on the command line, so
* enter the monitor */
VIR_DEBUG("Setting network link states");
if (qemuProcessSetLinkStates(vm, asyncJob) < 0)
goto cleanup;

VIR_DEBUG("Setting initial memory amount");
if (qemuProcessSetupBalloon(vm, asyncJob) < 0)
goto cleanup;

if (qemuProcessSetupDiskThrottling(vm, asyncJob) < 0)
goto cleanup;

/* Since CPUs were not started yet, the balloon could not return the memory
* to the host and thus cur_balloon needs to be updated so that GetXMLdesc
* and friends return the correct size in case they can't grab the job */
if (!incoming && !snapshot &&
qemuProcessRefreshBalloonState(vm, asyncJob) < 0)
goto cleanup;

if (flags & VIR_QEMU_PROCESS_START_AUTODESTROY)
virCloseCallbacksDomainAdd(vm, conn, qemuProcessAutoDestroy);

/* Transient disks are set up only for a plain start (neither an incoming
 * migration nor a snapshot). */
if (!incoming && !snapshot) {
VIR_DEBUG("Setting up transient disk");
if (qemuProcessSetupDisksTransient(vm, asyncJob) < 0)
goto cleanup;
}

VIR_DEBUG("Setting handling of lifecycle actions");
if (qemuProcessSetupLifecycleActions(vm, asyncJob) < 0)
goto cleanup;

if (qemuProcessDeleteThreadContextHelper(vm, asyncJob) < 0)
goto cleanup;

ret = 0;

/* Reached on both success and failure; ret holds 0, -1 or -2 as described
 * in the function header. */
cleanup:
qemuDomainSchedCoreStop(priv);
qemuDomainStartupCleanup(vm);
return ret;
}

其首先通过qemuBuildCommandLine()和virCommand*()一系列函数拼凑出启动qemu的struct _virCommand数据结构;然后调用virCommandRun()异步创建相关qemu进程;接着调用virCommandHandshakeWait()与异步创建的进程进行同步,确保此时异步进程处于virCommandHandshakeChild()函数中;然后调用qemuProcess*()一系列函数完成进程相关属性设置;接着调用virCommandHandshakeNotify()函数,通知异步进程退出virCommandHandshakeChild()函数,执行qemu命令;然后调用qemuProcessWaitForMonitor()连接子机的monitor,并完成后续的子机设置

设置热迁移

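libvirt首先调用qemuMigrationParamsApply()来设置热迁移参数:在非post-copy恢复的场景下先通过qemuMigrationParamsApplyCaps()设置热迁移capabilities,再通过qemuMigrationParamsApplyValues()执行migrate-set-parameters的qemu monitor命令,其具体逻辑如下所示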

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/**
* qemuMigrationParamsApply
* @vm: domain object
* @asyncJob: migration job
* @migParams: migration parameters to send to QEMU
* @apiFlags: migration flags, some of them may affect which parameters are applied
*
* Send parameters stored in @migParams to QEMU. If @apiFlags is non-zero, some
* parameters that do not make sense for the enabled flags will be ignored.
* VIR_MIGRATE_POSTCOPY_RESUME is the only flag checked currently.
*
* Capabilities are applied first (unless resuming post-copy migration),
* followed by the parameter values. With @asyncJob set to VIR_ASYNC_JOB_NONE
* no capabilities may be set in @migParams.
*
* Returns 0 on success, -1 on failure.
*/
int
qemuMigrationParamsApply(virDomainObj *vm,
int asyncJob,
qemuMigrationParams *migParams,
unsigned int apiFlags)
{
bool postcopyResume = !!(apiFlags & VIR_MIGRATE_POSTCOPY_RESUME);
int ret = -1;

if (qemuDomainObjEnterMonitorAsync(vm, asyncJob) < 0)
return -1;

/* Changing capabilities is only allowed before migration starts, we need
* to skip them when resuming post-copy migration.
*/
if (!postcopyResume) {
if (asyncJob == VIR_ASYNC_JOB_NONE) {
/* Outside of a migration job a non-empty capability bitmap is an error. */
if (!virBitmapIsAllClear(migParams->caps)) {
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("Migration capabilities can only be set by a migration job"));
goto cleanup;
}
} else if (qemuMigrationParamsApplyCaps(vm, migParams->caps) < 0) {
goto cleanup;
}
}

if (qemuMigrationParamsApplyValues(vm, migParams, postcopyResume) < 0)
goto cleanup;

ret = 0;

/* The monitor entered above is left on every path past this point. */
cleanup:
qemuDomainObjExitMonitor(vm);

return ret;
}

/*
 * Convert @params to JSON and send it to the domain's monitor.
 *
 * @postcopyResume is forwarded to qemuMigrationParamsToJSON(). The monitor
 * call is skipped when the resulting JSON object has no keys.
 *
 * Returns 0 on success, -1 when the conversion or the monitor call fails.
 */
static int
qemuMigrationParamsApplyValues(virDomainObj *vm,
qemuMigrationParams *params,
bool postcopyResume)
{
qemuDomainObjPrivate *priv = vm->privateData;
g_autoptr(virJSONValue) json = NULL;

if (!(json = qemuMigrationParamsToJSON(params, postcopyResume)))
return -1;

/* Nothing to send when no parameter ended up in the JSON object. */
if (virJSONValueObjectKeysNumber(json) > 0 &&
qemuMonitorSetMigrationParams(priv->mon, &json) < 0)
return -1;

return 0;
}

然后调用qemuMigrationDstRun()执行migrate-incoming的qemu monitor命令来启动目的端的热迁移流程,其逻辑如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
 * Start incoming migration on the destination domain @vm using @uri.
 *
 * Inside the monitor, the DBus VMState id list is set first and then the
 * incoming migration is requested. For VIR_ASYNC_JOB_MIGRATION_IN the
 * function returns right after that; for any other @asyncJob it also waits
 * for the migration to complete.
 *
 * Returns 0 on success, -1 on failure.
 */
int
qemuMigrationDstRun(virDomainObj *vm,
const char *uri,
virDomainAsyncJob asyncJob)
{
qemuDomainObjPrivate *priv = vm->privateData;
int rv;

VIR_DEBUG("Setting up incoming migration with URI %s", uri);

if (qemuDomainObjEnterMonitorAsync(vm, asyncJob) < 0)
return -1;

rv = qemuMonitorSetDBusVMStateIdList(priv->mon, priv->dbusVMStateIds);
if (rv < 0)
goto exit_monitor;

rv = qemuMonitorMigrateIncoming(priv->mon, uri);

/* The monitor is left before the result of the calls above is checked. */
exit_monitor:
qemuDomainObjExitMonitor(vm);
if (rv < 0)
return -1;

if (asyncJob == VIR_ASYNC_JOB_MIGRATION_IN) {
/* qemuMigrationDstWaitForCompletion is called from the Finish phase */
return 0;
}

if (qemuMigrationDstWaitForCompletion(vm, asyncJob, false) < 0)
return -1;

return 0;
}

这里需要注意的是,prepare阶段传入的asyncJob为VIR_ASYNC_JOB_MIGRATION_IN,此时只是启动目的端的热迁移流程,并没有真正开始热迁移,所以函数在执行完migrate-incoming后直接返回,并不会执行后续qemuMigrationDstWaitForCompletion()的逻辑,该等待逻辑会在finish阶段被调用

qemu

这里主要涉及qemu的两部分代码,进程启动和执行migrate-incoming的qemu monitor命令

进程启动

即libvirt调用virCommandRun()执行拼凑的qemu命令,启动子机对应的进程。其入口函数为main(),整体逻辑如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
/*
 * Default body of the process after initialization: run the main loop, pass
 * its status to qemu_cleanup() and return that status.
 */
int qemu_default_main(void)
{
int status;

status = qemu_main_loop();
qemu_cleanup(status);

return status;
}
/* Hook invoked by main(); it points at qemu_default_main by default. */
int (*qemu_main)(void) = qemu_default_main;

/*
 * Process entry point: initialize QEMU from the command line, then return
 * whatever the qemu_main hook returns.
 */
int main(int argc, char **argv)
{
qemu_init(argc, argv);
return qemu_main();
}

/*
 * Excerpt of QEMU start-up; parts of the original are elided ("...").
 *
 * In order, the visible code:
 *  - initialises subsystems via qemu_init_subsystems();
 *  - makes a first pass over argv that only looks for
 *    QEMU_OPTION_nouserconfig;
 *  - reads the default config file unless that option was given;
 *  - makes a second pass over argv handling every option, exiting with
 *    an error for options unsupported by the target or the build;
 *  - validates/processes options, daemonizes, initialises tracing and
 *    the main loop;
 *  - creates the machine, accelerators, late backends (where the
 *    monitors are set up, see qemu_create_late_backends()) and the
 *    migration object;
 *  - resolves the CPU type, machine memdev and NUMA options;
 *  - leaves preconfig state via qmp_x_exit_preconfig() unless preconfig
 *    was requested, then finishes display/accelerator/OS setup.
 *
 * @argc, @argv: the process command line as received by main().
 */
void qemu_init(int argc, char **argv)
{
...
qemu_init_subsystems();

/* first pass of option parsing */
optind = 1;
while (optind < argc) {
if (argv[optind][0] != '-') {
/* disk image */
optind++;
} else {
const QEMUOption *popt;

popt = lookup_opt(argc, argv, &optarg, &optind);
switch (popt->index) {
case QEMU_OPTION_nouserconfig:
userconfig = false;
break;
}
}
}

machine_opts_dict = qdict_new();
if (userconfig) {
qemu_read_default_config_file(&error_fatal);
}

/* second pass of option parsing */
optind = 1;
for(;;) {
if (optind >= argc)
break;
if (argv[optind][0] != '-') {
/* A bare argument is added as a drive with default interface. */
loc_set_cmdline(argv, optind, 1);
drive_add(IF_DEFAULT, 0, argv[optind++], HD_OPTS);
} else {
const QEMUOption *popt;

popt = lookup_opt(argc, argv, &optarg, &optind);
if (!(popt->arch_mask & arch_type)) {
error_report("Option not supported for this target");
exit(1);
}
switch(popt->index) {
case QEMU_OPTION_cpu:
/* hw initialization will check this */
cpu_option = optarg;
break;
case QEMU_OPTION_hda:
case QEMU_OPTION_hdb:
case QEMU_OPTION_hdc:
case QEMU_OPTION_hdd:
drive_add(IF_DEFAULT, popt->index - QEMU_OPTION_hda, optarg,
HD_OPTS);
break;
...
default:
error_report("Option not supported in this build");
exit(1);
}
}
}
/*
* Clear error location left behind by the loop.
* Best done right after the loop. Do not insert code here!
*/
loc_set_none();

qemu_validate_options(machine_opts_dict);
qemu_process_sugar_options();

/*
* These options affect everything else and should be processed
* before daemonizing.
*/
qemu_process_early_options();

qemu_process_help_options();
qemu_maybe_daemonize(pid_file);

/*
* The trace backend must be initialized after daemonizing.
* trace_init_backends() will call st_init(), which will create the
* trace thread in the parent, and also register st_flush_trace_buffer()
* in atexit(). This function will force the parent to wait for the
* writeout thread to finish, which will not occur, and the parent
* process will be left in the host.
*/
if (!trace_init_backends()) {
exit(1);
}
trace_init_file();

qemu_init_main_loop(&error_fatal);
cpu_timers_init();

user_register_global_props();
replay_configure(icount_opts);

configure_rtc(qemu_find_opts_singleton("rtc"));

/* Transfer QemuOpts options into machine options */
parse_memory_options();

qemu_create_machine(machine_opts_dict);

suspend_mux_open();

qemu_disable_default_devices();
qemu_setup_display();
qemu_create_default_devices();
qemu_create_early_backends();

qemu_apply_legacy_machine_options(machine_opts_dict);
qemu_apply_machine_options(machine_opts_dict);
qobject_unref(machine_opts_dict);
phase_advance(PHASE_MACHINE_CREATED);

/*
* Note: uses machine properties such as kernel-irqchip, must run
* after qemu_apply_machine_options.
*/
configure_accelerators(argv[0]);
phase_advance(PHASE_ACCEL_CREATED);

/*
* Beware, QOM objects created before this point miss global and
* compat properties.
*
* Global properties get set up by qdev_prop_register_global(),
* called from user_register_global_props(), and certain option
* desugaring. Also in CPU feature desugaring (buried in
* parse_cpu_option()), which happens below this point, but may
* only target the CPU type, which can only be created after
* parse_cpu_option() returned the type.
*
* Machine compat properties: object_set_machine_compat_props().
* Accelerator compat props: object_set_accelerator_compat_props(),
* called from do_configure_accelerator().
*/

machine_class = MACHINE_GET_CLASS(current_machine);
if (!qtest_enabled() && machine_class->deprecation_reason) {
warn_report("Machine type '%s' is deprecated: %s",
machine_class->name, machine_class->deprecation_reason);
}

/*
* Create backends before creating migration objects, so that it can
* check against compatibilities on the backend memories (e.g. postcopy
* over memory-backend-file objects).
*/
qemu_create_late_backends();
phase_advance(PHASE_LATE_BACKENDS_CREATED);

/*
* Note: creates a QOM object, must run only after global and
* compat properties have been set up.
*/
migration_object_init();

/* parse features once if machine provides default cpu_type */
current_machine->cpu_type = machine_class_default_cpu_type(machine_class);
if (cpu_option) {
current_machine->cpu_type = parse_cpu_option(cpu_option);
}
/* NB: for machine none cpu_type could STILL be NULL here! */

qemu_resolve_machine_memdev();
parse_numa_opts(current_machine);

if (vmstate_dump_file) {
/* dump and exit */
module_load_qom_all();
dump_vmstate_json_to_file(vmstate_dump_file);
exit(0);
}

/* Unless preconfig was requested, leave preconfig state right away. */
if (!preconfig_requested) {
qmp_x_exit_preconfig(&error_fatal);
}
qemu_init_displays();
accel_setup_post(current_machine);
os_setup_post();
resume_mux_open();
}

其整体逻辑还是比较清晰:首先调用qemu_init()完成参数解析、子机QOM构建等逻辑,然后执行qemu_default_main()进入主循环监听事件。

其中和热迁移相关很重要的是子机monitor的构建,这样libvirt可以通过调用migrate-incoming的qmp命令触发目的端热迁移逻辑。其中monitor的构建逻辑在mon_init_func()中,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/*
 * Excerpt (other work elided): runs mon_init_func() over every option
 * group registered under "mon", passing &error_fatal as the error sink.
 */
static void qemu_create_late_backends(void)
{
...
qemu_opts_foreach(qemu_find_opts("mon"),
mon_init_func, NULL, &error_fatal);
...
}

/*
 * Per-option-group callback handed to qemu_opts_foreach(): builds one
 * monitor from @opts by delegating to monitor_init_opts().
 *
 * @opaque: unused.
 * @opts:   the monitor option group.
 * @errp:   error sink forwarded to monitor_init_opts().
 *
 * Returns whatever monitor_init_opts() returns.
 */
static int mon_init_func(void *opaque, QemuOpts *opts, Error **errp)
{
    int rc = monitor_init_opts(opts, errp);

    return rc;
}

/*
 * Convert a QemuOpts monitor description into a MonitorOptions object
 * and create the monitor from it.
 *
 * @opts: option group to convert.
 * @errp: error sink for both the conversion and monitor_init().
 *
 * Returns -1 when the conversion yields no options object, otherwise the
 * result of monitor_init() (called with HMP allowed). The temporary
 * MonitorOptions is freed before returning.
 */
int monitor_init_opts(QemuOpts *opts, Error **errp)
{
    MonitorOptions *parsed;
    Visitor *visitor = opts_visitor_new(opts);
    int rc;

    visit_type_MonitorOptions(visitor, NULL, &parsed, errp);
    visit_free(visitor);
    if (parsed == NULL) {
        return -1;
    }

    rc = monitor_init(parsed, true, errp);
    qapi_free_MonitorOptions(parsed);
    return rc;
}

/*
 * Excerpt: create a monitor according to opts->mode. The code that
 * obtains `chr` and the final return are elided ("...").
 *
 * @opts:      monitor options; ->mode selects QMP or HMP, ->pretty is
 *             passed to the QMP monitor and rejected for HMP.
 * @allow_hmp: when false, MONITOR_MODE_READLINE is refused with -1.
 * @errp:      error sink.
 */
int monitor_init(MonitorOptions *opts, bool allow_hmp, Error **errp)
{
...
switch (opts->mode) {
case MONITOR_MODE_CONTROL:
/* QMP monitor. */
monitor_init_qmp(chr, opts->pretty, errp);
break;
case MONITOR_MODE_READLINE:
/* HMP monitor: only if the caller allows it and 'pretty' is unset. */
if (!allow_hmp) {
error_setg(errp, "Only QMP is supported");
return -1;
}
if (opts->pretty) {
error_setg(errp, "'pretty' is not compatible with HMP monitors");
return -1;
}
monitor_init_hmp(chr, true, errp);
break;
default:
g_assert_not_reached();
}
...
}

/*
 * Excerpt: set up a QMP monitor on @chr. Allocation of `mon` and the
 * I/O-thread branch are elided ("...").
 *
 * The visible code registers handle_qmp_command() as the JSON message
 * parser callback and, when the monitor does not use the I/O thread,
 * installs monitor_can_read/monitor_qmp_read/monitor_qmp_event as the
 * chardev front-end handlers and appends the monitor to the monitor list.
 */
void monitor_init_qmp(Chardev *chr, bool pretty, Error **errp)
{
...
/* Each complete JSON message is delivered to handle_qmp_command(). */
json_message_parser_init(&mon->parser, handle_qmp_command, mon, NULL);
...
if (mon->common.use_io_thread) {
...
} else {
qemu_chr_fe_set_handlers(&mon->common.chr, monitor_can_read,
monitor_qmp_read, monitor_qmp_event,
NULL, &mon->common, NULL, true);
monitor_list_append(&mon->common);
}
}

//#0 qmp_dispatcher_co_wake () at ../monitor/qmp.c:359
//#1 0x0000555555ec78c4 in handle_qmp_command (opaque=0x555556f2b950,
// req=0x7fffc8002570, err=0x0) at ../monitor/qmp.c:425
//#2 0x0000555555f85ce5 in json_message_process_token (lexer=0x555556f2ba18,
// input=0x555556f34f90, type=JSON_RCURLY, x=47, y=0)
// at ../qobject/json-streamer.c:99
//#3 0x0000555555fd4811 in json_lexer_feed_char (lexer=0x555556f2ba18,
// ch=125 '}', flush=false) at ../qobject/json-lexer.c:313
//#4 0x0000555555fd4981 in json_lexer_feed (lexer=0x555556f2ba18,
// buffer=0x7ffff6e8e940 "}", size=1) at ../qobject/json-lexer.c:350
//#5 0x0000555555f85ddf in json_message_parser_feed (parser=0x555556f2ba00,
// buffer=0x7ffff6e8e940 "}", size=1) at ../qobject/json-streamer.c:121
//#6 0x0000555555ec7923 in monitor_qmp_read (opaque=0x555556f2b950,
// buf=0x7ffff6e8e940 "}", size=1) at ../monitor/qmp.c:432
//#7 0x0000555555ebe318 in qemu_chr_be_write_impl (s=0x555557156660,
// buf=0x7ffff6e8e940 "}", len=1) at ../chardev/char.c:214
//#8 0x0000555555ebe38d in qemu_chr_be_write (s=0x555557156660,
// buf=0x7ffff6e8e940 "}", len=1) at ../chardev/char.c:226
//#9 0x0000555555eb9a2f in tcp_chr_read (chan=0x7fffc8000cd0, cond=G_IO_IN,
// opaque=0x555557156660) at ../chardev/char-socket.c:512
//#10 0x0000555555dd6a55 in qio_channel_fd_source_dispatch (
// source=0x555557156f60, callback=0x555555eb98a7 <tcp_chr_read>,
// user_data=0x555557156660) at ../io/channel-watch.c:84
//#11 0x00007ffff7da5385 in ?? ()
// from target:/lib/x86_64-linux-gnu/libglib-2.0.so.0
//#12 0x00007ffff7da75b7 in ?? ()
// from target:/lib/x86_64-linux-gnu/libglib-2.0.so.0
//#13 0x00007ffff7da801f in g_main_loop_run ()
// from target:/lib/x86_64-linux-gnu/libglib-2.0.so.0
//#14 0x0000555555e03d32 in iothread_run (opaque=0x5555570275a0)
// at ../iothread.c:70
//#15 0x0000555555f93cfe in qemu_thread_start (args=0x555556f2d820)
// at ../util/qemu-thread-posix.c:541
//#16 0x00007ffff7be5b7b in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#17 0x00007ffff7c637b8 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
/*
 * JSON parser callback (registered via json_message_parser_init() in
 * monitor_init_qmp()). Parts of the body are elided ("...").
 *
 * Wraps the parsed request (or parse error) in a QMPRequest, appends it
 * to mon->qmp_requests while holding mon->qmp_queue_lock, and then wakes
 * the dispatcher coroutine.
 *
 * @opaque: the MonitorQMP the message arrived on.
 * @req:    the parsed request object.
 * @err:    the parse error; stored alongside @req in the queued request.
 */
static void handle_qmp_command(void *opaque, QObject *req, Error *err)
{
MonitorQMP *mon = opaque;
QDict *qdict = qobject_to(QDict, req);
QMPRequest *req_obj;
...
req_obj = g_new0(QMPRequest, 1);
req_obj->mon = mon;
req_obj->req = req;
req_obj->err = err;

/* Protect qmp_requests and fetching its length. */
WITH_QEMU_LOCK_GUARD(&mon->qmp_queue_lock) {
...
/*
* Put the request to the end of queue so that requests will be
* handled in time order. Ownership for req_obj, req,
* etc. will be delivered to the handler side.
*/
trace_monitor_qmp_in_band_enqueue(req_obj, mon,
mon->qmp_requests->length);
assert(mon->qmp_requests->length < QMP_REQ_QUEUE_LEN_MAX);
g_queue_push_tail(mon->qmp_requests, req_obj);
}

/* Kick the dispatcher routine */
qmp_dispatcher_co_wake();
}

其最终在monitor_init_qmp()中绑定子机monitor事件源的回调函数,并设置对应json解析器的回调函数。这样当后续通过子机monitor执行qmp命令时,会触发相关的事件源并执行回调函数monitor_qmp_read()使用json解析器解析输入的qmp命令;当成功解析一个命令后,则会调用handle_qmp_command()去触发qmp_dispatcher_co协程执行qmp命令

而这个协程是在monitor_init_globals()中初始化的,会调用qmp_dispatch()执行qmp命令并调用monitor_qmp_respond()输出结果,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
//#0  monitor_init_globals () at ../monitor/monitor.c:713
//#1 0x0000555555b3fcda in qemu_init_subsystems () at ../system/runstate.c:824
//#2 0x0000555555b46ca2 in qemu_init (argc=55, argv=0x7fffffffe638)
// at ../system/vl.c:2786
//#3 0x0000555555db4396 in main (argc=55, argv=0x7fffffffe638)
// at ../system/main.c:47
/*
 * Excerpt (earlier initialisation elided): create the QMP dispatcher
 * coroutine running monitor_qmp_dispatcher_co() and schedule it on the
 * AioContext returned by iohandler_get_aio_context().
 */
void monitor_init_globals(void)
{
...
/*
* The dispatcher BH must run in the main loop thread, since we
* have commands assuming that context. It would be nice to get
* rid of those assumptions.
*/
qmp_dispatcher_co = qemu_coroutine_create(monitor_qmp_dispatcher_co, NULL);
aio_co_schedule(iohandler_get_aio_context(), qmp_dispatcher_co);
}

/*
 * Body of the QMP dispatcher coroutine. Parts are elided ("...").
 *
 * Repeatedly pops a queued request with monitor_qmp_dispatcher_pop_any()
 * and either dispatches it (when it carries a request object) or sends
 * an error response (when it carries only a parse error). The loop ends
 * when no request is returned; qmp_dispatcher_co is then cleared.
 *
 * @data: unused in the visible code.
 */
void coroutine_fn monitor_qmp_dispatcher_co(void *data)
{
QMPRequest *req_obj;
QDict *rsp;
/* NOTE(review): oob_enabled is presumably assigned in the elided code. */
bool oob_enabled;
MonitorQMP *mon;

while ((req_obj = monitor_qmp_dispatcher_pop_any()) != NULL) {
...
mon = req_obj->mon;
...
if (qatomic_xchg(&qmp_dispatcher_co_busy, true) == true) {
/*
* Someone rescheduled us (probably because a new requests
* came in), but we didn't actually yield. Do that now,
* only to be immediately reentered and removed from the
* list of scheduled coroutines.
*/
qemu_coroutine_yield();
}

/* Process request */
if (req_obj->req) {
if (trace_event_get_state(TRACE_MONITOR_QMP_CMD_IN_BAND)) {
/* Tracing only: emit the request "id" (empty string if absent). */
QDict *qdict = qobject_to(QDict, req_obj->req);
QObject *id = qdict ? qdict_get(qdict, "id") : NULL;
GString *id_json;

id_json = id ? qobject_to_json(id) : g_string_new(NULL);
trace_monitor_qmp_cmd_in_band(id_json->str);
g_string_free(id_json, true);
}
monitor_qmp_dispatch(mon, req_obj->req);
} else {
/* No request object: the queued entry carries a parse error. */
assert(req_obj->err);
trace_monitor_qmp_err_in_band(error_get_pretty(req_obj->err));
rsp = qmp_error_response(req_obj->err);
req_obj->err = NULL;
monitor_qmp_respond(mon, rsp);
qobject_unref(rsp);
}

if (!oob_enabled) {
monitor_resume(&mon->common);
}

qmp_request_free(req_obj);
}
qatomic_set(&qmp_dispatcher_co, NULL);
}

/*
 * Dispatch one QMP request on @mon and send the response.
 *
 * Looks the command up in mon->commands via qmp_dispatch(), then passes
 * the resulting response to monitor_qmp_respond() and drops the local
 * reference. Handling between those two steps is elided ("...").
 *
 * Runs outside of coroutine context for OOB commands, but in
 * coroutine context for everything else.
 */
static void monitor_qmp_dispatch(MonitorQMP *mon, QObject *req)
{
QDict *rsp;
/* NOTE(review): `error` is presumably used by the elided code. */
QDict *error;

rsp = qmp_dispatch(mon->commands, req, qmp_oob_enabled(mon),
&mon->common);
...
monitor_qmp_respond(mon, rsp);
qobject_unref(rsp);
}

//#0 aio_bh_schedule_oneshot_full (ctx=0x555556f2abe0,
// cb=0x555555f8063e <do_qmp_dispatch_bh>, opaque=0x7ffff7294e40,
// name=0x555556211597 "do_qmp_dispatch_bh") at ../util/async.c:132
//#1 0x0000555555f80ce5 in qmp_dispatch (
// cmds=0x555556eb61c0 <qmp_cap_negotiation_commands>, request=0x7fffc8002570,
// allow_oob=false, cur_mon=0x555556f2b950) at ../qapi/qmp-dispatch.c:254
//#2 0x0000555555ec6f49 in monitor_qmp_dispatch (mon=0x555556f2b950, req=0x7fffc8002570)
// at ../monitor/qmp.c:168
//#3 0x0000555555ec747f in monitor_qmp_dispatcher_co (data=0x0) at ../monitor/qmp.c:335
//#4 0x0000555555fb24c8 in coroutine_trampoline (i0=1458743968, i1=21845)
// at ../util/coroutine-ucontext.c:175
//#5 0x00007ffff7bab890 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#6 0x00007fffffffdb70 in ?? ()
//#7 0x0000000000000000 in ?? ()
/*
 * Execute one QMP request and build its response.
 *
 * @cmds:      command list to look the command up in.
 * @request:   the parsed request; must be a JSON object (QDict).
 * @allow_oob: whether "exec-oob" requests are permitted.
 * @cur_mon:   monitor set as the current monitor while the handler runs.
 *
 * Steps: validate the request, resolve the command name ("execute" or
 * "exec-oob") with qmp_find_command(), reject unknown, policy-blocked,
 * disabled or unavailable commands, then run the handler either directly
 * (when the current context matches the command's QCO_COROUTINE flag) or
 * through a one-shot bottom half while this coroutine yields.
 *
 * Returns a response dict containing either "return" or an error built
 * by qmp_error_response(), plus the request "id" when one was supplied.
 * Returns NULL for a successful command flagged QCO_NO_SUCCESS_RESP.
 */
QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *request,
bool allow_oob, Monitor *cur_mon)
{
Error *err = NULL;
bool oob;
const char *command;
QDict *args;
const QmpCommand *cmd;
QDict *dict;
QObject *id;
QObject *ret = NULL;
QDict *rsp = NULL;

dict = qobject_to(QDict, request);
if (!dict) {
id = NULL;
error_setg(&err, "QMP input must be a JSON object");
goto out;
}

id = qdict_get(dict, "id");

if (!qmp_dispatch_check_obj(dict, allow_oob, &err)) {
goto out;
}

/* The command name comes from "execute", or from "exec-oob" for OOB. */
command = qdict_get_try_str(dict, "execute");
oob = false;
if (!command) {
assert(allow_oob);
command = qdict_get_str(dict, "exec-oob");
oob = true;
}
cmd = qmp_find_command(cmds, command);
if (cmd == NULL) {
error_set(&err, ERROR_CLASS_COMMAND_NOT_FOUND,
"The command %s has not been found", command);
goto out;
}
if (!compat_policy_input_ok(cmd->special_features, &compat_policy,
ERROR_CLASS_COMMAND_NOT_FOUND,
"command", command, &err)) {
goto out;
}
if (!cmd->enabled) {
error_set(&err, ERROR_CLASS_COMMAND_NOT_FOUND,
"Command %s has been disabled%s%s",
command,
cmd->disable_reason ? ": " : "",
cmd->disable_reason ?: "");
goto out;
}
if (oob && !(cmd->options & QCO_ALLOW_OOB)) {
error_setg(&err, "The command %s does not support OOB",
command);
goto out;
}

if (!qmp_command_available(cmd, &err)) {
goto out;
}

/* Handlers always receive an arguments dict; use an empty one if absent. */
if (!qdict_haskey(dict, "arguments")) {
args = qdict_new();
} else {
args = qdict_get_qdict(dict, "arguments");
qobject_ref(args);
}

assert(!(oob && qemu_in_coroutine()));
assert(monitor_cur() == NULL);
/* Current context already matches what the command needs: call directly. */
if (!!(cmd->options & QCO_COROUTINE) == qemu_in_coroutine()) {
if (qemu_in_coroutine()) {
/*
* Move the coroutine from iohandler_ctx to qemu_aio_context for
* executing the command handler so that it can make progress if it
* involves an AIO_WAIT_WHILE().
*/
aio_co_reschedule_self(qemu_get_aio_context());
}

monitor_set_cur(qemu_coroutine_self(), cur_mon);
cmd->fn(args, &ret, &err);
monitor_set_cur(qemu_coroutine_self(), NULL);

if (qemu_in_coroutine()) {
/*
* Yield and reschedule so the main loop stays responsive.
*
* Move back to iohandler_ctx so that nested event loops for
* qemu_aio_context don't start new monitor commands.
*/
aio_co_reschedule_self(iohandler_get_aio_context());
}
} else {
/*
* Actual context doesn't match the one the command needs.
*
* Case 1: we are in coroutine context, but command does not
* have QCO_COROUTINE. We need to drop out of coroutine
* context for executing it.
*
* Case 2: we are outside coroutine context, but command has
* QCO_COROUTINE. Can't actually happen, because we get here
* outside coroutine context only when executing a command
* out of band, and OOB commands never have QCO_COROUTINE.
*/
assert(!oob && qemu_in_coroutine() && !(cmd->options & QCO_COROUTINE));

QmpDispatchBH data = {
.cur_mon = cur_mon,
.cmd = cmd,
.args = args,
.ret = &ret,
.errp = &err,
.co = qemu_coroutine_self(),
};
/*
* Run the handler from a one-shot bottom half and yield meanwhile.
* NOTE(review): do_qmp_dispatch_bh presumably re-enters data.co once
* the handler has finished - it is not shown here, confirm.
*/
aio_bh_schedule_oneshot(iohandler_get_aio_context(), do_qmp_dispatch_bh,
&data);
qemu_coroutine_yield();
}
qobject_unref(args);
if (err) {
/* or assert(!ret) after reviewing all handlers: */
qobject_unref(ret);
goto out;
}

if (cmd->options & QCO_NO_SUCCESS_RESP) {
g_assert(!ret);
return NULL;
} else if (!ret) {
/*
* When the command's schema has no 'returns', cmd->fn()
* leaves @ret null. The QMP spec calls for an empty object
* then; supply it.
*/
ret = QOBJECT(qdict_new());
}

rsp = qdict_new();
qdict_put_obj(rsp, "return", ret);

out:
/* Any error recorded above replaces the success response. */
if (err) {
assert(!rsp);
rsp = qmp_error_response(err);
}

assert(rsp);

/* Echo the request "id" back so the client can match the response. */
if (id) {
qdict_put_obj(rsp, "id", qobject_ref(id));
}

return rsp;
}

可以看到,其所有的qmp命令都在mon->commands中,qmp_dispatch()会调用qmp_find_command()基于用户输入的命令名称找到对应的qmp命令,然后以协程/QEMUBH事件形式执行qmp的逻辑。

其中,mon->commands在monitor_init_qmp_commands()中被初始化,其函数实现由qapi-gen.py基于qapi目录下的*.json内容自动化生成,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
/*
 * Constructor (see the __constructor__ attribute) that fills the two
 * global QMP command lists:
 *  - qmp_commands: everything registered by qmp_init_marshal(), plus
 *    "device_add" registered here by hand;
 *  - qmp_cap_negotiation_commands: only "qmp_capabilities".
 */
static void __attribute__((__constructor__)) monitor_init_qmp_commands(void)
{
/*
* Two command lists:
* - qmp_commands contains all QMP commands
* - qmp_cap_negotiation_commands contains just
* "qmp_capabilities", to enforce capability negotiation
*/

qmp_init_marshal(&qmp_commands);

qmp_register_command(&qmp_commands, "device_add",
qmp_device_add, 0, 0);

QTAILQ_INIT(&qmp_cap_negotiation_commands);
qmp_register_command(&qmp_cap_negotiation_commands, "qmp_capabilities",
qmp_marshal_qmp_capabilities,
QCO_ALLOW_PRECONFIG, 0);
}

// Auto-generated qmp_init_marshal()
/*
 * Excerpt: initialise @cmds as an empty list and register the QMP
 * commands into it. Only the "migrate-incoming" registration, whose
 * handler is qmp_marshal_migrate_incoming(), is shown; the other
 * registrations are elided ("...").
 */
void qmp_init_marshal(QmpCommandList *cmds)
{
QTAILQ_INIT(cmds);
...
qmp_register_command(cmds, "migrate-incoming",
qmp_marshal_migrate_incoming, 0, 0);
...
}

//对应的qapi/migration.json
##
# @migrate-incoming:
#
# Start an incoming migration, the qemu must have been started with
# -incoming defer
#
# @uri: The Uniform Resource Identifier identifying the source or
# address to listen on
#
# @channels: list of migration stream channels with each stream in the
# list connected to a destination interface endpoint.
#
# Since: 2.3
#
# Notes:
#
# 1. It's a bad idea to use a string for the uri, but it needs to
# stay compatible with -incoming and the format of the uri is
# already exposed above libvirt.
#
# 2. QEMU must be started with -incoming defer to allow
# migrate-incoming to be used.
#
# 3. The uri format is the same as for -incoming
#
# 4. For now, number of migration streams is restricted to one,
# i.e. number of items in 'channels' list is just 1.
#
# 5. The 'uri' and 'channels' arguments are mutually exclusive;
# exactly one of the two should be present.
#
# Example:
#
# -> { "execute": "migrate-incoming",
# "arguments": { "uri": "tcp:0:4446" } }
# <- { "return": {} }
#
# -> { "execute": "migrate-incoming",
# "arguments": {
# "channels": [ { "channel-type": "main",
# "addr": { "transport": "socket",
# "type": "inet",
# "host": "10.12.34.9",
# "port": "1050" } } ] } }
# <- { "return": {} }
#
# -> { "execute": "migrate-incoming",
# "arguments": {
# "channels": [ { "channel-type": "main",
# "addr": { "transport": "exec",
# "args": [ "/bin/nc", "-p", "6000",
# "/some/sock" ] } } ] } }
# <- { "return": {} }
#
# -> { "execute": "migrate-incoming",
# "arguments": {
# "channels": [ { "channel-type": "main",
# "addr": { "transport": "rdma",
# "host": "10.12.34.9",
# "port": "1050" } } ] } }
# <- { "return": {} }
##
{ 'command': 'migrate-incoming',
'data': {'*uri': 'str',
'*channels': [ 'MigrationChannel' ] } }

migrate-incoming

根据前面内容可知,libvirt最终会调用migrate-incoming的qmp命令来启动目的端的热迁移流程。由前面章节可知,其逻辑为qmp_migrate_incoming(),如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
/*
 * Excerpt: handler for the "migrate-incoming" QMP command. The visible
 * code forwards @uri / @channels to qemu_start_incoming_migration(),
 * collecting any error in local_err; checks before and after that call
 * are elided ("...").
 */
void qmp_migrate_incoming(const char *uri, bool has_channels,
MigrationChannelList *channels, Error **errp)
{
...
qemu_start_incoming_migration(uri, has_channels, channels, &local_err);
...
}

/*
 * Excerpt: start listening for an incoming migration on the transport
 * described by `addr` (derived from @uri / @channels in elided code).
 *
 * For socket transports, INET/UNIX/VSOCK addresses go to
 * socket_start_incoming_migration() and FD addresses go to
 * fd_start_incoming_migration(). Other transports are elided ("...");
 * anything unrecognised sets an "unknown migration protocol" error.
 */
static void qemu_start_incoming_migration(const char *uri, bool has_channels,
MigrationChannelList *channels,
Error **errp)
{
...
if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
SocketAddress *saddr = &addr->u.socket;
if (saddr->type == SOCKET_ADDRESS_TYPE_INET ||
saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) {
socket_start_incoming_migration(saddr, errp);
} else if (saddr->type == SOCKET_ADDRESS_TYPE_FD) {
fd_start_incoming_migration(saddr->u.fd.str, errp);
}
...
} else {
error_setg(errp, "unknown migration protocol: %s", uri);
}
}

//#0 socket_start_incoming_migration (saddr=0x555557316ac8, errp=0x7fffffffe178) at ../migration/socket.c:178
//#1 0x0000555555b63ee7 in qemu_start_incoming_migration (uri=0x55555794f1e0 "tcp:[::]:49152", has_channels=false,
// channels=0x0, errp=0x7fffffffe178) at ../migration/migration.c:644
//#2 0x0000555555b66a00 in qmp_migrate_incoming (uri=0x55555794f1e0 "tcp:[::]:49152", has_channels=false, channels=0x0,
// errp=0x7fffffffe1d8) at ../migration/migration.c:1815
//#3 0x0000555555f343a1 in qmp_marshal_migrate_incoming (args=0x7fffc80048f0, ret=0x7ffff7294da8, errp=0x7ffff7294da0)
// at qapi/qapi-commands-migration.c:523
//#4 0x0000555555f806cd in do_qmp_dispatch_bh (opaque=0x7ffff7294e40) at ../qapi/qmp-dispatch.c:128
//#5 0x0000555555fadb09 in aio_bh_call (bh=0x55555734e3a0) at ../util/async.c:171
//#6 0x0000555555fadc30 in aio_bh_poll (ctx=0x555556f2abe0) at ../util/async.c:218
//#7 0x0000555555f8e7e1 in aio_dispatch (ctx=0x555556f2abe0) at ../util/aio-posix.c:423
//#8 0x0000555555fae0c5 in aio_ctx_dispatch (source=0x555556f2abe0, callback=0x0, user_data=0x0) at ../util/async.c:360
//#9 0x00007ffff7da5385 in ?? () from target:/lib/x86_64-linux-gnu/libglib-2.0.so.0
//#10 0x00007ffff7da7c78 in g_main_context_dispatch () from target:/lib/x86_64-linux-gnu/libglib-2.0.so.0
//#11 0x0000555555faf750 in glib_pollfds_poll () at ../util/main-loop.c:287
//#12 0x0000555555faf7db in os_host_main_loop_wait (timeout=0) at ../util/main-loop.c:310
//#13 0x0000555555faf907 in main_loop_wait (nonblocking=0) at ../util/main-loop.c:589
//#14 0x0000555555b3fbac in qemu_main_loop () at ../system/runstate.c:783
//#15 0x0000555555db435e in qemu_default_main () at ../system/main.c:37
//#16 0x0000555555db439f in main (argc=55, argv=0x7fffffffe638) at ../system/main.c:48
/*
 * Excerpt: open a listening socket for an incoming migration.
 *
 * Creates a QIONetListener named "migration-socket-listener", opens it
 * synchronously on @saddr, records it in the incoming migration state
 * together with its cleanup hook, and registers
 * socket_accept_incoming_migration() as the new-client callback.
 * On open failure the listener is released and @errp carries the error.
 * Some code is elided ("...").
 */
void socket_start_incoming_migration(SocketAddress *saddr,
Error **errp)
{
QIONetListener *listener = qio_net_listener_new();
MigrationIncomingState *mis = migration_incoming_get_current();
size_t i;
/* Passed to qio_net_listener_open_sync(); 1 unless multifd is enabled. */
int num = 1;

qio_net_listener_set_name(listener, "migration-socket-listener");

if (migrate_multifd()) {
num = migrate_multifd_channels();
}
...

if (qio_net_listener_open_sync(listener, saddr, num, errp) < 0) {
object_unref(OBJECT(listener));
return;
}

mis->transport_data = listener;
mis->transport_cleanup = socket_incoming_migration_end;

qio_net_listener_set_client_func_full(listener,
socket_accept_incoming_migration,
NULL, NULL,
g_main_context_get_thread_default());
...
}

其逻辑非常简单,就是调用qio_net_listener_open_sync()监听目的端指定端口;并调用qio_net_listener_set_client_func_full()注册回调函数socket_accept_incoming_migration(),该回调函数会在监听端口有新的连接到来时被触发执行,其具体逻辑在后面perform章节再详细介绍

生成cookie

这里和begin阶段一样,使用qemuMigrationCookieFormat根据flags指定的配置字段,序列化struct _qemuMigrationCookie类型的cookie,其包含目的端prepare阶段额外热迁移设置信息。

perform阶段

概述

perform阶段是整个热迁移的核心流程,其解析包含目的端prepare阶段的额外设置信息的cookie数据,然后调用migrate的qmp命令来启动源端的热迁移流程,并监控整个热迁移流程的结束,流程如下所示

热迁移perform阶段流程图

perform阶段的代码如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/*
 * Excerpt showing only the Perform step of the v3 migration sequence;
 * the steps before and after it are elided ("...").
 *
 * The cookie produced by the previous step (cookieout) becomes the input
 * cookie (cookiein) for Perform. Depending on @useParams, the source
 * driver's domainMigratePerform3Params or domainMigratePerform3 callback
 * is invoked. A negative result preserves the error, disables the
 * source notification (notify_source) and marks the migration as
 * cancelled.
 */
static virDomainPtr
virDomainMigrateVersion3Full(virDomainPtr domain,
virConnectPtr dconn,
const char *xmlin,
const char *dname,
const char *uri,
unsigned long long bandwidth,
virTypedParameterPtr params,
int nparams,
bool useParams,
unsigned int flags)
{
...
/* Perform the migration. The driver isn't supposed to return
* until the migration is complete. The src VM should remain
* running, but in paused state until the destination can
* confirm migration completion.
*/
VIR_DEBUG("Perform3 %p uri=%s", domain->conn, uri);
/* Hand the previous step's output cookie over as this step's input. */
VIR_FREE(cookiein);
cookiein = g_steal_pointer(&cookieout);
cookieinlen = cookieoutlen;
cookieoutlen = 0;
/* dconnuri not relevant in non-P2P modes, so left NULL here */
if (useParams) {
ret = domain->conn->driver->domainMigratePerform3Params
(domain, NULL, params, nparams, cookiein, cookieinlen,
&cookieout, &cookieoutlen, flags | protection);
} else {
ret = domain->conn->driver->domainMigratePerform3
(domain, NULL, cookiein, cookieinlen,
&cookieout, &cookieoutlen, NULL,
uri, flags | protection, dname, bandwidth);
}

/* Perform failed. Make sure Finish doesn't overwrite the error */
if (ret < 0) {
virErrorPreserveLast(&orig_err);
/* Perform failed so we don't need to call confirm to let source know
* about the failure.
*/
notify_source = false;
}

/* If Perform returns < 0, then we need to cancel the VM
* startup on the destination
*/
cancelled = ret < 0 ? 1 : 0;
...
}

其逻辑很直接,就是调用driver的domainMigratePerform3Params函数指针,即qemuDomainMigratePerform3Params(),其基于包含目的端prepare阶段的热迁移的额外配置信息的cookie确定对应的热迁移额外配置,并通过源端的monitor调用migrate qmp来启动热迁移流程,最后会生成一个包含源端perform阶段的热迁移额外配置信息的cookie,核心逻辑在qemuMigrationSrcRun()如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/*
 * Excerpt of the source-side Perform logic; large parts are elided
 * ("..."), including local declarations and the error/exit_monitor
 * labels.
 *
 * Visible steps:
 *  1. parse the incoming cookie with qemuMigrationCookieParse();
 *  2. save domain status, set the max-bandwidth parameter and apply the
 *     migration parameters;
 *  3. enter the monitor, bail out if the job was aborted, otherwise
 *     start the migration with qemuMigrationSrcStart();
 *  4. wait for completion with qemuMigrationSrcWaitForCompletion();
 *  5. wait until the domain is no longer in the RUNNING state;
 *  6. format the outgoing cookie (a failure here is only a warning).
 *
 * Returns `ret`, which is set to 0 once step 6 has been reached.
 */
static int
qemuMigrationSrcRun(virQEMUDriver *driver,
virDomainObj *vm,
const char *xmlin,
const char *persist_xml,
const char *cookiein,
int cookieinlen,
char **cookieout,
int *cookieoutlen,
unsigned int flags,
unsigned long resource,
qemuMigrationSpec *spec,
virConnectPtr dconn,
const char *graphicsuri,
const char **migrate_disks,
const char **migrate_disks_detect_zeroes,
qemuMigrationParams *migParams,
const char *nbdURI)
{
...
mig = qemuMigrationCookieParse(driver, vm, vm->def, priv->origname,
priv->qemuCaps,
cookiein, cookieinlen,
cookieFlags |
QEMU_MIGRATION_COOKIE_GRAPHICS |
QEMU_MIGRATION_COOKIE_CAPS |
QEMU_MIGRATION_COOKIE_BLOCK_DIRTY_BITMAPS);
if (!mig)
goto error;
...
/* Save original migration parameters */
qemuDomainSaveStatus(vm);
...
/* priv->migMaxBandwidth is scaled by 1024 * 1024 before being applied. */
if (qemuMigrationParamsSetULL(migParams, QEMU_MIGRATION_PARAM_MAX_BANDWIDTH,
priv->migMaxBandwidth * 1024 * 1024) < 0)
goto error;

if (qemuMigrationParamsApply(vm, VIR_ASYNC_JOB_MIGRATION_OUT,
migParams, flags) < 0)
goto error;
...
if (qemuDomainObjEnterMonitorAsync(vm, VIR_ASYNC_JOB_MIGRATION_OUT) < 0)
goto error;

if (vm->job->abortJob) {
/* explicitly do this *after* we entered the monitor,
* as this is a critical section so we are guaranteed
* vm->job->abortJob will not change */
vm->job->current->status = VIR_DOMAIN_JOB_STATUS_CANCELED;
virReportError(VIR_ERR_OPERATION_ABORTED, _("%1$s: %2$s"),
virDomainAsyncJobTypeToString(vm->job->asyncJob),
_("canceled by client"));
goto exit_monitor;
}

/* Start the migration on the source while inside the monitor. */
rc = qemuMigrationSrcStart(vm, spec, 0, &fd);

qemuDomainObjExitMonitor(vm);
if (rc < 0)
goto error;

/* From this point onwards we *must* call cancel to abort the
* migration on source if anything goes wrong */
cancel = true;
...
rc = qemuMigrationSrcWaitForCompletion(vm, VIR_ASYNC_JOB_MIGRATION_OUT,
dconn, waitFlags);
/* -2: the migration still needs to be cancelled (cancel stays true). */
if (rc == -2)
goto error;

if (rc == -1) {
/* QEMU reported failed migration, nothing to cancel anymore */
cancel = false;
goto error;
}

/* When migration completed, QEMU will have paused the CPUs for us.
* Wait for the STOP event to be processed to release the lock state.
*/
while (virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING) {
priv->signalStop = true;
rc = qemuDomainObjWait(vm);
priv->signalStop = false;
if (rc < 0)
goto error;
}
...
cookieFlags |= QEMU_MIGRATION_COOKIE_NETWORK |
QEMU_MIGRATION_COOKIE_STATS;

/* Failing to build the outgoing cookie is logged but not fatal. */
if (qemuMigrationCookieAddPersistent(mig, &persistDef) < 0 ||
qemuMigrationCookieFormat(mig, driver, vm,
QEMU_MIGRATION_SOURCE,
cookieout, cookieoutlen, cookieFlags) < 0) {
VIR_WARN("Unable to encode migration cookie");
}

ret = 0;

cleanup:
priv->signalIOError = false;
priv->migMaxBandwidth = restore_max_bandwidth;
virErrorRestore(&orig_err);

return ret;
...
}

其会调用qemuMigrationCookieParse()处理目的端的cookie信息,获取目的端prepare阶段的热迁移的额外配置信息;然后调用qemuMigrationSrcStart()执行migrate的qmp命令来启动源端的热迁移流程;接着调用qemuMigrationSrcWaitForCompletion()轮询热迁移状态;最后调用qemuMigrationCookieFormat()生成包含源端perform阶段的热迁移额外配置信息的cookie

处理cookie

这里和begin阶段一样,使用qemuMigrationCookieParse根据flags指定的配置字段,将cookie字符串反序列化到对应的struct _qemuMigrationCookie类型数据中,其为目的端prepare阶段的热迁移额外配置信息

启动源端热迁移

libvirt

正如前面分析的,perform阶段libvirt的主要逻辑就是启动源端热迁移流程和等待热迁移结束

启动源端热迁移

libvirt会调用qemuMigrationSrcStart()启动源端的热迁移流程,其核心逻辑如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
/*
 * Excerpt: start the outgoing migration for the destination described by
 * @spec. Only the MIGRATION_DEST_HOST case is shown; other cases and the
 * local declarations are elided ("...").
 *
 * For a host destination using the "rdma" protocol with a hard memory
 * limit set, the memory-lock limit is raised first and the domain status
 * saved; then qemuMonitorMigrateToHost() is called with the destination
 * protocol, host name and port.
 *
 * Returns the monitor call's result, or -1 on failure or when
 * spec->destType is not handled.
 *
 * The caller is supposed to enter monitor before calling this.
 */
static int
qemuMigrationSrcStart(virDomainObj *vm,
qemuMigrationSpec *spec,
unsigned int migrateFlags,
int *tunnelFd)
{
...
switch (spec->destType) {
case MIGRATION_DEST_HOST:
if (STREQ(spec->dest.host.protocol, "rdma") &&
virMemoryLimitIsSet(vm->def->mem.hard_limit)) {
if (qemuDomainSetMaxMemLock(vm, vm->def->mem.hard_limit << 10,
&priv->preMigrationMemlock) < 0)
return -1;
/* Store the original memory locking limit */
qemuDomainSaveStatus(vm);
}
return qemuMonitorMigrateToHost(priv->mon, migrateFlags,
spec->dest.host.protocol,
spec->dest.host.name,
spec->dest.host.port);
...
}

virReportError(VIR_ERR_INTERNAL_ERROR,
_("unexpected migration schema: %1$d"), spec->destType);
return -1;
}

/*
 * Build a "<protocol>:<host>:<port>" migration URI and pass it to
 * qemuMonitorJSONMigrate().
 *
 * @mon:      monitor to issue the command on (checked by
 *            QEMU_CHECK_MONITOR).
 * @flags:    migration flags forwarded unchanged.
 * @protocol: URI scheme prefix.
 * @hostname: destination host; wrapped in [] when it contains ':'.
 * @port:     destination port.
 *
 * Returns the result of qemuMonitorJSONMigrate().
 */
int
qemuMonitorMigrateToHost(qemuMonitor *mon,
                         unsigned int flags,
                         const char *protocol,
                         const char *hostname,
                         int port)
{
    char *target = NULL;
    int rc;

    VIR_DEBUG("hostname=%s port=%d flags=0x%x", hostname, port, flags);

    QEMU_CHECK_MONITOR(mon);

    /* A ':' inside the host name means it has to be bracketed. */
    if (strchr(hostname, ':') == NULL) {
        target = g_strdup_printf("%s:%s:%d", protocol, hostname, port);
    } else {
        target = g_strdup_printf("%s:[%s]:%d", protocol, hostname, port);
    }

    rc = qemuMonitorJSONMigrate(mon, flags, target);

    VIR_FREE(target);
    return rc;
}

/*
 * Issue the "migrate" QMP command for @uri on @mon.
 *
 * The command is always sent with detach=true; "resume" is true only
 * when QEMU_MONITOR_MIGRATE_RESUME is set in @flags.
 *
 * Returns 0 on success, -1 if the command could not be built, could not
 * be sent, or the reply reports an error.
 */
int qemuMonitorJSONMigrate(qemuMonitor *mon,
                           unsigned int flags,
                           const char *uri)
{
    const bool is_resume = (flags & QEMU_MONITOR_MIGRATE_RESUME) != 0;
    g_autoptr(virJSONValue) cmd = NULL;
    g_autoptr(virJSONValue) reply = NULL;

    cmd = qemuMonitorJSONMakeCommand("migrate",
                                     "b:detach", true,
                                     "b:resume", is_resume,
                                     "s:uri", uri,
                                     NULL);
    if (cmd == NULL) {
        return -1;
    }

    /* Send the command, then make sure the reply is not an error. */
    if (qemuMonitorJSONCommand(mon, cmd, &reply) < 0 ||
        qemuMonitorJSONCheckError(cmd, reply) < 0) {
        return -1;
    }

    return 0;
}

可以看到,其逻辑非常清晰,即调用qemu的migrate的qmp命令

等待热迁移结束

libvirt会调用qemuMigrationSrcWaitForCompletion()轮询热迁移状态,其核心逻辑如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/*
 * Excerpt: block until the outgoing migration finishes. Part of the tail
 * is elided ("...").
 *
 * Marks the current job as MIGRATING, then loops: each iteration checks
 * qemuMigrationAnyCompleted() and, while it reports "still running",
 * sleeps in qemuDomainObjWait(). Once completed, statistics are fetched
 * on a best-effort basis (the result is ignored) and a copy of the job
 * data is stored in vm->job->completed with status COMPLETED.
 *
 * Returns 0 on success, -2 when migration needs to be cancelled, or -1 when
 * QEMU reports failed migration.
 */
static int
qemuMigrationSrcWaitForCompletion(virDomainObj *vm,
virDomainAsyncJob asyncJob,
virConnectPtr dconn,
unsigned int flags)
{
virDomainJobData *jobData = vm->job->current;
int rv;

jobData->status = VIR_DOMAIN_JOB_STATUS_MIGRATING;

while ((rv = qemuMigrationAnyCompleted(vm, asyncJob, dconn, flags)) != 1) {
/* Negative values (-1 / -2) are passed straight to the caller. */
if (rv < 0)
return rv;

if (qemuDomainObjWait(vm) < 0) {
/* Waiting failed: flag the job as failed if the domain is still active. */
if (qemuDomainObjIsActive(vm))
jobData->status = VIR_DOMAIN_JOB_STATUS_FAILED;
return -2;
}
}

ignore_value(qemuMigrationAnyFetchStats(vm, asyncJob, jobData, NULL));

qemuDomainJobDataUpdateTime(jobData);
qemuDomainJobDataUpdateDowntime(jobData);
/* Replace any previously stored completed-job data with this job's. */
g_clear_pointer(&vm->job->completed, virDomainJobDataFree);
vm->job->completed = virDomainJobDataCopy(jobData);
vm->job->completed->status = VIR_DOMAIN_JOB_STATUS_COMPLETED;
...
return 0;
}

/**
* Checks once whether the migration job has finished (excerpt; parts of the
* body, including the "error" label, are elided in this listing).
*
* Returns 1 if migration completed successfully,
* 0 if the domain is still being migrated,
* -1 migration failed,
* -2 something else failed, we need to cancel migration.
*/
static int
qemuMigrationAnyCompleted(virDomainObj *vm,
virDomainAsyncJob asyncJob,
virConnectPtr dconn,
unsigned int flags)
{
virDomainJobData *jobData = vm->job->current;
int pauseReason;

/* Evaluate the job status first; any failure goes to the (elided) error
* path. */
if (qemuMigrationJobCheckStatus(vm, asyncJob) < 0)
goto error;
...
/* Only HYPERVISOR_COMPLETED counts as "done"; every other status means
* the caller should keep waiting. */
if (jobData->status == VIR_DOMAIN_JOB_STATUS_HYPERVISOR_COMPLETED)
return 1;
else
return 0;
...
}

/*
* Re-evaluates the state of the current migration job (excerpt; part of the
* body is elided in this listing).
*
* Returns 0 on the path shown, -1 if fetching statistics from QEMU fails.
*/
static int
qemuMigrationJobCheckStatus(virDomainObj *vm,
virDomainAsyncJob asyncJob)
{
virDomainJobData *jobData = vm->job->current;
qemuDomainJobDataPrivate *privJob = jobData->privateData;
g_autofree char *error = NULL;

/* Statistics are re-fetched from QEMU only when the cached migration
* status is ERROR; "error" receives the optional error string. */
if (privJob->stats.mig.status == QEMU_MONITOR_MIGRATION_STATUS_ERROR) {
if (qemuMigrationAnyFetchStats(vm, asyncJob, jobData, &error) < 0)
return -1;
}

qemuMigrationUpdateJobType(jobData);
...
return 0;
}

/**
 * qemuMigrationAnyFetchStats:
 * @vm: domain object
 * @asyncJob: async job used when entering the monitor
 * @jobData: job data whose private migration statistics are updated
 * @error: passed through to qemuMonitorGetMigrationStats(); may be NULL
 *
 * Queries the current migration statistics from the monitor and, on
 * success, stores them in the private part of @jobData. @jobData is left
 * untouched on failure.
 *
 * Returns 0 on success, -1 if the monitor could not be entered or the
 * query failed.
 */
int
qemuMigrationAnyFetchStats(virDomainObj *vm,
                           virDomainAsyncJob asyncJob,
                           virDomainJobData *jobData,
                           char **error)
{
    qemuDomainObjPrivate *priv = vm->privateData;
    qemuDomainJobDataPrivate *privJob = jobData->privateData;
    qemuMonitorMigrationStats fetched;
    int rc;

    if (qemuDomainObjEnterMonitorAsync(vm, asyncJob) < 0)
        return -1;

    rc = qemuMonitorGetMigrationStats(priv->mon, &fetched, error);
    /* The monitor is always left again, whether or not the query worked. */
    qemuDomainObjExitMonitor(vm);

    if (rc < 0)
        return -1;

    privJob->stats.mig = fetched;
    return 0;
}

/**
 * qemuMonitorGetMigrationStats:
 * @mon: monitor to query
 * @stats: filled in by qemuMonitorJSONGetMigrationStats()
 * @error: optional output; reset to NULL before the query when non-NULL
 *
 * Thin wrapper that forwards to qemuMonitorJSONGetMigrationStats().
 *
 * Returns the value returned by qemuMonitorJSONGetMigrationStats().
 * NOTE(review): QEMU_CHECK_MONITOR() is not visible here; it presumably
 * returns early on an unusable monitor -- confirm.
 */
int
qemuMonitorGetMigrationStats(qemuMonitor *mon,
                             qemuMonitorMigrationStats *stats,
                             char **error)
{
    QEMU_CHECK_MONITOR(mon);

    if (error != NULL) {
        *error = NULL;
    }

    return qemuMonitorJSONGetMigrationStats(mon, stats, error);
}

/*
* Builds the argument-less QMP "query-migrate" command. The code that sends
* it and fills @stats / @error from the reply is elided in this listing.
*/
int qemuMonitorJSONGetMigrationStats(qemuMonitor *mon,
qemuMonitorMigrationStats *stats,
char **error)
{
g_autoptr(virJSONValue) cmd = qemuMonitorJSONMakeCommand("query-migrate",
NULL);
...
}

可以看到,其逻辑也很清晰,即轮询query-migrate的qmp命令,查询热迁移的状态并进行相应的处理即可

qemu

源端

migrate qmp

根据前面内容可知,libvirt最终会调用migrate的qmp命令来启动源端的热迁移流程,逻辑为qmp_migrate(),如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/*
* Handler of the QMP "migrate" command on the source side (excerpt; the
* setup of "s", "addr", "local_err" and "resume_requested" is elided).
*
* For a socket transport of type INET, UNIX or VSOCK the outgoing connection
* is started via socket_start_outgoing_migration(). Any error collected in
* local_err is recorded with migrate_fd_error() and propagated to @errp.
*/
void qmp_migrate(const char *uri, bool has_channels,
MigrationChannelList *channels, bool has_blk, bool blk,
bool has_inc, bool inc, bool has_detach, bool detach,
bool has_resume, bool resume, Error **errp)
{
...
if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
SocketAddress *saddr = &addr->u.socket;
if (saddr->type == SOCKET_ADDRESS_TYPE_INET ||
saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) {
socket_start_outgoing_migration(s, saddr, &local_err);
}
}
...
if (local_err) {
/* The yank instance is unregistered only when this is not a resume
* request. */
if (!resume_requested) {
yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}
migrate_fd_error(s, local_err);
error_propagate(errp, local_err);
return;
}
}

/*
* Starts an asynchronous connect to @saddr (excerpt; creation of "sioc" and
* "data" is elided). socket_outgoing_migration() is registered as the
* completion callback with "data" as its opaque argument, and
* socket_connect_data_free() as the destructor for "data".
*/
void socket_start_outgoing_migration(MigrationState *s,
SocketAddress *saddr,
Error **errp)
{
...
qio_channel_socket_connect_async(sioc,
saddr,
socket_outgoing_migration,
data,
socket_connect_data_free,
NULL);
}

//#0 socket_outgoing_migration (task=0x555557595730, opaque=0x555557973350) at ../migration/socket.c:80
//#1 0x0000555555ddecc7 in qio_task_complete (task=0x555557595730) at ../io/task.c:197
//#2 0x0000555555dde91d in qio_task_thread_result (opaque=0x555557595730) at ../io/task.c:112
//#3 0x00007ffff7da5385 in ?? () from target:/lib/x86_64-linux-gnu/libglib-2.0.so.0
//#4 0x00007ffff7da7c78 in g_main_context_dispatch () from target:/lib/x86_64-linux-gnu/libglib-2.0.so.0
//#5 0x0000555555faf750 in glib_pollfds_poll () at ../util/main-loop.c:287
//#6 0x0000555555faf7db in os_host_main_loop_wait (timeout=0) at ../util/main-loop.c:310
//#7 0x0000555555faf907 in main_loop_wait (nonblocking=0) at ../util/main-loop.c:589
//#8 0x0000555555b3fbac in qemu_main_loop () at ../system/runstate.c:783
//#9 0x0000555555db435e in qemu_default_main () at ../system/main.c:37
//#10 0x0000555555db439f in main (argc=53, argv=0x7fffffffe658) at ../system/main.c:48
/*
* Completion callback of the asynchronous connect (excerpt; the error
* handling that may jump to "out" is elided). Hands the connected channel,
* the host name and any error to migration_channel_connect(), then drops
* this function's reference on the channel.
*/
static void socket_outgoing_migration(QIOTask *task,
gpointer opaque)
{
struct SocketConnectData *data = opaque;
QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
Error *err = NULL;
...
out:
migration_channel_connect(data->s, sioc, data->hostname, err);
object_unref(OBJECT(sioc));
}

可以看到,其核心逻辑就是在qio_channel_socket_connect_async()中异步的连接前面prepare阶段监听的端口并在连接后触发migration_channel_connect()回调逻辑,同时也会触发前面prepare阶段监听端口设置的有新连接到来时的回调函数socket_accept_incoming_migration(),其逻辑在后面目的端章节中再描述

创建主线程

migration_channel_connect()的主要逻辑就是创建热迁移线程live_migration,并在该线程中执行migration_thread()完成后续热迁移的流程,整体逻辑如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/*
* Called once the outgoing channel @ioc is connected, or failed with @error
* (excerpt; the TLS upgrade branch is elided).
*
* Without an error and without a TLS upgrade, the channel is wrapped in an
* output QEMUFile and published as s->to_dst_file. migrate_fd_connect() is
* then called with @error on the path shown.
*/
void migration_channel_connect(MigrationState *s,
QIOChannel *ioc,
const char *hostname,
Error *error)
{
...
if (!error) {
if (migrate_channel_requires_tls_upgrade(ioc)) {
...
} else {
QEMUFile *f = qemu_file_new_output(ioc);

migration_ioc_register_yank(ioc);

/* s->to_dst_file is assigned while holding qemu_file_lock. */
qemu_mutex_lock(&s->qemu_file_lock);
s->to_dst_file = f;
qemu_mutex_unlock(&s->qemu_file_lock);
}
}
migrate_fd_connect(s, error);
...
}

/*
* Finishes source-side setup and starts the migration thread (excerpt; error
* handling and the background-snapshot branch are elided).
*
* On the path shown it records the expected downtime, applies the rate
* limit, switches the destination file to blocking mode and creates the
* joinable "live_migration" thread running migration_thread().
*/
void migrate_fd_connect(MigrationState *s, Error *error_in)
{
...
s->expected_downtime = migrate_downtime_limit();
...
migration_rate_set(rate_limit);
qemu_file_set_blocking(s->to_dst_file, true);
...
if (migrate_background_snapshot()) {
...
} else {
qemu_thread_create(&s->thread, "live_migration",
migration_thread, s, QEMU_THREAD_JOINABLE);
}
/* Set after either branch has created its thread. */
s->migration_thread_running = true;
return;
...
}
migration_thread

整个热迁移的控制逻辑就在migration_thread()函数中,核心逻辑就是迭代设备/内存脏数据到目的端直到脏页产生速率收敛,小于动态计算的阈值后停止迭代并完成最终的数据同步,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
* Body of the "live_migration" thread (excerpt; declarations, bookkeeping
* and the return statement are elided).
*
* Sequence shown: set up multifd, write the stream header, run the
* per-device setup, iterate while the migration is active, then call
* migration_iteration_finish().
*/
static void *migration_thread(void *opaque)
{
...
/* A multifd setup failure skips straight to the finish path. */
if (!multifd_send_setup()) {
goto out;
}
...
qemu_savevm_state_header(s->to_dst_file);
...
qemu_savevm_state_setup(s->to_dst_file);
...
while (migration_is_active()) {
/* Run an iteration when it is urgent or the rate limit has not been
* exceeded. */
if (urgent || !migration_rate_exceeded(s->to_dst_file)) {
MigIterateState iter_state = migration_iteration_run(s);
if (iter_state == MIG_ITERATE_SKIP) {
continue;
} else if (iter_state == MIG_ITERATE_BREAK) {
break;
}
}
...
urgent = migration_rate_limit();
}

out:
...
migration_iteration_finish(s);
...
}
multifd线程

multifd_send_setup()的核心逻辑就是初始化multifd线程所需的数据结构,即MultiFDPages_t、multifd_send_state、MultiFDInit_t、MultiFDPacket_t和struct MultiFDSendParams,以及multifd线程multifd_send_thread()

具体的,MultiFDPages_t用于缓存内存脏页,如下所示

1
2
3
4
5
6
7
8
9
10
11
/*
* Batch of pages that all belong to one RAMBlock. Each page is identified by
* its offset inside @block.
*/
typedef struct {
/* number of used pages */
uint32_t num;
/* number of normal pages */
uint32_t normal_num;
/* number of allocated pages */
uint32_t allocated;
/* offset of each page */
ram_addr_t *offset;
/* RAMBlock that the offsets above refer to */
RAMBlock *block;
} MultiFDPages_t;

其中block是qemu中用于表示gpa及其对应hva的数据结构;num和offset分别表示其缓存的脏页个数和各脏页在block中的起始偏移。可以看到,前面multifd_send_setup()中一共初始化migrate_multifd_channels()+1个MultiFDPages_t实例:multifd_send_state(即migration_thread线程)一个,每个struct MultiFDSendParams(即multifd线程)一个;当qemu扫描内存脏页时,填充multifd_send_state的MultiFDPages_t实例;当qemu需要发送到目的端时,交换multifd_send_state和struct MultiFDSendParams的MultiFDPages_t实例即可

multifd_send_state包含热迁移源端multifd特性所需的状态数据,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/*
* Global state of the multifd sending side, reached through the
* multifd_send_state pointer.
*/
struct {
/* per-channel parameters */
MultiFDSendParams *params;
/* array of pages to send */
MultiFDPages_t *pages;
/*
* Global number of generated multifd packets.
*
* Note that we used 'uintptr_t' because it'll naturally support atomic
* operations on both 32bit / 64 bits hosts. It means on 32bit systems
* multifd will overflow the packet_num easier, but that should be
* fine.
*
* Another option is to use QEMU's Stat64 then it'll be 64 bits on all
* hosts, however so far it does not support atomic fetch_add() yet.
* Make it easy for now.
*/
uintptr_t packet_num;
/*
* Synchronization point past which no more channels will be
* created.
*/
QemuSemaphore channels_created;
/* send channels ready */
QemuSemaphore channels_ready;
/*
* Have we already run terminate threads. There is a race when it
* happens that we got one error while we are exiting.
* We will use atomic operations. Only valid values are 0 and 1.
*/
int exiting;
/* multifd ops */
MultiFDMethods *ops;
} *multifd_send_state;

其中pages用于获取子机脏页并交给multifd线程发送给目的端;channels_created用于migration_thread等待multifd_send_thread创建完成;而channels_ready信号量则表明可用的multifd_send_thread个数。

MultiFDInit_t则是用于初始化multifd发送线程和接收线程的信息,如下所示

1
2
3
4
5
6
7
8
/*
* Initial message written once per channel by multifd_send_initial_packet().
* The struct is packed, so the members are laid out without padding.
*/
typedef struct {
uint32_t magic;
uint32_t version;
unsigned char uuid[16]; /* QemuUUID */
uint8_t id; /* set from MultiFDSendParams.id by the sender */
uint8_t unused1[7]; /* Reserved for future use */
uint64_t unused2[4]; /* Reserved for future use */
} __attribute__((packed)) MultiFDInit_t;

其中magic用于接收端区别是migration_thread连接还是multifd_send_thread连接;而id则是用于接收端区别哪个multifd_send_thread进行连接

MultiFDPacket_t则是用于multifd发送线程向multifd接收线程描述待同步脏页的元数据,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
* Per-packet metadata header of the multifd stream. The struct is packed and
* ends in a flexible array with one offset entry per page.
*/
typedef struct {
uint32_t magic;
uint32_t version;
uint32_t flags;
/* maximum number of allocated pages */
uint32_t pages_alloc;
/* non zero pages */
uint32_t normal_pages;
/* size of the next packet that contains pages */
uint32_t next_packet_size;
uint64_t packet_num;
/* zero pages */
uint32_t zero_pages;
uint32_t unused32[1]; /* Reserved for future use */
uint64_t unused64[3]; /* Reserved for future use */
char ramblock[256];
/*
* This array contains the pointers to:
* - normal pages (initial normal_pages entries)
* - zero pages (following zero_pages entries)
*/
uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;

struct MultiFDSendParams则包含multifd_send_thread在热迁移过程中所需的状态数据,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
/*
* Per-channel state of one multifd sender thread. The members are grouped
* into read-only setup data, fields shared with the requester, and fields
* local to the sender thread.
*/
typedef struct {
/* Fields are only written at creating/deletion time */
/* No lock required for them, they are read only */

/* channel number */
uint8_t id;
/* channel thread name */
char *name;
/* channel thread id */
QemuThread thread;
bool thread_created;
QemuThread tls_thread;
bool tls_thread_created;
/* communication channel */
QIOChannel *c;
/* packet allocated len */
uint32_t packet_len;
/* guest page size */
uint32_t page_size;
/* number of pages in a full packet */
uint32_t page_count;
/* multifd flags for sending ram */
int write_flags;

/* sem where to wait for more work */
QemuSemaphore sem;
/* syncs main thread and channels */
QemuSemaphore sem_sync;

/* multifd flags for each packet */
uint32_t flags;
/*
* The sender thread has work to do if either of below boolean is set.
*
* @pending_job: a job is pending
* @pending_sync: a sync request is pending
*
* For both of these fields, they're only set by the requesters, and
* cleared by the multifd sender threads.
*/
bool pending_job;
bool pending_sync;
/* array of pages to send.
* The owner of 'pages' depends on the 'pending_job' value:
* pending_job == 0 -> migration_thread can use it.
* pending_job != 0 -> multifd_channel can use it.
*/
MultiFDPages_t *pages;

/* thread local variables. No locking required */

/* pointer to the packet */
MultiFDPacket_t *packet;
/* size of the next packet that contains pages */
uint32_t next_packet_size;
/* packets sent through this channel */
uint64_t packets_sent;
/* non zero pages sent through this channel */
uint64_t total_normal_pages;
/* zero pages sent through this channel */
uint64_t total_zero_pages;
/* buffers to send */
struct iovec *iov;
/* number of iovs used */
uint32_t iovs_num;
/* used for compression methods */
void *compress_data;
} MultiFDSendParams;

其中sem是migration_thread用来通知multifd_send_thread有数据需要发送的信号量,其中pending_job表示有脏页数据,pages则是对应的脏页数据,而pending_sync表示有同步信息需要发送,packet用来向multifd接收端描述待发送数据的元数据信息;而sem_sync则是multifd_send_thread用来通知migration_thread完成了同步信息的发送。

而multifd_send_setup()整体的逻辑如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
/* Method table used when no multifd compression is selected. */
static MultiFDMethods multifd_nocomp_ops = {
.send_setup = nocomp_send_setup,
.send_cleanup = nocomp_send_cleanup,
.send_prepare = nocomp_send_prepare,
.recv_setup = nocomp_recv_setup,
.recv_cleanup = nocomp_recv_cleanup,
.recv = nocomp_recv
};

/*
* Method tables indexed by compression type. Only the NONE slot is filled in
* here; the remaining slots are left zero-initialized in this excerpt.
*/
static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = {
[MULTIFD_COMPRESSION_NONE] = &multifd_nocomp_ops,
};

//#0 multifd_send_setup () at ../migration/multifd.c:1146
//#1 0x0000555555b6a007 in migration_thread (opaque=0x555556f34ff0) at ../migration/migration.c:3442
//#2 0x0000555555f93cfe in qemu_thread_start (args=0x555557ab8510) at ../util/qemu-thread-posix.c:541
//#3 0x00007ffff7be5b7b in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#4 0x00007ffff7c637b8 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
/*
* Allocates and initializes the multifd sender state and starts creation of
* one send channel per configured multifd channel (excerpt; declarations and
* some branches are elided).
*
* Returns true when multifd is disabled or setup succeeded on the path
* shown, false if starting a channel fails.
*/
bool multifd_send_setup(void)
{
...
/* Number of target pages that fit into one multifd packet. */
uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
...
/* Nothing to set up when multifd is not enabled. */
if (!migrate_multifd()) {
return true;
}

thread_count = migrate_multifd_channels();
multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
multifd_send_state->pages = multifd_pages_init(page_count);
qemu_sem_init(&multifd_send_state->channels_created, 0);
qemu_sem_init(&multifd_send_state->channels_ready, 0);
qatomic_set(&multifd_send_state->exiting, 0);
multifd_send_state->ops = multifd_ops[migrate_multifd_compression()];

for (i = 0; i < thread_count; i++) {
MultiFDSendParams *p = &multifd_send_state->params[i];

qemu_sem_init(&p->sem, 0);
qemu_sem_init(&p->sem_sync, 0);
p->id = i;
/* Each channel gets its own page batch in addition to the global one. */
p->pages = multifd_pages_init(page_count);

if (use_packets) {
/* Header plus one 64-bit offset entry per page. */
p->packet_len = sizeof(MultiFDPacket_t)
+ sizeof(uint64_t) * page_count;
p->packet = g_malloc0(p->packet_len);
p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
p->packet->version = cpu_to_be32(MULTIFD_VERSION);

/* We need one extra place for the packet header */
p->iov = g_new0(struct iovec, page_count + 1);
} else {
...
}
p->name = g_strdup_printf("multifdsend_%d", i);
p->page_size = qemu_target_page_size();
p->page_count = page_count;
p->write_flags = 0;

if (!multifd_new_send_channel_create(p, &local_err)) {
return false;
}
}

/*
* Wait until channel creation has started for all channels. The
* creation can still fail, but no more channels will be created
* past this point.
*/
for (i = 0; i < thread_count; i++) {
qemu_sem_wait(&multifd_send_state->channels_created);
}
...
return true;
}

可以看到,其就是分别初始化multifd_send_state以及每个multifd_send_thread线程的MultiFDSendParams变量,然后调用multifd_new_send_channel_create()创建所有的multifd_send_thread线程并等待所有线程创建结束。其创建线程过程如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/*
* Starts creation of one send channel (excerpt; part of the body is elided).
* multifd_new_send_channel_async() is registered as the callback and
* receives @opaque. Returns true on the path shown.
*/
static bool multifd_new_send_channel_create(gpointer opaque, Error **errp)
{
...
socket_send_channel_create(multifd_new_send_channel_async, opaque);
return true;
}

/*
* Callback run when a send channel has been created (excerpt; the error and
* TLS paths that may jump to "out" are elided). On the path shown the
* channel is connected to its sender parameters, which starts the sender
* thread, and channels_created is then posted.
*/
static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
{
MultiFDSendParams *p = opaque;
QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task));
Error *local_err = NULL;
bool ret;

...
multifd_channel_connect(p, ioc);
ret = true;
...

out:
/*
* Here we're not interested whether creation succeeded, only that
* it happened at all.
*/
multifd_send_channel_created();
...
}

/*
* Binds the connected channel @ioc to the sender parameters @p and starts
* the joinable sender thread, named p->name and running
* multifd_send_thread() with @p as its argument.
*/
void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc)
{
qio_channel_set_delay(ioc, false);

migration_ioc_register_yank(ioc);
/* Setup p->c only if the channel is completely setup */
p->c = ioc;

/* The flag is set before the thread is actually created. */
p->thread_created = true;
qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
QEMU_THREAD_JOINABLE);
}

/*
* Posts channels_created once. multifd_send_setup() waits on this semaphore
* one time per channel.
*/
void multifd_send_channel_created(void)
{
qemu_sem_post(&multifd_send_state->channels_created);
}

可以看到,在完成multifd_new_send_channel_async()调用multifd_channel_connect()创建multifd_send_thread线程后,其会调用multifd_send_channel_created()通知multifd_send_setup()相关multifd_send_thread已经创建完

multifd_send_thread的逻辑就比较简单,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/*
* Body of one multifd sender thread (excerpt; declarations, the "out" label
* and cleanup are elided).
*
* When packets are in use it first sends the initial packet. It then loops:
* announce readiness on channels_ready, wait on p->sem for work, and either
* send the queued pages (pending_job) or send a SYNC packet (pending_sync).
*/
static void *multifd_send_thread(void *opaque)
{
MultiFDSendParams *p = opaque;
...
int ret = 0;
bool use_packets = multifd_use_packets();
...
if (use_packets) {
if (multifd_send_initial_packet(p, &local_err) < 0) {
ret = -1;
goto out;
}
}

while (true) {
/* Signal that this channel can take work, then sleep until woken. */
qemu_sem_post(&multifd_send_state->channels_ready);
qemu_sem_wait(&p->sem);

if (multifd_send_should_exit()) {
break;
}

/*
* Read pending_job flag before p->pages. Pairs with the
* qatomic_store_release() in multifd_send_pages().
*/
if (qatomic_load_acquire(&p->pending_job)) {
MultiFDPages_t *pages = p->pages;

p->iovs_num = 0;
assert(pages->num);

/* Let the selected method fill in the packet and iovecs. */
ret = multifd_send_state->ops->send_prepare(p, &local_err);
if (ret != 0) {
break;
}

...
ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num,
NULL, 0, p->write_flags,
&local_err);
...

if (ret != 0) {
break;
}

stat64_add(&mig_stats.multifd_bytes,
p->next_packet_size + p->packet_len);
stat64_add(&mig_stats.normal_pages, pages->normal_num);
stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num);

multifd_pages_reset(p->pages);
p->next_packet_size = 0;

/*
* Making sure p->pages is published before saying "we're
* free". Pairs with the smp_mb_acquire() in
* multifd_send_pages().
*/
qatomic_store_release(&p->pending_job, false);
} else {
/*
* If not a normal job, must be a sync request. Note that
* pending_sync is a standalone flag (unlike pending_job), so
* it doesn't require explicit memory barriers.
*/
assert(qatomic_read(&p->pending_sync));

if (use_packets) {
/* Only the packet header is written for a SYNC request. */
p->flags = MULTIFD_FLAG_SYNC;
multifd_send_fill_packet(p);
ret = qio_channel_write_all(p->c, (void *)p->packet,
p->packet_len, &local_err);
if (ret != 0) {
break;
}
/* p->next_packet_size will always be zero for a SYNC packet */
stat64_add(&mig_stats.multifd_bytes, p->packet_len);
p->flags = 0;
}

/* Clear the request and wake whoever waits on sem_sync. */
qatomic_set(&p->pending_sync, false);
qemu_sem_post(&p->sem_sync);
}
}

...
return NULL;
}

/*
* send_prepare method of the no-compression table (excerpt; part of the body
* is elided). Queues the packet header, then the page iovecs, tags the
* packet as NOCOMP and fills in the packet metadata. Returns 0 on the path
* shown.
*/
static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
{
...
multifd_send_prepare_header(p);

multifd_send_prepare_iovs(p);
p->flags |= MULTIFD_FLAG_NOCOMP;

multifd_send_fill_packet(p);
...
return 0;
}

/*
 * Point the first iovec slot of @p at the packet header (p->packet,
 * p->packet_len bytes) and count it in p->iovs_num.
 */
static inline void multifd_send_prepare_header(MultiFDSendParams *p)
{
    struct iovec *header = &p->iov[0];

    header->iov_base = p->packet;
    header->iov_len = p->packet_len;
    p->iovs_num += 1;
}

/*
 * Send the one-time MultiFDInit_t message on channel @p.
 *
 * The message carries the multifd magic and version (both converted with
 * cpu_to_be32()), the channel id and the UUID copied from qemu_uuid. All
 * other members are zero.
 *
 * Returns 0 on success, after adding the message size to
 * mig_stats.multifd_bytes, or -1 if the write fails (details in @errp).
 */
static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
{
    MultiFDInit_t hello = {
        .magic = cpu_to_be32(MULTIFD_MAGIC),
        .version = cpu_to_be32(MULTIFD_VERSION),
        .id = p->id,
    };
    const size_t hello_len = sizeof(hello);

    memcpy(hello.uuid, &qemu_uuid.data, sizeof(hello.uuid));

    if (qio_channel_write_all(p->c, (char *)&hello, hello_len, errp) != 0) {
        return -1;
    }

    stat64_add(&mig_stats.multifd_bytes, hello_len);
    return 0;
}

/*
* Method tables indexed by compression type. Only the NONE slot is filled in
* here.
* NOTE(review): this excerpt references multifd_nocomp_ops before the
* definition below; in a real translation unit the table must be declared
* first.
*/
static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = {
[MULTIFD_COMPRESSION_NONE] = &multifd_nocomp_ops,
};

/* Method table used when no multifd compression is selected. */
static MultiFDMethods multifd_nocomp_ops = {
.send_setup = nocomp_send_setup,
.send_cleanup = nocomp_send_cleanup,
.send_prepare = nocomp_send_prepare,
.recv_setup = nocomp_recv_setup,
.recv_cleanup = nocomp_recv_cleanup,
.recv = nocomp_recv
};

其首先会调用multifd_send_initial_packet向接收端同步指定信息,然后循环发送migration_thread指定的数据:首先post multifd_send_state->channels_ready表明当前multifd_send_thread可以进行数据发送,然后通过p->sem等待migration_thread分派的数据;当有分派的数据后,如果是脏页信息,则在multifd_send_state->ops->send_prepare()(nocomp_send_prepare())中填充相关的p->packet和脏页数据到p->iov缓存中然后发送;如果是同步信息,则设置MULTIFD_FLAG_SYNC标志并调用multifd_send_fill_packet()填充p->packet后,仅发送该packet(同步包不携带脏页数据),最后通过p->sem_sync通知migration_thread同步信息已发送

qemu_savevm_state_header

前面介绍了multifd_send_thread会通过MultiFDInit_t结构初始化multifd发送线程和接收线程的信息;类似的,migration_thread线程会通过qemu_savevm_state_header()函数初始化migration发送线程migration_thread和接收协程process_incoming_migration_co()的信息,逻辑如下所示

1
2
3
4
5
6
7
8
9
10
11
/*
* Starts the migration stream on @f (excerpt; the rest of the header is
* elided). Creates the JSON writer stored in s->vmdesc, then writes the file
* magic and the file version as big-endian 32-bit values.
*/
void qemu_savevm_state_header(QEMUFile *f)
{
MigrationState *s = migrate_get_current();

s->vmdesc = json_writer_new(false);

trace_savevm_state_header();
qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
qemu_put_be32(f, QEMU_VM_FILE_VERSION);
...
}

可以看到,和multifd线程发送的信息具有公共的结构,从而可以区别multifd线程和migration主线程

qemu_savevm_state_setup

qemu的热迁移机制就是将源端虚拟机的完整运行状态(包括内存、设备寄存器等)序列化传输,并在目的端反序列化并恢复执行的过程。为实现这一目标,qemu将虚拟机中每一个可迁移对象的状态,抽象成一个个独立的struct SaveStateEntry对象,并将所有这些对象放在savevm_state全局变量中进行管理,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
* Global migration registry. "handlers" is the head of the tail queue of
* registered SaveStateEntry objects.
*/
typedef struct SaveState {
QTAILQ_HEAD(, SaveStateEntry) handlers;
/* one slot per migration priority (MIG_PRI_MAX + 1 in total) */
SaveStateEntry *handler_pri_head[MIG_PRI_MAX + 1];
int global_section_id;
uint32_t len;
const char *name;
uint32_t target_page_bits;
uint32_t caps_count;
MigrationCapability *capabilities;
QemuUUID uuid;
} SaveState;

/*
* One registered migratable object, linked into SaveState.handlers through
* "entry". It carries both an "ops" pointer (SaveVMHandlers callbacks) and a
* "vmsd" pointer (VMStateDescription); "opaque" is the pointer handed back
* to the callbacks.
*/
typedef struct SaveStateEntry {
QTAILQ_ENTRY(SaveStateEntry) entry;
char idstr[256];
uint32_t instance_id;
int alias_id;
int version_id;
/* version id read from the stream */
int load_version_id;
int section_id;
/* section id read from the stream */
int load_section_id;
const SaveVMHandlers *ops;
const VMStateDescription *vmsd;
void *opaque;
CompatEntry *compat;
int is_ram;
} SaveStateEntry;

对于数据量较小的设备状态(例如cpu),qemu使用VMStateDescription结构来描述设备所需要迁移的每一个状态字段,该状态会在迁移完成阶段一次性打包传输。设备会通过vmstate_register()将其注册到savevm_state中,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/*
* Declarative description of a device's migratable state: a name, version
* information, optional hook callbacks, a list of fields and optional
* subsections.
*/
struct VMStateDescription {
const char *name;
bool unmigratable;
/*
* This VMSD describes something that should be sent during setup phase
* of migration. It plays similar role as save_setup() for explicitly
* registered vmstate entries, so it can be seen as a way to describe
* save_setup() in VMSD structures.
*
* Note that for now, a SaveStateEntry cannot have a VMSD and
* operations (e.g., save_setup()) set at the same time. Consequently,
* save_setup() and a VMSD with early_setup set to true are mutually
* exclusive. For this reason, also early_setup VMSDs are migrated in a
* QEMU_VM_SECTION_FULL section, while save_setup() data is migrated in
* a QEMU_VM_SECTION_START section.
*/
bool early_setup;
int version_id;
int minimum_version_id;
MigrationPriority priority;
/* optional hooks; each receives the registered opaque pointer */
int (*pre_load)(void *opaque);
int (*post_load)(void *opaque, int version_id);
int (*pre_save)(void *opaque);
int (*post_save)(void *opaque);
bool (*needed)(void *opaque);
bool (*dev_unplug_pending)(void *opaque);

const VMStateField *fields;
const VMStateDescription * const *subsections;
};

/*
* Migration description of the common CPU state. It lists two 32-bit
* CPUState fields (halted, interrupt_request) and two subsections, with
* pre_load and post_load hooks.
*/
const VMStateDescription vmstate_cpu_common = {
.name = "cpu_common",
.version_id = 1,
.minimum_version_id = 1,
.pre_load = cpu_common_pre_load,
.post_load = cpu_common_post_load,
.fields = (const VMStateField[]) {
VMSTATE_UINT32(halted, CPUState),
VMSTATE_UINT32(interrupt_request, CPUState),
VMSTATE_END_OF_LIST()
},
/* NULL-terminated list of subsections */
.subsections = (const VMStateDescription * const []) {
&vmstate_cpu_common_exception_index,
&vmstate_cpu_common_crash_occurred,
NULL
}
};

/*
* Excerpt of CPU realization. When the CPU device has no vmsd of its own
* (qdev_get_vmsd() returns NULL), vmstate_cpu_common is registered for it,
* with cpu->cpu_index as the id argument and the CPU as the opaque pointer.
*/
bool cpu_exec_realizefn(CPUState *cpu, Error **errp)
{
...
if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
}
...
}

而对于数据量巨大且在迁移过程中可持续变化的状态(例如内存),qemu使用SaveVMHandlers结构,其提供了一组回调函数,允许在迁移的迭代阶段增量式地同步状态并在完成阶段完成最终的数据传输。设备可以通过register_savevm_live()将其注册到savevm_state全局变量中,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
typedef struct SaveVMHandlers {

/* The following handlers run inside the BQL. */

/**
* @save_state
*
* Saves state section on the source using the latest state format
* version.
*
* Legacy method. Should be deprecated when all users are ported
* to VMStateDescription.
*
* @f: QEMUFile where to send the data
* @opaque: data pointer passed to register_savevm_live()
*/
void (*save_state)(QEMUFile *f, void *opaque);

/**
* @save_prepare
*
* Called early, even before migration starts, and can be used to
* perform early checks.
*
* @opaque: data pointer passed to register_savevm_live()
* @errp: pointer to Error*, to store an error if it happens.
*
* Returns zero to indicate success and negative for error
*/
int (*save_prepare)(void *opaque, Error **errp);

/**
* @save_setup
*
* Initializes the data structures on the source and transmits
* first section containing information on the device
*
* @f: QEMUFile where to send the data
* @opaque: data pointer passed to register_savevm_live()
*
* Returns zero to indicate success and negative for error
*/
int (*save_setup)(QEMUFile *f, void *opaque);

/**
* @save_cleanup
*
* Uninitializes the data structures on the source
*
* @opaque: data pointer passed to register_savevm_live()
*/
void (*save_cleanup)(void *opaque);

/**
* @save_live_complete_postcopy
*
* Called at the end of postcopy for all postcopyable devices.
*
* @f: QEMUFile where to send the data
* @opaque: data pointer passed to register_savevm_live()
*
* Returns zero to indicate success and negative for error
*/
int (*save_live_complete_postcopy)(QEMUFile *f, void *opaque);

/**
* @save_live_complete_precopy
*
* Transmits the last section for the device containing any
* remaining data at the end of a precopy phase. When postcopy is
* enabled, devices that support postcopy will skip this step,
* where the final data will be flushed at the end of postcopy via
* @save_live_complete_postcopy instead.
*
* @f: QEMUFile where to send the data
* @opaque: data pointer passed to register_savevm_live()
*
* Returns zero to indicate success and negative for error
*/
int (*save_live_complete_precopy)(QEMUFile *f, void *opaque);

/* This runs both outside and inside the BQL. */

/**
* @is_active
*
* Will skip a state section if not active
*
* @opaque: data pointer passed to register_savevm_live()
*
* Returns true if state section is active else false
*/
bool (*is_active)(void *opaque);

/**
* @has_postcopy
*
* Checks if a device supports postcopy
*
* @opaque: data pointer passed to register_savevm_live()
*
* Returns true for postcopy support else false
*/
bool (*has_postcopy)(void *opaque);

/**
* @is_active_iterate
*
* As #SaveVMHandlers.is_active(), will skip an inactive state
* section in qemu_savevm_state_iterate.
*
* For example, it is needed for only-postcopy-states, which needs
* to be handled by qemu_savevm_state_setup() and
* qemu_savevm_state_pending(), but do not need iterations until
* not in postcopy stage.
*
* @opaque: data pointer passed to register_savevm_live()
*
* Returns true if state section is active else false
*/
bool (*is_active_iterate)(void *opaque);

/* This runs outside the BQL in the migration case, and
* within the lock in the savevm case. The callback had better only
* use data that is local to the migration thread or protected
* by other locks.
*/

/**
* @save_live_iterate
*
* Should send a chunk of data until the point that stream
* bandwidth limits tell it to stop. Each call generates one
* section.
*
* @f: QEMUFile where to send the data
* @opaque: data pointer passed to register_savevm_live()
*
* Returns 0 to indicate that there is still more data to send,
* 1 that there is no more data to send and
* negative to indicate an error.
*/
int (*save_live_iterate)(QEMUFile *f, void *opaque);

/* This runs outside the BQL! */

/**
* @state_pending_estimate
*
* This estimates the remaining data to transfer
*
* Sum of @can_postcopy and @must_precopy is the whole amount of
* pending data.
*
* @opaque: data pointer passed to register_savevm_live()
* @must_precopy: amount of data that must be migrated in precopy
* or in stopped state, i.e. that must be migrated
* before target start.
* @can_postcopy: amount of data that can be migrated in postcopy
* or in stopped state, i.e. after target start.
* Some can also be migrated during precopy (RAM).
* Some must be migrated after source stops
* (block-dirty-bitmap)
*/
void (*state_pending_estimate)(void *opaque, uint64_t *must_precopy,
uint64_t *can_postcopy);

/**
* @state_pending_exact
*
* This calculates the exact remaining data to transfer
*
* Sum of @can_postcopy and @must_precopy is the whole amount of
* pending data.
*
* @opaque: data pointer passed to register_savevm_live()
* @must_precopy: amount of data that must be migrated in precopy
* or in stopped state, i.e. that must be migrated
* before target start.
* @can_postcopy: amount of data that can be migrated in postcopy
* or in stopped state, i.e. after target start.
* Some can also be migrated during precopy (RAM).
* Some must be migrated after source stops
* (block-dirty-bitmap)
*/
void (*state_pending_exact)(void *opaque, uint64_t *must_precopy,
uint64_t *can_postcopy);

/**
* @load_state
*
* Load sections generated by any of the save functions that
* generate sections.
*
* Legacy method. Should be deprecated when all users are ported
* to VMStateDescription.
*
* @f: QEMUFile where to receive the data
* @opaque: data pointer passed to register_savevm_live()
* @version_id: the maximum version_id supported
*
* Returns zero to indicate success and negative for error
*/
int (*load_state)(QEMUFile *f, void *opaque, int version_id);

/**
* @load_setup
*
* Initializes the data structures on the destination.
*
* @f: QEMUFile where to receive the data
* @opaque: data pointer passed to register_savevm_live()
*
* Returns zero to indicate success and negative for error
*/
int (*load_setup)(QEMUFile *f, void *opaque);

/**
* @load_cleanup
*
* Uninitializes the data structures on the destination.
*
* @opaque: data pointer passed to register_savevm_live()
*
* Returns zero to indicate success and negative for error
*/
int (*load_cleanup)(void *opaque);

/**
* @resume_prepare
*
* Called when postcopy migration wants to resume from failure
*
* @s: Current migration state
* @opaque: data pointer passed to register_savevm_live()
*
* Returns zero to indicate success and negative for error
*/
int (*resume_prepare)(MigrationState *s, void *opaque);

/**
* @switchover_ack_needed
*
* Checks if switchover ack should be used. Called only on
* destination.
*
* @opaque: data pointer passed to register_savevm_live()
*
* Returns true if switchover ack should be used and false
* otherwise
*/
bool (*switchover_ack_needed)(void *opaque);
} SaveVMHandlers;

/*
 * Handler table that plugs guest RAM into the generic savevm machinery.
 * Entries are grouped by the side of the migration that invokes them.
 */
static SaveVMHandlers savevm_ram_handlers = {
    /* Source side: setup, iteration, completion and teardown. */
    .save_setup                  = ram_save_setup,
    .save_live_iterate           = ram_save_iterate,
    .save_live_complete_precopy  = ram_save_complete,
    .save_live_complete_postcopy = ram_save_complete,
    .save_cleanup                = ram_save_cleanup,

    /* Source side: pending-data accounting and postcopy support. */
    .state_pending_estimate      = ram_state_pending_estimate,
    .state_pending_exact         = ram_state_pending_exact,
    .has_postcopy                = ram_has_postcopy,
    .resume_prepare              = ram_resume_prepare,

    /* Destination side. */
    .load_setup                  = ram_load_setup,
    .load_state                  = ram_load,
    .load_cleanup                = ram_load_cleanup,
};

/*
 * ram_mig_init: set up the RAM migration machinery.
 *
 * Initializes XBZRLE.lock, registers savevm_ram_handlers under the "ram"
 * id with &ram_state as the opaque pointer handed to the handlers, and
 * adds ram_mig_ram_notifier as a RAM block notifier.
 */
void ram_mig_init(void)
{
qemu_mutex_init(&XBZRLE.lock);
/*
 * NOTE(review): the 0 and 4 are presumably the instance id and the
 * version id - confirm against the register_savevm_live() prototype.
 */
register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
ram_block_notifier_add(&ram_mig_ram_notifier);
}

qemu_savevm_state_setup()则用于初始化每个设备的热迁移状态并与目的端协商相关的初始化状态,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * qemu_savevm_state_setup: run the setup stage of every registered
 * SaveStateEntry (excerpt; "..." marks elided code).
 *
 * For each entry in savevm_state.handlers that provides a save_setup
 * callback and is not reported inactive by is_active(), this writes a
 * QEMU_VM_SECTION_START section header to @f, invokes save_setup() and
 * writes the section footer.
 *
 * @f: QEMUFile the sections are written to
 */
void qemu_savevm_state_setup(QEMUFile *f)
{
...
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
...
/* Entries without ops or without a save_setup hook are skipped. */
if (!se->ops || !se->ops->save_setup) {
continue;
}
/* is_active is optional; when present it can veto the entry. */
if (se->ops->is_active) {
if (!se->ops->is_active(se->opaque)) {
continue;
}
}
save_section_header(f, se, QEMU_VM_SECTION_START);

ret = se->ops->save_setup(f, se->opaque);
/* The footer is written even when save_setup() failed. */
save_section_footer(f, se);
if (ret < 0) {
/* Record the error on the stream and stop visiting further entries. */
qemu_file_set_error(f, ret);
break;
}
}
...
}

可以看到,其逻辑很清晰,源端会遍历savevm_state所有设备注册的SaveStateEntry信息并调用热迁移起始阶段的回调函数,同时将相关信息以SECTION的格式发送到目的端。

这里以内存设备为例介绍一下具体的设备回调函数大致逻辑,其会调用ram_save_setup()回调函数进行初始化,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/*
 * ram_save_setup: save_setup handler for RAM (excerpt; "..." marks elided
 * code).
 *
 * Calls ram_init_all(), then writes to @f the total RAM size tagged with
 * RAM_SAVE_FLAG_MEM_SIZE followed by the id string and used_length of
 * every migratable RAMBlock. After one multifd_send_sync_main() round it
 * terminates the section with RAM_SAVE_FLAG_EOS and flushes @f.
 *
 * @f: QEMUFile where to send the data
 * @opaque: pointer to the RAMState pointer (used as a RAMState **)
 *
 * Returns -1 if ram_init_all() fails, the negative result of
 * multifd_send_sync_main() if that fails, otherwise the result of
 * qemu_fflush().
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
RAMState **rsp = opaque;
RAMBlock *block;
int ret, max_hg_page_size;

...
if (ram_init_all(rsp) != 0) {
...
return -1;
}
/* The precopy channel of the page search state writes to the main file. */
(*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;

/*
* ??? Mirrors the previous value of qemu_host_page_size,
* but is this really what was intended for the migration?
*/
max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE);

WITH_RCU_READ_LOCK_GUARD() {
/* Total size and the MEM_SIZE flag share one 64-bit word. */
qemu_put_be64(f, ram_bytes_total_with_ignored()
| RAM_SAVE_FLAG_MEM_SIZE);

/* Per block: one length byte, the id string itself, then used_length. */
RAMBLOCK_FOREACH_MIGRATABLE(block) {
qemu_put_byte(f, strlen(block->idstr));
qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
qemu_put_be64(f, block->used_length);
...
}
}

migration_ops = g_malloc0(sizeof(MigrationOps));
...
/*
 * NOTE(review): the elided lines around this assignment may choose a
 * different ram_save_target_page callback - confirm against the full
 * source.
 */
migration_ops->ram_save_target_page = ram_save_target_page_multifd;
...

/* The BQL is released for the multifd sync round and re-taken afterwards. */
bql_unlock();
ret = multifd_send_sync_main();
bql_lock();
if (ret < 0) {
return ret;
}

/*
 * The MULTIFD_FLUSH marker is only written when multifd is on and neither
 * flush-after-each-section nor mapped-ram is in use.
 */
if (migrate_multifd() && !migrate_multifd_flush_after_each_section()
&& !migrate_mapped_ram()) {
qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
}

qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
return qemu_fflush(f);
}

/*
 * multifd_send_sync_main: run one synchronization round across all multifd
 * send channels (excerpt; "..." marks elided code).
 *
 * Pages still queued in multifd_send_state->pages are pushed out first.
 * Then every channel gets pending_sync set and its sem posted, and finally
 * the caller waits, once per channel, on multifd_send_state->channels_ready
 * and on that channel's sem_sync.
 *
 * Returns 0 when multifd is disabled or the round completed, -1 if queued
 * pages could not be sent or multifd_send_should_exit() reports that the
 * send side is exiting.
 */
int multifd_send_sync_main(void)
{
int i;
bool flush_zero_copy;

if (!migrate_multifd()) {
return 0;
}
/* Flush pages that were queued but not yet handed to a channel. */
if (multifd_send_state->pages->num) {
if (!multifd_send_pages()) {
error_report("%s: multifd_send_pages fail", __func__);
return -1;
}
}
...
/* Phase 1: signal a pending sync to every channel. */
for (i = 0; i < migrate_multifd_channels(); i++) {
MultiFDSendParams *p = &multifd_send_state->params[i];

if (multifd_send_should_exit()) {
return -1;
}

trace_multifd_send_sync_main_signal(p->id);

/*
* We should be the only user so far, so not possible to be set by
* others concurrently.
*/
assert(qatomic_read(&p->pending_sync) == false);
qatomic_set(&p->pending_sync, true);
qemu_sem_post(&p->sem);
}
/* Phase 2: wait on channels_ready and on each channel's sem_sync. */
for (i = 0; i < migrate_multifd_channels(); i++) {
MultiFDSendParams *p = &multifd_send_state->params[i];

if (multifd_send_should_exit()) {
return -1;
}

qemu_sem_wait(&multifd_send_state->channels_ready);
trace_multifd_send_sync_main_wait(p->id);
qemu_sem_wait(&p->sem_sync);
...
}
trace_multifd_send_sync_main(multifd_send_state->packet_num);

return 0;
}

其中,在ram_init_all()中初始化脏页bitmap并开启kvm的脏页追踪;然后向目的端热迁移线程发送相关信息:先发送RAM_SAVE_FLAG_MEM_SIZE类型的数据同步RAMBlock信息;然后调用multifd_send_sync_main()完成源端和目的端的multifd线程的一轮同步,然后发送RAM_SAVE_FLAG_EOS,告知目的端主线程这轮内存初始/内存迭代已经结束

其中,multifd_send_sync_main()会首先设置每个MultiFDSendParams的pending_sync字段并post其sem信号量,表示当前有同步请求,让multifd线程将同步信息和缓存的脏页信息一同发送给目的端;然后依次等待multifd_send_state->channels_ready和每个通道的sem_sync信号量,表示每个multifd线程都完成了同步信息的处理

迭代

完成初始化后,接下来就是热迁移的核心逻辑,即不断迭代发送设备/内存脏数据,直到剩余脏数据小于设定的阈值,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/*
 * migration_thread: main loop of the migration thread (excerpt; "..." marks
 * elided code, including the setup before the loop and the return).
 *
 * While migration_is_active() holds, each pass optionally runs one
 * migration_iteration_run() step, checks for errors, and then applies the
 * rate limit.
 */
static void *migration_thread(void *opaque)
{
...
while (migration_is_active()) {
/* Iterate when flagged urgent or when the rate limit is not exceeded. */
if (urgent || !migration_rate_exceeded(s->to_dst_file)) {
MigIterateState iter_state = migration_iteration_run(s);
if (iter_state == MIG_ITERATE_SKIP) {
continue;
} else if (iter_state == MIG_ITERATE_BREAK) {
break;
}
}

/*
* Try to detect any kind of failures, and see whether we
* should stop the migration now.
*/
thr_error = migration_detect_error(s);
if (thr_error == MIG_THR_ERR_FATAL) {
/* Stop migration */
break;
} else if (thr_error == MIG_THR_ERR_RECOVERED) {
/*
* Just recovered from a e.g. network failure, reset all
* the local variables. This is important to avoid
* breaking transferred_bytes and bandwidth calculation
*/
update_iteration_initial_status(s);
}

/* The result decides whether the next pass bypasses the rate check. */
urgent = migration_rate_limit();
}
}

其逻辑很清晰,一直循环调用migration_iteration_run()直到其返回值为MIG_ITERATE_BREAK,表示此时剩余脏数据小于设定阈值。而migration_iteration_run()函数的逻辑如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
/*
 * migration_iteration_run: perform one step of the migration loop
 * (excerpt; "..." marks elided code).
 *
 * Queries the pending data size and either completes the migration, when
 * the pending size is zero or below s->threshold_size and switchover is
 * allowed, or sends one more round of data.
 *
 * @s: current migration state
 *
 * Returns MIG_ITERATE_BREAK after calling migration_completion(),
 * MIG_ITERATE_RESUME after another iteration step.
 */
static MigIterateState migration_iteration_run(MigrationState *s)
{
uint64_t must_precopy, can_postcopy, pending_size;
Error *local_err = NULL;
bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
bool can_switchover = migration_can_switchover(s);

...
qemu_savevm_state_pending_exact(&must_precopy, &can_postcopy);
/* The threshold check below uses the sum of both categories. */
pending_size = must_precopy + can_postcopy;
...
if ((!pending_size || pending_size < s->threshold_size) && can_switchover) {
trace_migration_thread_low_pending(pending_size);
migration_completion(s);
return MIG_ITERATE_BREAK;
}
...
/* Just another iteration step */
qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
return MIG_ITERATE_RESUME;
}

/*
 * Collect the exact amount of pending data from every registered handler.
 *
 * Both outputs are reset to zero and then passed to the state_pending_exact
 * callback of each entry that has one and is not reported inactive.
 *
 * @must_precopy: out, data that must be migrated before the target starts
 * @can_postcopy: out, data that can be migrated after the target starts
 */
void qemu_savevm_state_pending_exact(uint64_t *must_precopy,
                                     uint64_t *can_postcopy)
{
    SaveStateEntry *handler;

    *can_postcopy = 0;
    *must_precopy = 0;

    QTAILQ_FOREACH(handler, &savevm_state.handlers, entry) {
        bool wanted = handler->ops && handler->ops->state_pending_exact;

        /* The optional is_active hook can exclude an entry. */
        if (wanted && handler->ops->is_active) {
            wanted = handler->ops->is_active(handler->opaque);
        }
        if (wanted) {
            handler->ops->state_pending_exact(handler->opaque,
                                              must_precopy, can_postcopy);
        }
    }
}

/*
 * Run one save_live_iterate pass over the registered handlers.
 *
 * Each eligible entry is wrapped in a QEMU_VM_SECTION_PART section.
 *
 * @f: QEMUFile the sections are written to
 * @postcopy: true when called during the postcopy phase
 *
 * Returns a negative value if a handler failed, 0 if the rate limit was hit
 * or at least one handler returned 0, and 1 if every visited handler
 * returned a positive value.
 */
int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
{
    SaveStateEntry *se;
    bool every_device_done = true;

    trace_savevm_state_iterate();
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        int rc;

        if (!se->ops || !se->ops->save_live_iterate) {
            continue;
        }
        if (se->ops->is_active && !se->ops->is_active(se->opaque)) {
            continue;
        }
        if (se->ops->is_active_iterate &&
            !se->ops->is_active_iterate(se->opaque)) {
            continue;
        }
        /*
         * During postcopy only handlers that report postcopy support are
         * iterated; the others already saved their state in the _complete
         * call and could get confused by a later iterate call.
         */
        if (postcopy) {
            if (!se->ops->has_postcopy ||
                !se->ops->has_postcopy(se->opaque)) {
                continue;
            }
        }
        /* Out of bandwidth budget: stop before opening a new section. */
        if (migration_rate_exceeded(f)) {
            return 0;
        }

        trace_savevm_section_start(se->idstr, se->section_id);
        save_section_header(f, se, QEMU_VM_SECTION_PART);
        rc = se->ops->save_live_iterate(f, se->opaque);
        trace_savevm_section_end(se->idstr, se->section_id, rc);
        save_section_footer(f, se);

        if (rc < 0) {
            error_report("failed to save SaveStateEntry with id(name): "
                         "%d(%s): %d",
                         se->section_id, se->idstr, rc);
            qemu_file_set_error(f, rc);
            return rc;
        }
        if (rc == 0) {
            every_device_done = false;
        }
    }

    return every_device_done;
}

/*
 * Write the header that opens a device section
 * (QEMU_VM_SECTION START/END/PART/FULL).
 *
 * Every header holds the section type byte and the 32-bit section id.
 * START and FULL headers additionally hold the id string (one length byte
 * plus the bytes), the instance id and the version id.
 */
static void save_section_header(QEMUFile *f, SaveStateEntry *se,
                                uint8_t section_type)
{
    bool carries_identity = section_type == QEMU_VM_SECTION_FULL ||
                            section_type == QEMU_VM_SECTION_START;

    qemu_put_byte(f, section_type);
    qemu_put_be32(f, se->section_id);

    if (!carries_identity) {
        return;
    }

    /* ID string, preceded by its length. */
    size_t idstr_len = strlen(se->idstr);

    qemu_put_byte(f, idstr_len);
    qemu_put_buffer(f, (uint8_t *)se->idstr, idstr_len);

    qemu_put_be32(f, se->instance_id);
    qemu_put_be32(f, se->version_id);
}

其在qemu_savevm_state_pending_exact()中调用struct SaveStateEntry的state_pending_exact回调函数(即se->ops->state_pending_exact)同步设备的脏数据信息,并在qemu_savevm_state_iterate()中调用save_section_header()通知目的端下面要迭代的设备信息,再调用struct SaveStateEntry的save_live_iterate回调函数(即se->ops->save_live_iterate)将脏数据发送到目的端

这里仍然以内存设备为例介绍一下具体的设备回调函数的大致逻辑。其会调用ram_state_pending_exact()同步设备的脏数据信息,逻辑如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
/*
 * Excerpt of the RAM handler table ("..." marks elided entries): the exact
 * pending-size query for RAM is served by ram_state_pending_exact().
 */
static SaveVMHandlers savevm_ram_handlers = {
...
.state_pending_exact = ram_state_pending_exact,
...
};

static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
uint64_t *can_postcopy)
{
RAMState **temp = opaque;
RAMState *rs = *temp;
uint64_t remaining_size;

if (!migration_in_postcopy()) {
bql_lock();
WITH_RCU_READ_LOCK_GUARD() {
migration_bitmap_sync_precopy(rs, false);
}
bql_unlock();
}

remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

if (migrate_postcopy_ram()) {
/* We can do postcopy, and all the data is postcopiable */
*can_postcopy += remaining_size;
} else {
*must_precopy += remaining_size;
}
}

/*
 * migration_bitmap_sync_precopy: precopy wrapper around
 * migration_bitmap_sync() (excerpt; "..." marks elided code before and
 * after the call).
 *
 * @rs: RAM migration state
 * @last_stage: forwarded unchanged to migration_bitmap_sync()
 */
static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage)
{
...
migration_bitmap_sync(rs, last_stage);
...
}

/*
 * migration_bitmap_sync: refresh the migration dirty bitmaps (excerpt;
 * "..." marks elided code, including the declaration of block).
 *
 * First triggers a global dirty log sync, then, holding rs->bitmap_mutex
 * and the RCU read lock, syncs the dirty bitmap of every RAMBlock that is
 * not ignored and records ram_bytes_remaining() in
 * mig_stats.dirty_bytes_last_sync.
 *
 * @rs: RAM migration state
 * @last_stage: forwarded to memory_global_dirty_log_sync()
 */
static void migration_bitmap_sync(RAMState *rs, bool last_stage)
{
...
memory_global_dirty_log_sync(last_stage);

qemu_mutex_lock(&rs->bitmap_mutex);
WITH_RCU_READ_LOCK_GUARD() {
RAMBLOCK_FOREACH_NOT_IGNORED(block) {
ramblock_sync_dirty_bitmap(rs, block);
}
stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
}
qemu_mutex_unlock(&rs->bitmap_mutex);
...
}

//#0 kvm_physical_sync_dirty_bitmap (kml=0x555556f3c100, section=0x7fffcf5fd8e0)
// at ../accel/kvm/kvm-all.c:832
//#1 0x0000555555da419e in kvm_log_sync (listener=0x555556f3c100,
// section=0x7fffcf5fd8e0) at ../accel/kvm/kvm-all.c:1591
//#2 0x0000555555d42b54 in memory_region_sync_dirty_bitmap (mr=0x0,
// last_stage=false) at ../system/memory.c:2295
//#3 0x0000555555d44c85 in memory_global_dirty_log_sync (last_stage=false)
// at ../system/memory.c:2901
//#4 0x0000555555d5913f in migration_bitmap_sync (rs=0x7fffb80043e0,
// last_stage=false) at ../migration/ram.c:1063
//#5 0x0000555555d59385 in migration_bitmap_sync_precopy (rs=0x7fffb80043e0,
// last_stage=false) at ../migration/ram.c:1111
//#6 0x0000555555d5c86c in ram_init_bitmaps (rs=0x7fffb80043e0)
// at ../migration/ram.c:2864
//#7 0x0000555555d5c915 in ram_init_all (rsp=0x555556ea4c40 <ram_state>)
// at ../migration/ram.c:2887
//#8 0x0000555555d5cf34 in ram_save_setup (f=0x55555714d4d0,
// opaque=0x555556ea4c40 <ram_state>) at ../migration/ram.c:3082
//#9 0x0000555555b81e25 in qemu_savevm_state_setup (f=0x55555714d4d0)
// at ../migration/savevm.c:1346
//#10 0x0000555555b6a0d7 in migration_thread (opaque=0x555556f34ff0)
// at ../migration/migration.c:3477
//#11 0x0000555555f93cfe in qemu_thread_start (args=0x555557ab8510)
// at ../util/qemu-thread-posix.c:541
//#12 0x00007ffff7be5b7b in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#13 0x00007ffff7c637b8 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
void memory_global_dirty_log_sync(bool last_stage)
{
memory_region_sync_dirty_bitmap(NULL, last_stage);
}
/*
 * memory_region_sync_dirty_bitmap: invoke the log_sync callback of the
 * memory listeners (excerpt; "..." marks elided code).
 *
 * For every listener with a log_sync callback, the FlatView of the
 * listener's address space is walked and log_sync() is called for each
 * flat range that has a non-zero dirty_log_mask and belongs to @mr.
 *
 * @mr: region to sync, or NULL to match every region
 * @last_stage: not used by the visible part of this excerpt
 */
static void memory_region_sync_dirty_bitmap(MemoryRegion *mr, bool last_stage)
{
MemoryListener *listener;
AddressSpace *as;
FlatView *view;
FlatRange *fr;

/* If the same address space has multiple log_sync listeners, we
* visit that address space's FlatView multiple times. But because
* log_sync listeners are rare, it's still cheaper than walking each
* address space once.
*/
QTAILQ_FOREACH(listener, &memory_listeners, link) {
if (listener->log_sync) {
as = listener->address_space;
view = address_space_get_flatview(as);
FOR_EACH_FLAT_RANGE(fr, view) {
/* Only ranges with dirty logging enabled that match the requested region. */
if (fr->dirty_log_mask && (!mr || fr->mr == mr)) {
MemoryRegionSection mrs = section_from_flat_range(fr, view);
listener->log_sync(listener, &mrs);
}
}
/* Drop the reference taken by address_space_get_flatview(). */
flatview_unref(view);
trace_memory_region_sync_dirty(mr ? mr->name : "(all)", listener->name, 0);
}
...
}
}
/*
 * log_sync callback of the KVM memory listener: syncs the dirty bitmap of
 * one memory section while holding the KVM slots lock.
 */
static void kvm_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    KVMMemoryListener *owner;

    /* Recover the enclosing KVM listener from its embedded MemoryListener. */
    owner = container_of(listener, KVMMemoryListener, listener);

    kvm_slots_lock();
    kvm_physical_sync_dirty_bitmap(owner, section);
    kvm_slots_unlock();
}
/*
 * Sync the dirty bitmap for one memory section.
 *
 * The aligned section is processed in chunks of at most kvm_max_slot_size
 * bytes. For each chunk the matching slot is looked up; when fetching its
 * dirty log succeeds, the slot's dirty pages are synced. Processing stops
 * at the first chunk that has no matching slot.
 */
static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
                                           MemoryRegionSection *section)
{
    KVMState *s = kvm_state;
    hwaddr start_addr;
    hwaddr remaining = kvm_align_section(section, &start_addr);

    while (remaining) {
        hwaddr chunk = MIN(kvm_max_slot_size, remaining);
        KVMSlot *slot = kvm_lookup_matching_slot(kml, start_addr, chunk);

        if (!slot) {
            /* We don't have a slot if we want to trap every access. */
            return;
        }
        if (kvm_slot_get_dirty_log(s, slot)) {
            kvm_slot_sync_dirty_pages(slot);
        }

        remaining -= chunk;
        start_addr += chunk;
    }
}

//#0 cpu_physical_memory_sync_dirty_bitmap (rb=0x555556f2b4c0, start=0,
// length=536870912)
// at /home/hawk/Desktop/mqemu/qemu/include/exec/ram_addr.h:480
//#1 0x0000555555d58bea in ramblock_sync_dirty_bitmap (rs=0x7fffc4000c80,
// rb=0x555556f2b4c0) at ../migration/ram.c:918
//#2 0x0000555555d591b0 in migration_bitmap_sync (rs=0x7fffc4000c80,
// last_stage=false) at ../migration/ram.c:1068
//#3 0x0000555555d59385 in migration_bitmap_sync_precopy (rs=0x7fffc4000c80,
// last_stage=false) at ../migration/ram.c:1111
//#4 0x0000555555d5c86c in ram_init_bitmaps (rs=0x7fffc4000c80)
// at ../migration/ram.c:2864
//#5 0x0000555555d5c915 in ram_init_all (rsp=0x555556ea4c40 <ram_state>)
// at ../migration/ram.c:2887
//#6 0x0000555555d5cf34 in ram_save_setup (f=0x55555714d4d0,
// opaque=0x555556ea4c40 <ram_state>) at ../migration/ram.c:3082
//#7 0x0000555555b81e25 in qemu_savevm_state_setup (f=0x55555714d4d0)
// at ../migration/savevm.c:1346
//#8 0x0000555555b6a0d7 in migration_thread (opaque=0x555556f34ff0)
// at ../migration/migration.c:3477
//#9 0x0000555555f93cfe in qemu_thread_start (args=0x555557ab8510)
// at ../util/qemu-thread-posix.c:541
//#10 0x00007ffff7be5b7b in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#11 0x00007ffff7c637b8 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
/*
 * Sync the dirty bitmap of one RAMBlock over its whole used length and add
 * the number of newly dirtied pages to both the overall and the per-period
 * counters.
 */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t freshly_dirtied;

    freshly_dirtied = cpu_physical_memory_sync_dirty_bitmap(rb, 0,
                                                            rb->used_length);

    rs->num_dirty_pages_period += freshly_dirtied;
    rs->migration_dirty_pages += freshly_dirtied;
}
/*
 * cpu_physical_memory_sync_dirty_bitmap: move dirty bits from the global
 * DIRTY_MEMORY_MIGRATION bitmap into rb->bmap (excerpt; "..." marks elided
 * code).
 *
 * @rb: RAMBlock whose bmap receives the dirty bits
 * @start: start of the range, as an offset inside @rb
 * @length: length of the range
 *
 * Returns the number of pages that became dirty in rb->bmap, i.e. bits
 * that were clear there before this call.
 */
static inline
uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
ram_addr_t start,
ram_addr_t length)
{
ram_addr_t addr;
unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS);
uint64_t num_dirty = 0;
unsigned long *dest = rb->bmap;

/* start address and length is aligned at the start of a word? */
if (((word * BITS_PER_LONG) << TARGET_PAGE_BITS) ==
(start + rb->offset) &&
!(length & ((BITS_PER_LONG << TARGET_PAGE_BITS) - 1))) {
/* Fast path: transfer one unsigned long worth of page bits at a time. */
int k;
int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS);
unsigned long * const *src;
/* idx selects the source block, offset the word inside that block. */
unsigned long idx = (word * BITS_PER_LONG) / DIRTY_MEMORY_BLOCK_SIZE;
unsigned long offset = BIT_WORD((word * BITS_PER_LONG) %
DIRTY_MEMORY_BLOCK_SIZE);
/* page is the first destination word, relative to the start of rb->bmap. */
unsigned long page = BIT_WORD(start >> TARGET_PAGE_BITS);

src = qatomic_rcu_read(
&ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION])->blocks;

for (k = page; k < page + nr; k++) {
if (src[idx][offset]) {
/* Atomically fetch the source word and clear it in the same step. */
unsigned long bits = qatomic_xchg(&src[idx][offset], 0);
unsigned long new_dirty;
/* Count only the bits that were not already set in the destination. */
new_dirty = ~dest[k];
dest[k] |= bits;
new_dirty &= bits;
num_dirty += ctpopl(new_dirty);
}

/* Move on to the next source block once the current one is used up. */
if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
offset = 0;
idx++;
}
}
if (num_dirty) {
cpu_physical_memory_dirty_bits_cleared(start, length);
}
...
} else {
/* Slow path: test and clear the range one target page at a time. */
ram_addr_t offset = rb->offset;

for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) {
if (cpu_physical_memory_test_and_clear_dirty(
start + addr + offset,
TARGET_PAGE_SIZE,
DIRTY_MEMORY_MIGRATION)) {
long k = (start + addr) >> TARGET_PAGE_BITS;
/* A page already marked in the destination is not counted again. */
if (!test_and_set_bit(k, dest)) {
num_dirty++;
}
}
}
}

return num_dirty;
}

可以看到,其首先会在kvm_log_sync()中通过KVM_GET_DIRTY_LOG的ioctl从kvm同步追踪的脏页到ram_list.dirty_memory中,然后在ramblock_sync_dirty_bitmap()中从ram_list.dirty_memory同步结果到RAMBlock->bmap

然后调用ram_save_iterate()来遍历RAMBlock->bmap的脏页bitmap并发送,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
/*
 * Excerpt of the RAM handler table ("..." marks elided entries): the live
 * iteration step for RAM is served by ram_save_iterate().
 */
static SaveVMHandlers savevm_ram_handlers = {
...
.save_live_iterate = ram_save_iterate,
...
};

/*
 * ram_save_iterate: save_live_iterate handler for RAM (excerpt; "..." marks
 * elided code, including the jumps to the out label).
 *
 * Holding rs->bitmap_mutex and the RCU read lock, repeatedly calls
 * ram_find_and_save_block() until the rate limit is exceeded with no
 * postcopy request pending, the file reports an error, no page is left, a
 * page could not be saved, or more than MAX_WAIT milliseconds have passed.
 * Afterwards the section is closed with RAM_SAVE_FLAG_EOS and flushed.
 *
 * @f: QEMUFile where to send the data
 * @opaque: pointer to the RAMState pointer
 *
 * Returns a negative value if ret ends up negative; otherwise done, which
 * is 1 when ram_find_and_save_block() reported no more pages and 0
 * otherwise.
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
RAMState **temp = opaque;
RAMState *rs = *temp;
int ret = 0;
int i;
int64_t t0;
int done = 0;
...
/*
* We'll take this lock a little bit long, but it's okay for two reasons.
* Firstly, the only possible other thread to take it is who calls
* qemu_guest_free_page_hint(), which should be rare; secondly, see
* MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
* guarantees that we'll at least released it in a regular basis.
*/
WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) {
WITH_RCU_READ_LOCK_GUARD() {
...
t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
i = 0;
/* A pending postcopy request keeps the loop going past the rate limit. */
while ((ret = migration_rate_exceeded(f)) == 0 ||
postcopy_has_request(rs)) {
int pages;

if (qemu_file_get_error(f)) {
break;
}

pages = ram_find_and_save_block(rs);
/* no more pages to sent */
if (pages == 0) {
done = 1;
break;
}

/* A negative page count is an error code; record it on the file. */
if (pages < 0) {
qemu_file_set_error(f, pages);
break;
}

rs->target_page_count += pages;
...
/*
* we want to check in the 1st loop, just in case it was the 1st
* time and we had to sync the dirty bitmap.
* qemu_clock_get_ns() is a bit expensive, so we only check each
* some iterations
*/
if ((i & 63) == 0) {
/* Elapsed time since t0, converted from nanoseconds to milliseconds. */
uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
1000000;
if (t1 > MAX_WAIT) {
trace_ram_save_iterate_big_wait(t1, i);
break;
}
}
i++;
}
}
}
...

out:
if (ret >= 0
&& migration_is_setup_or_active()) {
/* A multifd sync per section only happens in flush-after-each-section mode. */
if (migrate_multifd() && migrate_multifd_flush_after_each_section() &&
!migrate_mapped_ram()) {
ret = multifd_send_sync_main();
if (ret < 0) {
return ret;
}
}

qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
/* Account for the 8 bytes of the EOS marker just written. */
ram_transferred_add(8);
ret = qemu_fflush(f);
}
if (ret < 0) {
return ret;
}

return done;
}

/*
 * ram_find_and_save_block: find the next page to send and save it
 * (excerpt; "..." marks elided code).
 *
 * The search resumes from rs->last_seen_block / rs->last_page. Queued pages
 * reported by get_queued_page() take priority; otherwise
 * find_dirty_block() looks for the next dirty page. The search position is
 * written back to @rs before returning.
 *
 * @rs: RAM migration state
 *
 * Returns the non-zero result of ram_save_host_page(), 0 if
 * find_dirty_block() reported PAGE_ALL_CLEAN before anything was saved, or
 * the negative result of find_dirty_block().
 */
static int ram_find_and_save_block(RAMState *rs)
{
PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
int pages = 0;
...
/*
* Always keep last_seen_block/last_page valid during this procedure,
* because find_dirty_block() relies on these values (e.g., we compare
* last_seen_block with pss.block to see whether we searched all the
* ramblocks) to detect the completion of migration. Having NULL value
* of last_seen_block can conditionally cause below loop to run forever.
*/
if (!rs->last_seen_block) {
rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
rs->last_page = 0;
}

pss_init(pss, rs->last_seen_block, rs->last_page);

while (true){
if (!get_queued_page(rs, pss)) {
/* priority queue empty, so just search for something dirty */
int res = find_dirty_block(rs, pss);
if (res != PAGE_DIRTY_FOUND) {
if (res == PAGE_ALL_CLEAN) {
break;
} else if (res == PAGE_TRY_AGAIN) {
continue;
} else if (res < 0) {
pages = res;
break;
}
}
}
pages = ram_save_host_page(rs, pss);
/* Stop on the first non-zero result, whether a page count or an error. */
if (pages) {
break;
}
}

/* Remember where the search stopped so that the next call resumes here. */
rs->last_seen_block = pss->block;
rs->last_page = pss->page;

return pages;
}

//#0 find_dirty_block (rs=0x7fffb80043e0, pss=0x7fffb80043e0) at ../migration/ram.c:1349
//#1 0x0000555555d5b7ab in ram_find_and_save_block (rs=0x7fffb80043e0) at ../migration/ram.c:2353
//#2 0x0000555555d5d4d1 in ram_save_iterate (f=0x55555714d4d0, opaque=0x555556ea4c40 <ram_state>) at ../migration/ram.c:3240
//#3 0x0000555555b82133 in qemu_savevm_state_iterate (f=0x55555714d4d0, postcopy=false) at ../migration/savevm.c:1430
//#4 0x0000555555b69ada in migration_iteration_run (s=0x555556f34ff0) at ../migration/migration.c:3252
//#5 0x0000555555b6a141 in migration_thread (opaque=0x555556f34ff0) at ../migration/migration.c:3489
//#6 0x0000555555f93cfe in qemu_thread_start (args=0x555557ab8510) at ../util/qemu-thread-posix.c:541
//#7 0x00007ffff7be5b7b in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#8 0x00007ffff7c637b8 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
/*
 * find_dirty_block: advance @pss to the next dirty page (excerpt; "..."
 * marks elided code).
 *
 * @rs: RAM migration state; last_seen_block/last_page mark where the
 *      current search round started
 * @pss: search cursor, updated in place
 *
 * Returns PAGE_DIRTY_FOUND when pss->page lies inside pss->block,
 * PAGE_TRY_AGAIN after moving on to the next block, and PAGE_ALL_CLEAN when
 * a complete round ended back at the starting position.
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
/* Update pss->page for the next dirty bit in ramblock */
pss_find_next_dirty(pss);

if (pss->complete_round && pss->block == rs->last_seen_block &&
pss->page >= rs->last_page) {
/*
* We've been once around the RAM and haven't found anything.
* Give up.
*/
return PAGE_ALL_CLEAN;
}
/* A page offset beyond the end of the block means nothing was found in it. */
if (!offset_in_ramblock(pss->block,
((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
/* Didn't find anything in this RAM Block */
pss->page = 0;
pss->block = QLIST_NEXT_RCU(pss->block, next);
if (!pss->block) {
...
/* Hit the end of the list */
pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
/* Flag that we've looped */
pss->complete_round = true;
...
}
/* Didn't find anything this time, but try again on the new block */
return PAGE_TRY_AGAIN;
} else {
/* We've found something */
return PAGE_DIRTY_FOUND;
}
}

//#0 ram_save_multifd_page (block=0x555556f2b4c0, offset=0) at ../migration/ram.c:1288
//#1 0x0000555555d5b22e in ram_save_target_page_multifd (rs=0x7fffb80043e0, pss=0x7fffb80043e0) at ../migration/ram.c:2123
//#2 0x0000555555d5b60d in ram_save_host_page (rs=0x7fffb80043e0, pss=0x7fffb80043e0) at ../migration/ram.c:2281
//#3 0x0000555555d5b7e1 in ram_find_and_save_block (rs=0x7fffb80043e0) at ../migration/ram.c:2365
//#4 0x0000555555d5d4d1 in ram_save_iterate (f=0x55555714d4d0, opaque=0x555556ea4c40 <ram_state>) at ../migration/ram.c:3240
//#5 0x0000555555b82133 in qemu_savevm_state_iterate (f=0x55555714d4d0, postcopy=false) at ../migration/savevm.c:1430
//#6 0x0000555555b69ada in migration_iteration_run (s=0x555556f34ff0) at ../migration/migration.c:3252
//#7 0x0000555555b6a141 in migration_thread (opaque=0x555556f34ff0) at ../migration/migration.c:3489
//#8 0x0000555555f93cfe in qemu_thread_start (args=0x555557ab8510) at ../util/qemu-thread-posix.c:541
//#9 0x00007ffff7be5b7b in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#10 0x00007ffff7c637b8 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
/*
 * Queue one page for sending over multifd.
 *
 * Returns 1 (one page handled) when multifd_queue_page() accepts the page,
 * -1 when it does not.
 */
static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset)
{
    return multifd_queue_page(block, offset) ? 1 : -1;
}

可以看到,其会在find_dirty_block()从上一次遍历的位置继续遍历所有RAMBlock->bmap位图,并将被标记的脏页通过ram_save_multifd_page()分派给multifd线程,等multifd线程的待发送队列满或后续调用multifd_send_sync_main()时将发送队列的所有脏页真正发送到目的端的multifd接收线程中。最后完成这轮内存迭代后,会通过主线程通道发送RAM_SAVE_FLAG_EOS告知目的端结束这轮内存设备的迭代。

完成

当最终迭代设备/内存脏数据小于设定的阈值时,其会执行migration_completion()完成最终的数据同步,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
/*
 * migration_iteration_run: completion path (excerpt; "..." marks elided
 * code, including the local variable declarations).
 *
 * The pending size is first taken from the estimate query and only
 * recomputed with the exact query once the estimate drops below
 * s->threshold_size. When the pending size is zero or below the threshold
 * and switchover is allowed, the migration is completed.
 */
static MigIterateState migration_iteration_run(MigrationState *s)
{
qemu_savevm_state_pending_estimate(&must_precopy, &can_postcopy);
pending_size = must_precopy + can_postcopy;
...

/* The exact query is only made when the estimate is below the threshold. */
if (pending_size < s->threshold_size) {
qemu_savevm_state_pending_exact(&must_precopy, &can_postcopy);
pending_size = must_precopy + can_postcopy;
...
}

if ((!pending_size || pending_size < s->threshold_size) && can_switchover) {
trace_migration_thread_low_pending(pending_size);
migration_completion(s);
return MIG_ITERATE_BREAK;
}
...
}

/*
 * migration_completion: finish the migration (excerpt; "..." marks elided
 * code, including any handling of ret).
 *
 * The visible part captures the current state and runs the precopy
 * completion step.
 *
 * @s: current migration state
 */
static void migration_completion(MigrationState *s)
{
int ret = 0;
int current_active_state = s->state;
Error *local_err = NULL;

...
ret = migration_completion_precopy(s, &current_active_state);
...
return;
}

/*
 * migration_completion_precopy: completion step for precopy (excerpt;
 * "..." marks elided code).
 *
 * Stops the VM with RUN_STATE_FINISH_MIGRATE and then saves the complete
 * device state to s->to_dst_file.
 *
 * @s: current migration state
 * @current_active_state: not used by the visible part of this excerpt
 *
 * Returns the negative result of migration_stop_vm() if stopping fails,
 * otherwise the value ret holds when out_unlock is reached.
 */
static int migration_completion_precopy(MigrationState *s,
int *current_active_state)
{
int ret;
ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
if (ret < 0) {
goto out_unlock;
}
...
/* iterable_only is false here, so the non-iterable state is saved too. */
ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
s->block_inactive);
...
out_unlock:
...
return ret;
}

//#0 qemu_savevm_state_complete_precopy (f=0x55555714d4d0, iterable_only=false, inactivate_disks=true) at ../migration/savevm.c:1606
//#1 0x0000555555b68a9a in migration_completion_precopy (s=0x555556f34ff0, current_active_state=0x7fffcf5fda78) at ../migration/migration.c:2749
//#2 0x0000555555b68c61 in migration_completion (s=0x555556f34ff0) at ../migration/migration.c:2813
//#3 0x0000555555b69a3f in migration_iteration_run (s=0x555556f34ff0) at ../migration/migration.c:3237
//#4 0x0000555555b6a141 in migration_thread (opaque=0x555556f34ff0) at ../migration/migration.c:3489
//#5 0x0000555555f93cfe in qemu_thread_start (args=0x555557ab8510) at ../util/qemu-thread-posix.c:541
//#6 0x00007ffff7be5b7b in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#7 0x00007ffff7c637b8 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
/*
 * Save the final device state at the end of precopy.
 *
 * The iterable handlers are completed unless we are in postcopy without
 * @iterable_only set. The non-iterable state is saved unless
 * @iterable_only is set.
 *
 * @f: QEMUFile the state is written to
 * @iterable_only: skip the non-iterable state when true
 * @inactivate_disks: forwarded to the non-iterable save step
 *
 * Returns the non-zero result of a failing save step, otherwise the result
 * of qemu_fflush().
 */
int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
                                       bool inactivate_disks)
{
    Error *local_err = NULL;
    bool in_postcopy = migration_in_postcopy();
    int rc;

    /* A failing notifier is reported, but it does not abort completion. */
    if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
        error_report_err(local_err);
    }

    trace_savevm_state_complete_precopy();

    cpu_synchronize_all_states();

    if (iterable_only || !in_postcopy) {
        rc = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy);
        if (rc) {
            return rc;
        }
    }

    if (!iterable_only) {
        rc = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy,
                                                             inactivate_disks);
        if (rc) {
            return rc;
        }
    }

    return qemu_fflush(f);
}

/*
 * Complete every iterable handler: each eligible entry gets a
 * QEMU_VM_SECTION_END section produced by its save_live_complete_precopy
 * callback, and the time spent per entry is traced.
 *
 * @f: QEMUFile the sections are written to
 * @in_postcopy: when true, entries that report postcopy support are skipped
 *
 * Returns 0 on success, -1 as soon as one callback returns a negative
 * value (that value is recorded on @f).
 */
static
int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
{
    SaveStateEntry *se;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        int64_t began_us, finished_us;
        int rc;

        if (!se->ops) {
            continue;
        }
        if (in_postcopy && se->ops->has_postcopy &&
            se->ops->has_postcopy(se->opaque)) {
            continue;
        }
        if (!se->ops->save_live_complete_precopy) {
            continue;
        }
        if (se->ops->is_active && !se->ops->is_active(se->opaque)) {
            continue;
        }

        began_us = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
        trace_savevm_section_start(se->idstr, se->section_id);

        save_section_header(f, se, QEMU_VM_SECTION_END);
        rc = se->ops->save_live_complete_precopy(f, se->opaque);
        trace_savevm_section_end(se->idstr, se->section_id, rc);
        save_section_footer(f, se);

        if (rc < 0) {
            qemu_file_set_error(f, rc);
            return -1;
        }

        finished_us = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
        trace_vmstate_downtime_save("iterable", se->idstr, se->instance_id,
                                    finished_us - began_us);
    }

    trace_vmstate_downtime_checkpoint("src-iterable-saved");

    return 0;
}

/*
 * qemu_savevm_state_complete_precopy_non_iterable: save the state of every
 * registered entry through vmstate_save() (excerpt; "..." marks elided
 * code).
 *
 * @f: QEMUFile the state is written to
 * @in_postcopy: when false, the stream is terminated with QEMU_VM_EOF
 * @inactivate_disks: not used by the visible part of this excerpt
 *
 * Returns 0 on success, or the non-zero result of the first failing
 * vmstate_save() (also recorded on @f).
 */
int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
bool in_postcopy,
bool inactivate_disks)
{
MigrationState *ms = migrate_get_current();
int64_t start_ts_each, end_ts_each;
JSONWriter *vmdesc = ms->vmdesc;
int vmdesc_len;
SaveStateEntry *se;
int ret;

QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (se->vmsd && se->vmsd->early_setup) {
/* Already saved during qemu_savevm_state_setup(). */
continue;
}

start_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME);

ret = vmstate_save(f, se, vmdesc);
if (ret) {
qemu_file_set_error(f, ret);
return ret;
}

/* Trace how long saving this entry took, in microseconds. */
end_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
trace_vmstate_downtime_save("non-iterable", se->idstr, se->instance_id,
end_ts_each - start_ts_each);
}
...
if (!in_postcopy) {
/* Postcopy stream will still be going */
qemu_put_byte(f, QEMU_VM_EOF);
}
...

trace_vmstate_downtime_checkpoint("src-non-iterable-saved");

return 0;
}

可以看到,其首先调用migration_stop_vm()暂停子机运行避免新增脏数据,然后调用struct SaveStateEntry的相关回调函数完成存量脏数据的传输:对于迭代类型的可迁移对象(数据量巨大且持续变化),则调用save_live_complete_precopy函数;对于非迭代类型的可迁移对象(数据量小),则调用vmstate_save()序列化需要迁移的数据。

目的端

端口连接回调

前面prepare章节介绍了,每次目的端监听端口有新连接时会调用socket_accept_incoming_migration()。其最终会调用到migration_ioc_process_incoming()处理每个连接,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
//#0  migration_ioc_process_incoming (ioc=0x555557972b60, errp=0x7fffffffe298) at ../migration/migration.c:898
//#1 0x0000555555b57799 in migration_channel_process_incoming (ioc=0x555557972b60) at ../migration/channel.c:45
//#2 0x0000555555b86f2d in socket_accept_incoming_migration (listener=0x55555790a7b0, cioc=0x555557972b60, opaque=0x0) at ../migration/socket.c:150
//#3 0x0000555555ddd2d5 in qio_net_listener_channel_func (ioc=0x5555572ee1a0, condition=G_IO_IN, opaque=0x55555790a7b0) at ../io/net-listener.c:54
//#4 0x0000555555dd6a55 in qio_channel_fd_source_dispatch (source=0x55555718a0a0, callback=0x555555ddd255 <qio_net_listener_channel_func>, user_data=0x55555790a7b0) at ../io/channel-watch.c:84
//#5 0x00007ffff7da5385 in ?? () from target:/lib/x86_64-linux-gnu/libglib-2.0.so.0
//#6 0x00007ffff7da7c78 in g_main_context_dispatch () from target:/lib/x86_64-linux-gnu/libglib-2.0.so.0
//#7 0x0000555555faf750 in glib_pollfds_poll () at ../util/main-loop.c:287
//#8 0x0000555555faf7db in os_host_main_loop_wait (timeout=713438000) at ../util/main-loop.c:310
//#9 0x0000555555faf907 in main_loop_wait (nonblocking=0) at ../util/main-loop.c:589
//#10 0x0000555555b3fbac in qemu_main_loop () at ../system/runstate.c:783
//#11 0x0000555555db435e in qemu_default_main () at ../system/main.c:37
//#12 0x0000555555db439f in main (argc=55, argv=0x7fffffffe638) at ../system/main.c:48
/*
 * migration_ioc_process_incoming: classify and set up a newly accepted
 * incoming migration channel (excerpt; "..." marks elided code).
 *
 * When multifd is in use without mapped-ram or postcopy RAM and the channel
 * supports peeking, the first four bytes are peeked to decide whether this
 * is the main channel. The main channel is wrapped in a QEMUFile and
 * registered; any other channel is handed to multifd. Once
 * migration_should_start_incoming() agrees, incoming processing starts.
 *
 * @ioc: the newly connected channel
 * @errp: set by the peek or by multifd_recv_setup() on failure
 */
void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
{
MigrationIncomingState *mis = migration_incoming_get_current();
Error *local_err = NULL;
QEMUFile *f;
bool default_channel = true;
uint32_t channel_magic = 0;
int ret = 0;

if (migrate_multifd() && !migrate_mapped_ram() &&
!migrate_postcopy_ram() &&
qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
/*
* With multiple channels, it is possible that we receive channels
* out of order on destination side, causing incorrect mapping of
* source channels on destination side. Check channel MAGIC to
* decide type of channel. Please note this is best effort, postcopy
* preempt channel does not send any magic number so avoid it for
* postcopy live migration. Also tls live migration already does
* tls handshake while initializing main channel so with tls this
* issue is not possible.
*/
ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
sizeof(channel_magic), errp);

if (ret != 0) {
return;
}

/* The peeked magic is compared in big-endian form. */
default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
}
...

if (multifd_recv_setup(errp) != 0) {
return;
}

if (default_channel) {
/* Main channel: wrap it in a QEMUFile and register it. */
f = qemu_file_new_input(ioc);
migration_incoming_setup(f);
} else {
...
if (migrate_multifd()) {
multifd_recv_new_channel(ioc, &local_err);
}
...
}

if (migration_should_start_incoming(default_channel)) {
/* If it's a recovery, we're done */
if (postcopy_try_recover()) {
return;
}
migration_incoming_process();
}
}

/*
 * Record @f as the incoming state's from_src_file, unless a file has already
 * been recorded there, and put @f into non-blocking mode.
 */
static void migration_incoming_setup(QEMUFile *f)
{
    MigrationIncomingState *incoming = migration_incoming_get_current();

    /* The first registered file wins; later calls never replace it. */
    if (incoming->from_src_file == NULL) {
        incoming->from_src_file = f;
    }

    qemu_file_set_blocking(f, false);
}

前面qemu_savevm_state_header章节可知,无论是主线程还是multifd线程,其最开始发送的数据格式头都如下所示

1
2
3
4
struct {
uint32_t magic;
uint32_t version;
}

因此qemu可以获取magic字段(这里只是获取,并不会从流中移除)并进行相应的处理:如果是QEMU_VM_FILE_MAGIC,则后续会调用migration_incoming_setup()设置主通道;如果不是,则后续会调用multifd_recv_new_channel处理multifd接收线程

除此之外,这里也会初始化几个目的端所需的重要数据结构,即MultiFDRecvParams和multifd_recv_state

其中,MultiFDRecvParams包含目的端multifd线程在热迁移过程中所需的状态数据,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
typedef struct {
/* Fields are only written at creating/deletion time */
/* No lock required for them, they are read only */

/* channel number */
uint8_t id;
/* channel thread name */
char *name;
...
/* communication channel */
QIOChannel *c;
...
/* syncs main thread and channels */
QemuSemaphore sem_sync;
...
/* multifd flags for each packet */
uint32_t flags;
...

/* thread local variables. No locking required */

/* pointer to the packet */
MultiFDPacket_t *packet;
...
} MultiFDRecvParams;

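类似前面章节介绍的MultiFDSendParams,其中packet是源端multifd线程发送的元数据信息,包含脏页数据或同步信息数据;而sem_sync则用于主线程与multifd线程之间的同步:multifd线程收到同步信息后,会在其上调用qemu_sem_wait()进行等待,被唤醒后才继续接收后续数据(multifd线程通知主线程时使用的是后文multifd_recv_state中的sem_sync)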

multifd_recv_state类似于multifd_send_state,包含热迁移目的端multifd特性所需的全局状态数据,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * Global state of the multifd receive side. The object is allocated in
 * multifd_recv_setup() and reached through the multifd_recv_state pointer.
 */
struct {
/* per-channel receive parameters, indexed by channel id */
MultiFDRecvParams *params;
/* NOTE(review): the role of 'data' is not visible in this excerpt */
MultiFDRecvData *data;
/* number of created threads */
int count;
/*
* This is always posted by the recv threads, the migration thread
* uses it to wait for recv threads to finish assigned tasks.
*/
QemuSemaphore sem_sync;
/* global number of generated multifd packets */
uint64_t packet_num;
/* NOTE(review): presumably what multifd_recv_should_exit() checks; confirm */
int exiting;
/* multifd ops */
MultiFDMethods *ops;
} *multifd_recv_state;

其中params是所有multifd线程的MultiFDRecvParams数组;sem_sync是multifd线程用来通知主线程完成了同步信息的接收

multifd线程

如前面介绍的,如果是源端multifd线程连接的监听端口,则其回调函数会调用multifd_recv_new_channel创建multifd接收线程,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
void multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
{
MultiFDRecvParams *p;
Error *local_err = NULL;
int id;

if (use_packets) {
id = multifd_recv_initial_packet(ioc, &local_err);
...
}
...

p = &multifd_recv_state->params[id];
...
qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
QEMU_THREAD_JOINABLE);
...
}

/*
 * Read and validate the initial handshake message (MultiFDInit_t) that the
 * source sends on a newly connected multifd channel.
 *
 * @c:    channel to read the message from
 * @errp: filled with a description of the problem on failure
 *
 * The magic and version fields are converted with be32_to_cpu() before being
 * compared with MULTIFD_MAGIC / MULTIFD_VERSION, and the uuid carried by the
 * message must match the local qemu_uuid.
 *
 * Returns the channel id carried by the message, or -1 on failure.
 */
static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
{
    MultiFDInit_t msg;
    int ret;

    ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
    if (ret != 0) {
        return -1;
    }

    msg.magic = be32_to_cpu(msg.magic);
    msg.version = be32_to_cpu(msg.version);

    if (msg.magic != MULTIFD_MAGIC) {
        error_setg(errp, "multifd: received packet magic %x "
                   "expected %x", msg.magic, MULTIFD_MAGIC);
        return -1;
    }

    if (msg.version != MULTIFD_VERSION) {
        error_setg(errp, "multifd: received packet version %u "
                   "expected %u", msg.version, MULTIFD_VERSION);
        return -1;
    }

    if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
        char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
        char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);

        error_setg(errp, "multifd: received uuid '%s' and expected "
                   "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
        g_free(uuid);
        g_free(msg_uuid);
        return -1;
    }

    /*
     * The caller uses the returned id as an index into
     * multifd_recv_state->params[], so valid ids are 0 .. channels - 1.
     * An id equal to the channel count has to be rejected as well,
     * otherwise the caller would index past the last channel.
     */
    if (msg.id >= migrate_multifd_channels()) {
        error_setg(errp, "multifd: received channel id %u is greater than "
                   "number of channels %u", msg.id, migrate_multifd_channels());
        return -1;
    }

    return msg.id;
}

可以看到,与前面章节类似,其首先通过multifd_recv_initial_packet()校验源端multifd线程发送的初始化信息MultiFDInit_t(magic、version、uuid和通道id),然后创建对应的接收线程multifd_recv_thread(),其内容如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
/*
 * Method table for multifd without compression: every send/recv setup,
 * cleanup and data-path hook points at the matching nocomp_* function.
 */
static MultiFDMethods multifd_nocomp_ops = {
.send_setup = nocomp_send_setup,
.send_cleanup = nocomp_send_cleanup,
.send_prepare = nocomp_send_prepare,
.recv_setup = nocomp_recv_setup,
.recv_cleanup = nocomp_recv_cleanup,
.recv = nocomp_recv
};

/*
 * Method tables indexed by compression type. multifd_recv_setup() stores
 * multifd_ops[migrate_multifd_compression()] in multifd_recv_state->ops.
 * Only the MULTIFD_COMPRESSION_NONE slot is filled in by this initializer.
 */
static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = {
[MULTIFD_COMPRESSION_NONE] = &multifd_nocomp_ops,
};

//#0 multifd_recv_setup (errp=0x7fffffffe298) at ../migration/multifd.c:1562
//#1 0x0000555555b64696 in migration_ioc_process_incoming (ioc=0x555557972b60, errp=0x7fffffffe298) at ../migration/migration.c:931
//#2 0x0000555555b57799 in migration_channel_process_incoming (ioc=0x555557972b60) at ../migration/channel.c:45
//#3 0x0000555555b86f2d in socket_accept_incoming_migration (listener=0x55555790a7b0, cioc=0x555557972b60, opaque=0x0) at ../migration/socket.c:150
//#4 0x0000555555ddd2d5 in qio_net_listener_channel_func (ioc=0x5555572ee1a0, condition=G_IO_IN, opaque=0x55555790a7b0) at ../io/net-listener.c:54
//#5 0x0000555555dd6a55 in qio_channel_fd_source_dispatch (source=0x55555718a0a0, callback=0x555555ddd255 <qio_net_listener_channel_func>, user_data=0x55555790a7b0) at ../io/channel-watch.c:84
//#6 0x00007ffff7da5385 in ?? () from target:/lib/x86_64-linux-gnu/libglib-2.0.so.0
//#7 0x00007ffff7da7c78 in g_main_context_dispatch () from target:/lib/x86_64-linux-gnu/libglib-2.0.so.0
//#8 0x0000555555faf750 in glib_pollfds_poll () at ../util/main-loop.c:287
//#9 0x0000555555faf7db in os_host_main_loop_wait (timeout=705145000) at ../util/main-loop.c:310
//#10 0x0000555555faf907 in main_loop_wait (nonblocking=0) at ../util/main-loop.c:589
//#11 0x0000555555b3fbac in qemu_main_loop () at ../system/runstate.c:783
//#12 0x0000555555db435e in qemu_default_main () at ../system/main.c:37
//#13 0x0000555555db439f in main (argc=55, argv=0x7fffffffe638) at ../system/main.c:48
int multifd_recv_setup(Error **errp)
{
...
/*
* Return successfully if multiFD recv state is already initialised
* or multiFD is not enabled.
*/
if (multifd_recv_state || !migrate_multifd()) {
return 0;
}

thread_count = migrate_multifd_channels();
multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
...
multifd_recv_state->ops = multifd_ops[migrate_multifd_compression()];
}

static void *multifd_recv_thread(void *opaque)
{
MultiFDRecvParams *p = opaque;
Error *local_err = NULL;
...

while (true) {
uint32_t flags = 0;
bool has_data = false;
p->normal_num = 0;

if (multifd_recv_should_exit()) {
break;
}

ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
p->packet_len, &local_err);
if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */
break;
}

qemu_mutex_lock(&p->mutex);
ret = multifd_recv_unfill_packet(p, &local_err);
if (ret) {
qemu_mutex_unlock(&p->mutex);
break;
}

flags = p->flags;
/* recv methods don't know how to handle the SYNC flag */
p->flags &= ~MULTIFD_FLAG_SYNC;
has_data = p->normal_num || p->zero_num;
qemu_mutex_unlock(&p->mutex);
...

if (has_data) {
ret = multifd_recv_state->ops->recv(p, &local_err);
if (ret != 0) {
break;
}
}

if (flags & MULTIFD_FLAG_SYNC) {
qemu_sem_post(&multifd_recv_state->sem_sync);
qemu_sem_wait(&p->sem_sync);
}
}
...
return NULL;
}

参考前面章节,其会调用qio_channel_read_all_eof()接收源端multifd发送的packet数据,并调用multifd_recv_unfill_packet()解析出其中的元数据信息,然后进行相应的处理:如果包含脏页信息,则调用multifd_recv_state->ops->recv()(即nocomp_recv())接收脏页数据;如果包含同步信息(MULTIFD_FLAG_SYNC),则和主线程完成相关同步操作

process_incoming_migration_co

根据前面章节可知,在处理最后一个新连接时,其会调用migration_incoming_process()创建process_incoming_migration_co协程,相当于发送端热迁移主线程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
{
...
if (migration_should_start_incoming(default_channel)) {
migration_incoming_process();
}
}

/*
* Returns true when we want to start a new incoming migration process,
* false otherwise.
*/
static bool migration_should_start_incoming(bool main_channel)
{
/* Multifd doesn't start unless all channels are established */
if (migrate_multifd()) {
return migration_has_all_channels();
}
...
}

/**
* @migration_has_all_channels: We have received all channels that we need
*
* Returns true when we have got connections to all the channels that
* we need for migration.
*/
bool migration_has_all_channels(void)
{
MigrationIncomingState *mis = migration_incoming_get_current();

if (!mis->from_src_file) {
return false;
}

if (migrate_multifd()) {
return multifd_recv_all_channels_created();
}
...
}

//#0 migration_incoming_process () at ../migration/migration.c:826
//#1 0x0000555555b51dd7 in migration_ioc_process_incoming (ioc=0x555557a922a0, errp=0x7fffffffe2b0)
// at ../migration/migration.c:959
//#2 0x0000555555b455b9 in migration_channel_process_incoming (ioc=0x555557a922a0) at ../migration/channel.c:45
//#3 0x0000555555b73229 in socket_accept_incoming_migration (listener=0x5555578d57b0, cioc=0x555557a922a0, opaque=0x0)
// at ../migration/socket.c:150
//#4 0x0000555555dbaea1 in qio_net_listener_channel_func (ioc=0x5555572b91a0, condition=G_IO_IN, opaque=0x5555578d57b0)
// at ../io/net-listener.c:54
//#5 0x0000555555db4a8b in qio_channel_fd_source_dispatch (source=0x5555571550a0,
// callback=0x555555dbae1d <qio_net_listener_channel_func>, user_data=0x5555578d57b0) at ../io/channel-watch.c:84
//#6 0x00007ffff7da5385 in ?? () from target:/lib/x86_64-linux-gnu/libglib-2.0.so.0
//#7 0x00007ffff7da7c78 in g_main_context_dispatch () from target:/lib/x86_64-linux-gnu/libglib-2.0.so.0
//#8 0x0000555555f7d9b6 in glib_pollfds_poll () at ../util/main-loop.c:287
//#9 0x0000555555f7da44 in os_host_main_loop_wait (timeout=59822371073000) at ../util/main-loop.c:310
//#10 0x0000555555f7db64 in main_loop_wait (nonblocking=0) at ../util/main-loop.c:589
//#11 0x0000555555b2e067 in qemu_main_loop () at ../system/runstate.c:783
//#12 0x0000555555d92c24 in qemu_default_main () at ../system/main.c:37
//#13 0x0000555555d92c61 in main (argc=55, argv=0x7fffffffe638) at ../system/main.c:48
/*
 * Create a coroutine that runs process_incoming_migration_co() with a NULL
 * opaque argument and enter it right away.
 */
void migration_incoming_process(void)
{
    qemu_coroutine_enter(
        qemu_coroutine_create(process_incoming_migration_co, NULL));
}

可以看到,协程调用的函数是process_incoming_migration_co(),其用于完成热迁移接收端的设备/内存脏数据的迭代逻辑,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
static void coroutine_fn
process_incoming_migration_co(void *opaque)
{
...
mis->loadvm_co = qemu_coroutine_self();
ret = qemu_loadvm_state(mis->from_src_file);
mis->loadvm_co = NULL;
...
migration_bh_schedule(process_incoming_migration_bh, mis);
return;
}

//#0 qemu_loadvm_state (f=0x5555571184d0) at ../migration/savevm.c:2928
//#1 0x0000555555b51893 in process_incoming_migration_co (opaque=0x0)
// at ../migration/migration.c:755
//#2 0x0000555555f8041f in coroutine_trampoline (i0=1470332912, i1=21845)
// at ../util/coroutine-ucontext.c:175
//#3 0x00007ffff7bab890 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#4 0x00007fffffffdde0 in ?? ()
//#5 0x0000000000000000 in ?? ()
int qemu_loadvm_state(QEMUFile *f)
{
...
ret = qemu_loadvm_state_header(f);
if (ret) {
return ret;
}

if (qemu_loadvm_state_setup(f) != 0) {
return -EINVAL;
}
...
cpu_synchronize_all_pre_loadvm();
...
ret = qemu_loadvm_state_main(f, mis);
...
qemu_loadvm_state_cleanup();
cpu_synchronize_all_post_init();

return ret;
}

可以看到,基本和前面源端migration_thread的整体逻辑结构一致:在qemu_loadvm_state_header()中接收源端主线程发送的初始信息;在qemu_loadvm_state_setup()中初始化每个设备的热迁移状态;在qemu_loadvm_state_main()中接收设备/脏页数据,完成热迁移的核心逻辑

qemu_loadvm_state_header

前面qemu_savevm_state_header描述了源端发送的相关信息,目的端则进行相应的处理,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
//#0  qemu_loadvm_state_header (f=0x5555571184d0) at ../migration/savevm.c:2690
//#1 0x0000555555b71559 in qemu_loadvm_state (f=0x5555571184d0)
// at ../migration/savevm.c:2937
//#2 0x0000555555b51893 in process_incoming_migration_co (opaque=0x0)
// at ../migration/migration.c:755
//#3 0x0000555555f8041f in coroutine_trampoline (i0=1470332912, i1=21845)
// at ../util/coroutine-ucontext.c:175
//#4 0x00007ffff7bab890 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#5 0x00007fffffffdde0 in ?? ()
//#6 0x0000000000000000 in ?? ()
static int qemu_loadvm_state_header(QEMUFile *f)
{
unsigned int v;
int ret;

v = qemu_get_be32(f);
if (v != QEMU_VM_FILE_MAGIC) {
error_report("Not a migration stream");
return -EINVAL;
}

v = qemu_get_be32(f);
if (v == QEMU_VM_FILE_VERSION_COMPAT) {
error_report("SaveVM v2 format is obsolete and don't work anymore");
return -ENOTSUP;
}
if (v != QEMU_VM_FILE_VERSION) {
error_report("Unsupported migration stream version");
return -ENOTSUP;
}
...
return 0;
}
qemu_loadvm_state_setup

类似于前面qemu_savevm_state_setup章节内容,其会初始化目的端每个设备的热迁移状态,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
static int qemu_loadvm_state_setup(QEMUFile *f)
{
...
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (!se->ops || !se->ops->load_setup) {
continue;
}
if (se->ops->is_active) {
if (!se->ops->is_active(se->opaque)) {
continue;
}
}

ret = se->ops->load_setup(f, se->opaque);
if (ret < 0) {
qemu_file_set_error(f, ret);
error_report("Load state of device %s failed", se->idstr);
return ret;
}
}
return 0;
}

可以看到,目的端会遍历savevm_state所有设备注册的SaveStateEntry信息并调用热迁移起始状态的回调函数。但需要注意的是,这里并没有处理qemu_savevm_state_setup()中发送的SECTION格式数据,这些数据会在后续迭代中进行处理

这里同样以内存设备为例介绍一下具体的设备回调函数大致逻辑,其会调用ram_load_setup()进行初始化,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
//#0  ram_load_setup (f=0x5555571184d0, opaque=0x555556e6fc40 <ram_state>)
// at ../migration/ram.c:3700
//#1 0x0000555555b70fad in qemu_loadvm_state_setup (f=0x5555571184d0)
// at ../migration/savevm.c:2758
//#2 0x0000555555b71576 in qemu_loadvm_state (f=0x5555571184d0)
// at ../migration/savevm.c:2942
//#3 0x0000555555b51893 in process_incoming_migration_co (opaque=0x0)
// at ../migration/migration.c:755
//#4 0x0000555555f8041f in coroutine_trampoline (i0=1470332912, i1=21845)
// at ../util/coroutine-ucontext.c:175
//#5 0x00007ffff7bab890 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#6 0x00007fffffffdde0 in ?? ()
//#7 0x0000000000000000 in ?? ()
/*
 * load_setup callback for RAM: runs xbzrle_load_setup() followed by
 * ramblock_recv_map_init(). Neither @f nor @opaque is used, and nothing is
 * read from the stream here.
 *
 * Always returns 0.
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    (void)f;
    (void)opaque;

    xbzrle_load_setup();
    ramblock_recv_map_init();
    return 0;
}

可以看到,其确实没有处理源端发送的SECTION,只是初始化了设备迭代需要的相关数据结构

迭代

在完成初始化后,则是热迁移的核心逻辑,即设备/脏页数据的迭代过程,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
{
uint8_t section_type;
int ret = 0;

retry:
while (true) {
section_type = qemu_get_byte(f);

ret = qemu_file_get_error_obj_any(f, mis->postcopy_qemufile_dst, NULL);
if (ret) {
break;
}

trace_qemu_loadvm_state_section(section_type);
switch (section_type) {
case QEMU_VM_SECTION_START:
case QEMU_VM_SECTION_FULL:
ret = qemu_loadvm_section_start_full(f, mis, section_type);
if (ret < 0) {
goto out;
}
break;
case QEMU_VM_SECTION_PART:
case QEMU_VM_SECTION_END:
ret = qemu_loadvm_section_part_end(f, mis, section_type);
if (ret < 0) {
goto out;
}
break;
case QEMU_VM_COMMAND:
ret = loadvm_process_command(f);
trace_qemu_loadvm_state_section_command(ret);
if ((ret < 0) || (ret == LOADVM_QUIT)) {
goto out;
}
break;
case QEMU_VM_EOF:
/* This is the end of migration */
goto out;
default:
error_report("Unknown savevm section type %d", section_type);
ret = -EINVAL;
goto out;
}
}
...
return ret;
}

源端热迁移主线程每个设备的脏页数据是以SECTION为单位进行迭代的,因此这里也以SECTION为单位进行处理即可。

前面章节介绍qemu热迁移机制将设备分为两类:数据量较小的设备,其在迁移完成阶段一次性打包传输,其数据都是QEMU_VM_SECTION_FULL类型的SECTION;而对于数据量较大的设备,其在初始化阶段数据是QEMU_VM_SECTION_START类型的SECTION,在迭代阶段数据是QEMU_VM_SECTION_PART类型的SECTION,在完成阶段数据都是QEMU_VM_SECTION_END类型的SECTION

不同类型的SECTION实际上最后都会调用vmstate_load()处理脏页数据,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
//#0  save_section_header (f=0x5555571184d0, se=0x555556f00c80, 
// section_type=1 '\001') at ../migration/savevm.c:985
//#1 0x0000555555b6e295 in qemu_savevm_state_setup (f=0x5555571184d0)
// at ../migration/savevm.c:1344
//#2 0x0000555555b57373 in migration_thread (opaque=0x555556effff0)
// at ../migration/migration.c:3477
//#3 0x0000555555f63f51 in qemu_thread_start (args=0x555557a83510)
// at ../util/qemu-thread-posix.c:541
//#4 0x00007ffff7be5b7b in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#5 0x00007ffff7c637f8 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
void qemu_savevm_state_setup(QEMUFile *f)
{
...
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
...
save_section_header(f, se, QEMU_VM_SECTION_START);

ret = se->ops->save_setup(f, se->opaque);
...
}
...
}

//#0 save_section_header (f=0x5555571184d0, se=0x555556efe790,
// section_type=4 '\004') at ../migration/savevm.c:985
//#1 0x0000555555b6d5e9 in vmstate_save (f=0x5555571184d0, se=0x555556efe790,
// vmdesc=0x7fffc4004280) at ../migration/savevm.c:1027
//#2 0x0000555555b6eb43 in qemu_savevm_state_complete_precopy_non_iterable (
// f=0x5555571184d0, in_postcopy=false, inactivate_disks=true)
// at ../migration/savevm.c:1555
//#3 0x0000555555b6ede1 in qemu_savevm_state_complete_precopy (
// f=0x5555571184d0, iterable_only=false, inactivate_disks=true)
// at ../migration/savevm.c:1630
//#4 0x0000555555b55db7 in migration_completion_precopy (s=0x555556effff0,
// current_active_state=0x7fffcf5fda98) at ../migration/migration.c:2749
//#5 0x0000555555b55f40 in migration_completion (s=0x555556effff0)
// at ../migration/migration.c:2813
//#6 0x0000555555b56d01 in migration_iteration_run (s=0x555556effff0)
// at ../migration/migration.c:3237
//#7 0x0000555555b573dd in migration_thread (opaque=0x555556effff0)
// at ../migration/migration.c:3489
//#8 0x0000555555f63f51 in qemu_thread_start (args=0x555557a83510)
// at ../util/qemu-thread-posix.c:541
//#9 0x00007ffff7be5b7b in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#10 0x00007ffff7c637f8 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
bool in_postcopy,
bool inactivate_disks)
{
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
...
ret = vmstate_save(f, se, vmdesc);
...
}
...
}

static int vmstate_save(QEMUFile *f, SaveStateEntry *se, JSONWriter *vmdesc)
{
...
save_section_header(f, se, QEMU_VM_SECTION_FULL);
...
if (!se->vmsd) {
vmstate_save_old_style(f, se, vmdesc);
} else {
ret = vmstate_save_state_with_err(f, se->vmsd, se->opaque, vmdesc, &local_err);
...
}
...
}

/*
 * Write the header that introduces a section in the migration stream.
 *
 * Every section starts with the one-byte @section_type followed by the
 * 32-bit se->section_id. QEMU_VM_SECTION_FULL and QEMU_VM_SECTION_START
 * sections additionally carry the id string (one length byte plus the
 * characters, without a terminator), se->instance_id and se->version_id.
 */
static void save_section_header(QEMUFile *f, SaveStateEntry *se,
                                uint8_t section_type)
{
    size_t idstr_len;

    qemu_put_byte(f, section_type);
    qemu_put_be32(f, se->section_id);

    /* Other section types stop after type + section id. */
    if (section_type != QEMU_VM_SECTION_FULL &&
        section_type != QEMU_VM_SECTION_START) {
        return;
    }

    /* ID string */
    idstr_len = strlen(se->idstr);
    qemu_put_byte(f, idstr_len);
    qemu_put_buffer(f, (uint8_t *)se->idstr, idstr_len);

    qemu_put_be32(f, se->instance_id);
    qemu_put_be32(f, se->version_id);
}

//#0 qemu_loadvm_section_start_full (f=0x5555571184d0, mis=0x555556f00750,
// type=1 '\001') at ../migration/savevm.c:2559
//#1 0x0000555555b71414 in qemu_loadvm_state_main (f=0x5555571184d0,
// mis=0x555556f00750) at ../migration/savevm.c:2869
//#2 0x0000555555b715b1 in qemu_loadvm_state (f=0x5555571184d0)
// at ../migration/savevm.c:2952
//#3 0x0000555555b51893 in process_incoming_migration_co (opaque=0x0)
// at ../migration/migration.c:755
//#4 0x0000555555f8041f in coroutine_trampoline (i0=1470332912, i1=21845)
// at ../util/coroutine-ucontext.c:175
//#5 0x00007ffff7bab890 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#6 0x00007fffffffdde0 in ?? ()
//#7 0x0000000000000000 in ?? ()
static int
qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis,
uint8_t type)
{
...
/* Read section start */
section_id = qemu_get_be32(f);
if (!qemu_get_counted_string(f, idstr)) {
...
}
...
instance_id = qemu_get_be32(f);
version_id = qemu_get_be32(f);
...

/* Find savevm section */
se = find_se(idstr, instance_id);
if (se == NULL) {
error_report("Unknown savevm section or instance '%s' %"PRIu32". "
"Make sure that your current VM setup matches your "
"saved VM setup, including any hotplugged devices",
idstr, instance_id);
return -EINVAL;
}

/* Validate version */
if (version_id > se->version_id) {
error_report("savevm: unsupported version %d for '%s' v%d",
version_id, idstr, se->version_id);
return -EINVAL;
}
se->load_version_id = version_id;
se->load_section_id = section_id;
...
ret = vmstate_load(f, se);
...
return 0;
}

可以看到,对于QEMU_VM_SECTION_START和QEMU_VM_SECTION_FULL类型的SECTION,源端会调用save_section_header()依次发送section_type、section_id、id string、instance_id和version_id,目的端在qemu_loadvm_section_start_full()中也会处理这些字段并最终调用vmstate_load()进行恢复

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
//#0  save_section_header (f=0x5555571184d0, se=0x555556f00c80, 
// section_type=2 '\002') at ../migration/savevm.c:985
//#1 0x0000555555b6e5a5 in qemu_savevm_state_iterate (f=0x5555571184d0,
// postcopy=false) at ../migration/savevm.c:1428
//#2 0x0000555555b56d9c in migration_iteration_run (s=0x555556effff0)
// at ../migration/migration.c:3252
//#3 0x0000555555b573dd in migration_thread (opaque=0x555556effff0)
// at ../migration/migration.c:3489
//#4 0x0000555555f63f51 in qemu_thread_start (args=0x555557a83510)
// at ../util/qemu-thread-posix.c:541
//#5 0x00007ffff7be5b7b in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#6 0x00007ffff7c637f8 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
{
...
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
save_section_header(f, se, QEMU_VM_SECTION_PART);

ret = se->ops->save_live_iterate(f, se->opaque);
...
}
...
}

//#0 save_section_header (f=0x5555571184d0, se=0x555556f00c80,
// section_type=3 '\003') at ../migration/savevm.c:985
//#1 0x0000555555b6e99c in qemu_savevm_state_complete_precopy_iterable (
// f=0x5555571184d0, in_postcopy=false) at ../migration/savevm.c:1517
//#2 0x0000555555b6edb7 in qemu_savevm_state_complete_precopy (
// f=0x5555571184d0, iterable_only=false, inactivate_disks=true)
// at ../migration/savevm.c:1620
//#3 0x0000555555b55db7 in migration_completion_precopy (s=0x555556effff0,
// current_active_state=0x7fffcf5fda98) at ../migration/migration.c:2749
//#4 0x0000555555b55f40 in migration_completion (s=0x555556effff0)
// at ../migration/migration.c:2813
//#5 0x0000555555b56d01 in migration_iteration_run (s=0x555556effff0)
// at ../migration/migration.c:3237
//#6 0x0000555555b573dd in migration_thread (opaque=0x555556effff0)
// at ../migration/migration.c:3489
//#7 0x0000555555f63f51 in qemu_thread_start (args=0x555557a83510)
// at ../util/qemu-thread-posix.c:541
//#8 0x00007ffff7be5b7b in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#9 0x00007ffff7c637f8 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
static
int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
{
...
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
save_section_header(f, se, QEMU_VM_SECTION_END);

ret = se->ops->save_live_complete_precopy(f, se->opaque);
...
}
...
}

/*
 * Write the header that introduces a section in the migration stream.
 *
 * Every section starts with the one-byte @section_type followed by the
 * 32-bit se->section_id. QEMU_VM_SECTION_FULL and QEMU_VM_SECTION_START
 * sections additionally carry the id string (one length byte plus the
 * characters, without a terminator), se->instance_id and se->version_id.
 */
static void save_section_header(QEMUFile *f, SaveStateEntry *se,
                                uint8_t section_type)
{
    size_t idstr_len;

    qemu_put_byte(f, section_type);
    qemu_put_be32(f, se->section_id);

    /* Other section types stop after type + section id. */
    if (section_type != QEMU_VM_SECTION_FULL &&
        section_type != QEMU_VM_SECTION_START) {
        return;
    }

    /* ID string */
    idstr_len = strlen(se->idstr);
    qemu_put_byte(f, idstr_len);
    qemu_put_buffer(f, (uint8_t *)se->idstr, idstr_len);

    qemu_put_be32(f, se->instance_id);
    qemu_put_be32(f, se->version_id);
}

//#0 qemu_loadvm_section_part_end (f=0x5555571184d0, mis=0x555556f00750,
// type=2 '\002') at ../migration/savevm.c:2638
//#1 0x0000555555b71439 in qemu_loadvm_state_main (f=0x5555571184d0,
// mis=0x555556f00750) at ../migration/savevm.c:2876
//#2 0x0000555555b715b1 in qemu_loadvm_state (f=0x5555571184d0)
// at ../migration/savevm.c:2952
//#3 0x0000555555b51893 in process_incoming_migration_co (opaque=0x0)
// at ../migration/migration.c:755
//#4 0x0000555555f8041f in coroutine_trampoline (i0=1470332912, i1=21845)
// at ../util/coroutine-ucontext.c:175
//#5 0x00007ffff7bab890 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#6 0x00007fffffffdde0 in ?? ()
//#7 0x0000000000000000 in ?? ()
static int
qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis,
uint8_t type)
{
...
section_id = qemu_get_be32(f);
...
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (se->load_section_id == section_id) {
break;
}
}
...
ret = vmstate_load(f, se);
if (ret < 0) {
error_report("error while loading state section id %d(%s)",
section_id, se->idstr);
return ret;
}

if (trace_downtime) {
end_ts = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
trace_vmstate_downtime_load("iterable", se->idstr,
se->instance_id, end_ts - start_ts);
}

if (!check_section_footer(f, se)) {
return -EINVAL;
}

return 0;
}

类似的,对于QEMU_VM_SECTION_PART和QEMU_VM_SECTION_END类型的SECTION,源端同样会调用save_section_header(),但只发送section_type和section_id字段,目的端在qemu_loadvm_section_part_end()中处理这些字段并最终调用vmstate_load()进行恢复

这里仍然以内存设备为例介绍一下。具体的,在迭代时,源端在qemu_savevm_state_iterate()中通过save_section_header()设置SECTION相关信息,然后调用ram_save_iterate()完成脏页迭代并发送RAM_SAVE_FLAG_EOS完成内存迭代。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
{
...
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
...
save_section_header(f, se, QEMU_VM_SECTION_PART);

ret = se->ops->save_live_iterate(f, se->opaque);
...
}
...
}

//#0 ram_save_iterate (f=0x5555571184d0, opaque=0x555556e6fc40 <ram_state>)
// at ../migration/ram.c:3194
//#1 0x0000555555b6e5cb in qemu_savevm_state_iterate (f=0x5555571184d0,
// postcopy=false) at ../migration/savevm.c:1430
//#2 0x0000555555b56d9c in migration_iteration_run (s=0x555556effff0)
// at ../migration/migration.c:3252
//#3 0x0000555555b573dd in migration_thread (opaque=0x555556effff0)
// at ../migration/migration.c:3489
//#4 0x0000555555f63f51 in qemu_thread_start (args=0x555557a83510)
// at ../util/qemu-thread-posix.c:541
//#5 0x00007ffff7be5b7b in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#6 0x00007ffff7c637f8 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
...
/*
* We'll take this lock a little bit long, but it's okay for two reasons.
* Firstly, the only possible other thread to take it is who calls
* qemu_guest_free_page_hint(), which should be rare; secondly, see
* MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
* guarantees that we'll at least released it in a regular basis.
*/
WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) {
WITH_RCU_READ_LOCK_GUARD() {
while ((ret = migration_rate_exceeded(f)) == 0 ||
postcopy_has_request(rs)) {
...
pages = ram_find_and_save_block(rs);
...
}
}
}

out:
if (ret >= 0
&& migration_is_setup_or_active()) {
...
qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
...
}
...
}

目的端则在qemu_loadvm_state_main()中处理源端发送的SECTION数据,对于内存设备的QEMU_VM_SECTION_PART类型的SECTION数据,其调用qemu_loadvm_section_part_end()解析SECTION元数据,并通过vmstate_load()调用ram_load()回调函数进行处理,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
{
...
while (true) {
section_type = qemu_get_byte(f);
...

switch (section_type) {
...
case QEMU_VM_SECTION_PART:
ret = qemu_loadvm_section_part_end(f, mis, section_type);
...
}
}
...
}

static int
qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis,
uint8_t type)
{
...
section_id = qemu_get_be32(f);
...
ret = vmstate_load(f, se);
...
}

static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
{
...
if (!se->vmsd) { /* Old style */
return se->ops->load_state(f, se->opaque, se->load_version_id);
}
...
}

//#0 ram_load (f=0x5555571184d0, opaque=0x555556e6fc40 <ram_state>,
// version_id=4) at ../migration/ram.c:4390
//#1 0x0000555555b6d268 in vmstate_load (f=0x5555571184d0, se=0x555556f00c80)
// at ../migration/savevm.c:955
//#2 0x0000555555b70aad in qemu_loadvm_section_start_full (f=0x5555571184d0,
// mis=0x555556f00750, type=1 '\001') at ../migration/savevm.c:2614
//#3 0x0000555555b71414 in qemu_loadvm_state_main (f=0x5555571184d0,
// mis=0x555556f00750) at ../migration/savevm.c:2869
//#4 0x0000555555b715b1 in qemu_loadvm_state (f=0x5555571184d0)
// at ../migration/savevm.c:2952
//#5 0x0000555555b51893 in process_incoming_migration_co (opaque=0x0)
// at ../migration/migration.c:755
//#6 0x0000555555f8041f in coroutine_trampoline (i0=1470332912, i1=21845)
// at ../util/coroutine-ucontext.c:175
//#7 0x00007ffff7bab890 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#8 0x00007fffffffdde0 in ?? ()
//#9 0x0000000000000000 in ?? ()
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
...
WITH_RCU_READ_LOCK_GUARD() {
...
ret = ram_load_precopy(f);
...
}
...
}

其在ram_load_precopy()中,等待源端处理完内存脏页信息并发送RAM_SAVE_FLAG_EOS数据后完成一轮迭代,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
//#0  ram_load_precopy (f=0x5555571184d0) at ../migration/ram.c:4213
//#1 0x0000555555d407c4 in ram_load (f=0x5555571184d0,
// opaque=0x555556e6fc40 <ram_state>, version_id=4) at ../migration/ram.c:4419
//#2 0x0000555555b6d268 in vmstate_load (f=0x5555571184d0, se=0x555556f00c80)
// at ../migration/savevm.c:955
//#3 0x0000555555b70aad in qemu_loadvm_section_start_full (f=0x5555571184d0,
// mis=0x555556f00750, type=1 '\001') at ../migration/savevm.c:2614
//#4 0x0000555555b71414 in qemu_loadvm_state_main (f=0x5555571184d0,
// mis=0x555556f00750) at ../migration/savevm.c:2869
//#5 0x0000555555b715b1 in qemu_loadvm_state (f=0x5555571184d0)
// at ../migration/savevm.c:2952
//#6 0x0000555555b51893 in process_incoming_migration_co (opaque=0x0)
// at ../migration/migration.c:755
//#7 0x0000555555f8041f in coroutine_trampoline (i0=1470332912, i1=21845)
// at ../util/coroutine-ucontext.c:175
//#8 0x00007ffff7bab890 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#9 0x00007fffffffdde0 in ?? ()
//#10 0x0000000000000000 in ?? ()
/*
 * ram_load_precopy (excerpt; "..." marks code elided by the article).
 *
 * Reads RAM records from @f in a loop. The loop ends as soon as ret becomes
 * non-zero or the current record's flags contain RAM_SAVE_FLAG_EOS, i.e. one
 * call consumes the stream up to and including one end-of-section marker.
 */
static int ram_load_precopy(QEMUFile *f)
{
...
int flags = 0, ret = 0;
...
/* Keep going until an error is recorded or an EOS record was seen. */
while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
...
/* Each record starts with a big-endian 64-bit word; presumably the
 * flags are derived from it in the elided lines - TODO confirm. */
addr = qemu_get_be64(f);
...
/* RAM_SAVE_FLAG_CONTINUE is masked out before dispatching on the flags. */
switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
...
case RAM_SAVE_FLAG_EOS:
...
break;
...
}
}
...
}
完成

根据前面章节可知,源端在暂停子机运行并迭代完最后的脏页数据后,会发送QEMU_VM_EOF数据,完成所有数据的传输,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/*
 * migration_completion_precopy (excerpt; "..." marks elided code).
 *
 * Source-side completion step of a precopy migration. Unless
 * migrate_mode_is_cpr(s) is true, the VM is first stopped through
 * migration_stop_vm() with RUN_STATE_FINISH_MIGRATE. Afterwards
 * qemu_savevm_state_complete_precopy() writes the remaining state to
 * s->to_dst_file.
 *
 * @s:                    migration state of the outgoing migration
 * @current_active_state: not used in the visible excerpt
 *
 * Returns the value held in ret after the last visible call.
 */
static int migration_completion_precopy(MigrationState *s,
int *current_active_state)
{
if (!migrate_mode_is_cpr(s)) {
ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
...
}
...
/* iterable_only == false: request the full completion, not only the
 * iterable part. */
ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
s->block_inactive);
...
return ret;
}

/*
 * qemu_savevm_state_complete_precopy (excerpt; "..." marks elided code).
 *
 * In the visible part, the non-iterable completion data is written through
 * qemu_savevm_state_complete_precopy_non_iterable() and the stream is then
 * flushed.
 *
 * @f:                stream to the destination
 * @iterable_only:    not used in the visible excerpt (presumably it selects
 *                    a jump to the flush label in the elided code - TODO
 *                    confirm)
 * @inactivate_disks: forwarded to the non-iterable helper
 *
 * Returns the helper's non-zero result if it fails, otherwise the result of
 * qemu_fflush(f).
 */
int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
bool inactivate_disks)
{
...
ret = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy,
inactivate_disks);
if (ret) {
return ret;
}

flush:
return qemu_fflush(f);
}

//#0 qemu_put_byte (f=0x5555576c0700, v=21845) at ../migration/qemu-file.c:560
//#1 0x0000555555b6ec76 in qemu_savevm_state_complete_precopy_non_iterable (f=0x5555571184d0, in_postcopy=false,
// inactivate_disks=true) at ../migration/savevm.c:1582
//#2 0x0000555555b6ede1 in qemu_savevm_state_complete_precopy (f=0x5555571184d0, iterable_only=false, inactivate_disks=true)
// at ../migration/savevm.c:1630
//#3 0x0000555555b55db7 in migration_completion_precopy (s=0x555556effff0, current_active_state=0x7fffcf5fda98)
// at ../migration/migration.c:2749
//#4 0x0000555555b55f40 in migration_completion (s=0x555556effff0) at ../migration/migration.c:2813
//#5 0x0000555555b56d01 in migration_iteration_run (s=0x555556effff0) at ../migration/migration.c:3237
//#6 0x0000555555b573dd in migration_thread (opaque=0x555556effff0) at ../migration/migration.c:3489
//#7 0x0000555555f63f51 in qemu_thread_start (args=0x555557a83510) at ../util/qemu-thread-posix.c:541
//#8 0x00007ffff7be5b7b in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
//#9 0x00007ffff7c637f8 in ?? () from target:/lib/x86_64-linux-gnu/libc.so.6
/*
 * qemu_savevm_state_complete_precopy_non_iterable (excerpt; "..." marks
 * elided code).
 *
 * The visible part terminates the stream: when @in_postcopy is false a
 * single QEMU_VM_EOF byte is written to @f. When @in_postcopy is true no EOF
 * marker is emitted here.
 *
 * @f:                stream to the destination
 * @in_postcopy:      true when the migration is in postcopy
 * @inactivate_disks: not used in the visible excerpt
 *
 * Returns 0 on the visible path.
 */
int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
bool in_postcopy,
bool inactivate_disks)
{
...
if (!in_postcopy) {
/* Postcopy stream will still be going */
qemu_put_byte(f, QEMU_VM_EOF);
}
...
return 0;
}

而目的端收到QEMU_VM_EOF后,结束脏页迭代的接收流程并恢复虚拟机的全部状态,等待libvirt在finish阶段重启guest运行,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * qemu_loadvm_state (excerpt; "..." marks elided code).
 *
 * Destination-side entry point for loading the migration stream @f. The
 * visible part runs the section loop qemu_loadvm_state_main(), then calls
 * qemu_loadvm_state_cleanup() and cpu_synchronize_all_post_init().
 *
 * Returns the value held in ret when the function ends.
 */
int qemu_loadvm_state(QEMUFile *f)
{
...
/* Process sections until the loop terminates (e.g. on QEMU_VM_EOF). */
ret = qemu_loadvm_state_main(f, mis);
...
qemu_loadvm_state_cleanup();
cpu_synchronize_all_post_init();

return ret;
}

/*
 * qemu_loadvm_state_main (excerpt; "..." marks elided code).
 *
 * Section dispatch loop of the incoming migration: each iteration reads one
 * section-type byte from @f and switches on it. A QEMU_VM_EOF byte leaves
 * the loop through the out label.
 *
 * @f:   migration stream to read from
 * @mis: incoming migration state (not used in the visible excerpt)
 *
 * Returns the value held in ret at the out label.
 */
int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
{
retry:
while (true) {
/* Every section is introduced by a one-byte type tag. */
section_type = qemu_get_byte(f);
...
switch (section_type) {
...
case QEMU_VM_EOF:
/* This is the end of migration */
goto out;
...
}
}

out:
...
return ret;
}

生成cookie

这里和begin阶段一样,使用qemuMigrationCookieFormat根据flags指定的配置字段,序列化struct _qemuMigrationCookie类型的cookie,其为源端perform阶段的热迁移额外配置信息

finish阶段

概述

finish阶段则是在目的端处理热迁移结果,即如果此次热迁移成功,则恢复目的端子机的运行;如果失败,则销毁目的端子机,流程如下所示

热迁移finish阶段流程图

finish阶段的代码如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
/*
 * virDomainMigrateVersion3Full - Finish step (excerpt; "..." marks elided
 * code).
 *
 * After Perform, the destination driver's Finish callback is invoked with
 * the cookie produced by the previous step and a cancelled flag derived
 * from the Perform result. The callback's return value is stored in
 * ddomain.
 *
 * When useParams is true the extensible-parameter callback
 * (domainMigrateFinish3Params) is used, otherwise domainMigrateFinish3.
 */
static virDomainPtr
virDomainMigrateVersion3Full(virDomainPtr domain,
virConnectPtr dconn,
const char *xmlin,
const char *dname,
const char *uri,
unsigned long long bandwidth,
virTypedParameterPtr params,
int nparams,
bool useParams,
unsigned int flags)
{
...
/* If Perform returns < 0, then we need to cancel the VM
* startup on the destination
*/
cancelled = ret < 0 ? 1 : 0;

finish:
/*
* The status code from the source is passed to the destination.
* The dest can cleanup if the source indicated it failed to
* send all migration data. Returns NULL for ddomain if
* the dest was unable to complete migration.
*/
VIR_DEBUG("Finish3 %p ret=%d", dconn, ret);
/* The output cookie of the previous step becomes the input cookie of
 * Finish; g_steal_pointer() hands the buffer over and resets cookieout. */
VIR_FREE(cookiein);
cookiein = g_steal_pointer(&cookieout);
cookieinlen = cookieoutlen;
cookieoutlen = 0;
if (useParams) {
/* If no VIR_MIGRATE_PARAM_DEST_NAME string could be fetched from params,
 * fall back to the source domain's name; Finish is skipped (ddomain is
 * NULL) only when setting that fallback fails. */
if (virTypedParamsGetString(params, nparams,
VIR_MIGRATE_PARAM_DEST_NAME, NULL) <= 0 &&
virTypedParamsReplaceString(&params, &nparams,
VIR_MIGRATE_PARAM_DEST_NAME,
domain->name) < 0) {
ddomain = NULL;
} else {
ddomain = dconn->driver->domainMigrateFinish3Params
(dconn, params, nparams, cookiein, cookieinlen,
&cookieout, &cookieoutlen, destflags, cancelled);
}
} else {
/* Same default for the non-params API: use the source domain's name. */
dname = dname ? dname : domain->name;
ddomain = dconn->driver->domainMigrateFinish3
(dconn, dname, cookiein, cookieinlen, &cookieout, &cookieoutlen,
NULL, uri, destflags, cancelled);
}

if (cancelled) {
if (ddomain) {
/* Finish returned a domain even though it was told to cancel. */
VIR_ERROR(_("finish step ignored that migration was cancelled"));
} else {
/* If Finish reported a useful error, use it instead of the
* original "migration unexpectedly failed" error.
*
* This is ugly but we can't do better with the APIs we have. We
* only replace the error if Finish was called with cancelled == 1
* and reported a real error (old libvirt would report an error
* from RPC instead of MIGRATE_FINISH_OK), which only happens when
* the domain died on destination. To further reduce a possibility
* of false positives we also check that Perform returned
* VIR_ERR_OPERATION_FAILED.
*/
if (orig_err &&
orig_err->domain == VIR_FROM_QEMU &&
orig_err->code == VIR_ERR_OPERATION_FAILED) {
virErrorPtr err = virGetLastError();
if (err &&
err->domain == VIR_FROM_QEMU &&
err->code != VIR_ERR_MIGRATE_FINISH_OK) {
/* Drop the saved Perform error so the Finish error is kept. */
g_clear_pointer(&orig_err, virFreeError);
}
}
}
}
...
}

其核心就是调用dconn->driver->domainMigrateFinish3Params函数指针,即qemuDomainMigrateFinish3Params(),其基于前面perform阶段的结果,处理源端返回的cookie信息和目的端的虚拟机状态,并重新生成一个给源端的包含目的端finish阶段热迁移额外配置信息的cookie,其中核心逻辑在qemuMigrationDstFinishActive()中,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
//#0  qemuMigrationDstFinishActive
// (cookie_flags=60, driver=0x55caa06c0930, dconn=0x7fce80000c80, vm=0x7fce6407a820, cookiein=0x7fce8c0019c0 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>13d91ed2-8dc5-4c18-b3af-9e54aec2c1f1</uuid>\n <hostname>src</hostname>\n <hostuuid>43b3e83d-912b-48b6-b7e7-e0bd07f7f86d</hostuuid>\n <statistics>\n"..., cookieinlen=1141, cookieout=0x7fce9a38c970, cookieoutlen=0x7fce9a38c964, flags=131073, retcode=0, v3proto=true, timeReceived=1780186934878, finishJob=<synthetic pointer>) at ../src/qemu/qemu_migration.c:6851
//#1 qemuMigrationDstFinish
// (driver=driver@entry=0x55caa06c0930, dconn=dconn@entry=0x7fce80000c80, vm=<optimized out>, cookiein=cookiein@entry=0x7fce8c0019c0 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>13d91ed2-8dc5-4c18-b3af-9e54aec2c1f1</uuid>\n <hostname>src</hostname>\n <hostuuid>43b3e83d-912b-48b6-b7e7-e0bd07f7f86d</hostuuid>\n <statistics>\n"..., cookieinlen=cookieinlen@entry=1141, cookieout=cookieout@entry=0x7fce9a38c970, cookieoutlen=0x7fce9a38c964, flags=131073, retcode=0, v3proto=true) at ../src/qemu/qemu_migration.c:7010
//#2 0x00007fce982474b7 in qemuDomainMigrateFinish3Params
// (dconn=0x7fce80000c80, params=0x7fce8c001e40, nparams=3, cookiein=0x7fce8c0019c0 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>13d91ed2-8dc5-4c18-b3af-9e54aec2c1f1</uuid>\n <hostname>src</hostname>\n <hostuuid>43b3e83d-912b-48b6-b7e7-e0bd07f7f86d</hostuuid>\n <statistics>\n"..., cookieinlen=1141, cookieout=0x7fce9a38c970, cookieoutlen=0x7fce9a38c964, flags=131073, cancelled=0) at ../src/qemu/qemu_driver.c:11395
//#3 0x00007fce9d9de695 in virDomainMigrateFinish3Params
// (dconn=dconn@entry=0x7fce80000c80, params=0x7fce8c001e40, nparams=3, cookiein=0x7fce8c0019c0 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>13d91ed2-8dc5-4c18-b3af-9e54aec2c1f1</uuid>\n <hostname>src</hostname>\n <hostuuid>43b3e83d-912b-48b6-b7e7-e0bd07f7f86d</hostuuid>\n <statistics>\n"..., cookieinlen=1141, cookieout=cookieout@entry=0x7fce9a38c970, cookieoutlen=0x7fce9a38c964, flags=131073, cancelled=0) at ../src/libvirt-domain.c:5485
//#4 0x000055ca9505c88f in remoteDispatchDomainMigrateFinish3Params
// (server=0x55caa06afbf0, msg=0x55caa06ad1a0, client=<optimized out>, rerr=0x7fce9a38ca30, args=0x55caa06ad140, ret=0x7fce8c0013f0) at ../src/remote/remote_daemon_dispatch.c:5806
//#5 remoteDispatchDomainMigrateFinish3ParamsHelper (server=0x55caa06afbf0, client=<optimized out>, msg=0x55caa06ad1a0, rerr=0x7fce9a38ca30, args=0x55caa06ad140, ret=0x7fce8c0013f0)
// at src/remote/remote_daemon_dispatch_stubs.h:8656
//#6 0x00007fce9d95b88e in virNetServerProgramDispatchCall (prog=0x55caa06b0430, server=0x55caa06afbf0, client=0x7fce70000d80, msg=0x55caa06ad1a0) at ../src/rpc/virnetserverprogram.c:423
//#7 virNetServerProgramDispatch (prog=0x55caa06b0430, server=server@entry=0x55caa06afbf0, client=client@entry=0x7fce70000d80, msg=msg@entry=0x55caa06ad1a0)
// at ../src/rpc/virnetserverprogram.c:299
//#8 0x00007fce9d960e22 in virNetServerProcessMsg (srv=srv@entry=0x55caa06afbf0, client=0x7fce70000d80, prog=<optimized out>, msg=0x55caa06ad1a0) at ../src/rpc/virnetserver.c:135
//#9 0x00007fce9d96115d in virNetServerHandleJob (jobOpaque=0x7fce780a64f0, opaque=0x55caa06afbf0) at ../src/rpc/virnetserver.c:155
//#10 0x00007fce9d8a88bd in virThreadPoolWorker (opaque=<optimized out>) at ../src/util/virthreadpool.c:164
//#11 0x00007fce9d8a7f44 in virThreadHelper (data=<optimized out>) at ../src/util/virthread.c:256
//#12 0x00007fce9d201b7b in ??? () at /lib/x86_64-linux-gnu/libc.so.6
//#13 0x00007fce9d27f7f8 in ??? () at /lib/x86_64-linux-gnu/libc.so.6
/*
 * qemuMigrationDstFinishActive (excerpt; "..." marks elided code).
 *
 * Destination-side Finish handling. The visible flow is:
 *   1. parse the incoming cookie (cookiein) into mig;
 *   2. if @retcode is non-zero, check the monitor and go to the error path;
 *   3. otherwise run qemuMigrationDstFinishFresh();
 *   4. format an output cookie carrying QEMU_MIGRATION_COOKIE_STATS and call
 *      qemuMigrationDstComplete();
 *   5. on the error path stop the QEMU process with reason
 *      VIR_DOMAIN_SHUTOFF_FAILED.
 *
 * Returns dom on the visible success path and NULL on the error path.
 */
static virDomainPtr
qemuMigrationDstFinishActive(virQEMUDriver *driver,
virConnectPtr dconn,
virDomainObj *vm,
int cookie_flags,
const char *cookiein,
int cookieinlen,
char **cookieout,
int *cookieoutlen,
unsigned int flags,
int retcode,
bool v3proto,
unsigned long long timeReceived,
bool *finishJob)
{
...
if (!(mig = qemuMigrationCookieParse(driver, vm, vm->def, priv->origname,
priv->qemuCaps,
cookiein, cookieinlen, cookie_flags)))
goto error;

if (retcode != 0) {
/* Check for a possible error on the monitor in case Finish was called
* earlier than monitor EOF handler got a chance to process the error
*/
qemuDomainCheckMonitor(vm, VIR_ASYNC_JOB_MIGRATION_IN);
goto error;
}
...
/* How rc is evaluated is part of the elided code. */
rc = qemuMigrationDstFinishFresh(driver, vm, mig, flags, v3proto,
timeReceived, &doKill, &inPostCopy);
...
/* A cookie formatting failure is only logged, it does not fail Finish. */
if (qemuMigrationCookieFormat(mig, driver, vm,
QEMU_MIGRATION_DESTINATION,
cookieout, cookieoutlen,
QEMU_MIGRATION_COOKIE_STATS) < 0)
VIR_WARN("Unable to encode migration cookie");

qemuMigrationDstComplete(driver, vm, inPostCopy,
VIR_ASYNC_JOB_MIGRATION_IN, vm->job);

return dom;

error:
...
qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED,
VIR_ASYNC_JOB_MIGRATION_IN,
VIR_QEMU_PROCESS_STOP_MIGRATED);
...
return NULL;
}

其会调用qemuMigrationCookieParse()处理源端的cookie信息,获取源端perform阶段热迁移的能力信息;调用qemuMigrationCookieFormat()生成包含目的端finish阶段额外热迁移能力信息的cookie。如果perform阶段成功,会调用qemuMigrationDstFinishFresh()恢复目的端子机的运行;否则调用qemuProcessStop()销毁目的端子机

处理cookie

这里和begin阶段一样,使用qemuMigrationCookieParse根据flags指定的配置字段,反序列化cookie字符串到对应的struct _qemuMigrationCookie类型数据中,其为源端perform阶段的热迁移额外配置信息

处理热迁移结果

热迁移成功

如果perform阶段热迁移成功,则调用qemuMigrationDstFinishFresh()恢复目的端子机的运行,并在后续confirm阶段清理源端的子机资源即可,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
//#0  qemuMigrationDstFinishFresh
// (driver=<optimized out>, vm=<optimized out>, mig=<optimized out>, flags=<optimized out>, v3proto=<optimized out>, timeReceived=<optimized out>, doKill=<optimized out>, inPostCopy=<optimized out>) at ../src/qemu/qemu_migration.c:6715
//#1 qemuMigrationDstFinishActive
// (cookie_flags=60, driver=0x56109f030930, dconn=0x7f93fc002f30, vm=<optimized out>, cookiein=0x7f941c041dc0 "0\t\003\237\020V", cookieinlen=32659, cookieout=0x7f942be99970, cookieoutlen=0x7f942be99964, flags=131073, retcode=0, v3proto=true, timeReceived=1780190289045, finishJob=<synthetic pointer>) at ../src/qemu/qemu_migration.c:6884
//#2 qemuMigrationDstFinish
// (driver=driver@entry=0x56109f030930, dconn=dconn@entry=0x7f93fc002f30, vm=<optimized out>, cookiein=cookiein@entry=0x7f93fc001960 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>0cd0f6c9-2206-41ef-bb03-808873900a38</uuid>\n <hostname>src</hostname>\n <hostuuid>ba8b4bb8-c4bf-4da4-87bd-7f7a1ca07a7c</hostuuid>\n <statistics>\n"..., cookieinlen=cookieinlen@entry=1141, cookieout=cookieout@entry=0x7f942be99970, cookieoutlen=0x7f942be99964, flags=131073, retcode=0, v3proto=true) at ../src/qemu/qemu_migration.c:7010
//#3 0x00007f9428d524b7 in qemuDomainMigrateFinish3Params
// (dconn=0x7f93fc002f30, params=0x7f93fc001de0, nparams=3, cookiein=0x7f93fc001960 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>0cd0f6c9-2206-41ef-bb03-808873900a38</uuid>\n <hostname>src</hostname>\n <hostuuid>ba8b4bb8-c4bf-4da4-87bd-7f7a1ca07a7c</hostuuid>\n <statistics>\n"..., cookieinlen=1141, cookieout=0x7f942be99970, cookieoutlen=0x7f942be99964, flags=131073, cancelled=0) at ../src/qemu/qemu_driver.c:11395
//#4 0x00007f942f4eb695 in virDomainMigrateFinish3Params
// (dconn=dconn@entry=0x7f93fc002f30, params=0x7f93fc001de0, nparams=3, cookiein=0x7f93fc001960 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>0cd0f6c9-2206-41ef-bb03-808873900a38</uuid>\n <hostname>src</hostname>\n <hostuuid>ba8b4bb8-c4bf-4da4-87bd-7f7a1ca07a7c</hostuuid>\n <statistics>\n"..., cookieinlen=1141, cookieout=cookieout@entry=0x7f942be99970, cookieoutlen=0x7f942be99964, flags=131073, cancelled=0) at ../src/libvirt-domain.c:5485
//#5 0x000056109159788f in remoteDispatchDomainMigrateFinish3Params
// (server=0x56109f01fbf0, msg=0x56109f01c820, client=<optimized out>, rerr=0x7f942be99a30, args=0x56109f023d30, ret=0x56109f01d390) at ../src/remote/remote_daemon_dispatch.c:5806
//#6 remoteDispatchDomainMigrateFinish3ParamsHelper (server=0x56109f01fbf0, client=<optimized out>, msg=0x56109f01c820, rerr=0x7f942be99a30, args=0x56109f023d30, ret=0x56109f01d390)
// at src/remote/remote_daemon_dispatch_stubs.h:8656
//#7 0x00007f942f46888e in virNetServerProgramDispatchCall (prog=0x56109f020430, server=0x56109f01fbf0, client=0x7f9420000fd0, msg=0x56109f01c820) at ../src/rpc/virnetserverprogram.c:423
//#8 virNetServerProgramDispatch (prog=0x56109f020430, server=server@entry=0x56109f01fbf0, client=client@entry=0x7f9420000fd0, msg=msg@entry=0x56109f01c820)
// at ../src/rpc/virnetserverprogram.c:299
//#9 0x00007f942f46de22 in virNetServerProcessMsg (srv=srv@entry=0x56109f01fbf0, client=0x7f9420000fd0, prog=<optimized out>, msg=0x56109f01c820) at ../src/rpc/virnetserver.c:135
//#10 0x00007f942f46e15d in virNetServerHandleJob (jobOpaque=0x7f941c09e460, opaque=0x56109f01fbf0) at ../src/rpc/virnetserver.c:155
//#11 0x00007f942f3b58bd in virThreadPoolWorker (opaque=<optimized out>) at ../src/util/virthreadpool.c:164
//#12 0x00007f942f3b4f44 in virThreadHelper (data=<optimized out>) at ../src/util/virthread.c:256
//#13 0x00007f942ed0eb7b in ??? () at /lib/x86_64-linux-gnu/libc.so.6
//#14 0x00007f942ed8c7f8 in ??? () at /lib/x86_64-linux-gnu/libc.so.6
/*
 * qemuMigrationDstFinishFresh (excerpt; "..." marks elided code).
 *
 * Visible steps on the destination: wait for the incoming migration to
 * complete, refresh the device state, then start the guest CPUs. Each of
 * the three steps returns -1 on failure only when @v3proto is true; with
 * the v2 protocol the failure is tolerated and execution continues.
 *
 * @doKill and @inPostCopy are output parameters that are not written in the
 * visible excerpt.
 *
 * Returns 0 on the visible success path, -1 on a v3 failure.
 */
static int
qemuMigrationDstFinishFresh(virQEMUDriver *driver,
virDomainObj *vm,
qemuMigrationCookie *mig,
unsigned int flags,
bool v3proto,
unsigned long long timeReceived,
bool *doKill,
bool *inPostCopy)
{
...
/* We need to wait for QEMU to process all data sent by the source
* before starting guest CPUs.
*/
if (qemuMigrationDstWaitForCompletion(vm,
VIR_ASYNC_JOB_MIGRATION_IN,
!!(flags & VIR_MIGRATE_POSTCOPY)) < 0) {
/* There's not much we can do for v2 protocol since the
* original domain on the source host is already gone.
*/
if (v3proto)
return -1;
}

/* Now that the state data was transferred we can refresh the actual state
* of the devices */
if (qemuProcessRefreshState(driver, vm, VIR_ASYNC_JOB_MIGRATION_IN) < 0) {
/* Similarly to the case above v2 protocol will not be able to recover
* from this. Let's ignore this and perhaps stuff will not break. */
if (v3proto)
return -1;
}

...
if (qemuProcessStartCPUs(driver, vm, runningReason,
VIR_ASYNC_JOB_MIGRATION_IN) < 0) {
/* Make sure some error is reported if the callee did not set one. */
if (virGetLastErrorCode() == VIR_ERR_OK)
virReportError(VIR_ERR_INTERNAL_ERROR,
"%s", _("resume operation failed"));
/*
* In v3 protocol, the source VM is still available to
* restart during confirm() step, so we kill it off
* now.
* In v2 protocol, the source is dead, so we leave
* target in paused state, in case admin can fix
* things up.
*/
if (v3proto)
return -1;
}
...
return 0;
}

可以看到,在qemuMigrationDstWaitForCompletion()等待qemu相关event后,libvirt会调用qemuProcessStartCPUs()恢复目的端子机的运行

热迁移失败

如果perform阶段热迁移失败,则调用qemuProcessStop()销毁目的端子机,并在后续confirm阶段恢复源端的子机运行即可,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
//#0  qemuProcessKill (vm=vm@entry=0x7f4394049ac0, flags=flags@entry=5) at ../src/qemu/qemu_process.c:8509
//#1 0x00007f43d04192c0 in qemuProcessStop
// (driver=driver@entry=0x559a2829d930, vm=vm@entry=0x7f4394049ac0, reason=reason@entry=VIR_DOMAIN_SHUTOFF_FAILED, asyncJob=asyncJob@entry=VIR_ASYNC_JOB_MIGRATION_IN, flags=flags@entry=1)
// at ../src/qemu/qemu_process.c:8654
//#2 0x00007f43d03d78c0 in qemuMigrationDstPrepareActive
// (driver=0x7f43b8072310, vm=<optimized out>, dconn=<optimized out>, mig=<optimized out>, st=<optimized out>, protocol=<optimized out>, port=<optimized out>, listenAddress=<optimized out>, migrate_disks=<optimized out>, nbdPort=<optimized out>, nbdURI=<optimized out>, migParams=<optimized out>, flags=<optimized out>) at ../src/qemu/qemu_migration.c:3380
//#3 qemuMigrationDstPrepareFresh
// (driver=0x7f43b8072310, dconn=<optimized out>, cookiein=<optimized out>, cookieinlen=<optimized out>, cookieout=<optimized out>, cookieoutlen=<optimized out>, def=<optimized out>, origname=<optimized out>, st=<optimized out>, protocol=<optimized out>, port=<optimized out>, autoPort=<optimized out>, listenAddress=<optimized out>, migrate_disks=<optimized out>, nbdPort=<optimized out>, nbdURI=<optimized out>, migParams=<optimized out>, flags=<optimized out>) at ../src/qemu/qemu_migration.c:3510
//#4 qemuMigrationDstPrepareAny
// (driver=driver@entry=0x559a2829d930, dconn=dconn@entry=0x7f43c4002f30, cookiein=cookiein@entry=0x7f43b80011c0 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>f1e073d4-48c5-4374-bfcb-45ee42faa93e</uuid>\n <hostname>src</hostname>\n <hostuuid>eb399419-9dfb-496f-98cd-6a5070c174a6</hostuuid>\n <feature name"..., cookieinlen=cookieinlen@entry=877, cookieout=cookieout@entry=0x7f43d44e2970, cookieoutlen=cookieoutlen@entry=0x7f43d44e2964, def=<optimized out>, origname=<optimized out>, st=<optimized out>, protocol=<optimized out>, port=<optimized out>, autoPort=<optimized out>, listenAddress=<optimized out>, migrate_disks=<optimized out>, nbdPort=<optimized out>, nbdURI=<optimized out>, migParams=<optimized out>, flags=<optimized out>)
// at ../src/qemu/qemu_migration.c:3732
//#5 0x00007f43d03d9893 in qemuMigrationDstPrepareDirect
// (driver=driver@entry=0x559a2829d930, dconn=dconn@entry=0x7f43c4002f30, cookiein=cookiein@entry=0x7f43b80011c0 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>f1e073d4-48c5-4374-bfcb-45ee42faa93e</uuid>\n <hostname>src</hostname>\n <hostuuid>eb399419-9dfb-496f-98cd-6a5070c174a6</hostuuid>\n <feature name"..., cookieinlen=cookieinlen@entry=877, cookieout=cookieout@entry=0x7f43d44e2970, cookieoutlen=cookieoutlen@entry=0x7f43d44e2964, uri_in=0x7f43b8001610 "tcp://172.192.169.131", uri_out=0x7f43b8001bc0, def=0x7f43d44e2818, origname=0x0, listenAddress=0x0, migrate_disks=0x0, nbdPort=0, nbdURI=0x0, migParams=0x7f43b8001d30, flags=131073) at ../src/qemu/qemu_migration.c:3939
//#6 0x00007f43d039a321 in qemuDomainMigratePrepare3Params
// (dconn=0x7f43c4002f30, params=<optimized out>, nparams=2, cookiein=0x7f43b80011c0 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>f1e073d4-48c5-4374-bfcb-45ee42faa93e</uuid>\n <hostname>src</hostname>\n <hostuuid>eb399419-9dfb-496f-98cd-6a5070c174a6</hostuuid>\n <feature name"..., cookieinlen=877, cookieout=0x7f43d44e2970, cookieoutlen=0x7f43d44e2964, uri_out=0x7f43b8001bc0, flags=131073) at ../src/qemu/qemu_driver.c:11054
//#7 0x00007f43d6330bfc in virDomainMigratePrepare3Params
// (dconn=dconn@entry=0x7f43c4002f30, params=<optimized out>, nparams=2, cookiein=0x7f43b80011c0 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>f1e073d4-48c5-4374-bfcb-45ee42faa93e</uuid>\n <hostname>src</hostname>\n <hostuuid>eb399419-9dfb-496f-98cd-6a5070c174a6</hostuuid>\n <feature name"..., cookieinlen=877, cookieout=cookieout@entry=0x7f43d44e2970, cookieoutlen=0x7f43d44e2964, uri_out=0x7f43b8001bc0, flags=131073) at ../src/libvirt-domain.c:5319
//#8 0x0000559a0cf55f58 in remoteDispatchDomainMigratePrepare3Params
// (server=<optimized out>, msg=0x559a2829b1a0, client=<optimized out>, rerr=0x7f43d44e2a30, args=0x559a2828d030, ret=0x559a28273de0) at ../src/remote/remote_daemon_dispatch.c:5629
//#9 remoteDispatchDomainMigratePrepare3ParamsHelper (server=<optimized out>, client=<optimized out>, msg=0x559a2829b1a0, rerr=0x7f43d44e2a30, args=0x559a2828d030, ret=0x559a28273de0)
// at src/remote/remote_daemon_dispatch_stubs.h:9051
//#10 0x00007f43d62ae88e in virNetServerProgramDispatchCall (prog=0x559a2828d430, server=0x559a2828cbf0, client=0x7f43bc000ec0, msg=0x559a2829b1a0) at ../src/rpc/virnetserverprogram.c:423
//#11 virNetServerProgramDispatch (prog=0x559a2828d430, server=server@entry=0x559a2828cbf0, client=client@entry=0x7f43bc000ec0, msg=msg@entry=0x559a2829b1a0)
// at ../src/rpc/virnetserverprogram.c:299
//#12 0x00007f43d62b3e22 in virNetServerProcessMsg (srv=srv@entry=0x559a2828cbf0, client=0x7f43bc000ec0, prog=<optimized out>, msg=0x559a2829b1a0) at ../src/rpc/virnetserver.c:135
//#13 0x00007f43d62b415d in virNetServerHandleJob (jobOpaque=0x559a28276dc0, opaque=0x559a2828cbf0) at ../src/rpc/virnetserver.c:155
//#14 0x00007f43d61fb8bd in virThreadPoolWorker (opaque=<optimized out>) at ../src/util/virthreadpool.c:164
//#15 0x00007f43d61faf44 in virThreadHelper (data=<optimized out>) at ../src/util/virthread.c:256
//#16 0x00007f43d5b54b7b in ??? () at /lib/x86_64-linux-gnu/libc.so.6
//#17 0x00007f43d5bd27f8 in ??? () at /lib/x86_64-linux-gnu/libc.so.6
/*
 * qemuProcessStop (excerpt; "..." marks elided code).
 *
 * Tears down a domain's QEMU process and the host-side resources attached
 * to it. In the visible part the process is force-killed, then helper
 * daemons, network filters, ports, the monitor socket, cgroups, resctrl
 * allocations, security labels and per-domain directories are cleaned up,
 * and the domain is finally put into VIR_DOMAIN_SHUTOFF with @reason.
 *
 * @driver:   QEMU driver state
 * @vm:       domain object to stop
 * @reason:   shutoff reason recorded on the domain
 * @asyncJob: async job the caller runs in; any value other than
 *            VIR_ASYNC_JOB_NONE makes the function end a job at the endjob
 *            label
 * @flags:    VIR_QEMU_PROCESS_STOP_* bits (MIGRATED and NO_RELABEL are
 *            consulted in the visible code)
 */
void qemuProcessStop(virQEMUDriver *driver,
virDomainObj *vm,
virDomainShutoffReason reason,
virDomainAsyncJob asyncJob,
unsigned int flags)
{
...
/* Nothing to tear down for an inactive domain. */
if (!virDomainObjIsActive(vm)) {
VIR_DEBUG("VM '%s' not active", vm->def->name);
goto endjob;
}
...
/* shut it off for sure */
ignore_value(qemuProcessKill(vm,
VIR_QEMU_PROCESS_KILL_FORCE|
VIR_QEMU_PROCESS_KILL_NOCHECK));
...
qemuProcessBuildDestroyMemoryPaths(driver, vm, NULL, false);

/* Do this before we delete the tree and remove pidfile. */
qemuProcessKillManagedPRDaemon(vm);

qemuDomainCleanupRun(driver, vm);

/* True only when stopping the source of an outgoing migration. */
outgoingMigration = (flags & VIR_QEMU_PROCESS_STOP_MIGRATED) &&
(asyncJob == VIR_ASYNC_JOB_MIGRATION_OUT);

qemuExtDevicesStop(driver, vm, outgoingMigration);

qemuDBusStop(driver, vm);

/* Only after this point we can reset 'priv->beingDestroyed' so that
* there's no point at which the VM could be considered as alive between
* entering the destroy job and this point where the active "flag" is
* cleared.
*/
vm->def->id = -1;
priv->beingDestroyed = false;

/* No unlocking of @vm after this point until whole cleanup is done. */

/* Wake up anything waiting on domain condition */
virDomainObjBroadcast(vm);

if (priv->eventThread)
g_object_unref(g_steal_pointer(&priv->eventThread));

/* When the active-domain counter drops to zero, notify the inhibit
 * callback (if one is registered) with false. */
if (g_atomic_int_dec_and_test(&driver->nactive) && driver->inhibitCallback)
driver->inhibitCallback(false, driver->inhibitOpaque);

/* Clear network bandwidth */
virDomainClearNetBandwidth(vm->def);

virDomainConfVMNWFilterTeardown(vm);

/* With macFilter enabled, remove the ebtables rule of every interface
 * that has an ifname. */
if (cfg->macFilter) {
for (i = 0; i < def->nnets; i++) {
virDomainNetDef *net = def->nets[i];
if (net->ifname == NULL)
continue;
ignore_value(ebtablesRemoveForwardAllowIn(driver->ebtables,
net->ifname,
&net->mac));
}
}

virPortAllocatorRelease(priv->nbdPort);
priv->nbdPort = 0;

/* Remove the monitor's UNIX socket file (if that is the monitor type)
 * and drop the monitor configuration. */
if (priv->monConfig) {
if (priv->monConfig->type == VIR_DOMAIN_CHR_TYPE_UNIX)
unlink(priv->monConfig->data.nix.path);
g_clear_pointer(&priv->monConfig, virObjectUnref);
}

/* Remove the master key */
qemuDomainMasterKeyRemove(priv);

ignore_value(virDomainChrDefForeach(vm->def,
false,
qemuProcessCleanupChardevDevice,
NULL));


/* Its namespace is also gone then. */
qemuDomainDestroyNamespace(driver, vm);

virFileDeleteTree(priv->libDir);
virFileDeleteTree(priv->channelTargetDir);

/* Stop autodestroy in case guest is restarted */
virCloseCallbacksDomainRemove(vm, NULL, qemuProcessAutoDestroy);

/* now that we know it's stopped call the hook if present */
if (virHookPresent(VIR_HOOK_DRIVER_QEMU)) {
g_autofree char *xml = qemuDomainDefFormatXML(driver, NULL, vm->def, 0);

/* we can't stop the operation even if the script raised an error */
ignore_value(virHookCall(VIR_HOOK_DRIVER_QEMU, vm->def->name,
VIR_HOOK_QEMU_OP_STOPPED, VIR_HOOK_SUBOP_END,
NULL, xml, NULL));
}

/* Reset Security Labels unless caller don't want us to */
if (!(flags & VIR_QEMU_PROCESS_STOP_NO_RELABEL))
qemuSecurityRestoreAllLabel(driver, vm,
!!(flags & VIR_QEMU_PROCESS_STOP_MIGRATED));

/* Clear out dynamically assigned labels */
/* Note: only the 'label' free is guarded by the DYNAMIC check (the if has
 * no braces); 'imagelabel' is freed for every seclabel. */
for (i = 0; i < vm->def->nseclabels; i++) {
if (vm->def->seclabels[i]->type == VIR_DOMAIN_SECLABEL_DYNAMIC)
VIR_FREE(vm->def->seclabels[i]->label);
VIR_FREE(vm->def->seclabels[i]->imagelabel);
}

qemuHostdevReAttachDomainDevices(driver, vm->def);
for (i = 0; i < def->nnets; i++) {
virDomainNetDef *net = def->nets[i];
virDomainInterfaceDeleteDevice(def,
net,
QEMU_DOMAIN_NETWORK_PRIVATE(net)->created,
cfg->stateDir);
}

/* Cgroup removal is retried while it fails with -EBUSY and the retries
 * counter is below 5, sleeping 200 ms between attempts; any other failure
 * (or running out of retries) is only logged. */
retry:
if ((ret = virDomainCgroupRemoveCgroup(vm, priv->cgroup, priv->machineName)) < 0) {
if (ret == -EBUSY && (retries++ < 5)) {
g_usleep(200*1000);
goto retry;
}
VIR_WARN("Failed to remove cgroup for %s",
vm->def->name);
}

/* Remove resctrl allocation after cgroups are cleaned up which makes it
* kind of safer (although removing the allocation should work even with
* pids in tasks file */
for (i = 0; i < vm->def->nresctrls; i++) {
size_t j = 0;

for (j = 0; j < vm->def->resctrls[i]->nmonitors; j++) {
virDomainResctrlMonDef *mon = NULL;

mon = vm->def->resctrls[i]->monitors[j];
virResctrlMonitorRemove(mon->instance);
}

virResctrlAllocRemove(vm->def->resctrls[i]->alloc);
}

qemuProcessRemoveDomainStatus(driver, vm);

/* Remove VNC and Spice ports from port reservation bitmap, but only if
they were reserved by the driver (autoport=yes)
*/
for (i = 0; i < vm->def->ngraphics; ++i) {
virDomainGraphicsDef *graphics = vm->def->graphics[i];
if (graphics->type == VIR_DOMAIN_GRAPHICS_TYPE_VNC) {
if (graphics->data.vnc.portReserved) {
virPortAllocatorRelease(graphics->data.vnc.port);
graphics->data.vnc.portReserved = false;
}
if (graphics->data.vnc.websocketReserved) {
virPortAllocatorRelease(graphics->data.vnc.websocket);
graphics->data.vnc.websocketReserved = false;
}
if (graphics->data.vnc.websocketGenerated) {
graphics->data.vnc.websocketGenerated = false;
graphics->data.vnc.websocket = -1;
}
}
if (graphics->type == VIR_DOMAIN_GRAPHICS_TYPE_SPICE) {
if (graphics->data.spice.portReserved) {
virPortAllocatorRelease(graphics->data.spice.port);
graphics->data.spice.portReserved = false;
}

if (graphics->data.spice.tlsPortReserved) {
virPortAllocatorRelease(graphics->data.spice.tlsPort);
graphics->data.spice.tlsPortReserved = false;
}
}
}

/* Reset the per-run bookkeeping kept on the domain object and record the
 * SHUTOFF state with the caller-supplied reason. */
for (i = 0; i < vm->ndeprecations; i++)
g_free(vm->deprecations[i]);
g_clear_pointer(&vm->deprecations, g_free);
vm->ndeprecations = 0;
vm->taint = 0;
vm->pid = 0;
virDomainObjSetState(vm, VIR_DOMAIN_SHUTOFF, reason);
for (i = 0; i < vm->def->niothreadids; i++)
vm->def->iothreadids[i]->thread_id = 0;

/* clean up a possible backup job */
if (priv->backup)
qemuBackupJobTerminate(vm, VIR_DOMAIN_JOB_STATUS_CANCELED);

/* Do this explicitly after vm->pid is reset so that security drivers don't
* try to enter the domain's namespace which is non-existent by now as qemu
* is no longer running. */
if (!(flags & VIR_QEMU_PROCESS_STOP_NO_RELABEL)) {
for (i = 0; i < def->ndisks; i++) {
virDomainDiskDef *disk = def->disks[i];

if (disk->mirror) {
if (qemuSecurityRestoreImageLabel(driver, vm, disk->mirror, false) < 0)
VIR_WARN("Unable to restore security label on %s", disk->dst);

if (virStorageSourceChainHasNVMe(disk->mirror))
qemuHostdevReAttachOneNVMeDisk(driver, vm->def->name, disk->mirror);
}

qemuBlockRemoveImageMetadata(driver, vm, disk->dst, disk->src);

/* for now transient disks are forbidden with migration so they
* can be handled here */
if (disk->transient &&
QEMU_DOMAIN_DISK_PRIVATE(disk)->transientOverlayCreated) {
VIR_DEBUG("Removing transient overlay '%s' of disk '%s'",
disk->src->path, disk->dst);
if (qemuDomainStorageFileInit(driver, vm, disk->src, NULL) >= 0) {
virStorageSourceUnlink(disk->src);
virStorageSourceDeinit(disk->src);
}
}
}
}

qemuSecurityReleaseLabel(driver->securityManager, vm->def);

/* clear all private data entries which are no longer needed */
qemuDomainObjPrivateDataClear(priv);

/* The "release" hook cleans up additional resources */
if (virHookPresent(VIR_HOOK_DRIVER_QEMU)) {
g_autofree char *xml = qemuDomainDefFormatXML(driver, NULL, vm->def, 0);

/* we can't stop the operation even if the script raised an error */
virHookCall(VIR_HOOK_DRIVER_QEMU, vm->def->name,
VIR_HOOK_QEMU_OP_RELEASE, VIR_HOOK_SUBOP_END,
virDomainShutoffReasonTypeToString(reason), xml, NULL);
}

virDomainObjRemoveTransientDef(vm);

/* A job is ended here only when the caller passed a real async job. */
endjob:
if (asyncJob != VIR_ASYNC_JOB_NONE)
virDomainObjEndJob(vm);

/* Restore the error held in orig_err (presumably saved in the elided
 * prologue - TODO confirm). */
cleanup:
virErrorRestore(&orig_err);
}

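可以看到,其调用qemuProcessKill()将子机进程kill掉,并调用相关函数清理子机资源,从而销毁目的端子机。需要注意的是,上面的调用栈是在prepare阶段的失败路径(qemuMigrationDstPrepareActive())中抓取的;finish阶段失败时,则是由qemuMigrationDstFinishActive()的error分支调用qemuProcessStop(),后续的清理流程相同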

生成cookie

这里和begin阶段一样,使用qemuMigrationCookieFormat根据flags指定的配置字段,序列化struct _qemuMigrationCookie类型的cookie,其为目的端finish阶段的热迁移额外配置信息

confirm阶段

概述

confirm阶段则是在源端处理热迁移结果,即如果此次热迁移成功,则销毁子机即可;如果此次热迁移失败,则需要重新恢复源端子机的运行,流程如下所示

热迁移confirm阶段流程图

confirm阶段的代码如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/*
 * virDomainMigrateVersion3Full - Confirm step (excerpt; "..." marks elided
 * code).
 *
 * After Finish, the source driver's Confirm callback is invoked (only when
 * notify_source is set) with the cookie produced by Finish and a cancelled
 * flag that is 1 when Finish returned no destination domain.
 *
 * When useParams is true the extensible-parameter callback
 * (domainMigrateConfirm3Params) is used, otherwise domainMigrateConfirm3.
 */
static virDomainPtr
virDomainMigrateVersion3Full(virDomainPtr domain,
virConnectPtr dconn,
const char *xmlin,
const char *dname,
const char *uri,
unsigned long long bandwidth,
virTypedParameterPtr params,
int nparams,
bool useParams,
unsigned int flags)
{
/* If ddomain is NULL, then we were unable to start
* the guest on the target, and must restart on the
* source. There is a small chance that the ddomain
* is NULL due to an RPC failure, in which case
* ddomain could in fact be running on the dest.
* The lock manager plugins should take care of
* safety in this scenario.
*/
cancelled = ddomain == NULL ? 1 : 0;
...
confirm:
/*
* If cancelled, then src VM will be restarted, else it will be killed.
* Don't do this if migration failed on source and thus it was already
* cancelled there.
*/
if (notify_source) {
VIR_DEBUG("Confirm3 %p ret=%d domain=%p", domain->conn, ret, domain);
/* The output cookie of Finish becomes the input cookie of Confirm;
 * g_steal_pointer() hands the buffer over and resets cookieout. */
VIR_FREE(cookiein);
cookiein = g_steal_pointer(&cookieout);
cookieinlen = cookieoutlen;
cookieoutlen = 0;
/* Confirm runs on the source connection (domain->conn), with the
 * protection bits added to the caller's flags. */
if (useParams) {
ret = domain->conn->driver->domainMigrateConfirm3Params
(domain, params, nparams, cookiein, cookieinlen,
flags | protection, cancelled);
} else {
ret = domain->conn->driver->domainMigrateConfirm3
(domain, cookiein, cookieinlen,
flags | protection, cancelled);
}
/* If Confirm3 returns -1, there's nothing more we can
* do, but fortunately worst case is that there is a
* domain left in 'paused' state on source.
*/
if (ret < 0) {
VIR_WARN("Guest %s probably left in 'paused' state on source",
domain->name);
}
}
...
}

其核心就是调用driver->domainMigrateConfirm3Params函数指针,即qemuDomainMigrateConfirm3Params(),其基于前面finish阶段的结果,处理目的端返回的cookie信息和目的端的虚拟机状态。其中核心逻辑在qemuMigrationSrcConfirmPhase(),如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
//#0  qemuMigrationSrcConfirmPhase
// (driver=driver@entry=0x55dd7d714930, vm=0x7fc71002eba0, cookiein=cookiein@entry=0x7fc724002360 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>25e32d71-1e57-46b3-b9e9-c617ea041627</uuid>\n <hostname>dst</hostname>\n <hostuuid>c871cf8c-8c85-44e6-a828-f21b4d0b221f</hostuuid>\n <statistics>\n"..., cookieinlen=cookieinlen@entry=1165, flags=flags@entry=131329, retcode=retcode@entry=0) at ../src/qemu/qemu_migration.c:4044
//#1 0x00007fc74840615b in qemuMigrationSrcConfirm
// (driver=0x55dd7d714930, vm=<optimized out>, cookiein=cookiein@entry=0x7fc724002360 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>25e32d71-1e57-46b3-b9e9-c617ea041627</uuid>\n <hostname>dst</hostname>\n <hostuuid>c871cf8c-8c85-44e6-a828-f21b4d0b221f</hostuuid>\n <statistics>\n"..., cookieinlen=cookieinlen@entry=1165, flags=flags@entry=131329, cancelled=cancelled@entry=0) at ../src/qemu/qemu_migration.c:4182
//#2 0x00007fc7483c9236 in qemuDomainMigrateConfirm3Params
// (domain=0x7fc724001310, params=<optimized out>, nparams=<optimized out>, cookiein=0x7fc724002360 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>25e32d71-1e57-46b3-b9e9-c617ea041627</uuid>\n <hostname>dst</hostname>\n <hostuuid>c871cf8c-8c85-44e6-a828-f21b4d0b221f</hostuuid>\n <statistics>\n"..., cookieinlen=1165, flags=131329, cancelled=0)
// at ../src/qemu/qemu_driver.c:11449
//#3 0x00007fc74cb5ea30 in virDomainMigrateConfirm3Params
// (domain=domain@entry=0x7fc724001310, params=0x7fc724001560, nparams=3, cookiein=0x7fc724002360 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>25e32d71-1e57-46b3-b9e9-c617ea041627</uuid>\n <hostname>dst</hostname>\n <hostuuid>c871cf8c-8c85-44e6-a828-f21b4d0b221f</hostuuid>\n <statistics>\n"..., cookieinlen=1165, flags=131329, cancelled=0)
// at ../src/libvirt-domain.c:5537
//#4 0x000055dd405cad3c in remoteDispatchDomainMigrateConfirm3Params (server=0x55dd7d703bf0, msg=0x55dd7d7121a0, client=<optimized out>, rerr=0x7fc74a50ea30, args=0x7fc724002cd0)
// at ../src/remote/remote_daemon_dispatch.c:5863
//#5 remoteDispatchDomainMigrateConfirm3ParamsHelper (server=0x55dd7d703bf0, client=<optimized out>, msg=0x55dd7d7121a0, rerr=0x7fc74a50ea30, args=0x7fc724002cd0, ret=0x0)
// at src/remote/remote_daemon_dispatch_stubs.h:8488
//#6 0x00007fc74cadb88e in virNetServerProgramDispatchCall (prog=0x55dd7d704430, server=0x55dd7d703bf0, client=0x7fc720081370, msg=0x55dd7d7121a0) at ../src/rpc/virnetserverprogram.c:423
//#7 virNetServerProgramDispatch (prog=0x55dd7d704430, server=server@entry=0x55dd7d703bf0, client=client@entry=0x7fc720081370, msg=msg@entry=0x55dd7d7121a0)
// at ../src/rpc/virnetserverprogram.c:299
//#8 0x00007fc74cae0e22 in virNetServerProcessMsg (srv=srv@entry=0x55dd7d703bf0, client=0x7fc720081370, prog=<optimized out>, msg=0x55dd7d7121a0) at ../src/rpc/virnetserver.c:135
//#9 0x00007fc74cae115d in virNetServerHandleJob (jobOpaque=0x55dd7d702ee0, opaque=0x55dd7d703bf0) at ../src/rpc/virnetserver.c:155
//#10 0x00007fc74ca288bd in virThreadPoolWorker (opaque=<optimized out>) at ../src/util/virthreadpool.c:164
//#11 0x00007fc74ca27f44 in virThreadHelper (data=<optimized out>) at ../src/util/virthread.c:256
//#12 0x00007fc74c381b7b in ??? () at /lib/x86_64-linux-gnu/libc.so.6
//#13 0x00007fc74c3ff7f8 in ??? () at /lib/x86_64-linux-gnu/libc.so.6
/*
 * Abridged excerpt of qemuMigrationSrcConfirmPhase(); omitted code is
 * marked "...".
 *
 * Visible behaviour: parse the incoming cookie, requesting the
 * QEMU_MIGRATION_COOKIE_STATS field, then branch on retcode. A retcode of 0
 * goes through qemuMigrationSrcComplete(); any other value goes through
 * qemuMigrationSrcRestoreDomainState().
 *
 * Returns -1 when the cookie cannot be parsed and 0 at the end of the
 * visible path.
 */
static int
qemuMigrationSrcConfirmPhase(virQEMUDriver *driver,
virDomainObj *vm,
const char *cookiein,
int cookieinlen,
unsigned int flags,
int retcode)
{
...
/* Decode the cookie passed in by the caller; only the statistics field
 * is requested here. */
if (!(mig = qemuMigrationCookieParse(driver, vm, vm->def, priv->origname,
priv->qemuCaps,
cookiein, cookieinlen,
QEMU_MIGRATION_COOKIE_STATS)))
return -1;
...
/* Did the migration go as planned? If yes, kill off the domain object.
* If something failed, resume CPUs, but only if we didn't use post-copy.
*/
if (retcode == 0) {
qemuMigrationSrcComplete(driver, vm, VIR_ASYNC_JOB_MIGRATION_OUT);
} else {
...
qemuMigrationSrcRestoreDomainState(driver, vm);
...
}

return 0;
}

其会调用qemuMigrationCookieParse()处理目的端的cookie信息,获取目的端finish阶段返回的热迁移统计信息(对应QEMU_MIGRATION_COOKIE_STATS标志)。如果finish阶段成功,会调用qemuMigrationSrcComplete()销毁源端子机;否则调用qemuMigrationSrcRestoreDomainState()恢复源端子机的运行

处理cookie

这里和begin阶段一样,使用qemuMigrationCookieParse(),根据flags指定的配置字段,将cookie字符串反序列化到对应的struct _qemuMigrationCookie类型数据中,其为目的端finish阶段的热迁移额外配置信息

处理热迁移结果

热迁移成功

如果perform阶段热迁移成功,则调用qemuMigrationSrcComplete()清理源端的子机资源即可,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
//#0  qemuMigrationSrcComplete (driver=driver@entry=0x55d52e773930, vm=vm@entry=0x7ff28407a7d0, asyncJob=asyncJob@entry=VIR_ASYNC_JOB_MIGRATION_OUT) at ../src/qemu/qemu_migration.c:3994
//#1 0x00007ff2bc347e68 in qemuMigrationSrcConfirmPhase
// (driver=driver@entry=0x55d52e773930, vm=0x7ff28407a7d0, cookiein=cookiein@entry=0x7ff2a80013c0 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>50107375-88a7-46e8-96cc-3d9637da642e</uuid>\n <hostname>dst</hostname>\n <hostuuid>aa0c5537-0711-46b5-b937-bd6fffe29840</hostuuid>\n <statistics>\n"..., cookieinlen=cookieinlen@entry=1166, flags=flags@entry=131329, retcode=retcode@entry=0) at ../src/qemu/qemu_migration.c:4112
//#2 0x00007ff2bc34815b in qemuMigrationSrcConfirm
// (driver=0x55d52e773930, vm=<optimized out>, cookiein=cookiein@entry=0x7ff2a80013c0 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>50107375-88a7-46e8-96cc-3d9637da642e</uuid>\n <hostname>dst</hostname>\n <hostuuid>aa0c5537-0711-46b5-b937-bd6fffe29840</hostuuid>\n <statistics>\n"..., cookieinlen=cookieinlen@entry=1166, flags=flags@entry=131329, cancelled=cancelled@entry=0) at ../src/qemu/qemu_migration.c:4182
//#3 0x00007ff2bc30b236 in qemuDomainMigrateConfirm3Params
// (domain=0x7ff2a8000d40, params=<optimized out>, nparams=<optimized out>, cookiein=0x7ff2a80013c0 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>50107375-88a7-46e8-96cc-3d9637da642e</uuid>\n <hostname>dst</hostname>\n <hostuuid>aa0c5537-0711-46b5-b937-bd6fffe29840</hostuuid>\n <statistics>\n"..., cookieinlen=1166, flags=131329, cancelled=0)
// at ../src/qemu/qemu_driver.c:11449
//#4 0x00007ff2c029fa30 in virDomainMigrateConfirm3Params
// (domain=domain@entry=0x7ff2a8000d40, params=0x7ff2a8001880, nparams=3, cookiein=0x7ff2a80013c0 "<qemu-migration>\n <name>migrate_guest</name>\n <uuid>50107375-88a7-46e8-96cc-3d9637da642e</uuid>\n <hostname>dst</hostname>\n <hostuuid>aa0c5537-0711-46b5-b937-bd6fffe29840</hostuuid>\n <statistics>\n"..., cookieinlen=1166, flags=131329, cancelled=0)
// at ../src/libvirt-domain.c:5537
//#5 0x000055d509e1dd3c in remoteDispatchDomainMigrateConfirm3Params (server=0x55d52e762bf0, msg=0x55d52e7711a0, client=<optimized out>, rerr=0x7ff2bcc4da30, args=0x7ff2a8000d90)
// at ../src/remote/remote_daemon_dispatch.c:5863
//#6 remoteDispatchDomainMigrateConfirm3ParamsHelper (server=0x55d52e762bf0, client=<optimized out>, msg=0x55d52e7711a0, rerr=0x7ff2bcc4da30, args=0x7ff2a8000d90, ret=0x0)
// at src/remote/remote_daemon_dispatch_stubs.h:8488
//#7 0x00007ff2c021c88e in virNetServerProgramDispatchCall (prog=0x55d52e763430, server=0x55d52e762bf0, client=0x7ff2a4000d60, msg=0x55d52e7711a0) at ../src/rpc/virnetserverprogram.c:423
//#8 virNetServerProgramDispatch (prog=0x55d52e763430, server=server@entry=0x55d52e762bf0, client=client@entry=0x7ff2a4000d60, msg=msg@entry=0x55d52e7711a0)
// at ../src/rpc/virnetserverprogram.c:299
//#9 0x00007ff2c0221e22 in virNetServerProcessMsg (srv=srv@entry=0x55d52e762bf0, client=0x7ff2a4000d60, prog=<optimized out>, msg=0x55d52e7711a0) at ../src/rpc/virnetserver.c:135
//#10 0x00007ff2c022215d in virNetServerHandleJob (jobOpaque=0x7ff2b0000e30, opaque=0x55d52e762bf0) at ../src/rpc/virnetserver.c:155
//#11 0x00007ff2c01698bd in virThreadPoolWorker (opaque=<optimized out>) at ../src/util/virthreadpool.c:164
//#12 0x00007ff2c0168f44 in virThreadHelper (data=<optimized out>) at ../src/util/virthread.c:256
//#13 0x00007ff2bfac2b7b in ??? () at /lib/x86_64-linux-gnu/libc.so.6
//#14 0x00007ff2bfb407f8 in ??? () at /lib/x86_64-linux-gnu/libc.so.6
/*
 * Abridged excerpt of qemuMigrationSrcComplete(); omitted code is
 * marked "...".
 *
 * The visible part stops the domain's QEMU process via qemuProcessStop(),
 * passing the shutoff reason VIR_DOMAIN_SHUTOFF_MIGRATED, the caller's
 * asyncJob and the VIR_QEMU_PROCESS_STOP_MIGRATED flag.
 */
void
qemuMigrationSrcComplete(virQEMUDriver *driver,
virDomainObj *vm,
virDomainAsyncJob asyncJob)
{
...
qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_MIGRATED, asyncJob,
VIR_QEMU_PROCESS_STOP_MIGRATED);
...
}

可以看到,libvirt会调用qemuProcessStop()销毁源端子机

热迁移失败

如果perform阶段热迁移失败,则调用qemuMigrationSrcRestoreDomainState()恢复源端子机的运行,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
//#0  qemuMigrationSrcRestoreDomainState (driver=driver@entry=0x7fb90800f700, vm=vm@entry=0x7fb908503370) at ../src/qemu/qemu_migration.c:224
//#1 0x00007fb92a70efbd in qemuMigrationSrcConfirmPhase
// (driver=driver@entry=0x7fb90800f700, vm=0x7fb908503370, cookiein=cookiein@entry=0x0, cookieinlen=cookieinlen@entry=0, flags=<optimized out>,
// flags@entry=131329, retcode=retcode@entry=1) at ../src/qemu/qemu_migration.c:4127
//#2 0x00007fb92a70f15b in qemuMigrationSrcConfirm
// (driver=0x7fb90800f700, vm=<optimized out>, cookiein=cookiein@entry=0x0, cookieinlen=cookieinlen@entry=0, flags=flags@entry=131329, cancelled=cancelled@entry=1)
// at ../src/qemu/qemu_migration.c:4182
//#3 0x00007fb92a6d2236 in qemuDomainMigrateConfirm3Params (domain=0x7fb92c0013b0, params=<optimized out>, nparams=<optimized out>, cookiein=0x0, cookieinlen=0, flags=131329, cancelled=1)
// at ../src/qemu/qemu_driver.c:11449
//#4 0x00007fb944e7ea30 in virDomainMigrateConfirm3Params (domain=domain@entry=0x7fb92c0013b0, params=0x7fb92c001400, nparams=2, cookiein=0x0, cookieinlen=0, flags=131329, cancelled=1)
// at ../src/libvirt-domain.c:5537
//#5 0x00005590f61c0d3c in remoteDispatchDomainMigrateConfirm3Params (server=0x559111617bf0, msg=0x55911161d800, client=<optimized out>, rerr=0x7fb94282ea30, args=0x7fb92c000d80)
// at ../src/remote/remote_daemon_dispatch.c:5863
//#6 remoteDispatchDomainMigrateConfirm3ParamsHelper (server=0x559111617bf0, client=<optimized out>, msg=0x55911161d800, rerr=0x7fb94282ea30, args=0x7fb92c000d80, ret=0x0)
// at src/remote/remote_daemon_dispatch_stubs.h:8488
//#7 0x00007fb944dfb88e in virNetServerProgramDispatchCall (prog=0x559111618430, server=0x559111617bf0, client=0x55911160dfa0, msg=0x55911161d800) at ../src/rpc/virnetserverprogram.c:423
//#8 virNetServerProgramDispatch (prog=0x559111618430, server=server@entry=0x559111617bf0, client=client@entry=0x55911160dfa0, msg=msg@entry=0x55911161d800)
// at ../src/rpc/virnetserverprogram.c:299
//#9 0x00007fb944e00e22 in virNetServerProcessMsg (srv=srv@entry=0x559111617bf0, client=0x55911160dfa0, prog=<optimized out>, msg=0x55911161d800) at ../src/rpc/virnetserver.c:135
//#10 0x00007fb944e0115d in virNetServerHandleJob (jobOpaque=0x5591115f7f60, opaque=0x559111617bf0) at ../src/rpc/virnetserver.c:155
//#11 0x00007fb944d488bd in virThreadPoolWorker (opaque=<optimized out>) at ../src/util/virthreadpool.c:164
//#12 0x00007fb944d47f44 in virThreadHelper (data=<optimized out>) at ../src/util/virthread.c:256
//#13 0x00007fb9446a1b7b in ??? () at /lib/x86_64-linux-gnu/libc.so.6
//#14 0x00007fb94471f7f8 in ??? () at /lib/x86_64-linux-gnu/libc.so.6
/*
 * Abridged excerpt of qemuMigrationSrcRestoreDomainState(); omitted code
 * (including the `cleanup` label and the return statement) is marked "...".
 *
 * The visible part tries to resume the guest's CPUs with the running reason
 * VIR_DOMAIN_RUNNING_MIGRATION_CANCELED. If that fails it logs an error and,
 * when the domain is in the PAUSED state, records the paused reason
 * VIR_DOMAIN_PAUSED_API_ERROR and queues a matching SUSPENDED lifecycle event
 * before jumping to cleanup.
 */
static bool
qemuMigrationSrcRestoreDomainState(virQEMUDriver *driver, virDomainObj *vm)
{
...
/* we got here through some sort of failure; start the domain again */
if (qemuProcessStartCPUs(driver, vm,
VIR_DOMAIN_RUNNING_MIGRATION_CANCELED,
VIR_ASYNC_JOB_MIGRATION_OUT) < 0) {
/* Hm, we already know we are in error here. We don't want to
* overwrite the previous error, though, so we just throw something
* to the logs and hope for the best */
VIR_ERROR(_("Failed to resume guest %1$s after failure"), vm->def->name);
/* Only re-tag the pause reason and emit an event when the domain is
 * currently reported as paused. */
if (virDomainObjGetState(vm, NULL) == VIR_DOMAIN_PAUSED) {
virObjectEvent *event;

virDomainObjSetState(vm, VIR_DOMAIN_PAUSED,
VIR_DOMAIN_PAUSED_API_ERROR);
event = virDomainEventLifecycleNewFromObj(vm,
VIR_DOMAIN_EVENT_SUSPENDED,
VIR_DOMAIN_EVENT_SUSPENDED_API_ERROR);
virObjectEventStateQueue(driver->domainEventState, event);
}
goto cleanup;
}
...
}

可以看到,libvirt会调用qemuProcessStartCPUs()恢复源端子机的运行

参考

  1. Guest migration
  2. Migration
  3. QEMU-KVM 热迁移:Live Migration
  4. libvirt live migration 流程
  5. Libvirt migration internals
  6. Coroutines in QEMU: The basics
  7. Iterative device migration
  8. QEMU Multifd迁移原理