前言

这里简单介绍一下QEMU的内存模型,即QEMU是如何管理gpa(guest physical address)到hva(host virtual address)的映射关系的。

其内存模型主要由RAMBlock、MemoryRegion、AddressSpace、FlatView等结构构成。

RAMBlock

无论如何,Qemu都需要申请一段内存空间用来存放虚拟机内存的真实数据,而这部分内存空间由struct RAMBlock来管理。

struct RAMBlock

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/*
 * RAMBlock: describes one chunk of host memory that QEMU allocated to hold
 * guest RAM contents. Blocks are chained together through @next and hang
 * off ram_list.blocks.
 */
struct RAMBlock {
struct rcu_head rcu;
/* MemoryRegion paired with this block (ram_block_add() reads mr->align) */
struct MemoryRegion *mr;
/* Host virtual address (hva) of the memory backing this block */
uint8_t *host;
uint8_t *colo_cache; /* For colo, VM's ram cache */
ram_addr_t offset;
ram_addr_t used_length;
/* Size passed to the allocator; also the sort key used by ram_block_add() */
ram_addr_t max_length;
void (*resized)(const char*, uint64_t length, void *host);
uint32_t flags;
/* Protected by the BQL. */
/* Identifier string of the block, e.g. "pc.ram" */
char idstr[256];
/* RCU-enabled, writes protected by the ramlist lock */
QLIST_ENTRY(RAMBlock) next;
QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers;
int fd;
uint64_t fd_offset;
size_t page_size;
/* dirty bitmap used during migration */
unsigned long *bmap;

/*
* Below fields are only used by mapped-ram migration
*/
/* bitmap of pages present in the migration file */
unsigned long *file_bmap;
/*
* offset in the file pages belonging to this ramblock are saved,
* used only during migration to a file.
*/
off_t bitmap_offset;
uint64_t pages_offset;

/* bitmap of already received pages in postcopy */
unsigned long *receivedmap;

/*
* bitmap to track already cleared dirty bitmap. When the bit is
* set, it means the corresponding memory chunk needs a log-clear.
* Set this up to non-NULL to enable the capability to postpone
* and split clearing of dirty bitmap on the remote node (e.g.,
* KVM). The bitmap will be set only when doing global sync.
*
* It is only used during src side of ram migration, and it is
* protected by the global ram_state.bitmap_mutex.
*
* NOTE: this bitmap is different comparing to the other bitmaps
* in that one bit can represent multiple guest pages (which is
* decided by the `clear_bmap_shift' variable below). On
* destination side, this should always be NULL, and the variable
* `clear_bmap_shift' is meaningless.
*/
unsigned long *clear_bmap;
uint8_t clear_bmap_shift;

/*
* RAM block length that corresponds to the used_length on the migration
* source (after RAM block sizes were synchronized). Especially, after
* starting to run the guest, used_length and postcopy_length can differ.
* Used to register/unregister uffd handlers and as the size of the received
* bitmap. Receiving any page beyond this length will bail out, as it
* could not have been valid on the source.
*/
ram_addr_t postcopy_length;
};

其中,host指向Qemu申请的内存空间的虚拟地址,也就是hva。

而所有的struct RAMBlock通过其next指针形成单链表,存储在ram_list中,如下图所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
         ram_list                 
┌───────┬──┐
│blocks │ ├────┐
└───────┴──┘ │


struct RAMBlock◄─┘
┌─────┬──────┐
│idstr│pc.ram│
├─────┼──────┤
│next │ ├─────┐
└─────┴──────┘ │

struct RAMBlock◄───┘
┌─────┬─────────────────────┐
│idstr│0000:00:02.0/vga.vram│
├─────┼─────────────────────┤
│next │ │
└─────┴─────────────────────┘

初始化

Qemu会通过qemu_ram_alloc_internal()来分配和初始化RAMBlock数据,关键逻辑如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/*
 * Excerpt of qemu_ram_alloc_internal(); "..." marks code left out of the
 * listing (the declaration of local_err, for instance, is not shown).
 *
 * The visible part allocates a zero-filled RAMBlock, records the
 * caller-supplied @host pointer in it and passes the block to
 * ram_block_add(). The new block is returned to the caller.
 */
static
RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
void (*resized)(const char*,
uint64_t length,
void *host),
void *host, uint32_t ram_flags,
MemoryRegion *mr, Error **errp)
{
RAMBlock *new_block;
...
/* g_malloc0() zero-fills, so every field not set below starts out as 0/NULL */
new_block = g_malloc0(sizeof(*new_block));
new_block->host = host;
/* Gives the block its backing memory and links it into ram_list */
ram_block_add(new_block, &local_err);
...
return new_block;
}

/*
 * Excerpt of ram_block_add(): give @new_block its host memory and insert it
 * into ram_list.blocks, keeping the list ordered by max_length (largest
 * first). The whole update runs under the ramlist mutex.
 *
 * `shared' and `noreserve' are not declared in this listing and @errp is not
 * used in it; they belong to code that the excerpt leaves out.
 */
static void ram_block_add(RAMBlock *new_block, Error **errp)
{
RAMBlock *block;
RAMBlock *last_block = NULL;

qemu_mutex_lock_ramlist();
/*
 * NOTE(review): as listed, this assignment is unconditional and replaces
 * whatever host pointer the caller stored in new_block->host; the upstream
 * function presumably guards it - confirm against the full source.
 */
new_block->host = qemu_anon_ram_alloc(new_block->max_length,
&new_block->mr->align,
shared, noreserve);

/* Keep the list sorted from biggest to smallest block. Unlike QTAILQ,
* QLIST (which has an RCU-friendly variant) does not have insertion at
* tail, so save the last element in last_block.
*/
RAMBLOCK_FOREACH(block) {
last_block = block;
/* Stop at the first block that is smaller than the new one */
if (block->max_length < new_block->max_length) {
break;
}
}
/*
 * The tests below rely on `block' being NULL when the loop ran to the end
 * without a break (presumably how RAMBLOCK_FOREACH terminates).
 */
if (block) {
/* Found a smaller block: the new block goes right in front of it */
QLIST_INSERT_BEFORE_RCU(block, new_block, next);
} else if (last_block) {
/* Every existing block is at least as large: append after the last one */
QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
} else { /* list is empty */
QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
}

/* Write list before version */
smp_wmb();
ram_list.version++;
qemu_mutex_unlock_ramlist();
}

其主要就是初始化RAMBlock,为该RAMBlock申请对应的hva内存,并将其按max_length从大到小的顺序插入ram_list。

MemoryRegion

实际上,不同区域的gpa有着不同的属性和功能,因此需要分开管理。

例如,对于如下e1000-mmio的gpa访问,实际上并不是内存的读写,而是对于设备的模拟操作,Qemu需要模拟设备处理guest的请求

1
00000000febc0000-00000000febdffff (prio 1, i/o): e1000-mmio

而对于如下的pc.ram的gpa访问,则只是单纯的内存访问,Qemu只需要简单地写入或读取数据即可

1
0000000000000000-00000000ffffffff (prio 0, ram): pc.ram

为此,Qemu使用struct MemoryRegion,以树状组织管理整个gpa。

struct MemoryRegion

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/** MemoryRegion:
*
* A struct representing a memory region.
*
* Regions form a tree: a region lists its children in @subregions and each
* child points back to its parent through @container.
*/
struct MemoryRegion {
Object parent_obj;

/* private: */

/* The following fields should fit in a cache line */
bool romd_mode;
bool ram;
bool subpage;
bool readonly; /* For RAM regions */
bool nonvolatile;
bool rom_device;
bool flush_coalesced_mmio;
bool unmergeable;
uint8_t dirty_log_mask;
bool is_iommu;
/* presumably the RAMBlock backing a RAM region - confirm against upstream */
RAMBlock *ram_block;
Object *owner;
/* owner as TYPE_DEVICE. Used for re-entrancy checks in MR access hotpath */
DeviceState *dev;

const MemoryRegionOps *ops;
void *opaque;
/* Parent region this region was added to as a subregion */
MemoryRegion *container;
int mapped_via_alias; /* Mapped via an alias, container might be NULL */
/* Length of the region; render_memory_region() builds [base, base + size) */
Int128 size;
/* Start of the region as an offset from the start of its container */
hwaddr addr;
void (*destructor)(MemoryRegion *mr);
uint64_t align;
/* When false, render_memory_region() emits no FlatRange for the region itself */
bool terminates;
bool ram_device;
/* Disabled regions are skipped entirely by render_memory_region() */
bool enabled;
bool warning_printed; /* For reservations */
uint8_t vga_logging_count;
/* If set, rendering is redirected to this region, shifted by alias_offset */
MemoryRegion *alias;
hwaddr alias_offset;
/* Decides which region is visible where sibling regions overlap */
int32_t priority;
/* Child regions; render_memory_region() walks them in list order */
QTAILQ_HEAD(, MemoryRegion) subregions;
/* Linkage of this region inside its container's subregions list */
QTAILQ_ENTRY(MemoryRegion) subregions_link;
QTAILQ_HEAD(, CoalescedMemoryRange) coalesced;
const char *name;
unsigned ioeventfd_nb;
MemoryRegionIoeventfd *ioeventfds;
RamDiscardManager *rdm; /* Only for RAM */

/* For devices designed to perform re-entrant IO into their own IO MRs */
bool disable_reentrancy_guard;
};

addr字段表明MemoryRegion起始gpa相对于父MemoryRegion起始gpa的相对偏移,而size表明这段内存区间的大小。

实际上,根据Qemu官网,MemoryRegion可以分为RAM MemoryRegion、ROM MemoryRegion、MMIO MemoryRegion、ROM device MemoryRegion、IOMMU MemoryRegion、container MemoryRegion、alias MemoryRegion、reservation MemoryRegion。

MR间关系

树状结构

对于container MemoryRegion来说,其subregions字段包含了其他的子MemoryRegion,而这些子MemoryRegion的container字段则指向该container MemoryRegion。这些子MemoryRegion之间没有交集,通过memory_region_add_subregion()初始化对应的subregions和container字段,从而构建出如下的树状结构

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
                              struct MemoryRegion                                      
┌──────────┬────────┐
│name │io │
├──────────┼────────┤
│addr │0 │
├──────────┼────────┤
│size │65536 │
├──────────┼────────┤
│subregions│ │
└──────────┴───┬────┘

┌─────────────────┴─────────────────┬────────────────────┬───
│ │ │
▼ ▼ ▼
struct MemoryRegion struct MemoryRegion
┌──────────┬──────────┐ ┌──────────┬──────────┐
│name │piix4-pm │ │name │pm-smbus │
├──────────┼──────────┤ ├──────────┼──────────┤
│addr │1536 │ │addr │45312 │
├──────────┼──────────┤ ├──────────┼──────────┤
│size │64 │ │size │64 │
├──────────┼──────────┤ ├──────────┼──────────┤
│subregions│ │ │subregions│NULL │
└──────────┴─────┬────┘ └──────────┴──────────┘

┌──────────────────┴──────────┬──────────────────┬───
│ │ │
▼ ▼ ▼
struct MemoryRegion struct MemoryRegion
┌──────────┬──────────┐ ┌──────────┬──────────┐
│name │acpi-cnt │ │name │acpi-evt │
├──────────┼──────────┤ ├──────────┼──────────┤
│addr │4 │ │addr │0 │
├──────────┼──────────┤ ├──────────┼──────────┤
│size │2 │ │size │4 │
├──────────┼──────────┤ ├──────────┼──────────┤
│subregions│NULL │ │subregions│NULL │
└──────────┴──────────┘ └──────────┴──────────┘

交叠

通常情况下,MemoryRegion之间不会交叠:要么内含;要么不相交。

但是考虑到诸如pcie设备的地址空间是动态分配的,因此允许MemoryRegion交叠并通过优先级决定交叠部分的可见性会极大地简化这部分代码。可以通过memory_region_add_subregion_overlap()来向一个container MemoryRegion中插入和其他子MemoryRegion交叠的MemoryRegion并声明优先级,如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
           struct MemoryRegion                                                  
┌──────────┬───────────────────┐
│name │system │
├──────────┼───────────────────┤
│addr │0 │
├──────────┼───────────────────┤
│size │0x10000000000000000│
├──────────┼───────────────────┤
│priority │0 │
├──────────┼───────────────────┤
│subregions│ │
└──────────┴───┬───────────────┘

┌────────────┴────────────┬────────────────────────────►
│ │
▼ ▼
struct MemoryRegion struct MemoryRegion
┌──────────┬──────────┐ ┌──────────┬───────────────────┐
│name │kvm-ioapic│ │name │pci │
├──────────┼──────────┤ ├──────────┼───────────────────┤
│addr │0xfec00000│ │addr │0 │
├──────────┼──────────┤ ├──────────┼───────────────────┤
│size │0x10000 │ │size │0x10000000000000000│
├──────────┼──────────┤ ├──────────┼───────────────────┤
│priority │0 │ │priority │-1 │
├──────────┼──────────┤ ├──────────┼───────────────────┤
│subregions│NULL │ │subregions│ │
└──────────┴──────────┘ └──────────┴───┬───────────────┘

┌─────┴─────────────────────────────────────►

struct MemoryRegion
┌──────────┬──────────┐
│name │vga-lowmem│
├──────────┼──────────┤
│addr │0xa0000 │
├──────────┼──────────┤
│size │0x20000 │
├──────────┼──────────┤
│priority │1 │
├──────────┼──────────┤
│subregions│NULL │
└──────────┴──────────┘

如图所示,kvm-ioapic与pci相交叠,但由于kvm-ioapic优先级更高,所以在system所代表的地址空间中,[0xfec00000, 0xfec10000)对应的是kvm-ioapic的MemoryRegion,而非pci下的MemoryRegion。

AddressSpace

对于Guest来说,相同的地址可能有不同的意义。例如port IO中的地址和内存中的地址,即使值相同表示的也不是同一个东西。

为此,Qemu使用struct AddressSpace来管理不同类型的地址空间,主要包括address_space_memory和address_space_io。

struct AddressSpace

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/**
* struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
*/
struct AddressSpace {
/* private: */
struct rcu_head rcu;
char *name;
/* Root of the MemoryRegion tree that makes up this address space */
MemoryRegion *root;

/* Accessed via RCU. */
/* Flattened form of the tree under @root, used for address lookup */
struct FlatView *current_map;

int ioeventfd_nb;
int ioeventfd_notifiers;
struct MemoryRegionIoeventfd *ioeventfds;
QTAILQ_HEAD(, MemoryListener) listeners;
QTAILQ_ENTRY(AddressSpace) address_spaces_link;
};

其中,root字段指向该地址空间中的MemoryRegion资源,即树状MemoryRegion的根,从而可通过遍历树状MemoryRegion来访问地址空间中的所有地址。

FlatView

Qemu处理Guest的内存操作时,都是基于对应AddressSpace,找到地址对应的MemoryRegion,完成最终的内存操作模拟。但考虑到MemoryRegion的树状结构,需要进行大量的计算才能获取地址实际对应的MemoryRegion。为了提高效率,Qemu在AddressSpace中添加了FlatView来加快地址查找,其结构如下所示。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/* Flattened global view of current active memory hierarchy.  Kept in sorted
* order.
*/
struct FlatView {
struct rcu_head rcu;
unsigned ref;
/* Array of ranges; render_memory_region() inserts into it in address order */
FlatRange *ranges;
/* Number of valid entries in @ranges */
unsigned nr;
/* presumably the allocated capacity of @ranges - confirm against upstream */
unsigned nr_allocated;
/* Multi-level lookup structure used for address dispatch */
struct AddressSpaceDispatch *dispatch;
/* MemoryRegion this view was generated from (argument of flatview_new()) */
MemoryRegion *root;
};

/* Range of memory in the global map. Addresses are absolute. */
struct FlatRange {
/* Region that is visible in this range */
MemoryRegion *mr;
/* Offset of the start of the range inside @mr */
hwaddr offset_in_region;
/* Absolute start and size of the range */
AddrRange addr;
uint8_t dirty_log_mask;
bool romd_mode;
/* The three flags below are accumulated (OR-ed) down the region tree */
bool readonly;
bool nonvolatile;
bool unmergeable;
};

具体来说,FlatView由数个互不重合的struct FlatRange构成,每一个FlatRange包含一段地址范围及其实际对应的MemoryRegion,从而能表示AddressSpace中树状MemoryRegion经过平坦化后的最终线性地址空间,如下所示。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
                                                                                           ┌─────────────────────────────────────────┐                               
│ │
│ │
│ ┌──────────────▼─────────────┐
│ │ struct FlatRange │
│ │ ┌──────┬─────────────────┐ │
│ │ │start │0 │ │
│ │ ├──────┼─────────────────┤ │
struct AddressSpace │ │ │size │1536 │ │
┌───────────┬────┐ ┌────►struct FlatView │ │ ├──────┼─────────────────┤ │
│name │I/O │ │ ┌──────┬─────┐ │ ┌─────────────┼─┤mr │ │ │
├───────────┼────┤ │ │ranges│ ├───────┘ │ │ └──────┴─────────────────┘ │
│current_map│ ├────────┘ └──────┴─────┘ │ │ │
├───────────┼────┤ │ ├────────────────────────────┤
│root │ │ │ │ struct FlatRange │
└───────────┴─┬──┘ │ │ ┌──────┬─────────────────┐ │
│ │ │ │start │1536 │ │
│ │ │ ├──────┼─────────────────┤ │
▼ │ │ │size │4 │ │
struct MemoryRegion◄───────────────────────────────────────────────┤ │ ├──────┼─────────────────┤ │
┌──────────┬────────┐ │ │ │mr │ ├─┼──────────────┐
│name │io │ │ │ └──────┴─────────────────┘ │ │
├──────────┼────────┤ │ │ │ │
│addr │0 │ │ ├────────────────────────────┤ │
├──────────┼────────┤ │ │ struct FlatRange │ │
│size │65536 │ │ │ ┌──────┬─────────────────┐ │ │
├──────────┼────────┤ │ │ │start │1540 │ │ │
│subregions│ │ │ │ ├──────┼─────────────────┤ │ │
└──────────┴───┬────┘ │ │ │size │2 │ │ │
│ │ │ ├──────┼─────────────────┤ │ │
┌─────────────────┴─────────────────┬ │ │ │mr │ ├─┼──────────┐ │
│ │ │ │ └──────┴─────────────────┘ │ │ │
▼ ▼ │ │ │ │ │
┌─────────────────────►struct MemoryRegion struct MemoryRegion◄─────┐ │ ├────────────────────────────┤ │ │
│ ┌──────────┬──────────┐ ┌──────────┬──────────┐ │ │ │ struct FlatRange │ │ │
│ │name │piix4-pm │ │name │pm-smbus │ │ │ │ ┌──────┬─────────────────┐ │ │ │
│ ├──────────┼──────────┤ ├──────────┼──────────┤ │ │ │ │start │1542 │ │ │ │
│ │addr │1536 │ │addr │45312 │ │ │ │ ├──────┼─────────────────┤ │ │ │
│ ├──────────┼──────────┤ ├──────────┼──────────┤ │ │ │ │size │58 │ │ │ │
│ │size │64 │ │size │64 │ │ │ │ ├──────┼─────────────────┤ │ │ │
│ ├──────────┼──────────┤ ├──────────┼──────────┤ │ │ │ │mr │ ├─┼──────┐ │ │
│ │subregions│ │ │subregions│NULL │ │ │ │ └──────┴─────────────────┘ │ │ │ │
│ └──────────┴─────┬────┘ └──────────┴──────────┘ │ │ │ │ │ │ │
│ │ │ │ ├────────────────────────────┤ │ │ │
│ ┌──────────────────┴──────────┬ │ │ │ struct FlatRange │ │ │ │
│ │ │ │ │ │ ┌──────┬─────────────────┐ │ │ │ │
│ ▼ ▼ │ │ │ │start │1600 │ │ │ │ │
│ ┌──►struct MemoryRegion struct MemoryRegion◄──────┐ │ │ │ ├──────┼─────────────────┤ │ │ │ │
│ │ ┌──────────┬──────────┐ ┌──────────┬──────────┐ │ │ │ │ │size │43712 │ │ │ │ │
│ │ │name │acpi-cnt │ │name │acpi-evt │ │ │ │ │ ├──────┼─────────────────┤ │ │ │ │
│ │ ├──────────┼──────────┤ ├──────────┼──────────┤ │ │ ├─────────────┼─┤mr │ │ │ │ │ │
│ │ │addr │4 │ │addr │0 │ │ │ │ │ └──────┴─────────────────┘ │ │ │ │
│ │ ├──────────┼──────────┤ ├──────────┼──────────┤ │ │ │ │ │ │ │ │
│ │ │size │2 │ │size │4 │ │ │ │ ├────────────────────────────┤ │ │ │
│ │ ├──────────┼──────────┤ ├──────────┼──────────┤ │ │ │ │ struct FlatRange │ │ │ │
│ │ │subregions│NULL │ │subregions│NULL │ │ │ │ │ ┌──────┬─────────────────┐ │ │ │ │
│ │ └──────────┴──────────┘ └──────────┴──────────┘ │ │ │ │ │start │45312 │ │ │ │ │
│ │ │ │ │ │ ├──────┼─────────────────┤ │ │ │ │
│ │ │ │ │ │ │size │64 │ │ │ │ │
│ │ │ │ │ │ ├──────┼─────────────────┤ │ │ │ │
│ │ │ │ │ │ │mr │ ├─┼───┐ │ │ │
│ │ │ │ │ │ └──────┴─────────────────┘ │ │ │ │ │
│ │ │ │ │ │ │ │ │ │ │
│ │ │ │ │ ├────────────────────────────┤ │ │ │ │
│ │ │ │ │ │ struct FlatRange │ │ │ │ │
│ │ │ │ │ │ ┌──────┬─────────────────┐ │ │ │ │ │
│ │ │ │ │ │ │start │45376 │ │ │ │ │ │
│ │ │ │ │ │ ├──────┼─────────────────┤ │ │ │ │ │
│ │ │ │ │ │ │size │20160 │ │ │ │ │ │
│ │ │ │ │ │ ├──────┼─────────────────┤ │ │ │ │ │
│ │ │ │ └─────────────┼─┤mr │ │ │ │ │ │ │
│ │ │ │ │ └──────┴─────────────────┘ │ │ │ │ │
│ │ │ │ │ │ │ │ │ │
│ │ │ │ └────────────────────────────┘ │ │ │ │
│ │ │ │ │ │ │ │
│ │ │ └──────────────────────────────────────────────────────────────────┘ │ │ │
│ │ │ │ │ │
└───┼────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────────┘ │ │
│ │ │ │
└────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────────────┘ │
│ │
└────────────────────────────────────────────────────────────────────────────────────────────────┘

而Qemu通过address_space_update_topology()生成AddressSpace对应的FlatView

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/*
 * Make sure a FlatView exists for the region tree of @as and attach it to
 * the address space.
 */
static void address_space_update_topology(AddressSpace *as)
{
/* Region used as the lookup key for the flat view of as->root */
MemoryRegion *physmr = memory_region_get_flatview_root(as->root);

flatviews_init();
/* Only build a new view when none is registered for this root yet */
if (!g_hash_table_lookup(flat_views, physmr)) {
/*
 * The return value is dropped here; presumably the elided part of
 * generate_memory_topology() registers the view in flat_views - confirm.
 */
generate_memory_topology(physmr);
}
address_space_set_flatview(as);
}

/* Render a memory topology into a list of disjoint absolute ranges.
 *
 * Excerpt: "..." marks code left out of the listing (`i' is only used there).
 * Returns the newly created FlatView for @mr; @mr may be NULL, in which case
 * nothing is rendered into the view.
 */
static FlatView *generate_memory_topology(MemoryRegion *mr)
{
int i;
FlatView *view;

view = flatview_new(mr);

if (mr) {
/* Start at base 0 and clip to the full range [0, 2^64) */
render_memory_region(view, mr, int128_zero(),
addrrange_make(int128_zero(), int128_2_64()),
false, false, false);
}
/* Merge adjacent ranges that can be merged */
flatview_simplify(view);
...
return view;
}

/* Render a memory region into the global view. Ranges in @view obscure
* ranges in @mr.
*
* @view:        flat view being built
* @mr:          region to render (recursion covers its alias and subregions)
* @base:        absolute address of @mr's container; @mr->addr is added here
* @clip:        absolute window outside of which nothing is rendered
* @readonly:    inherited flag, OR-ed with @mr->readonly
* @nonvolatile: inherited flag, OR-ed with @mr->nonvolatile
* @unmergeable: inherited flag, OR-ed with @mr->unmergeable
*/
static void render_memory_region(FlatView *view,
MemoryRegion *mr,
Int128 base,
AddrRange clip,
bool readonly,
bool nonvolatile,
bool unmergeable)
{
MemoryRegion *subregion;
unsigned i;
hwaddr offset_in_region;
Int128 remain;
Int128 now;
FlatRange fr;
AddrRange tmp;

/* A disabled region contributes nothing, and neither do its children */
if (!mr->enabled) {
return;
}

/* From here on, base is the absolute start address of mr */
int128_addto(&base, int128_make64(mr->addr));
readonly |= mr->readonly;
nonvolatile |= mr->nonvolatile;
unmergeable |= mr->unmergeable;

tmp = addrrange_make(base, mr->size);

/* Nothing to do when the region lies completely outside the window */
if (!addrrange_intersects(tmp, clip)) {
return;
}

/* Narrow the window to the part covered by this region */
clip = addrrange_intersection(tmp, clip);

if (mr->alias) {
/*
 * Render the alias target in place of mr. alias->addr is subtracted
 * up front because the recursive call adds it to base again;
 * alias_offset shifts the target so that its offset alias_offset lands
 * on the start of mr.
 */
int128_subfrom(&base, int128_make64(mr->alias->addr));
int128_subfrom(&base, int128_make64(mr->alias_offset));
render_memory_region(view, mr->alias, base, clip,
readonly, nonvolatile, unmergeable);
return;
}

/* Render subregions in priority order. */
QTAILQ_FOREACH(subregion, &mr->subregions, subregions_link) {
render_memory_region(view, subregion, base, clip,
readonly, nonvolatile, unmergeable);
}

/* Only terminating regions produce FlatRanges of their own */
if (!mr->terminates) {
return;
}

/*
 * Switch to walking the clipped window: base becomes the cursor inside the
 * window, remain the number of bytes still to place, and offset_in_region
 * the position inside mr that corresponds to the cursor.
 */
offset_in_region = int128_get64(int128_sub(clip.start, base));
base = clip.start;
remain = clip.size;

/* Template for every FlatRange emitted below; offset and addr vary */
fr.mr = mr;
fr.dirty_log_mask = memory_region_get_dirty_log_mask(mr);
fr.romd_mode = mr->romd_mode;
fr.readonly = readonly;
fr.nonvolatile = nonvolatile;
fr.unmergeable = unmergeable;

/* Render the region itself into any gaps left by the current view. */
for (i = 0; i < view->nr && int128_nz(remain); ++i) {
/* Existing range ends at or before the cursor: not relevant */
if (int128_ge(base, addrrange_end(view->ranges[i].addr))) {
continue;
}
/* Gap between the cursor and the existing range: fill it with mr */
if (int128_lt(base, view->ranges[i].addr.start)) {
now = int128_min(remain,
int128_sub(view->ranges[i].addr.start, base));
fr.offset_in_region = offset_in_region;
fr.addr = addrrange_make(base, now);
flatview_insert(view, i, &fr);
/* The insertion moved the existing range up to index i + 1 */
++i;
int128_addto(&base, now);
offset_in_region += int128_get64(now);
int128_subfrom(&remain, now);
}
/* Skip the part that the existing range already covers (it obscures mr) */
now = int128_sub(int128_min(int128_add(base, remain),
addrrange_end(view->ranges[i].addr)),
base);
int128_addto(&base, now);
offset_in_region += int128_get64(now);
int128_subfrom(&remain, now);
}
/* Whatever is left lies beyond every existing range: add it at index i */
if (int128_nz(remain)) {
fr.offset_in_region = offset_in_region;
fr.addr = addrrange_make(base, remain);
flatview_insert(view, i, &fr);
}
}

/* Attempt to simplify a view by merging adjacent ranges
 *
 * For each position i, the run of following ranges that can_merge() accepts
 * is folded into ranges[i]; the absorbed entries are then removed from the
 * array and view->nr is reduced accordingly.
 */
static void flatview_simplify(FlatView *view)
{
unsigned i, j, k;

i = 0;
while (i < view->nr) {
j = i + 1;
/* Extend ranges[i] for as long as each neighbouring pair is mergeable */
while (j < view->nr
&& can_merge(&view->ranges[j-1], &view->ranges[j])) {
int128_addto(&view->ranges[i].addr.size, view->ranges[j].addr.size);
++j;
}
/* i now indexes the first absorbed entry (or equals j if there is none) */
++i;
/* Release the region reference of every absorbed entry */
for (k = i; k < j; k++) {
memory_region_unref(view->ranges[k].mr);
}
/* Close the hole: move the tail [j, nr) down to position i */
memmove(&view->ranges[i], &view->ranges[j],
(view->nr - j) * sizeof(view->ranges[j]));
view->nr -= j - i;
}
}

可以看到,生成FlatView整体可分为两步,首先通过memory_region_get_flatview_root()获取AddressSpace对应的树状MemoryRegion根,其次通过generate_memory_topology()平坦化地址空间。

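其中generate_memory_topology的逻辑也相对比较清晰:通过DFS遍历整棵树即可平坦化。具体来说,render_memory_region()先按优先级顺序递归渲染各个子MemoryRegion,再把当前MemoryRegion自身填入视图中尚未被覆盖的空隙(因此已渲染的区间会遮蔽后渲染的区间),最后由flatview_simplify()合并相邻且可合并的FlatRange。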

内存分派

虽然Qemu已经通过FlatView加快了AddressSpace地址对应的MemoryRegion的查找,但还可以使用struct AddressSpaceDispatch以类似页表的形式进一步加快查找

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/*
 * AddressSpaceDispatch: lookup structure that maps an address to its
 * MemoryRegionSection through a multi-level table.
 */
struct AddressSpaceDispatch {
/* Result of the last full lookup; tried first by address_space_lookup_region() */
MemoryRegionSection *mru_section;
/* This is a multi-level map on the physical address space.
* The bottom level has pointers to MemoryRegionSections.
*/
/* Top-level entry; phys_page_find() starts its walk here */
PhysPageEntry phys_map;
/* Storage for all nodes and sections that the entries index into */
PhysPageMap map;
};

/**
* struct MemoryRegionSection: describes a fragment of a #MemoryRegion
*
* Sections are the final result of the dispatch lookup: phys_page_find()
* returns a pointer to an element of the PhysPageMap sections array.
*
* @mr: the region, or %NULL if empty
* @fv: the flat view of the address space the region is mapped in
* @offset_within_region: the beginning of the section, relative to @mr's start
* @size: the size of the section; will not exceed @mr's boundaries
* @offset_within_address_space: the address of the first byte of the section
* relative to the region's address space
* @readonly: writes to this section are ignored
* @nonvolatile: this section is non-volatile
* @unmergeable: this section should not get merged with adjacent sections
*/
struct MemoryRegionSection {
Int128 size;
MemoryRegion *mr;
FlatView *fv;
hwaddr offset_within_region;
hwaddr offset_within_address_space;
bool readonly;
bool nonvolatile;
bool unmergeable;
};

/*
 * One entry of the multi-level dispatch map. The two bit-fields together
 * take 32 bits (6 + 26).
 */
struct PhysPageEntry {
/* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
uint32_t skip : 6;
/* index into phys_sections (!skip) or phys_map_nodes (skip) */
uint32_t ptr : 26;
};

/* Size of the L2 (and L3, etc) page tables. */
#define ADDR_SPACE_BITS 64
/* Each level of the map consumes P_L2_BITS bits of the page index */
#define P_L2_BITS 9
/* Entries per node: 1 << 9 = 512 */
#define P_L2_SIZE (1 << P_L2_BITS)
/*
 * Levels needed to cover ADDR_SPACE_BITS - TARGET_PAGE_BITS index bits,
 * rounded up. For example, TARGET_PAGE_BITS == 12 gives (51 / 9) + 1 = 6.
 */
#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)

/* A node is one table of P_L2_SIZE entries */
typedef PhysPageEntry Node[P_L2_SIZE];


/*
 * Backing storage of the dispatch map: PhysPageEntry.ptr is an index into
 * either @nodes or @sections.
 */
typedef struct PhysPageMap {
struct rcu_head rcu;

/* presumably used/allocated element counts of the two arrays - confirm */
unsigned sections_nb;
unsigned sections_nb_alloc;
unsigned nodes_nb;
unsigned nodes_nb_alloc;
/* All intermediate tables of the map */
Node *nodes;
/* All sections; index PHYS_SECTION_UNASSIGNED is the "nothing mapped" entry */
MemoryRegionSection *sections;
} PhysPageMap;

Qemu使用address_space_lookup_region()完成地址分派,逻辑如下所示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
/* Called from RCU critical section */
/*
 * Excerpt: "..." marks code left out of the listing (`subpage' and
 * @resolve_subpage are only used there).
 *
 * Finds the MemoryRegionSection for @addr in dispatch @d. The most recently
 * used section is tried first; the multi-level map is only walked on a miss.
 */
static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
hwaddr addr,
bool resolve_subpage)
{
MemoryRegionSection *section = qatomic_read(&d->mru_section);
subpage_t *subpage;

/*
 * Miss when nothing is cached, when the cached entry is the "unassigned"
 * section, or when the cached section does not contain addr.
 */
if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
!section_covers_addr(section, addr)) {
section = phys_page_find(d, addr);
/* Remember the result for the next lookup */
qatomic_set(&d->mru_section, section);
}
...
return section;
}

/*
 * Walk the multi-level map of @d to find the section for @addr.
 *
 * Returns a pointer into d->map.sections; the PHYS_SECTION_UNASSIGNED entry
 * is returned when the walk reaches an empty slot or when the section found
 * does not actually cover @addr.
 */
static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
{
PhysPageEntry lp = d->phys_map, *p;
Node *nodes = d->map.nodes;
MemoryRegionSection *sections = d->map.sections;
/* Page number of addr; the map is indexed by page, not by byte */
hwaddr index = addr >> TARGET_PAGE_BITS;
int i;

/*
 * Descend while the current entry is not a leaf (skip != 0). Each step
 * lowers the level i by lp.skip, so one entry can jump several levels.
 */
for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
if (lp.ptr == PHYS_MAP_NODE_NIL) {
return &sections[PHYS_SECTION_UNASSIGNED];
}
p = nodes[lp.ptr];
/* Select the slot using the P_L2_BITS bits of index that belong to level i */
lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
}

/* lp.ptr now indexes sections; the section may still not cover addr */
if (section_covers_addr(&sections[lp.ptr], addr)) {
return &sections[lp.ptr];
} else {
return &sections[PHYS_SECTION_UNASSIGNED];
}
}

类似于页表地址转换,内存分派使用了6级的map实现了地址到MemoryRegionSection的转换(以TARGET_PAGE_BITS为12为例,P_L2_LEVELS = (64 - 12 - 1) / 9 + 1 = 6)。具体来说,map中的Node类型类似于页表地址转换中的中间项,map中的MemoryRegionSection类似于页表地址转换中最后的物理页,phys_map则类似于页表地址转换中的CR3寄存器,即第一级Map。其中,map中的nodes数组存放着该AddressSpace所有的Node,而sections数组则存放着所有的MemoryRegionSection,PhysPageEntry的ptr则作为这些数组的下标进行索引,如下所示。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
                                                               ┌─────────────────────┐                
┌───────┼───►struct Node │ gpa
│ │ ┌────────────┐ │ │
│ │ ┌─┼─── │◄───┼──────────────┘
│ │ │ ├────────────┤ │
│ │ │ │ ...... │ │
│ │ │ ├────────────┤ │
│ │ │ │ │ │
│ │ │ └────────────┘ │
│ │ │ │
│ ├─┼───────────────────┤
struct AddressSpaceDispatch │ │ │ struct Node │
┌────────┬─────┐ │ │ │ ┌────────────┐ │
│phys_map│ ├─────────┘ │ └─►│ ───┼─┐ │
├────────┼─────┤ │ ├────────────┤ │ │
│map │ ├──┐ │ │ ...... │ │ │
└────────┴─────┘ │ │ ├────────────┤ │ │
│ │ │ │ │ │
┌──────────┘ │ └────────────┘ │ │
│ │ │ │
▼ ├───────────────────┼─┤
struct PhysPageMap │ struct Node │ │
┌────────┬───┐ │ ┌────────────┐ │ │
│nodes │ ├───────────────────► │ │ │ │
┌─────────────────────┐ ├────────┼───┤ │ ├────────────┤ │ │
│ MemoryRegionSection │◄──────┤sections│ │ │ │ ...... │ │ │
├─────────────────────┤ └────────┴───┘ │ ├────────────┤ │ │
│ MemoryRegionSection │ │ ┌──┼─── │◄┘ │
├─────────────────────┤ │ │ └────────────┘ │
│ ...... │ │ │ │
├─────────────────────┤ ├─┼───────────────────┤
│ MemoryRegionSection │◄───────────┐ │ │ struct Node │
└─────────────────────┘ │ │ │ ┌────────────┐ │
│ │ └─►│ ───┼─┐ │
│ │ ├────────────┤ │ │
│ │ │ ...... │ │ │
│ │ ├────────────┤ │ │
│ │ │ │ │ │
│ │ └────────────┘ │ │
│ │ │ │
│ ├───────────────────┼─┤
│ │ struct Node │ │
│ │ ┌────────────┐ │ │
│ │ ┌─┼─── │◄┘ │
│ │ │ ├────────────┤ │
│ │ │ │ ...... │ │
│ │ │ ├────────────┤ │
│ │ │ │ │ │
│ │ │ └────────────┘ │
│ │ │ │
│ ├──┼──────────────────┤
│ │ │ ........... │
│ ├──┼──────────────────┤
│ │ │ struct Node │
│ │ │ ┌────────────┐ │
│ │ └►│ ───┼─┐ │
│ │ ├────────────┤ │ │
│ │ │ ...... │ │ │
│ │ ├────────────┤ │ │
│ │ │ │ │ │
│ │ └────────────┘ │ │
│ │ │ │
│ └───────────────────┼─┘
│ │
└───────────────────────────────────────────────┘

参考

  1. qemu对虚拟机的内存管理(一)
  2. QEMU 的 memory model
  3. 地址空间
  4. QEMU的内存模拟
  5. MemoryRegion模型原理,以及同FlatView模型的关系(QEMU2.0.0)
  6. The memory API