From a7d4166c89fe695436870dea98b96bad45b0c5f7 Mon Sep 17 00:00:00 2001 From: Max Wash Date: Wed, 8 Feb 2023 17:13:01 +0000 Subject: [PATCH] vm: implement a sparse memory model --- include/socks/vm.h | 35 +++++-- vm/bootstrap.c | 7 +- vm/flat.c | 19 +++- vm/sparse.c | 233 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 285 insertions(+), 9 deletions(-) create mode 100644 vm/sparse.c diff --git a/include/socks/vm.h b/include/socks/vm.h index 2671f41..254285e 100644 --- a/include/socks/vm.h +++ b/include/socks/vm.h @@ -14,6 +14,8 @@ #define VM_MAX_ZONES (VM_ZONE_MAX + 1) /* maximum number of supported page orders */ #define VM_MAX_PAGE_ORDERS (VM_PAGE_MAX_ORDER + 1) +/* maximum number of sparse memory sectors */ +#define VM_MAX_SECTORS 1024 #define VM_CHECK_ALIGN(p, mask) ((((p) & (mask)) == (p)) ? 1 : 0) @@ -189,15 +191,16 @@ typedef struct vm_slab { typedef struct vm_page { /* order of the page block that this page belongs too */ - uint16_t p_order : 4; + uint32_t p_order : 4; /* the id of the NUMA node that this page belongs to */ - uint16_t p_node : 6; + uint32_t p_node : 6; /* the id of the memory zone that this page belongs to */ - uint16_t p_zone : 3; - /* some unused bits */ - uint16_t p_reserved : 3; - + uint32_t p_zone : 3; /* vm_page_flags_t bitfields. */ + uint32_t p_sector : 11; + /* some unused bits */ + uint32_t p_reserved : 8; + uint32_t p_flags; /* multi-purpose list. @@ -216,6 +219,21 @@ typedef struct vm_page { } __attribute__((aligned(2 * sizeof(unsigned long)))) vm_page_t; +/* represents a sector of memory, containing its own array of vm_pages. + this struct is used under the sparse memory model, instead of the + global vm_page array */ +typedef struct vm_sector { + /* sector size. this must be a power of 2. + all sectors in the system have the same size. */ + vm_page_order_t s_size; + /* PFN of the first page contained in s_pages. 
+ to find the PFN of any page contained within s_pages, + simply add its offset within the array to s_first_pfn */ + size_t s_first_pfn; + /* array of pages contained in this sector */ + vm_page_t *s_pages; +} vm_sector_t; + extern kern_status_t vm_bootstrap(const vm_zone_descriptor_t *zones, size_t nr_zones); extern vm_model_t vm_memory_model(void); extern void vm_set_memory_model(vm_model_t model); @@ -264,4 +282,9 @@ extern void vm_flat_init(void); extern vm_page_t *vm_page_get_flat(phys_addr_t addr); extern size_t vm_page_get_pfn_flat(vm_page_t *pg); +/* Sparse memory model functions */ +extern void vm_sparse_init(void); +extern vm_page_t *vm_page_get_sparse(phys_addr_t addr); +extern size_t vm_page_get_pfn_sparse(vm_page_t *pg); + #endif diff --git a/vm/bootstrap.c b/vm/bootstrap.c index 6549d02..b10c3c5 100644 --- a/vm/bootstrap.c +++ b/vm/bootstrap.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -18,8 +19,10 @@ kern_status_t vm_bootstrap(const vm_zone_descriptor_t *zones, size_t nr_zones) node_data = memblock_alloc(sizeof(vm_pg_data_t) * numa_count, 8); printk("vm: initialising %u node%s", numa_count, numa_count > 1 ? "s" : ""); - vm_set_memory_model(VM_MODEL_FLAT); - vm_flat_init(); + /* TODO select which memory model to use automatically, and add + a kernel boot parameter to override the choice */ + vm_set_memory_model(VM_MODEL_SPARSE); + vm_sparse_init(); for (size_t i = 0; i < nr_zones; i++) { vm_zone_init(&node_data->pg_zones[zones[i].zd_id], &zones[i]); diff --git a/vm/flat.c b/vm/flat.c index c67d32b..64be9ca 100644 --- a/vm/flat.c +++ b/vm/flat.c @@ -1,3 +1,20 @@ +/* ### The flat memory model ### + + under this memory model, the system memory is represented by + a single contiguous array of vm_pages. this array spans from + physical address up to the last available byte, as provided by + memblock. any extra reserved regions after the last available + byte will not be included to save memory. 
+ + this memory model is good for systems with a smaller amount of + physical memory that is mostly contiguous with few holes or + reserved regions. it is simpler and has less overhead. + + for systems with a large amount of memory, or with large + amounts of reserved memory (especially those whose reserved + memory outstrips free memory), the sparse memory model may + be a better choice. + */ #include #include #include @@ -8,7 +25,7 @@ static vm_page_t *page_array = NULL; /* number of pages stored in page_array */ static size_t page_array_count = 0; -void vm_flat_init() +void vm_flat_init(void) { printk("vm: using flat memory model"); size_t pmem_size = 0; diff --git a/vm/sparse.c b/vm/sparse.c new file mode 100644 index 0000000..e04e940 --- /dev/null +++ b/vm/sparse.c @@ -0,0 +1,233 @@ +/* ### The sparse memory model ### + + under this memory model, the system memory is represented by + a set of sectors. each sector has the same, fixed, power-of-2 + size, and has its own array of vm_pages. unlike the flat memory + model, this is an array of vm_page POINTERS, allowing vm_pages + to be allocated on demand. + + under this memory model, only memory frames that are usable by + the kernel will have an associated vm_page. the array of pointers + adds some overhead, effectively adding an extra pointer's worth + of memory to the size of vm_page, but this is mitigated by + fewer vm_pages being allocated. + + on top of this, any sector that ONLY contains reserved memory + can forego allocating their vm_page pointer array entirely, + saving even more memory. + + this memory model is good for systems with large amounts of + memory, or those with less memory but a high percentage of + reserved memory. if this is not the case, the memory savings + of the sparse memory model may be outweighed by the extra + overhead, and the flat memory model may be a better choice.
+ */
+/* NOTE(review): the header names on the five #include lines below were
+   lost when this patch was extracted. restore them from the original
+   commit before applying. */
+#include
+#include
+#include
+#include
+#include
+
+/* one entry per sector, allocated from memblock by vm_sparse_init() */
+static vm_sector_t *sector_array = NULL;
+/* number of entries in sector_array */
+static size_t sector_array_count = 0;
+
+/* translate a physical address into the sector that covers it and the
+   index of its page within that sector's s_pages array.
+
+   sector_id and index are optional outputs; pass NULL to ignore either.
+
+   returns the sector, or NULL if the sector array has not been set up
+   or addr lies beyond the last sector. the outputs are only written
+   when a sector is returned. */
+static vm_sector_t *phys_addr_to_sector_and_index(phys_addr_t addr, size_t *sector_id, size_t *index)
+{
+	if (!sector_array || sector_array_count == 0) {
+		return NULL;
+	}
+
+	/* all sectors have the same size */
+	size_t step = vm_page_order_to_bytes(sector_array[0].s_size);
+	addr &= ~VM_PAGE_MASK;
+	size_t sector = div64_pow2(addr, step);
+
+	/* never hand out a sector outside of sector_array */
+	if (sector >= sector_array_count) {
+		return NULL;
+	}
+
+	/* page index = PFN of addr - PFN of the first page in the sector */
+	addr >>= VM_PAGE_SHIFT;
+	addr -= ((sector * step) >> VM_PAGE_SHIFT);
+
+	if (sector_id) {
+		*sector_id = sector;
+	}
+
+	if (index) {
+		*index = addr;
+	}
+
+	return &sector_array[sector];
+}
+
+/* zero a vm_page, then record the sector that owns it and its initial flags.
+
+   p_sector must be written AFTER the memset, as vm_page_get_pfn_sparse()
+   relies on it to find the sector a page lives in. */
+static void reset_page(vm_page_t *pg, size_t sector_number, uint32_t flags)
+{
+	memset(pg, 0x0, sizeof *pg);
+	pg->p_sector = sector_number;
+	pg->p_flags = flags;
+}
+
+/* return the vm_page for the frame containing addr, allocating the page
+   array of the owning sector first if it does not exist yet.
+
+   returns NULL if addr is not covered by any sector, or if the page
+   array could not be allocated. */
+static vm_page_t *get_or_create_page(phys_addr_t addr)
+{
+	size_t sector_number, page_number;
+	vm_sector_t *sector = phys_addr_to_sector_and_index(addr, &sector_number, &page_number);
+
+	if (!sector) {
+		return NULL;
+	}
+
+	if (!sector->s_pages) {
+		/* s_pages is an array of vm_page_t structs (it is indexed as
+		   such everywhere), so it must be sized by the struct, not
+		   by a pointer to it. */
+		size_t array_bytes = vm_page_order_to_pages(sector->s_size) * sizeof *sector->s_pages;
+
+		sector->s_pages = kmalloc(array_bytes, 0);
+		if (!sector->s_pages) {
+			/* TODO panic here, once panic() is implemented. */
+			printk("vm: failed to allocate page array for sector %zu", sector_number);
+			return NULL;
+		}
+
+		/* frames that are never visited by vm_sparse_init() (holes
+		   inside a sector) must not be left with indeterminate
+		   contents. */
+		memset(sector->s_pages, 0x0, array_bytes);
+		printk("allocated page array for sector %zu", sector_number);
+	}
+
+	sector->s_pages[page_number].p_sector = sector_number;
+	return &sector->s_pages[page_number];
+}
+
+/* find the smallest page order that lets VM_MAX_SECTORS sectors cover
+   pmem_size bytes. falls back to VM_PAGE_64G if no smaller order fits. */
+static vm_page_order_t find_minimum_sector_size(size_t pmem_size)
+{
+	for (vm_page_order_t i = VM_PAGE_4K; i < VM_PAGE_64G; i++) {
+		size_t order_bytes = vm_page_order_to_bytes(i);
+		if (order_bytes * VM_MAX_SECTORS >= pmem_size) {
+			return i;
+		}
+	}
+
+	/* TODO panic here, once panic() is implemented. */
+	return VM_PAGE_64G;
+}
+
+/* this function is called to calculate the optimal sector size for the system,
+   taking in to account factors like the total system memory and how much memory
+   is reserved vs free.
+
+   pmem_size is the size of the physical address space to cover, and
+   reserved_size is the number of reserved bytes. the chosen sector order
+   is stored in *out_sector_size, and the number of sectors needed to
+   cover pmem_size in *out_sector_count.
+
+   this function uses some heuristics and thresholds that are untested and
+   are in need of improvement to ensure that sparse works well on a wide
+   range of systems. */
+static void calculate_sector_size_and_count(size_t pmem_size, size_t reserved_size, unsigned int *out_sector_count, vm_page_order_t *out_sector_size)
+{
+	/* we can support up to VM_MAX_SECTORS memory sectors.
+	   the minimum sector size is whatever is required
+	   to cover all of physical memory in the maximum number of sectors */
+	vm_page_order_t sector_size = find_minimum_sector_size(pmem_size);
+
+	if (sector_size <= VM_PAGE_2M) {
+		/* override really small sector sizes with something
+		   more reasonable, to avoid excessive overhead on
+		   low-memory systems */
+		sector_size = VM_PAGE_2M;
+	}
+
+	/* reserved regions may extend past the end of usable memory, so the
+	   reserved total can exceed pmem_size. clamp instead of letting the
+	   unsigned subtraction wrap around to a huge "free" size. */
+	size_t free_size = (pmem_size > reserved_size) ? pmem_size - reserved_size : 0;
+	/* the absolute difference between the amount of free memory and
+	   the amount of reserved memory. */
+	size_t memdiff = absdiff64(free_size, reserved_size);
+
+	if (free_size > reserved_size && sector_size < VM_PAGE_256M) {
+		/* if there is more free memory than reserved, we can choose
+		   a bigger sector size, as we don't have to worry as much
+		   about wasting memory allocating vm_pages for reserved frames.
+
+		   we only do this bump if the sector size is below a certain
+		   threshold. */
+		sector_size++;
+
+		/* if the difference is particularly big (16 MiB or more),
+		   increase the sector size even further */
+		if (memdiff >= 0x1000000) {
+			sector_size++;
+		}
+	}
+
+	/* round pmem_size up to the next multiple of sector_bytes.
+	   this works because sector_bytes is guaranteed to be a
+	   power of 2. */
+	size_t sector_bytes = vm_page_order_to_bytes(sector_size);
+
+	if (pmem_size & (sector_bytes - 1)) {
+		pmem_size &= ~(sector_bytes - 1);
+		pmem_size += sector_bytes;
+	}
+
+	size_t sector_count = div64_pow2(pmem_size, sector_bytes);
+
+	*out_sector_count = sector_count;
+	*out_sector_size = sector_size;
+}
+
+/* set up the sparse memory model: size and allocate the sector array,
+   create vm_pages for every free frame reported by memblock, and mark
+   reserved frames that share a sector with free memory as
+   VM_PAGE_RESERVED. */
+void vm_sparse_init(void)
+{
+	printk("vm: using sparse memory model");
+
+	size_t pmem_size = 0, reserved_size = 0;
+
+	/* pmem_size ends up one past the highest it_limit of any memory range */
+	memblock_iter_t it;
+	for_each_mem_range (&it, 0x0, UINTPTR_MAX) {
+		if (pmem_size < it.it_limit + 1) {
+			pmem_size = it.it_limit + 1;
+		}
+	}
+
+	for_each_reserved_mem_range (&it, 0x0, UINTPTR_MAX) {
+		reserved_size += it.it_limit - it.it_base + 1;
+	}
+
+	vm_page_order_t sector_size;
+	size_t sector_bytes = 0;
+	unsigned int nr_sectors = 0;
+	calculate_sector_size_and_count(pmem_size, reserved_size, &nr_sectors, &sector_size);
+	sector_bytes = vm_page_order_to_bytes(sector_size);
+
+	char sector_size_str[64];
+	data_size_to_string(sector_bytes, sector_size_str, sizeof sector_size_str);
+
+	sector_array = memblock_alloc(sizeof *sector_array * nr_sectors, 8);
+	if (!sector_array) {
+		/* TODO panic here, once panic() is implemented. */
+		printk("vm: failed to allocate sector array");
+		sector_array_count = 0;
+		return;
+	}
+
+	sector_array_count = nr_sectors;
+
+	for (unsigned int i = 0; i < nr_sectors; i++) {
+		sector_array[i].s_size = sector_size;
+		sector_array[i].s_first_pfn = (i * sector_bytes) >> VM_PAGE_SHIFT;
+		/* a NULL s_pages marks a sector with no page array yet, so it
+		   has to be set explicitly rather than left uninitialised */
+		sector_array[i].s_pages = NULL;
+	}
+
+	for_each_free_mem_range(&it, 0x0, UINTPTR_MAX) {
+		/* round the start of the range up to a page boundary */
+		if (it.it_base & VM_PAGE_MASK) {
+			it.it_base &= ~VM_PAGE_MASK;
+			it.it_base += VM_PAGE_SIZE;
+		}
+
+		for (uintptr_t addr = it.it_base; addr < it.it_limit; addr += VM_PAGE_SIZE) {
+			vm_page_t *pg = get_or_create_page(addr);
+
+			if (!pg) {
+				/* no sector covers this frame, or its page
+				   array could not be allocated */
+				continue;
+			}
+
+			/* get_or_create_page() has already recorded the sector.
+			   carry it across the reset so it is not wiped. */
+			reset_page(pg, pg->p_sector, 0);
+		}
+	}
+
+	for_each_reserved_mem_range(&it, 0x0, UINTPTR_MAX) {
+		if (it.it_base & VM_PAGE_MASK) {
+			it.it_base &= ~VM_PAGE_MASK;
+			it.it_base += VM_PAGE_SIZE;
+		}
+
+		for (uintptr_t addr = it.it_base; addr < it.it_limit; addr += VM_PAGE_SIZE) {
+			vm_page_t *pg = vm_page_get(addr);
+
+			if (!pg) {
+				/* if the page doesn't exist, it is part of a sector
+				   that only contains reserved pages. a NULL page
+				   is implicitly treated as reserved */
+				continue;
+			}
+
+			size_t sector_number;
+			if (!phys_addr_to_sector_and_index(addr, &sector_number, NULL)) {
+				continue;
+			}
+
+			reset_page(pg, sector_number, VM_PAGE_RESERVED);
+		}
+	}
+
+	printk("vm: initialised %u sectors of size %s", nr_sectors, sector_size_str);
+}
+
+/* look up the vm_page for the frame containing addr.
+
+   returns NULL if addr is not covered by any sector, or if the owning
+   sector has no page array (i.e. it only contains reserved memory). */
+vm_page_t *vm_page_get_sparse(phys_addr_t addr)
+{
+	size_t page_number;
+	vm_sector_t *sector = phys_addr_to_sector_and_index(addr, NULL, &page_number);
+
+	if (!sector || !sector->s_pages) {
+		return NULL;
+	}
+
+	return &sector->s_pages[page_number];
+}
+
+/* return the PFN of pg: the first PFN of its sector (found via
+   pg->p_sector) plus the page's offset within that sector's s_pages array. */
+size_t vm_page_get_pfn_sparse(vm_page_t *pg)
+{
+	vm_sector_t *sector = &sector_array[pg->p_sector];
+	return sector->s_first_pfn + (size_t)(pg - sector->s_pages);
+}