#include
#include
#include
#include
#include
#include
#include
#include
#include

/* NOTE Locking Rules
 * To avoid deadlocks and crashes, the following locking rules should be
 * followed:
 * 1. Do NOT lock more than one region at a time IF the regions are siblings.
 * 2. When locking a region and its child(ren) or ancestors, always lock
 *    the parent region BEFORE the child region.
 * 3. When locking a region and a vm-object mapped into that region, always
 *    lock the region BEFORE the vm-object.
 * 4. An entry MUST be locked before any of its data can be read/written,
 *    including its children (if it's a region) and its e_parent pointer.
 * 5. vm_region_mapping has no lock. Instead, its immediate parent region must
 *    be locked before any child mappings can be accessed.
 */
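/* illustrative sketch of rules 2 and 3 above (hypothetical caller; `parent`,
 * `child` and `obj` are not names from this file). locks are taken
 * parent-region first, then child-region, then vm-object, and released in
 * the reverse order:
 *
 *     unsigned long flags, obj_flags;
 *     vm_region_lock_irqsave(parent, &flags);
 *     vm_region_lock(child);             // interrupts are already off
 *     vm_object_lock_irqsave(obj, &obj_flags);
 *     ... touch child's entries and obj's pages ...
 *     vm_object_unlock_irqrestore(obj, obj_flags);
 *     vm_region_unlock(child);
 *     vm_region_unlock_irqrestore(parent, flags);
 */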
/*** STATIC DATA + MACROS *****************************************************/

#undef ASLR

#define INVALID_OFFSET ((off_t)-1)

#ifdef ASLR
#define region_find_free_area(region, length) \
    region_find_free_area_random(region, length)
#else
#define region_find_free_area(region, length) \
    region_find_free_area_linear(region, length)
#endif

/* iterates over a range of mapped virtual memory in a region, and provides
 * a moving buffer through which the memory can be accessed */
struct vm_iterator {
    struct vm_region *it_region;
    struct vm_region_mapping *it_mapping;
    virt_addr_t it_base;
    vm_prot_t it_prot;
    void *it_buf;
    size_t it_max;
};

/* iterates recursively over the entries in a region */
struct entry_iterator {
    struct vm_region *it_root;
    struct vm_region_entry *it_entry;
    /* depth of it_entry relative to it_root */
    unsigned int it_depth;
};

enum search_direction {
    SEARCH_LEFT,
    SEARCH_RIGHT,
};

#define VM_REGION_CAST(p) \
    OBJECT_C_CAST(struct vm_region, vr_base, &vm_region_type, p)

static kern_status_t region_object_destroy(struct object *obj,
    struct queue *q);
static kern_status_t region_object_destroy_recurse(
    struct queue_entry *entry, struct object **out);

static struct object_type vm_region_type = {
    .ob_name = "vm-region",
    .ob_size = sizeof(struct vm_region),
    .ob_header_offset = offsetof(struct vm_region, vr_base),
    .ob_ops = {
        .destroy = region_object_destroy,
        .destroy_recurse = region_object_destroy_recurse,
    },
};

static struct vm_cache mapping_cache = {
    .c_name = "vm-region-mapping",
    .c_obj_size = sizeof(struct vm_region_mapping),
};

/*** INTERNAL UTILITY FUNCTIONS ***********************************************/

static struct vm_region *region_from_entry(struct vm_region_entry *entry)
{
    if (!entry || entry->e_type != VM_REGION_ENTRY_REGION) {
        return NULL;
    }
    return BTREE_CONTAINER(struct vm_region, vr_entry, entry);
}

static struct vm_region_mapping *mapping_from_entry(
    struct vm_region_entry *entry)
{
    if (!entry || entry->e_type != VM_REGION_ENTRY_MAPPING) {
        return NULL;
    }
    return BTREE_CONTAINER(struct vm_region_mapping, m_entry, entry);
}

static kern_status_t region_object_destroy(struct object *obj,
    struct queue *q)
{
    struct vm_region *region = VM_REGION_CAST(obj);
    if (region->vr_status == VM_REGION_ONLINE) {
        panic("last reference closed on an online vm-region");
    }
    struct btree_node *node = btree_first(&region->vr_entries);
    while (node) {
        struct btree_node *next = btree_next(node);
        btree_delete(&region->vr_entries, node);
        struct vm_region_entry *entry =
            BTREE_CONTAINER(struct vm_region_entry, e_node, node);
        if (entry->e_type != VM_REGION_ENTRY_REGION) {
            panic("offline vm-region still contains non-region "
                  "children.");
        }
        queue_push_back(q, &entry->e_entry);
        node = next;
    }
    return KERN_OK;
}

static kern_status_t region_object_destroy_recurse(
    struct queue_entry *entry, struct object **out)
{
    struct vm_region_entry *region_entry =
        BTREE_CONTAINER(struct vm_region_entry, e_entry, entry);
    if (region_entry->e_type != VM_REGION_ENTRY_REGION) {
        panic("offline vm-region still contains non-region "
              "children.");
    }
    struct vm_region *region = region_from_entry(region_entry);
    *out = &region->vr_base;
    return KERN_OK;
}

static virt_addr_t entry_absolute_address(const struct vm_region_entry *entry)
{
    return entry->e_address;
}

/* this function must be called with `parent` locked */
static void region_put_entry(
    struct vm_region *parent, struct vm_region_entry *child)
{
    struct btree_node *cur = parent->vr_entries.b_root;
    if (!cur) {
        parent->vr_entries.b_root = &child->e_node;
        btree_insert_fixup(&parent->vr_entries, &child->e_node);
        return;
    }
    off_t child_base = child->e_offset;
    off_t child_limit = child_base + child->e_size - 1;
    while (cur) {
        struct vm_region_entry *cur_entry =
            BTREE_CONTAINER(struct vm_region_entry, e_node, cur);
        struct btree_node *next = NULL;
        off_t cur_base = cur_entry->e_offset;
        off_t cur_limit = cur_base + cur_entry->e_size - 1;
        if (child_limit < cur_base) {
            next = btree_left(cur);
        } else if (child_base > cur_limit) {
            next = btree_right(cur);
        } else {
#ifdef TRACE
            vm_region_dump(parent);
#endif
            panic("tried to add an overlapping entry [%zx-%zx] to "
                  "vm-region (overlaps [%zx-%zx])",
                  child_base, child_limit, cur_base, cur_limit);
        }
        if (next) {
            cur = next;
            continue;
        }
        if (child_limit < cur_base) {
            btree_put_left(cur, &child->e_node);
        } else {
            btree_put_right(cur, &child->e_node);
        }
        btree_insert_fixup(&parent->vr_entries, &child->e_node);
        break;
    }
}
/* find the child entry that covers the specified offset.
 * DOES NOT search recursively! */
static struct vm_region_entry *region_get_entry(
    struct vm_region *region, off_t offset, size_t len)
{
    struct btree_node *cur = region->vr_entries.b_root;
    if (!cur) {
        return NULL;
    }
    struct vm_region_entry *result = NULL;
    off_t base = offset, limit = offset + len - 1;
    while (cur) {
        struct vm_region_entry *child =
            BTREE_CONTAINER(struct vm_region_entry, e_node, cur);
        struct btree_node *next = NULL;
        off_t child_base = child->e_offset;
        off_t child_limit = child->e_offset + child->e_size - 1;
        if (limit < child_base) {
            next = btree_left(cur);
        } else if (base > child_limit) {
            next = btree_right(cur);
        } else {
            result = child;
            break;
        }
        cur = next;
    }
    return result;
}

/* find the child region that covers the area [*offp, *offp + len - 1].
 * searches recursively. the value in `offp` is updated to the offset of the
 * area relative to the start of the returned region.
 * this function should be called with `region` locked.
 * the region returned by this function will also be locked. any intermediary
 * regions traversed by this function will be locked temporarily, but will
 * be unlocked by the time the function returns.
 */
static struct vm_region *region_get_child_region_recursive(
    struct vm_region *region, off_t *offp, size_t len)
{
    struct vm_region *root = region;
    off_t offset = *offp;
    if (offset >= region->vr_entry.e_size) {
        return NULL;
    }
    while (1) {
        struct vm_region_entry *next =
            region_get_entry(region, offset, len);
        struct vm_region *next_region = region_from_entry(next);
        if (!next_region) {
            break;
        }
        offset -= next->e_offset;
        /* since `region` is locked, interrupts are already
         * disabled, so don't use lock_irq() here */
        vm_region_lock(next_region);
        if (region != root) {
            vm_region_unlock(region);
        }
        region = next_region;
    }
    *offp = offset;
    return region;
}

/* find the vm_region_mapping that contains a given memory area.
 * `offp` should be a pointer to an off_t value that contains the offset
 * of the area relative to the start of `region`. this value will be updated
 * to the offset of the area relative to the mapping's immediate parent.
 * this function should be called with `region` locked. if a mapping is found,
 * it will be returned with its immediate parent locked. */
static struct vm_region_mapping *region_get_mapping_recursive(
    struct vm_region *region, off_t *offp, size_t len)
{
    off_t offset = *offp;
    region = region_get_child_region_recursive(region, &offset, len);
    if (!region) {
        return NULL;
    }
    /* if `region` is a different region than what was originally passed
     * to us, it has now been locked, and its children can be accessed. */
    struct vm_region_entry *entry = region_get_entry(region, offset, len);
    *offp = offset;
    /* return the mapping with the parent region still locked */
    return mapping_from_entry(entry);
}
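/* illustrative caller sketch (hypothetical; vm_region_demand_map() below
 * follows the same pattern): resolve a region-relative offset to its
 * mapping, then drop the immediate parent's lock if it differs from the
 * region we started from:
 *
 *     off_t off = some_offset;               // relative to `region`
 *     unsigned long flags;
 *     vm_region_lock_irqsave(region, &flags);
 *     struct vm_region_mapping *m =
 *         region_get_mapping_recursive(region, &off, 1);
 *     if (m) {
 *         ... use m; its immediate parent is locked ...
 *         struct vm_region *parent =
 *             region_from_entry(m->m_entry.e_parent);
 *         if (parent != region) {
 *             vm_region_unlock(parent);
 *         }
 *     }
 *     vm_region_unlock_irqrestore(region, flags);
 */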
static off_t generate_random_address(
    off_t area_base, size_t area_length, size_t target_length)
{
    size_t random_range = area_length - target_length;
    /* use an unsigned offset so the modulo below cannot go negative */
    size_t offset = 0;
    if (!random_range) {
        /* the target fills the area exactly */
        return area_base;
    }
    fill_random(&offset, sizeof offset);
    offset %= random_range;
    return area_base + (off_t)offset;
}

static struct vm_region_entry *region_get_random_entry(
    struct vm_region *region)
{
    enum {
        STEP_LEFT = 0,
        STEP_RIGHT = 1,
        STEP_FINISH = 2,
    } step;
    struct btree_node *result = NULL;
    struct btree_node *cur = region->vr_entries.b_root;
    if (!cur) {
        return NULL;
    }
    while (1) {
        unsigned long r;
        fill_random(&r, sizeof r);
        struct btree_node *next = NULL;
        step = r % 3;
        switch (step) {
        case STEP_LEFT:
            next = btree_left(cur);
            break;
        case STEP_RIGHT:
            next = btree_right(cur);
            break;
        case STEP_FINISH:
            result = cur;
            break;
        default:
            return NULL;
        }
        if (!next) {
            result = cur;
            break;
        }
        cur = next;
    }
    if (!result) {
        return NULL;
    }
    return BTREE_CONTAINER(struct vm_region_entry, e_node, result);
}

static off_t region_find_free_area_ex(
    struct vm_region *region, size_t target_length,
    struct btree_node *start, enum search_direction direction,
    bool random)
{
    if (region->vr_entry.e_size < target_length) {
        return INVALID_OFFSET;
    }
    struct btree_node *left_node = NULL, *right_node = NULL;
    switch (direction) {
    case SEARCH_LEFT:
        right_node = start;
        left_node = start ? btree_prev(start) : NULL;
        break;
    case SEARCH_RIGHT:
        left_node = start;
        right_node = start ? btree_next(start) : NULL;
        break;
    default:
        return INVALID_OFFSET;
    }
    if (!left_node && !right_node) {
        return INVALID_OFFSET;
    }
    while (1) {
        struct vm_region_entry *left = BTREE_CONTAINER(
            struct vm_region_entry, e_node, left_node);
        struct vm_region_entry *right = BTREE_CONTAINER(
            struct vm_region_entry, e_node, right_node);
        /* addresses of the first and last free bytes in the area,
         * respectively. */
        off_t area_base, area_limit;
        if (left && right) {
            area_base = left->e_offset + left->e_size;
            area_limit = right->e_offset - 1;
        } else if (right) {
            /* gap before the first entry */
            area_base = region->vr_entry.e_offset;
            area_limit = right->e_offset - 1;
        } else if (left) {
            /* gap after the last entry */
            area_base = left->e_offset + left->e_size;
            area_limit = region->vr_entry.e_offset
                + region->vr_entry.e_size - 1;
        } else {
            return INVALID_OFFSET;
        }
        area_base &= ~VM_PAGE_MASK;
        size_t area_size = 0;
        if (area_limit >= area_base) {
            area_size = area_limit - area_base + 1;
        }
        if (area_size >= target_length) {
            if (random) {
                area_base = generate_random_address(
                    area_base, area_size, target_length);
                area_base &= ~VM_PAGE_MASK;
            }
            return area_base;
        }
        if (direction == SEARCH_RIGHT) {
            if (!right_node) {
                /* already checked the gap after the last
                 * entry; nothing left to search */
                return INVALID_OFFSET;
            }
            left_node = right_node;
            right_node = btree_next(right_node);
        } else {
            if (!left_node) {
                return INVALID_OFFSET;
            }
            right_node = left_node;
            left_node = btree_prev(right_node);
        }
    }
}

static off_t region_find_free_area_linear(
    struct vm_region *region, size_t target_length)
{
    if (region->vr_entry.e_size < target_length) {
        return INVALID_OFFSET;
    }
    if (!region->vr_entries.b_root) {
        return 0;
    }
    return region_find_free_area_ex(
        region, target_length,
        btree_first(&region->vr_entries), SEARCH_RIGHT, false);
}

static off_t region_find_free_area_random(
    struct vm_region *region, size_t target_length)
{
    if (region->vr_entry.e_size < target_length) {
        return INVALID_OFFSET;
    }
    if (!region->vr_entries.b_root) {
        off_t offset = generate_random_address(
            0, region->vr_entry.e_size, target_length);
        return offset & ~VM_PAGE_MASK;
    }
    unsigned int tmp = 0;
    struct vm_region_entry *basis = region_get_random_entry(region);
    fill_random(&tmp, sizeof tmp);
    enum search_direction direction = tmp % 2;
    return region_find_free_area_ex(
        region, target_length, &basis->e_node, direction, true);
}

static bool region_is_area_free(
    const struct vm_region *region, off_t base, size_t len)
{
    off_t limit = base + len - 1;
    if (base >= region->vr_entry.e_size) {
        return false;
    }
    if (limit >= region->vr_entry.e_size) {
        return false;
    }
    struct btree_node *cur = region->vr_entries.b_root;
    if (!cur) {
        return true;
    }
    while (cur) {
        struct vm_region_entry *entry =
            BTREE_CONTAINER(struct vm_region_entry, e_node, cur);
        struct btree_node *next = NULL;
        off_t entry_limit = entry->e_offset + entry->e_size - 1;
        if (base > entry_limit) {
            next = btree_right(cur);
        } else if (limit < entry->e_offset) {
            next = btree_left(cur);
        } else {
            return false;
        }
        cur = next;
    }
    return true;
}
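/* worked example for the free-area search above (illustrative numbers):
 * a 0x10000-byte region with entries at [0x2000-0x2fff] and [0x6000-0x7fff]
 * has three candidate gaps. walking SEARCH_RIGHT from the first entry:
 *
 *     left=[0x2000-0x2fff], right=[0x6000-0x7fff]
 *         -> area_base = 0x3000, area_limit = 0x5fff (0x3000 bytes)
 *     left=[0x6000-0x7fff], right=NULL
 *         -> area_base = 0x8000, area_limit = region end
 *
 * a request for 0x4000 bytes skips the first gap (too small) and lands at
 * 0x8000. with `random` set, generate_random_address() instead picks a
 * page-aligned base somewhere inside the winning gap. */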
static kern_status_t region_validate_allocation(
    struct vm_region *parent, vm_prot_t prot, off_t *offp, size_t len)
{
    off_t offset = *offp;
    if ((prot & parent->vr_prot) != prot) {
        /* child region protection must match or be a
         * subset of parent region protection */
        return KERN_INVALID_ARGUMENT;
    }
    if (offset == VM_REGION_ANY_OFFSET) {
        offset = region_find_free_area(parent, len);
        *offp = offset;
        return (offset == INVALID_OFFSET) ? KERN_NO_MEMORY : KERN_OK;
    }
    offset &= ~VM_PAGE_MASK;
    if (!region_is_area_free(parent, offset, len)) {
        return KERN_INVALID_ARGUMENT;
    }
    *offp = offset;
    return KERN_OK;
}

/* this function should be called with `region` locked */
static void vm_iterator_begin(
    struct vm_iterator *it, struct vm_region *region,
    virt_addr_t base, vm_prot_t prot)
{
    memset(it, 0x0, sizeof *it);
    it->it_base = base;
    it->it_region = region;
    it->it_prot = prot;
    off_t offset = base - vm_region_get_base_address(region);
    it->it_mapping = region_get_mapping_recursive(region, &offset, 1);
    if (!it->it_mapping) {
        return;
    }
    if ((it->it_mapping->m_prot & prot) != prot) {
        /* drop the parent lock acquired by
         * region_get_mapping_recursive() before bailing out */
        struct vm_region *parent =
            region_from_entry(it->it_mapping->m_entry.e_parent);
        if (parent != region) {
            vm_region_unlock(parent);
        }
        it->it_mapping = NULL;
        return;
    }
    off_t object_offset = offset - it->it_mapping->m_entry.e_offset
        + it->it_mapping->m_object_offset;
    struct vm_page *pg = NULL;
    if (prot & VM_PROT_WRITE) {
        pg = vm_object_alloc_page(
            it->it_mapping->m_object, object_offset, VM_PAGE_4K);
    } else {
        pg = vm_object_get_page(
            it->it_mapping->m_object, object_offset);
    }
    if (!pg) {
        struct vm_region *parent =
            region_from_entry(it->it_mapping->m_entry.e_parent);
        if (parent != region) {
            vm_region_unlock(parent);
        }
        it->it_mapping = NULL;
        return;
    }
    void *buffer_base = vm_page_get_vaddr(pg);
    phys_addr_t pg_addr = vm_page_get_paddr(pg);
    size_t buffer_size = vm_page_get_size_bytes(pg);
    /* extend the window over physically contiguous successor pages */
    while (1) {
        struct btree_node *next_node = btree_next(&pg->p_bnode);
        struct vm_page *next =
            BTREE_CONTAINER(struct vm_page, p_bnode, next_node);
        if (!next) {
            break;
        }
        phys_addr_t next_addr = vm_page_get_paddr(next);
        if (pg_addr + vm_page_get_size_bytes(pg) != next_addr) {
            break;
        }
        pg = next;
        pg_addr = next_addr;
        buffer_size += vm_page_get_size_bytes(next);
    }
    it->it_buf = (char *)buffer_base + (object_offset & VM_PAGE_MASK);
    it->it_max = buffer_size - (object_offset & VM_PAGE_MASK);
}
static kern_status_t vm_iterator_seek(struct vm_iterator *it, size_t nr_bytes)
{
#define UNLOCK_MAPPING_PARENT(p) \
    do { \
        struct vm_region *parent \
            = region_from_entry((p)->m_entry.e_parent); \
        if (parent != it->it_region) { \
            vm_region_unlock(parent); \
        } \
    } while (0)

    if (nr_bytes < it->it_max) {
        it->it_base += nr_bytes;
        it->it_buf = (char *)it->it_buf + nr_bytes;
        it->it_max -= nr_bytes;
        return KERN_OK;
    }
    /* the parent region of it->it_mapping is locked here. if it is
     * different from it->it_region, it must be unlocked */
    UNLOCK_MAPPING_PARENT(it->it_mapping);
    it->it_mapping = NULL;
    it->it_base += nr_bytes;
    off_t offset = it->it_base - vm_region_get_base_address(it->it_region);
    struct vm_region_mapping *next_mapping =
        region_get_mapping_recursive(it->it_region, &offset, 1);
    if (!next_mapping) {
        it->it_buf = NULL;
        it->it_max = 0;
        return KERN_MEMORY_FAULT;
    }
    /* past this point, if we encounter an error, we must remember to
     * unlock the parent region of next_mapping */
    if ((next_mapping->m_prot & it->it_prot) != it->it_prot) {
        it->it_buf = NULL;
        it->it_max = 0;
        UNLOCK_MAPPING_PARENT(next_mapping);
        return KERN_MEMORY_FAULT;
    }
    off_t object_offset = offset - next_mapping->m_entry.e_offset
        + next_mapping->m_object_offset;
    struct vm_page *pg = NULL;
    if (it->it_prot & VM_PROT_WRITE) {
        pg = vm_object_alloc_page(
            next_mapping->m_object, object_offset, VM_PAGE_4K);
    } else {
        pg = vm_object_get_page(next_mapping->m_object, object_offset);
    }
    if (!pg) {
        it->it_buf = NULL;
        it->it_max = 0;
        UNLOCK_MAPPING_PARENT(next_mapping);
        return KERN_NO_MEMORY;
    }
    it->it_mapping = next_mapping;
    void *buffer_base = vm_page_get_vaddr(pg);
    phys_addr_t pg_addr = vm_page_get_paddr(pg);
    size_t buffer_size = vm_page_get_size_bytes(pg);
    while (1) {
        struct btree_node *next_node = btree_next(&pg->p_bnode);
        struct vm_page *next =
            BTREE_CONTAINER(struct vm_page, p_bnode, next_node);
        if (!next) {
            break;
        }
        phys_addr_t next_addr = vm_page_get_paddr(next);
        if (pg_addr + vm_page_get_size_bytes(pg) != next_addr) {
            break;
        }
        pg = next;
        pg_addr = next_addr;
        buffer_size += vm_page_get_size_bytes(next);
    }
    it->it_buf = (char *)buffer_base + (object_offset & VM_PAGE_MASK);
    it->it_max = buffer_size - (object_offset & VM_PAGE_MASK);
    return KERN_OK;
#undef UNLOCK_MAPPING_PARENT
}

/* release the lock held on the current mapping's immediate parent, if any.
 * must be called when a caller is finished with a vm_iterator; after a seek
 * that returned an error there is nothing left to release and this is a
 * no-op. */
static void vm_iterator_finish(struct vm_iterator *it)
{
    if (!it->it_mapping) {
        return;
    }
    struct vm_region *parent =
        region_from_entry(it->it_mapping->m_entry.e_parent);
    if (parent != it->it_region) {
        vm_region_unlock(parent);
    }
    it->it_mapping = NULL;
}
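/* illustrative sketch of the iterator protocol above (hypothetical caller;
 * the public vm_region_read_kernel() below follows the same shape):
 *
 *     struct vm_iterator it;
 *     vm_iterator_begin(&it, region, ptr, VM_PROT_READ | VM_PROT_USER);
 *     while (want && it.it_max) {
 *         size_t n = MIN(it.it_max, want);
 *         memmove(buf, it.it_buf, n);    // it_buf: kernel-visible window
 *         if (vm_iterator_seek(&it, n) != KERN_OK) {
 *             break;                     // fault or OOM; locks dropped
 *         }
 *         buf += n;
 *         want -= n;
 *     }
 *     vm_iterator_finish(&it);           // drop the last parent lock
 */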
/* this function must be called with `root` locked. `root` will be the first
 * entry visited by the iterator. from there, child entries are visited in
 * depth-first order. */
static void entry_iterator_begin(
    struct entry_iterator *it, struct vm_region *root)
{
    memset(it, 0x0, sizeof *it);
    it->it_root = root;
    it->it_entry = &root->vr_entry;
}

/* this function must be called when you are finished with an entry_iterator,
 * to ensure that all held locks are released. */
static void entry_iterator_finish(struct entry_iterator *it)
{
    struct vm_region_entry *cur = it->it_entry;
    if (!cur) {
        return;
    }
    struct vm_region *region = NULL;
    if (cur->e_type == VM_REGION_ENTRY_MAPPING) {
        region = region_from_entry(cur->e_parent);
    } else {
        region = region_from_entry(cur);
    }
    while (region && region != it->it_root) {
        struct vm_region *parent =
            region_from_entry(region->vr_entry.e_parent);
        vm_region_unlock(region);
        region = parent;
    }
    memset(it, 0x0, sizeof *it);
}

/* move to the next entry in the traversal order.
 * when this function returns:
 * 1. if the visited entry is a region, it will be locked.
 * 2. if the visited entry is a mapping, its parent region will be locked.
 * a region will remain locked until all of its children and n-grand-children
 * have been visited. once iteration is finished, only `it->it_root` will be
 * locked. */
static void entry_iterator_move_next(struct entry_iterator *it)
{
    /* `region` is locked */
    struct vm_region *region = region_from_entry(it->it_entry);
    bool has_children = (region && !btree_empty(&region->vr_entries));
    if (has_children) {
        /* visit the first child */
        struct btree_node *node = btree_first(&region->vr_entries);
        struct vm_region_entry *entry =
            BTREE_CONTAINER(struct vm_region_entry, e_node, node);
        if (entry->e_type == VM_REGION_ENTRY_REGION) {
            struct vm_region *child_region =
                region_from_entry(entry);
            /* since `region` is locked, interrupts are already
             * disabled, so don't use lock_irq() here */
            vm_region_lock(child_region);
        }
        it->it_depth++;
        it->it_entry = entry;
        return;
    }
    /* go back up until we find a right sibling. */
    struct vm_region_entry *cur = it->it_entry;
    while (1) {
        if (cur == &it->it_root->vr_entry) {
            /* don't walk past the subtree root, even if it has
             * siblings in its own parent's tree */
            it->it_entry = NULL;
            return;
        }
        struct btree_node *sibling = btree_next(&cur->e_node);
        if (cur->e_type == VM_REGION_ENTRY_REGION) {
            /* done with this region's subtree; drop its lock
             * before locking a sibling (rule 1 above) */
            vm_region_unlock(region_from_entry(cur));
        }
        if (sibling) {
            struct vm_region_entry *entry = BTREE_CONTAINER(
                struct vm_region_entry, e_node, sibling);
            struct vm_region *sib = region_from_entry(entry);
            if (sib) {
                /* keep the contract: visited regions are
                 * returned locked */
                vm_region_lock(sib);
            }
            it->it_entry = entry;
            return;
        }
        struct vm_region_entry *parent_entry = cur->e_parent;
        if (!region_from_entry(parent_entry)) {
            it->it_entry = NULL;
            return;
        }
        it->it_depth--;
        cur = parent_entry;
    }
}
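/* illustrative traversal sketch (hypothetical caller; vm_region_kill() and
 * vm_region_dump() below follow this pattern). `root` must be locked by the
 * caller:
 *
 *     struct entry_iterator it;
 *     entry_iterator_begin(&it, root);
 *     while (it.it_entry) {
 *         // regions arrive locked; mappings arrive with their
 *         // immediate parent region locked
 *         entry_iterator_move_next(&it);
 *     }
 *     entry_iterator_finish(&it);    // release anything still held
 */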
/* erase the current entry and move to the next entry in the traversal order.
 * the current entry MUST be a mapping, otherwise nothing will happen. */
static void entry_iterator_erase(struct entry_iterator *it)
{
    /* the parent region of `mapping` is locked */
    struct vm_region_mapping *mapping = mapping_from_entry(it->it_entry);
    if (!mapping) {
        return;
    }
    struct vm_region_entry *parent_entry = mapping->m_entry.e_parent;
    struct vm_region *mapping_parent = region_from_entry(parent_entry);
    /* find the successor BEFORE unlinking and freeing the mapping, and
     * don't touch the entry afterwards */
    struct btree_node *sibling = btree_next(&mapping->m_entry.e_node);
    btree_delete(&mapping_parent->vr_entries, &mapping->m_entry.e_node);
    vm_cache_free(&mapping_cache, mapping);
    if (sibling) {
        struct vm_region_entry *entry = BTREE_CONTAINER(
            struct vm_region_entry, e_node, sibling);
        struct vm_region *sib = region_from_entry(entry);
        if (sib) {
            vm_region_lock(sib);
        }
        it->it_entry = entry;
        return;
    }
    /* no right sibling: resume the upward walk from the erased
     * mapping's parent */
    it->it_depth--;
    struct vm_region_entry *cur = parent_entry;
    while (1) {
        if (cur == &it->it_root->vr_entry) {
            it->it_entry = NULL;
            return;
        }
        struct btree_node *next = btree_next(&cur->e_node);
        if (cur->e_type == VM_REGION_ENTRY_REGION) {
            vm_region_unlock(region_from_entry(cur));
        }
        if (next) {
            struct vm_region_entry *entry = BTREE_CONTAINER(
                struct vm_region_entry, e_node, next);
            struct vm_region *sib = region_from_entry(entry);
            if (sib) {
                vm_region_lock(sib);
            }
            it->it_entry = entry;
            return;
        }
        struct vm_region_entry *up = cur->e_parent;
        if (!region_from_entry(up)) {
            it->it_entry = NULL;
            return;
        }
        it->it_depth--;
        cur = up;
    }
}

static void mapping_iterator_begin(
    struct entry_iterator *it, struct vm_region *root,
    off_t offset, size_t length, off_t *offp)
{
    entry_iterator_begin(it, root);
    while (it->it_entry) {
        off_t base = entry_absolute_address(it->it_entry)
            - root->vr_entry.e_offset;
        off_t limit = base + it->it_entry->e_size - 1;
        if (it->it_entry->e_type == VM_REGION_ENTRY_MAPPING) {
            /* stop at the first mapping that overlaps
             * [offset, offset + length - 1] */
            if (offset <= limit && offset + length - 1 >= base) {
                *offp = base;
                return;
            }
        }
        entry_iterator_move_next(it);
    }
}

static void mapping_iterator_finish(struct entry_iterator *it)
{
    entry_iterator_finish(it);
}

static void mapping_iterator_move_next(
    struct entry_iterator *it, off_t offset, size_t length, off_t *offp)
{
    do {
        entry_iterator_move_next(it);
    } while (it->it_entry
             && it->it_entry->e_type != VM_REGION_ENTRY_MAPPING);
    if (!it->it_entry) {
        return;
    }
    off_t base = entry_absolute_address(it->it_entry)
        - it->it_root->vr_entry.e_offset;
    if (base >= offset + length) {
        /* past the end of the range; release any locks still held
         * on this branch of the tree */
        entry_iterator_finish(it);
    } else {
        *offp = base;
    }
}

static void mapping_iterator_erase(
    struct entry_iterator *it, off_t offset, size_t length, off_t *offp)
{
    entry_iterator_erase(it);
    while (it->it_entry
           && it->it_entry->e_type != VM_REGION_ENTRY_MAPPING) {
        entry_iterator_move_next(it);
    }
    if (!it->it_entry) {
        return;
    }
    off_t base = entry_absolute_address(it->it_entry)
        - it->it_root->vr_entry.e_offset;
    if (base >= offset + length) {
        entry_iterator_finish(it);
    } else {
        *offp = base;
    }
}
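/* illustrative sketch of the mapping-iterator protocol (hypothetical caller;
 * vm_region_unmap() below is the real consumer). `region` must be locked:
 *
 *     struct entry_iterator it;
 *     off_t base = 0;
 *     mapping_iterator_begin(&it, region, off, len, &base);
 *     while (it.it_entry) {
 *         struct vm_region_mapping *m = mapping_from_entry(it.it_entry);
 *         // `base` is m's offset relative to `region`
 *         mapping_iterator_move_next(&it, off, len, &base);
 *     }
 *     mapping_iterator_finish(&it);
 */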
/*** PUBLIC API ***************************************************************/

kern_status_t vm_region_type_init(void)
{
    vm_cache_init(&mapping_cache);
    return object_type_register(&vm_region_type);
}

struct vm_region *vm_region_cast(struct object *obj)
{
    return VM_REGION_CAST(obj);
}

/* this function should be called with `parent` locked (if parent is
 * non-NULL) */
kern_status_t vm_region_create(
    struct vm_region *parent, const char *name, size_t name_len,
    off_t offset, size_t region_len, vm_prot_t prot,
    struct vm_region **out)
{
    if (parent && parent->vr_status != VM_REGION_ONLINE) {
        return KERN_BAD_STATE;
    }
    if (!offset || !region_len) {
        return KERN_INVALID_ARGUMENT;
    }
    if (region_len & VM_PAGE_MASK) {
        /* round the length up to a whole number of pages */
        region_len &= ~VM_PAGE_MASK;
        region_len += VM_PAGE_SIZE;
    }
    kern_status_t status = KERN_OK;
    if (parent) {
        status = region_validate_allocation(
            parent, prot, &offset, region_len);
    }
    if (status != KERN_OK) {
        return status;
    }
    struct object *region_object = object_create(&vm_region_type);
    if (!region_object) {
        return KERN_NO_MEMORY;
    }
    struct vm_region *region = VM_REGION_CAST(region_object);
    region->vr_status = VM_REGION_ONLINE;
    region->vr_prot = prot;
    region->vr_entry.e_type = VM_REGION_ENTRY_REGION;
    region->vr_entry.e_address = offset;
    region->vr_entry.e_offset = offset;
    region->vr_entry.e_size = region_len;
#ifdef TRACE
    tracek("creating sub-region at [%llx-%llx]",
           offset, offset + region_len);
#endif
    if (parent) {
        region->vr_entry.e_parent = &parent->vr_entry;
        region->vr_entry.e_address += parent->vr_entry.e_address;
        region->vr_pmap = parent->vr_pmap;
        region_put_entry(parent, &region->vr_entry);
        /* `parent` holds a reference to child `region` */
        object_ref(&region->vr_base);
    }
    if (name && name_len) {
        name_len = MIN(sizeof region->vr_name - 1, name_len);
        memcpy(region->vr_name, name, name_len);
        region->vr_name[name_len] = '\0';
    }
    *out = region;
    return KERN_OK;
}
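/* illustrative sketch (hypothetical names): carving a read/write sub-region
 * out of an address space, letting the allocator pick the placement:
 *
 *     struct vm_region *heap = NULL;
 *     unsigned long flags;
 *     vm_region_lock_irqsave(space, &flags);
 *     kern_status_t status = vm_region_create(
 *         space, "heap", 4, VM_REGION_ANY_OFFSET, 16 * VM_PAGE_SIZE,
 *         VM_PROT_READ | VM_PROT_WRITE | VM_PROT_USER, &heap);
 *     vm_region_unlock_irqrestore(space, flags);
 */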
kern_status_t vm_region_kill(
    struct vm_region *region, unsigned long *lock_flags)
{
    if (region->vr_status != VM_REGION_ONLINE) {
        return KERN_BAD_STATE;
    }
    if (region->vr_entry.e_parent) {
        struct vm_region *parent =
            region_from_entry(region->vr_entry.e_parent);
        region->vr_entry.e_parent = NULL;
        /* locks must be acquired in parent->child order. since we're
         * going backwards here, unlock `region` before locking its
         * parent */
        vm_region_unlock_irqrestore(region, *lock_flags);
        vm_region_lock_irqsave(parent, lock_flags);
        btree_delete(&parent->vr_entries, &region->vr_entry.e_node);
        vm_region_unlock_irqrestore(parent, *lock_flags);
        vm_region_lock_irqsave(region, lock_flags);
        /* `region` lock is held, and e_parent is NULL */
    }
    struct entry_iterator it;
    entry_iterator_begin(&it, region);
    while (it.it_entry) {
        if (it.it_entry->e_type == VM_REGION_ENTRY_REGION) {
            struct vm_region *child =
                region_from_entry(it.it_entry);
            child->vr_status = VM_REGION_DEAD;
            entry_iterator_move_next(&it);
            continue;
        }
        struct vm_region_mapping *mapping =
            mapping_from_entry(it.it_entry);
        virt_addr_t base = entry_absolute_address(it.it_entry);
        for (size_t i = 0; i < mapping->m_entry.e_size;
             i += VM_PAGE_SIZE) {
            pmap_remove(region->vr_pmap, base + i);
        }
        unsigned long flags;
        vm_object_lock_irqsave(mapping->m_object, &flags);
        queue_delete(&mapping->m_object->vo_mappings,
                     &mapping->m_object_entry);
        vm_object_unlock_irqrestore(mapping->m_object, flags);
        entry_iterator_erase(&it);
    }
    return KERN_OK;
}

kern_status_t vm_region_map_object(
    struct vm_region *root, off_t region_offset,
    struct vm_object *object, off_t object_offset,
    size_t length, vm_prot_t prot, virt_addr_t *out)
{
    if (!root || !object) {
        return KERN_INVALID_ARGUMENT;
    }
    object_offset &= ~VM_PAGE_MASK;
    if (region_offset != VM_REGION_ANY_OFFSET) {
        /* expand [region_offset, region_offset + length) to whole
         * page boundaries */
        off_t limit = region_offset + length;
        if (region_offset & VM_PAGE_MASK) {
            region_offset &= ~VM_PAGE_MASK;
        }
        if (limit & VM_PAGE_MASK) {
            limit &= ~VM_PAGE_MASK;
            limit += VM_PAGE_SIZE;
        }
        length = limit - region_offset;
    }
    if (length & VM_PAGE_MASK) {
        length &= ~VM_PAGE_MASK;
        length += VM_PAGE_SIZE;
    }
    struct vm_region *region = root;
    if (region_offset != VM_REGION_ANY_OFFSET) {
        region = region_get_child_region_recursive(
            root, &region_offset, length);
        if (!region) {
            return KERN_INVALID_ARGUMENT;
        }
        /* if `region` != `root`, it is now locked and will need to
         * be unlocked before returning */
    }
    kern_status_t status = KERN_INVALID_ARGUMENT;
    if (region->vr_status != VM_REGION_ONLINE) {
        status = KERN_BAD_STATE;
        goto fail;
    }
    if ((prot & region->vr_prot) != prot) {
        goto fail;
    }
    if ((prot & object->vo_prot) != prot) {
        goto fail;
    }
    if (!length || object_offset + length > object->vo_size) {
        goto fail;
    }
    if (region_offset == VM_REGION_ANY_OFFSET) {
        region_offset = region_find_free_area(region, length);
        if (region_offset == INVALID_OFFSET) {
            status = KERN_NO_MEMORY;
            goto fail;
        }
    } else if (!region_is_area_free(region, region_offset, length)) {
        goto fail;
    }
    tracek("vm_region_map_object(%s, %zx, %s, %zx, %zx, %x, %p)",
           region->vr_name, region_offset, object->vo_name,
           object_offset, length, prot, out);
    struct vm_region_mapping *mapping =
        vm_cache_alloc(&mapping_cache, VM_NORMAL);
    if (!mapping) {
        status = KERN_NO_MEMORY;
        goto fail;
    }
    mapping->m_object = object;
    mapping->m_prot = prot;
    mapping->m_object_offset = object_offset;
    mapping->m_entry.e_type = VM_REGION_ENTRY_MAPPING;
    mapping->m_entry.e_parent = &region->vr_entry;
    mapping->m_entry.e_address = region->vr_entry.e_address
        + region_offset;
    mapping->m_entry.e_offset = region_offset;
    mapping->m_entry.e_size = length;
#ifdef TRACE
    virt_addr_t abs_base = entry_absolute_address(&mapping->m_entry);
    tracek("mapping %s at [%llx-%llx]", object->vo_name,
           abs_base, abs_base + length);
#endif
    region_put_entry(region, &mapping->m_entry);
    if (region != root) {
        vm_region_unlock(region);
    }
    unsigned long lock_flags;
    vm_object_lock_irqsave(object, &lock_flags);
    queue_push_back(&object->vo_mappings, &mapping->m_object_entry);
    vm_object_unlock_irqrestore(object, lock_flags);
    if (out) {
        *out = entry_absolute_address(&mapping->m_entry);
    }
    return KERN_OK;

fail:
    if (region != root) {
        vm_region_unlock(region);
    }
    return status;
}
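/* illustrative sketch (hypothetical names): mapping the first 8 pages of a
 * vm-object wherever the region has room, then tearing the mapping down:
 *
 *     virt_addr_t base = 0;
 *     unsigned long flags;
 *     vm_region_lock_irqsave(region, &flags);
 *     kern_status_t status = vm_region_map_object(
 *         region, VM_REGION_ANY_OFFSET, obj, 0,
 *         8 * VM_PAGE_SIZE, VM_PROT_READ | VM_PROT_USER, &base);
 *     if (status == KERN_OK) {
 *         ...
 *         vm_region_unmap(region,
 *                         base - vm_region_get_base_address(region),
 *                         8 * VM_PAGE_SIZE);
 *     }
 *     vm_region_unlock_irqrestore(region, flags);
 */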
/* unmap some pages in the middle of a mapping, splitting it into two
 * separate mappings */
static kern_status_t split_mapping(
    struct vm_region_mapping *mapping, struct vm_region *root,
    off_t mapping_offset, off_t unmap_offset, off_t unmap_limit)
{
    tracek("split mapping [%zx-%zx] subtract [%zx-%zx]",
           mapping_offset, mapping_offset + mapping->m_entry.e_size,
           unmap_offset, unmap_limit);
    off_t mapping_limit = mapping_offset + mapping->m_entry.e_size;
    struct vm_region *parent =
        region_from_entry(mapping->m_entry.e_parent);
    struct vm_region_mapping *left = mapping;
    struct vm_region_mapping *right =
        vm_cache_alloc(&mapping_cache, VM_NORMAL);
    if (!right) {
        return KERN_NO_MEMORY;
    }
    off_t left_offset = mapping->m_entry.e_offset;
    /* the right half starts at unmap_limit, expressed relative to the
     * same parent as the left half */
    off_t right_offset = left_offset + (unmap_limit - mapping_offset);
    off_t left_object_offset = mapping->m_object_offset;
    size_t left_length = unmap_offset - mapping_offset;
    size_t right_length = mapping_limit - unmap_limit;
    off_t right_object_offset = mapping->m_object_offset
        + mapping->m_entry.e_size - right_length;
    tracek("mapping=[%zx-%zx]->[%zx-%zx]",
           mapping_offset, mapping_limit, mapping->m_object_offset,
           mapping->m_object_offset + mapping->m_entry.e_size);
    tracek("left=[%zx-%zx]->[%zx-%zx], right=[%zx-%zx]->[%zx-%zx]",
           left_offset, left_offset + left_length,
           left_object_offset, left_object_offset + left_length,
           right_offset, right_offset + right_length,
           right_object_offset, right_object_offset + right_length);
    left->m_object_offset = left_object_offset;
    left->m_entry.e_offset = left_offset;
    left->m_entry.e_size = left_length;
    right->m_object = left->m_object;
    right->m_prot = left->m_prot;
    right->m_entry.e_type = VM_REGION_ENTRY_MAPPING;
    right->m_entry.e_parent = left->m_entry.e_parent;
    /* the absolute address moves by the same amount as the offset */
    right->m_entry.e_address = left->m_entry.e_address
        + (unmap_limit - mapping_offset);
    right->m_object_offset = right_object_offset;
    right->m_entry.e_offset = right_offset;
    right->m_entry.e_size = right_length;
    virt_addr_t unmap_base = root->vr_entry.e_offset + unmap_offset;
    size_t unmap_length = unmap_limit - unmap_offset;
    for (size_t i = 0; i < unmap_length; i += VM_PAGE_SIZE) {
        tracek("unmapping %zx", unmap_base + i);
        pmap_remove(root->vr_pmap, unmap_base + i);
    }
    region_put_entry(parent, &right->m_entry);
    return KERN_OK;
}

/* unmap some pages from the left side of a mapping to somewhere in the
 * middle. */
static kern_status_t left_reduce_mapping(
    struct vm_region_mapping *mapping, struct vm_region *root,
    off_t mapping_offset, off_t unmap_offset, off_t unmap_limit)
{
    /* unmap_limit falls somewhere between mapping_offset and
     * mapping_offset+length */
    tracek("left reduce mapping [%zx-%zx] subtract [%zx-%zx]",
           mapping_offset, mapping_offset + mapping->m_entry.e_size,
           unmap_offset, unmap_limit);
    virt_addr_t base = root->vr_entry.e_offset + mapping_offset;
    off_t limit = mapping_offset + mapping->m_entry.e_size;
    size_t length = mapping->m_entry.e_size - (limit - unmap_limit);
    tracek(" unmapping %zx-%zx (%zx bytes)", base, base + length, length);
    for (size_t i = 0; i < length; i += VM_PAGE_SIZE) {
        pmap_remove(root->vr_pmap, base + i);
    }
    mapping->m_entry.e_offset += length;
    mapping->m_entry.e_address += length;
    mapping->m_object_offset += length;
    mapping->m_entry.e_size -= length;
    return KERN_OK;
}

/* unmap some pages from the middle of a mapping to the right side. */
static kern_status_t right_reduce_mapping(
    struct vm_region_mapping *mapping, struct vm_region *root,
    off_t mapping_offset, off_t unmap_offset, off_t unmap_limit)
{
    /* unmap_offset falls somewhere between mapping_offset and
     * mapping_offset+length */
    tracek("right reduce mapping [%zx-%zx] subtract [%zx-%zx]",
           mapping_offset, mapping_offset + mapping->m_entry.e_size,
           unmap_offset, unmap_limit);
    virt_addr_t base = root->vr_entry.e_offset + unmap_offset;
    off_t limit = mapping_offset + mapping->m_entry.e_size;
    size_t length = limit - unmap_offset;
    tracek(" unmapping %zx-%zx (%zx bytes)", base, base + length, length);
    for (size_t i = 0; i < length; i += VM_PAGE_SIZE) {
        pmap_remove(root->vr_pmap, base + i);
    }
    mapping->m_entry.e_size -= length;
    return KERN_OK;
}
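/* worked example for the unmap helpers (illustrative numbers): given a
 * mapping covering [0x4000-0x7fff]:
 *
 *     unmap [0x5000-0x5fff] -> split:        left [0x4000-0x4fff],
 *                                            right [0x6000-0x7fff]
 *     unmap [0x3000-0x4fff] -> left-reduce:  mapping becomes [0x5000-0x7fff]
 *     unmap [0x7000-0x8fff] -> right-reduce: mapping becomes [0x4000-0x6fff]
 *     unmap [0x4000-0x7fff] -> delete:       mapping removed entirely
 *
 * vm_region_unmap() below classifies each overlapping mapping into exactly
 * one of these cases. */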
/* completely unmap and delete an entire mapping */
static kern_status_t delete_mapping(
    struct vm_region_mapping *mapping, struct vm_region *root,
    off_t mapping_offset)
{
    virt_addr_t base = root->vr_entry.e_offset + mapping_offset;
    tracek("delete mapping [%zx-%zx]",
           base, base + mapping->m_entry.e_size);
    for (size_t i = 0; i < mapping->m_entry.e_size; i += VM_PAGE_SIZE) {
        pmap_remove(root->vr_pmap, base + i);
    }
    unsigned long flags;
    vm_object_lock_irqsave(mapping->m_object, &flags);
    queue_delete(&mapping->m_object->vo_mappings,
                 &mapping->m_object_entry);
    vm_object_unlock_irqrestore(mapping->m_object, flags);
    /* don't actually free the mapping yet. that will be done by the
     * iterator erase in vm_region_unmap */
    return KERN_OK;
}

kern_status_t vm_region_unmap(
    struct vm_region *region, off_t unmap_area_offset,
    size_t unmap_area_length)
{
    if (region->vr_status != VM_REGION_ONLINE) {
        return KERN_BAD_STATE;
    }
    kern_status_t status = KERN_OK;
    struct entry_iterator it;
    off_t unmap_area_limit = unmap_area_offset + unmap_area_length;
    tracek("unmapping %zx-%zx", unmap_area_offset, unmap_area_limit);
    off_t tmp = 0;
    mapping_iterator_begin(
        &it, region, unmap_area_offset, unmap_area_length, &tmp);
    while (it.it_entry) {
        struct vm_region_mapping *mapping =
            mapping_from_entry(it.it_entry);
        off_t mapping_offset = tmp;
        off_t mapping_limit = mapping_offset + it.it_entry->e_size;
        bool split = (unmap_area_offset > mapping_offset
                      && unmap_area_limit < mapping_limit);
        bool delete = (unmap_area_offset <= mapping_offset
                       && unmap_area_limit >= mapping_limit);
        bool left_reduce = (unmap_area_offset <= mapping_offset
                            && unmap_area_limit < mapping_limit);
        bool right_reduce = (unmap_area_offset > mapping_offset
                             && unmap_area_limit >= mapping_limit);
        if (split) {
            status = split_mapping(
                mapping, region, mapping_offset,
                unmap_area_offset, unmap_area_limit);
            /* both halves of the split stay mapped; do NOT erase
             * the (now shrunken) left mapping */
        } else if (delete) {
            status = delete_mapping(mapping, region, mapping_offset);
        } else if (left_reduce) {
            status = left_reduce_mapping(
                mapping, region, mapping_offset,
                unmap_area_offset, unmap_area_limit);
        } else if (right_reduce) {
            status = right_reduce_mapping(
                mapping, region, mapping_offset,
                unmap_area_offset, unmap_area_limit);
        } else {
            panic("don't know what to do with this mapping");
        }
        if (delete) {
            mapping_iterator_erase(
                &it, unmap_area_offset, unmap_area_length, &tmp);
        } else {
            mapping_iterator_move_next(
                &it, unmap_area_offset, unmap_area_length, &tmp);
        }
        if (status != KERN_OK) {
            break;
        }
    }
    mapping_iterator_finish(&it);
    return status;
}

bool vm_region_validate_access(
    struct vm_region *region, virt_addr_t ptr, size_t len,
    vm_prot_t prot)
{
    if (region->vr_status != VM_REGION_ONLINE) {
        return false;
    }
    if (len == 0) {
        return true;
    }
    if (ptr < region->vr_entry.e_offset) {
        return false;
    }
    off_t offset = ptr - region->vr_entry.e_offset;
    if (len > region->vr_entry.e_size) {
        return false;
    }
    if (offset + len > region->vr_entry.e_size) {
        return false;
    }
    offset &= ~VM_PAGE_MASK;
    /* TODO improve this to not require a per-page loop */
    for (off_t i = 0; i < len; i += VM_PAGE_SIZE) {
        off_t x = offset + i;
        struct vm_region_mapping *mapping =
            region_get_mapping_recursive(region, &x, VM_PAGE_SIZE);
        if (!mapping) {
            return false;
        }
        /* drop the parent lock before deciding, so both the
         * success and failure paths release it */
        struct vm_region *parent =
            region_from_entry(mapping->m_entry.e_parent);
        bool allowed = ((mapping->m_prot & prot) == prot);
        if (parent != region) {
            vm_region_unlock(parent);
        }
        if (!allowed) {
            return false;
        }
    }
    return true;
}
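/* illustrative sketch (hypothetical syscall path): checking a user buffer
 * before copying from it, with `region` locked by the caller:
 *
 *     if (!vm_region_validate_access(region, user_ptr, user_len,
 *                                    VM_PROT_READ | VM_PROT_USER)) {
 *         return KERN_MEMORY_FAULT;
 *     }
 *     size_t nr_read = 0;
 *     status = vm_region_read_kernel(region, user_ptr, user_len,
 *                                    kernel_buf, &nr_read);
 */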
/* this function must be called with `region` locked */
kern_status_t vm_region_demand_map(
    struct vm_region *region, virt_addr_t addr,
    enum pmap_fault_flags flags)
{
    if (region->vr_status != VM_REGION_ONLINE) {
        return KERN_BAD_STATE;
    }
    addr &= ~VM_PAGE_MASK;
    if (addr < region->vr_entry.e_offset
        || addr >= region->vr_entry.e_offset + region->vr_entry.e_size) {
        return KERN_NO_ENTRY;
    }
    off_t region_offset = addr - region->vr_entry.e_offset;
    struct vm_region_mapping *mapping =
        region_get_mapping_recursive(region, &region_offset, 1);
    if (!mapping) {
        return KERN_NO_ENTRY;
    }
    off_t object_offset = region_offset - mapping->m_entry.e_offset
        + mapping->m_object_offset;
    tracek("vm: tried to access vm-object %s at offset=%05llx",
           mapping->m_object->vo_name, object_offset);
    unsigned long lock_flags;
    vm_object_lock_irqsave(mapping->m_object, &lock_flags);
    struct vm_page *pg = vm_object_alloc_page(
        mapping->m_object, object_offset, VM_PAGE_4K);
    vm_object_unlock_irqrestore(mapping->m_object, lock_flags);
    struct vm_region *parent =
        region_from_entry(mapping->m_entry.e_parent);
    if (!pg) {
        if (parent != region) {
            vm_region_unlock(parent);
        }
        return KERN_NO_MEMORY;
    }
    tracek("vm: mapping %07llx -> %10llx", vm_page_get_paddr(pg), addr);
    kern_status_t status = pmap_add(
        region->vr_pmap, addr, vm_page_get_pfn(pg),
        mapping->m_prot, PMAP_NORMAL);
    if (parent != region) {
        vm_region_unlock(parent);
    }
    return status;
}

virt_addr_t vm_region_get_base_address(const struct vm_region *region)
{
    if (region->vr_status != VM_REGION_ONLINE) {
        return 0;
    }
    return entry_absolute_address(&region->vr_entry);
}

kern_status_t vm_region_read_kernel(
    struct vm_region *src_region, virt_addr_t src_ptr, size_t count,
    void *destp, size_t *nr_read)
{
    if (src_region->vr_status != VM_REGION_ONLINE) {
        return KERN_BAD_STATE;
    }
    struct vm_iterator src;
    char *dest = destp;
    vm_iterator_begin(
        &src, src_region, src_ptr, VM_PROT_READ | VM_PROT_USER);
    kern_status_t status = KERN_OK;
    size_t r = 0;
    while (r < count && src.it_max) {
        size_t remaining = count - r;
        size_t to_move = MIN(src.it_max, remaining);
        memmove(dest, src.it_buf, to_move);
        status = vm_iterator_seek(&src, to_move);
        if (status != KERN_OK) {
            break;
        }
        r += to_move;
        dest += to_move;
    }
    vm_iterator_finish(&src);
    if (nr_read) {
        *nr_read = r;
    }
    return status;
}

kern_status_t vm_region_memmove(
    struct vm_region *dest_region, virt_addr_t dest_ptr,
    struct vm_region *src_region, virt_addr_t src_ptr,
    size_t count, size_t *nr_moved)
{
    if (src_region->vr_status != VM_REGION_ONLINE) {
        return KERN_BAD_STATE;
    }
    if (dest_region->vr_status != VM_REGION_ONLINE) {
        return KERN_BAD_STATE;
    }
    struct vm_iterator src, dest;
    vm_iterator_begin(
        &src, src_region, src_ptr, VM_PROT_READ | VM_PROT_USER);
    vm_iterator_begin(
        &dest, dest_region, dest_ptr, VM_PROT_WRITE | VM_PROT_USER);
    kern_status_t status = KERN_OK;
    size_t r = 0;
    while (count && src.it_max && dest.it_max) {
        size_t to_move = MIN(MIN(src.it_max, dest.it_max), count);
        memmove(dest.it_buf, src.it_buf, to_move);
        status = vm_iterator_seek(&src, to_move);
        if (status != KERN_OK) {
            break;
        }
        status = vm_iterator_seek(&dest, to_move);
        if (status != KERN_OK) {
            break;
        }
        count -= to_move;
        r += to_move;
    }
    vm_iterator_finish(&src);
    vm_iterator_finish(&dest);
    if (nr_moved) {
        *nr_moved = r;
    }
    return status;
}

kern_status_t vm_region_memmove_v(
    struct vm_region *dest_region, size_t dest_offset,
    struct iovec *dest_vecs, size_t nr_dest_vecs,
    struct vm_region *src_region, size_t src_offset,
    const struct iovec *src_vecs, size_t nr_src_vecs,
    size_t bytes_to_move)
{
    if (src_region->vr_status != VM_REGION_ONLINE) {
        return KERN_BAD_STATE;
    }
    if (dest_region->vr_status != VM_REGION_ONLINE) {
        return KERN_BAD_STATE;
    }
    struct iovec_iterator src, dest;
    iovec_iterator_begin_user(&src, src_region, src_vecs, nr_src_vecs);
    iovec_iterator_begin_user(&dest, dest_region, dest_vecs,
                              nr_dest_vecs);
    iovec_iterator_seek(&src, src_offset);
    iovec_iterator_seek(&dest, dest_offset);
    while (bytes_to_move && src.it_len && dest.it_len) {
        size_t to_move =
            MIN(MIN(src.it_len, dest.it_len), bytes_to_move);
        kern_status_t status = vm_region_memmove(
            dest_region, dest.it_base,
            src_region, src.it_base, to_move, NULL);
        if (status != KERN_OK) {
            return status;
        }
        iovec_iterator_seek(&src, to_move);
        iovec_iterator_seek(&dest, to_move);
        bytes_to_move -= to_move;
    }
    return KERN_OK;
}
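/* illustrative sketch (hypothetical fault-handler path) for the
 * demand-mapping entry point above:
 *
 *     unsigned long irq_flags;
 *     vm_region_lock_irqsave(task_region, &irq_flags);
 *     kern_status_t status = vm_region_demand_map(
 *         task_region, fault_addr, fault_flags);
 *     vm_region_unlock_irqrestore(task_region, irq_flags);
 *     if (status != KERN_OK) {
 *         ... deliver a fault to the task ...
 *     }
 */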
#ifdef TRACE
void vm_region_dump(struct vm_region *region)
{
    char s[128];
    size_t p = 0;
    struct entry_iterator it;
    entry_iterator_begin(&it, region);
    while (it.it_entry) {
        p = 0;
        for (unsigned int i = 0; i < it.it_depth; i++) {
            p += snprintf(s + p, sizeof s - p, "  ");
        }
        switch (it.it_entry->e_type) {
        case VM_REGION_ENTRY_REGION: {
            struct vm_region *child =
                region_from_entry(it.it_entry);
            p += snprintf(
                s + p, sizeof s - p, "-region [%zx-%zx] %s",
                child->vr_entry.e_offset,
                child->vr_entry.e_offset + child->vr_entry.e_size,
                child->vr_name);
            break;
        }
        case VM_REGION_ENTRY_MAPPING: {
            struct vm_region_mapping *mapping =
                mapping_from_entry(it.it_entry);
            p += snprintf(
                s + p, sizeof s - p, "+mapping [%zx-%zx] %s",
                mapping->m_entry.e_offset,
                mapping->m_entry.e_offset + mapping->m_entry.e_size,
                mapping->m_object->vo_name);
            break;
        }
        default:
            break;
        }
        tracek("%s", s);
        entry_iterator_move_next(&it);
    }
}
#endif