syscall: add task_self, task_get_address_space, and vm_region_kill

2026-02-23 18:43:49 +00:00
parent fd1bc0ad5f
commit 5f0654430d
8 changed files with 533 additions and 28 deletions


@@ -8,6 +8,20 @@
#include <kernel/vm-region.h>
#include <mango/status.h>
/* NOTE Locking Rules
* To avoid deadlocks and crashes, the following locking rules should be
* followed:
* 1. Do NOT lock more than one region at a time IF the regions are siblings.
* 2. When locking a region and its child(ren) or ancestors, always lock
* the parent region BEFORE the child region.
* 3. When locking a region and a vm-object mapped into that region, always
* lock the region BEFORE the vm-object.
* 4. An entry MUST be locked before any of its data can be read/written,
* including its children (if it's a region) and its e_parent pointer.
* 5. vm_region_mapping has no lock. Instead, its immediate parent region must
* be locked before any child mappings can be accessed.
*/
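To make rules 2 and 3 concrete, here is a minimal sketch (not part of this commit) of a caller that needs a child region plus a vm-object mapped into it; `parent`, `child`, and `object` are placeholders, and the lock helpers are the ones used throughout this file:

/* rule 2: parent before child; rule 3: region before vm-object */
unsigned long flags, obj_flags;
vm_region_lock_irqsave(parent, &flags);
vm_region_lock(child); /* interrupts already off; plain lock is fine */
vm_object_lock_irqsave(object, &obj_flags);
/* ... read/write child->vr_entries, object->vo_mappings ... */
vm_object_unlock_irqrestore(object, obj_flags);
vm_region_unlock(child);
vm_region_unlock_irqrestore(parent, flags);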
/*** STATIC DATA + MACROS *****************************************************/
#undef ASLR
@@ -49,10 +63,21 @@ enum search_direction {
#define VM_REGION_CAST(p) \
OBJECT_C_CAST(struct vm_region, vr_base, &vm_region_type, p)
static kern_status_t vm_region_object_destroy(struct object *obj);
static kern_status_t region_object_destroy(struct object *obj, struct queue *q);
static kern_status_t region_object_destroy_recurse(
struct queue_entry *entry,
struct object **out);
static struct object_type vm_region_type = {
.ob_name = "vm-region",
.ob_size = sizeof(struct vm_region),
.ob_header_offset = offsetof(struct vm_region, vr_base),
.ob_ops = {
.destroy = region_object_destroy,
.destroy_recurse = region_object_destroy_recurse,
},
};
static struct vm_cache mapping_cache = {
@@ -81,17 +106,53 @@ static struct vm_region_mapping *mapping_from_entry(
return BTREE_CONTAINER(struct vm_region_mapping, m_entry, entry);
}
static virt_addr_t entry_absolute_address(const struct vm_region_entry *entry)
kern_status_t region_object_destroy(struct object *obj, struct queue *q)
{
virt_addr_t result = 0;
while (entry) {
result += entry->e_offset;
entry = entry->e_parent;
struct vm_region *region = VM_REGION_CAST(obj);
if (region->vr_status == VM_REGION_ONLINE) {
panic("last reference closed on an online vm-region");
}
return result;
struct btree_node *node = btree_first(&region->vr_entries);
while (node) {
struct btree_node *next = btree_next(node);
btree_delete(&region->vr_entries, node);
struct vm_region_entry *entry
= BTREE_CONTAINER(struct vm_region_entry, e_node, node);
if (entry->e_type != VM_REGION_ENTRY_REGION) {
panic("offline vm-region still contains non-region "
"children.");
}
queue_push_back(q, &entry->e_entry);
node = next;
}
return KERN_OK;
}
kern_status_t region_object_destroy_recurse(
struct queue_entry *entry,
struct object **out)
{
struct vm_region_entry *region_entry
= BTREE_CONTAINER(struct vm_region_entry, e_entry, entry);
if (region_entry->e_type != VM_REGION_ENTRY_REGION) {
panic("offline vm-region still contains non-region "
"children.");
}
struct vm_region *region = region_from_entry(region_entry);
*out = &region->vr_base;
return KERN_OK;
}
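For context, the two hooks above implement a two-phase teardown: region_object_destroy queues the dying region's surviving child regions, and region_object_destroy_recurse converts each queued entry back into an object for the caller to release. A hypothetical driver loop in the object layer (not shown in this diff; queue_init, queue_pop_front, and object_unref are assumed names) might look like:

/* hypothetical driver; the real one lives in the object layer */
static void object_release_tree(struct object *obj, struct object_type *type)
{
	struct queue q;
	queue_init(&q); /* assumed initializer */
	type->ob_ops.destroy(obj, &q);
	struct queue_entry *qe;
	while ((qe = queue_pop_front(&q)) != NULL) {
		struct object *child;
		type->ob_ops.destroy_recurse(qe, &child);
		object_unref(child); /* may trigger the same teardown */
	}
}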
static virt_addr_t entry_absolute_address(const struct vm_region_entry *entry)
{
return entry->e_address;
}
/* this function must be called with `parent` locked */
static void region_put_entry(
struct vm_region *parent,
struct vm_region_entry *child)
@@ -119,7 +180,15 @@ static void region_put_entry(
} else if (child_base > cur_limit) {
next = btree_right(cur);
} else {
panic("tried to add an overlapping entry to vm-region");
#ifdef TRACE
vm_region_dump(parent);
#endif
panic("tried to add an overlapping entry [%zx-%zx] to "
"vm-region (overlaps [%zx-%zx])",
child_base,
child_limit,
cur_base,
cur_limit);
}
if (next) {
@@ -179,12 +248,17 @@ static struct vm_region_entry *region_get_entry(
/* find the child region that covers the area [*offp,len]. searches recursively.
* the value in `offp` is updated to the offset of the returned entry relative
* to its parent */
* to its parent.
* this function should be called with `region` locked.
* the region returned by this function will also be locked. any intermediary
* regions traversed by this function will be locked temporarily, but will
* be unlocked by the time the function returns. */
static struct vm_region *region_get_child_region_recursive(
struct vm_region *region,
off_t *offp,
size_t len)
{
struct vm_region *root = region;
off_t offset = *offp;
if (offset >= region->vr_entry.e_size) {
return NULL;
@@ -197,6 +271,14 @@ static struct vm_region *region_get_child_region_recursive(
struct vm_region *next_region = region_from_entry(next);
if (next_region) {
offset -= next->e_offset;
/* since `region` is locked, interrupts are already
* disabled, so don't use lock_irq() here */
vm_region_lock(next_region);
if (region != root) {
vm_region_unlock(region);
}
region = next_region;
} else {
break;
@@ -207,6 +289,12 @@ static struct vm_region *region_get_child_region_recursive(
return region;
}
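On the caller side this contract plays out as in vm_region_map_object below: the root stays locked, and the returned child must be unlocked once the caller is done with it, unless it is the root itself (sketch):

/* sketch of the caller-side contract */
off_t off = region_offset;
struct vm_region *child = region_get_child_region_recursive(root, &off, len);
if (child && child != root) {
	/* ... work on `child` with its lock held ... */
	vm_region_unlock(child);
}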
/* find the vm_region_mapping that contains a given memory area.
* `offp` should be a pointer to an off_t value that contains the offset
* of the area relative to the start of `region`. this value will be updated
* to the offset of the mapping relative to its immediate parent.
* this function should be called with `region` locked. if a mapping is found,
* it will be returned with its immediate parent locked. */
static struct vm_region_mapping *region_get_mapping_recursive(
struct vm_region *region,
off_t *offp,
@@ -218,9 +306,12 @@ static struct vm_region_mapping *region_get_mapping_recursive(
return NULL;
}
/* if `region` is a different region than what was originally passed to
* us, it has now been locked, and its children can be accessed. */
struct vm_region_entry *entry = region_get_entry(region, offset, len);
*offp = offset;
/* return the mapping with the parent region still locked */
return mapping_from_entry(entry);
}
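Likewise on the caller side: a returned mapping arrives with its immediate parent locked, and callers such as vm_region_validate_access drop that lock when the parent is not the region they already hold (sketch):

/* sketch mirroring vm_region_validate_access further down */
off_t off = area_offset;
struct vm_region_mapping *m = region_get_mapping_recursive(region, &off, len);
if (m) {
	struct vm_region *parent = region_from_entry(m->m_entry.e_parent);
	/* ... inspect m->m_prot, m->m_object ... */
	if (parent != region) {
		vm_region_unlock(parent);
	}
}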
@@ -488,6 +579,7 @@ static kern_status_t region_validate_allocation(
return KERN_OK;
}
/* this function should be called with `region` locked */
static void vm_iterator_begin(
struct vm_iterator *it,
struct vm_region *region,
@@ -551,6 +643,15 @@ static void vm_iterator_begin(
static kern_status_t vm_iterator_seek(struct vm_iterator *it, size_t nr_bytes)
{
#define UNLOCK_MAPPING_PARENT(p) \
do { \
struct vm_region *parent \
= region_from_entry(p->m_entry.e_parent); \
if (parent != it->it_region) { \
vm_region_unlock(parent); \
} \
} while (0)
if (nr_bytes < it->it_max) {
it->it_base += nr_bytes;
it->it_buf = (char *)it->it_buf + nr_bytes;
@@ -558,6 +659,10 @@ static kern_status_t vm_iterator_seek(struct vm_iterator *it, size_t nr_bytes)
return KERN_OK;
}
/* the parent region of it->it_mapping is locked here. if it is
* different from it->it_region, it must be unlocked */
UNLOCK_MAPPING_PARENT(it->it_mapping);
it->it_base += nr_bytes;
off_t offset = it->it_base - vm_region_get_base_address(it->it_region);
@@ -569,9 +674,13 @@ static kern_status_t vm_iterator_seek(struct vm_iterator *it, size_t nr_bytes)
return KERN_MEMORY_FAULT;
}
/* past this point, if we encounter an error, we must remember to unlock
* the parent region of next_mapping */
if ((next_mapping->m_prot & it->it_prot) != it->it_prot) {
it->it_buf = NULL;
it->it_max = 0;
UNLOCK_MAPPING_PARENT(next_mapping);
return KERN_MEMORY_FAULT;
}
@@ -590,6 +699,7 @@ static kern_status_t vm_iterator_seek(struct vm_iterator *it, size_t nr_bytes)
}
if (!pg) {
UNLOCK_MAPPING_PARENT(next_mapping);
return KERN_NO_MEMORY;
}
@@ -620,6 +730,9 @@ static kern_status_t vm_iterator_seek(struct vm_iterator *it, size_t nr_bytes)
return KERN_OK;
}
/* this function must be called with `root` locked. `root` will be the first
* entry visited by the iterator. from there, child entries are visited in
* depth-first order. */
static void entry_iterator_begin(
struct entry_iterator *it,
struct vm_region *root)
@@ -629,8 +742,43 @@ static void entry_iterator_begin(
it->it_entry = &root->vr_entry;
}
/* this function must be called when you are finished with an entry_iterator,
* to ensure that all held locks are released. */
static void entry_iterator_finish(struct entry_iterator *it)
{
struct vm_region_entry *cur = it->it_entry;
if (!cur) {
return;
}
struct vm_region *region = NULL;
if (cur->e_type == VM_REGION_ENTRY_MAPPING) {
region = region_from_entry(cur->e_parent);
} else {
region = region_from_entry(cur);
}
while (region && region != it->it_root) {
struct vm_region *parent
= region_from_entry(region->vr_entry.e_parent);
vm_region_unlock(region);
region = parent;
}
memset(it, 0x0, sizeof *it);
}
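The canonical traversal, as vm_region_kill below uses it: begin with the root locked, walk depth-first, and finish to release whatever child locks are still held (sketch):

/* sketch of the begin/move_next/finish pattern */
struct entry_iterator it;
entry_iterator_begin(&it, root); /* `root` must already be locked */
while (it.it_entry) {
	/* ... visit it.it_entry ... */
	entry_iterator_move_next(&it);
}
entry_iterator_finish(&it); /* drops any remaining child locks */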
/* move to the next entry in the traversal order.
* when this function returns:
* 1. if the visited entry is a region, it will be locked.
* 2. if the visited entry is a mapping, its parent region will be locked.
* a region will remain locked until all of its descendants have been
* visited. once iteration is finished, only `it->it_root` will be
* locked.
*/
static void entry_iterator_move_next(struct entry_iterator *it)
{
/* `region` is locked */
struct vm_region *region = region_from_entry(it->it_entry);
bool has_children = (region && !btree_empty(&region->vr_entries));
@@ -639,6 +787,15 @@ static void entry_iterator_move_next(struct entry_iterator *it)
struct btree_node *node = btree_first(&region->vr_entries);
struct vm_region_entry *entry
= BTREE_CONTAINER(struct vm_region_entry, e_node, node);
if (entry->e_type == VM_REGION_ENTRY_REGION) {
struct vm_region *child_region
= region_from_entry(entry);
/* since `region` is locked, interrupts are already
* disabled, so don't use lock_irq() here */
vm_region_lock(child_region);
}
it->it_depth++;
it->it_entry = entry;
return;
@@ -670,6 +827,72 @@ static void entry_iterator_move_next(struct entry_iterator *it)
return;
}
if (cur->e_type == VM_REGION_ENTRY_REGION) {
struct vm_region *child_region = region_from_entry(cur);
if (child_region != it->it_root) {
vm_region_unlock(child_region);
}
}
it->it_depth--;
cur = parent_entry;
}
}
/* erase the current entry and move to the next entry in the traversal order.
* the current entry MUST be a mapping, otherwise nothing will happen.
*/
static void entry_iterator_erase(struct entry_iterator *it)
{
/* the parent region of `mapping` is locked */
struct vm_region_mapping *mapping = mapping_from_entry(it->it_entry);
if (!mapping) {
return;
}
struct vm_region *parent = region_from_entry(mapping->m_entry.e_parent);
/* go back up until we find a right sibling. */
struct vm_region_entry *cur = it->it_entry;
while (1) {
struct btree_node *sibling = btree_next(&cur->e_node);
if (mapping) {
btree_delete(
&parent->vr_entries,
&mapping->m_entry.e_node);
vm_cache_free(&mapping_cache, mapping);
mapping = NULL;
}
if (sibling) {
it->it_entry = BTREE_CONTAINER(
struct vm_region_entry,
e_node,
sibling);
return;
}
if (cur == &it->it_root->vr_entry) {
it->it_entry = NULL;
return;
}
struct vm_region_entry *parent_entry = cur->e_parent;
struct vm_region *parent = region_from_entry(parent_entry);
if (!parent) {
it->it_entry = NULL;
return;
}
if (cur->e_type == VM_REGION_ENTRY_REGION) {
struct vm_region *child_region = region_from_entry(cur);
if (child_region != it->it_root) {
vm_region_unlock(child_region);
}
}
it->it_depth--;
cur = parent_entry;
}
@@ -705,6 +928,11 @@ static void mapping_iterator_begin(
}
}
static void mapping_iterator_finish(struct entry_iterator *it)
{
entry_iterator_finish(it);
}
static void mapping_iterator_move_next(
struct entry_iterator *it,
off_t offset,
@@ -730,6 +958,34 @@ static void mapping_iterator_move_next(
}
}
static void mapping_iterator_erase(
struct entry_iterator *it,
off_t offset,
size_t length,
off_t *offp)
{
entry_iterator_erase(it);
while (it->it_entry
&& it->it_entry->e_type != VM_REGION_ENTRY_MAPPING) {
entry_iterator_move_next(it);
}
if (!it->it_entry) {
return;
}
off_t base = entry_absolute_address(it->it_entry)
- it->it_root->vr_entry.e_offset;
if (base >= offset + length) {
it->it_entry = NULL;
} else {
*offp = base;
}
}
/*** PUBLIC API ***************************************************************/
kern_status_t vm_region_type_init(void)
@@ -743,6 +999,8 @@ struct vm_region *vm_region_cast(struct object *obj)
return VM_REGION_CAST(obj);
}
/* this function should be called with `parent` locked (if parent is non-NULL)
*/
kern_status_t vm_region_create(
struct vm_region *parent,
const char *name,
@@ -752,6 +1010,10 @@ kern_status_t vm_region_create(
vm_prot_t prot,
struct vm_region **out)
{
if (parent && parent->vr_status != VM_REGION_ONLINE) {
return KERN_BAD_STATE;
}
if (!offset || !region_len) {
return KERN_INVALID_ARGUMENT;
}
@@ -781,8 +1043,10 @@ kern_status_t vm_region_create(
struct vm_region *region = VM_REGION_CAST(region_object);
region->vr_status = VM_REGION_ONLINE;
region->vr_prot = prot;
region->vr_entry.e_type = VM_REGION_ENTRY_REGION;
region->vr_entry.e_address = offset;
region->vr_entry.e_offset = offset;
region->vr_entry.e_size = region_len;
@@ -794,8 +1058,11 @@ kern_status_t vm_region_create(
if (parent) {
region->vr_entry.e_parent = &parent->vr_entry;
region->vr_entry.e_address += parent->vr_entry.e_address;
region->vr_pmap = parent->vr_pmap;
region_put_entry(parent, &region->vr_entry);
/* `parent` holds a reference to child `region` */
object_ref(&region->vr_base);
}
if (name && name_len) {
@@ -808,8 +1075,67 @@ kern_status_t vm_region_create(
return KERN_OK;
}
kern_status_t vm_region_map_object(
kern_status_t vm_region_kill(
struct vm_region *region,
unsigned long *lock_flags)
{
if (region->vr_status != VM_REGION_ONLINE) {
return KERN_BAD_STATE;
}
if (region->vr_entry.e_parent) {
struct vm_region *parent
= region_from_entry(region->vr_entry.e_parent);
region->vr_entry.e_parent = NULL;
/* locks must be acquired in parent->child order. since we're
* going backwards here, unlock `region` before locking its
* parent */
vm_region_unlock_irqrestore(region, *lock_flags);
vm_region_lock_irqsave(parent, lock_flags);
btree_delete(&parent->vr_entries, &region->vr_entry.e_node);
vm_region_unlock_irqrestore(parent, *lock_flags);
vm_region_lock_irqsave(region, lock_flags);
/* `region` lock is held, and e_parent is NULL */
}
struct entry_iterator it;
entry_iterator_begin(&it, region);
while (it.it_entry) {
if (it.it_entry->e_type == VM_REGION_ENTRY_REGION) {
struct vm_region *region
= region_from_entry(it.it_entry);
region->vr_status = VM_REGION_DEAD;
entry_iterator_move_next(&it);
continue;
}
struct vm_region_mapping *mapping
= mapping_from_entry(it.it_entry);
virt_addr_t base = entry_absolute_address(it.it_entry);
for (size_t i = 0; i < mapping->m_entry.e_size;
i += VM_PAGE_SIZE) {
pmap_remove(region->vr_pmap, base + i);
}
unsigned long flags;
vm_object_lock_irqsave(mapping->m_object, &flags);
queue_delete(
&mapping->m_object->vo_mappings,
&mapping->m_object_entry);
vm_object_unlock_irqrestore(mapping->m_object, flags);
entry_iterator_erase(&it);
}
return KERN_OK;
}
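Judging from the lock_flags parameter, the expected calling convention is that the caller holds the region lock and passes its saved flags, since vm_region_kill may drop and retake the lock to honor the parent-before-child ordering (sketch; the syscall-layer caller lives in another file of this commit):

/* sketch of the expected calling convention */
unsigned long flags;
vm_region_lock_irqsave(region, &flags);
kern_status_t status = vm_region_kill(region, &flags);
vm_region_unlock_irqrestore(region, flags);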
kern_status_t vm_region_map_object(
struct vm_region *root,
off_t region_offset,
struct vm_object *object,
off_t object_offset,
@@ -839,10 +1165,24 @@ kern_status_t vm_region_map_object(
length += VM_PAGE_SIZE;
}
if (!region || !object) {
if (!root || !object) {
return KERN_INVALID_ARGUMENT;
}
struct vm_region *region = root;
if (region_offset != VM_REGION_ANY_OFFSET) {
region = region_get_child_region_recursive(
root,
&region_offset,
length);
/* if `region` != `root`, it will need to be unlocked at the end
* of the function */
}
if (!region) {
	return KERN_INVALID_ARGUMENT;
}
if (region->vr_status != VM_REGION_ONLINE) {
	return KERN_BAD_STATE;
}
if ((prot & region->vr_prot) != prot) {
return KERN_INVALID_ARGUMENT;
}
@@ -855,13 +1195,6 @@ kern_status_t vm_region_map_object(
return KERN_INVALID_ARGUMENT;
}
if (region_offset != VM_REGION_ANY_OFFSET) {
region = region_get_child_region_recursive(
region,
&region_offset,
length);
}
if (!region) {
return KERN_INVALID_ARGUMENT;
}
@@ -896,6 +1229,7 @@ kern_status_t vm_region_map_object(
mapping->m_object_offset = object_offset;
mapping->m_entry.e_type = VM_REGION_ENTRY_MAPPING;
mapping->m_entry.e_parent = &region->vr_entry;
mapping->m_entry.e_address = region->vr_entry.e_address + region_offset;
mapping->m_entry.e_offset = region_offset;
mapping->m_entry.e_size = length;
@@ -907,7 +1241,14 @@ kern_status_t vm_region_map_object(
abs_base + length);
#endif
region_put_entry(region, &mapping->m_entry);
if (region != root) {
vm_region_unlock(region);
}
unsigned long lock_flags;
vm_object_lock_irqsave(object, &lock_flags);
queue_push_back(&object->vo_mappings, &mapping->m_object_entry);
vm_object_unlock_irqrestore(object, lock_flags);
if (out) {
*out = entry_absolute_address(&mapping->m_entry);
@@ -1068,12 +1409,13 @@ static kern_status_t delete_mapping(
pmap_remove(root->vr_pmap, base + i);
}
struct vm_region *parent = region_from_entry(mapping->m_entry.e_parent);
unsigned long flags;
vm_object_lock_irqsave(mapping->m_object, &flags);
queue_delete(&mapping->m_object->vo_mappings, &mapping->m_object_entry);
btree_delete(&parent->vr_entries, &mapping->m_entry.e_node);
vm_object_unlock_irqrestore(mapping->m_object, flags);
vm_cache_free(&mapping_cache, mapping);
/* don't actually delete the mapping yet. that will be done by
* vm_region_unmap */
return KERN_OK;
}
@@ -1083,6 +1425,10 @@ kern_status_t vm_region_unmap(
off_t unmap_area_offset,
size_t unmap_area_length)
{
if (region->vr_status != VM_REGION_ONLINE) {
return KERN_BAD_STATE;
}
kern_status_t status = KERN_OK;
struct entry_iterator it;
off_t unmap_area_limit = unmap_area_offset + unmap_area_length;
@@ -1101,12 +1447,6 @@ kern_status_t vm_region_unmap(
off_t mapping_offset = tmp;
off_t mapping_limit = mapping_offset + it.it_entry->e_size;
mapping_iterator_move_next(
&it,
unmap_area_offset,
unmap_area_length,
&tmp);
bool split
= (unmap_area_offset > mapping_offset
&& unmap_area_limit < mapping_limit);
@@ -1127,6 +1467,7 @@ kern_status_t vm_region_unmap(
mapping_offset,
unmap_area_offset,
unmap_area_limit);
delete = true;
} else if (delete) {
status = delete_mapping(
mapping,
@@ -1150,11 +1491,27 @@ kern_status_t vm_region_unmap(
panic("don't know what to do with this mapping");
}
if (delete) {
mapping_iterator_erase(
&it,
unmap_area_offset,
unmap_area_length,
&tmp);
} else {
mapping_iterator_move_next(
&it,
unmap_area_offset,
unmap_area_length,
&tmp);
}
if (status != KERN_OK) {
break;
}
}
mapping_iterator_finish(&it);
return status;
}
@@ -1164,6 +1521,10 @@ bool vm_region_validate_access(
size_t len,
vm_prot_t prot)
{
if (region->vr_status != VM_REGION_ONLINE) {
return false;
}
if (len == 0) {
return true;
}
@@ -1199,16 +1560,27 @@ bool vm_region_validate_access(
if ((mapping->m_prot & prot) != prot) {
return false;
}
struct vm_region *parent
= region_from_entry(mapping->m_entry.e_parent);
if (parent != region) {
vm_region_unlock(parent);
}
}
return true;
}
/* this function must be called with `region` locked */
kern_status_t vm_region_demand_map(
struct vm_region *region,
virt_addr_t addr,
enum pmap_fault_flags flags)
{
if (region->vr_status != VM_REGION_ONLINE) {
return KERN_BAD_STATE;
}
addr &= ~VM_PAGE_MASK;
if (addr < region->vr_entry.e_offset
|| addr > region->vr_entry.e_offset + region->vr_entry.e_size) {
@@ -1230,21 +1602,35 @@ kern_status_t vm_region_demand_map(
mapping->m_object->vo_name,
object_offset);
unsigned long lock_flags;
vm_object_lock_irqsave(mapping->m_object, &lock_flags);
struct vm_page *pg = vm_object_alloc_page(
mapping->m_object,
object_offset,
VM_PAGE_4K);
vm_object_unlock_irqrestore(mapping->m_object, lock_flags);
tracek("vm: mapping %07llx -> %10llx", vm_page_get_paddr(pg), addr);
return pmap_add(
kern_status_t status = pmap_add(
region->vr_pmap,
addr,
vm_page_get_pfn(pg),
mapping->m_prot,
PMAP_NORMAL);
struct vm_region *parent = region_from_entry(mapping->m_entry.e_parent);
if (parent != region) {
vm_region_unlock(parent);
}
return status;
}
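For context, this is the demand-paging slow path; a hypothetical arch fault handler (not part of this hunk; fault_addr and fault_flags are placeholders) would resolve the faulting task's region, lock it, and call in:

/* hypothetical fault-handler call site */
unsigned long flags;
vm_region_lock_irqsave(region, &flags);
kern_status_t status = vm_region_demand_map(region, fault_addr, fault_flags);
vm_region_unlock_irqrestore(region, flags);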
virt_addr_t vm_region_get_base_address(const struct vm_region *region)
{
if (region->vr_status != VM_REGION_ONLINE) {
return 0;
}
return entry_absolute_address(&region->vr_entry);
}
@@ -1255,6 +1641,10 @@ kern_status_t vm_region_read_kernel(
void *destp,
size_t *nr_read)
{
if (src_region->vr_status != VM_REGION_ONLINE) {
return KERN_BAD_STATE;
}
struct vm_iterator src;
char *dest = destp;
@@ -1296,6 +1686,14 @@ kern_status_t vm_region_memmove(
size_t count,
size_t *nr_moved)
{
if (src_region->vr_status != VM_REGION_ONLINE) {
return KERN_BAD_STATE;
}
if (dest_region->vr_status != VM_REGION_ONLINE) {
return KERN_BAD_STATE;
}
struct vm_iterator src, dest;
vm_iterator_begin(
&src,
@@ -1347,6 +1745,14 @@ extern kern_status_t vm_region_memmove_v(
size_t nr_src_vecs,
size_t bytes_to_move)
{
if (src_region->vr_status != VM_REGION_ONLINE) {
return KERN_BAD_STATE;
}
if (dest_region->vr_status != VM_REGION_ONLINE) {
return KERN_BAD_STATE;
}
struct iovec_iterator src, dest;
iovec_iterator_begin_user(&src, src_region, src_vecs, nr_src_vecs);
iovec_iterator_begin_user(&dest, dest_region, dest_vecs, nr_dest_vecs);