From 59c37052bedfb83b159335006dda59536ec2de3c Mon Sep 17 00:00:00 2001 From: "Li, Xun" Date: Mon, 18 Mar 2024 07:03:44 +0000 Subject: [PATCH] [LibOS] VMA bookkeep calls Pal `mprotect` per VMA VMA bookkeeping records prot of each VMA. However `mprotect` syscall always calls Pal `mprotect` no matter prot changed or not. This commit updates VMA bookkeep `mprotect` to walk through VMAs and only call Pal `mprotect` when prot changed, and also fixes prot mismatch exceptions. Signed-off-by: Li, Xun Signed-off-by: xiaonan-INTC --- libos/src/bookkeep/libos_thread.c | 2 +- libos/src/bookkeep/libos_vma.c | 267 ++++++++++++++---------------- libos/src/libos_init.c | 4 - libos/src/sys/libos_mmap.c | 24 --- 4 files changed, 123 insertions(+), 174 deletions(-) diff --git a/libos/src/bookkeep/libos_thread.c b/libos/src/bookkeep/libos_thread.c index f2db3831a9..1638abdbec 100644 --- a/libos/src/bookkeep/libos_thread.c +++ b/libos/src/bookkeep/libos_thread.c @@ -97,7 +97,7 @@ int alloc_thread_libos_stack(struct libos_thread* thread) { need_mem_free = true; /* Create a stack guard page. */ - ret = PalVirtualMemoryProtect(addr, PAGE_SIZE, /*prot=*/0); + ret = bkeep_mprotect(addr, PAGE_SIZE, /*prot=*/0, /*is_internal=*/true); if (ret < 0) { ret = pal_to_unix_errno(ret); goto unmap; diff --git a/libos/src/bookkeep/libos_vma.c b/libos/src/bookkeep/libos_vma.c index c07a627c5f..2a7d212ecd 100644 --- a/libos/src/bookkeep/libos_vma.c +++ b/libos/src/bookkeep/libos_vma.c @@ -851,151 +851,6 @@ static void vma_update_prot(struct libos_vma* vma, int prot) { } } -static int _vma_bkeep_change(uintptr_t begin, uintptr_t end, int prot, bool is_internal, - struct libos_vma** new_vma_ptr1, struct libos_vma** new_vma_ptr2) { - assert(spinlock_is_locked(&vma_tree_lock)); - assert(IS_ALLOC_ALIGNED_PTR(begin) && IS_ALLOC_ALIGNED_PTR(end)); - assert(begin < end); - - struct libos_vma* vma = _lookup_vma(begin); - if (!vma) { - return -ENOMEM; - } - - struct libos_vma* prev = NULL; - struct libos_vma* first_vma = vma; - - if (begin < vma->begin) { - return -ENOMEM; - } - - bool is_continuous = true; - - while (1) { - if (!!(vma->flags & VMA_INTERNAL) != is_internal) { - return -EACCES; - } - if (prot & PROT_GROWSDOWN) { - if (!(vma->flags & MAP_GROWSDOWN)) { - return -EINVAL; - } - } - if (vma->file && (vma->flags & MAP_SHARED)) { - if (!is_file_prot_matching(vma->file, prot)) { - return -EACCES; - } - } - - if (end <= vma->end) { - break; - } - - prev = vma; - - vma = _get_next_vma(vma); - if (!vma) { - is_continuous = false; - break; - } - - is_continuous &= prev->end == vma->begin; - } - - if (!is_continuous) { - /* When Linux fails with such an error, it still changes permissions of the first - * continuous fragment, but we just return an error. */ - return -ENOMEM; - } - - vma = first_vma; - - /* For PROT_GROWSDOWN we just pretend that `vma->begin == begin`. */ - if (vma->begin < begin && !(prot & PROT_GROWSDOWN)) { - struct libos_vma* new_vma1 = *new_vma_ptr1; - *new_vma_ptr1 = NULL; - - split_vma(vma, new_vma1, begin); - vma_update_prot(new_vma1, prot); - - struct libos_vma* next = _get_next_vma(vma); - - avl_tree_insert(&vma_tree, &new_vma1->tree_node); - - if (end < new_vma1->end) { - struct libos_vma* new_vma2 = *new_vma_ptr2; - *new_vma_ptr2 = NULL; - - split_vma(new_vma1, new_vma2, end); - vma_update_prot(new_vma2, vma->prot); - - avl_tree_insert(&vma_tree, &new_vma2->tree_node); - - return 0; - } - - /* Error checking at the begining ensures we always have the next node. */ - assert(next); - vma = next; - } - - while (vma->end <= end) { - vma_update_prot(vma, prot); - -#ifdef DEBUG - struct libos_vma* prev = vma; -#endif - vma = _get_next_vma(vma); - if (!vma) { - /* We've reached the very last vma. */ - assert(prev->end == end); - return 0; - } - } - - if (end <= vma->begin) { - return 0; - } - - struct libos_vma* new_vma2 = *new_vma_ptr2; - *new_vma_ptr2 = NULL; - - split_vma(vma, new_vma2, end); - vma_update_prot(vma, prot); - - avl_tree_insert(&vma_tree, &new_vma2->tree_node); - - return 0; -} - -int bkeep_mprotect(void* addr, size_t length, int prot, bool is_internal) { - if (!length || !IS_ALLOC_ALIGNED(length) || !IS_ALLOC_ALIGNED_PTR(addr)) { - return -EINVAL; - } - - struct libos_vma* vma1 = alloc_vma(); - if (!vma1) { - return -ENOMEM; - } - struct libos_vma* vma2 = alloc_vma(); - if (!vma2) { - free_vma(vma1); - return -ENOMEM; - } - - spinlock_lock(&vma_tree_lock); - int ret = _vma_bkeep_change((uintptr_t)addr, (uintptr_t)addr + length, prot, is_internal, &vma1, - &vma2); - spinlock_unlock(&vma_tree_lock); - - if (vma1) { - free_vma(vma1); - } - if (vma2) { - free_vma(vma2); - } - - return ret; -} /* TODO consider: * maybe it's worth to keep another tree, complementary to `vma_tree`, that would hold free areas. @@ -1371,6 +1226,128 @@ int madvise_dontneed_range(uintptr_t begin, uintptr_t end) { return ctx.error; } +struct mprotect_ctx { + uintptr_t begin; + uintptr_t end; + int prot; + bool is_internal; + struct libos_vma* new_vma1; + struct libos_vma* new_vma2; + int error; +}; + +static bool mprotect_check_visitor(struct libos_vma* vma, void* visitor_arg) { + struct mprotect_ctx* ctx = visitor_arg; + if (vma->flags & VMA_UNMAPPED) { + ctx->error = -EINVAL; + return false; + } + if ((ctx->is_internal && !(vma->flags & VMA_INTERNAL)) || + (!ctx->is_internal && (vma->flags & VMA_INTERNAL))) { + ctx->error = -EINVAL; + return false; + } + if (ctx->prot & PROT_GROWSDOWN) { + if (!(vma->flags & MAP_GROWSDOWN)) { + ctx->error = -EINVAL; + return false; + } + } + if (vma->file && (vma->flags & MAP_SHARED)) { + if (!is_file_prot_matching(vma->file, ctx->prot)) { + ctx->error = -EACCES; + return false; + } + } + return true; +} + +static bool mprotect_visitor(struct libos_vma* vma, void* visitor_arg) { + struct mprotect_ctx* ctx = (struct mprotect_ctx*)visitor_arg; + if (vma->prot == (ctx->prot & (PROT_READ | PROT_WRITE | PROT_EXEC))) { + return true; + } + + if (vma->begin < ctx->begin && !(ctx->prot & PROT_GROWSDOWN)) { + struct libos_vma* new_vma1 = ctx->new_vma1; + ctx->new_vma1 = NULL; + + split_vma(vma, new_vma1, ctx->begin); + avl_tree_insert(&vma_tree, &new_vma1->tree_node); + vma = new_vma1; + } + + if (ctx->end < vma->end) { + struct libos_vma* new_vma2 = ctx->new_vma2; + ctx->new_vma2 = NULL; + + split_vma(vma, new_vma2, ctx->end); + avl_tree_insert(&vma_tree, &new_vma2->tree_node); + } + + vma_update_prot(vma, ctx->prot); + + int ret = PalVirtualMemoryProtect((void*)vma->begin, vma->end - vma->begin, + LINUX_PROT_TO_PAL(ctx->prot, /*map_flags=*/0)); + if (ret < 0) { + ctx->error = pal_to_unix_errno(ret); + return false; + } + return true; +} + +int bkeep_mprotect(void* addr, size_t length, int prot, bool is_internal) { + struct libos_vma* vma1 = alloc_vma(); + if (!vma1) { + return -ENOMEM; + } + struct libos_vma* vma2 = alloc_vma(); + if (!vma2) { + free_vma(vma1); + return -ENOMEM; + } + + struct mprotect_ctx ctx = { + .begin = (uintptr_t)addr, + .end = (uintptr_t)addr + length, + .prot = prot, + .is_internal = is_internal, + .new_vma1 = vma1, + .new_vma2 = vma2, + .error = 0, + }; + + spinlock_lock(&vma_tree_lock); + + bool is_continuous = _traverse_vmas_in_range(ctx.begin, ctx.end, mprotect_check_visitor, &ctx); + if (!is_continuous) { + spinlock_unlock(&vma_tree_lock); + ctx.error = -ENOMEM; + goto out; + } + if (ctx.error) { + spinlock_unlock(&vma_tree_lock); + goto out; + } + + is_continuous = _traverse_vmas_in_range(ctx.begin, ctx.end, mprotect_visitor, &ctx); + spinlock_unlock(&vma_tree_lock); + if (!is_continuous) { + log_error("Walks through all VMAs failed"); + BUG(); + } + +out: + if (ctx.new_vma1) { + free_vma(ctx.new_vma1); + } + if (ctx.new_vma2) { + free_vma(ctx.new_vma2); + } + + return ctx.error; +} + static bool vma_filter_needs_msync(struct libos_vma* vma, void* arg) { struct libos_handle* hdl = arg; diff --git a/libos/src/libos_init.c b/libos/src/libos_init.c index 332c9f4694..2ca71844de 100644 --- a/libos/src/libos_init.c +++ b/libos/src/libos_init.c @@ -132,10 +132,6 @@ static void* allocate_stack(size_t size, size_t protect_size, bool user) { goto out_fail; } - if (PalVirtualMemoryProtect(stack + protect_size, size, PAL_PROT_READ | PAL_PROT_WRITE) < 0) { - goto out_fail; - } - return stack + protect_size; out_fail:; diff --git a/libos/src/sys/libos_mmap.c b/libos/src/sys/libos_mmap.c index fa04766a63..5c9be6d013 100644 --- a/libos/src/sys/libos_mmap.c +++ b/libos/src/sys/libos_mmap.c @@ -302,35 +302,11 @@ long libos_syscall_mprotect(void* addr, size_t length, int prot) { return -EINVAL; } - /* `bkeep_mprotect` and then `PalVirtualMemoryProtect` is racy, but it's hard to do it properly. - * On the other hand if this race happens, it means user app is buggy, so not a huge problem. */ - ret = bkeep_mprotect(addr, length, prot, /*is_internal=*/false); if (ret < 0) { return ret; } - if (prot & PROT_GROWSDOWN) { - struct libos_vma_info vma_info = {0}; - if (lookup_vma(addr, &vma_info) >= 0) { - assert(vma_info.addr <= addr); - length += addr - vma_info.addr; - addr = vma_info.addr; - if (vma_info.file) { - put_handle(vma_info.file); - } - } else { - log_warning("Memory that was about to be mprotected was unmapped, your program is " - "buggy!"); - return -ENOTRECOVERABLE; - } - } - - ret = PalVirtualMemoryProtect(addr, length, LINUX_PROT_TO_PAL(prot, /*map_flags=*/0)); - if (ret < 0) { - return pal_to_unix_errno(ret); - } - return 0; }