Jiayuan Chen 5fcd551307 arm64: mm: Fix incomplete tag reset in change_memory_common()
Running KASAN KUnit tests with {HW,SW}_TAGS mode triggers a fault in
change_memory_common():

  Call trace:
   change_memory_common+0x168/0x210 (P)
   set_memory_ro+0x20/0x48
   vmalloc_helpers_tags+0xe8/0x338
   kunit_try_run_case+0x74/0x188
   kunit_generic_run_threadfn_adapter+0x30/0x70
   kthread+0x11c/0x200
   ret_from_fork+0x10/0x20
  ---[ end trace 0000000000000000 ]---
      # vmalloc_helpers_tags: try faulted
      not ok 67 vmalloc_helpers_tags

Commit a06494adb7ef ("arm64: mm: use untagged address to calculate page index")
fixed a KASAN warning in the BPF subsystem by adding kasan_reset_tag() to
the index calculation. In the execmem flow:

    bpf_prog_pack_alloc()
      -> bpf_jit_alloc_exec()
        -> execmem_alloc()

The returned address from execmem_vmalloc/execmem_cache_alloc is passed
through kasan_reset_tag(), so start has no tag while area->addr still
retains the original tag. The fix correctly handled this case by resetting
the tag on area->addr:

    (start - (unsigned long)kasan_reset_tag(area->addr)) >> PAGE_SHIFT

However, in normal vmalloc paths, both start and area->addr have matching
tags (or no tags). Resetting only area->addr causes a mismatch when
subtracting a tagged address from an untagged one, resulting in an
incorrect index.

Fix this by resetting tags on both addresses in the index calculation.
This ensures correct results regardless of the tag state of either address.
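Concretely, the index calculation in change_memory_common() becomes:

    ((unsigned long)kasan_reset_tag((void *)start) -
     (unsigned long)kasan_reset_tag(area->addr)) >> PAGE_SHIFT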

Tested with KASAN KUnit tests under CONFIG_KASAN_GENERIC,
CONFIG_KASAN_SW_TAGS, and CONFIG_KASAN_HW_TAGS - all pass. Also verified
the original BPF KASAN warning from [1] is still fixed.

[1] https://lore.kernel.org/all/20251118164115.GA3977565@ax162/
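
As intuition for the failure mode, here is a minimal user-space sketch
(not kernel code: reset_tag() is an illustrative stand-in for arm64's
kasan_reset_tag(), which restores the canonical 0xff top byte, and the
0xf2 tag and addresses are invented; assumes 64-bit pointers):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

/* Stand-in for kasan_reset_tag(): force the top byte to 0xff. */
static uintptr_t reset_tag(uintptr_t addr)
{
	return addr | (0xffULL << 56);
}

int main(void)
{
	/* Hypothetical vmalloc area carrying tag 0xf2 in bits 63:56. */
	uintptr_t area_addr = 0xf2ff800010000000ULL;
	/* execmem hands change_memory_common() an untagged start. */
	uintptr_t start = reset_tag(area_addr) + 3 * 4096;

	/* Wrong: untagged minus tagged, tag bits poison the index. */
	printf("bad idx:  %#llx\n",
	       (unsigned long long)((start - area_addr) >> PAGE_SHIFT));

	/* Right: reset the tag on both operands before subtracting. */
	printf("good idx: %llu\n",
	       (unsigned long long)((reset_tag(start) -
				     reset_tag(area_addr)) >> PAGE_SHIFT));
	return 0;
}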

Fixes: a06494adb7ef ("arm64: mm: use untagged address to calculate page index")
Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2026-01-08 19:47:59 +00:00


// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2014, The Linux Foundation. All rights reserved.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mem_encrypt.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <linux/pagewalk.h>

#include <asm/cacheflush.h>
#include <asm/pgtable-prot.h>
#include <asm/set_memory.h>
#include <asm/tlbflush.h>
#include <asm/kfence.h>

struct page_change_data {
	pgprot_t set_mask;
	pgprot_t clear_mask;
};
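
/*
 * The mm_walk callbacks below apply set_mask/clear_mask from
 * struct page_change_data to each entry: PUD and PMD entries are
 * only rewritten when they are section (block) mappings, PTEs
 * always.
 */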

static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk)
{
	struct page_change_data *masks = walk->private;

	val &= ~(pgprot_val(masks->clear_mask));
	val |= (pgprot_val(masks->set_mask));

	return val;
}

static int pageattr_pud_entry(pud_t *pud, unsigned long addr,
			      unsigned long next, struct mm_walk *walk)
{
	pud_t val = pudp_get(pud);

	if (pud_sect(val)) {
		if (WARN_ON_ONCE((next - addr) != PUD_SIZE))
			return -EINVAL;
		val = __pud(set_pageattr_masks(pud_val(val), walk));
		set_pud(pud, val);
		walk->action = ACTION_CONTINUE;
	}

	return 0;
}

static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
			      unsigned long next, struct mm_walk *walk)
{
	pmd_t val = pmdp_get(pmd);

	if (pmd_sect(val)) {
		if (WARN_ON_ONCE((next - addr) != PMD_SIZE))
			return -EINVAL;
		val = __pmd(set_pageattr_masks(pmd_val(val), walk));
		set_pmd(pmd, val);
		walk->action = ACTION_CONTINUE;
	}

	return 0;
}

static int pageattr_pte_entry(pte_t *pte, unsigned long addr,
			      unsigned long next, struct mm_walk *walk)
{
	pte_t val = __ptep_get(pte);

	val = __pte(set_pageattr_masks(pte_val(val), walk));
	__set_pte(pte, val);

	return 0;
}

static const struct mm_walk_ops pageattr_ops = {
	.pud_entry = pageattr_pud_entry,
	.pmd_entry = pageattr_pmd_entry,
	.pte_entry = pageattr_pte_entry,
};

bool rodata_full __ro_after_init = true;

bool can_set_direct_map(void)
{
	/*
	 * rodata_full, DEBUG_PAGEALLOC and a Realm guest all require linear
	 * map to be mapped at page granularity, so that it is possible to
	 * protect/unprotect single pages.
	 *
	 * KFENCE pool requires page-granular mapping if initialized late.
	 *
	 * Realms need to make pages shared/protected at page granularity.
	 */
	return rodata_full || debug_pagealloc_enabled() ||
		arm64_kfence_can_set_direct_map() || is_realm_world();
}

static int update_range_prot(unsigned long start, unsigned long size,
			     pgprot_t set_mask, pgprot_t clear_mask)
{
	struct page_change_data data;
	int ret;

	data.set_mask = set_mask;
	data.clear_mask = clear_mask;

	ret = split_kernel_leaf_mapping(start, start + size);
	if (WARN_ON_ONCE(ret))
		return ret;

	arch_enter_lazy_mmu_mode();

	/*
	 * The caller must ensure that the range we are operating on does not
	 * partially overlap a block mapping, or a cont mapping. Any such case
	 * must be eliminated by splitting the mapping.
	 */
	ret = walk_kernel_page_table_range_lockless(start, start + size,
						    &pageattr_ops, NULL, &data);
	arch_leave_lazy_mmu_mode();

	return ret;
}

static int __change_memory_common(unsigned long start, unsigned long size,
				  pgprot_t set_mask, pgprot_t clear_mask)
{
	int ret;

	ret = update_range_prot(start, size, set_mask, clear_mask);

	/*
	 * If the memory is being made valid without changing any other bits
	 * then a TLBI isn't required as a non-valid entry cannot be cached in
	 * the TLB.
	 */
	if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask))
		flush_tlb_kernel_range(start, start + size);

	return ret;
}

static int change_memory_common(unsigned long addr, int numpages,
				pgprot_t set_mask, pgprot_t clear_mask)
{
	unsigned long start = addr;
	unsigned long size = PAGE_SIZE * numpages;
	unsigned long end = start + size;
	struct vm_struct *area;
	int ret;

	if (!PAGE_ALIGNED(addr)) {
		start &= PAGE_MASK;
		end = start + size;
		WARN_ON_ONCE(1);
	}

	/*
	 * Kernel VA mappings are always live, and splitting live section
	 * mappings into page mappings may cause TLB conflicts. This means
	 * we have to ensure that changing the permission bits of the range
	 * we are operating on does not result in such splitting.
	 *
	 * Let's restrict ourselves to mappings created by vmalloc (or vmap).
	 * Disallow VM_ALLOW_HUGE_VMAP mappings to guarantee that only page
	 * mappings are updated and splitting is never needed.
	 *
	 * So check whether the [addr, addr + size) interval is entirely
	 * covered by precisely one VM area that has the VM_ALLOC flag set.
	 */
	area = find_vm_area((void *)addr);
	if (!area ||
	    ((unsigned long)kasan_reset_tag((void *)end) >
	     (unsigned long)kasan_reset_tag(area->addr) + area->size) ||
	    ((area->flags & (VM_ALLOC | VM_ALLOW_HUGE_VMAP)) != VM_ALLOC))
		return -EINVAL;

	if (!numpages)
		return 0;

	/*
	 * If we are manipulating read-only permissions, apply the same
	 * change to the linear mapping of the pages that back this VM area.
	 */
	if (rodata_full && (pgprot_val(set_mask) == PTE_RDONLY ||
			    pgprot_val(clear_mask) == PTE_RDONLY)) {
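		/*
		 * start may already be untagged (execmem passes its
		 * allocation through kasan_reset_tag()) while area->addr
		 * still carries its tag, or both may carry the same tag
		 * (plain vmalloc). Reset both operands so the page index
		 * is correct in either case.
		 */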
		unsigned long idx = ((unsigned long)kasan_reset_tag((void *)start) -
				     (unsigned long)kasan_reset_tag(area->addr))
				     >> PAGE_SHIFT;

		for (; numpages; idx++, numpages--) {
			ret = __change_memory_common((u64)page_address(area->pages[idx]),
						     PAGE_SIZE, set_mask, clear_mask);
			if (ret)
				return ret;
		}
	}

	/*
	 * Get rid of potentially aliasing lazily unmapped vm areas that may
	 * have permissions set that deviate from the ones we are setting here.
	 */
	vm_unmap_aliases();

	return __change_memory_common(start, size, set_mask, clear_mask);
}

int set_memory_ro(unsigned long addr, int numpages)
{
	return change_memory_common(addr, numpages,
				    __pgprot(PTE_RDONLY),
				    __pgprot(PTE_WRITE));
}

int set_memory_rw(unsigned long addr, int numpages)
{
	return change_memory_common(addr, numpages,
				    __pgprot(PTE_WRITE),
				    __pgprot(PTE_RDONLY));
}

int set_memory_nx(unsigned long addr, int numpages)
{
	return change_memory_common(addr, numpages,
				    __pgprot(PTE_PXN),
				    __pgprot(PTE_MAYBE_GP));
}

int set_memory_x(unsigned long addr, int numpages)
{
	return change_memory_common(addr, numpages,
				    __pgprot(PTE_MAYBE_GP),
				    __pgprot(PTE_PXN));
}

int set_memory_valid(unsigned long addr, int numpages, int enable)
{
	if (enable)
		return __change_memory_common(addr, PAGE_SIZE * numpages,
					      __pgprot(PTE_VALID),
					      __pgprot(0));
	else
		return __change_memory_common(addr, PAGE_SIZE * numpages,
					      __pgprot(0),
					      __pgprot(PTE_VALID));
}

int set_direct_map_invalid_noflush(struct page *page)
{
	pgprot_t clear_mask = __pgprot(PTE_VALID);
	pgprot_t set_mask = __pgprot(0);

	if (!can_set_direct_map())
		return 0;

	return update_range_prot((unsigned long)page_address(page),
				 PAGE_SIZE, set_mask, clear_mask);
}

int set_direct_map_default_noflush(struct page *page)
{
	pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE);
	pgprot_t clear_mask = __pgprot(PTE_RDONLY);

	if (!can_set_direct_map())
		return 0;

	return update_range_prot((unsigned long)page_address(page),
				 PAGE_SIZE, set_mask, clear_mask);
}

static int __set_memory_enc_dec(unsigned long addr,
				int numpages,
				bool encrypt)
{
	unsigned long set_prot = 0, clear_prot = 0;
	phys_addr_t start, end;
	int ret;

	if (!is_realm_world())
		return 0;

	if (!__is_lm_address(addr))
		return -EINVAL;

	start = __virt_to_phys(addr);
	end = start + numpages * PAGE_SIZE;

	if (encrypt)
		clear_prot = PROT_NS_SHARED;
	else
		set_prot = PROT_NS_SHARED;

	/*
	 * Break the mapping before we make any changes to avoid stale TLB
	 * entries or Synchronous External Aborts caused by RIPAS_EMPTY
	 */
	ret = __change_memory_common(addr, PAGE_SIZE * numpages,
				     __pgprot(set_prot),
				     __pgprot(clear_prot | PTE_VALID));
	if (ret)
		return ret;

	if (encrypt)
		ret = rsi_set_memory_range_protected(start, end);
	else
		ret = rsi_set_memory_range_shared(start, end);
	if (ret)
		return ret;

	return __change_memory_common(addr, PAGE_SIZE * numpages,
				      __pgprot(PTE_VALID),
				      __pgprot(0));
}

static int realm_set_memory_encrypted(unsigned long addr, int numpages)
{
	int ret = __set_memory_enc_dec(addr, numpages, true);

	/*
	 * If the request to change state fails, then the only sensible
	 * course of action for the caller is to leak the memory.
	 */
	WARN(ret, "Failed to encrypt memory, %d pages will be leaked",
	     numpages);

	return ret;
}

static int realm_set_memory_decrypted(unsigned long addr, int numpages)
{
	int ret = __set_memory_enc_dec(addr, numpages, false);

	WARN(ret, "Failed to decrypt memory, %d pages will be leaked",
	     numpages);

	return ret;
}

static const struct arm64_mem_crypt_ops realm_crypt_ops = {
	.encrypt = realm_set_memory_encrypted,
	.decrypt = realm_set_memory_decrypted,
};

int realm_register_memory_enc_ops(void)
{
	return arm64_mem_crypt_ops_register(&realm_crypt_ops);
}

int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
{
	unsigned long addr = (unsigned long)page_address(page);

	if (!can_set_direct_map())
		return 0;

	return set_memory_valid(addr, nr, valid);
}

#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * This is - apart from the return value - doing the same
 * thing as the new set_direct_map_valid_noflush() function.
 *
 * Unify? Explain the conceptual differences?
 */
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
	if (!can_set_direct_map())
		return;

	set_memory_valid((unsigned long)page_address(page), numpages, enable);
}
#endif /* CONFIG_DEBUG_PAGEALLOC */

/*
 * This function is used to determine if a linear map page has been marked as
 * not-valid. Walk the page table and check the PTE_VALID bit.
 *
 * Because this is only called on the kernel linear map, p?d_sect() implies
 * p?d_present(). When debug_pagealloc is enabled, section mappings are
 * disabled.
 */
bool kernel_page_present(struct page *page)
{
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp, pud;
	pmd_t *pmdp, pmd;
	pte_t *ptep;
	unsigned long addr = (unsigned long)page_address(page);

	pgdp = pgd_offset_k(addr);
	if (pgd_none(READ_ONCE(*pgdp)))
		return false;

	p4dp = p4d_offset(pgdp, addr);
	if (p4d_none(READ_ONCE(*p4dp)))
		return false;

	pudp = pud_offset(p4dp, addr);
	pud = READ_ONCE(*pudp);
	if (pud_none(pud))
		return false;
	if (pud_sect(pud))
		return true;

	pmdp = pmd_offset(pudp, addr);
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return false;
	if (pmd_sect(pmd))
		return true;

	ptep = pte_offset_kernel(pmdp, addr);
	return pte_valid(__ptep_get(ptep));
}