From 7563d021e28ec71bc6266e1848b22205ed7863d6 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Tue, 14 Oct 2025 11:20:17 -0700 Subject: [PATCH 01/58] mshv: Fix VpRootDispatchThreadBlocked value This value in the VP stats page is used to track if the VP can be dispatched for execution when there are no fast interrupts injected. The original value of 201 was used in a version of the hypervisor which did not ship. It was subsequently changed to 202 so that is the correct value. Signed-off-by: Stanislav Kinsburskii Signed-off-by: Nuno Das Neves Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index e3b2bd417c46..8a42d9961466 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -41,7 +41,7 @@ MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv"); /* TODO move this to another file when debugfs code is added */ enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */ #if defined(CONFIG_X86) - VpRootDispatchThreadBlocked = 201, + VpRootDispatchThreadBlocked = 202, #elif defined(CONFIG_ARM64) VpRootDispatchThreadBlocked = 94, #endif From 4cc1aa469cd6b714adc958547a4866247bfd60a9 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 17 Oct 2025 11:58:17 -0700 Subject: [PATCH 02/58] mshv: Fix deposit memory in MSHV_ROOT_HVCALL When the MSHV_ROOT_HVCALL ioctl is executing a hypercall, and gets HV_STATUS_INSUFFICIENT_MEMORY, it deposits memory and then returns -EAGAIN to userspace. The expectation is that the VMM will retry. However, some VMM code in the wild doesn't do this and simply fails. Rather than force the VMM to retry, change the ioctl to deposit memory on demand and immediately retry the hypercall as is done with all the other hypercall helper functions. In addition to making the ioctl easier to use, removing the need for multiple syscalls improves performance. There is a complication: unlike the other hypercall helper functions, in MSHV_ROOT_HVCALL the input is opaque to the kernel. This is problematic for rep hypercalls, because the next part of the input list can't be copied on each loop after depositing pages (this was the original reason for returning -EAGAIN in this case). Introduce hv_do_rep_hypercall_ex(), which adds a 'rep_start' parameter. This solves the issue, allowing the deposit loop in MSHV_ROOT_HVCALL to restart a rep hypercall after depositing pages partway through. 
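For illustration, this is the shape of the resume semantics that 'rep_start'
provides, condensed from the hunk below (error handling and the async path are
omitted; 'code', 'rep_count', 'input', 'output' and 'partition' stand in for
the ioctl handler's locals):

	u64 status;
	u16 done = 0;

	/* The first attempt starts at rep index 0. */
	status = hv_do_rep_hypercall_ex(code, rep_count, 0, done, input, output);
	done = hv_repcomp(status);

	if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
		/* Give the hypervisor more memory... */
		hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1);
		/* ...and resume at the first rep that has not completed yet. */
		status = hv_do_rep_hypercall_ex(code, rep_count, 0, done,
						input, output);
	}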
Fixes: 621191d709b1 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs") Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 58 ++++++++++++++++++---------------- include/asm-generic/mshyperv.h | 17 ++++++++-- 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 8a42d9961466..a599bbf1f9b8 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -159,6 +159,7 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, unsigned int pages_order; void *input_pg = NULL; void *output_pg = NULL; + u16 reps_completed; if (copy_from_user(&args, user_args, sizeof(args))) return -EFAULT; @@ -210,41 +211,42 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, */ *(u64 *)input_pg = partition->pt_id; - if (args.reps) - status = hv_do_rep_hypercall(args.code, args.reps, 0, - input_pg, output_pg); - else - status = hv_do_hypercall(args.code, input_pg, output_pg); - - if (hv_result(status) == HV_STATUS_CALL_PENDING) { - if (is_async) { - mshv_async_hvcall_handler(partition, &status); - } else { /* Paranoia check. This shouldn't happen! */ - ret = -EBADFD; - goto free_pages_out; + reps_completed = 0; + do { + if (args.reps) { + status = hv_do_rep_hypercall_ex(args.code, args.reps, + 0, reps_completed, + input_pg, output_pg); + reps_completed = hv_repcomp(status); + } else { + status = hv_do_hypercall(args.code, input_pg, output_pg); } - } - if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) { - ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1); - if (!ret) - ret = -EAGAIN; - } else if (!hv_result_success(status)) { - ret = hv_result_to_errno(status); - } + if (hv_result(status) == HV_STATUS_CALL_PENDING) { + if (is_async) { + mshv_async_hvcall_handler(partition, &status); + } else { /* Paranoia check. This shouldn't happen! */ + ret = -EBADFD; + goto free_pages_out; + } + } + + if (hv_result_success(status)) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) + ret = hv_result_to_errno(status); + else + ret = hv_call_deposit_pages(NUMA_NO_NODE, + partition->pt_id, 1); + } while (!ret); - /* - * Always return the status and output data regardless of result. - * The VMM may need it to determine how to proceed. E.g. the status may - * contain the number of reps completed if a rep hypercall partially - * succeeded. - */ args.status = hv_result(status); - args.reps = args.reps ? hv_repcomp(status) : 0; + args.reps = reps_completed; if (copy_to_user(user_args, &args, sizeof(args))) ret = -EFAULT; - if (output_pg && + if (!ret && output_pg && copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz)) ret = -EFAULT; diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 64ba6bc807d9..b89c7e3a2047 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -124,10 +124,12 @@ static inline unsigned int hv_repcomp(u64 status) /* * Rep hypercalls. Callers of this functions are supposed to ensure that - * rep_count and varhead_size comply with Hyper-V hypercall definition. + * rep_count, varhead_size, and rep_start comply with Hyper-V hypercall + * definition. 
*/ -static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, - void *input, void *output) +static inline u64 hv_do_rep_hypercall_ex(u16 code, u16 rep_count, + u16 varhead_size, u16 rep_start, + void *input, void *output) { u64 control = code; u64 status; @@ -135,6 +137,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET; control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET; + control |= (u64)rep_start << HV_HYPERCALL_REP_START_OFFSET; do { status = hv_do_hypercall(control, input, output); @@ -152,6 +155,14 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, return status; } +/* For the typical case where rep_start is 0 */ +static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, + void *input, void *output) +{ + return hv_do_rep_hypercall_ex(code, rep_count, varhead_size, 0, + input, output); +} + /* Generate the guest OS identifier as described in the Hyper-V TLFS */ static inline u64 hv_generate_guest_id(u64 kernel_version) { From f34f5e576f5e17de5591c9ae306aeab2911d8533 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 18 Sep 2025 11:00:19 -0400 Subject: [PATCH 03/58] x86/hyperv: Don't use hv apic driver when Secure AVIC is available When Secure AVIC is available, the AMD x2apic Secure AVIC driver will be selected. In that case, have hv_apic_init() return immediately without doing anything. Reviewed-by: Michael Kelley Reviewed-by: Neeraj Upadhyay Signed-off-by: Tianyu Lan Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_apic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index bfde0a3498b9..e669053b637d 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -293,6 +293,9 @@ static void hv_send_ipi_self(int vector) void __init hv_apic_init(void) { + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return; + if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) { pr_info("Hyper-V: Using IPI hypercalls\n"); /* From 3e1b611515d286c6725028e17170f7143e5e51fc Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 18 Sep 2025 11:00:20 -0400 Subject: [PATCH 04/58] drivers: hv: Allow vmbus message synic interrupt injected from Hyper-V When Secure AVIC is enabled, VMBus driver should call x2apic Secure AVIC interface to allow Hyper-V to inject VMBus message interrupt. Reviewed-by: Michael Kelley Reviewed-by: Neeraj Upadhyay Signed-off-by: Tianyu Lan Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_apic.c | 5 +++++ drivers/hv/hv.c | 2 ++ drivers/hv/hv_common.c | 5 +++++ include/asm-generic/mshyperv.h | 1 + 4 files changed, 13 insertions(+) diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index e669053b637d..a8de503def37 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -53,6 +53,11 @@ static void hv_apic_icr_write(u32 low, u32 id) wrmsrq(HV_X64_MSR_ICR, reg_val); } +void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set) +{ + apic_update_vector(cpu, vector, set); +} + static u32 hv_apic_read(u32 reg) { u32 reg_val, hi; diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index b14c5f9e0ef2..ec5d10839e0f 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -307,6 +307,7 @@ void hv_synic_enable_regs(unsigned int cpu) } hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); + hv_enable_coco_interrupt(cpu, vmbus_interrupt, true); /* Setup the shared SINT. 
*/ if (vmbus_irq != -1) @@ -350,6 +351,7 @@ void hv_synic_disable_regs(unsigned int cpu) /* Need to correctly cleanup in the case of SMP!!! */ /* Disable the interrupt */ hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + hv_enable_coco_interrupt(cpu, vmbus_interrupt, false); simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); /* diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index e109a620c83f..3e41f686daa5 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -716,6 +716,11 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2) } EXPORT_SYMBOL_GPL(hv_tdx_hypercall); +void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set) +{ +} +EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt); + void hv_identify_partition_type(void) { /* Assume guest role */ diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index b89c7e3a2047..db84aced1658 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -347,6 +347,7 @@ bool hv_is_isolation_supported(void); bool hv_isolation_type_snp(void); u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size); u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); +void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set); void hyperv_cleanup(void); bool hv_query_ext_cap(u64 cap_query); void hv_setup_dma_ops(struct device *dev, bool coherent); From c52c957e41e75d3f74157acffc53cefa3089ffa7 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 18 Sep 2025 11:00:21 -0400 Subject: [PATCH 05/58] x86/hyperv: Don't use auto-eoi when Secure AVIC is available Hyper-V doesn't support auto-eoi with Secure AVIC. So set the HV_DEPRECATING_AEOI_RECOMMENDED flag to force writing the EOI register after handling an interrupt. Reviewed-by: Michael Kelley Reviewed-by: Neeraj Upadhyay Signed-off-by: Tianyu Lan Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index c4febdbcfe4d..217a41b63df3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -470,6 +470,9 @@ static void __init ms_hyperv_init_platform(void) hv_identify_partition_type(); + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + ms_hyperv.hints |= HV_DEPRECATING_AEOI_RECOMMENDED; + if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) { hv_nested = true; pr_info("Hyper-V: running on a nested hypervisor\n"); From 5e52db91d112bd1da00b520e21f570db14105bde Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 18 Sep 2025 11:00:22 -0400 Subject: [PATCH 06/58] x86/hyperv: Allow Hyper-V to inject STIMER0 interrupts When Secure AVIC is enabled, call Secure AVIC function to allow Hyper-V to inject STIMER0 interrupt. Reviewed-by: Neeraj Upadhyay Reviewed-by: Michael Kelley Signed-off-by: Tianyu Lan Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_init.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index e890fd37e9c2..21ed8963fafd 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -170,6 +170,10 @@ static int hv_cpu_init(unsigned int cpu) wrmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); } + /* Allow Hyper-V stimer vector to be injected from Hypervisor. 
*/ + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) + apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, true); + return hyperv_init_ghcb(); } @@ -277,6 +281,9 @@ static int hv_cpu_die(unsigned int cpu) *ghcb_va = NULL; } + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) + apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, false); + hv_common_cpu_die(cpu); if (hv_vp_assist_page && hv_vp_assist_page[cpu]) { From 92c7053b44b312e1cda765507f45fc170dee1b41 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:03 -0700 Subject: [PATCH 07/58] Documentation: hyperv: Confidential VMBus Define what the confidential VMBus is and describe what advantages it offers on the capable hardware. Signed-off-by: Roman Kisel Reviewed-by: Alok Tiwari Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- Documentation/virt/hyperv/coco.rst | 139 ++++++++++++++++++++++++++++- 1 file changed, 138 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/hyperv/coco.rst b/Documentation/virt/hyperv/coco.rst index c15d6fe34b4e..3231e51444da 100644 --- a/Documentation/virt/hyperv/coco.rst +++ b/Documentation/virt/hyperv/coco.rst @@ -178,7 +178,7 @@ These Hyper-V and VMBus memory pages are marked as decrypted: * VMBus monitor pages -* Synthetic interrupt controller (synic) related pages (unless supplied by +* Synthetic interrupt controller (SynIC) related pages (unless supplied by the paravisor) * Per-cpu hypercall input and output pages (unless running with a paravisor) @@ -232,6 +232,143 @@ with arguments explicitly describing the access. See _hv_pcifront_read_config() and _hv_pcifront_write_config() and the "use_calls" flag indicating to use hypercalls. +Confidential VMBus +------------------ +The confidential VMBus enables the confidential guest not to interact with +the untrusted host partition and the untrusted hypervisor. Instead, the guest +relies on the trusted paravisor to communicate with the devices processing +sensitive data. The hardware (SNP or TDX) encrypts the guest memory and the +register state while measuring the paravisor image using the platform security +processor to ensure trusted and confidential computing. + +Confidential VMBus provides a secure communication channel between the guest +and the paravisor, ensuring that sensitive data is protected from hypervisor- +level access through memory encryption and register state isolation. + +Confidential VMBus is an extension of Confidential Computing (CoCo) VMs +(a.k.a. "Isolated" VMs in Hyper-V terminology). Without Confidential VMBus, +guest VMBus device drivers (the "VSC"s in VMBus terminology) communicate +with VMBus servers (the VSPs) running on the Hyper-V host. The +communication must be through memory that has been decrypted so the +host can access it. With Confidential VMBus, one or more of the VSPs reside +in the trusted paravisor layer in the guest VM. Since the paravisor layer also +operates in encrypted memory, the memory used for communication with +such VSPs does not need to be decrypted and thereby exposed to the +Hyper-V host. The paravisor is responsible for communicating securely +with the Hyper-V host as necessary. + +The data is transferred directly between the VM and a vPCI device (a.k.a. +a PCI pass-thru device, see :doc:`vpci`) that is directly assigned to VTL2 +and that supports encrypted memory. In such a case, neither the host partition +nor the hypervisor has any access to the data. 
The guest needs to establish +a VMBus connection only with the paravisor for the channels that process +sensitive data, and the paravisor abstracts the details of communicating +with the specific devices away providing the guest with the well-established +VSP (Virtual Service Provider) interface that has had support in the Hyper-V +drivers for a decade. + +In the case the device does not support encrypted memory, the paravisor +provides bounce-buffering, and although the data is not encrypted, the backing +pages aren't mapped into the host partition through SLAT. While not impossible, +it becomes much more difficult for the host partition to exfiltrate the data +than it would be with a conventional VMBus connection where the host partition +has direct access to the memory used for communication. + +Here is the data flow for a conventional VMBus connection (`C` stands for the +client or VSC, `S` for the server or VSP, the `DEVICE` is a physical one, might +be with multiple virtual functions):: + + +---- GUEST ----+ +----- DEVICE ----+ +----- HOST -----+ + | | | | | | + | | | | | | + | | | ========== | + | | | | | | + | | | | | | + | | | | | | + +----- C -------+ +-----------------+ +------- S ------+ + || || + || || + +------||------------------ VMBus --------------------------||------+ + | Interrupts, MMIO | + +-------------------------------------------------------------------+ + +and the Confidential VMBus connection:: + + +---- GUEST --------------- VTL0 ------+ +-- DEVICE --+ + | | | | + | +- PARAVISOR --------- VTL2 -----+ | | | + | | +-- VMBus Relay ------+ ====+================ | + | | | Interrupts, MMIO | | | | | + | | +-------- S ----------+ | | +------------+ + | | || | | + | +---------+ || | | + | | Linux | || OpenHCL | | + | | kernel | || | | + | +---- C --+-----||---------------+ | + | || || | + +-------++------- C -------------------+ +------------+ + || | HOST | + || +---- S -----+ + +-------||----------------- VMBus ---------------------------||-----+ + | Interrupts, MMIO | + +-------------------------------------------------------------------+ + +An implementation of the VMBus relay that offers the Confidential VMBus +channels is available in the OpenVMM project as a part of the OpenHCL +paravisor. Please refer to + + * https://openvmm.dev/, and + * https://github.com/microsoft/openvmm + +for more information about the OpenHCL paravisor. + +A guest that is running with a paravisor must determine at runtime if +Confidential VMBus is supported by the current paravisor. The x86_64-specific +approach relies on the CPUID Virtualization Stack leaf; the ARM64 implementation +is expected to support the Confidential VMBus unconditionally when running +ARM CCA guests. + +Confidential VMBus is a characteristic of the VMBus connection as a whole, +and of each VMBus channel that is created. When a Confidential VMBus +connection is established, the paravisor provides the guest the message-passing +path that is used for VMBus device creation and deletion, and it provides a +per-CPU synthetic interrupt controller (SynIC) just like the SynIC that is +offered by the Hyper-V host. Each VMBus device that is offered to the guest +indicates the degree to which it participates in Confidential VMBus. The offer +indicates if the device uses encrypted ring buffers, and if the device uses +encrypted memory for DMA that is done outside the ring buffer. These settings +may be different for different devices using the same Confidential VMBus +connection. 
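For example, a guest driver could derive the required memory treatment from
the offer flags using the helpers introduced elsewhere in this patch series
(an illustrative sketch only; the offer_* wrappers below are hypothetical,
while is_co_ring_buffer() and is_co_external_memory() are added later in the
series)::

    /* Sketch: does this offer's external memory (GPADLs, GPA direct
     * packets) have to be placed in decrypted memory? */
    static bool offer_needs_decrypted_dma(const struct vmbus_channel_offer_channel *o)
    {
            return !is_co_external_memory(o);
    }

    /* Sketch: the ring buffer itself stays encrypted whenever the
     * paravisor offers the channel with this flag set. */
    static bool offer_ring_is_encrypted(const struct vmbus_channel_offer_channel *o)
    {
            return is_co_ring_buffer(o);
    }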
+ +Although these settings are separate, in practice it'll always be encrypted +ring buffer only, or both encrypted ring buffer and external data. If a channel +is offered by the paravisor with confidential VMBus, the ring buffer can always +be encrypted since it's strictly for communication between the VTL2 paravisor +and the VTL0 guest. However, other memory regions are often used for e.g. DMA, +so they need to be accessible by the underlying hardware, and must be +unencrypted (unless the device supports encrypted memory). Currently, there are +not any VSPs in OpenHCL that support encrypted external memory, but future +versions are expected to enable this capability. + +Because some devices on a Confidential VMBus may require decrypted ring buffers +and DMA transfers, the guest must interact with two SynICs -- the one provided +by the paravisor and the one provided by the Hyper-V host when Confidential +VMBus is not offered. Interrupts are always signaled by the paravisor SynIC, +but the guest must check for messages and for channel interrupts on both SynICs. + +In the case of a confidential VMBus, regular SynIC access by the guest is +intercepted by the paravisor (this includes various MSRs such as the SIMP and +SIEFP, as well as hypercalls like HvPostMessage and HvSignalEvent). If the +guest actually wants to communicate with the hypervisor, it has to use special +mechanisms (GHCB page on SNP, or tdcall on TDX). Messages can be of either +kind: with confidential VMBus, messages use the paravisor SynIC, and if the +guest chose to communicate directly to the hypervisor, they use the hypervisor +SynIC. For interrupt signaling, some channels may be running on the host +(non-confidential, using the VMBus relay) and use the hypervisor SynIC, and +some on the paravisor and use its SynIC. The RelIDs are coordinated by the +OpenHCL VMBus server and are guaranteed to be unique regardless of whether +the channel originated on the host or the paravisor. + load_unaligned_zeropad() ------------------------ When transitioning memory between encrypted and decrypted, the caller of From 6802d8af47d1dccd9a74a1f708fb9129244ef843 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:04 -0700 Subject: [PATCH 08/58] Drivers: hv: VMBus protocol version 6.0 The confidential VMBus is supported starting from the protocol version 6.0 onwards. Provide the required definitions. No functional changes. 
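For reference, a hypothetical sketch of how a client might request the new
protocol version and feature flag when initiating contact (the actual wiring
is not part of this patch):

	static void fill_initiate_contact(struct vmbus_channel_initiate_contact *msg)
	{
		msg->vmbus_version_requested = VERSION_WIN10_V6_0;
		/* Ask the paravisor for confidential (encrypted) channels. */
		msg->feature_flags = VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS;
	}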
Signed-off-by: Roman Kisel Reviewed-by: Alok Tiwari Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/hyperv_vmbus.h | 2 ++ drivers/hv/vmbus_drv.c | 12 +++++++ include/hyperv/hvgdk_mini.h | 1 + include/linux/hyperv.h | 69 +++++++++++++++++++++++++++---------- 4 files changed, 65 insertions(+), 19 deletions(-) diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 0b450e53161e..4a01797d4851 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -333,6 +333,8 @@ extern const struct vmbus_channel_message_table_entry /* General vmbus interface */ +bool vmbus_is_confidential(void); + struct hv_device *vmbus_device_create(const guid_t *type, const guid_t *instance, struct vmbus_channel *channel); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 69591dc7bad2..3c414560fa5f 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -56,6 +56,18 @@ static long __percpu *vmbus_evt; int vmbus_irq; int vmbus_interrupt; +/* + * If the Confidential VMBus is used, the data on the "wire" is not + * visible to either the host or the hypervisor. + */ +static bool is_confidential; + +bool vmbus_is_confidential(void) +{ + return is_confidential; +} +EXPORT_SYMBOL_GPL(vmbus_is_confidential); + /* * The panic notifier below is responsible solely for unloading the * vmbus connection, which is necessary in a panic event. diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 77abddfc750e..7f730a0e54e6 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -260,6 +260,7 @@ union hv_hypervisor_version_info { #define HYPERV_CPUID_VIRT_STACK_PROPERTIES 0x40000082 /* Support for the extended IOAPIC RTE format */ #define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE BIT(2) +#define HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE BIT(3) #define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 #define HYPERV_CPUID_MIN 0x40000005 diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 59826c89171c..dfc516c1c719 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -265,16 +265,18 @@ static inline u32 hv_get_avail_to_write_percent( * Linux kernel. 
*/ -#define VERSION_WS2008 ((0 << 16) | (13)) -#define VERSION_WIN7 ((1 << 16) | (1)) -#define VERSION_WIN8 ((2 << 16) | (4)) -#define VERSION_WIN8_1 ((3 << 16) | (0)) -#define VERSION_WIN10 ((4 << 16) | (0)) -#define VERSION_WIN10_V4_1 ((4 << 16) | (1)) -#define VERSION_WIN10_V5 ((5 << 16) | (0)) -#define VERSION_WIN10_V5_1 ((5 << 16) | (1)) -#define VERSION_WIN10_V5_2 ((5 << 16) | (2)) -#define VERSION_WIN10_V5_3 ((5 << 16) | (3)) +#define VMBUS_MAKE_VERSION(MAJ, MIN) ((((u32)MAJ) << 16) | (MIN)) +#define VERSION_WS2008 VMBUS_MAKE_VERSION(0, 13) +#define VERSION_WIN7 VMBUS_MAKE_VERSION(1, 1) +#define VERSION_WIN8 VMBUS_MAKE_VERSION(2, 4) +#define VERSION_WIN8_1 VMBUS_MAKE_VERSION(3, 0) +#define VERSION_WIN10 VMBUS_MAKE_VERSION(4, 0) +#define VERSION_WIN10_V4_1 VMBUS_MAKE_VERSION(4, 1) +#define VERSION_WIN10_V5 VMBUS_MAKE_VERSION(5, 0) +#define VERSION_WIN10_V5_1 VMBUS_MAKE_VERSION(5, 1) +#define VERSION_WIN10_V5_2 VMBUS_MAKE_VERSION(5, 2) +#define VERSION_WIN10_V5_3 VMBUS_MAKE_VERSION(5, 3) +#define VERSION_WIN10_V6_0 VMBUS_MAKE_VERSION(6, 0) /* Make maximum size of pipe payload of 16K */ #define MAX_PIPE_DATA_PAYLOAD (sizeof(u8) * 16384) @@ -335,14 +337,22 @@ struct vmbus_channel_offer { } __packed; /* Server Flags */ -#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 1 -#define VMBUS_CHANNEL_SERVER_SUPPORTS_TRANSFER_PAGES 2 -#define VMBUS_CHANNEL_SERVER_SUPPORTS_GPADLS 4 -#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x10 -#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x100 -#define VMBUS_CHANNEL_PARENT_OFFER 0x200 -#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x400 -#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000 +#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 0x0001 +/* + * This flag indicates that the channel is offered by the paravisor, and must + * use encrypted memory for the channel ring buffer. + */ +#define VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER 0x0002 +/* + * This flag indicates that the channel is offered by the paravisor, and must + * use encrypted memory for GPA direct packets and additional GPADLs. 
+ */ +#define VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY 0x0004 +#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x0010 +#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x0100 +#define VMBUS_CHANNEL_PARENT_OFFER 0x0200 +#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x0400 +#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000 struct vmpacket_descriptor { u16 type; @@ -621,6 +631,12 @@ struct vmbus_channel_relid_released { u32 child_relid; } __packed; +/* + * Used by the paravisor only, means that the encrypted ring buffers and + * the encrypted external memory are supported + */ +#define VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS 0x10 + struct vmbus_channel_initiate_contact { struct vmbus_channel_message_header header; u32 vmbus_version_requested; @@ -630,7 +646,8 @@ struct vmbus_channel_initiate_contact { struct { u8 msg_sint; u8 msg_vtl; - u8 reserved[6]; + u8 reserved[2]; + u32 feature_flags; /* VMBus version 6.0 */ }; }; u64 monitor_page1; @@ -1003,6 +1020,10 @@ struct vmbus_channel { /* boolean to control visibility of sysfs for ring buffer */ bool ring_sysfs_visible; + /* The ring buffer is encrypted */ + bool co_ring_buffer; + /* The external memory is encrypted */ + bool co_external_memory; }; #define lock_requestor(channel, flags) \ @@ -1027,6 +1048,16 @@ u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, u64 rqst_addr); u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id); +static inline bool is_co_ring_buffer(const struct vmbus_channel_offer_channel *o) +{ + return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER); +} + +static inline bool is_co_external_memory(const struct vmbus_channel_offer_channel *o) +{ + return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY); +} + static inline bool is_hvsock_offer(const struct vmbus_channel_offer_channel *o) { return !!(o->offer.chn_flags & VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER); From 7c8b6c326d830ca5c6b95f390c703966e14167e6 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:05 -0700 Subject: [PATCH 09/58] arch/x86: mshyperv: Discover Confidential VMBus availability Confidential VMBus requires enabling paravisor SynIC, and the x86_64 guest has to inspect the Virtualization Stack (VS) CPUID leaf to see if Confidential VMBus is available. If it is, the guest shall enable the paravisor SynIC. Read the relevant data from the VS CPUID leaf. Refactor the code to avoid repeating CPUID and add flags to the struct ms_hyperv_info. For ARM64, the flag for Confidential VMBus is not set which provides the desired behaviour for now as it is not available on ARM64 just yet. Once ARM64 CCA guests are supported, this flag will be set unconditionally when running such a guest. 
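As an illustration (not part of this patch), a consumer could gate the feature
on the new flag together with the existing paravisor check; the helper name
below is hypothetical:

	static bool confidential_vmbus_supported(void)
	{
		/* Confidential VMBus is only meaningful under a paravisor. */
		return ms_hyperv.paravisor_present &&
		       ms_hyperv.confidential_vmbus_available;
	}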
Signed-off-by: Roman Kisel Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 28 +++++++++++++++------------- include/asm-generic/mshyperv.h | 2 ++ 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 217a41b63df3..1369fae7d8a9 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -440,7 +440,7 @@ EXPORT_SYMBOL_GPL(hv_get_hypervisor_version); static void __init ms_hyperv_init_platform(void) { - int hv_max_functions_eax; + int hv_max_functions_eax, eax; #ifdef CONFIG_PARAVIRT pv_info.name = "Hyper-V"; @@ -478,6 +478,19 @@ static void __init ms_hyperv_init_platform(void) pr_info("Hyper-V: running on a nested hypervisor\n"); } + /* + * There is no check against the max function for HYPERV_CPUID_VIRT_STACK_* CPUID + * leaves as the hypervisor doesn't handle them. Even a nested root partition (L2 + * root) will not get them because the nested (L1) hypervisor filters them out. + * These are handled through intercept processing by the Windows Hyper-V stack + * or the paravisor. + */ + eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES); + ms_hyperv.confidential_vmbus_available = + eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE; + ms_hyperv.msi_ext_dest_id = + eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; @@ -678,21 +691,10 @@ static bool __init ms_hyperv_x2apic_available(void) * pci-hyperv host bridge. * * Note: for a Hyper-V root partition, this will always return false. - * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by - * default, they are implemented as intercepts by the Windows Hyper-V stack. - * Even a nested root partition (L2 root) will not get them because the - * nested (L1) hypervisor filters them out. */ static bool __init ms_hyperv_msi_ext_dest_id(void) { - u32 eax; - - eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE); - if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE) - return false; - - eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES); - return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; + return ms_hyperv.msi_ext_dest_id; } #ifdef CONFIG_AMD_MEM_ENCRYPT diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index db84aced1658..8da1893365f0 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -62,6 +62,8 @@ struct ms_hyperv_info { }; }; u64 shared_gpa_boundary; + bool msi_ext_dest_id; + bool confidential_vmbus_available; }; extern struct ms_hyperv_info ms_hyperv; extern bool hv_nested; From e6eeb3c782739cd1613a8da856b878b99f741943 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:06 -0700 Subject: [PATCH 10/58] arch: hyperv: Get/set SynIC synth.registers via paravisor The existing Hyper-V wrappers for getting and setting MSRs are hv_get/set_msr(). Via hv_get/set_non_nested_msr(), they detect when running in a CoCo VM with a paravisor, and use the TDX or SNP guest-host communication protocol to bypass the paravisor and go directly to the host hypervisor for SynIC MSRs. The "set" function also implements the required special handling for the SINT MSRs. Provide functions that allow manipulating the SynIC registers through the paravisor. Move vmbus_signal_eom() to a more appropriate location (which also avoids breaking KVM). 
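The resulting split can be seen in vmbus_signal_eom() below: SynIC register
accesses take the paravisor path when the VMBus connection is confidential,
and the existing host path otherwise, for example:

	if (vmbus_is_confidential())
		hv_para_set_synic_register(HV_MSR_EOM, 0);	/* paravisor SynIC */
	else
		hv_set_msr(HV_MSR_EOM, 0);			/* host SynIC */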
Signed-off-by: Roman Kisel Reviewed-by: Alok Tiwari Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 20 ++++++++++++++++ drivers/hv/hv_common.c | 11 +++++++++ drivers/hv/hyperv_vmbus.h | 44 ++++++++++++++++++++++++++++++++++ include/asm-generic/mshyperv.h | 42 ++------------------------------ 4 files changed, 77 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 1369fae7d8a9..eb9f1d7eec61 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -86,6 +86,26 @@ void hv_set_non_nested_msr(unsigned int reg, u64 value) } EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); +/* + * Get the SynIC register value from the paravisor. + */ +u64 hv_para_get_synic_register(unsigned int reg) +{ + if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg))) + return ~0ULL; + return native_read_msr(reg); +} + +/* + * Set the SynIC register value with the paravisor. + */ +void hv_para_set_synic_register(unsigned int reg, u64 val) +{ + if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg))) + return; + native_write_msr(reg, val); +} + u64 hv_get_msr(unsigned int reg) { if (hv_nested) diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 3e41f686daa5..c39472ae8345 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -721,6 +721,17 @@ void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool } EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt); +u64 __weak hv_para_get_synic_register(unsigned int reg) +{ + return ~0ULL; +} +EXPORT_SYMBOL_GPL(hv_para_get_synic_register); + +void __weak hv_para_set_synic_register(unsigned int reg, u64 val) +{ +} +EXPORT_SYMBOL_GPL(hv_para_set_synic_register); + void hv_identify_partition_type(void) { /* Assume guest role */ diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 4a01797d4851..9ac6f5520287 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -335,6 +336,49 @@ extern const struct vmbus_channel_message_table_entry bool vmbus_is_confidential(void); +#if IS_ENABLED(CONFIG_HYPERV_VMBUS) +/* Free the message slot and signal end-of-message if required */ +static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) +{ + /* + * On crash we're reading some other CPU's message page and we need + * to be careful: this other CPU may already had cleared the header + * and the host may already had delivered some other message there. + * In case we blindly write msg->header.message_type we're going + * to lose it. We can still lose a message of the same type but + * we count on the fact that there can only be one + * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages + * on crash. + */ + if (cmpxchg(&msg->header.message_type, old_msg_type, + HVMSG_NONE) != old_msg_type) + return; + + /* + * The cmxchg() above does an implicit memory barrier to + * ensure the write to MessageType (ie set to + * HVMSG_NONE) happens before we read the + * MessagePending and EOMing. 
Otherwise, the EOMing + * will not deliver any more messages since there is + * no empty slot + */ + if (msg->header.message_flags.msg_pending) { + /* + * This will cause message queue rescan to + * possibly deliver another msg from the + * hypervisor + */ + if (vmbus_is_confidential()) + hv_para_set_synic_register(HV_MSR_EOM, 0); + else + hv_set_msr(HV_MSR_EOM, 0); + } +} + +extern int vmbus_interrupt; +extern int vmbus_irq; +#endif /* CONFIG_HYPERV_VMBUS */ + struct hv_device *vmbus_device_create(const guid_t *type, const guid_t *instance, struct vmbus_channel *channel); diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 8da1893365f0..c328265de624 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -176,46 +176,6 @@ static inline u64 hv_generate_guest_id(u64 kernel_version) return guest_id; } -#if IS_ENABLED(CONFIG_HYPERV_VMBUS) -/* Free the message slot and signal end-of-message if required */ -static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) -{ - /* - * On crash we're reading some other CPU's message page and we need - * to be careful: this other CPU may already had cleared the header - * and the host may already had delivered some other message there. - * In case we blindly write msg->header.message_type we're going - * to lose it. We can still lose a message of the same type but - * we count on the fact that there can only be one - * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages - * on crash. - */ - if (cmpxchg(&msg->header.message_type, old_msg_type, - HVMSG_NONE) != old_msg_type) - return; - - /* - * The cmxchg() above does an implicit memory barrier to - * ensure the write to MessageType (ie set to - * HVMSG_NONE) happens before we read the - * MessagePending and EOMing. Otherwise, the EOMing - * will not deliver any more messages since there is - * no empty slot - */ - if (msg->header.message_flags.msg_pending) { - /* - * This will cause message queue rescan to - * possibly deliver another msg from the - * hypervisor - */ - hv_set_msr(HV_MSR_EOM, 0); - } -} - -extern int vmbus_interrupt; -extern int vmbus_irq; -#endif /* CONFIG_HYPERV_VMBUS */ - int hv_get_hypervisor_version(union hv_hypervisor_version_info *info); void hv_setup_vmbus_handler(void (*handler)(void)); @@ -350,6 +310,8 @@ bool hv_isolation_type_snp(void); u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size); u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set); +u64 hv_para_get_synic_register(unsigned int reg); +void hv_para_set_synic_register(unsigned int reg, u64 val); void hyperv_cleanup(void); bool hv_query_ext_cap(u64 cap_query); void hv_setup_dma_ops(struct device *dev, bool coherent); From a156ad8c508209ce22f3213d25c3c2ae1774a57d Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:07 -0700 Subject: [PATCH 11/58] arch/x86: mshyperv: Trap on access for some synthetic MSRs hv_set_non_nested_msr() has special handling for SINT MSRs when a paravisor is present. In addition to updating the MSR on the host, the mirror MSR in the paravisor is updated, including with the proxy bit. But with Confidential VMBus, the proxy bit must not be used, so add a special case to skip it. 
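A sketch of how a caller might use this (illustrative only; the call site is
not part of this patch, and 'shared_sint' is as in hv_synic_enable_regs()):

	/* Proxying is wanted only when the host, not the paravisor, owns VMBus. */
	hv_para_set_sint_proxy(!vmbus_is_confidential());
	hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);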
Signed-off-by: Roman Kisel Reviewed-by: Alok Tiwari Reviewed-by: Tianyu Lan Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 29 +++++++++++++++++++++++++---- drivers/hv/hv_common.c | 5 +++++ include/asm-generic/mshyperv.h | 1 + 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index eb9f1d7eec61..80a641a6ac48 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,12 @@ bool hv_nested; struct ms_hyperv_info ms_hyperv; #if IS_ENABLED(CONFIG_HYPERV) +/* + * When running with the paravisor, controls proxying the synthetic interrupts + * from the host + */ +static bool hv_para_sint_proxy; + static inline unsigned int hv_get_nested_msr(unsigned int reg) { if (hv_is_sint_msr(reg)) @@ -75,17 +82,31 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_msr); void hv_set_non_nested_msr(unsigned int reg, u64 value) { if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) { + /* The hypervisor will get the intercept. */ hv_ivm_msr_write(reg, value); - /* Write proxy bit via wrmsl instruction */ - if (hv_is_sint_msr(reg)) - wrmsrq(reg, value | 1 << 20); + /* Using wrmsrq so the following goes to the paravisor. */ + if (hv_is_sint_msr(reg)) { + union hv_synic_sint sint = { .as_uint64 = value }; + + sint.proxy = hv_para_sint_proxy; + native_wrmsrq(reg, sint.as_uint64); + } } else { - wrmsrq(reg, value); + native_wrmsrq(reg, value); } } EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); +/* + * Enable or disable proxying synthetic interrupts + * to the paravisor. + */ +void hv_para_set_sint_proxy(bool enable) +{ + hv_para_sint_proxy = enable; +} + /* * Get the SynIC register value from the paravisor. */ diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index c39472ae8345..4b44da7f23b2 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -721,6 +721,11 @@ void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool } EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt); +void __weak hv_para_set_sint_proxy(bool enable) +{ +} +EXPORT_SYMBOL_GPL(hv_para_set_sint_proxy); + u64 __weak hv_para_get_synic_register(unsigned int reg) { return ~0ULL; diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index c328265de624..ecedab554c80 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -310,6 +310,7 @@ bool hv_isolation_type_snp(void); u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size); u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set); +void hv_para_set_sint_proxy(bool enable); u64 hv_para_get_synic_register(unsigned int reg); void hv_para_set_synic_register(unsigned int reg, u64 val); void hyperv_cleanup(void); From 163224c189e8b679ce919aa64ccabb7a992ca2d1 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:08 -0700 Subject: [PATCH 12/58] Drivers: hv: Rename fields for SynIC message and event pages Confidential VMBus requires interacting with two SynICs -- one provided by the host hypervisor, and one provided by the paravisor. Each SynIC requires its own message and event pages. Rename the existing host-accessible SynIC message and event pages with the "hyp_" prefix to clearly distinguish them from the paravisor ones. The field name is also changed in mshv_root.* for consistency. 
No functional changes. Signed-off-by: Roman Kisel Reviewed-by: Tianyu Lan Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 6 ++-- drivers/hv/hv.c | 66 +++++++++++++++++++-------------------- drivers/hv/hyperv_vmbus.h | 4 +-- drivers/hv/mshv_root.h | 2 +- drivers/hv/mshv_synic.c | 6 ++-- drivers/hv/vmbus_drv.c | 6 ++-- 6 files changed, 45 insertions(+), 45 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 65dd299e2944..1a33c6944b3c 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -844,14 +844,14 @@ static void vmbus_wait_for_unload(void) = per_cpu_ptr(hv_context.cpu_context, cpu); /* - * In a CoCo VM the synic_message_page is not allocated + * In a CoCo VM the hyp_synic_message_page is not allocated * in hv_synic_alloc(). Instead it is set/cleared in * hv_synic_enable_regs() and hv_synic_disable_regs() * such that it is set only when the CPU is online. If * not all present CPUs are online, the message page * might be NULL, so skip such CPUs. */ - page_addr = hv_cpu->synic_message_page; + page_addr = hv_cpu->hyp_synic_message_page; if (!page_addr) continue; @@ -892,7 +892,7 @@ completed: struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); - page_addr = hv_cpu->synic_message_page; + page_addr = hv_cpu->hyp_synic_message_page; if (!page_addr) continue; diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index ec5d10839e0f..6fbb51e50673 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -147,20 +147,20 @@ int hv_synic_alloc(void) * Skip these pages allocation here. */ if (!ms_hyperv.paravisor_present && !hv_root_partition()) { - hv_cpu->synic_message_page = + hv_cpu->hyp_synic_message_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (!hv_cpu->synic_message_page) { + if (!hv_cpu->hyp_synic_message_page) { pr_err("Unable to allocate SYNIC message page\n"); goto err; } - hv_cpu->synic_event_page = + hv_cpu->hyp_synic_event_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (!hv_cpu->synic_event_page) { + if (!hv_cpu->hyp_synic_event_page) { pr_err("Unable to allocate SYNIC event page\n"); - free_page((unsigned long)hv_cpu->synic_message_page); - hv_cpu->synic_message_page = NULL; + free_page((unsigned long)hv_cpu->hyp_synic_message_page); + hv_cpu->hyp_synic_message_page = NULL; goto err; } } @@ -168,30 +168,30 @@ int hv_synic_alloc(void) if (!ms_hyperv.paravisor_present && (hv_isolation_type_snp() || hv_isolation_type_tdx())) { ret = set_memory_decrypted((unsigned long) - hv_cpu->synic_message_page, 1); + hv_cpu->hyp_synic_message_page, 1); if (ret) { pr_err("Failed to decrypt SYNIC msg page: %d\n", ret); - hv_cpu->synic_message_page = NULL; + hv_cpu->hyp_synic_message_page = NULL; /* * Free the event page here so that hv_synic_free() * won't later try to re-encrypt it. 
*/ - free_page((unsigned long)hv_cpu->synic_event_page); - hv_cpu->synic_event_page = NULL; + free_page((unsigned long)hv_cpu->hyp_synic_event_page); + hv_cpu->hyp_synic_event_page = NULL; goto err; } ret = set_memory_decrypted((unsigned long) - hv_cpu->synic_event_page, 1); + hv_cpu->hyp_synic_event_page, 1); if (ret) { pr_err("Failed to decrypt SYNIC event page: %d\n", ret); - hv_cpu->synic_event_page = NULL; + hv_cpu->hyp_synic_event_page = NULL; goto err; } - memset(hv_cpu->synic_message_page, 0, PAGE_SIZE); - memset(hv_cpu->synic_event_page, 0, PAGE_SIZE); + memset(hv_cpu->hyp_synic_message_page, 0, PAGE_SIZE); + memset(hv_cpu->hyp_synic_event_page, 0, PAGE_SIZE); } } @@ -227,28 +227,28 @@ void hv_synic_free(void) if (!ms_hyperv.paravisor_present && (hv_isolation_type_snp() || hv_isolation_type_tdx())) { - if (hv_cpu->synic_message_page) { + if (hv_cpu->hyp_synic_message_page) { ret = set_memory_encrypted((unsigned long) - hv_cpu->synic_message_page, 1); + hv_cpu->hyp_synic_message_page, 1); if (ret) { pr_err("Failed to encrypt SYNIC msg page: %d\n", ret); - hv_cpu->synic_message_page = NULL; + hv_cpu->hyp_synic_message_page = NULL; } } - if (hv_cpu->synic_event_page) { + if (hv_cpu->hyp_synic_event_page) { ret = set_memory_encrypted((unsigned long) - hv_cpu->synic_event_page, 1); + hv_cpu->hyp_synic_event_page, 1); if (ret) { pr_err("Failed to encrypt SYNIC event page: %d\n", ret); - hv_cpu->synic_event_page = NULL; + hv_cpu->hyp_synic_event_page = NULL; } } } free_page((unsigned long)hv_cpu->post_msg_page); - free_page((unsigned long)hv_cpu->synic_event_page); - free_page((unsigned long)hv_cpu->synic_message_page); + free_page((unsigned long)hv_cpu->hyp_synic_event_page); + free_page((unsigned long)hv_cpu->hyp_synic_message_page); } kfree(hv_context.hv_numa_map); @@ -278,12 +278,12 @@ void hv_synic_enable_regs(unsigned int cpu) /* Mask out vTOM bit. ioremap_cache() maps decrypted */ u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; - hv_cpu->synic_message_page = + hv_cpu->hyp_synic_message_page = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); - if (!hv_cpu->synic_message_page) + if (!hv_cpu->hyp_synic_message_page) pr_err("Fail to map synic message page.\n"); } else { - simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page) + simp.base_simp_gpa = virt_to_phys(hv_cpu->hyp_synic_message_page) >> HV_HYP_PAGE_SHIFT; } @@ -297,12 +297,12 @@ void hv_synic_enable_regs(unsigned int cpu) /* Mask out vTOM bit. 
ioremap_cache() maps decrypted */ u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; - hv_cpu->synic_event_page = + hv_cpu->hyp_synic_event_page = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); - if (!hv_cpu->synic_event_page) + if (!hv_cpu->hyp_synic_event_page) pr_err("Fail to map synic event page.\n"); } else { - siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page) + siefp.base_siefp_gpa = virt_to_phys(hv_cpu->hyp_synic_event_page) >> HV_HYP_PAGE_SHIFT; } @@ -362,8 +362,8 @@ void hv_synic_disable_regs(unsigned int cpu) */ simp.simp_enabled = 0; if (ms_hyperv.paravisor_present || hv_root_partition()) { - iounmap(hv_cpu->synic_message_page); - hv_cpu->synic_message_page = NULL; + iounmap(hv_cpu->hyp_synic_message_page); + hv_cpu->hyp_synic_message_page = NULL; } else { simp.base_simp_gpa = 0; } @@ -374,8 +374,8 @@ void hv_synic_disable_regs(unsigned int cpu) siefp.siefp_enabled = 0; if (ms_hyperv.paravisor_present || hv_root_partition()) { - iounmap(hv_cpu->synic_event_page); - hv_cpu->synic_event_page = NULL; + iounmap(hv_cpu->hyp_synic_event_page); + hv_cpu->hyp_synic_event_page = NULL; } else { siefp.base_siefp_gpa = 0; } @@ -405,7 +405,7 @@ static bool hv_synic_event_pending(void) { struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); union hv_synic_event_flags *event = - (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT; + (union hv_synic_event_flags *)hv_cpu->hyp_synic_event_page + VMBUS_MESSAGE_SINT; unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */ bool pending; u32 relid; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 9ac6f5520287..d593af45a5b2 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -121,8 +121,8 @@ enum { * Per cpu state for channel handling */ struct hv_per_cpu_context { - void *synic_message_page; - void *synic_event_page; + void *hyp_synic_message_page; + void *hyp_synic_event_page; /* * The page is only used in hv_post_message() for a TDX VM (with the diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index e3931b0f1269..db6b42db2fdc 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -169,7 +169,7 @@ struct mshv_girq_routing_table { }; struct hv_synic_pages { - struct hv_message_page *synic_message_page; + struct hv_message_page *hyp_synic_message_page; struct hv_synic_event_flags_page *synic_event_flags_page; struct hv_synic_event_ring_page *synic_event_ring_page; }; diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c index e6b6381b7c36..f8b0337cdc82 100644 --- a/drivers/hv/mshv_synic.c +++ b/drivers/hv/mshv_synic.c @@ -394,7 +394,7 @@ unlock_out: void mshv_isr(void) { struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); - struct hv_message_page **msg_page = &spages->synic_message_page; + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; struct hv_message *msg; bool handled; @@ -456,7 +456,7 @@ int mshv_synic_init(unsigned int cpu) #endif union hv_synic_scontrol sctrl; struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); - struct hv_message_page **msg_page = &spages->synic_message_page; + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; struct hv_synic_event_flags_page **event_flags_page = &spages->synic_event_flags_page; struct hv_synic_event_ring_page **event_ring_page = @@ -550,7 +550,7 @@ int mshv_synic_cleanup(unsigned int cpu) union hv_synic_sirbp sirbp; union 
hv_synic_scontrol sctrl; struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); - struct hv_message_page **msg_page = &spages->synic_message_page; + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; struct hv_synic_event_flags_page **event_flags_page = &spages->synic_event_flags_page; struct hv_synic_event_ring_page **event_ring_page = diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 3c414560fa5f..e12f0ba0701f 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1060,7 +1060,7 @@ static void vmbus_onmessage_work(struct work_struct *work) void vmbus_on_msg_dpc(unsigned long data) { struct hv_per_cpu_context *hv_cpu = (void *)data; - void *page_addr = hv_cpu->synic_message_page; + void *page_addr = hv_cpu->hyp_synic_message_page; struct hv_message msg_copy, *msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; struct vmbus_channel_message_header *hdr; @@ -1244,7 +1244,7 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) * The event page can be directly checked to get the id of * the channel that has the interrupt pending. */ - void *page_addr = hv_cpu->synic_event_page; + void *page_addr = hv_cpu->hyp_synic_event_page; union hv_synic_event_flags *event = (union hv_synic_event_flags *)page_addr + VMBUS_MESSAGE_SINT; @@ -1327,7 +1327,7 @@ static void vmbus_isr(void) vmbus_chan_sched(hv_cpu); - page_addr = hv_cpu->synic_message_page; + page_addr = hv_cpu->hyp_synic_message_page; msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; /* Check if there are actual msgs to be processed */ From 226494e5ee4eb0bae4fc7b525505828271d5047e Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:09 -0700 Subject: [PATCH 13/58] Drivers: hv: Allocate the paravisor SynIC pages when required Confidential VMBus requires interacting with two SynICs -- one provided by the host hypervisor, and one provided by the paravisor. Each SynIC requires its own message and event pages. Refactor and extend the existing code to add allocating and freeing the message and event pages for the paravisor SynIC when it is present. Signed-off-by: Roman Kisel Reviewed-by: Tianyu Lan Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/hv.c | 184 +++++++++++++++++++------------------- drivers/hv/hyperv_vmbus.h | 18 ++++ 2 files changed, 112 insertions(+), 90 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 6fbb51e50673..d380e0df63f3 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -96,10 +96,70 @@ int hv_post_message(union hv_connection_id connection_id, return hv_result(status); } +static int hv_alloc_page(void **page, bool decrypt, const char *note) +{ + int ret = 0; + + /* + * After the page changes its encryption status, its contents might + * appear scrambled on some hardware. Thus `get_zeroed_page` would + * zero the page out in vain, so do that explicitly exactly once. + * + * By default, the page is allocated encrypted in a CoCo VM. + */ + *page = (void *)__get_free_page(GFP_KERNEL); + if (!*page) + return -ENOMEM; + + if (decrypt) + ret = set_memory_decrypted((unsigned long)*page, 1); + if (ret) + goto failed; + + memset(*page, 0, PAGE_SIZE); + return 0; + +failed: + /* + * Report the failure but don't put the page back on the free list as + * its encryption status is unknown. 
+ */ + pr_err("allocation failed for %s page, error %d, decrypted %d\n", + note, ret, decrypt); + *page = NULL; + return ret; +} + +static int hv_free_page(void **page, bool encrypt, const char *note) +{ + int ret = 0; + + if (!*page) + return 0; + + if (encrypt) + ret = set_memory_encrypted((unsigned long)*page, 1); + + /* + * In the case of the failure, the page is leaked. Something is wrong, + * prefer to lose the page with the unknown encryption status and stay afloat. + */ + if (ret) + pr_err("deallocation failed for %s page, error %d, encrypt %d\n", + note, ret, encrypt); + else + free_page((unsigned long)*page); + + *page = NULL; + + return ret; +} + int hv_synic_alloc(void) { int cpu, ret = -ENOMEM; struct hv_per_cpu_context *hv_cpu; + const bool decrypt = !vmbus_is_confidential(); /* * First, zero all per-cpu memory areas so hv_synic_free() can @@ -125,73 +185,37 @@ int hv_synic_alloc(void) vmbus_on_msg_dpc, (unsigned long)hv_cpu); if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { - hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (!hv_cpu->post_msg_page) { - pr_err("Unable to allocate post msg page\n"); + ret = hv_alloc_page(&hv_cpu->post_msg_page, + decrypt, "post msg"); + if (ret) goto err; - } - - ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1); - if (ret) { - pr_err("Failed to decrypt post msg page: %d\n", ret); - /* Just leak the page, as it's unsafe to free the page. */ - hv_cpu->post_msg_page = NULL; - goto err; - } - - memset(hv_cpu->post_msg_page, 0, PAGE_SIZE); } /* - * Synic message and event pages are allocated by paravisor. - * Skip these pages allocation here. + * If these SynIC pages are not allocated, SIEF and SIM pages + * are configured using what the root partition or the paravisor + * provides upon reading the SIEFP and SIMP registers. */ if (!ms_hyperv.paravisor_present && !hv_root_partition()) { - hv_cpu->hyp_synic_message_page = - (void *)get_zeroed_page(GFP_ATOMIC); - if (!hv_cpu->hyp_synic_message_page) { - pr_err("Unable to allocate SYNIC message page\n"); + ret = hv_alloc_page(&hv_cpu->hyp_synic_message_page, + decrypt, "hypervisor SynIC msg"); + if (ret) goto err; - } - - hv_cpu->hyp_synic_event_page = - (void *)get_zeroed_page(GFP_ATOMIC); - if (!hv_cpu->hyp_synic_event_page) { - pr_err("Unable to allocate SYNIC event page\n"); - - free_page((unsigned long)hv_cpu->hyp_synic_message_page); - hv_cpu->hyp_synic_message_page = NULL; + ret = hv_alloc_page(&hv_cpu->hyp_synic_event_page, + decrypt, "hypervisor SynIC event"); + if (ret) goto err; - } } - if (!ms_hyperv.paravisor_present && - (hv_isolation_type_snp() || hv_isolation_type_tdx())) { - ret = set_memory_decrypted((unsigned long) - hv_cpu->hyp_synic_message_page, 1); - if (ret) { - pr_err("Failed to decrypt SYNIC msg page: %d\n", ret); - hv_cpu->hyp_synic_message_page = NULL; - - /* - * Free the event page here so that hv_synic_free() - * won't later try to re-encrypt it. 
- */ - free_page((unsigned long)hv_cpu->hyp_synic_event_page); - hv_cpu->hyp_synic_event_page = NULL; + if (vmbus_is_confidential()) { + ret = hv_alloc_page(&hv_cpu->para_synic_message_page, + false, "paravisor SynIC msg"); + if (ret) goto err; - } - - ret = set_memory_decrypted((unsigned long) - hv_cpu->hyp_synic_event_page, 1); - if (ret) { - pr_err("Failed to decrypt SYNIC event page: %d\n", ret); - hv_cpu->hyp_synic_event_page = NULL; + ret = hv_alloc_page(&hv_cpu->para_synic_event_page, + false, "paravisor SynIC event"); + if (ret) goto err; - } - - memset(hv_cpu->hyp_synic_message_page, 0, PAGE_SIZE); - memset(hv_cpu->hyp_synic_event_page, 0, PAGE_SIZE); } } @@ -207,48 +231,28 @@ err: void hv_synic_free(void) { - int cpu, ret; + int cpu; + const bool encrypt = !vmbus_is_confidential(); for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); - /* It's better to leak the page if the encryption fails. */ - if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { - if (hv_cpu->post_msg_page) { - ret = set_memory_encrypted((unsigned long) - hv_cpu->post_msg_page, 1); - if (ret) { - pr_err("Failed to encrypt post msg page: %d\n", ret); - hv_cpu->post_msg_page = NULL; - } - } + if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) + hv_free_page(&hv_cpu->post_msg_page, + encrypt, "post msg"); + if (!ms_hyperv.paravisor_present && !hv_root_partition()) { + hv_free_page(&hv_cpu->hyp_synic_event_page, + encrypt, "hypervisor SynIC event"); + hv_free_page(&hv_cpu->hyp_synic_message_page, + encrypt, "hypervisor SynIC msg"); } - - if (!ms_hyperv.paravisor_present && - (hv_isolation_type_snp() || hv_isolation_type_tdx())) { - if (hv_cpu->hyp_synic_message_page) { - ret = set_memory_encrypted((unsigned long) - hv_cpu->hyp_synic_message_page, 1); - if (ret) { - pr_err("Failed to encrypt SYNIC msg page: %d\n", ret); - hv_cpu->hyp_synic_message_page = NULL; - } - } - - if (hv_cpu->hyp_synic_event_page) { - ret = set_memory_encrypted((unsigned long) - hv_cpu->hyp_synic_event_page, 1); - if (ret) { - pr_err("Failed to encrypt SYNIC event page: %d\n", ret); - hv_cpu->hyp_synic_event_page = NULL; - } - } + if (vmbus_is_confidential()) { + hv_free_page(&hv_cpu->para_synic_event_page, + false, "paravisor SynIC event"); + hv_free_page(&hv_cpu->para_synic_message_page, + false, "paravisor SynIC msg"); } - - free_page((unsigned long)hv_cpu->post_msg_page); - free_page((unsigned long)hv_cpu->hyp_synic_event_page); - free_page((unsigned long)hv_cpu->hyp_synic_message_page); } kfree(hv_context.hv_numa_map); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index d593af45a5b2..3c70051c0431 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -121,8 +121,26 @@ enum { * Per cpu state for channel handling */ struct hv_per_cpu_context { + /* + * SynIC pages for communicating with the host. + * + * These pages are accessible to the host partition and the hypervisor. + * They may be used for exchanging data with the host partition and the + * hypervisor even when they aren't trusted yet the guest partition + * must be prepared to handle the malicious behavior. + */ void *hyp_synic_message_page; void *hyp_synic_event_page; + /* + * SynIC pages for communicating with the paravisor. + * + * These pages may be accessed from within the guest partition only in + * CoCo VMs. Neither the host partition nor the hypervisor can access + * these pages in that case; they are used for exchanging data with the + * paravisor. 
+ */ + void *para_synic_message_page; + void *para_synic_event_page; /* * The page is only used in hv_post_message() for a TDX VM (with the From 25059d5e4c5af37688797d87b2d2004ac1cddff7 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:10 -0700 Subject: [PATCH 14/58] Drivers: hv: Post messages through the confidential VMBus if available When the confidential VMBus is available, the guest should post messages to the paravisor. Update hv_post_message() to post messages to the paravisor rather than through GHCB or TD calls. Signed-off-by: Roman Kisel Reviewed-by: Tianyu Lan Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/hv.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index d380e0df63f3..aca3a275bb1f 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -74,7 +74,11 @@ int hv_post_message(union hv_connection_id connection_id, aligned_msg->payload_size = payload_size; memcpy((void *)aligned_msg->payload, payload, payload_size); - if (ms_hyperv.paravisor_present) { + if (ms_hyperv.paravisor_present && !vmbus_is_confidential()) { + /* + * If the VMBus isn't confidential, use the CoCo-specific + * mechanism to communicate with the hypervisor. + */ if (hv_isolation_type_tdx()) status = hv_tdx_hypercall(HVCALL_POST_MESSAGE, virt_to_phys(aligned_msg), 0); @@ -88,6 +92,11 @@ int hv_post_message(union hv_connection_id connection_id, u64 control = HVCALL_POST_MESSAGE; control |= hv_nested ? HV_HYPERCALL_NESTED : 0; + /* + * If there is no paravisor, this will go to the hypervisor. + * In the Confidential VMBus case, there is the paravisor + * to which this will trap. + */ status = hv_do_hypercall(control, aligned_msg, NULL); } From 1bb15327d529a4e30f69b6c3486dfe8d7a753ff5 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:11 -0700 Subject: [PATCH 15/58] Drivers: hv: remove stale comment The comment about the x2v shim is ancient and long since incorrect. Remove the incorrect comment. Signed-off-by: Roman Kisel Reviewed-by: Tianyu Lan Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/hv.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index aca3a275bb1f..e9e1edd75107 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -268,11 +268,7 @@ void hv_synic_free(void) } /* - * hv_synic_init - Initialize the Synthetic Interrupt Controller. - * - * If it is already initialized by another entity (ie x2v shim), we need to - * retrieve the initialized message and event pages. Otherwise, we create and - * initialize the message and event pages. + * hv_synic_enable_regs - Initialize the Synthetic Interrupt Controller. */ void hv_synic_enable_regs(unsigned int cpu) { From 09406f2f8466fe44894f444943134a09c8519e4f Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:12 -0700 Subject: [PATCH 16/58] Drivers: hv: Check message and event pages for non-NULL before iounmap() It might happen that some hyp SynIC pages aren't allocated. Check for that and only then call iounmap(). 
Signed-off-by: Roman Kisel Reviewed-by: Tianyu Lan Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/hv.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index e9e1edd75107..77da2d211378 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -371,8 +371,10 @@ void hv_synic_disable_regs(unsigned int cpu) */ simp.simp_enabled = 0; if (ms_hyperv.paravisor_present || hv_root_partition()) { - iounmap(hv_cpu->hyp_synic_message_page); - hv_cpu->hyp_synic_message_page = NULL; + if (hv_cpu->hyp_synic_message_page) { + iounmap(hv_cpu->hyp_synic_message_page); + hv_cpu->hyp_synic_message_page = NULL; + } } else { simp.base_simp_gpa = 0; } @@ -383,8 +385,10 @@ void hv_synic_disable_regs(unsigned int cpu) siefp.siefp_enabled = 0; if (ms_hyperv.paravisor_present || hv_root_partition()) { - iounmap(hv_cpu->hyp_synic_event_page); - hv_cpu->hyp_synic_event_page = NULL; + if (hv_cpu->hyp_synic_event_page) { + iounmap(hv_cpu->hyp_synic_event_page); + hv_cpu->hyp_synic_event_page = NULL; + } } else { siefp.base_siefp_gpa = 0; } From 74fa5d7e5fbdecdff64f149d9c14c74baa5cb27b Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:13 -0700 Subject: [PATCH 17/58] Drivers: hv: Rename the SynIC enable and disable routines The confidential VMBus requires support for the both hypervisor facing SynIC and the paravisor one. Rename the functions that enable and disable SynIC with the hypervisor. No functional changes. Signed-off-by: Roman Kisel Reviewed-by: Tianyu Lan Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 2 +- drivers/hv/hv.c | 11 ++++++----- drivers/hv/hyperv_vmbus.h | 4 ++-- drivers/hv/vmbus_drv.c | 6 +++--- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 1a33c6944b3c..6d66cbc9030b 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -846,7 +846,7 @@ static void vmbus_wait_for_unload(void) /* * In a CoCo VM the hyp_synic_message_page is not allocated * in hv_synic_alloc(). Instead it is set/cleared in - * hv_synic_enable_regs() and hv_synic_disable_regs() + * hv_hyp_synic_enable_regs() and hv_hyp_synic_disable_regs() * such that it is set only when the CPU is online. If * not all present CPUs are online, the message page * might be NULL, so skip such CPUs. diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 77da2d211378..81b6c3b0f21a 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -268,9 +268,10 @@ void hv_synic_free(void) } /* - * hv_synic_enable_regs - Initialize the Synthetic Interrupt Controller. + * hv_hyp_synic_enable_regs - Initialize the Synthetic Interrupt Controller + * with the hypervisor. 
*/ -void hv_synic_enable_regs(unsigned int cpu) +void hv_hyp_synic_enable_regs(unsigned int cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); @@ -337,14 +338,14 @@ void hv_synic_enable_regs(unsigned int cpu) int hv_synic_init(unsigned int cpu) { - hv_synic_enable_regs(cpu); + hv_hyp_synic_enable_regs(cpu); hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT); return 0; } -void hv_synic_disable_regs(unsigned int cpu) +void hv_hyp_synic_disable_regs(unsigned int cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); @@ -532,7 +533,7 @@ int hv_synic_cleanup(unsigned int cpu) always_cleanup: hv_stimer_legacy_cleanup(cpu); - hv_synic_disable_regs(cpu); + hv_hyp_synic_disable_regs(cpu); return ret; } diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 3c70051c0431..552ed782bcfc 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -190,10 +190,10 @@ extern int hv_synic_alloc(void); extern void hv_synic_free(void); -extern void hv_synic_enable_regs(unsigned int cpu); +extern void hv_hyp_synic_enable_regs(unsigned int cpu); extern int hv_synic_init(unsigned int cpu); -extern void hv_synic_disable_regs(unsigned int cpu); +extern void hv_hyp_synic_disable_regs(unsigned int cpu); extern int hv_synic_cleanup(unsigned int cpu); /* Interface */ diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index e12f0ba0701f..2b5bf672c467 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2810,7 +2810,7 @@ static void hv_crash_handler(struct pt_regs *regs) */ cpu = smp_processor_id(); hv_stimer_cleanup(cpu); - hv_synic_disable_regs(cpu); + hv_hyp_synic_disable_regs(cpu); }; static int hv_synic_suspend(void) @@ -2835,14 +2835,14 @@ static int hv_synic_suspend(void) * interrupts-disabled context. */ - hv_synic_disable_regs(0); + hv_hyp_synic_disable_regs(0); return 0; } static void hv_synic_resume(void) { - hv_synic_enable_regs(0); + hv_hyp_synic_enable_regs(0); /* * Note: we don't need to call hv_stimer_init(0), because the timer From e096fe2bd623f77d5407f1d9a40f5d8d2a596680 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:14 -0700 Subject: [PATCH 18/58] Drivers: hv: Functions for setting up and tearing down the paravisor SynIC The confidential VMBus runs with the paravisor SynIC and requires configuring it with the paravisor. Add the functions for configuring the paravisor SynIC. Update overall SynIC initialization logic to initialize the SynIC if it is present. Finally, break out SynIC interrupt enable/disable code into separate functions so that SynIC interrupts can be enabled or disabled via the paravisor instead of the hypervisor if the paravisor SynIC is present. Signed-off-by: Roman Kisel Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/hv.c | 138 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 126 insertions(+), 12 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 81b6c3b0f21a..936c5f310df6 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -278,9 +278,8 @@ void hv_hyp_synic_enable_regs(unsigned int cpu) union hv_synic_simp simp; union hv_synic_siefp siefp; union hv_synic_sint shared_sint; - union hv_synic_scontrol sctrl; - /* Setup the Synic's message page */ + /* Setup the Synic's message page with the hypervisor. 
*/ simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); simp.simp_enabled = 1; @@ -299,7 +298,7 @@ void hv_hyp_synic_enable_regs(unsigned int cpu) hv_set_msr(HV_MSR_SIMP, simp.as_uint64); - /* Setup the Synic's event page */ + /* Setup the Synic's event page with the hypervisor. */ siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); siefp.siefp_enabled = 1; @@ -328,6 +327,11 @@ void hv_hyp_synic_enable_regs(unsigned int cpu) shared_sint.masked = false; shared_sint.auto_eoi = hv_recommend_using_aeoi(); hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); +} + +static void hv_hyp_synic_enable_interrupts(void) +{ + union hv_synic_scontrol sctrl; /* Enable the global synic bit */ sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); @@ -336,9 +340,59 @@ void hv_hyp_synic_enable_regs(unsigned int cpu) hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); } +static void hv_para_synic_enable_regs(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + struct hv_per_cpu_context *hv_cpu + = per_cpu_ptr(hv_context.cpu_context, cpu); + + /* Setup the Synic's message page with the paravisor. */ + simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP); + simp.simp_enabled = 1; + simp.base_simp_gpa = virt_to_phys(hv_cpu->para_synic_message_page) + >> HV_HYP_PAGE_SHIFT; + hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64); + + /* Setup the Synic's event page with the paravisor. */ + siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP); + siefp.siefp_enabled = 1; + siefp.base_siefp_gpa = virt_to_phys(hv_cpu->para_synic_event_page) + >> HV_HYP_PAGE_SHIFT; + hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64); +} + +static void hv_para_synic_enable_interrupts(void) +{ + union hv_synic_scontrol sctrl; + + /* Enable the global synic bit */ + sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL); + sctrl.enable = 1; + hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64); +} + int hv_synic_init(unsigned int cpu) { + if (vmbus_is_confidential()) + hv_para_synic_enable_regs(cpu); + + /* + * The SINT is set in hv_hyp_synic_enable_regs() by calling + * hv_set_msr(). hv_set_msr() in turn has special case code for the + * SINT MSRs that write to the hypervisor version of the MSR *and* + * the paravisor version of the MSR (but *without* the proxy bit when + * VMBus is confidential). + * + * Then enable interrupts via the paravisor if VMBus is confidential, + * and otherwise via the hypervisor. + */ + hv_hyp_synic_enable_regs(cpu); + if (vmbus_is_confidential()) + hv_para_synic_enable_interrupts(); + else + hv_hyp_synic_enable_interrupts(); hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT); @@ -352,7 +406,6 @@ void hv_hyp_synic_disable_regs(unsigned int cpu) union hv_synic_sint shared_sint; union hv_synic_simp simp; union hv_synic_siefp siefp; - union hv_synic_scontrol sctrl; shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT); @@ -365,7 +418,7 @@ void hv_hyp_synic_disable_regs(unsigned int cpu) simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); /* - * In Isolation VM, sim and sief pages are allocated by + * In Isolation VM, simp and sief pages are allocated by * paravisor. These pages also will be used by kdump * kernel. So just reset enable bit here and keep page * addresses. 
@@ -395,14 +448,42 @@ void hv_hyp_synic_disable_regs(unsigned int cpu) } hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); +} + +static void hv_hyp_synic_disable_interrupts(void) +{ + union hv_synic_scontrol sctrl; /* Disable the global synic bit */ sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); sctrl.enable = 0; hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); +} - if (vmbus_irq != -1) - disable_percpu_irq(vmbus_irq); +static void hv_para_synic_disable_regs(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + + /* Disable SynIC's message page in the paravisor. */ + simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP); + simp.simp_enabled = 0; + hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64); + + /* Disable SynIC's event page in the paravisor. */ + siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP); + siefp.siefp_enabled = 0; + hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64); +} + +static void hv_para_synic_disable_interrupts(void) +{ + union hv_synic_scontrol sctrl; + + /* Disable the global synic bit */ + sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL); + sctrl.enable = 0; + hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64); } #define HV_MAX_TRIES 3 @@ -415,16 +496,18 @@ void hv_hyp_synic_disable_regs(unsigned int cpu) * that the normal interrupt handling mechanism will find and process the channel interrupt * "very soon", and in the process clear the bit. */ -static bool hv_synic_event_pending(void) +static bool __hv_synic_event_pending(union hv_synic_event_flags *event, int sint) { - struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); - union hv_synic_event_flags *event = - (union hv_synic_event_flags *)hv_cpu->hyp_synic_event_page + VMBUS_MESSAGE_SINT; - unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */ + unsigned long *recv_int_page; bool pending; u32 relid; int tries = 0; + if (!event) + return false; + + event += sint; + recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */ retry: pending = false; for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) { @@ -441,6 +524,17 @@ retry: return pending; } +static bool hv_synic_event_pending(void) +{ + struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); + union hv_synic_event_flags *hyp_synic_event_page = hv_cpu->hyp_synic_event_page; + union hv_synic_event_flags *para_synic_event_page = hv_cpu->para_synic_event_page; + + return + __hv_synic_event_pending(hyp_synic_event_page, VMBUS_MESSAGE_SINT) || + __hv_synic_event_pending(para_synic_event_page, VMBUS_MESSAGE_SINT); +} + static int hv_pick_new_cpu(struct vmbus_channel *channel) { int ret = -EBUSY; @@ -533,7 +627,27 @@ int hv_synic_cleanup(unsigned int cpu) always_cleanup: hv_stimer_legacy_cleanup(cpu); + /* + * First, disable the event and message pages + * used for communicating with the host, and then + * disable the host interrupts if VMBus is not + * confidential. + */ hv_hyp_synic_disable_regs(cpu); + if (!vmbus_is_confidential()) + hv_hyp_synic_disable_interrupts(); + + /* + * Perform the same steps for the Confidential VMBus. + * The sequencing provides the guarantee that no data + * may be posted for processing before disabling interrupts. 
+ */ + if (vmbus_is_confidential()) { + hv_para_synic_disable_regs(cpu); + hv_para_synic_disable_interrupts(); + } + if (vmbus_irq != -1) + disable_percpu_irq(vmbus_irq); return ret; } From 0a4534bdf29a5b7f5a355c267d28dad9c40ba252 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:15 -0700 Subject: [PATCH 19/58] Drivers: hv: Allocate encrypted buffers when requested Confidential VMBus is built around using buffers not shared with the host. Support allocating encrypted buffers when requested. Signed-off-by: Roman Kisel Reviewed-by: Tianyu Lan Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel.c | 49 +++++++++++++++++++++++---------------- drivers/hv/hyperv_vmbus.h | 3 ++- drivers/hv/ring_buffer.c | 5 ++-- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 162d6aeece7b..d69713201bef 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -444,20 +444,23 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, return ret; } - /* - * Set the "decrypted" flag to true for the set_memory_decrypted() - * success case. In the failure case, the encryption state of the - * memory is unknown. Leave "decrypted" as true to ensure the - * memory will be leaked instead of going back on the free list. - */ - gpadl->decrypted = true; - ret = set_memory_decrypted((unsigned long)kbuffer, - PFN_UP(size)); - if (ret) { - dev_warn(&channel->device_obj->device, - "Failed to set host visibility for new GPADL %d.\n", - ret); - return ret; + gpadl->decrypted = !((channel->co_external_memory && type == HV_GPADL_BUFFER) || + (channel->co_ring_buffer && type == HV_GPADL_RING)); + if (gpadl->decrypted) { + /* + * The "decrypted" flag being true assumes that set_memory_decrypted() succeeds. + * But if it fails, the encryption state of the memory is unknown. In that case, + * leave "decrypted" as true to ensure the memory is leaked instead of going back + * on the free list. + */ + ret = set_memory_decrypted((unsigned long)kbuffer, + PFN_UP(size)); + if (ret) { + dev_warn(&channel->device_obj->device, + "Failed to set host visibility for new GPADL %d.\n", + ret); + return ret; + } } init_completion(&msginfo->waitevent); @@ -545,8 +548,10 @@ cleanup: * left as true so the memory is leaked instead of being * put back on the free list. 
*/ - if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size))) - gpadl->decrypted = false; + if (gpadl->decrypted) { + if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size))) + gpadl->decrypted = false; + } } return ret; @@ -677,12 +682,13 @@ static int __vmbus_open(struct vmbus_channel *newchannel, goto error_clean_ring; err = hv_ringbuffer_init(&newchannel->outbound, - page, send_pages, 0); + page, send_pages, 0, newchannel->co_ring_buffer); if (err) goto error_free_gpadl; err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages], - recv_pages, newchannel->max_pkt_size); + recv_pages, newchannel->max_pkt_size, + newchannel->co_ring_buffer); if (err) goto error_free_gpadl; @@ -863,8 +869,11 @@ post_msg_err: kfree(info); - ret = set_memory_encrypted((unsigned long)gpadl->buffer, - PFN_UP(gpadl->size)); + if (gpadl->decrypted) + ret = set_memory_encrypted((unsigned long)gpadl->buffer, + PFN_UP(gpadl->size)); + else + ret = 0; if (ret) pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 552ed782bcfc..f7fc2630c054 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -201,7 +201,8 @@ extern int hv_synic_cleanup(unsigned int cpu); void hv_ringbuffer_pre_init(struct vmbus_channel *channel); int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, - struct page *pages, u32 pagecnt, u32 max_pkt_size); + struct page *pages, u32 pagecnt, u32 max_pkt_size, + bool confidential); void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info); diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 23ce1fb70de1..3c421a7f78c0 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -184,7 +184,8 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel) /* Initialize the ring buffer. */ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, - struct page *pages, u32 page_cnt, u32 max_pkt_size) + struct page *pages, u32 page_cnt, u32 max_pkt_size, + bool confidential) { struct page **pages_wraparound; int i; @@ -208,7 +209,7 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, ring_info->ring_buffer = (struct hv_ring_buffer *) vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP, - pgprot_decrypted(PAGE_KERNEL)); + confidential ? PAGE_KERNEL : pgprot_decrypted(PAGE_KERNEL)); kfree(pages_wraparound); if (!ring_info->ring_buffer) From 510164539f16062e842a9de762616b5008616fa1 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:16 -0700 Subject: [PATCH 20/58] Drivers: hv: Free msginfo when the buffer fails to decrypt The early failure path in __vmbus_establish_gpadl() doesn't deallocate msginfo if the buffer fails to decrypt. Fix the leak by breaking out the cleanup code into a separate function and calling it where required. 
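In schematic form, the corrected early-exit pairs the intentionally leaked
buffer with an explicit release of the bookkeeping allocation. This is only a
minimal sketch using the names introduced by the patch; locking and the other
error paths are omitted:

    ret = set_memory_decrypted((unsigned long)kbuffer, PFN_UP(size));
    if (ret) {
        /*
         * The buffer itself is deliberately not freed because its
         * encryption state is unknown, but the msginfo header and any
         * queued sub-messages must still be released.
         */
        vmbus_free_channel_msginfo(msginfo);
        return ret;
    }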
Fixes: d4dccf353db80 ("Drivers: hv: vmbus: Mark vmbus ring buffer visible to host in Isolation VM") Reported-by: Michael Kelley Closes: https://lore.kernel.org/linux-hyperv/SN6PR02MB41573796F9787F67E0E97049D472A@SN6PR02MB4157.namprd02.prod.outlook.com Signed-off-by: Roman Kisel Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index d69713201bef..88485d255a42 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -410,6 +410,21 @@ static int create_gpadl_header(enum hv_gpadl_type type, void *kbuffer, return 0; } +static void vmbus_free_channel_msginfo(struct vmbus_channel_msginfo *msginfo) +{ + struct vmbus_channel_msginfo *submsginfo, *tmp; + + if (!msginfo) + return; + + list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist, + msglistentry) { + kfree(submsginfo); + } + + kfree(msginfo); +} + /* * __vmbus_establish_gpadl - Establish a GPADL for a buffer or ringbuffer * @@ -429,7 +444,7 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, struct vmbus_channel_gpadl_header *gpadlmsg; struct vmbus_channel_gpadl_body *gpadl_body; struct vmbus_channel_msginfo *msginfo = NULL; - struct vmbus_channel_msginfo *submsginfo, *tmp; + struct vmbus_channel_msginfo *submsginfo; struct list_head *curr; u32 next_gpadl_handle; unsigned long flags; @@ -459,6 +474,7 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, dev_warn(&channel->device_obj->device, "Failed to set host visibility for new GPADL %d.\n", ret); + vmbus_free_channel_msginfo(msginfo); return ret; } } @@ -535,12 +551,8 @@ cleanup: spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); list_del(&msginfo->msglistentry); spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); - list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist, - msglistentry) { - kfree(submsginfo); - } - kfree(msginfo); + vmbus_free_channel_msginfo(msginfo); if (ret) { /* From bf35d298bb9ab2ccbcf654c1acb65ddc758ad690 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:17 -0700 Subject: [PATCH 21/58] Drivers: hv: Support confidential VMBus channels To make use of Confidential VMBus channels, initialize the co_ring_buffers and co_external_memory fields of the channel structure. Advertise support upon negotiating the version and compute values for those fields and initialize them. 
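The offer validation added here reduces to two rules, sketched below with the
same helpers the patch uses. is_co_ring_buffer()/is_co_external_memory() are
assumed to simply read the corresponding offer flags (their definitions are
not part of this patch), and the bare returns stand in for the error paths in
vmbus_onoffer():

    bool co_ring = is_co_ring_buffer(offer);
    bool co_ext = is_co_external_memory(offer);

    /* External memory may only be encrypted if the ring buffer is too. */
    if (!co_ring && co_ext)
        return;                 /* reject the offer */

    /* Either flag requires protocol >= 6.0 and a confidential connection. */
    if ((co_ring || co_ext) &&
        (vmbus_proto_version < VERSION_WIN10_V6_0 || !vmbus_is_confidential()))
        return;                 /* reject the offer */

    newchannel->co_ring_buffer = co_ring;
    newchannel->co_external_memory = co_ext;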
Signed-off-by: Roman Kisel Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 19 +++++++++++++++++++ drivers/hv/connection.c | 3 +++ 2 files changed, 22 insertions(+) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 6d66cbc9030b..74fed2c073d4 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -1022,6 +1022,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) struct vmbus_channel_offer_channel *offer; struct vmbus_channel *oldchannel, *newchannel; size_t offer_sz; + bool co_ring_buffer, co_external_memory; offer = (struct vmbus_channel_offer_channel *)hdr; @@ -1034,6 +1035,22 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) return; } + co_ring_buffer = is_co_ring_buffer(offer); + co_external_memory = is_co_external_memory(offer); + if (!co_ring_buffer && co_external_memory) { + pr_err("Invalid offer relid=%d: the ring buffer isn't encrypted\n", + offer->child_relid); + return; + } + if (co_ring_buffer || co_external_memory) { + if (vmbus_proto_version < VERSION_WIN10_V6_0 || !vmbus_is_confidential()) { + pr_err("Invalid offer relid=%d: no support for confidential VMBus\n", + offer->child_relid); + atomic_dec(&vmbus_connection.offer_in_progress); + return; + } + } + oldchannel = find_primary_channel_by_offer(offer); if (oldchannel != NULL) { @@ -1112,6 +1129,8 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) pr_err("Unable to allocate channel object\n"); return; } + newchannel->co_ring_buffer = co_ring_buffer; + newchannel->co_external_memory = co_external_memory; vmbus_setup_channel_state(newchannel, offer); diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 1fe3573ae52a..5ac9232396f7 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -105,6 +105,9 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID; } + if (vmbus_is_confidential() && version >= VERSION_WIN10_V6_0) + msg->feature_flags = VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS; + /* * shared_gpa_boundary is zero in non-SNP VMs, so it's safe to always * bitwise OR it From b537794bc28aafd05c317bda9d7a4d8f7d6a528d Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:18 -0700 Subject: [PATCH 22/58] Drivers: hv: Set the default VMBus version to 6.0 The confidential VMBus is supported by the protocol version 6.0 onwards. Attempt to establish the VMBus 6.0 connection thus enabling the confidential VMBus features when available. Signed-off-by: Roman Kisel Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/connection.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 5ac9232396f7..5d9cb5bf2d62 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version); * Linux guests and are not listed. */ static __u32 vmbus_versions[] = { + VERSION_WIN10_V6_0, VERSION_WIN10_V5_3, VERSION_WIN10_V5_2, VERSION_WIN10_V5_1, @@ -65,7 +66,7 @@ static __u32 vmbus_versions[] = { * Maximal VMBus protocol version guests can negotiate. Useful to cap the * VMBus version for testing and debugging purpose. 
*/ -static uint max_version = VERSION_WIN10_V5_3; +static uint max_version = VERSION_WIN10_V6_0; module_param(max_version, uint, S_IRUGO); MODULE_PARM_DESC(max_version, From 2647c96649ba4ba9fb0fa5dfd101a15257145a3d Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:19 -0700 Subject: [PATCH 23/58] Drivers: hv: Support establishing the confidential VMBus connection To establish the confidential VMBus connection the CoCo VM, the guest first checks on the confidential VMBus availability, and then proceeds to initializing the communication stack. Implement that in the VMBus driver initialization. Signed-off-by: Roman Kisel Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 184 +++++++++++++++++++++++++---------------- 1 file changed, 114 insertions(+), 70 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 2b5bf672c467..0dc4692b411a 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1057,12 +1057,9 @@ static void vmbus_onmessage_work(struct work_struct *work) kfree(ctx); } -void vmbus_on_msg_dpc(unsigned long data) +static void __vmbus_on_msg_dpc(void *message_page_addr) { - struct hv_per_cpu_context *hv_cpu = (void *)data; - void *page_addr = hv_cpu->hyp_synic_message_page; - struct hv_message msg_copy, *msg = (struct hv_message *)page_addr + - VMBUS_MESSAGE_SINT; + struct hv_message msg_copy, *msg; struct vmbus_channel_message_header *hdr; enum vmbus_channel_message_type msgtype; const struct vmbus_channel_message_table_entry *entry; @@ -1070,6 +1067,10 @@ void vmbus_on_msg_dpc(unsigned long data) __u8 payload_size; u32 message_type; + if (!message_page_addr) + return; + msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT; + /* * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as * it is being used in 'struct vmbus_channel_message_header' definition @@ -1195,6 +1196,14 @@ msg_handled: vmbus_signal_eom(msg, message_type); } +void vmbus_on_msg_dpc(unsigned long data) +{ + struct hv_per_cpu_context *hv_cpu = (void *)data; + + __vmbus_on_msg_dpc(hv_cpu->hyp_synic_message_page); + __vmbus_on_msg_dpc(hv_cpu->para_synic_message_page); +} + #ifdef CONFIG_PM_SLEEP /* * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for @@ -1233,21 +1242,19 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) #endif /* CONFIG_PM_SLEEP */ /* - * Schedule all channels with events pending + * Schedule all channels with events pending. + * The event page can be directly checked to get the id of + * the channel that has the interrupt pending. */ -static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) +static void vmbus_chan_sched(void *event_page_addr) { unsigned long *recv_int_page; u32 maxbits, relid; + union hv_synic_event_flags *event; - /* - * The event page can be directly checked to get the id of - * the channel that has the interrupt pending. 
- */ - void *page_addr = hv_cpu->hyp_synic_event_page; - union hv_synic_event_flags *event - = (union hv_synic_event_flags *)page_addr + - VMBUS_MESSAGE_SINT; + if (!event_page_addr) + return; + event = (union hv_synic_event_flags *)event_page_addr + VMBUS_MESSAGE_SINT; maxbits = HV_EVENT_FLAGS_COUNT; recv_int_page = event->flags; @@ -1255,6 +1262,11 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (unlikely(!recv_int_page)) return; + /* + * Suggested-by: Michael Kelley + * One possible optimization would be to keep track of the largest relID that's in use, + * and only scan up to that relID. + */ for_each_set_bit(relid, recv_int_page, maxbits) { void (*callback_fn)(void *context); struct vmbus_channel *channel; @@ -1318,26 +1330,35 @@ sched_unlock_rcu: } } -static void vmbus_isr(void) +static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message_page_addr) { - struct hv_per_cpu_context *hv_cpu - = this_cpu_ptr(hv_context.cpu_context); - void *page_addr; struct hv_message *msg; - vmbus_chan_sched(hv_cpu); - - page_addr = hv_cpu->hyp_synic_message_page; - msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; + if (!message_page_addr) + return; + msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT; /* Check if there are actual msgs to be processed */ if (msg->header.message_type != HVMSG_NONE) { if (msg->header.message_type == HVMSG_TIMER_EXPIRED) { hv_stimer0_isr(); vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED); - } else + } else { tasklet_schedule(&hv_cpu->msg_dpc); + } } +} + +static void vmbus_isr(void) +{ + struct hv_per_cpu_context *hv_cpu + = this_cpu_ptr(hv_context.cpu_context); + + vmbus_chan_sched(hv_cpu->hyp_synic_event_page); + vmbus_chan_sched(hv_cpu->para_synic_event_page); + + vmbus_message_sched(hv_cpu, hv_cpu->hyp_synic_message_page); + vmbus_message_sched(hv_cpu, hv_cpu->para_synic_message_page); add_interrupt_randomness(vmbus_interrupt); } @@ -1355,54 +1376,14 @@ static void vmbus_percpu_work(struct work_struct *work) hv_synic_init(cpu); } -/* - * vmbus_bus_init -Main vmbus driver initialization routine. - * - * Here, we - * - initialize the vmbus driver context - * - invoke the vmbus hv main init routine - * - retrieve the channel offers - */ -static int vmbus_bus_init(void) +static int vmbus_alloc_synic_and_connect(void) { int ret, cpu; struct work_struct __percpu *works; - - ret = hv_init(); - if (ret != 0) { - pr_err("Unable to initialize the hypervisor - 0x%x\n", ret); - return ret; - } - - ret = bus_register(&hv_bus); - if (ret) - return ret; - - /* - * VMbus interrupts are best modeled as per-cpu interrupts. If - * on an architecture with support for per-cpu IRQs (e.g. ARM64), - * allocate a per-cpu IRQ using standard Linux kernel functionality. - * If not on such an architecture (e.g., x86/x64), then rely on - * code in the arch-specific portion of the code tree to connect - * the VMbus interrupt handler. 
- */ - - if (vmbus_irq == -1) { - hv_setup_vmbus_handler(vmbus_isr); - } else { - vmbus_evt = alloc_percpu(long); - ret = request_percpu_irq(vmbus_irq, vmbus_percpu_isr, - "Hyper-V VMbus", vmbus_evt); - if (ret) { - pr_err("Can't request Hyper-V VMbus IRQ %d, Err %d", - vmbus_irq, ret); - free_percpu(vmbus_evt); - goto err_setup; - } - } + int hyperv_cpuhp_online; ret = hv_synic_alloc(); - if (ret) + if (ret < 0) goto err_alloc; works = alloc_percpu(struct work_struct); @@ -1436,6 +1417,72 @@ static int vmbus_bus_init(void) hyperv_cpuhp_online = ret; ret = vmbus_connect(); + if (ret) + goto err_connect; + return 0; + +err_connect: + cpuhp_remove_state(hyperv_cpuhp_online); + return -ENODEV; +err_alloc: + hv_synic_free(); + return -ENOMEM; +} + +/* + * vmbus_bus_init -Main vmbus driver initialization routine. + * + * Here, we + * - initialize the vmbus driver context + * - invoke the vmbus hv main init routine + * - retrieve the channel offers + */ +static int vmbus_bus_init(void) +{ + int ret; + + ret = hv_init(); + if (ret != 0) { + pr_err("Unable to initialize the hypervisor - 0x%x\n", ret); + return ret; + } + + ret = bus_register(&hv_bus); + if (ret) + return ret; + + /* + * VMbus interrupts are best modeled as per-cpu interrupts. If + * on an architecture with support for per-cpu IRQs (e.g. ARM64), + * allocate a per-cpu IRQ using standard Linux kernel functionality. + * If not on such an architecture (e.g., x86/x64), then rely on + * code in the arch-specific portion of the code tree to connect + * the VMbus interrupt handler. + */ + + if (vmbus_irq == -1) { + hv_setup_vmbus_handler(vmbus_isr); + } else { + vmbus_evt = alloc_percpu(long); + ret = request_percpu_irq(vmbus_irq, vmbus_percpu_isr, + "Hyper-V VMbus", vmbus_evt); + if (ret) { + pr_err("Can't request Hyper-V VMbus IRQ %d, Err %d", + vmbus_irq, ret); + free_percpu(vmbus_evt); + goto err_setup; + } + } + + /* + * Cache the value as getting it involves a VM exit on x86(_64), and + * doing that on each VP while initializing SynIC's wastes time. + */ + is_confidential = ms_hyperv.confidential_vmbus_available; + if (is_confidential) + pr_info("Establishing connection to the confidential VMBus\n"); + hv_para_set_sint_proxy(!is_confidential); + ret = vmbus_alloc_synic_and_connect(); if (ret) goto err_connect; @@ -1451,9 +1498,6 @@ static int vmbus_bus_init(void) return 0; err_connect: - cpuhp_remove_state(hyperv_cpuhp_online); -err_alloc: - hv_synic_free(); if (vmbus_irq == -1) { hv_remove_vmbus_handler(); } else { From 9ebc528cfdada055984f919710c31be281cb717c Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 10 Oct 2025 14:55:47 -0700 Subject: [PATCH 24/58] mshv: Only map vp->vp_stats_pages if on root scheduler This mapping is only used for checking if the dispatch thread is blocked. This is only relevant for the root scheduler, so check the scheduler type to determine whether to map/unmap these pages, instead of the current check, which is incorrect. 
Signed-off-by: Nuno Das Neves Reviewed-by: Anirudh Rayabharam Reviewed-by: Praveen K Paladugu Reviewed-by: Easwar Hariharan Reviewed-by: Tianyu Lan Acked-by: Stanislav Kinsburskii Reviewed-by: Anirudh Rayabharam (Microsoft) Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index a599bbf1f9b8..f1fabf52c8f5 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -936,7 +936,11 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, goto unmap_register_page; } - if (hv_parent_partition()) { + /* + * This mapping of the stats page is for detecting if dispatch thread + * is blocked - only relevant for root scheduler + */ + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) { ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, stats_pages); if (ret) @@ -965,7 +969,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) vp->vp_ghcb_page = page_to_virt(ghcb_page); - if (hv_parent_partition()) + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); /* @@ -988,7 +992,7 @@ put_partition: free_vp: kfree(vp); unmap_stats_pages: - if (hv_parent_partition()) + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) mshv_vp_stats_unmap(partition->pt_id, args.vp_index); unmap_ghcb_page: if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) { @@ -1742,7 +1746,7 @@ static void destroy_partition(struct mshv_partition *partition) if (!vp) continue; - if (hv_parent_partition()) + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) mshv_vp_stats_unmap(partition->pt_id, vp->vp_index); if (vp->vp_register_page) { From 59aeea195948fd507cef2e439a5a964b8432750e Mon Sep 17 00:00:00 2001 From: Purna Pavan Chandra Aekkaladevi Date: Fri, 10 Oct 2025 14:55:48 -0700 Subject: [PATCH 25/58] mshv: Add the HVCALL_GET_PARTITION_PROPERTY_EX hypercall This hypercall can be used to fetch extended properties of a partition. Extended properties are properties with values larger than a u64. Some of these also need additional input arguments. Add helper function for using the hypercall in the mshv_root driver. 
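A hedged example of how a caller can use the helper, taking the VMM
capabilities property (added later in this series) as the extended property
being read; the hypercall output is copied into a caller-provided buffer of
the requested size:

    struct hv_partition_property_vmm_capabilities caps = {};
    int err;

    /* This property takes no additional argument, so pass 0 for arg. */
    err = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
                                            HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
                                            0, &caps, sizeof(caps));
    if (err)
        pr_warn("extended partition property not available: %d\n", err);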
Signed-off-by: Purna Pavan Chandra Aekkaladevi Signed-off-by: Nuno Das Neves Reviewed-by: Anirudh Rayabharam Reviewed-by: Praveen K Paladugu Reviewed-by: Easwar Hariharan Reviewed-by: Tianyu Lan Signed-off-by: Wei Liu --- drivers/hv/mshv_root.h | 2 ++ drivers/hv/mshv_root_hv_call.c | 31 ++++++++++++++++++++++++++ include/hyperv/hvgdk_mini.h | 1 + include/hyperv/hvhdk.h | 40 ++++++++++++++++++++++++++++++++++ include/hyperv/hvhdk_mini.h | 26 ++++++++++++++++++++++ 5 files changed, 100 insertions(+) diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index db6b42db2fdc..0e62badfc9f1 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -303,6 +303,8 @@ int hv_call_unmap_stat_page(enum hv_stats_object_type type, int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, u64 page_struct_count, u32 host_access, u32 flags, u8 acquire); +int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg, + void *property_value, size_t property_value_sz); extern struct mshv_root mshv_root; extern enum hv_scheduler_type hv_scheduler_type; diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c index c9c274f29c3c..8049e51c45dc 100644 --- a/drivers/hv/mshv_root_hv_call.c +++ b/drivers/hv/mshv_root_hv_call.c @@ -590,6 +590,37 @@ int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, return hv_result_to_errno(status); } +int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, + u64 arg, void *property_value, + size_t property_value_sz) +{ + u64 status; + unsigned long flags; + struct hv_input_get_partition_property_ex *input; + struct hv_output_get_partition_property_ex *output; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->property_code = property_code; + input->arg = arg; + status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY_EX, input, output); + + if (!hv_result_success(status)) { + local_irq_restore(flags); + hv_status_debug(status, "\n"); + return hv_result_to_errno(status); + } + memcpy(property_value, &output->property_value, property_value_sz); + + local_irq_restore(flags); + + return 0; +} + int hv_call_clear_virtual_interrupt(u64 partition_id) { diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 7f730a0e54e6..af85b1c36b6e 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -491,6 +491,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_GET_VP_STATE 0x00e3 #define HVCALL_SET_VP_STATE 0x00e4 #define HVCALL_GET_VP_CPUID_VALUES 0x00f4 +#define HVCALL_GET_PARTITION_PROPERTY_EX 0x0101 #define HVCALL_MMIO_READ 0x0106 #define HVCALL_MMIO_WRITE 0x0107 diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index b4067ada02cf..416c0d45b793 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -376,6 +376,46 @@ struct hv_input_set_partition_property { u64 property_value; } __packed; +union hv_partition_property_arg { + u64 as_uint64; + struct { + union { + u32 arg; + u32 vp_index; + }; + u16 reserved0; + u8 reserved1; + u8 object_type; + } __packed; +}; + +struct hv_input_get_partition_property_ex { + u64 partition_id; + u32 property_code; /* enum hv_partition_property_code */ + u32 padding; + union { + union hv_partition_property_arg arg_data; + u64 arg; + }; +} __packed; + +/* + * NOTE: Should use 
hv_input_set_partition_property_ex_header to compute this + * size, but hv_input_get_partition_property_ex is identical so it suffices + */ +#define HV_PARTITION_PROPERTY_EX_MAX_VAR_SIZE \ + (HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_partition_property_ex)) + +union hv_partition_property_ex { + u8 buffer[HV_PARTITION_PROPERTY_EX_MAX_VAR_SIZE]; + struct hv_partition_property_vmm_capabilities vmm_capabilities; + /* More fields to be filled in when needed */ +}; + +struct hv_output_get_partition_property_ex { + union hv_partition_property_ex property_value; +} __packed; + enum hv_vp_state_page_type { HV_VP_STATE_PAGE_REGISTERS = 0, HV_VP_STATE_PAGE_INTERCEPT_MESSAGE = 1, diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index 858f6a3925b3..bf2ce27dfcc5 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -96,8 +96,34 @@ enum hv_partition_property_code { HV_PARTITION_PROPERTY_XSAVE_STATES = 0x00060007, HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE = 0x00060008, HV_PARTITION_PROPERTY_PROCESSOR_CLOCK_FREQUENCY = 0x00060009, + + /* Extended properties with larger property values */ + HV_PARTITION_PROPERTY_VMM_CAPABILITIES = 0x00090007, }; +#define HV_PARTITION_VMM_CAPABILITIES_BANK_COUNT 1 +#define HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT 59 + +struct hv_partition_property_vmm_capabilities { + u16 bank_count; + u16 reserved[3]; + union { + u64 as_uint64[HV_PARTITION_VMM_CAPABILITIES_BANK_COUNT]; + struct { + u64 map_gpa_preserve_adjustable: 1; + u64 vmm_can_provide_overlay_gpfn: 1; + u64 vp_affinity_property: 1; +#if IS_ENABLED(CONFIG_ARM64) + u64 vmm_can_provide_gic_overlay_locations: 1; +#else + u64 reservedbit3: 1; +#endif + u64 assignable_synthetic_proc_features: 1; + u64 reserved0: HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT; + } __packed; + }; +} __packed; + enum hv_snp_status { HV_SNP_STATUS_NONE = 0, HV_SNP_STATUS_AVAILABLE = 1, From fd612d97a458f0c44d1395e2def4fa719ec2ea48 Mon Sep 17 00:00:00 2001 From: Purna Pavan Chandra Aekkaladevi Date: Fri, 10 Oct 2025 14:55:49 -0700 Subject: [PATCH 26/58] mshv: Get the vmm capabilities offered by the hypervisor Some hypervisor APIs are gated by feature bits in the "vmm capabilities" partition property. Store the capabilities on mshv_root module init, using HVCALL_GET_PARTITION_PROPERTY_EX. This is not supported on all hypervisors. In that case, just set the capabilities to 0 and proceed as normal. 
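Because the structure is left zeroed when the hypercall or the property is
unsupported, later consumers can gate optional behavior on individual bits
without a separate "supported" flag. A sketch of the intended pattern; the
helper name below is purely illustrative, and the real gate added in the next
patch also checks the partition type:

    static bool mshv_overlay_gpfn_supported(void)
    {
        /*
         * Reads as false both when the bit is clear and when the
         * capabilities could not be fetched at module init.
         */
        return mshv_root.vmm_caps.vmm_can_provide_overlay_gpfn;
    }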
Signed-off-by: Purna Pavan Chandra Aekkaladevi Signed-off-by: Nuno Das Neves Reviewed-by: Praveen K Paladugu Reviewed-by: Easwar Hariharan Reviewed-by: Tianyu Lan Signed-off-by: Wei Liu --- drivers/hv/mshv_root.h | 1 + drivers/hv/mshv_root_main.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index 0e62badfc9f1..f47d36d0c942 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -178,6 +178,7 @@ struct mshv_root { struct hv_synic_pages __percpu *synic_pages; spinlock_t pt_ht_lock; DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS); + struct hv_partition_property_vmm_capabilities vmm_caps; }; /* diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index f1fabf52c8f5..33160f6ac028 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -2203,6 +2203,22 @@ root_sched_deinit: return err; } +static void mshv_init_vmm_caps(struct device *dev) +{ + /* + * This can only fail here if HVCALL_GET_PARTITION_PROPERTY_EX or + * HV_PARTITION_PROPERTY_VMM_CAPABILITIES are not supported. In that + * case it's valid to proceed as if all vmm_caps are disabled (zero). + */ + if (hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, + HV_PARTITION_PROPERTY_VMM_CAPABILITIES, + 0, &mshv_root.vmm_caps, + sizeof(mshv_root.vmm_caps))) + dev_warn(dev, "Unable to get VMM capabilities\n"); + + dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]); +} + static int __init mshv_parent_partition_init(void) { int ret; @@ -2255,6 +2271,8 @@ static int __init mshv_parent_partition_init(void) if (ret) goto remove_cpu_state; + mshv_init_vmm_caps(dev); + ret = mshv_irqfd_wq_init(); if (ret) goto exit_partition; From 19c515c27cee3bbba7e70a4d84f4b7e4d8d1cd7e Mon Sep 17 00:00:00 2001 From: Jinank Jain Date: Fri, 10 Oct 2025 14:55:50 -0700 Subject: [PATCH 27/58] mshv: Allocate vp state page for HVCALL_MAP_VP_STATE_PAGE on L1VH Introduce mshv_use_overlay_gpfn() to check if a page needs to be allocated and passed to the hypervisor to map VP state pages. This is only needed on L1VH, and only on some (newer) versions of the hypervisor, hence the need to check vmm_capabilities. Introduce functions hv_map/unmap_vp_state_page() to handle the allocation and freeing. 
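From the caller's point of view the new wrappers keep map and unmap symmetric
regardless of whether an overlay page was allocated. A minimal sketch of the
lifecycle, with abbreviated parameter names and error handling trimmed:

    static int example_map_unmap(u64 pt_id, u32 vp_index)
    {
        struct page *regs_page;
        int err;

        /*
         * May allocate an overlay page internally when the L1VH
         * hypervisor expects the guest to provide the GPFN.
         */
        err = hv_map_vp_state_page(pt_id, vp_index, HV_VP_STATE_PAGE_REGISTERS,
                                   input_vtl_zero, &regs_page);
        if (err)
            return err;

        /* ... use page_to_virt(regs_page) ... */

        /*
         * Hand the same page back on teardown; it is only freed here
         * when it was allocated as an overlay page.
         */
        return hv_unmap_vp_state_page(pt_id, vp_index,
                                      HV_VP_STATE_PAGE_REGISTERS,
                                      regs_page, input_vtl_zero);
    }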
Signed-off-by: Jinank Jain Signed-off-by: Nuno Das Neves Reviewed-by: Praveen K Paladugu Reviewed-by: Easwar Hariharan Reviewed-by: Stanislav Kinsburskii Reviewed-by: Anirudh Rayabharam Reviewed-by: Tianyu Lan Signed-off-by: Wei Liu --- drivers/hv/mshv_root.h | 11 ++--- drivers/hv/mshv_root_hv_call.c | 64 +++++++++++++++++++++++++--- drivers/hv/mshv_root_main.c | 76 +++++++++++++++++----------------- 3 files changed, 101 insertions(+), 50 deletions(-) diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index f47d36d0c942..5b57d894358a 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -279,11 +279,12 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id, /* Choose between pages and bytes */ struct hv_vp_state_data state_data, u64 page_count, struct page **pages, u32 num_bytes, u8 *bytes); -int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, - union hv_input_vtl input_vtl, - struct page **state_page); -int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, - union hv_input_vtl input_vtl); +int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page); +int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + struct page *state_page, + union hv_input_vtl input_vtl); int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id, u64 connection_partition_id, struct hv_port_info *port_info, u8 port_vtl, u8 min_connection_vtl, int node); diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c index 8049e51c45dc..6dac9fcc092c 100644 --- a/drivers/hv/mshv_root_hv_call.c +++ b/drivers/hv/mshv_root_hv_call.c @@ -526,9 +526,9 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id, return ret; } -int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, - union hv_input_vtl input_vtl, - struct page **state_page) +static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page) { struct hv_input_map_vp_state_page *input; struct hv_output_map_vp_state_page *output; @@ -542,12 +542,20 @@ int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, input = *this_cpu_ptr(hyperv_pcpu_input_arg); output = *this_cpu_ptr(hyperv_pcpu_output_arg); + memset(input, 0, sizeof(*input)); input->partition_id = partition_id; input->vp_index = vp_index; input->type = type; input->input_vtl = input_vtl; - status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, output); + if (*state_page) { + input->flags.map_location_provided = 1; + input->requested_map_location = + page_to_pfn(*state_page); + } + + status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, + output); if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { if (hv_result_success(status)) @@ -565,8 +573,41 @@ int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, return ret; } -int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, - union hv_input_vtl input_vtl) +static bool mshv_use_overlay_gpfn(void) +{ + return hv_l1vh_partition() && + mshv_root.vmm_caps.vmm_can_provide_overlay_gpfn; +} + +int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page) +{ + int ret = 0; + struct page *allocated_page = NULL; + + if (mshv_use_overlay_gpfn()) { + allocated_page = alloc_page(GFP_KERNEL); + if (!allocated_page) + return -ENOMEM; + *state_page = allocated_page; + } else { + 
*state_page = NULL; + } + + ret = hv_call_map_vp_state_page(partition_id, vp_index, type, input_vtl, + state_page); + + if (ret && allocated_page) { + __free_page(allocated_page); + *state_page = NULL; + } + + return ret; +} + +static int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl) { unsigned long flags; u64 status; @@ -590,6 +631,17 @@ int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, return hv_result_to_errno(status); } +int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + struct page *state_page, union hv_input_vtl input_vtl) +{ + int ret = hv_call_unmap_vp_state_page(partition_id, vp_index, type, input_vtl); + + if (mshv_use_overlay_gpfn() && state_page) + __free_page(state_page); + + return ret; +} + int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg, void *property_value, size_t property_value_sz) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 33160f6ac028..288695a859f7 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -892,7 +892,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, { struct mshv_create_vp args; struct mshv_vp *vp; - struct page *intercept_message_page, *register_page, *ghcb_page; + struct page *intercept_msg_page, *register_page, *ghcb_page; void *stats_pages[2]; long ret; @@ -910,28 +910,25 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, if (ret) return ret; - ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, - input_vtl_zero, - &intercept_message_page); + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + input_vtl_zero, &intercept_msg_page); if (ret) goto destroy_vp; if (!mshv_partition_encrypted(partition)) { - ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_REGISTERS, - input_vtl_zero, - ®ister_page); + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + input_vtl_zero, ®ister_page); if (ret) goto unmap_intercept_message_page; } if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) { - ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_GHCB, - input_vtl_normal, - &ghcb_page); + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, + input_vtl_normal, &ghcb_page); if (ret) goto unmap_register_page; } @@ -962,7 +959,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, atomic64_set(&vp->run.vp_signaled_count, 0); vp->vp_index = args.vp_index; - vp->vp_intercept_msg_page = page_to_virt(intercept_message_page); + vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page); if (!mshv_partition_encrypted(partition)) vp->vp_register_page = page_to_virt(register_page); @@ -995,21 +992,19 @@ unmap_stats_pages: if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) mshv_vp_stats_unmap(partition->pt_id, args.vp_index); unmap_ghcb_page: - if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) { - hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_GHCB, - input_vtl_normal); - } + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, ghcb_page, + input_vtl_normal); unmap_register_page: - if (!mshv_partition_encrypted(partition)) { - 
hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_REGISTERS, - input_vtl_zero); - } + if (!mshv_partition_encrypted(partition)) + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + register_page, input_vtl_zero); unmap_intercept_message_page: - hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, - input_vtl_zero); + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + intercept_msg_page, input_vtl_zero); destroy_vp: hv_call_delete_vp(partition->pt_id, args.vp_index); return ret; @@ -1750,24 +1745,27 @@ static void destroy_partition(struct mshv_partition *partition) mshv_vp_stats_unmap(partition->pt_id, vp->vp_index); if (vp->vp_register_page) { - (void)hv_call_unmap_vp_state_page(partition->pt_id, - vp->vp_index, - HV_VP_STATE_PAGE_REGISTERS, - input_vtl_zero); + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_REGISTERS, + virt_to_page(vp->vp_register_page), + input_vtl_zero); vp->vp_register_page = NULL; } - (void)hv_call_unmap_vp_state_page(partition->pt_id, - vp->vp_index, - HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, - input_vtl_zero); + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + virt_to_page(vp->vp_intercept_msg_page), + input_vtl_zero); vp->vp_intercept_msg_page = NULL; if (vp->vp_ghcb_page) { - (void)hv_call_unmap_vp_state_page(partition->pt_id, - vp->vp_index, - HV_VP_STATE_PAGE_GHCB, - input_vtl_normal); + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_GHCB, + virt_to_page(vp->vp_ghcb_page), + input_vtl_normal); vp->vp_ghcb_page = NULL; } From d62313bdf5961b5f815f0b212f029cf146a8a804 Mon Sep 17 00:00:00 2001 From: Jinank Jain Date: Fri, 10 Oct 2025 14:55:51 -0700 Subject: [PATCH 28/58] mshv: Introduce new hypercall to map stats page for L1VH partitions Introduce HVCALL_MAP_STATS_PAGE2 which provides a map location (GPFN) to map the stats to. This hypercall is required for L1VH partitions, depending on the hypervisor version. This uses the same check as the state page map location; mshv_use_overlay_gpfn(). Add mshv_map_vp_state_page() helpers to use this new hypercall or the old one depending on availability. For unmapping, the original HVCALL_UNMAP_STATS_PAGE works for both cases. 
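For illustration, the intended pairing of the new helpers looks roughly like this (a sketch, not part of the diff; pt_id and vp_index stand in for real values). The address returned by hv_map_stats_page() is handed back to hv_unmap_stats_page() so that an overlay page, if one was allocated, can be freed:

	union hv_stats_object_identity identity = {
		.vp.partition_id = pt_id,
		.vp.vp_index = vp_index,
	};
	void *stats;
	int err;

	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, &stats);
	if (err)
		return err;

	/* ... read counters from the mapped stats page ... */

	hv_unmap_stats_page(HV_STATS_OBJECT_VP, stats, &identity);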
Signed-off-by: Jinank Jain Signed-off-by: Nuno Das Neves Reviewed-by: Easwar Hariharan Signed-off-by: Wei Liu --- drivers/hv/mshv_root.h | 10 ++-- drivers/hv/mshv_root_hv_call.c | 95 ++++++++++++++++++++++++++++++++-- drivers/hv/mshv_root_main.c | 22 ++++---- include/hyperv/hvgdk_mini.h | 1 + include/hyperv/hvhdk_mini.h | 7 +++ 5 files changed, 115 insertions(+), 20 deletions(-) diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index 5b57d894358a..3eb815011b46 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -297,11 +297,11 @@ int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, int hv_call_disconnect_port(u64 connection_partition_id, union hv_connection_id connection_id); int hv_call_notify_port_ring_empty(u32 sint_index); -int hv_call_map_stat_page(enum hv_stats_object_type type, - const union hv_stats_object_identity *identity, - void **addr); -int hv_call_unmap_stat_page(enum hv_stats_object_type type, - const union hv_stats_object_identity *identity); +int hv_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr); +int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr, + const union hv_stats_object_identity *identity); int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, u64 page_struct_count, u32 host_access, u32 flags, u8 acquire); diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c index 6dac9fcc092c..caf02cfa49c9 100644 --- a/drivers/hv/mshv_root_hv_call.c +++ b/drivers/hv/mshv_root_hv_call.c @@ -807,9 +807,51 @@ hv_call_notify_port_ring_empty(u32 sint_index) return hv_result_to_errno(status); } -int hv_call_map_stat_page(enum hv_stats_object_type type, - const union hv_stats_object_identity *identity, - void **addr) +static int hv_call_map_stats_page2(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + u64 map_location) +{ + unsigned long flags; + struct hv_input_map_stats_page2 *input; + u64 status; + int ret; + + if (!map_location || !mshv_use_overlay_gpfn()) + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + input->type = type; + input->identity = *identity; + input->map_location = map_location; + + status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE2, input, NULL); + + local_irq_restore(flags); + + ret = hv_result_to_errno(status); + + if (!ret) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + hv_status_debug(status, "\n"); + break; + } + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + hv_current_partition_id, 1); + } while (!ret); + + return ret; +} + +static int hv_call_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr) { unsigned long flags; struct hv_input_map_stats_page *input; @@ -848,8 +890,38 @@ int hv_call_map_stat_page(enum hv_stats_object_type type, return ret; } -int hv_call_unmap_stat_page(enum hv_stats_object_type type, - const union hv_stats_object_identity *identity) +int hv_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr) +{ + int ret; + struct page *allocated_page = NULL; + + if (!addr) + return -EINVAL; + + if (mshv_use_overlay_gpfn()) { + allocated_page = alloc_page(GFP_KERNEL); + if (!allocated_page) + return -ENOMEM; + + ret = hv_call_map_stats_page2(type, identity, + page_to_pfn(allocated_page)); + *addr = 
page_address(allocated_page); + } else { + ret = hv_call_map_stats_page(type, identity, addr); + } + + if (ret && allocated_page) { + __free_page(allocated_page); + *addr = NULL; + } + + return ret; +} + +static int hv_call_unmap_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity) { unsigned long flags; struct hv_input_unmap_stats_page *input; @@ -868,6 +940,19 @@ int hv_call_unmap_stat_page(enum hv_stats_object_type type, return hv_result_to_errno(status); } +int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr, + const union hv_stats_object_identity *identity) +{ + int ret; + + ret = hv_call_unmap_stats_page(type, identity); + + if (mshv_use_overlay_gpfn() && page_addr) + __free_page(virt_to_page(page_addr)); + + return ret; +} + int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, u64 page_struct_count, u32 host_access, u32 flags, u8 acquire) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 288695a859f7..7684645ef00d 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -843,7 +843,8 @@ mshv_vp_release(struct inode *inode, struct file *filp) return 0; } -static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index) +static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, + void *stats_pages[]) { union hv_stats_object_identity identity = { .vp.partition_id = partition_id, @@ -851,10 +852,10 @@ static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index) }; identity.vp.stats_area_type = HV_STATS_AREA_SELF; - hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); identity.vp.stats_area_type = HV_STATS_AREA_PARENT; - hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); } static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, @@ -867,14 +868,14 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, int err; identity.vp.stats_area_type = HV_STATS_AREA_SELF; - err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, - &stats_pages[HV_STATS_AREA_SELF]); + err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_SELF]); if (err) return err; identity.vp.stats_area_type = HV_STATS_AREA_PARENT; - err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, - &stats_pages[HV_STATS_AREA_PARENT]); + err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_PARENT]); if (err) goto unmap_self; @@ -882,7 +883,7 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, unmap_self: identity.vp.stats_area_type = HV_STATS_AREA_SELF; - hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); return err; } @@ -990,7 +991,7 @@ free_vp: kfree(vp); unmap_stats_pages: if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) - mshv_vp_stats_unmap(partition->pt_id, args.vp_index); + mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages); unmap_ghcb_page: if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) hv_unmap_vp_state_page(partition->pt_id, args.vp_index, @@ -1742,7 +1743,8 @@ static void destroy_partition(struct mshv_partition *partition) continue; if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) - mshv_vp_stats_unmap(partition->pt_id, vp->vp_index); + mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, + (void **)vp->vp_stats_pages); if (vp->vp_register_page) { 
(void)hv_unmap_vp_state_page(partition->pt_id, diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index af85b1c36b6e..f6e31d1c3267 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -494,6 +494,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_GET_PARTITION_PROPERTY_EX 0x0101 #define HVCALL_MMIO_READ 0x0106 #define HVCALL_MMIO_WRITE 0x0107 +#define HVCALL_MAP_STATS_PAGE2 0x0131 /* HV_HYPERCALL_INPUT */ #define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index bf2ce27dfcc5..064bf735cab6 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -177,6 +177,13 @@ struct hv_input_map_stats_page { union hv_stats_object_identity identity; } __packed; +struct hv_input_map_stats_page2 { + u32 type; /* enum hv_stats_object_type */ + u32 padding; + union hv_stats_object_identity identity; + u64 map_location; +} __packed; + struct hv_output_map_stats_page { u64 map_location; } __packed; From 428ca2d4c6aa4124fb91580e5918f012d32ad4c4 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 17 Oct 2025 14:58:14 -0700 Subject: [PATCH 29/58] MAINTAINERS: Add Long Li as a Hyper-V maintainer Also include MANA RDMA driver in the Hyper-V maintained list. Signed-off-by: Long Li Signed-off-by: Wei Liu --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 46126ce2f968..da844816d2a0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11590,6 +11590,7 @@ M: "K. Y. Srinivasan" M: Haiyang Zhang M: Wei Liu M: Dexuan Cui +M: Long Li L: linux-hyperv@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git @@ -11607,6 +11608,7 @@ F: arch/x86/kernel/cpu/mshyperv.c F: drivers/clocksource/hyperv_timer.c F: drivers/hid/hid-hyperv.c F: drivers/hv/ +F: drivers/infiniband/hw/mana/ F: drivers/input/serio/hyperv-keyboard.c F: drivers/iommu/hyperv-iommu.c F: drivers/net/ethernet/microsoft/ @@ -11625,6 +11627,7 @@ F: include/hyperv/hvhdk_mini.h F: include/linux/hyperv.h F: include/net/mana F: include/uapi/linux/hyperv.h +F: include/uapi/rdma/mana-abi.h F: net/vmw_vsock/hyperv_transport.c F: tools/hv/ From 77c3a45a0f4866d4c36246abdf9633d3bde13f9e Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 15 Oct 2025 09:50:14 +0800 Subject: [PATCH 30/58] x86: mshyperv: Remove duplicate asm/msr.h header ./arch/x86/kernel/cpu/mshyperv.c: asm/msr.h is included more than once. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=26164 Signed-off-by: Jiapeng Chong Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 80a641a6ac48..6802d89ca790 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include From 91a076d30450237181e474396e3128082d0ffe6e Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 6 Oct 2025 15:42:03 -0700 Subject: [PATCH 31/58] x86/hyperv: Rename guest crash shutdown function Rename hv_machine_crash_shutdown to more appropriate hv_guest_crash_shutdown and make it applicable to guests only. This in preparation for the subsequent hypervisor root crash support patches. 
Signed-off-by: Mukesh Rathor Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 6802d89ca790..fac9953a72ef 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -255,7 +255,7 @@ static void hv_machine_shutdown(void) #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_CRASH_DUMP -static void hv_machine_crash_shutdown(struct pt_regs *regs) +static void hv_guest_crash_shutdown(struct pt_regs *regs) { if (hv_crash_handler) hv_crash_handler(regs); @@ -625,7 +625,8 @@ static void __init ms_hyperv_init_platform(void) machine_ops.shutdown = hv_machine_shutdown; #endif #if defined(CONFIG_CRASH_DUMP) - machine_ops.crash_shutdown = hv_machine_crash_shutdown; + if (!hv_root_partition()) + machine_ops.crash_shutdown = hv_guest_crash_shutdown; #endif #endif /* From 56c3feb3cc17b764f51191fd3dc461ab55a7b803 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 6 Oct 2025 15:42:04 -0700 Subject: [PATCH 32/58] hyperv: Add two new hypercall numbers to guest ABI public header In preparation for the subsequent crashdump patches, copy two hypercall numbers to the guest ABI header published by Hyper-V. One to notify hypervisor of an event that occurs in the root partition, other to ask hypervisor to disable the hypervisor. Signed-off-by: Mukesh Rathor Signed-off-by: Wei Liu --- include/hyperv/hvgdk_mini.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index f6e31d1c3267..7499a679e60a 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -470,6 +470,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_MAP_DEVICE_INTERRUPT 0x007c #define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d #define HVCALL_RETARGET_INTERRUPT 0x007e +#define HVCALL_NOTIFY_PARTITION_EVENT 0x0087 #define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b #define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091 #define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 @@ -494,6 +495,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_GET_PARTITION_PROPERTY_EX 0x0101 #define HVCALL_MMIO_READ 0x0106 #define HVCALL_MMIO_WRITE 0x0107 +#define HVCALL_DISABLE_HYP_EX 0x010f #define HVCALL_MAP_STATS_PAGE2 0x0131 /* HV_HYPERCALL_INPUT */ From e0a975ecd2e671664d208723476eeabb3baf08be Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 6 Oct 2025 15:42:05 -0700 Subject: [PATCH 33/58] hyperv: Add definitions for hypervisor crash dump support Add data structures for hypervisor crash dump support to the hypervisor host ABI header file. Details of their usages are in subsequent commits. Signed-off-by: Mukesh Rathor Signed-off-by: Wei Liu --- include/hyperv/hvhdk_mini.h | 55 +++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index 064bf735cab6..f2d7b50de7a4 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -142,6 +142,17 @@ enum hv_system_property { /* Add more values when needed */ HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15, HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21, + HV_SYSTEM_PROPERTY_CRASHDUMPAREA = 47, +}; + +#define HV_PFN_RANGE_PGBITS 24 /* HV_SPA_PAGE_RANGE_ADDITIONAL_PAGES_BITS */ +union hv_pfn_range { /* HV_SPA_PAGE_RANGE */ + u64 as_uint64; + struct { + /* 39:0: base pfn. 
63:40: additional pages */ + u64 base_pfn : 64 - HV_PFN_RANGE_PGBITS; + u64 add_pfns : HV_PFN_RANGE_PGBITS; + } __packed; }; enum hv_dynamic_processor_feature_property { @@ -168,6 +179,8 @@ struct hv_output_get_system_property { #if IS_ENABLED(CONFIG_X86) u64 hv_processor_feature_value; #endif + union hv_pfn_range hv_cda_info; /* CrashdumpAreaAddress */ + u64 hv_tramp_pa; /* CrashdumpTrampolineAddress */ }; } __packed; @@ -267,6 +280,48 @@ union hv_gpa_page_access_state { u8 as_uint8; } __packed; +enum hv_crashdump_action { + HV_CRASHDUMP_NONE = 0, + HV_CRASHDUMP_SUSPEND_ALL_VPS, + HV_CRASHDUMP_PREPARE_FOR_STATE_SAVE, + HV_CRASHDUMP_STATE_SAVED, + HV_CRASHDUMP_ENTRY, +}; + +struct hv_partition_event_root_crashdump_input { + u32 crashdump_action; /* enum hv_crashdump_action */ +} __packed; + +struct hv_input_disable_hyp_ex { /* HV_X64_INPUT_DISABLE_HYPERVISOR_EX */ + u64 rip; + u64 arg; +} __packed; + +struct hv_crashdump_area { /* HV_CRASHDUMP_AREA */ + u32 version; + union { + u32 flags_as_uint32; + struct { + u32 cda_valid : 1; + u32 cda_unused : 31; + } __packed; + }; + /* more unused fields */ +} __packed; + +union hv_partition_event_input { + struct hv_partition_event_root_crashdump_input crashdump_input; +}; + +enum hv_partition_event { + HV_PARTITION_EVENT_ROOT_CRASHDUMP = 2, +}; + +struct hv_input_notify_partition_event { + u32 event; /* enum hv_partition_event */ + union hv_partition_event_input input; +} __packed; + struct hv_lp_startup_status { u64 hv_status; u64 substatus1; From b0574ba75525c6b7e7448dd755d5cc09391478b1 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 6 Oct 2025 15:42:06 -0700 Subject: [PATCH 34/58] x86/hyperv: Add trampoline asm code to transition from hypervisor Introduce a small asm stub to transition from the hypervisor to Linux after devirtualization. Devirtualization means disabling hypervisor on the fly, so after it is done, the code is running on physical processor instead of virtual, and hypervisor is gone. This can be done by a root vm only. At a high level, during panic of either the hypervisor or the root, the NMI handler asks hypervisor to devirtualize. As part of that, the arguments include an entry point to return back to Linux. This asm stub implements that entry point. The stub is entered in protected mode, uses temporary gdt and page table to enable long mode and get to kernel entry point which then restores full kernel context to resume execution to kexec. Signed-off-by: Mukesh Rathor Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_trampoline.S | 101 ++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 arch/x86/hyperv/hv_trampoline.S diff --git a/arch/x86/hyperv/hv_trampoline.S b/arch/x86/hyperv/hv_trampoline.S new file mode 100644 index 000000000000..25f02ff12286 --- /dev/null +++ b/arch/x86/hyperv/hv_trampoline.S @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * X86 specific Hyper-V kdump/crash related code. + * + * Copyright (C) 2025, Microsoft, Inc. + * + */ +#include +#include +#include +#include +#include + +/* + * void noreturn hv_crash_asm32(arg1) + * arg1 == edi == 32bit PA of struct hv_crash_tramp_data + * + * The hypervisor jumps here upon devirtualization in protected mode. This + * code gets copied to a page in the low 4G ie, 32bit space so it can run + * in the protected mode. Hence we cannot use any compile/link time offsets or + * addresses. 
It restores long mode via temporary gdt and page tables and + * eventually jumps to kernel code entry at HV_CRASHDATA_OFFS_C_entry. + * + * PreCondition (ie, Hypervisor call back ABI): + * o CR0 is set to 0x0021: PE(prot mode) and NE are set, paging is disabled + * o CR4 is set to 0x0 + * o IA32_EFER is set to 0x901 (SCE and NXE are set) + * o EDI is set to the Arg passed to HVCALL_DISABLE_HYP_EX. + * o CS, DS, ES, FS, GS are all initialized with a base of 0 and limit 0xFFFF + * o IDTR, TR and GDTR are initialized with a base of 0 and limit of 0xFFFF + * o LDTR is initialized as invalid (limit of 0) + * o MSR PAT is power on default. + * o Other state/registers are cleared. All TLBs flushed. + */ + +#define HV_CRASHDATA_OFFS_TRAMPCR3 0x0 /* 0 */ +#define HV_CRASHDATA_OFFS_KERNCR3 0x8 /* 8 */ +#define HV_CRASHDATA_OFFS_GDTRLIMIT 0x12 /* 18 */ +#define HV_CRASHDATA_OFFS_CS_JMPTGT 0x28 /* 40 */ +#define HV_CRASHDATA_OFFS_C_entry 0x30 /* 48 */ + + .text + .code32 + +SYM_CODE_START(hv_crash_asm32) + UNWIND_HINT_UNDEFINED + ENDBR + movl $X86_CR4_PAE, %ecx + movl %ecx, %cr4 + + movl %edi, %ebx + add $HV_CRASHDATA_OFFS_TRAMPCR3, %ebx + movl %cs:(%ebx), %eax + movl %eax, %cr3 + + /* Setup EFER for long mode now */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Turn paging on using the temp 32bit trampoline page table */ + movl %cr0, %eax + orl $(X86_CR0_PG), %eax + movl %eax, %cr0 + + /* since kernel cr3 could be above 4G, we need to be in the long mode + * before we can load 64bits of the kernel cr3. We use a temp gdt for + * that with CS.L=1 and CS.D=0 */ + mov %edi, %eax + add $HV_CRASHDATA_OFFS_GDTRLIMIT, %eax + lgdtl %cs:(%eax) + + /* not done yet, restore CS now to switch to CS.L=1 */ + mov %edi, %eax + add $HV_CRASHDATA_OFFS_CS_JMPTGT, %eax + ljmp %cs:*(%eax) +SYM_CODE_END(hv_crash_asm32) + + /* we now run in full 64bit IA32-e long mode, CS.L=1 and CS.D=0 */ + .code64 + .balign 8 +SYM_CODE_START(hv_crash_asm64) + UNWIND_HINT_UNDEFINED + ENDBR + /* restore kernel page tables so we can jump to kernel code */ + mov %edi, %eax + add $HV_CRASHDATA_OFFS_KERNCR3, %eax + movq %cs:(%eax), %rbx + movq %rbx, %cr3 + + mov %edi, %eax + add $HV_CRASHDATA_OFFS_C_entry, %eax + movq %cs:(%eax), %rbx + ANNOTATE_RETPOLINE_SAFE + jmp *%rbx + + int $3 + +SYM_INNER_LABEL(hv_crash_asm_end, SYM_L_GLOBAL) +SYM_CODE_END(hv_crash_asm64) From 94212d34618c2608758128da32ddac2ad834cb9f Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 6 Oct 2025 15:42:07 -0700 Subject: [PATCH 35/58] x86/hyperv: Implement hypervisor RAM collection into vmcore Introduce a new file to implement collection of hypervisor RAM into the vmcore collected by linux. By default, the hypervisor RAM is locked, ie, protected via hw page table. Hyper-V implements a disable hypercall which essentially devirtualizes the system on the fly. This mechanism makes the hypervisor RAM accessible to linux. Because the hypervisor RAM is already mapped into linux address space (as reserved RAM), it is automatically collected into the vmcore without extra work. More details of the implementation are available in the file prologue. 
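The key step in both paths is the disable hypercall issued from the BSP while in NMI context; a minimal sketch of that call follows (the real code is crash_nmi_callback() in the new file; the field names come from struct hv_input_disable_hyp_ex introduced earlier in the series):

	struct hv_input_disable_hyp_ex *input;
	u64 status;

	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	memset(input, 0, sizeof(*input));
	input->rip = trampoline_pa;	/* 32bit PA of the copied hv_crash_asm32 stub */
	input->arg = devirt_arg;	/* 32bit PA of struct hv_crash_tramp_data */

	status = hv_do_hypercall(HVCALL_DISABLE_HYP_EX, input, NULL);

	/* On success the hypervisor does not return here: it jumps to
	 * input->rip in protected mode, and the trampoline rebuilds long
	 * mode before calling hv_crash_c_entry() to resume crash kexec.
	 */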
Signed-off-by: Mukesh Rathor Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_crash.c | 642 +++++++++++++++++++++++++++++++++++++ 1 file changed, 642 insertions(+) create mode 100644 arch/x86/hyperv/hv_crash.c diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c new file mode 100644 index 000000000000..c0e22921ace1 --- /dev/null +++ b/arch/x86/hyperv/hv_crash.c @@ -0,0 +1,642 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * X86 specific Hyper-V root partition kdump/crash support module + * + * Copyright (C) 2025, Microsoft, Inc. + * + * This module implements hypervisor RAM collection into vmcore for both + * cases of the hypervisor crash and Linux root crash. Hyper-V implements + * a disable hypercall with a 32bit protected mode ABI callback. This + * mechanism must be used to unlock hypervisor RAM. Since the hypervisor RAM + * is already mapped in Linux, it is automatically collected into Linux vmcore, + * and can be examined by the crash command (raw RAM dump) or windbg. + * + * At a high level: + * + * Hypervisor Crash: + * Upon crash, hypervisor goes into an emergency minimal dispatch loop, a + * restrictive mode with very limited hypercall and MSR support. Each cpu + * then injects NMIs into root vcpus. A shared page is used to check + * by Linux in the NMI handler if the hypervisor has crashed. This shared + * page is setup in hv_root_crash_init during boot. + * + * Linux Crash: + * In case of Linux crash, the callback hv_crash_stop_other_cpus will send + * NMIs to all cpus, then proceed to the crash_nmi_callback where it waits + * for all cpus to be in NMI. + * + * NMI Handler (upon quorum): + * Eventually, in both cases, all cpus will end up in the NMI handler. + * Hyper-V requires the disable hypervisor must be done from the BSP. So + * the BSP NMI handler saves current context, does some fixups and makes + * the hypercall to disable the hypervisor, ie, devirtualize. Hypervisor + * at that point will suspend all vcpus (except the BSP), unlock all its + * RAM, and return to Linux at the 32bit mode entry RIP. + * + * Linux 32bit entry trampoline will then restore long mode and call C + * function here to restore context and continue execution to crash kexec. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +bool hv_crash_enabled; +EXPORT_SYMBOL_GPL(hv_crash_enabled); + +struct hv_crash_ctxt { + ulong rsp; + ulong cr0; + ulong cr2; + ulong cr4; + ulong cr8; + + u16 cs; + u16 ss; + u16 ds; + u16 es; + u16 fs; + u16 gs; + + u16 gdt_fill; + struct desc_ptr gdtr; + char idt_fill[6]; + struct desc_ptr idtr; + + u64 gsbase; + u64 efer; + u64 pat; +}; +static struct hv_crash_ctxt hv_crash_ctxt; + +/* Shared hypervisor page that contains crash dump area we peek into. + * NB: windbg looks for "hv_cda" symbol so don't change it. 
+ */ +static struct hv_crashdump_area *hv_cda; + +static u32 trampoline_pa, devirt_arg; +static atomic_t crash_cpus_wait; +static void *hv_crash_ptpgs[4]; +static bool hv_has_crashed, lx_has_crashed; + +static void __noreturn hv_panic_timeout_reboot(void) +{ + #define PANIC_TIMER_STEP 100 + + if (panic_timeout > 0) { + int i; + + for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) + mdelay(PANIC_TIMER_STEP); + } + + if (panic_timeout) + native_wrmsrq(HV_X64_MSR_RESET, 1); /* get hyp to reboot */ + + for (;;) + cpu_relax(); +} + +/* This cannot be inlined as it needs stack */ +static noinline __noclone void hv_crash_restore_tss(void) +{ + load_TR_desc(); +} + +/* This cannot be inlined as it needs stack */ +static noinline void hv_crash_clear_kernpt(void) +{ + pgd_t *pgd; + p4d_t *p4d; + + /* Clear entry so it's not confusing to someone looking at the core */ + pgd = pgd_offset_k(trampoline_pa); + p4d = p4d_offset(pgd, trampoline_pa); + native_p4d_clear(p4d); +} + +/* + * This is the C entry point from the asm glue code after the disable hypercall. + * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel + * page tables with our below 4G page identity mapped, but using a temporary + * GDT. ds/fs/gs/es are null. ss is not usable. bp is null. stack is not + * available. We restore kernel GDT, and rest of the context, and continue + * to kexec. + */ +static asmlinkage void __noreturn hv_crash_c_entry(void) +{ + struct hv_crash_ctxt *ctxt = &hv_crash_ctxt; + + /* first thing, restore kernel gdt */ + native_load_gdt(&ctxt->gdtr); + + asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss)); + asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp)); + + asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds)); + asm volatile("movw %%ax, %%es" : : "a"(ctxt->es)); + asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs)); + asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs)); + + native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat); + asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0)); + + asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8)); + asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4)); + asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4)); + + native_load_idt(&ctxt->idtr); + native_wrmsrq(MSR_GS_BASE, ctxt->gsbase); + native_wrmsrq(MSR_EFER, ctxt->efer); + + /* restore the original kernel CS now via far return */ + asm volatile("movzwq %0, %%rax\n\t" + "pushq %%rax\n\t" + "pushq $1f\n\t" + "lretq\n\t" + "1:nop\n\t" : : "m"(ctxt->cs) : "rax"); + + /* We are in asmlinkage without stack frame, hence make C function + * calls which will buy stack frames. + */ + hv_crash_restore_tss(); + hv_crash_clear_kernpt(); + + /* we are now fully in devirtualized normal kernel mode */ + __crash_kexec(NULL); + + hv_panic_timeout_reboot(); +} +/* Tell gcc we are using lretq long jump in the above function intentionally */ +STACK_FRAME_NON_STANDARD(hv_crash_c_entry); + +static void hv_mark_tss_not_busy(void) +{ + struct desc_struct *desc = get_current_gdt_rw(); + tss_desc tss; + + memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); + tss.type = 0x9; /* available 64-bit TSS. 
0xB is busy TSS */ + write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS); +} + +/* Save essential context */ +static void hv_hvcrash_ctxt_save(void) +{ + struct hv_crash_ctxt *ctxt = &hv_crash_ctxt; + + asm volatile("movq %%rsp,%0" : "=m"(ctxt->rsp)); + + ctxt->cr0 = native_read_cr0(); + ctxt->cr4 = native_read_cr4(); + + asm volatile("movq %%cr2, %0" : "=a"(ctxt->cr2)); + asm volatile("movq %%cr8, %0" : "=a"(ctxt->cr8)); + + asm volatile("movl %%cs, %%eax" : "=a"(ctxt->cs)); + asm volatile("movl %%ss, %%eax" : "=a"(ctxt->ss)); + asm volatile("movl %%ds, %%eax" : "=a"(ctxt->ds)); + asm volatile("movl %%es, %%eax" : "=a"(ctxt->es)); + asm volatile("movl %%fs, %%eax" : "=a"(ctxt->fs)); + asm volatile("movl %%gs, %%eax" : "=a"(ctxt->gs)); + + native_store_gdt(&ctxt->gdtr); + store_idt(&ctxt->idtr); + + ctxt->gsbase = __rdmsr(MSR_GS_BASE); + ctxt->efer = __rdmsr(MSR_EFER); + ctxt->pat = __rdmsr(MSR_IA32_CR_PAT); +} + +/* Add trampoline page to the kernel pagetable for transition to kernel PT */ +static void hv_crash_fixup_kernpt(void) +{ + pgd_t *pgd; + p4d_t *p4d; + + pgd = pgd_offset_k(trampoline_pa); + p4d = p4d_offset(pgd, trampoline_pa); + + /* trampoline_pa is below 4G, so no pre-existing entry to clobber */ + p4d_populate(&init_mm, p4d, (pud_t *)hv_crash_ptpgs[1]); + p4d->p4d = p4d->p4d & ~(_PAGE_NX); /* enable execute */ +} + +/* + * Notify the hyp that Linux has crashed. This will cause the hyp to quiesce + * and suspend all guest VPs. + */ +static void hv_notify_prepare_hyp(void) +{ + u64 status; + struct hv_input_notify_partition_event *input; + struct hv_partition_event_root_crashdump_input *cda; + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + cda = &input->input.crashdump_input; + memset(input, 0, sizeof(*input)); + input->event = HV_PARTITION_EVENT_ROOT_CRASHDUMP; + + cda->crashdump_action = HV_CRASHDUMP_ENTRY; + status = hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL); + if (!hv_result_success(status)) + return; + + cda->crashdump_action = HV_CRASHDUMP_SUSPEND_ALL_VPS; + hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL); +} + +/* + * Common function for all cpus before devirtualization. + * + * Hypervisor crash: all cpus get here in NMI context. + * Linux crash: the panicing cpu gets here at base level, all others in NMI + * context. Note, panicing cpu may not be the BSP. + * + * The function is not inlined so it will show on the stack. It is named so + * because the crash cmd looks for certain well known function names on the + * stack before looking into the cpu saved note in the elf section, and + * that work is currently incomplete. + * + * Notes: + * Hypervisor crash: + * - the hypervisor is in a very restrictive mode at this point and any + * vmexit it cannot handle would result in reboot. So, no mumbo jumbo, + * just get to kexec as quickly as possible. + * + * Devirtualization is supported from the BSP only at present. 
+ */ +static noinline __noclone void crash_nmi_callback(struct pt_regs *regs) +{ + struct hv_input_disable_hyp_ex *input; + u64 status; + int msecs = 1000, ccpu = smp_processor_id(); + + if (ccpu == 0) { + /* crash_save_cpu() will be done in the kexec path */ + cpu_emergency_stop_pt(); /* disable performance trace */ + atomic_inc(&crash_cpus_wait); + } else { + crash_save_cpu(regs, ccpu); + cpu_emergency_stop_pt(); /* disable performance trace */ + atomic_inc(&crash_cpus_wait); + for (;;) + cpu_relax(); + } + + while (atomic_read(&crash_cpus_wait) < num_online_cpus() && msecs--) + mdelay(1); + + stop_nmi(); + if (!hv_has_crashed) + hv_notify_prepare_hyp(); + + if (crashing_cpu == -1) + crashing_cpu = ccpu; /* crash cmd uses this */ + + hv_hvcrash_ctxt_save(); + hv_mark_tss_not_busy(); + hv_crash_fixup_kernpt(); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->rip = trampoline_pa; + input->arg = devirt_arg; + + status = hv_do_hypercall(HVCALL_DISABLE_HYP_EX, input, NULL); + + hv_panic_timeout_reboot(); +} + + +static DEFINE_SPINLOCK(hv_crash_reboot_lk); + +/* + * Generic NMI callback handler: could be called without any crash also. + * hv crash: hypervisor injects NMI's into all cpus + * lx crash: panicing cpu sends NMI to all but self via crash_stop_other_cpus + */ +static int hv_crash_nmi_local(unsigned int cmd, struct pt_regs *regs) +{ + if (!hv_has_crashed && hv_cda && hv_cda->cda_valid) + hv_has_crashed = true; + + if (!hv_has_crashed && !lx_has_crashed) + return NMI_DONE; /* ignore the NMI */ + + if (hv_has_crashed && !kexec_crash_loaded()) { + if (spin_trylock(&hv_crash_reboot_lk)) + hv_panic_timeout_reboot(); + else + for (;;) + cpu_relax(); + } + + crash_nmi_callback(regs); + + return NMI_DONE; +} + +/* + * hv_crash_stop_other_cpus() == smp_ops.crash_stop_other_cpus + * + * On normal Linux panic, this is called twice: first from panic and then again + * from native_machine_crash_shutdown. + * + * In case of hyperv, 3 ways to get here: + * 1. hv crash (only BSP will get here): + * BSP : NMI callback -> DisableHv -> hv_crash_asm32 -> hv_crash_c_entry + * -> __crash_kexec -> native_machine_crash_shutdown + * -> crash_smp_send_stop -> smp_ops.crash_stop_other_cpus + * Linux panic: + * 2. panic cpu x: panic() -> crash_smp_send_stop + * -> smp_ops.crash_stop_other_cpus + * 3. BSP: native_machine_crash_shutdown -> crash_smp_send_stop + * + * NB: noclone and non standard stack because of call to crash_setup_regs(). + */ +static void __noclone hv_crash_stop_other_cpus(void) +{ + static bool crash_stop_done; + struct pt_regs lregs; + int ccpu = smp_processor_id(); + + if (hv_has_crashed) + return; /* all cpus already in NMI handler path */ + + if (!kexec_crash_loaded()) { + hv_notify_prepare_hyp(); + hv_panic_timeout_reboot(); /* no return */ + } + + /* If the hv crashes also, we could come here again before cpus_stopped + * is set in crash_smp_send_stop(). So use our own check. + */ + if (crash_stop_done) + return; + crash_stop_done = true; + + /* Linux has crashed: hv is healthy, we can IPI safely */ + lx_has_crashed = true; + wmb(); /* NMI handlers look at lx_has_crashed */ + + apic->send_IPI_allbutself(NMI_VECTOR); + + if (crashing_cpu == -1) + crashing_cpu = ccpu; /* crash cmd uses this */ + + /* crash_setup_regs() happens in kexec also, but for the kexec cpu which + * is the BSP. We could be here on non-BSP cpu, collect regs if so. 
+ */ + if (ccpu) + crash_setup_regs(&lregs, NULL); + + crash_nmi_callback(&lregs); +} +STACK_FRAME_NON_STANDARD(hv_crash_stop_other_cpus); + +/* This GDT is accessed in IA32-e compat mode which uses 32bits addresses */ +struct hv_gdtreg_32 { + u16 fill; + u16 limit; + u32 address; +} __packed; + +/* We need a CS with L bit to goto IA32-e long mode from 32bit compat mode */ +struct hv_crash_tramp_gdt { + u64 null; /* index 0, selector 0, null selector */ + u64 cs64; /* index 1, selector 8, cs64 selector */ +} __packed; + +/* No stack, so jump via far ptr in memory to load the 64bit CS */ +struct hv_cs_jmptgt { + u32 address; + u16 csval; + u16 fill; +} __packed; + +/* Linux use only, hypervisor doesn't look at this struct */ +struct hv_crash_tramp_data { + u64 tramp32_cr3; + u64 kernel_cr3; + struct hv_gdtreg_32 gdtr32; + struct hv_crash_tramp_gdt tramp_gdt; + struct hv_cs_jmptgt cs_jmptgt; + u64 c_entry_addr; +} __packed; + +/* + * Setup a temporary gdt to allow the asm code to switch to the long mode. + * Since the asm code is relocated/copied to a below 4G page, it cannot use rip + * relative addressing, hence we must use trampoline_pa here. Also, save other + * info like jmp and C entry targets for same reasons. + * + * Returns: 0 on success, -1 on error + */ +static int hv_crash_setup_trampdata(u64 trampoline_va) +{ + int size, offs; + void *dest; + struct hv_crash_tramp_data *tramp; + + /* These must match exactly the ones in the corresponding asm file */ + BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, tramp32_cr3) != 0); + BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, kernel_cr3) != 8); + BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, gdtr32.limit) != 18); + BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, + cs_jmptgt.address) != 40); + BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, c_entry_addr) != 48); + + /* hv_crash_asm_end is beyond last byte by 1 */ + size = &hv_crash_asm_end - &hv_crash_asm32; + if (size + sizeof(struct hv_crash_tramp_data) > PAGE_SIZE) { + pr_err("%s: trampoline page overflow\n", __func__); + return -1; + } + + dest = (void *)trampoline_va; + memcpy(dest, &hv_crash_asm32, size); + + dest += size; + dest = (void *)round_up((ulong)dest, 16); + tramp = (struct hv_crash_tramp_data *)dest; + + /* see MAX_ASID_AVAILABLE in tlb.c: "PCID 0 is reserved for use by + * non-PCID-aware users". Build cr3 with pcid 0 + */ + tramp->tramp32_cr3 = __sme_pa(hv_crash_ptpgs[0]); + + /* Note, when restoring X86_CR4_PCIDE, cr3[11:0] must be zero */ + tramp->kernel_cr3 = __sme_pa(init_mm.pgd); + + tramp->gdtr32.limit = sizeof(struct hv_crash_tramp_gdt); + tramp->gdtr32.address = trampoline_pa + + (ulong)&tramp->tramp_gdt - trampoline_va; + + /* base:0 limit:0xfffff type:b dpl:0 P:1 L:1 D:0 avl:0 G:1 */ + tramp->tramp_gdt.cs64 = 0x00af9a000000ffff; + + tramp->cs_jmptgt.csval = 0x8; + offs = (ulong)&hv_crash_asm64 - (ulong)&hv_crash_asm32; + tramp->cs_jmptgt.address = trampoline_pa + offs; + + tramp->c_entry_addr = (u64)&hv_crash_c_entry; + + devirt_arg = trampoline_pa + (ulong)dest - trampoline_va; + + return 0; +} + +/* + * Build 32bit trampoline page table for transition from protected mode + * non-paging to long-mode paging. This transition needs pagetables below 4G. 
+ */ +static void hv_crash_build_tramp_pt(void) +{ + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + u64 pa, addr = trampoline_pa; + + p4d = hv_crash_ptpgs[0] + pgd_index(addr) * sizeof(p4d); + pa = virt_to_phys(hv_crash_ptpgs[1]); + set_p4d(p4d, __p4d(_PAGE_TABLE | pa)); + p4d->p4d &= ~(_PAGE_NX); /* enable execute */ + + pud = hv_crash_ptpgs[1] + pud_index(addr) * sizeof(pud); + pa = virt_to_phys(hv_crash_ptpgs[2]); + set_pud(pud, __pud(_PAGE_TABLE | pa)); + + pmd = hv_crash_ptpgs[2] + pmd_index(addr) * sizeof(pmd); + pa = virt_to_phys(hv_crash_ptpgs[3]); + set_pmd(pmd, __pmd(_PAGE_TABLE | pa)); + + pte = hv_crash_ptpgs[3] + pte_index(addr) * sizeof(pte); + set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); +} + +/* + * Setup trampoline for devirtualization: + * - a page below 4G, ie 32bit addr containing asm glue code that hyp jmps to + * in protected mode. + * - 4 pages for a temporary page table that asm code uses to turn paging on + * - a temporary gdt to use in the compat mode. + * + * Returns: 0 on success + */ +static int hv_crash_trampoline_setup(void) +{ + int i, rc, order; + struct page *page; + u64 trampoline_va; + gfp_t flags32 = GFP_KERNEL | GFP_DMA32 | __GFP_ZERO; + + /* page for 32bit trampoline assembly code + hv_crash_tramp_data */ + page = alloc_page(flags32); + if (page == NULL) { + pr_err("%s: failed to alloc asm stub page\n", __func__); + return -1; + } + + trampoline_va = (u64)page_to_virt(page); + trampoline_pa = (u32)page_to_phys(page); + + order = 2; /* alloc 2^2 pages */ + page = alloc_pages(flags32, order); + if (page == NULL) { + pr_err("%s: failed to alloc pt pages\n", __func__); + free_page(trampoline_va); + return -1; + } + + for (i = 0; i < 4; i++, page++) + hv_crash_ptpgs[i] = page_to_virt(page); + + hv_crash_build_tramp_pt(); + + rc = hv_crash_setup_trampdata(trampoline_va); + if (rc) + goto errout; + + return 0; + +errout: + free_page(trampoline_va); + free_pages((ulong)hv_crash_ptpgs[0], order); + + return rc; +} + +/* Setup for kdump kexec to collect hypervisor RAM when running as root */ +void hv_root_crash_init(void) +{ + int rc; + struct hv_input_get_system_property *input; + struct hv_output_get_system_property *output; + unsigned long flags; + u64 status; + union hv_pfn_range cda_info; + + if (pgtable_l5_enabled()) { + pr_err("Hyper-V: crash dump not yet supported on 5level PTs\n"); + return; + } + + rc = register_nmi_handler(NMI_LOCAL, hv_crash_nmi_local, NMI_FLAG_FIRST, + "hv_crash_nmi"); + if (rc) { + pr_err("Hyper-V: failed to register crash nmi handler\n"); + return; + } + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->property_id = HV_SYSTEM_PROPERTY_CRASHDUMPAREA; + + status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output); + cda_info.as_uint64 = output->hv_cda_info.as_uint64; + local_irq_restore(flags); + + if (!hv_result_success(status)) { + pr_err("Hyper-V: %s: property:%d %s\n", __func__, + input->property_id, hv_result_to_string(status)); + goto err_out; + } + + if (cda_info.base_pfn == 0) { + pr_err("Hyper-V: hypervisor crash dump area pfn is 0\n"); + goto err_out; + } + + hv_cda = phys_to_virt(cda_info.base_pfn << HV_HYP_PAGE_SHIFT); + + rc = hv_crash_trampoline_setup(); + if (rc) + goto err_out; + + smp_ops.crash_stop_other_cpus = hv_crash_stop_other_cpus; + + crash_kexec_post_notifiers = true; + hv_crash_enabled = true; + pr_info("Hyper-V: both linux and hypervisor kdump support 
enabled\n"); + + return; + +err_out: + unregister_nmi_handler(NMI_LOCAL, "hv_crash_nmi"); + pr_err("Hyper-V: only linux root kdump support enabled\n"); +} From 77c860d2dbb72d1f3c6a2e882a07d19eca399db5 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 6 Oct 2025 15:42:08 -0700 Subject: [PATCH 36/58] x86/hyperv: Enable build of hypervisor crashdump collection files Enable build of the new files introduced in the earlier commits and add call to do the setup during boot. Signed-off-by: Mukesh Rathor [ wei: fix build ] Signed-off-by: Wei Liu --- arch/x86/hyperv/Makefile | 6 ++++++ arch/x86/hyperv/hv_init.c | 1 + arch/x86/include/asm/mshyperv.h | 9 +++++++++ 3 files changed, 16 insertions(+) diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index d55f494f471d..6f5d97cddd80 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -5,4 +5,10 @@ obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o ifdef CONFIG_X86_64 obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o + + ifdef CONFIG_MSHV_ROOT + CFLAGS_REMOVE_hv_trampoline.o += -pg + CFLAGS_hv_trampoline.o += -fno-stack-protector + obj-$(CONFIG_CRASH_DUMP) += hv_crash.o hv_trampoline.o + endif endif diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 21ed8963fafd..e28737ec7054 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -554,6 +554,7 @@ void __init hyperv_init(void) memunmap(src); hv_remap_tsc_clocksource(); + hv_root_crash_init(); } else { hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 605abd02158d..1342d55c2545 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -237,6 +237,15 @@ static __always_inline u64 hv_raw_get_msr(unsigned int reg) } int hv_apicid_to_vp_index(u32 apic_id); +#if IS_ENABLED(CONFIG_MSHV_ROOT) && IS_ENABLED(CONFIG_CRASH_DUMP) +void hv_root_crash_init(void); +void hv_crash_asm32(void); +void hv_crash_asm64(void); +void hv_crash_asm_end(void); +#else /* CONFIG_MSHV_ROOT && CONFIG_CRASH_DUMP */ +static inline void hv_root_crash_init(void) {} +#endif /* CONFIG_MSHV_ROOT && CONFIG_CRASH_DUMP */ + #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} static inline void hyperv_setup_mmu_ops(void) {} From 6626f815a1716259f161a4df8d105c9f0c8cbae9 Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Sat, 25 Oct 2025 12:07:07 +0000 Subject: [PATCH 37/58] Drivers: hv: fix missing kernel-doc description for 'size' in request_arr_init() Add missing kernel-doc entry for the @size parameter in request_arr_init(), fixing the following documentation warning reported by the kernel test robot and detected via kernel-doc: Warning: drivers/hv/channel.c:595 function parameter 'size' not described in 'request_arr_init' Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202503021934.wH1BERla-lkp@intel.com Signed-off-by: Kriish Sharma Reviewed-by: Easwar Hariharan Signed-off-by: Wei Liu --- drivers/hv/channel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 88485d255a42..6821f225248b 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -590,7 +590,7 @@ EXPORT_SYMBOL_GPL(vmbus_establish_gpadl); * keeps track of the next available slot in the array. Initially, each * slot points to the next one (as in a Linked List). 
The last slot * does not point to anything, so its value is U64_MAX by default. - * @size The size of the array + * @size: The size of the array */ static u64 *request_arr_init(u32 size) { From 5f4b5edcb1a389b16b852fb30dcd435bfb1f8d23 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Wed, 8 Oct 2025 22:43:29 +0000 Subject: [PATCH 38/58] Drivers: hv: Resolve ambiguity in hypervisor version log Update the log message in hv_common_init to explicitly state that the reported version is for the Microsoft Hypervisor, not the host OS. Previously, this message was accurate for guests running on Windows hosts, where the host and hypervisor versions matched. With support for Linux hosts running the Hyper-V hypervisor, the host OS and hypervisor versions may differ. This change avoids confusion by making it clear that the version refers to the Microsoft Hypervisor regardless of the host operating system. Signed-off-by: Stanislav Kinsburskii Reviewed-by: Nuno Das Neves Signed-off-by: Wei Liu --- drivers/hv/hv_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 4b44da7f23b2..5f5047f122a9 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -315,9 +315,9 @@ int __init hv_common_init(void) int i; union hv_hypervisor_version_info version; - /* Get information about the Hyper-V host version */ + /* Get information about the Microsoft Hypervisor version */ if (!hv_get_hypervisor_version(&version)) - pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n", + pr_info("Hyper-V: Hypervisor Build %d.%d.%d.%d-%d-%d\n", version.major_version, version.minor_version, version.build_number, version.service_number, version.service_pack, version.service_branch); From 22cb2f06fac9a05455eafa3ef3dc49ed7dedb240 Mon Sep 17 00:00:00 2001 From: Rahul Kumar Date: Tue, 4 Nov 2025 17:46:18 +0530 Subject: [PATCH 39/58] Drivers: hv: Use kmalloc_array() instead of kmalloc() Documentation/process/deprecated.rst recommends against the use of kmalloc with dynamic size calculations due to the risk of overflow and smaller allocation being made than the caller was expecting. Replace kmalloc() with kmalloc_array() in hv_common.c to make the intended allocation size clearer and avoid potential overflow issues. The number of pages (pgcount) is bounded, so overflow is not a practical concern here. However, using kmalloc_array() better reflects the intent to allocate an array and improves consistency with other allocations. No functional change intended. 
Signed-off-by: Rahul Kumar Signed-off-by: Wei Liu --- drivers/hv/hv_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 5f5047f122a9..0a3ab7efed46 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -487,7 +487,7 @@ int hv_common_cpu_init(unsigned int cpu) * online and then taken offline */ if (!*inputarg) { - mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); + mem = kmalloc_array(pgcount, HV_HYP_PAGE_SIZE, flags); if (!mem) return -ENOMEM; From 8ec6070fc8aca0fce6327d8b2cb8bd77adb155c9 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 7 Nov 2025 14:27:12 +0100 Subject: [PATCH 40/58] mshv: add WQ_PERCPU to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently if a user enqueues a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This continues the effort to refactor workqueue APIs, which began with the introduction of new workqueues and a new alloc_workqueue flag in: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") This change adds a new WQ_PERCPU flag to explicitly request alloc_workqueue() to be per-cpu when WQ_UNBOUND has not been specified. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Wei Liu --- drivers/hv/mshv_eventfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c index 806674722868..2a80af1d610a 100644 --- a/drivers/hv/mshv_eventfd.c +++ b/drivers/hv/mshv_eventfd.c @@ -592,7 +592,7 @@ static void mshv_irqfd_release(struct mshv_partition *pt) int mshv_irqfd_wq_init(void) { - irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", 0, 0); + irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", WQ_PERCPU, 0); if (!irqfd_cleanup_wq) return -ENOMEM; From ba9eb9b86d232854e983203dc2fb1ba18e316681 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Thu, 6 Nov 2025 14:13:30 -0800 Subject: [PATCH 41/58] mshv: Fix create memory region overlap check The current check is incorrect; it only checks if the beginning or end of a region is within an existing region. This doesn't account for userspace specifying a region that begins before and ends after an existing region. Change the logic to a range intersection check against gfns and uaddrs for each region. Remove mshv_partition_region_by_uaddr() as it is no longer used. 
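The new logic is the standard half-open interval intersection test, applied to both the gfn and the uaddr ranges of each existing region; as a sketch (the helper name is illustrative, the patch open-codes the condition):

	static bool ranges_overlap(u64 start1, u64 len1, u64 start2, u64 len2)
	{
		/* [start1, start1 + len1) intersects [start2, start2 + len2) */
		return !(start1 + len1 <= start2 || start2 + len2 <= start1);
	}

With this form, a new region that completely contains an existing one is rejected as well, which the old begin/end point lookups missed.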
Fixes: 621191d709b1 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs") Reported-by: Michael Kelley Closes: https://lore.kernel.org/linux-hyperv/SN6PR02MB41575BE0406D3AB22E1D7DB5D4C2A@SN6PR02MB4157.namprd02.prod.outlook.com/ Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 7684645ef00d..a8f3c5f3ce34 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1174,21 +1174,6 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) return NULL; } -static struct mshv_mem_region * -mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr) -{ - struct mshv_mem_region *region; - - hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { - if (uaddr >= region->start_uaddr && - uaddr < region->start_uaddr + - (region->nr_pages << HV_HYP_PAGE_SHIFT)) - return region; - } - - return NULL; -} - /* * NB: caller checks and makes sure mem->size is page aligned * Returns: 0 with regionpp updated on success, or -errno @@ -1198,15 +1183,21 @@ static int mshv_partition_create_region(struct mshv_partition *partition, struct mshv_mem_region **regionpp, bool is_mmio) { - struct mshv_mem_region *region; + struct mshv_mem_region *region, *rg; u64 nr_pages = HVPFN_DOWN(mem->size); /* Reject overlapping regions */ - if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) || - mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) || - mshv_partition_region_by_uaddr(partition, mem->userspace_addr) || - mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1)) + hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) { + u64 rg_size = rg->nr_pages << HV_HYP_PAGE_SHIFT; + + if ((mem->guest_pfn + nr_pages <= rg->start_gfn || + rg->start_gfn + rg->nr_pages <= mem->guest_pfn) && + (mem->userspace_addr + mem->size <= rg->start_uaddr || + rg->start_uaddr + rg_size <= mem->userspace_addr)) + continue; + return -EEXIST; + } region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages); if (!region) From f91bc8f61abf0e1d23108ae9871c60d7612a09b2 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Thu, 6 Nov 2025 14:13:31 -0800 Subject: [PATCH 42/58] mshv: Allow mappings that overlap in uaddr Currently the MSHV driver rejects mappings that would overlap in userspace. Some VMMs require the same memory to be mapped to different parts of the guest's address space, and so working around this restriction is difficult. The hypervisor itself doesn't prohibit mappings that overlap in uaddr, (really in SPA; system physical addresses), so supporting this in the driver doesn't require any extra work: only the checks need to be removed. Since no userspace code until now has been able to overlap regions in userspace, relaxing this constraint can't break any existing code. 
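For example, a VMM can now alias one host buffer at two different guest physical ranges; a sketch of the two region descriptors (field names follow struct mshv_user_mem_region, everything else here is illustrative):

	struct mshv_user_mem_region lo = {
		.size           = 0x200000,	/* 2 MiB */
		.guest_pfn      = 0x100000,	/* GPA 4 GiB */
		.userspace_addr = (__u64)buf,
	};
	struct mshv_user_mem_region hi = lo;

	hi.guest_pfn = 0x200000;	/* same buf, different GPA: now accepted */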
Signed-off-by: Magnus Kulke Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 8 ++------ include/uapi/linux/mshv.h | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index a8f3c5f3ce34..3f73c468e975 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1188,12 +1188,8 @@ static int mshv_partition_create_region(struct mshv_partition *partition, /* Reject overlapping regions */ hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) { - u64 rg_size = rg->nr_pages << HV_HYP_PAGE_SHIFT; - - if ((mem->guest_pfn + nr_pages <= rg->start_gfn || - rg->start_gfn + rg->nr_pages <= mem->guest_pfn) && - (mem->userspace_addr + mem->size <= rg->start_uaddr || - rg->start_uaddr + rg_size <= mem->userspace_addr)) + if (mem->guest_pfn + nr_pages <= rg->start_gfn || + rg->start_gfn + rg->nr_pages <= mem->guest_pfn) continue; return -EEXIST; diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index 876bfe4e4227..374f75e198bc 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -89,7 +89,7 @@ enum { * @rsvd: MBZ * * Map or unmap a region of userspace memory to Guest Physical Addresses (GPA). - * Mappings can't overlap in GPA space or userspace. + * Mappings can't overlap in GPA space. * To unmap, these fields must match an existing mapping. */ struct mshv_user_mem_region { From c91fe5f162f278d4aa960d06d2dbc42f9857593a Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 13 Nov 2025 11:45:33 -0800 Subject: [PATCH 43/58] mshv: Extend create partition ioctl to support cpu features The existing mshv create partition ioctl does not provide a way to specify which cpu features are enabled in the guest. Instead, it attempts to enable all features and those that are not supported are silently disabled by the hypervisor. This was done to reduce unnecessary complexity and is sufficient for many cases. However, new scenarios require fine-grained control over these features. Define a new mshv_create_partition_v2 structure which supports passing the disabled processor and xsave feature bits through to the create partition hypercall directly. Introduce a new flag MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES which enables the new structure. If unset, the original mshv_create_partition struct is used, with the old behavior of enabling all features. 
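As a sketch of the new usage (values are illustrative; field names follow struct mshv_create_partition_v2 in the uapi header below), a VMM that wants the new semantics but no features disabled opts in and leaves the masks zero; mshv_fd is the open /dev/mshv descriptor:

	struct mshv_create_partition_v2 args = {
		.pt_flags = (1ULL << MSHV_PT_BIT_LAPIC) |
			    (1ULL << MSHV_PT_BIT_X2APIC) |
			    (1ULL << MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES),
		.pt_isolation = MSHV_PT_ISOLATION_NONE,
		.pt_num_cpu_fbanks = MSHV_NUM_CPU_FEATURES_BANKS,
		/* pt_cpu_fbanks[] and pt_disabled_xsave left zero: disable nothing */
	};

	int pt_fd = ioctl(mshv_fd, MSHV_CREATE_PARTITION, &args);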
Co-developed-by: Jinank Jain Signed-off-by: Jinank Jain Signed-off-by: Muminul Islam Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 122 +++++++++++++++++++++++++++++------- include/uapi/linux/mshv.h | 34 ++++++++++ 2 files changed, 133 insertions(+), 23 deletions(-) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 3f73c468e975..45c7a5fea1cf 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1855,43 +1855,119 @@ add_partition(struct mshv_partition *partition) return 0; } -static long -mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) -{ - struct mshv_create_partition args; - u64 creation_flags; - struct hv_partition_creation_properties creation_properties = {}; - union hv_partition_isolation_properties isolation_properties = {}; - struct mshv_partition *partition; - struct file *file; - int fd; - long ret; +static_assert(MSHV_NUM_CPU_FEATURES_BANKS == + HV_PARTITION_PROCESSOR_FEATURES_BANKS); - if (copy_from_user(&args, user_arg, sizeof(args))) +static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, + struct hv_partition_creation_properties *cr_props, + union hv_partition_isolation_properties *isol_props) +{ + int i; + struct mshv_create_partition_v2 args; + union hv_partition_processor_features *disabled_procs; + union hv_partition_processor_xsave_features *disabled_xsave; + + /* First, copy v1 struct in case user is on previous versions */ + if (copy_from_user(&args, user_arg, + sizeof(struct mshv_create_partition))) return -EFAULT; if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) return -EINVAL; - /* Only support EXO partitions */ - creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | - HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; + disabled_procs = &cr_props->disabled_processor_features; + disabled_xsave = &cr_props->disabled_processor_xsave_features; - if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC)) - creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; - if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC)) - creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; - if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES)) - creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; + /* Check if user provided newer struct with feature fields */ + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) { + if (copy_from_user(&args, user_arg, sizeof(args))) + return -EFAULT; + + /* Re-validate v1 fields after second copy_from_user() */ + if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || + args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) + return -EINVAL; + + if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS || + mshv_field_nonzero(args, pt_rsvd) || + mshv_field_nonzero(args, pt_rsvd1)) + return -EINVAL; + + /* + * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never + * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS + * (i.e. 2). + * + * Further banks (index >= 2) will be modifiable as 'early' + * properties via the set partition property hypercall. + */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i]; + +#if IS_ENABLED(CONFIG_X86_64) + disabled_xsave->as_uint64 = args.pt_disabled_xsave; +#else + /* + * In practice this field is ignored on arm64, but safer to + * zero it in case it is ever used. 
+ */ + disabled_xsave->as_uint64 = 0; + + if (mshv_field_nonzero(args, pt_rsvd2)) + return -EINVAL; +#endif + } else { + /* + * v1 behavior: try to enable everything. The hypervisor will + * disable features that are not supported. The banks can be + * queried via the get partition property hypercall. + */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = 0; + + disabled_xsave->as_uint64 = 0; + } + + /* Only support EXO partitions */ + *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | + HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; + + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; + + isol_props->as_uint64 = 0; switch (args.pt_isolation) { case MSHV_PT_ISOLATION_NONE: - isolation_properties.isolation_type = - HV_PARTITION_ISOLATION_TYPE_NONE; + isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE; break; } + return 0; +} + +static long +mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) +{ + u64 creation_flags; + struct hv_partition_creation_properties creation_properties; + union hv_partition_isolation_properties isolation_properties; + struct mshv_partition *partition; + struct file *file; + int fd; + long ret; + + ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags, + &creation_properties, + &isolation_properties); + if (ret) + return ret; + partition = kzalloc(sizeof(*partition), GFP_KERNEL); if (!partition) return -ENOMEM; diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index 374f75e198bc..b645d17cc531 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -26,6 +26,7 @@ enum { MSHV_PT_BIT_LAPIC, MSHV_PT_BIT_X2APIC, MSHV_PT_BIT_GPA_SUPER_PAGES, + MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES, MSHV_PT_BIT_COUNT, }; @@ -41,6 +42,8 @@ enum { * @pt_flags: Bitmask of 1 << MSHV_PT_BIT_* * @pt_isolation: MSHV_PT_ISOLATION_* * + * This is the initial/v1 version for backward compatibility. + * * Returns a file descriptor to act as a handle to a guest partition. * At this point the partition is not yet initialized in the hypervisor. * Some operations must be done with the partition in this state, e.g. setting @@ -52,6 +55,37 @@ struct mshv_create_partition { __u64 pt_isolation; }; +#define MSHV_NUM_CPU_FEATURES_BANKS 2 + +/** + * struct mshv_create_partition_v2 + * + * This is extended version of the above initial MSHV_CREATE_PARTITION + * ioctl and allows for following additional parameters: + * + * @pt_num_cpu_fbanks: Must be set to MSHV_NUM_CPU_FEATURES_BANKS. + * @pt_cpu_fbanks: Disabled processor feature banks array. + * @pt_disabled_xsave: Disabled xsave feature bits. + * + * pt_cpu_fbanks and pt_disabled_xsave are passed through as-is to the create + * partition hypercall. 
+ * + * Returns : same as above original mshv_create_partition + */ +struct mshv_create_partition_v2 { + __u64 pt_flags; + __u64 pt_isolation; + __u16 pt_num_cpu_fbanks; + __u8 pt_rsvd[6]; /* MBZ */ + __u64 pt_cpu_fbanks[MSHV_NUM_CPU_FEATURES_BANKS]; + __u64 pt_rsvd1[2]; /* MBZ */ +#if defined(__x86_64__) + __u64 pt_disabled_xsave; +#else + __u64 pt_rsvd2; /* MBZ */ +#endif +} __packed; + /* /dev/mshv */ #define MSHV_CREATE_PARTITION _IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition) From 796ef5a7fe86a8605f2844471ed7baa8e80bace8 Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Thu, 13 Nov 2025 04:41:47 +0000 Subject: [PATCH 44/58] static_call: allow using STATIC_CALL_TRAMP_STR() from assembly STATIC_CALL_TRAMP_STR() could not be used from .S files because static_call_types.h was not safe to include in assembly as it pulled in C types/constructs that are unavailable under __ASSEMBLY__. Make the header assembly-friendly by adding __ASSEMBLY__ checks and providing only the minimal definitions needed for assembly, so that it can be safely included by .S code. This enables emitting the static call trampoline symbol name via STATIC_CALL_TRAMP_STR() directly in assembly sources, to be used with 'call' instruction. Also, move a certain definitions out of __ASSEMBLY__ checks in compiler_types.h to meet the dependencies. No functional change for C compilation. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Naman Jain Signed-off-by: Wei Liu --- include/linux/compiler_types.h | 8 ++++---- include/linux/static_call_types.h | 4 ++++ tools/include/linux/static_call_types.h | 4 ++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 59288a2c1ad2..6897d4d5cb28 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -11,6 +11,10 @@ #define __has_builtin(x) (0) #endif +/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ +#define ___PASTE(a, b) a##b +#define __PASTE(a, b) ___PASTE(a, b) + #ifndef __ASSEMBLY__ /* @@ -79,10 +83,6 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __builtin_warning(x, y...) (1) #endif /* __CHECKER__ */ -/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ -#define ___PASTE(a,b) a##b -#define __PASTE(a,b) ___PASTE(a,b) - #ifdef __KERNEL__ /* Attributes */ diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h index 5a00b8b2cf9f..cfb6ddeb292b 100644 --- a/include/linux/static_call_types.h +++ b/include/linux/static_call_types.h @@ -25,6 +25,8 @@ #define STATIC_CALL_SITE_INIT 2UL /* init section */ #define STATIC_CALL_SITE_FLAGS 3UL +#ifndef __ASSEMBLY__ + /* * The static call site table needs to be created by external tooling (objtool * or a compiler plugin). @@ -100,4 +102,6 @@ struct static_call_key { #endif /* CONFIG_HAVE_STATIC_CALL */ +#endif /* __ASSEMBLY__ */ + #endif /* _STATIC_CALL_TYPES_H */ diff --git a/tools/include/linux/static_call_types.h b/tools/include/linux/static_call_types.h index 5a00b8b2cf9f..cfb6ddeb292b 100644 --- a/tools/include/linux/static_call_types.h +++ b/tools/include/linux/static_call_types.h @@ -25,6 +25,8 @@ #define STATIC_CALL_SITE_INIT 2UL /* init section */ #define STATIC_CALL_SITE_FLAGS 3UL +#ifndef __ASSEMBLY__ + /* * The static call site table needs to be created by external tooling (objtool * or a compiler plugin). 
@@ -100,4 +102,6 @@ struct static_call_key { #endif /* CONFIG_HAVE_STATIC_CALL */ +#endif /* __ASSEMBLY__ */ + #endif /* _STATIC_CALL_TYPES_H */ From cffe9f58de1eb1a2cefce10c9512162fca3f7c89 Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Thu, 13 Nov 2025 04:41:48 +0000 Subject: [PATCH 45/58] Drivers: hv: Export some symbols for mshv_vtl MSHV_VTL driver is going to be introduced, which is supposed to provide interface for Virtual Machine Monitors (VMMs) to control Virtual Trust Level (VTL). Export the symbols needed to make it work (vmbus_isr, hv_context and hv_post_message). Co-developed-by: Roman Kisel Signed-off-by: Roman Kisel Co-developed-by: Saurabh Sengar Signed-off-by: Saurabh Sengar Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202506110544.q0NDMQVc-lkp@intel.com/ Reviewed-by: Michael Kelley Signed-off-by: Naman Jain Signed-off-by: Wei Liu --- drivers/hv/hv.c | 3 +++ drivers/hv/hyperv_vmbus.h | 1 + drivers/hv/vmbus_drv.c | 4 +++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 936c5f310df6..c100f04b3581 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,7 @@ /* The one and only */ struct hv_context hv_context; +EXPORT_SYMBOL_FOR_MODULES(hv_context, "mshv_vtl"); /* * hv_init - Main initialization routine. @@ -104,6 +106,7 @@ int hv_post_message(union hv_connection_id connection_id, return hv_result(status); } +EXPORT_SYMBOL_FOR_MODULES(hv_post_message, "mshv_vtl"); static int hv_alloc_page(void **page, bool decrypt, const char *note) { diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index f7fc2630c054..b2862e0a317a 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -33,6 +33,7 @@ */ #define HV_UTIL_NEGO_TIMEOUT 55 +void vmbus_isr(void); /* Definitions for the monitored notification facility */ union hv_monitor_trigger_group { diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 0dc4692b411a..47fcab38398a 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include "hyperv_vmbus.h" @@ -1349,7 +1350,7 @@ static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message } } -static void vmbus_isr(void) +void vmbus_isr(void) { struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); @@ -1362,6 +1363,7 @@ static void vmbus_isr(void) add_interrupt_randomness(vmbus_interrupt); } +EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl"); static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) { From 7bfe3b8ea6e30437e01fcb8e4f56ef6e4d986d0f Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Thu, 13 Nov 2025 04:41:49 +0000 Subject: [PATCH 46/58] Drivers: hv: Introduce mshv_vtl driver Provide an interface for Virtual Machine Monitor like OpenVMM and its use as OpenHCL paravisor to control VTL0 (Virtual trust Level). Expose devices and support IOCTLs for features like VTL creation, VTL0 memory management, context switch, making hypercalls, mapping VTL0 address space to VTL2 userspace, getting new VMBus messages and channel events in VTL2 etc. 
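For illustration, a rough sketch of how a VTL2 user-mode component might drive the new devices (not part of this patch, and simplified to a single transition). The ioctl names are those handled by the driver code below; the run-page layout is a local mirror of struct mshv_vtl_run from drivers/hv/mshv_vtl.h, the mmap offset encoding follows mshv_vtl_fault(), and the caller is assumed to already be affined to the target CPU.

  #include <fcntl.h>
  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <sys/mman.h>
  #include <unistd.h>
  #include <linux/mshv.h>

  /* Local mirror of the run page layout (struct mshv_vtl_run); a real
   * VMM keeps this definition in sync with the driver. */
  struct vtl_run {
          uint32_t cancel;
          uint32_t vtl_ret_action_size;
          uint32_t pad[2];
          char exit_message[256];           /* MSHV_MAX_RUN_MSG_SIZE */
          /* cpu_context, reserved space and vtl_ret_actions follow */
  };

  static int run_vtl0_once(unsigned int cpu)
  {
          int mshv, vtl;
          struct vtl_run *run;

          mshv = open("/dev/mshv", O_RDWR | O_CLOEXEC);
          if (mshv < 0)
                  return -1;

          vtl = ioctl(mshv, MSHV_CREATE_VTL, 0);
          if (vtl < 0)
                  return -1;

          /* Per mshv_vtl_fault(): pgoff bits 0-15 select the CPU, the bits
           * above select the page (0 = run page, 1 = register page). */
          run = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
                     MAP_SHARED, vtl, (off_t)cpu * getpagesize());
          if (run == MAP_FAILED)
                  return -1;

          /* VTL0 register state would be loaded into the run page's
           * cpu_context before this call. */
          if (ioctl(vtl, MSHV_RETURN_TO_LOWER_VTL, 0) < 0)
                  return -1;              /* e.g. -EINTR on cancellation */

          /* run->exit_message now holds the intercept to service. */
          return 0;
  }
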
Co-developed-by: Roman Kisel Signed-off-by: Roman Kisel Co-developed-by: Saurabh Sengar Signed-off-by: Saurabh Sengar Reviewed-by: Michael Kelley Signed-off-by: Naman Jain Signed-off-by: Wei Liu --- arch/x86/hyperv/Makefile | 10 +- arch/x86/hyperv/hv_vtl.c | 30 + arch/x86/hyperv/mshv-asm-offsets.c | 37 + arch/x86/hyperv/mshv_vtl_asm.S | 116 +++ arch/x86/include/asm/mshyperv.h | 34 + drivers/hv/Kconfig | 27 +- drivers/hv/Makefile | 7 +- drivers/hv/mshv_vtl.h | 25 + drivers/hv/mshv_vtl_main.c | 1392 ++++++++++++++++++++++++++++ include/hyperv/hvgdk_mini.h | 106 +++ include/uapi/linux/mshv.h | 80 ++ 11 files changed, 1861 insertions(+), 3 deletions(-) create mode 100644 arch/x86/hyperv/mshv-asm-offsets.c create mode 100644 arch/x86/hyperv/mshv_vtl_asm.S create mode 100644 drivers/hv/mshv_vtl.h create mode 100644 drivers/hv/mshv_vtl_main.c diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 6f5d97cddd80..56292102af62 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,7 +1,12 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o obj-$(CONFIG_X86_64) += hv_apic.o -obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o +obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o mshv_vtl_asm.o + +$(obj)/mshv_vtl_asm.o: $(obj)/mshv-asm-offsets.h + +$(obj)/mshv-asm-offsets.h: $(obj)/mshv-asm-offsets.s FORCE + $(call filechk,offsets,__MSHV_ASM_OFFSETS_H__) ifdef CONFIG_X86_64 obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o @@ -12,3 +17,6 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o obj-$(CONFIG_CRASH_DUMP) += hv_crash.o hv_trampoline.o endif endif + +targets += mshv-asm-offsets.s +clean-files += mshv-asm-offsets.h diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 042e8712d8de..c0edaed0efb3 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -9,12 +9,17 @@ #include #include #include +#include +#include #include #include #include #include #include +#include +#include #include <../kernel/smpboot.h> +#include "../../kernel/fpu/legacy.h" extern struct boot_params boot_params; static struct real_mode_header hv_vtl_real_mode_header; @@ -249,3 +254,28 @@ int __init hv_vtl_early_init(void) return 0; } + +DEFINE_STATIC_CALL_NULL(__mshv_vtl_return_hypercall, void (*)(void)); + +void mshv_vtl_return_call_init(u64 vtl_return_offset) +{ + static_call_update(__mshv_vtl_return_hypercall, + (void *)((u8 *)hv_hypercall_pg + vtl_return_offset)); +} +EXPORT_SYMBOL(mshv_vtl_return_call_init); + +void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) +{ + struct hv_vp_assist_page *hvp; + + hvp = hv_vp_assist_page[smp_processor_id()]; + hvp->vtl_ret_x64rax = vtl0->rax; + hvp->vtl_ret_x64rcx = vtl0->rcx; + + kernel_fpu_begin_mask(0); + fxrstor(&vtl0->fx_state); + __mshv_vtl_return_call(vtl0); + fxsave(&vtl0->fx_state); + kernel_fpu_end(); +} +EXPORT_SYMBOL(mshv_vtl_return_call); diff --git a/arch/x86/hyperv/mshv-asm-offsets.c b/arch/x86/hyperv/mshv-asm-offsets.c new file mode 100644 index 000000000000..882c1db6df16 --- /dev/null +++ b/arch/x86/hyperv/mshv-asm-offsets.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + * + * Copyright (c) 2025, Microsoft Corporation. 
+ * + * Author: + * Naman Jain + */ +#define COMPILE_OFFSETS + +#include +#include + +static void __used common(void) +{ + if (IS_ENABLED(CONFIG_HYPERV_VTL_MODE)) { + OFFSET(MSHV_VTL_CPU_CONTEXT_rax, mshv_vtl_cpu_context, rax); + OFFSET(MSHV_VTL_CPU_CONTEXT_rcx, mshv_vtl_cpu_context, rcx); + OFFSET(MSHV_VTL_CPU_CONTEXT_rdx, mshv_vtl_cpu_context, rdx); + OFFSET(MSHV_VTL_CPU_CONTEXT_rbx, mshv_vtl_cpu_context, rbx); + OFFSET(MSHV_VTL_CPU_CONTEXT_rbp, mshv_vtl_cpu_context, rbp); + OFFSET(MSHV_VTL_CPU_CONTEXT_rsi, mshv_vtl_cpu_context, rsi); + OFFSET(MSHV_VTL_CPU_CONTEXT_rdi, mshv_vtl_cpu_context, rdi); + OFFSET(MSHV_VTL_CPU_CONTEXT_r8, mshv_vtl_cpu_context, r8); + OFFSET(MSHV_VTL_CPU_CONTEXT_r9, mshv_vtl_cpu_context, r9); + OFFSET(MSHV_VTL_CPU_CONTEXT_r10, mshv_vtl_cpu_context, r10); + OFFSET(MSHV_VTL_CPU_CONTEXT_r11, mshv_vtl_cpu_context, r11); + OFFSET(MSHV_VTL_CPU_CONTEXT_r12, mshv_vtl_cpu_context, r12); + OFFSET(MSHV_VTL_CPU_CONTEXT_r13, mshv_vtl_cpu_context, r13); + OFFSET(MSHV_VTL_CPU_CONTEXT_r14, mshv_vtl_cpu_context, r14); + OFFSET(MSHV_VTL_CPU_CONTEXT_r15, mshv_vtl_cpu_context, r15); + OFFSET(MSHV_VTL_CPU_CONTEXT_cr2, mshv_vtl_cpu_context, cr2); + } +} diff --git a/arch/x86/hyperv/mshv_vtl_asm.S b/arch/x86/hyperv/mshv_vtl_asm.S new file mode 100644 index 000000000000..f595eefad9ab --- /dev/null +++ b/arch/x86/hyperv/mshv_vtl_asm.S @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Assembly level code for mshv_vtl VTL transition + * + * Copyright (c) 2025, Microsoft Corporation. + * + * Author: + * Naman Jain + */ + +#include +#include +#include +#include +#include +#include "mshv-asm-offsets.h" + + .text + .section .noinstr.text, "ax" +/* + * void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) + * + * This function is used to context switch between different Virtual Trust Levels. + * It is marked as 'noinstr' to prevent against instrumentation and debugging facilities. + * NMIs aren't a problem because the NMI handler saves/restores CR2 specifically to guard + * against #PFs in NMI context clobbering the guest state. 
+ */ +SYM_FUNC_START(__mshv_vtl_return_call) + /* Push callee save registers */ + pushq %rbp + mov %rsp, %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + + /* register switch to VTL0 clobbers all registers except rax/rcx */ + mov %_ASM_ARG1, %rax + + /* grab rbx/rbp/rsi/rdi/r8-r15 */ + mov MSHV_VTL_CPU_CONTEXT_rbx(%rax), %rbx + mov MSHV_VTL_CPU_CONTEXT_rbp(%rax), %rbp + mov MSHV_VTL_CPU_CONTEXT_rsi(%rax), %rsi + mov MSHV_VTL_CPU_CONTEXT_rdi(%rax), %rdi + mov MSHV_VTL_CPU_CONTEXT_r8(%rax), %r8 + mov MSHV_VTL_CPU_CONTEXT_r9(%rax), %r9 + mov MSHV_VTL_CPU_CONTEXT_r10(%rax), %r10 + mov MSHV_VTL_CPU_CONTEXT_r11(%rax), %r11 + mov MSHV_VTL_CPU_CONTEXT_r12(%rax), %r12 + mov MSHV_VTL_CPU_CONTEXT_r13(%rax), %r13 + mov MSHV_VTL_CPU_CONTEXT_r14(%rax), %r14 + mov MSHV_VTL_CPU_CONTEXT_r15(%rax), %r15 + + mov MSHV_VTL_CPU_CONTEXT_cr2(%rax), %rdx + mov %rdx, %cr2 + mov MSHV_VTL_CPU_CONTEXT_rdx(%rax), %rdx + + /* stash host registers on stack */ + pushq %rax + pushq %rcx + + xor %ecx, %ecx + + /* make a hypercall to switch VTL */ + call STATIC_CALL_TRAMP_STR(__mshv_vtl_return_hypercall) + + /* stash guest registers on stack, restore saved host copies */ + pushq %rax + pushq %rcx + mov 16(%rsp), %rcx + mov 24(%rsp), %rax + + mov %rdx, MSHV_VTL_CPU_CONTEXT_rdx(%rax) + mov %cr2, %rdx + mov %rdx, MSHV_VTL_CPU_CONTEXT_cr2(%rax) + pop MSHV_VTL_CPU_CONTEXT_rcx(%rax) + pop MSHV_VTL_CPU_CONTEXT_rax(%rax) + add $16, %rsp + + /* save rbx/rbp/rsi/rdi/r8-r15 */ + mov %rbx, MSHV_VTL_CPU_CONTEXT_rbx(%rax) + mov %rbp, MSHV_VTL_CPU_CONTEXT_rbp(%rax) + mov %rsi, MSHV_VTL_CPU_CONTEXT_rsi(%rax) + mov %rdi, MSHV_VTL_CPU_CONTEXT_rdi(%rax) + mov %r8, MSHV_VTL_CPU_CONTEXT_r8(%rax) + mov %r9, MSHV_VTL_CPU_CONTEXT_r9(%rax) + mov %r10, MSHV_VTL_CPU_CONTEXT_r10(%rax) + mov %r11, MSHV_VTL_CPU_CONTEXT_r11(%rax) + mov %r12, MSHV_VTL_CPU_CONTEXT_r12(%rax) + mov %r13, MSHV_VTL_CPU_CONTEXT_r13(%rax) + mov %r14, MSHV_VTL_CPU_CONTEXT_r14(%rax) + mov %r15, MSHV_VTL_CPU_CONTEXT_r15(%rax) + + /* pop callee-save registers r12-r15, rbx */ + pop %rbx + pop %r15 + pop %r14 + pop %r13 + pop %r12 + + pop %rbp + RET +SYM_FUNC_END(__mshv_vtl_return_call) +/* + * Make sure that static_call_key symbol: __SCK____mshv_vtl_return_hypercall is accessible here. + * Below code is inspired from __ADDRESSABLE(sym) macro. Symbol name is kept simple, to avoid + * naming it something like "__UNIQUE_ID_addressable___SCK____mshv_vtl_return_hypercall_662.0" + * which would otherwise have been generated by the macro. + */ + .section .discard.addressable,"aw" + .align 8 + .type mshv_vtl_return_sym, @object + .size mshv_vtl_return_sym, 8 +mshv_vtl_return_sym: + .quad __SCK____mshv_vtl_return_hypercall diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 1342d55c2545..10037125099a 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * Hyper-V always provides a single IO-APIC at this MMIO address. 
@@ -269,13 +270,46 @@ static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; } static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; } #endif /* CONFIG_HYPERV */ +struct mshv_vtl_cpu_context { + union { + struct { + u64 rax; + u64 rcx; + u64 rdx; + u64 rbx; + u64 cr2; + u64 rbp; + u64 rsi; + u64 rdi; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; + }; + u64 gp_regs[16]; + }; + + struct fxregs_state fx_state; +}; #ifdef CONFIG_HYPERV_VTL_MODE void __init hv_vtl_init_platform(void); int __init hv_vtl_early_init(void); +void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0); +void mshv_vtl_return_call_init(u64 vtl_return_offset); +void mshv_vtl_return_hypercall(void); +void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0); #else static inline void __init hv_vtl_init_platform(void) {} static inline int __init hv_vtl_early_init(void) { return 0; } +static inline void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {} +static inline void mshv_vtl_return_call_init(u64 vtl_return_offset) {} +static inline void mshv_vtl_return_hypercall(void) {} +static inline void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {} #endif #include diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 0b8c391a0342..d4a8d349200c 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -17,7 +17,8 @@ config HYPERV config HYPERV_VTL_MODE bool "Enable Linux to boot in VTL context" - depends on (X86_64 || ARM64) && HYPERV + depends on (X86_64 && HAVE_STATIC_CALL) || ARM64 + depends on HYPERV depends on SMP default n help @@ -82,4 +83,28 @@ config MSHV_ROOT If unsure, say N. +config MSHV_VTL + tristate "Microsoft Hyper-V VTL driver" + depends on X86_64 && HYPERV_VTL_MODE + depends on HYPERV_VMBUS + # Mapping VTL0 memory to a userspace process in VTL2 is supported in OpenHCL. + # VTL2 for OpenHCL makes use of Huge Pages to improve performance on VMs, + # specially with large memory requirements. + depends on TRANSPARENT_HUGEPAGE + # MTRRs are controlled by VTL0, and are not specific to individual VTLs. + # Therefore, do not attempt to access or modify MTRRs here. + depends on !MTRR + select CPUMASK_OFFSTACK + select VIRT_XFER_TO_GUEST_WORK + default n + help + Select this option to enable Hyper-V VTL driver support. + This driver provides interfaces for Virtual Machine Manager (VMM) running in VTL2 + userspace to create VTLs and partitions, setup and manage VTL0 memory and + allow userspace to make direct hypercalls. This also allows to map VTL0's address + space to a usermode process in VTL2 and supports getting new VMBus messages and channel + events in VTL2. + + If unsure, say N. 
+ endmenu diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index 1a1677bf4dac..6d929fb0e13d 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_HYPERV_VMBUS) += hv_vmbus.o obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o obj-$(CONFIG_MSHV_ROOT) += mshv_root.o +obj-$(CONFIG_MSHV_VTL) += mshv_vtl.o CFLAGS_hv_trace.o = -I$(src) CFLAGS_hv_balloon.o = -I$(src) @@ -14,7 +15,11 @@ hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \ mshv_root_hv_call.o mshv_portid_table.o +mshv_vtl-y := mshv_vtl_main.o # Code that must be built-in obj-$(CONFIG_HYPERV) += hv_common.o -obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o +obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o +ifneq ($(CONFIG_MSHV_ROOT)$(CONFIG_MSHV_VTL),) + obj-y += mshv_common.o +endif diff --git a/drivers/hv/mshv_vtl.h b/drivers/hv/mshv_vtl.h new file mode 100644 index 000000000000..a6eea52f7aa2 --- /dev/null +++ b/drivers/hv/mshv_vtl.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _MSHV_VTL_H +#define _MSHV_VTL_H + +#include +#include + +struct mshv_vtl_run { + u32 cancel; + u32 vtl_ret_action_size; + u32 pad[2]; + char exit_message[MSHV_MAX_RUN_MSG_SIZE]; + union { + struct mshv_vtl_cpu_context cpu_context; + + /* + * Reserving room for the cpu context to grow and to maintain compatibility + * with user mode. + */ + char reserved[1024]; + }; + char vtl_ret_actions[MSHV_MAX_RUN_MSG_SIZE]; +}; + +#endif /* _MSHV_VTL_H */ diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c new file mode 100644 index 000000000000..2cebe9de5a5a --- /dev/null +++ b/drivers/hv/mshv_vtl_main.c @@ -0,0 +1,1392 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. 
+ * + * Author: + * Roman Kisel + * Saurabh Sengar + * Naman Jain + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../kernel/fpu/legacy.h" +#include "mshv.h" +#include "mshv_vtl.h" +#include "hyperv_vmbus.h" + +MODULE_AUTHOR("Microsoft"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Microsoft Hyper-V VTL Driver"); + +#define MSHV_ENTRY_REASON_LOWER_VTL_CALL 0x1 +#define MSHV_ENTRY_REASON_INTERRUPT 0x2 +#define MSHV_ENTRY_REASON_INTERCEPT 0x3 + +#define MSHV_REAL_OFF_SHIFT 16 +#define MSHV_PG_OFF_CPU_MASK (BIT_ULL(MSHV_REAL_OFF_SHIFT) - 1) +#define MSHV_RUN_PAGE_OFFSET 0 +#define MSHV_REG_PAGE_OFFSET 1 +#define VTL2_VMBUS_SINT_INDEX 7 + +static struct device *mem_dev; + +static struct tasklet_struct msg_dpc; +static wait_queue_head_t fd_wait_queue; +static bool has_message; +static struct eventfd_ctx *flag_eventfds[HV_EVENT_FLAGS_COUNT]; +static DEFINE_MUTEX(flag_lock); +static bool __read_mostly mshv_has_reg_page; + +/* hvcall code is of type u16, allocate a bitmap of size (1 << 16) to accommodate it */ +#define MAX_BITMAP_SIZE ((U16_MAX + 1) / 8) + +struct mshv_vtl_hvcall_fd { + u8 allow_bitmap[MAX_BITMAP_SIZE]; + bool allow_map_initialized; + /* + * Used to protect hvcall setup in IOCTLs + */ + struct mutex init_mutex; + struct miscdevice *dev; +}; + +struct mshv_vtl_poll_file { + struct file *file; + wait_queue_entry_t wait; + wait_queue_head_t *wqh; + poll_table pt; + int cpu; +}; + +struct mshv_vtl { + struct device *module_dev; + u64 id; +}; + +struct mshv_vtl_per_cpu { + struct mshv_vtl_run *run; + struct page *reg_page; +}; + +/* SYNIC_OVERLAY_PAGE_MSR - internal, identical to hv_synic_simp */ +union hv_synic_overlay_page_msr { + u64 as_uint64; + struct { + u64 enabled: 1; + u64 reserved: 11; + u64 pfn: 52; + } __packed; +}; + +static struct mutex mshv_vtl_poll_file_lock; +static union hv_register_vsm_page_offsets mshv_vsm_page_offsets; +static union hv_register_vsm_capabilities mshv_vsm_capabilities; + +static DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file); +static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions); +static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu); + +static const union hv_input_vtl input_vtl_zero; +static const union hv_input_vtl input_vtl_normal = { + .use_target_vtl = 1, +}; + +static const struct file_operations mshv_vtl_fops; + +static long +mshv_ioctl_create_vtl(void __user *user_arg, struct device *module_dev) +{ + struct mshv_vtl *vtl; + struct file *file; + int fd; + + vtl = kzalloc(sizeof(*vtl), GFP_KERNEL); + if (!vtl) + return -ENOMEM; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + kfree(vtl); + return fd; + } + file = anon_inode_getfile("mshv_vtl", &mshv_vtl_fops, + vtl, O_RDWR); + if (IS_ERR(file)) { + kfree(vtl); + return PTR_ERR(file); + } + vtl->module_dev = module_dev; + fd_install(fd, file); + + return fd; +} + +static long +mshv_ioctl_check_extension(void __user *user_arg) +{ + u32 arg; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + switch (arg) { + case MSHV_CAP_CORE_API_STABLE: + return 0; + case MSHV_CAP_REGISTER_PAGE: + return mshv_has_reg_page; + case MSHV_CAP_VTL_RETURN_ACTION: + return mshv_vsm_capabilities.return_action_available; + case MSHV_CAP_DR6_SHARED: + return mshv_vsm_capabilities.dr6_shared; + } + + return -EOPNOTSUPP; +} + +static long +mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) 
+{ + struct miscdevice *misc = filp->private_data; + + switch (ioctl) { + case MSHV_CHECK_EXTENSION: + return mshv_ioctl_check_extension((void __user *)arg); + case MSHV_CREATE_VTL: + return mshv_ioctl_create_vtl((void __user *)arg, misc->this_device); + } + + return -ENOTTY; +} + +static const struct file_operations mshv_dev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = mshv_dev_ioctl, + .llseek = noop_llseek, +}; + +static struct miscdevice mshv_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "mshv", + .fops = &mshv_dev_fops, + .mode = 0600, +}; + +static struct mshv_vtl_run *mshv_vtl_this_run(void) +{ + return *this_cpu_ptr(&mshv_vtl_per_cpu.run); +} + +static struct mshv_vtl_run *mshv_vtl_cpu_run(int cpu) +{ + return *per_cpu_ptr(&mshv_vtl_per_cpu.run, cpu); +} + +static struct page *mshv_vtl_cpu_reg_page(int cpu) +{ + return *per_cpu_ptr(&mshv_vtl_per_cpu.reg_page, cpu); +} + +static void mshv_vtl_configure_reg_page(struct mshv_vtl_per_cpu *per_cpu) +{ + struct hv_register_assoc reg_assoc = {}; + union hv_synic_overlay_page_msr overlay = {}; + struct page *reg_page; + + reg_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL); + if (!reg_page) { + WARN(1, "failed to allocate register page\n"); + return; + } + + overlay.enabled = 1; + overlay.pfn = page_to_hvpfn(reg_page); + reg_assoc.name = HV_X64_REGISTER_REG_PAGE; + reg_assoc.value.reg64 = overlay.as_uint64; + + if (hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_zero, ®_assoc)) { + WARN(1, "failed to setup register page\n"); + __free_page(reg_page); + return; + } + + per_cpu->reg_page = reg_page; + mshv_has_reg_page = true; +} + +static void mshv_vtl_synic_enable_regs(unsigned int cpu) +{ + union hv_synic_sint sint; + + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = false; + sint.auto_eoi = hv_recommend_using_aeoi(); + + /* Enable intercepts */ + if (!mshv_vsm_capabilities.intercept_page_available) + hv_set_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* VTL2 Host VSP SINT is (un)masked when the user mode requests that */ +} + +static int mshv_vtl_get_vsm_regs(void) +{ + struct hv_register_assoc registers[2]; + int ret, count = 2; + + registers[0].name = HV_REGISTER_VSM_CODE_PAGE_OFFSETS; + registers[1].name = HV_REGISTER_VSM_CAPABILITIES; + + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + count, input_vtl_zero, registers); + if (ret) + return ret; + + mshv_vsm_page_offsets.as_uint64 = registers[0].value.reg64; + mshv_vsm_capabilities.as_uint64 = registers[1].value.reg64; + + return ret; +} + +static int mshv_vtl_configure_vsm_partition(struct device *dev) +{ + union hv_register_vsm_partition_config config; + struct hv_register_assoc reg_assoc; + + config.as_uint64 = 0; + config.default_vtl_protection_mask = HV_MAP_GPA_PERMISSIONS_MASK; + config.enable_vtl_protection = 1; + config.zero_memory_on_reset = 1; + config.intercept_vp_startup = 1; + config.intercept_cpuid_unimplemented = 1; + + if (mshv_vsm_capabilities.intercept_page_available) { + dev_dbg(dev, "using intercept page\n"); + config.intercept_page = 1; + } + + reg_assoc.name = HV_REGISTER_VSM_PARTITION_CONFIG; + reg_assoc.value.reg64 = config.as_uint64; + + return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_zero, ®_assoc); +} + +static void mshv_vtl_vmbus_isr(void) +{ + struct hv_per_cpu_context *per_cpu; + struct hv_message *msg; + u32 message_type; + union hv_synic_event_flags *event_flags; + 
struct eventfd_ctx *eventfd; + u16 i; + + per_cpu = this_cpu_ptr(hv_context.cpu_context); + if (smp_processor_id() == 0) { + msg = (struct hv_message *)per_cpu->hyp_synic_message_page + VTL2_VMBUS_SINT_INDEX; + message_type = READ_ONCE(msg->header.message_type); + if (message_type != HVMSG_NONE) + tasklet_schedule(&msg_dpc); + } + + event_flags = (union hv_synic_event_flags *)per_cpu->hyp_synic_event_page + + VTL2_VMBUS_SINT_INDEX; + for_each_set_bit(i, event_flags->flags, HV_EVENT_FLAGS_COUNT) { + if (!sync_test_and_clear_bit(i, event_flags->flags)) + continue; + rcu_read_lock(); + eventfd = READ_ONCE(flag_eventfds[i]); + if (eventfd) + eventfd_signal(eventfd); + rcu_read_unlock(); + } + + vmbus_isr(); +} + +static int mshv_vtl_alloc_context(unsigned int cpu) +{ + struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); + + per_cpu->run = (struct mshv_vtl_run *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!per_cpu->run) + return -ENOMEM; + + if (mshv_vsm_capabilities.intercept_page_available) + mshv_vtl_configure_reg_page(per_cpu); + + mshv_vtl_synic_enable_regs(cpu); + + return 0; +} + +static int mshv_vtl_cpuhp_online; + +static int hv_vtl_setup_synic(void) +{ + int ret; + + /* Use our isr to first filter out packets destined for userspace */ + hv_setup_vmbus_handler(mshv_vtl_vmbus_isr); + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vtl:online", + mshv_vtl_alloc_context, NULL); + if (ret < 0) { + hv_setup_vmbus_handler(vmbus_isr); + return ret; + } + + mshv_vtl_cpuhp_online = ret; + + return 0; +} + +static void hv_vtl_remove_synic(void) +{ + cpuhp_remove_state(mshv_vtl_cpuhp_online); + hv_setup_vmbus_handler(vmbus_isr); +} + +static int vtl_get_vp_register(struct hv_register_assoc *reg) +{ + return hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_normal, reg); +} + +static int vtl_set_vp_register(struct hv_register_assoc *reg) +{ + return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_normal, reg); +} + +static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg) +{ + struct mshv_vtl_ram_disposition vtl0_mem; + struct dev_pagemap *pgmap; + void *addr; + + if (copy_from_user(&vtl0_mem, arg, sizeof(vtl0_mem))) + return -EFAULT; + /* vtl0_mem.last_pfn is excluded in the pagemap range for VTL0 as per design */ + if (vtl0_mem.last_pfn <= vtl0_mem.start_pfn) { + dev_err(vtl->module_dev, "range start pfn (%llx) > end pfn (%llx)\n", + vtl0_mem.start_pfn, vtl0_mem.last_pfn); + return -EFAULT; + } + + pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->ranges[0].start = PFN_PHYS(vtl0_mem.start_pfn); + pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1; + pgmap->nr_range = 1; + pgmap->type = MEMORY_DEVICE_GENERIC; + + /* + * Determine the highest page order that can be used for the given memory range. + * This works best when the range is aligned; i.e. both the start and the length. 
+ */ + pgmap->vmemmap_shift = count_trailing_zeros(vtl0_mem.start_pfn | vtl0_mem.last_pfn); + dev_dbg(vtl->module_dev, + "Add VTL0 memory: start: 0x%llx, end_pfn: 0x%llx, page order: %lu\n", + vtl0_mem.start_pfn, vtl0_mem.last_pfn, pgmap->vmemmap_shift); + + addr = devm_memremap_pages(mem_dev, pgmap); + if (IS_ERR(addr)) { + dev_err(vtl->module_dev, "devm_memremap_pages error: %ld\n", PTR_ERR(addr)); + kfree(pgmap); + return -EFAULT; + } + + /* Don't free pgmap, since it has to stick around until the memory + * is unmapped, which will never happen as there is no scenario + * where VTL0 can be released/shutdown without bringing down VTL2. + */ + return 0; +} + +static void mshv_vtl_cancel(int cpu) +{ + int here = get_cpu(); + + if (here != cpu) { + if (!xchg_relaxed(&mshv_vtl_cpu_run(cpu)->cancel, 1)) + smp_send_reschedule(cpu); + } else { + WRITE_ONCE(mshv_vtl_this_run()->cancel, 1); + } + put_cpu(); +} + +static int mshv_vtl_poll_file_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) +{ + struct mshv_vtl_poll_file *poll_file = container_of(wait, struct mshv_vtl_poll_file, wait); + + mshv_vtl_cancel(poll_file->cpu); + + return 0; +} + +static void mshv_vtl_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) +{ + struct mshv_vtl_poll_file *poll_file = container_of(pt, struct mshv_vtl_poll_file, pt); + + WARN_ON(poll_file->wqh); + poll_file->wqh = wqh; + add_wait_queue(wqh, &poll_file->wait); +} + +static int mshv_vtl_ioctl_set_poll_file(struct mshv_vtl_set_poll_file __user *user_input) +{ + struct file *file, *old_file; + struct mshv_vtl_poll_file *poll_file; + struct mshv_vtl_set_poll_file input; + + if (copy_from_user(&input, user_input, sizeof(input))) + return -EFAULT; + + if (input.cpu >= num_possible_cpus() || !cpu_online(input.cpu)) + return -EINVAL; + /* + * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists. + * CPU is expected to remain online after above cpu_online() check. 
+ */ + + file = NULL; + file = fget(input.fd); + if (!file) + return -EBADFD; + + poll_file = per_cpu_ptr(&mshv_vtl_poll_file, READ_ONCE(input.cpu)); + if (!poll_file) + return -EINVAL; + + mutex_lock(&mshv_vtl_poll_file_lock); + + if (poll_file->wqh) + remove_wait_queue(poll_file->wqh, &poll_file->wait); + poll_file->wqh = NULL; + + old_file = poll_file->file; + poll_file->file = file; + poll_file->cpu = input.cpu; + + if (file) { + init_waitqueue_func_entry(&poll_file->wait, mshv_vtl_poll_file_wake); + init_poll_funcptr(&poll_file->pt, mshv_vtl_ptable_queue_proc); + vfs_poll(file, &poll_file->pt); + } + + mutex_unlock(&mshv_vtl_poll_file_lock); + + if (old_file) + fput(old_file); + + return 0; +} + +/* Static table mapping register names to their corresponding actions */ +static const struct { + enum hv_register_name reg_name; + int debug_reg_num; /* -1 if not a debug register */ + u32 msr_addr; /* 0 if not an MSR */ +} reg_table[] = { + /* Debug registers */ + {HV_X64_REGISTER_DR0, 0, 0}, + {HV_X64_REGISTER_DR1, 1, 0}, + {HV_X64_REGISTER_DR2, 2, 0}, + {HV_X64_REGISTER_DR3, 3, 0}, + {HV_X64_REGISTER_DR6, 6, 0}, + /* MTRR MSRs */ + {HV_X64_REGISTER_MSR_MTRR_CAP, -1, MSR_MTRRcap}, + {HV_X64_REGISTER_MSR_MTRR_DEF_TYPE, -1, MSR_MTRRdefType}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0, -1, MTRRphysBase_MSR(0)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1, -1, MTRRphysBase_MSR(1)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2, -1, MTRRphysBase_MSR(2)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3, -1, MTRRphysBase_MSR(3)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4, -1, MTRRphysBase_MSR(4)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5, -1, MTRRphysBase_MSR(5)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6, -1, MTRRphysBase_MSR(6)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7, -1, MTRRphysBase_MSR(7)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8, -1, MTRRphysBase_MSR(8)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9, -1, MTRRphysBase_MSR(9)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA, -1, MTRRphysBase_MSR(0xa)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB, -1, MTRRphysBase_MSR(0xb)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC, -1, MTRRphysBase_MSR(0xc)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASED, -1, MTRRphysBase_MSR(0xd)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE, -1, MTRRphysBase_MSR(0xe)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF, -1, MTRRphysBase_MSR(0xf)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0, -1, MTRRphysMask_MSR(0)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1, -1, MTRRphysMask_MSR(1)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2, -1, MTRRphysMask_MSR(2)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3, -1, MTRRphysMask_MSR(3)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4, -1, MTRRphysMask_MSR(4)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5, -1, MTRRphysMask_MSR(5)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6, -1, MTRRphysMask_MSR(6)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7, -1, MTRRphysMask_MSR(7)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8, -1, MTRRphysMask_MSR(8)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9, -1, MTRRphysMask_MSR(9)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA, -1, MTRRphysMask_MSR(0xa)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB, -1, MTRRphysMask_MSR(0xb)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC, -1, MTRRphysMask_MSR(0xc)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD, -1, MTRRphysMask_MSR(0xd)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE, -1, MTRRphysMask_MSR(0xe)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF, -1, MTRRphysMask_MSR(0xf)}, + {HV_X64_REGISTER_MSR_MTRR_FIX64K00000, -1, MSR_MTRRfix64K_00000}, + {HV_X64_REGISTER_MSR_MTRR_FIX16K80000, -1, MSR_MTRRfix16K_80000}, + 
{HV_X64_REGISTER_MSR_MTRR_FIX16KA0000, -1, MSR_MTRRfix16K_A0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KC0000, -1, MSR_MTRRfix4K_C0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KC8000, -1, MSR_MTRRfix4K_C8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KD0000, -1, MSR_MTRRfix4K_D0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KD8000, -1, MSR_MTRRfix4K_D8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KE0000, -1, MSR_MTRRfix4K_E0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KE8000, -1, MSR_MTRRfix4K_E8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KF0000, -1, MSR_MTRRfix4K_F0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KF8000, -1, MSR_MTRRfix4K_F8000}, +}; + +static int mshv_vtl_get_set_reg(struct hv_register_assoc *regs, bool set) +{ + u64 *reg64; + enum hv_register_name gpr_name; + int i; + + gpr_name = regs->name; + reg64 = ®s->value.reg64; + + /* Search for the register in the table */ + for (i = 0; i < ARRAY_SIZE(reg_table); i++) { + if (reg_table[i].reg_name != gpr_name) + continue; + if (reg_table[i].debug_reg_num != -1) { + /* Handle debug registers */ + if (gpr_name == HV_X64_REGISTER_DR6 && + !mshv_vsm_capabilities.dr6_shared) + goto hypercall; + if (set) + native_set_debugreg(reg_table[i].debug_reg_num, *reg64); + else + *reg64 = native_get_debugreg(reg_table[i].debug_reg_num); + } else { + /* Handle MSRs */ + if (set) + wrmsrl(reg_table[i].msr_addr, *reg64); + else + rdmsrl(reg_table[i].msr_addr, *reg64); + } + return 0; + } + +hypercall: + return 1; +} + +static void mshv_vtl_return(struct mshv_vtl_cpu_context *vtl0) +{ + struct hv_vp_assist_page *hvp; + + hvp = hv_vp_assist_page[smp_processor_id()]; + + /* + * Process signal event direct set in the run page, if any. + */ + if (mshv_vsm_capabilities.return_action_available) { + u32 offset = READ_ONCE(mshv_vtl_this_run()->vtl_ret_action_size); + + WRITE_ONCE(mshv_vtl_this_run()->vtl_ret_action_size, 0); + + /* + * Hypervisor will take care of clearing out the actions + * set in the assist page. 
+ */ + memcpy(hvp->vtl_ret_actions, + mshv_vtl_this_run()->vtl_ret_actions, + min_t(u32, offset, sizeof(hvp->vtl_ret_actions))); + } + + mshv_vtl_return_call(vtl0); +} + +static bool mshv_vtl_process_intercept(void) +{ + struct hv_per_cpu_context *mshv_cpu; + void *synic_message_page; + struct hv_message *msg; + u32 message_type; + + mshv_cpu = this_cpu_ptr(hv_context.cpu_context); + synic_message_page = mshv_cpu->hyp_synic_message_page; + if (unlikely(!synic_message_page)) + return true; + + msg = (struct hv_message *)synic_message_page + HV_SYNIC_INTERCEPTION_SINT_INDEX; + message_type = READ_ONCE(msg->header.message_type); + if (message_type == HVMSG_NONE) + return true; + + memcpy(mshv_vtl_this_run()->exit_message, msg, sizeof(*msg)); + vmbus_signal_eom(msg, message_type); + + return false; +} + +static int mshv_vtl_ioctl_return_to_lower_vtl(void) +{ + preempt_disable(); + for (;;) { + unsigned long irq_flags; + struct hv_vp_assist_page *hvp; + int ret; + + if (__xfer_to_guest_mode_work_pending()) { + preempt_enable(); + ret = xfer_to_guest_mode_handle_work(); + if (ret) + return ret; + preempt_disable(); + } + + local_irq_save(irq_flags); + if (READ_ONCE(mshv_vtl_this_run()->cancel)) { + local_irq_restore(irq_flags); + preempt_enable(); + return -EINTR; + } + + mshv_vtl_return(&mshv_vtl_this_run()->cpu_context); + local_irq_restore(irq_flags); + + hvp = hv_vp_assist_page[smp_processor_id()]; + this_cpu_inc(num_vtl0_transitions); + switch (hvp->vtl_entry_reason) { + case MSHV_ENTRY_REASON_INTERRUPT: + if (!mshv_vsm_capabilities.intercept_page_available && + likely(!mshv_vtl_process_intercept())) + goto done; + break; + + case MSHV_ENTRY_REASON_INTERCEPT: + WARN_ON(!mshv_vsm_capabilities.intercept_page_available); + memcpy(mshv_vtl_this_run()->exit_message, hvp->intercept_message, + sizeof(hvp->intercept_message)); + goto done; + + default: + panic("unknown entry reason: %d", hvp->vtl_entry_reason); + } + } + +done: + preempt_enable(); + + return 0; +} + +static long +mshv_vtl_ioctl_get_regs(void __user *user_args) +{ + struct mshv_vp_registers args; + struct hv_register_assoc reg; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + /* This IOCTL supports processing only one register at a time. */ + if (args.count != 1) + return -EINVAL; + + if (copy_from_user(®, (void __user *)args.regs_ptr, + sizeof(reg))) + return -EFAULT; + + ret = mshv_vtl_get_set_reg(®, false); + if (!ret) + goto copy_args; /* No need of hypercall */ + ret = vtl_get_vp_register(®); + if (ret) + return ret; + +copy_args: + if (copy_to_user((void __user *)args.regs_ptr, ®, sizeof(reg))) + ret = -EFAULT; + + return ret; +} + +static long +mshv_vtl_ioctl_set_regs(void __user *user_args) +{ + struct mshv_vp_registers args; + struct hv_register_assoc reg; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + /* This IOCTL supports processing only one register at a time. 
*/ + if (args.count != 1) + return -EINVAL; + + if (copy_from_user(®, (void __user *)args.regs_ptr, sizeof(reg))) + return -EFAULT; + + ret = mshv_vtl_get_set_reg(®, true); + if (!ret) + return ret; /* No need of hypercall */ + ret = vtl_set_vp_register(®); + + return ret; +} + +static long +mshv_vtl_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + long ret; + struct mshv_vtl *vtl = filp->private_data; + + switch (ioctl) { + case MSHV_SET_POLL_FILE: + ret = mshv_vtl_ioctl_set_poll_file((struct mshv_vtl_set_poll_file __user *)arg); + break; + case MSHV_GET_VP_REGISTERS: + ret = mshv_vtl_ioctl_get_regs((void __user *)arg); + break; + case MSHV_SET_VP_REGISTERS: + ret = mshv_vtl_ioctl_set_regs((void __user *)arg); + break; + case MSHV_RETURN_TO_LOWER_VTL: + ret = mshv_vtl_ioctl_return_to_lower_vtl(); + break; + case MSHV_ADD_VTL0_MEMORY: + ret = mshv_vtl_ioctl_add_vtl0_mem(vtl, (void __user *)arg); + break; + default: + dev_err(vtl->module_dev, "invalid vtl ioctl: %#x\n", ioctl); + ret = -ENOTTY; + } + + return ret; +} + +static vm_fault_t mshv_vtl_fault(struct vm_fault *vmf) +{ + struct page *page; + int cpu = vmf->pgoff & MSHV_PG_OFF_CPU_MASK; + int real_off = vmf->pgoff >> MSHV_REAL_OFF_SHIFT; + + if (!cpu_online(cpu)) + return VM_FAULT_SIGBUS; + /* + * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists. + * CPU is expected to remain online after above cpu_online() check. + */ + + if (real_off == MSHV_RUN_PAGE_OFFSET) { + page = virt_to_page(mshv_vtl_cpu_run(cpu)); + } else if (real_off == MSHV_REG_PAGE_OFFSET) { + if (!mshv_has_reg_page) + return VM_FAULT_SIGBUS; + page = mshv_vtl_cpu_reg_page(cpu); + } else { + return VM_FAULT_NOPAGE; + } + + get_page(page); + vmf->page = page; + + return 0; +} + +static const struct vm_operations_struct mshv_vtl_vm_ops = { + .fault = mshv_vtl_fault, +}; + +static int mshv_vtl_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &mshv_vtl_vm_ops; + + return 0; +} + +static int mshv_vtl_release(struct inode *inode, struct file *filp) +{ + struct mshv_vtl *vtl = filp->private_data; + + kfree(vtl); + + return 0; +} + +static const struct file_operations mshv_vtl_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = mshv_vtl_ioctl, + .release = mshv_vtl_release, + .mmap = mshv_vtl_mmap, +}; + +static void mshv_vtl_synic_mask_vmbus_sint(const u8 *mask) +{ + union hv_synic_sint sint; + + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = (*mask != 0); + sint.auto_eoi = hv_recommend_using_aeoi(); + + hv_set_msr(HV_MSR_SINT0 + VTL2_VMBUS_SINT_INDEX, + sint.as_uint64); + + if (!sint.masked) + pr_debug("%s: Unmasking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id()); + else + pr_debug("%s: Masking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id()); +} + +static void mshv_vtl_read_remote(void *buffer) +{ + struct hv_per_cpu_context *mshv_cpu = this_cpu_ptr(hv_context.cpu_context); + struct hv_message *msg = (struct hv_message *)mshv_cpu->hyp_synic_message_page + + VTL2_VMBUS_SINT_INDEX; + u32 message_type = READ_ONCE(msg->header.message_type); + + WRITE_ONCE(has_message, false); + if (message_type == HVMSG_NONE) + return; + + memcpy(buffer, msg, sizeof(*msg)); + vmbus_signal_eom(msg, message_type); +} + +static bool vtl_synic_mask_vmbus_sint_masked = true; + +static ssize_t mshv_vtl_sint_read(struct file *filp, char __user *arg, size_t size, loff_t *offset) +{ + struct hv_message msg = {}; + int ret; + + if (size < sizeof(msg)) + return -EINVAL; + + for 
(;;) { + smp_call_function_single(VMBUS_CONNECT_CPU, mshv_vtl_read_remote, &msg, true); + if (msg.header.message_type != HVMSG_NONE) + break; + + if (READ_ONCE(vtl_synic_mask_vmbus_sint_masked)) + return 0; /* EOF */ + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + ret = wait_event_interruptible(fd_wait_queue, + READ_ONCE(has_message) || + READ_ONCE(vtl_synic_mask_vmbus_sint_masked)); + if (ret) + return ret; + } + + if (copy_to_user(arg, &msg, sizeof(msg))) + return -EFAULT; + + return sizeof(msg); +} + +static __poll_t mshv_vtl_sint_poll(struct file *filp, poll_table *wait) +{ + __poll_t mask = 0; + + poll_wait(filp, &fd_wait_queue, wait); + if (READ_ONCE(has_message) || READ_ONCE(vtl_synic_mask_vmbus_sint_masked)) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static void mshv_vtl_sint_on_msg_dpc(unsigned long data) +{ + WRITE_ONCE(has_message, true); + wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN); +} + +static int mshv_vtl_sint_ioctl_post_msg(struct mshv_vtl_sint_post_msg __user *arg) +{ + struct mshv_vtl_sint_post_msg message; + u8 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT]; + + if (copy_from_user(&message, arg, sizeof(message))) + return -EFAULT; + if (message.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) + return -EINVAL; + if (copy_from_user(payload, (void __user *)message.payload_ptr, + message.payload_size)) + return -EFAULT; + + return hv_post_message((union hv_connection_id)message.connection_id, + message.message_type, (void *)payload, + message.payload_size); +} + +static int mshv_vtl_sint_ioctl_signal_event(struct mshv_vtl_signal_event __user *arg) +{ + u64 input, status; + struct mshv_vtl_signal_event signal_event; + + if (copy_from_user(&signal_event, arg, sizeof(signal_event))) + return -EFAULT; + + input = signal_event.connection_id | ((u64)signal_event.flag << 32); + + status = hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, input); + + return hv_result_to_errno(status); +} + +static int mshv_vtl_sint_ioctl_set_eventfd(struct mshv_vtl_set_eventfd __user *arg) +{ + struct mshv_vtl_set_eventfd set_eventfd; + struct eventfd_ctx *eventfd, *old_eventfd; + + if (copy_from_user(&set_eventfd, arg, sizeof(set_eventfd))) + return -EFAULT; + if (set_eventfd.flag >= HV_EVENT_FLAGS_COUNT) + return -EINVAL; + + eventfd = NULL; + if (set_eventfd.fd >= 0) { + eventfd = eventfd_ctx_fdget(set_eventfd.fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + } + + guard(mutex)(&flag_lock); + old_eventfd = READ_ONCE(flag_eventfds[set_eventfd.flag]); + WRITE_ONCE(flag_eventfds[set_eventfd.flag], eventfd); + + if (old_eventfd) { + synchronize_rcu(); + eventfd_ctx_put(old_eventfd); + } + + return 0; +} + +static int mshv_vtl_sint_ioctl_pause_msg_stream(struct mshv_sint_mask __user *arg) +{ + static DEFINE_MUTEX(vtl2_vmbus_sint_mask_mutex); + struct mshv_sint_mask mask; + + if (copy_from_user(&mask, arg, sizeof(mask))) + return -EFAULT; + guard(mutex)(&vtl2_vmbus_sint_mask_mutex); + on_each_cpu((smp_call_func_t)mshv_vtl_synic_mask_vmbus_sint, &mask.mask, 1); + WRITE_ONCE(vtl_synic_mask_vmbus_sint_masked, mask.mask != 0); + if (mask.mask) + wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN); + + return 0; +} + +static long mshv_vtl_sint_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case MSHV_SINT_POST_MESSAGE: + return mshv_vtl_sint_ioctl_post_msg((struct mshv_vtl_sint_post_msg __user *)arg); + case MSHV_SINT_SIGNAL_EVENT: + return mshv_vtl_sint_ioctl_signal_event((struct mshv_vtl_signal_event __user *)arg); + case MSHV_SINT_SET_EVENTFD: + 
return mshv_vtl_sint_ioctl_set_eventfd((struct mshv_vtl_set_eventfd __user *)arg); + case MSHV_SINT_PAUSE_MESSAGE_STREAM: + return mshv_vtl_sint_ioctl_pause_msg_stream((struct mshv_sint_mask __user *)arg); + default: + return -ENOIOCTLCMD; + } +} + +static const struct file_operations mshv_vtl_sint_ops = { + .owner = THIS_MODULE, + .read = mshv_vtl_sint_read, + .poll = mshv_vtl_sint_poll, + .unlocked_ioctl = mshv_vtl_sint_ioctl, +}; + +static struct miscdevice mshv_vtl_sint_dev = { + .name = "mshv_sint", + .fops = &mshv_vtl_sint_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int mshv_vtl_hvcall_dev_open(struct inode *node, struct file *f) +{ + struct miscdevice *dev = f->private_data; + struct mshv_vtl_hvcall_fd *fd; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + fd = vzalloc(sizeof(*fd)); + if (!fd) + return -ENOMEM; + fd->dev = dev; + f->private_data = fd; + mutex_init(&fd->init_mutex); + + return 0; +} + +static int mshv_vtl_hvcall_dev_release(struct inode *node, struct file *f) +{ + struct mshv_vtl_hvcall_fd *fd; + + fd = f->private_data; + if (fd) { + vfree(fd); + f->private_data = NULL; + } + + return 0; +} + +static int mshv_vtl_hvcall_do_setup(struct mshv_vtl_hvcall_fd *fd, + struct mshv_vtl_hvcall_setup __user *hvcall_setup_user) +{ + struct mshv_vtl_hvcall_setup hvcall_setup; + + guard(mutex)(&fd->init_mutex); + + if (fd->allow_map_initialized) { + dev_err(fd->dev->this_device, + "Hypercall allow map has already been set, pid %d\n", + current->pid); + return -EINVAL; + } + + if (copy_from_user(&hvcall_setup, hvcall_setup_user, + sizeof(struct mshv_vtl_hvcall_setup))) { + return -EFAULT; + } + if (hvcall_setup.bitmap_array_size > ARRAY_SIZE(fd->allow_bitmap)) + return -EINVAL; + + if (copy_from_user(&fd->allow_bitmap, + (void __user *)hvcall_setup.allow_bitmap_ptr, + hvcall_setup.bitmap_array_size)) { + return -EFAULT; + } + + dev_info(fd->dev->this_device, "Hypercall allow map has been set, pid %d\n", + current->pid); + fd->allow_map_initialized = true; + return 0; +} + +static bool mshv_vtl_hvcall_is_allowed(struct mshv_vtl_hvcall_fd *fd, u16 call_code) +{ + return test_bit(call_code, (unsigned long *)fd->allow_bitmap); +} + +static int mshv_vtl_hvcall_call(struct mshv_vtl_hvcall_fd *fd, + struct mshv_vtl_hvcall __user *hvcall_user) +{ + struct mshv_vtl_hvcall hvcall; + void *in, *out; + int ret; + + if (copy_from_user(&hvcall, hvcall_user, sizeof(struct mshv_vtl_hvcall))) + return -EFAULT; + if (hvcall.input_size > HV_HYP_PAGE_SIZE) + return -EINVAL; + if (hvcall.output_size > HV_HYP_PAGE_SIZE) + return -EINVAL; + + /* + * By default, all hypercalls are not allowed. + * The user mode code has to set up the allow bitmap once. + */ + + if (!mshv_vtl_hvcall_is_allowed(fd, hvcall.control & 0xFFFF)) { + dev_err(fd->dev->this_device, + "Hypercall with control data %#llx isn't allowed\n", + hvcall.control); + return -EPERM; + } + + /* + * This may create a problem for Confidential VM (CVM) usecase where we need to use + * Hyper-V driver allocated per-cpu input and output pages (hyperv_pcpu_input_arg and + * hyperv_pcpu_output_arg) for making a hypervisor call. + * + * TODO: Take care of this when CVM support is added. 
+ */ + in = (void *)__get_free_page(GFP_KERNEL); + out = (void *)__get_free_page(GFP_KERNEL); + + if (copy_from_user(in, (void __user *)hvcall.input_ptr, hvcall.input_size)) { + ret = -EFAULT; + goto free_pages; + } + + hvcall.status = hv_do_hypercall(hvcall.control, in, out); + + if (copy_to_user((void __user *)hvcall.output_ptr, out, hvcall.output_size)) { + ret = -EFAULT; + goto free_pages; + } + ret = put_user(hvcall.status, &hvcall_user->status); +free_pages: + free_page((unsigned long)in); + free_page((unsigned long)out); + + return ret; +} + +static long mshv_vtl_hvcall_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + struct mshv_vtl_hvcall_fd *fd = f->private_data; + + switch (cmd) { + case MSHV_HVCALL_SETUP: + return mshv_vtl_hvcall_do_setup(fd, (struct mshv_vtl_hvcall_setup __user *)arg); + case MSHV_HVCALL: + return mshv_vtl_hvcall_call(fd, (struct mshv_vtl_hvcall __user *)arg); + default: + break; + } + + return -ENOIOCTLCMD; +} + +static const struct file_operations mshv_vtl_hvcall_dev_file_ops = { + .owner = THIS_MODULE, + .open = mshv_vtl_hvcall_dev_open, + .release = mshv_vtl_hvcall_dev_release, + .unlocked_ioctl = mshv_vtl_hvcall_dev_ioctl, +}; + +static struct miscdevice mshv_vtl_hvcall_dev = { + .name = "mshv_hvcall", + .nodename = "mshv_hvcall", + .fops = &mshv_vtl_hvcall_dev_file_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int mshv_vtl_low_open(struct inode *inodep, struct file *filp) +{ + pid_t pid = task_pid_vnr(current); + uid_t uid = current_uid().val; + int ret = 0; + + pr_debug("%s: Opening VTL low, task group %d, uid %d\n", __func__, pid, uid); + + if (capable(CAP_SYS_ADMIN)) { + filp->private_data = inodep; + } else { + pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d", + __func__, pid, uid); + ret = -EPERM; + } + + return ret; +} + +static bool can_fault(struct vm_fault *vmf, unsigned long size, unsigned long *pfn) +{ + unsigned long mask = size - 1; + unsigned long start = vmf->address & ~mask; + unsigned long end = start + size; + bool is_valid; + + is_valid = (vmf->address & mask) == ((vmf->pgoff << PAGE_SHIFT) & mask) && + start >= vmf->vma->vm_start && + end <= vmf->vma->vm_end; + + if (is_valid) + *pfn = vmf->pgoff & ~(mask >> PAGE_SHIFT); + + return is_valid; +} + +static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order) +{ + unsigned long pfn = vmf->pgoff; + vm_fault_t ret = VM_FAULT_FALLBACK; + + switch (order) { + case 0: + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); + + case PMD_ORDER: + if (can_fault(vmf, PMD_SIZE, &pfn)) + ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); + return ret; + + case PUD_ORDER: + if (can_fault(vmf, PUD_SIZE, &pfn)) + ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); + return ret; + + default: + return VM_FAULT_SIGBUS; + } +} + +static vm_fault_t mshv_vtl_low_fault(struct vm_fault *vmf) +{ + return mshv_vtl_low_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct mshv_vtl_low_vm_ops = { + .fault = mshv_vtl_low_fault, + .huge_fault = mshv_vtl_low_huge_fault, +}; + +static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &mshv_vtl_low_vm_ops; + vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP); + + return 0; +} + +static const struct file_operations mshv_vtl_low_file_ops = { + .owner = THIS_MODULE, + .open = mshv_vtl_low_open, + .mmap = mshv_vtl_low_mmap, +}; + +static struct miscdevice mshv_vtl_low = { + .name = "mshv_vtl_low", + 
.nodename = "mshv_vtl_low", + .fops = &mshv_vtl_low_file_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int __init mshv_vtl_init(void) +{ + int ret; + struct device *dev = mshv_dev.this_device; + + /* + * This creates /dev/mshv which provides functionality to create VTLs and partitions. + */ + ret = misc_register(&mshv_dev); + if (ret) { + dev_err(dev, "mshv device register failed: %d\n", ret); + goto free_dev; + } + + tasklet_init(&msg_dpc, mshv_vtl_sint_on_msg_dpc, 0); + init_waitqueue_head(&fd_wait_queue); + + if (mshv_vtl_get_vsm_regs()) { + dev_emerg(dev, "Unable to get VSM capabilities !!\n"); + ret = -ENODEV; + goto free_dev; + } + if (mshv_vtl_configure_vsm_partition(dev)) { + dev_emerg(dev, "VSM configuration failed !!\n"); + ret = -ENODEV; + goto free_dev; + } + + mshv_vtl_return_call_init(mshv_vsm_page_offsets.vtl_return_offset); + ret = hv_vtl_setup_synic(); + if (ret) + goto free_dev; + + /* + * mshv_sint device adds VMBus relay ioctl support. + * This provides a channel for VTL0 to communicate with VTL2. + */ + ret = misc_register(&mshv_vtl_sint_dev); + if (ret) + goto free_synic; + + /* + * mshv_hvcall device adds interface to enable userspace for direct hypercalls support. + */ + ret = misc_register(&mshv_vtl_hvcall_dev); + if (ret) + goto free_sint; + + /* + * mshv_vtl_low device is used to map VTL0 address space to a user-mode process in VTL2. + * It implements mmap() to allow a user-mode process in VTL2 to map to the address of VTL0. + */ + ret = misc_register(&mshv_vtl_low); + if (ret) + goto free_hvcall; + + /* + * "mshv vtl mem dev" device is later used to setup VTL0 memory. + */ + mem_dev = kzalloc(sizeof(*mem_dev), GFP_KERNEL); + if (!mem_dev) { + ret = -ENOMEM; + goto free_low; + } + + mutex_init(&mshv_vtl_poll_file_lock); + + device_initialize(mem_dev); + dev_set_name(mem_dev, "mshv vtl mem dev"); + ret = device_add(mem_dev); + if (ret) { + dev_err(dev, "mshv vtl mem dev add: %d\n", ret); + goto free_mem; + } + + return 0; + +free_mem: + kfree(mem_dev); +free_low: + misc_deregister(&mshv_vtl_low); +free_hvcall: + misc_deregister(&mshv_vtl_hvcall_dev); +free_sint: + misc_deregister(&mshv_vtl_sint_dev); +free_synic: + hv_vtl_remove_synic(); +free_dev: + misc_deregister(&mshv_dev); + + return ret; +} + +static void __exit mshv_vtl_exit(void) +{ + device_del(mem_dev); + kfree(mem_dev); + misc_deregister(&mshv_vtl_low); + misc_deregister(&mshv_vtl_hvcall_dev); + misc_deregister(&mshv_vtl_sint_dev); + hv_vtl_remove_synic(); + misc_deregister(&mshv_dev); +} + +module_init(mshv_vtl_init); +module_exit(mshv_vtl_exit); diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 7499a679e60a..1d5ce11be8b6 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -885,6 +885,48 @@ struct hv_get_vp_from_apic_id_in { u32 apic_ids[]; } __packed; +union hv_register_vsm_partition_config { + u64 as_uint64; + struct { + u64 enable_vtl_protection : 1; + u64 default_vtl_protection_mask : 4; + u64 zero_memory_on_reset : 1; + u64 deny_lower_vtl_startup : 1; + u64 intercept_acceptance : 1; + u64 intercept_enable_vtl_protection : 1; + u64 intercept_vp_startup : 1; + u64 intercept_cpuid_unimplemented : 1; + u64 intercept_unrecoverable_exception : 1; + u64 intercept_page : 1; + u64 mbz : 51; + } __packed; +}; + +union hv_register_vsm_capabilities { + u64 as_uint64; + struct { + u64 dr6_shared: 1; + u64 mbec_vtl_mask: 16; + u64 deny_lower_vtl_startup: 1; + u64 supervisor_shadow_stack: 1; + u64 hardware_hvpt_available: 1; + u64 
software_hvpt_available: 1; + u64 hardware_hvpt_range_bits: 6; + u64 intercept_page_available: 1; + u64 return_action_available: 1; + u64 reserved: 35; + } __packed; +}; + +union hv_register_vsm_page_offsets { + struct { + u64 vtl_call_offset : 12; + u64 vtl_return_offset : 12; + u64 reserved_mbz : 40; + } __packed; + u64 as_uint64; +}; + struct hv_nested_enlightenments_control { struct { u32 directhypercall : 1; @@ -1007,6 +1049,70 @@ enum hv_register_name { /* VSM */ HV_REGISTER_VSM_VP_STATUS = 0x000D0003, + + /* Synthetic VSM registers */ + HV_REGISTER_VSM_CODE_PAGE_OFFSETS = 0x000D0002, + HV_REGISTER_VSM_CAPABILITIES = 0x000D0006, + HV_REGISTER_VSM_PARTITION_CONFIG = 0x000D0007, + +#if defined(CONFIG_X86) + /* X64 Debug Registers */ + HV_X64_REGISTER_DR0 = 0x00050000, + HV_X64_REGISTER_DR1 = 0x00050001, + HV_X64_REGISTER_DR2 = 0x00050002, + HV_X64_REGISTER_DR3 = 0x00050003, + HV_X64_REGISTER_DR6 = 0x00050004, + HV_X64_REGISTER_DR7 = 0x00050005, + + /* X64 Cache control MSRs */ + HV_X64_REGISTER_MSR_MTRR_CAP = 0x0008000D, + HV_X64_REGISTER_MSR_MTRR_DEF_TYPE = 0x0008000E, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0 = 0x00080010, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1 = 0x00080011, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2 = 0x00080012, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3 = 0x00080013, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4 = 0x00080014, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5 = 0x00080015, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6 = 0x00080016, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7 = 0x00080017, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8 = 0x00080018, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9 = 0x00080019, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA = 0x0008001A, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB = 0x0008001B, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC = 0x0008001C, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASED = 0x0008001D, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE = 0x0008001E, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF = 0x0008001F, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0 = 0x00080040, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1 = 0x00080041, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2 = 0x00080042, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3 = 0x00080043, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4 = 0x00080044, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5 = 0x00080045, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6 = 0x00080046, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7 = 0x00080047, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8 = 0x00080048, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9 = 0x00080049, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA = 0x0008004A, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB = 0x0008004B, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC = 0x0008004C, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD = 0x0008004D, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE = 0x0008004E, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF = 0x0008004F, + HV_X64_REGISTER_MSR_MTRR_FIX64K00000 = 0x00080070, + HV_X64_REGISTER_MSR_MTRR_FIX16K80000 = 0x00080071, + HV_X64_REGISTER_MSR_MTRR_FIX16KA0000 = 0x00080072, + HV_X64_REGISTER_MSR_MTRR_FIX4KC0000 = 0x00080073, + HV_X64_REGISTER_MSR_MTRR_FIX4KC8000 = 0x00080074, + HV_X64_REGISTER_MSR_MTRR_FIX4KD0000 = 0x00080075, + HV_X64_REGISTER_MSR_MTRR_FIX4KD8000 = 0x00080076, + HV_X64_REGISTER_MSR_MTRR_FIX4KE0000 = 0x00080077, + HV_X64_REGISTER_MSR_MTRR_FIX4KE8000 = 0x00080078, + HV_X64_REGISTER_MSR_MTRR_FIX4KF0000 = 0x00080079, + HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A, + + HV_X64_REGISTER_REG_PAGE = 0x0009001C, +#endif }; /* diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index b645d17cc531..dee3ece28ce5 100644 --- a/include/uapi/linux/mshv.h +++ 
b/include/uapi/linux/mshv.h @@ -322,4 +322,84 @@ struct mshv_get_set_vp_state { * #define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) */ +/* Structure definitions, macros and IOCTLs for mshv_vtl */ + +#define MSHV_CAP_CORE_API_STABLE 0x0 +#define MSHV_CAP_REGISTER_PAGE 0x1 +#define MSHV_CAP_VTL_RETURN_ACTION 0x2 +#define MSHV_CAP_DR6_SHARED 0x3 +#define MSHV_MAX_RUN_MSG_SIZE 256 + +struct mshv_vp_registers { + __u32 count; /* supports only 1 register at a time */ + __u32 reserved; /* Reserved for alignment or future use */ + __u64 regs_ptr; /* pointer to struct hv_register_assoc */ +}; + +struct mshv_vtl_set_eventfd { + __s32 fd; + __u32 flag; +}; + +struct mshv_vtl_signal_event { + __u32 connection_id; + __u32 flag; +}; + +struct mshv_vtl_sint_post_msg { + __u64 message_type; + __u32 connection_id; + __u32 payload_size; /* Must not exceed HV_MESSAGE_PAYLOAD_BYTE_COUNT */ + __u64 payload_ptr; /* pointer to message payload (bytes) */ +}; + +struct mshv_vtl_ram_disposition { + __u64 start_pfn; + __u64 last_pfn; +}; + +struct mshv_vtl_set_poll_file { + __u32 cpu; + __u32 fd; +}; + +struct mshv_vtl_hvcall_setup { + __u64 bitmap_array_size; /* stores number of bytes */ + __u64 allow_bitmap_ptr; +}; + +struct mshv_vtl_hvcall { + __u64 control; /* Hypercall control code */ + __u64 input_size; /* Size of the input data */ + __u64 input_ptr; /* Pointer to the input struct */ + __u64 status; /* Status of the hypercall (output) */ + __u64 output_size; /* Size of the output data */ + __u64 output_ptr; /* Pointer to the output struct */ +}; + +struct mshv_sint_mask { + __u8 mask; + __u8 reserved[7]; +}; + +/* /dev/mshv device IOCTL */ +#define MSHV_CHECK_EXTENSION _IOW(MSHV_IOCTL, 0x00, __u32) + +/* vtl device */ +#define MSHV_CREATE_VTL _IOR(MSHV_IOCTL, 0x1D, char) +#define MSHV_ADD_VTL0_MEMORY _IOW(MSHV_IOCTL, 0x21, struct mshv_vtl_ram_disposition) +#define MSHV_SET_POLL_FILE _IOW(MSHV_IOCTL, 0x25, struct mshv_vtl_set_poll_file) +#define MSHV_RETURN_TO_LOWER_VTL _IO(MSHV_IOCTL, 0x27) +#define MSHV_GET_VP_REGISTERS _IOWR(MSHV_IOCTL, 0x05, struct mshv_vp_registers) +#define MSHV_SET_VP_REGISTERS _IOW(MSHV_IOCTL, 0x06, struct mshv_vp_registers) + +/* VMBus device IOCTLs */ +#define MSHV_SINT_SIGNAL_EVENT _IOW(MSHV_IOCTL, 0x22, struct mshv_vtl_signal_event) +#define MSHV_SINT_POST_MESSAGE _IOW(MSHV_IOCTL, 0x23, struct mshv_vtl_sint_post_msg) +#define MSHV_SINT_SET_EVENTFD _IOW(MSHV_IOCTL, 0x24, struct mshv_vtl_set_eventfd) +#define MSHV_SINT_PAUSE_MESSAGE_STREAM _IOW(MSHV_IOCTL, 0x25, struct mshv_sint_mask) + +/* hv_hvcall device */ +#define MSHV_HVCALL_SETUP _IOW(MSHV_IOCTL, 0x1E, struct mshv_vtl_hvcall_setup) +#define MSHV_HVCALL _IOWR(MSHV_IOCTL, 0x1F, struct mshv_vtl_hvcall) #endif From c720e6a873cc97bdfe8912986e26ceaeeaa6b240 Mon Sep 17 00:00:00 2001 From: "Anirudh Rayabharam (Microsoft)" Date: Wed, 19 Nov 2025 17:17:08 +0000 Subject: [PATCH 47/58] mshv: Add ioctl for self targeted passthrough hvcalls Allow MSHV_ROOT_HVCALL IOCTL on the /dev/mshv fd. This IOCTL executes a passthrough hypercall targeting the root/parent partition, i.e. HV_PARTITION_ID_SELF. This will be useful for the VMM to query things like supported synthetic processor features, supported VMM capabilities, etc. Since hypercalls targeting the host partition could potentially perform privileged operations, allow only a limited set of hypercalls.
To begin with, allow only: HVCALL_GET_PARTITION_PROPERTY HVCALL_GET_PARTITION_PROPERTY_EX Signed-off-by: Anirudh Rayabharam (Microsoft) Reviewed-by: Nuno Das Neves Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 47 ++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 45c7a5fea1cf..bc15d6f6922f 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -122,6 +122,7 @@ static struct miscdevice mshv_dev = { */ static u16 mshv_passthru_hvcalls[] = { HVCALL_GET_PARTITION_PROPERTY, + HVCALL_GET_PARTITION_PROPERTY_EX, HVCALL_SET_PARTITION_PROPERTY, HVCALL_INSTALL_INTERCEPT, HVCALL_GET_VP_REGISTERS, @@ -136,6 +137,16 @@ static u16 mshv_passthru_hvcalls[] = { HVCALL_GET_VP_CPUID_VALUES, }; +/* + * Only allow hypercalls that are safe to be called by the VMM with the host + * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a + * hypercall cannot be misused by the VMM before adding it to this list. + */ +static u16 mshv_self_passthru_hvcalls[] = { + HVCALL_GET_PARTITION_PROPERTY, + HVCALL_GET_PARTITION_PROPERTY_EX, +}; + static bool mshv_hvcall_is_async(u16 code) { switch (code) { @@ -147,12 +158,30 @@ static bool mshv_hvcall_is_async(u16 code) return false; } +static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id) +{ + int i; + int n = ARRAY_SIZE(mshv_passthru_hvcalls); + u16 *allowed_hvcalls = mshv_passthru_hvcalls; + + if (pt_id == HV_PARTITION_ID_SELF) { + n = ARRAY_SIZE(mshv_self_passthru_hvcalls); + allowed_hvcalls = mshv_self_passthru_hvcalls; + } + + for (i = 0; i < n; ++i) + if (allowed_hvcalls[i] == code) + return true; + + return false; +} + static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, bool partition_locked, void __user *user_args) { u64 status; - int ret = 0, i; + int ret = 0; bool is_async; struct mshv_root_hvcall args; struct page *page; @@ -160,6 +189,7 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, void *input_pg = NULL; void *output_pg = NULL; u16 reps_completed; + u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF; if (copy_from_user(&args, user_args, sizeof(args))) return -EFAULT; @@ -171,17 +201,13 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i) - if (args.code == mshv_passthru_hvcalls[i]) - break; - - if (i >= ARRAY_SIZE(mshv_passthru_hvcalls)) + if (!mshv_passthru_hvcall_allowed(args.code, pt_id)) return -EINVAL; is_async = mshv_hvcall_is_async(args.code); if (is_async) { /* async hypercalls can only be called from partition fd */ - if (!partition_locked) + if (!partition || !partition_locked) return -EINVAL; ret = mshv_init_async_handler(partition); if (ret) @@ -209,7 +235,7 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, * NOTE: This only works because all the allowed hypercalls' input * structs begin with a u64 partition_id field. 
*/ - *(u64 *)input_pg = partition->pt_id; + *(u64 *)input_pg = pt_id; reps_completed = 0; do { @@ -238,7 +264,7 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, ret = hv_result_to_errno(status); else ret = hv_call_deposit_pages(NUMA_NO_NODE, - partition->pt_id, 1); + pt_id, 1); } while (!ret); args.status = hv_result(status); @@ -2050,6 +2076,9 @@ static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, case MSHV_CREATE_PARTITION: return mshv_ioctl_create_partition((void __user *)arg, misc->this_device); + case MSHV_ROOT_HVCALL: + return mshv_ioctl_passthru_hvcall(NULL, false, + (void __user *)arg); } return -ENOTTY; From b5110eaf67530091343b519d8abd0cddd14660f2 Mon Sep 17 00:00:00 2001 From: Gongwei Li Date: Fri, 21 Nov 2025 11:10:41 +0800 Subject: [PATCH 48/58] Drivers: hv: use kmalloc_array() instead of kmalloc() Replace kmalloc() with kmalloc_array() to prevent potential overflow, as recommended in Documentation/process/deprecated.rst. Signed-off-by: Gongwei Li Signed-off-by: Wei Liu --- drivers/hv/hv_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 36ee89c0358b..7e9c8e169c66 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -586,7 +586,7 @@ static int util_probe(struct hv_device *dev, (struct hv_util_service *)dev_id->driver_data; int ret; - srv->recv_buffer = kmalloc(HV_HYP_PAGE_SIZE * 4, GFP_KERNEL); + srv->recv_buffer = kmalloc_array(4, HV_HYP_PAGE_SIZE, GFP_KERNEL); if (!srv->recv_buffer) return -ENOMEM; srv->channel = dev->channel; From 9d70ef7a18e0ec1653ac63020a13a5d4dda7cc0d Mon Sep 17 00:00:00 2001 From: Jinank Jain Date: Mon, 24 Nov 2025 14:25:59 +0000 Subject: [PATCH 49/58] mshv: adjust interrupt control structure for ARM64 The interrupt control structure (union hv_interrupt_control) has different fields on x86 and ARM64. Bring in the correct structure from the Hyper-V header files and adjust the existing interrupt routing code accordingly.
Signed-off-by: Jinank Jain Signed-off-by: Anirudh Rayabharam (Microsoft) Signed-off-by: Wei Liu --- drivers/hv/mshv_eventfd.c | 6 ++++++ drivers/hv/mshv_irq.c | 4 ++++ drivers/hv/mshv_root_hv_call.c | 6 ++++++ include/hyperv/hvhdk.h | 6 ++++++ 4 files changed, 22 insertions(+) diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c index 2a80af1d610a..d93a18f09c76 100644 --- a/drivers/hv/mshv_eventfd.c +++ b/drivers/hv/mshv_eventfd.c @@ -163,8 +163,10 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd) if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) return -EOPNOTSUPP; +#if IS_ENABLED(CONFIG_X86) if (irq->lapic_control.logical_dest_mode) return -EOPNOTSUPP; +#endif vp = partition->pt_vp_array[irq->lapic_apic_id]; @@ -196,8 +198,10 @@ static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd) unsigned int seq; int idx; +#if IS_ENABLED(CONFIG_X86) WARN_ON(irqfd->irqfd_resampler && !irq->lapic_control.level_triggered); +#endif idx = srcu_read_lock(&partition->pt_irq_srcu); if (irqfd->irqfd_girq_ent.guest_irq_num) { @@ -469,6 +473,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt, init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc); spin_lock_irq(&pt->pt_irqfds_lock); +#if IS_ENABLED(CONFIG_X86) if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) && !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) { /* @@ -479,6 +484,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt, ret = -EINVAL; goto fail; } +#endif ret = 0; hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) { if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx) diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c index d0fb9ef734f4..798e7e1ab06e 100644 --- a/drivers/hv/mshv_irq.c +++ b/drivers/hv/mshv_irq.c @@ -119,6 +119,10 @@ void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent, lirq->lapic_vector = ent->girq_irq_data & 0xFF; lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF; lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8; +#if IS_ENABLED(CONFIG_X86) lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1; lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1; +#elif IS_ENABLED(CONFIG_ARM64) + lirq->lapic_control.asserted = 1; +#endif } diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c index caf02cfa49c9..598eaff4ff29 100644 --- a/drivers/hv/mshv_root_hv_call.c +++ b/drivers/hv/mshv_root_hv_call.c @@ -388,7 +388,13 @@ int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector, memset(input, 0, sizeof(*input)); input->partition_id = partition_id; input->vector = vector; + /* + * NOTE: dest_addr only needs to be provided while asserting an + * interrupt on x86 platform + */ +#if IS_ENABLED(CONFIG_X86) input->dest_addr = dest_addr; +#endif input->control = control; status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL); local_irq_restore(flags); diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index 416c0d45b793..469186df7826 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -579,9 +579,15 @@ union hv_interrupt_control { u64 as_uint64; struct { u32 interrupt_type; /* enum hv_interrupt_type */ +#if IS_ENABLED(CONFIG_X86) u32 level_triggered : 1; u32 logical_dest_mode : 1; u32 rsvd : 30; +#elif IS_ENABLED(CONFIG_ARM64) + u32 rsvd1 : 2; + u32 asserted : 1; + u32 rsvd2 : 29; +#endif } __packed; }; From df4ff5f6cf7864714d66c65ec7df582240a596a4 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Wed, 3 Dec 2025 21:40:39 
+0000 Subject: [PATCH 50/58] mshv: Refactor and rename memory region handling functions Simplify and unify memory region management to improve code clarity and reliability. Consolidate pinning and invalidation logic, adopt consistent naming, and remove redundant checks to reduce complexity. Enhance documentation and update call sites for maintainability. Signed-off-by: Stanislav Kinsburskii Reviewed-by: Nuno Das Neves Reviewed-by: Anirudh Rayabharam (Microsoft) Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 80 +++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index bc15d6f6922f..fec82619684a 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1114,8 +1114,8 @@ mshv_region_map(struct mshv_mem_region *region) } static void -mshv_region_evict_pages(struct mshv_mem_region *region, - u64 page_offset, u64 page_count) +mshv_region_invalidate_pages(struct mshv_mem_region *region, + u64 page_offset, u64 page_count) { if (region->flags.range_pinned) unpin_user_pages(region->pages + page_offset, page_count); @@ -1125,29 +1125,24 @@ mshv_region_evict_pages(struct mshv_mem_region *region, } static void -mshv_region_evict(struct mshv_mem_region *region) +mshv_region_invalidate(struct mshv_mem_region *region) { - mshv_region_evict_pages(region, 0, region->nr_pages); + mshv_region_invalidate_pages(region, 0, region->nr_pages); } static int -mshv_region_populate_pages(struct mshv_mem_region *region, - u64 page_offset, u64 page_count) +mshv_region_pin(struct mshv_mem_region *region) { u64 done_count, nr_pages; struct page **pages; __u64 userspace_addr; int ret; - if (page_offset + page_count > region->nr_pages) - return -EINVAL; - - for (done_count = 0; done_count < page_count; done_count += ret) { - pages = region->pages + page_offset + done_count; + for (done_count = 0; done_count < region->nr_pages; done_count += ret) { + pages = region->pages + done_count; userspace_addr = region->start_uaddr + - (page_offset + done_count) * - HV_HYP_PAGE_SIZE; - nr_pages = min(page_count - done_count, + done_count * HV_HYP_PAGE_SIZE; + nr_pages = min(region->nr_pages - done_count, MSHV_PIN_PAGES_BATCH_SIZE); /* @@ -1158,34 +1153,23 @@ mshv_region_populate_pages(struct mshv_mem_region *region, * with the FOLL_LONGTERM flag does a large temporary * allocation of contiguous memory. */ - if (region->flags.range_pinned) - ret = pin_user_pages_fast(userspace_addr, - nr_pages, - FOLL_WRITE | FOLL_LONGTERM, - pages); - else - ret = -EOPNOTSUPP; - + ret = pin_user_pages_fast(userspace_addr, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages); if (ret < 0) goto release_pages; } - if (PageHuge(region->pages[page_offset])) + if (PageHuge(region->pages[0])) region->flags.large_pages = true; return 0; release_pages: - mshv_region_evict_pages(region, page_offset, done_count); + mshv_region_invalidate_pages(region, 0, done_count); return ret; } -static int -mshv_region_populate(struct mshv_mem_region *region) -{ - return mshv_region_populate_pages(region, 0, region->nr_pages); -} - static struct mshv_mem_region * mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) { @@ -1245,19 +1229,27 @@ static int mshv_partition_create_region(struct mshv_partition *partition, return 0; } -/* - * Map guest ram. if snp, make sure to release that from the host first - * Side Effects: In case of failure, pages are unpinned when feasible. 
+/** + * mshv_prepare_pinned_region - Pin and map memory regions + * @region: Pointer to the memory region structure + * + * This function processes memory regions that are explicitly marked as pinned. + * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based + * population. The function ensures the region is properly populated, handles + * encryption requirements for SNP partitions if applicable, maps the region, + * and performs necessary sharing or eviction operations based on the mapping + * result. + * + * Return: 0 on success, negative error code on failure. */ -static int -mshv_partition_mem_region_map(struct mshv_mem_region *region) +static int mshv_prepare_pinned_region(struct mshv_mem_region *region) { struct mshv_partition *partition = region->partition; int ret; - ret = mshv_region_populate(region); + ret = mshv_region_pin(region); if (ret) { - pt_err(partition, "Failed to populate memory region: %d\n", + pt_err(partition, "Failed to pin memory region: %d\n", ret); goto err_out; } @@ -1275,7 +1267,7 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region) pt_err(partition, "Failed to unshare memory region (guest_pfn: %llu): %d\n", region->start_gfn, ret); - goto evict_region; + goto invalidate_region; } } @@ -1285,7 +1277,7 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region) shrc = mshv_partition_region_share(region); if (!shrc) - goto evict_region; + goto invalidate_region; pt_err(partition, "Failed to share memory region (guest_pfn: %llu): %d\n", @@ -1299,8 +1291,8 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region) return 0; -evict_region: - mshv_region_evict(region); +invalidate_region: + mshv_region_invalidate(region); err_out: return ret; } @@ -1349,7 +1341,7 @@ mshv_map_user_memory(struct mshv_partition *partition, ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn, mmio_pfn, HVPFN_DOWN(mem.size)); else - ret = mshv_partition_mem_region_map(region); + ret = mshv_prepare_pinned_region(region); if (ret) goto errout; @@ -1394,7 +1386,7 @@ mshv_unmap_user_memory(struct mshv_partition *partition, hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn, region->nr_pages, unmap_flags); - mshv_region_evict(region); + mshv_region_invalidate(region); vfree(region); return 0; @@ -1812,7 +1804,7 @@ static void destroy_partition(struct mshv_partition *partition) } } - mshv_region_evict(region); + mshv_region_invalidate(region); vfree(region); } From 6f6aed2c497e8d80d8ed6b5a87c6f65dc7548b8f Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Wed, 3 Dec 2025 21:40:45 +0000 Subject: [PATCH 51/58] mshv: Centralize guest memory region destruction Centralize guest memory region destruction to prevent resource leaks and inconsistent cleanup across unmap and partition destruction paths. Unify region removal, encrypted partition access recovery, and region invalidation to improve maintainability and reliability. Reduce code duplication and make future updates less error-prone by encapsulating cleanup logic in a single helper. 
Signed-off-by: Stanislav Kinsburskii Reviewed-by: Nuno Das Neves Reviewed-by: Anirudh Rayabharam (Microsoft) Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 65 +++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index fec82619684a..ec18984c3f2d 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1356,13 +1356,42 @@ errout: return ret; } +static void mshv_partition_destroy_region(struct mshv_mem_region *region) +{ + struct mshv_partition *partition = region->partition; + u32 unmap_flags = 0; + int ret; + + hlist_del(®ion->hnode); + + if (mshv_partition_encrypted(partition)) { + ret = mshv_partition_region_share(region); + if (ret) { + pt_err(partition, + "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n", + ret); + return; + } + } + + if (region->flags.large_pages) + unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; + + /* ignore unmap failures and continue as process may be exiting */ + hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn, + region->nr_pages, unmap_flags); + + mshv_region_invalidate(region); + + vfree(region); +} + /* Called for unmapping both the guest ram and the mmio space */ static long mshv_unmap_user_memory(struct mshv_partition *partition, struct mshv_user_mem_region mem) { struct mshv_mem_region *region; - u32 unmap_flags = 0; if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) return -EINVAL; @@ -1377,18 +1406,8 @@ mshv_unmap_user_memory(struct mshv_partition *partition, region->nr_pages != HVPFN_DOWN(mem.size)) return -EINVAL; - hlist_del(®ion->hnode); + mshv_partition_destroy_region(region); - if (region->flags.large_pages) - unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; - - /* ignore unmap failures and continue as process may be exiting */ - hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn, - region->nr_pages, unmap_flags); - - mshv_region_invalidate(region); - - vfree(region); return 0; } @@ -1724,8 +1743,8 @@ static void destroy_partition(struct mshv_partition *partition) { struct mshv_vp *vp; struct mshv_mem_region *region; - int i, ret; struct hlist_node *n; + int i; if (refcount_read(&partition->pt_ref_count)) { pt_err(partition, @@ -1789,25 +1808,9 @@ static void destroy_partition(struct mshv_partition *partition) remove_partition(partition); - /* Remove regions, regain access to the memory and unpin the pages */ hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, - hnode) { - hlist_del(®ion->hnode); - - if (mshv_partition_encrypted(partition)) { - ret = mshv_partition_region_share(region); - if (ret) { - pt_err(partition, - "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n", - ret); - return; - } - } - - mshv_region_invalidate(region); - - vfree(region); - } + hnode) + mshv_partition_destroy_region(region); /* Withdraw and free all pages we deposited */ hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); From e950c30a1051d27fd6dd9a48c53ffbc41ee773f2 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Wed, 3 Dec 2025 21:40:50 +0000 Subject: [PATCH 52/58] mshv: Move region management to mshv_regions.c Refactor memory region management functions from mshv_root_main.c into mshv_regions.c for better modularity and code organization. Adjust function calls and headers to use the new implementation. Improve maintainability and separation of concerns in the mshv_root module. 
Signed-off-by: Stanislav Kinsburskii Reviewed-by: Anirudh Rayabharam (Microsoft) Reviewed-by: Nuno Das Neves Signed-off-by: Wei Liu --- drivers/hv/Makefile | 2 +- drivers/hv/mshv_regions.c | 175 +++++++++++++++++++++++++++++++++++ drivers/hv/mshv_root.h | 10 ++ drivers/hv/mshv_root_main.c | 176 +++--------------------------------- 4 files changed, 198 insertions(+), 165 deletions(-) create mode 100644 drivers/hv/mshv_regions.c diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index 6d929fb0e13d..a49f93c2d245 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -14,7 +14,7 @@ hv_vmbus-y := vmbus_drv.o \ hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \ - mshv_root_hv_call.o mshv_portid_table.o + mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o mshv_vtl-y := mshv_vtl_main.o # Code that must be built-in diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c new file mode 100644 index 000000000000..35b866670840 --- /dev/null +++ b/drivers/hv/mshv_regions.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025, Microsoft Corporation. + * + * Memory region management for mshv_root module. + * + * Authors: Microsoft Linux virtualization team + */ + +#include +#include + +#include + +#include "mshv_root.h" + +struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, + u64 uaddr, u32 flags, + bool is_mmio) +{ + struct mshv_mem_region *region; + + region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages); + if (!region) + return ERR_PTR(-ENOMEM); + + region->nr_pages = nr_pages; + region->start_gfn = guest_pfn; + region->start_uaddr = uaddr; + region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE; + if (flags & BIT(MSHV_SET_MEM_BIT_WRITABLE)) + region->hv_map_flags |= HV_MAP_GPA_WRITABLE; + if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) + region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; + + /* Note: large_pages flag populated when we pin the pages */ + if (!is_mmio) + region->flags.range_pinned = true; + + return region; +} + +int mshv_region_share(struct mshv_mem_region *region) +{ + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED; + + if (region->flags.large_pages) + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + + return hv_call_modify_spa_host_access(region->partition->pt_id, + region->pages, region->nr_pages, + HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE, + flags, true); +} + +int mshv_region_unshare(struct mshv_mem_region *region) +{ + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE; + + if (region->flags.large_pages) + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + + return hv_call_modify_spa_host_access(region->partition->pt_id, + region->pages, region->nr_pages, + 0, + flags, false); +} + +static int mshv_region_remap_pages(struct mshv_mem_region *region, + u32 map_flags, + u64 page_offset, u64 page_count) +{ + if (page_offset + page_count > region->nr_pages) + return -EINVAL; + + if (region->flags.large_pages) + map_flags |= HV_MAP_GPA_LARGE_PAGE; + + return hv_call_map_gpa_pages(region->partition->pt_id, + region->start_gfn + page_offset, + page_count, map_flags, + region->pages + page_offset); +} + +int mshv_region_map(struct mshv_mem_region *region) +{ + u32 map_flags = region->hv_map_flags; + + return mshv_region_remap_pages(region, map_flags, + 0, region->nr_pages); +} + +static void mshv_region_invalidate_pages(struct 
mshv_mem_region *region, + u64 page_offset, u64 page_count) +{ + if (region->flags.range_pinned) + unpin_user_pages(region->pages + page_offset, page_count); + + memset(region->pages + page_offset, 0, + page_count * sizeof(struct page *)); +} + +void mshv_region_invalidate(struct mshv_mem_region *region) +{ + mshv_region_invalidate_pages(region, 0, region->nr_pages); +} + +int mshv_region_pin(struct mshv_mem_region *region) +{ + u64 done_count, nr_pages; + struct page **pages; + __u64 userspace_addr; + int ret; + + for (done_count = 0; done_count < region->nr_pages; done_count += ret) { + pages = region->pages + done_count; + userspace_addr = region->start_uaddr + + done_count * HV_HYP_PAGE_SIZE; + nr_pages = min(region->nr_pages - done_count, + MSHV_PIN_PAGES_BATCH_SIZE); + + /* + * Pinning assuming 4k pages works for large pages too. + * All page structs within the large page are returned. + * + * Pin requests are batched because pin_user_pages_fast + * with the FOLL_LONGTERM flag does a large temporary + * allocation of contiguous memory. + */ + ret = pin_user_pages_fast(userspace_addr, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages); + if (ret < 0) + goto release_pages; + } + + if (PageHuge(region->pages[0])) + region->flags.large_pages = true; + + return 0; + +release_pages: + mshv_region_invalidate_pages(region, 0, done_count); + return ret; +} + +void mshv_region_destroy(struct mshv_mem_region *region) +{ + struct mshv_partition *partition = region->partition; + u32 unmap_flags = 0; + int ret; + + hlist_del(®ion->hnode); + + if (mshv_partition_encrypted(partition)) { + ret = mshv_region_share(region); + if (ret) { + pt_err(partition, + "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n", + ret); + return; + } + } + + if (region->flags.large_pages) + unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; + + /* ignore unmap failures and continue as process may be exiting */ + hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn, + region->nr_pages, unmap_flags); + + mshv_region_invalidate(region); + + vfree(region); +} diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index 3eb815011b46..0366f416c2f0 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -312,4 +312,14 @@ extern struct mshv_root mshv_root; extern enum hv_scheduler_type hv_scheduler_type; extern u8 * __percpu *hv_synic_eventring_tail; +struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, + u64 uaddr, u32 flags, + bool is_mmio); +int mshv_region_share(struct mshv_mem_region *region); +int mshv_region_unshare(struct mshv_mem_region *region); +int mshv_region_map(struct mshv_mem_region *region); +void mshv_region_invalidate(struct mshv_mem_region *region); +int mshv_region_pin(struct mshv_mem_region *region); +void mshv_region_destroy(struct mshv_mem_region *region); + #endif /* _MSHV_ROOT_H_ */ diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index ec18984c3f2d..5dfb933da981 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1059,117 +1059,6 @@ static void mshv_async_hvcall_handler(void *data, u64 *status) *status = partition->async_hypercall_status; } -static int -mshv_partition_region_share(struct mshv_mem_region *region) -{ - u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED; - - if (region->flags.large_pages) - flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; - - return hv_call_modify_spa_host_access(region->partition->pt_id, - region->pages, region->nr_pages, - 
HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE, - flags, true); -} - -static int -mshv_partition_region_unshare(struct mshv_mem_region *region) -{ - u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE; - - if (region->flags.large_pages) - flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; - - return hv_call_modify_spa_host_access(region->partition->pt_id, - region->pages, region->nr_pages, - 0, - flags, false); -} - -static int -mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags, - u64 page_offset, u64 page_count) -{ - if (page_offset + page_count > region->nr_pages) - return -EINVAL; - - if (region->flags.large_pages) - map_flags |= HV_MAP_GPA_LARGE_PAGE; - - /* ask the hypervisor to map guest ram */ - return hv_call_map_gpa_pages(region->partition->pt_id, - region->start_gfn + page_offset, - page_count, map_flags, - region->pages + page_offset); -} - -static int -mshv_region_map(struct mshv_mem_region *region) -{ - u32 map_flags = region->hv_map_flags; - - return mshv_region_remap_pages(region, map_flags, - 0, region->nr_pages); -} - -static void -mshv_region_invalidate_pages(struct mshv_mem_region *region, - u64 page_offset, u64 page_count) -{ - if (region->flags.range_pinned) - unpin_user_pages(region->pages + page_offset, page_count); - - memset(region->pages + page_offset, 0, - page_count * sizeof(struct page *)); -} - -static void -mshv_region_invalidate(struct mshv_mem_region *region) -{ - mshv_region_invalidate_pages(region, 0, region->nr_pages); -} - -static int -mshv_region_pin(struct mshv_mem_region *region) -{ - u64 done_count, nr_pages; - struct page **pages; - __u64 userspace_addr; - int ret; - - for (done_count = 0; done_count < region->nr_pages; done_count += ret) { - pages = region->pages + done_count; - userspace_addr = region->start_uaddr + - done_count * HV_HYP_PAGE_SIZE; - nr_pages = min(region->nr_pages - done_count, - MSHV_PIN_PAGES_BATCH_SIZE); - - /* - * Pinning assuming 4k pages works for large pages too. - * All page structs within the large page are returned. - * - * Pin requests are batched because pin_user_pages_fast - * with the FOLL_LONGTERM flag does a large temporary - * allocation of contiguous memory. 
- */ - ret = pin_user_pages_fast(userspace_addr, nr_pages, - FOLL_WRITE | FOLL_LONGTERM, - pages); - if (ret < 0) - goto release_pages; - } - - if (PageHuge(region->pages[0])) - region->flags.large_pages = true; - - return 0; - -release_pages: - mshv_region_invalidate_pages(region, 0, done_count); - return ret; -} - static struct mshv_mem_region * mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) { @@ -1193,7 +1082,7 @@ static int mshv_partition_create_region(struct mshv_partition *partition, struct mshv_mem_region **regionpp, bool is_mmio) { - struct mshv_mem_region *region, *rg; + struct mshv_mem_region *rg; u64 nr_pages = HVPFN_DOWN(mem->size); /* Reject overlapping regions */ @@ -1205,26 +1094,15 @@ static int mshv_partition_create_region(struct mshv_partition *partition, return -EEXIST; } - region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages); - if (!region) - return -ENOMEM; + rg = mshv_region_create(mem->guest_pfn, nr_pages, + mem->userspace_addr, mem->flags, + is_mmio); + if (IS_ERR(rg)) + return PTR_ERR(rg); - region->nr_pages = nr_pages; - region->start_gfn = mem->guest_pfn; - region->start_uaddr = mem->userspace_addr; - region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE; - if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE)) - region->hv_map_flags |= HV_MAP_GPA_WRITABLE; - if (mem->flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) - region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; + rg->partition = partition; - /* Note: large_pages flag populated when we pin the pages */ - if (!is_mmio) - region->flags.range_pinned = true; - - region->partition = partition; - - *regionpp = region; + *regionpp = rg; return 0; } @@ -1262,7 +1140,7 @@ static int mshv_prepare_pinned_region(struct mshv_mem_region *region) * access to guest memory regions. 
*/ if (mshv_partition_encrypted(partition)) { - ret = mshv_partition_region_unshare(region); + ret = mshv_region_unshare(region); if (ret) { pt_err(partition, "Failed to unshare memory region (guest_pfn: %llu): %d\n", @@ -1275,7 +1153,7 @@ static int mshv_prepare_pinned_region(struct mshv_mem_region *region) if (ret && mshv_partition_encrypted(partition)) { int shrc; - shrc = mshv_partition_region_share(region); + shrc = mshv_region_share(region); if (!shrc) goto invalidate_region; @@ -1356,36 +1234,6 @@ errout: return ret; } -static void mshv_partition_destroy_region(struct mshv_mem_region *region) -{ - struct mshv_partition *partition = region->partition; - u32 unmap_flags = 0; - int ret; - - hlist_del(®ion->hnode); - - if (mshv_partition_encrypted(partition)) { - ret = mshv_partition_region_share(region); - if (ret) { - pt_err(partition, - "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n", - ret); - return; - } - } - - if (region->flags.large_pages) - unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; - - /* ignore unmap failures and continue as process may be exiting */ - hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn, - region->nr_pages, unmap_flags); - - mshv_region_invalidate(region); - - vfree(region); -} - /* Called for unmapping both the guest ram and the mmio space */ static long mshv_unmap_user_memory(struct mshv_partition *partition, @@ -1406,7 +1254,7 @@ mshv_unmap_user_memory(struct mshv_partition *partition, region->nr_pages != HVPFN_DOWN(mem.size)) return -EINVAL; - mshv_partition_destroy_region(region); + mshv_region_destroy(region); return 0; } @@ -1810,7 +1658,7 @@ static void destroy_partition(struct mshv_partition *partition) hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, hnode) - mshv_partition_destroy_region(region); + mshv_region_destroy(region); /* Withdraw and free all pages we deposited */ hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); From abceb4297bf88340ce06016895babe292510a262 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Wed, 3 Dec 2025 21:40:56 +0000 Subject: [PATCH 53/58] mshv: Fix huge page handling in memory region traversal The previous code assumed that if a region's first page was huge, the entire region consisted of huge pages and stored this in a large_pages flag. This premise is incorrect not only for movable regions (where pages can be split and merged on invalidate callbacks or page faults), but even for pinned regions: THPs can be split and merged during allocation, so a large, pinned region may contain a mix of huge and regular pages. This change removes the large_pages flag and replaces region-wide assumptions with per-chunk inspection of the actual page size when mapping, unmapping, sharing, and unsharing. This makes huge page handling correct for mixed-page regions and avoids relying on stale metadata that can easily become invalid as memory is remapped. 
Signed-off-by: Stanislav Kinsburskii Reviewed-by: Anirudh Rayabharam (Microsoft) Reviewed-by: Nuno Das Neves Signed-off-by: Wei Liu --- drivers/hv/mshv_regions.c | 223 ++++++++++++++++++++++++++++++++------ drivers/hv/mshv_root.h | 3 +- 2 files changed, 193 insertions(+), 33 deletions(-) diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c index 35b866670840..40126c12ab33 100644 --- a/drivers/hv/mshv_regions.c +++ b/drivers/hv/mshv_regions.c @@ -14,6 +14,124 @@ #include "mshv_root.h" +/** + * mshv_region_process_chunk - Processes a contiguous chunk of memory pages + * in a region. + * @region : Pointer to the memory region structure. + * @flags : Flags to pass to the handler. + * @page_offset: Offset into the region's pages array to start processing. + * @page_count : Number of pages to process. + * @handler : Callback function to handle the chunk. + * + * This function scans the region's pages starting from @page_offset, + * checking for contiguous present pages of the same size (normal or huge). + * It invokes @handler for the chunk of contiguous pages found. Returns the + * number of pages handled, or a negative error code if the first page is + * not present or the handler fails. + * + * Note: The @handler callback must be able to handle both normal and huge + * pages. + * + * Return: Number of pages handled, or negative error code. + */ +static long mshv_region_process_chunk(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + int (*handler)(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, + u64 page_count)) +{ + u64 count, stride; + unsigned int page_order; + struct page *page; + int ret; + + page = region->pages[page_offset]; + if (!page) + return -EINVAL; + + page_order = folio_order(page_folio(page)); + /* The hypervisor only supports 4K and 2M page sizes */ + if (page_order && page_order != HPAGE_PMD_ORDER) + return -EINVAL; + + stride = 1 << page_order; + + /* Start at stride since the first page is validated */ + for (count = stride; count < page_count; count += stride) { + page = region->pages[page_offset + count]; + + /* Break if current page is not present */ + if (!page) + break; + + /* Break if page size changes */ + if (page_order != folio_order(page_folio(page))) + break; + } + + ret = handler(region, flags, page_offset, count); + if (ret) + return ret; + + return count; +} + +/** + * mshv_region_process_range - Processes a range of memory pages in a + * region. + * @region : Pointer to the memory region structure. + * @flags : Flags to pass to the handler. + * @page_offset: Offset into the region's pages array to start processing. + * @page_count : Number of pages to process. + * @handler : Callback function to handle each chunk of contiguous + * pages. + * + * Iterates over the specified range of pages in @region, skipping + * non-present pages. For each contiguous chunk of present pages, invokes + * @handler via mshv_region_process_chunk. + * + * Note: The @handler callback must be able to handle both normal and huge + * pages. + * + * Returns 0 on success, or a negative error code on failure. 
+ */ +static int mshv_region_process_range(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + int (*handler)(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, + u64 page_count)) +{ + long ret; + + if (page_offset + page_count > region->nr_pages) + return -EINVAL; + + while (page_count) { + /* Skip non-present pages */ + if (!region->pages[page_offset]) { + page_offset++; + page_count--; + continue; + } + + ret = mshv_region_process_chunk(region, flags, + page_offset, + page_count, + handler); + if (ret < 0) + return ret; + + page_offset += ret; + page_count -= ret; + } + + return 0; +} + struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, u64 uaddr, u32 flags, bool is_mmio) @@ -33,53 +151,84 @@ struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; - /* Note: large_pages flag populated when we pin the pages */ if (!is_mmio) region->flags.range_pinned = true; return region; } +static int mshv_region_chunk_share(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + + return hv_call_modify_spa_host_access(region->partition->pt_id, + region->pages + page_offset, + page_count, + HV_MAP_GPA_READABLE | + HV_MAP_GPA_WRITABLE, + flags, true); +} + int mshv_region_share(struct mshv_mem_region *region) { u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED; - if (region->flags.large_pages) + return mshv_region_process_range(region, flags, + 0, region->nr_pages, + mshv_region_chunk_share); +} + +static int mshv_region_chunk_unshare(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; return hv_call_modify_spa_host_access(region->partition->pt_id, - region->pages, region->nr_pages, - HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE, - flags, true); + region->pages + page_offset, + page_count, 0, + flags, false); } int mshv_region_unshare(struct mshv_mem_region *region) { u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE; - if (region->flags.large_pages) - flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + return mshv_region_process_range(region, flags, + 0, region->nr_pages, + mshv_region_chunk_unshare); +} - return hv_call_modify_spa_host_access(region->partition->pt_id, - region->pages, region->nr_pages, - 0, - flags, false); +static int mshv_region_chunk_remap(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_MAP_GPA_LARGE_PAGE; + + return hv_call_map_gpa_pages(region->partition->pt_id, + region->start_gfn + page_offset, + page_count, flags, + region->pages + page_offset); } static int mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags, u64 page_offset, u64 page_count) { - if (page_offset + page_count > region->nr_pages) - return -EINVAL; - - if (region->flags.large_pages) - map_flags |= HV_MAP_GPA_LARGE_PAGE; - - return hv_call_map_gpa_pages(region->partition->pt_id, - region->start_gfn + page_offset, - page_count, map_flags, - region->pages + page_offset); + return 
mshv_region_process_range(region, map_flags, + page_offset, page_count, + mshv_region_chunk_remap); } int mshv_region_map(struct mshv_mem_region *region) @@ -134,9 +283,6 @@ int mshv_region_pin(struct mshv_mem_region *region) goto release_pages; } - if (PageHuge(region->pages[0])) - region->flags.large_pages = true; - return 0; release_pages: @@ -144,10 +290,30 @@ release_pages: return ret; } +static int mshv_region_chunk_unmap(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_UNMAP_GPA_LARGE_PAGE; + + return hv_call_unmap_gpa_pages(region->partition->pt_id, + region->start_gfn + page_offset, + page_count, flags); +} + +static int mshv_region_unmap(struct mshv_mem_region *region) +{ + return mshv_region_process_range(region, 0, + 0, region->nr_pages, + mshv_region_chunk_unmap); +} + void mshv_region_destroy(struct mshv_mem_region *region) { struct mshv_partition *partition = region->partition; - u32 unmap_flags = 0; int ret; hlist_del(®ion->hnode); @@ -162,12 +328,7 @@ void mshv_region_destroy(struct mshv_mem_region *region) } } - if (region->flags.large_pages) - unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; - - /* ignore unmap failures and continue as process may be exiting */ - hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn, - region->nr_pages, unmap_flags); + mshv_region_unmap(region); mshv_region_invalidate(region); diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index 0366f416c2f0..ff3374f13691 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -77,9 +77,8 @@ struct mshv_mem_region { u64 start_uaddr; u32 hv_map_flags; struct { - u64 large_pages: 1; /* 2MiB */ u64 range_pinned: 1; - u64 reserved: 62; + u64 reserved: 63; } flags; struct mshv_partition *partition; struct page *pages[]; From c39dda08286f4d5ce4d114f8d5dbfdb85effbd6a Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Wed, 3 Dec 2025 21:41:02 +0000 Subject: [PATCH 54/58] mshv: Add refcount and locking to mem regions Introduce kref-based reference counting and spinlock protection for memory regions in Hyper-V partition management. This change improves memory region lifecycle management and ensures thread-safe access to the region list. Previously, the regions list was protected by the partition mutex. However, this approach is too heavy for frequent fault and invalidation operations. Finer grained locking is now used to improve efficiency and concurrency. This is a precursor to supporting movable memory regions. Fault and invalidation handling for movable regions will require safe traversal of the region list and holding a region reference while performing invalidation or fault operations. 
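As an illustration of that intended pattern (not part of this patch), a lookup on a fault or invalidation path could take the spinlock only long enough to find the region and grab a reference, then drop the lock before doing any heavy work. The helper name below is hypothetical:

	/*
	 * Hypothetical sketch: find the region covering @gfn and take a
	 * reference under pt_mem_regions_lock. Returns NULL if no region
	 * covers @gfn or if the region is already being torn down
	 * (mshv_region_get() wraps kref_get_unless_zero()).
	 */
	static struct mshv_mem_region *
	mshv_region_find_get(struct mshv_partition *partition, u64 gfn)
	{
		struct mshv_mem_region *region, *found = NULL;

		spin_lock(&partition->pt_mem_regions_lock);
		hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
			if (gfn < region->start_gfn ||
			    gfn >= region->start_gfn + region->nr_pages)
				continue;
			if (mshv_region_get(region))
				found = region;
			break;
		}
		spin_unlock(&partition->pt_mem_regions_lock);

		return found;
	}

The caller would then operate on the region outside the spinlock and drop its reference with mshv_region_put() once the fault or invalidation work is complete.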
Signed-off-by: Stanislav Kinsburskii Signed-off-by: Wei Liu --- drivers/hv/mshv_regions.c | 19 ++++++++++++++++--- drivers/hv/mshv_root.h | 6 +++++- drivers/hv/mshv_root_main.c | 32 ++++++++++++++++++++++++-------- 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c index 40126c12ab33..4ec78fdaf56d 100644 --- a/drivers/hv/mshv_regions.c +++ b/drivers/hv/mshv_regions.c @@ -7,6 +7,7 @@ * Authors: Microsoft Linux virtualization team */ +#include #include #include @@ -154,6 +155,8 @@ struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, if (!is_mmio) region->flags.range_pinned = true; + kref_init(®ion->refcount); + return region; } @@ -311,13 +314,13 @@ static int mshv_region_unmap(struct mshv_mem_region *region) mshv_region_chunk_unmap); } -void mshv_region_destroy(struct mshv_mem_region *region) +static void mshv_region_destroy(struct kref *ref) { + struct mshv_mem_region *region = + container_of(ref, struct mshv_mem_region, refcount); struct mshv_partition *partition = region->partition; int ret; - hlist_del(®ion->hnode); - if (mshv_partition_encrypted(partition)) { ret = mshv_region_share(region); if (ret) { @@ -334,3 +337,13 @@ void mshv_region_destroy(struct mshv_mem_region *region) vfree(region); } + +void mshv_region_put(struct mshv_mem_region *region) +{ + kref_put(®ion->refcount, mshv_region_destroy); +} + +int mshv_region_get(struct mshv_mem_region *region) +{ + return kref_get_unless_zero(®ion->refcount); +} diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index ff3374f13691..4249534ba900 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -72,6 +72,7 @@ do { \ struct mshv_mem_region { struct hlist_node hnode; + struct kref refcount; u64 nr_pages; u64 start_gfn; u64 start_uaddr; @@ -97,6 +98,8 @@ struct mshv_partition { u64 pt_id; refcount_t pt_ref_count; struct mutex pt_mutex; + + spinlock_t pt_mem_regions_lock; struct hlist_head pt_mem_regions; // not ordered u32 pt_vp_count; @@ -319,6 +322,7 @@ int mshv_region_unshare(struct mshv_mem_region *region); int mshv_region_map(struct mshv_mem_region *region); void mshv_region_invalidate(struct mshv_mem_region *region); int mshv_region_pin(struct mshv_mem_region *region); -void mshv_region_destroy(struct mshv_mem_region *region); +void mshv_region_put(struct mshv_mem_region *region); +int mshv_region_get(struct mshv_mem_region *region); #endif /* _MSHV_ROOT_H_ */ diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 5dfb933da981..aa1a11f4dc3e 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1086,13 +1086,15 @@ static int mshv_partition_create_region(struct mshv_partition *partition, u64 nr_pages = HVPFN_DOWN(mem->size); /* Reject overlapping regions */ + spin_lock(&partition->pt_mem_regions_lock); hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) { if (mem->guest_pfn + nr_pages <= rg->start_gfn || rg->start_gfn + rg->nr_pages <= mem->guest_pfn) continue; - + spin_unlock(&partition->pt_mem_regions_lock); return -EEXIST; } + spin_unlock(&partition->pt_mem_regions_lock); rg = mshv_region_create(mem->guest_pfn, nr_pages, mem->userspace_addr, mem->flags, @@ -1224,8 +1226,9 @@ mshv_map_user_memory(struct mshv_partition *partition, if (ret) goto errout; - /* Install the new region */ + spin_lock(&partition->pt_mem_regions_lock); hlist_add_head(®ion->hnode, &partition->pt_mem_regions); + spin_unlock(&partition->pt_mem_regions_lock); return 0; @@ -1244,17 +1247,27 @@ 
mshv_unmap_user_memory(struct mshv_partition *partition, if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) return -EINVAL; + spin_lock(&partition->pt_mem_regions_lock); + region = mshv_partition_region_by_gfn(partition, mem.guest_pfn); - if (!region) - return -EINVAL; + if (!region) { + spin_unlock(&partition->pt_mem_regions_lock); + return -ENOENT; + } /* Paranoia check */ if (region->start_uaddr != mem.userspace_addr || region->start_gfn != mem.guest_pfn || - region->nr_pages != HVPFN_DOWN(mem.size)) + region->nr_pages != HVPFN_DOWN(mem.size)) { + spin_unlock(&partition->pt_mem_regions_lock); return -EINVAL; + } - mshv_region_destroy(region); + hlist_del(®ion->hnode); + + spin_unlock(&partition->pt_mem_regions_lock); + + mshv_region_put(region); return 0; } @@ -1657,8 +1670,10 @@ static void destroy_partition(struct mshv_partition *partition) remove_partition(partition); hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, - hnode) - mshv_region_destroy(region); + hnode) { + hlist_del(®ion->hnode); + mshv_region_put(region); + } /* Withdraw and free all pages we deposited */ hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); @@ -1856,6 +1871,7 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) INIT_HLIST_HEAD(&partition->pt_devices); + spin_lock_init(&partition->pt_mem_regions_lock); INIT_HLIST_HEAD(&partition->pt_mem_regions); mshv_eventfd_init(partition); From b9a66cd5ccbb9fade15d0e427e19470d8ad35b75 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Wed, 3 Dec 2025 21:41:09 +0000 Subject: [PATCH 55/58] mshv: Add support for movable memory regions Introduce support for movable memory regions in the Hyper-V root partition driver to improve memory management flexibility and enable advanced use cases such as dynamic memory remapping. Mirror the address space between the Linux root partition and guest VMs using HMM. The root partition owns the memory, while guest VMs act as devices with page tables managed via hypercalls. MSHV handles VP intercepts by invoking hmm_range_fault() and updating SLAT entries. When memory is reclaimed, HMM invalidates the relevant regions, prompting MSHV to clear SLAT entries; guest VMs will fault again on access. Integrate mmu_interval_notifier for movable regions, implement handlers for HMM faults and memory invalidation, and update memory region mapping logic to support movable regions. While MMU notifiers are commonly used in virtualization drivers, this implementation leverages HMM (Heterogeneous Memory Management) for its specialized functionality. HMM provides a framework for mirroring, invalidation, and fault handling, reducing boilerplate and improving maintainability compared to generic MMU notifiers. 
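
For background (illustration only, not part of this patch): the mirroring described above relies on the standard HMM notifier-sequence retry loop. A condensed, self-contained sketch follows, assuming <linux/hmm.h>, <linux/mmu_notifier.h> and <linux/mm.h>; the name fault_in_range() is illustrative, and the driver's mshv_region_hmm_fault_and_lock()/mshv_region_range_fault() below follow the same shape with the per-region mutex as 'lock':

	/* 'lock' must be the same lock taken by the invalidate() callback */
	static int fault_in_range(struct mmu_interval_notifier *mni,
				  struct mutex *lock, unsigned long start,
				  unsigned long npages, unsigned long *hmm_pfns)
	{
		struct hmm_range range = {
			.notifier	= mni,
			.start		= start,
			.end		= start + npages * PAGE_SIZE,
			.hmm_pfns	= hmm_pfns,
			.default_flags	= HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
		};
		int ret;

	again:
		range.notifier_seq = mmu_interval_read_begin(mni);
		mmap_read_lock(mni->mm);
		ret = hmm_range_fault(&range);
		mmap_read_unlock(mni->mm);
		if (ret == -EBUSY)
			goto again;	/* an invalidation raced the fault */
		if (ret)
			return ret;

		mutex_lock(lock);
		if (mmu_interval_read_retry(mni, range.notifier_seq)) {
			mutex_unlock(lock);
			goto again;	/* sequence moved on; fault again */
		}
		/*
		 * hmm_pfns[] is now stable: translate entries with
		 * hmm_pfn_to_page() and program the SLAT before unlocking.
		 */
		mutex_unlock(lock);
		return 0;
	}
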
Signed-off-by: Stanislav Kinsburskii Reviewed-by: Nuno Das Neves Signed-off-by: Wei Liu --- drivers/hv/Kconfig | 2 + drivers/hv/mshv_regions.c | 218 +++++++++++++++++++++++++++++++++++- drivers/hv/mshv_root.h | 20 +++- drivers/hv/mshv_root_main.c | 142 +++++++++++++++++++---- 4 files changed, 346 insertions(+), 36 deletions(-) diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index d4a8d349200c..7937ac0cbd0f 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -76,6 +76,8 @@ config MSHV_ROOT depends on PAGE_SIZE_4KB select EVENTFD select VIRT_XFER_TO_GUEST_WORK + select HMM_MIRROR + select MMU_NOTIFIER default n help Select this option to enable support for booting and running as root diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c index 4ec78fdaf56d..202b9d551e39 100644 --- a/drivers/hv/mshv_regions.c +++ b/drivers/hv/mshv_regions.c @@ -7,6 +7,8 @@ * Authors: Microsoft Linux virtualization team */ +#include +#include #include #include #include @@ -15,6 +17,8 @@ #include "mshv_root.h" +#define MSHV_MAP_FAULT_IN_PAGES PTRS_PER_PMD + /** * mshv_region_process_chunk - Processes a contiguous chunk of memory pages * in a region. @@ -134,8 +138,7 @@ static int mshv_region_process_range(struct mshv_mem_region *region, } struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, - u64 uaddr, u32 flags, - bool is_mmio) + u64 uaddr, u32 flags) { struct mshv_mem_region *region; @@ -152,9 +155,6 @@ struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; - if (!is_mmio) - region->flags.range_pinned = true; - kref_init(®ion->refcount); return region; @@ -245,7 +245,7 @@ int mshv_region_map(struct mshv_mem_region *region) static void mshv_region_invalidate_pages(struct mshv_mem_region *region, u64 page_offset, u64 page_count) { - if (region->flags.range_pinned) + if (region->type == MSHV_REGION_TYPE_MEM_PINNED) unpin_user_pages(region->pages + page_offset, page_count); memset(region->pages + page_offset, 0, @@ -321,6 +321,9 @@ static void mshv_region_destroy(struct kref *ref) struct mshv_partition *partition = region->partition; int ret; + if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE) + mshv_region_movable_fini(region); + if (mshv_partition_encrypted(partition)) { ret = mshv_region_share(region); if (ret) { @@ -347,3 +350,206 @@ int mshv_region_get(struct mshv_mem_region *region) { return kref_get_unless_zero(®ion->refcount); } + +/** + * mshv_region_hmm_fault_and_lock - Handle HMM faults and lock the memory region + * @region: Pointer to the memory region structure + * @range: Pointer to the HMM range structure + * + * This function performs the following steps: + * 1. Reads the notifier sequence for the HMM range. + * 2. Acquires a read lock on the memory map. + * 3. Handles HMM faults for the specified range. + * 4. Releases the read lock on the memory map. + * 5. If successful, locks the memory region mutex. + * 6. Verifies if the notifier sequence has changed during the operation. + * If it has, releases the mutex and returns -EBUSY to match with + * hmm_range_fault() return code for repeating. + * + * Return: 0 on success, a negative error code otherwise. 
+ */ +static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region, + struct hmm_range *range) +{ + int ret; + + range->notifier_seq = mmu_interval_read_begin(range->notifier); + mmap_read_lock(region->mni.mm); + ret = hmm_range_fault(range); + mmap_read_unlock(region->mni.mm); + if (ret) + return ret; + + mutex_lock(®ion->mutex); + + if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) { + mutex_unlock(®ion->mutex); + cond_resched(); + return -EBUSY; + } + + return 0; +} + +/** + * mshv_region_range_fault - Handle memory range faults for a given region. + * @region: Pointer to the memory region structure. + * @page_offset: Offset of the page within the region. + * @page_count: Number of pages to handle. + * + * This function resolves memory faults for a specified range of pages + * within a memory region. It uses HMM (Heterogeneous Memory Management) + * to fault in the required pages and updates the region's page array. + * + * Return: 0 on success, negative error code on failure. + */ +static int mshv_region_range_fault(struct mshv_mem_region *region, + u64 page_offset, u64 page_count) +{ + struct hmm_range range = { + .notifier = ®ion->mni, + .default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE, + }; + unsigned long *pfns; + int ret; + u64 i; + + pfns = kmalloc_array(page_count, sizeof(*pfns), GFP_KERNEL); + if (!pfns) + return -ENOMEM; + + range.hmm_pfns = pfns; + range.start = region->start_uaddr + page_offset * HV_HYP_PAGE_SIZE; + range.end = range.start + page_count * HV_HYP_PAGE_SIZE; + + do { + ret = mshv_region_hmm_fault_and_lock(region, &range); + } while (ret == -EBUSY); + + if (ret) + goto out; + + for (i = 0; i < page_count; i++) + region->pages[page_offset + i] = hmm_pfn_to_page(pfns[i]); + + ret = mshv_region_remap_pages(region, region->hv_map_flags, + page_offset, page_count); + + mutex_unlock(®ion->mutex); +out: + kfree(pfns); + return ret; +} + +bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn) +{ + u64 page_offset, page_count; + int ret; + + /* Align the page offset to the nearest MSHV_MAP_FAULT_IN_PAGES. */ + page_offset = ALIGN_DOWN(gfn - region->start_gfn, + MSHV_MAP_FAULT_IN_PAGES); + + /* Map more pages than requested to reduce the number of faults. */ + page_count = min(region->nr_pages - page_offset, + MSHV_MAP_FAULT_IN_PAGES); + + ret = mshv_region_range_fault(region, page_offset, page_count); + + WARN_ONCE(ret, + "p%llu: GPA intercept failed: region %#llx-%#llx, gfn %#llx, page_offset %llu, page_count %llu\n", + region->partition->pt_id, region->start_uaddr, + region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT), + gfn, page_offset, page_count); + + return !ret; +} + +/** + * mshv_region_interval_invalidate - Invalidate a range of memory region + * @mni: Pointer to the mmu_interval_notifier structure + * @range: Pointer to the mmu_notifier_range structure + * @cur_seq: Current sequence number for the interval notifier + * + * This function invalidates a memory region by remapping its pages with + * no access permissions. It locks the region's mutex to ensure thread safety + * and updates the sequence number for the interval notifier. If the range + * is blockable, it uses a blocking lock; otherwise, it attempts a non-blocking + * lock and returns false if unsuccessful. + * + * NOTE: Failure to invalidate a region is a serious error, as the pages will + * be considered freed while they are still mapped by the hypervisor. + * Any attempt to access such pages will likely crash the system. 
+ * + * Return: true if the region was successfully invalidated, false otherwise. + */ +static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct mshv_mem_region *region = container_of(mni, + struct mshv_mem_region, + mni); + u64 page_offset, page_count; + unsigned long mstart, mend; + int ret = -EPERM; + + if (mmu_notifier_range_blockable(range)) + mutex_lock(®ion->mutex); + else if (!mutex_trylock(®ion->mutex)) + goto out_fail; + + mmu_interval_set_seq(mni, cur_seq); + + mstart = max(range->start, region->start_uaddr); + mend = min(range->end, region->start_uaddr + + (region->nr_pages << HV_HYP_PAGE_SHIFT)); + + page_offset = HVPFN_DOWN(mstart - region->start_uaddr); + page_count = HVPFN_DOWN(mend - mstart); + + ret = mshv_region_remap_pages(region, HV_MAP_GPA_NO_ACCESS, + page_offset, page_count); + if (ret) + goto out_fail; + + mshv_region_invalidate_pages(region, page_offset, page_count); + + mutex_unlock(®ion->mutex); + + return true; + +out_fail: + WARN_ONCE(ret, + "Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n", + region->start_uaddr, + region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT), + range->start, range->end, range->event, + page_offset, page_offset + page_count - 1, (u64)range->mm, ret); + return false; +} + +static const struct mmu_interval_notifier_ops mshv_region_mni_ops = { + .invalidate = mshv_region_interval_invalidate, +}; + +void mshv_region_movable_fini(struct mshv_mem_region *region) +{ + mmu_interval_notifier_remove(®ion->mni); +} + +bool mshv_region_movable_init(struct mshv_mem_region *region) +{ + int ret; + + ret = mmu_interval_notifier_insert(®ion->mni, current->mm, + region->start_uaddr, + region->nr_pages << HV_HYP_PAGE_SHIFT, + &mshv_region_mni_ops); + if (ret) + return false; + + mutex_init(®ion->mutex); + + return true; +} diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index 4249534ba900..3c1d88b36741 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -15,6 +15,7 @@ #include #include #include +#include #include /* @@ -70,6 +71,12 @@ do { \ #define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__) #define vp_dbg(v, fmt, ...) 
vp_devprintk(dbg, v, fmt, ##__VA_ARGS__) +enum mshv_region_type { + MSHV_REGION_TYPE_MEM_PINNED, + MSHV_REGION_TYPE_MEM_MOVABLE, + MSHV_REGION_TYPE_MMIO +}; + struct mshv_mem_region { struct hlist_node hnode; struct kref refcount; @@ -77,11 +84,10 @@ struct mshv_mem_region { u64 start_gfn; u64 start_uaddr; u32 hv_map_flags; - struct { - u64 range_pinned: 1; - u64 reserved: 63; - } flags; struct mshv_partition *partition; + enum mshv_region_type type; + struct mmu_interval_notifier mni; + struct mutex mutex; /* protects region pages remapping */ struct page *pages[]; }; @@ -315,8 +321,7 @@ extern enum hv_scheduler_type hv_scheduler_type; extern u8 * __percpu *hv_synic_eventring_tail; struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, - u64 uaddr, u32 flags, - bool is_mmio); + u64 uaddr, u32 flags); int mshv_region_share(struct mshv_mem_region *region); int mshv_region_unshare(struct mshv_mem_region *region); int mshv_region_map(struct mshv_mem_region *region); @@ -324,5 +329,8 @@ void mshv_region_invalidate(struct mshv_mem_region *region); int mshv_region_pin(struct mshv_mem_region *region); void mshv_region_put(struct mshv_mem_region *region); int mshv_region_get(struct mshv_mem_region *region); +bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn); +void mshv_region_movable_fini(struct mshv_mem_region *region); +bool mshv_region_movable_init(struct mshv_mem_region *region); #endif /* _MSHV_ROOT_H_ */ diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index aa1a11f4dc3e..9cf28a3f12fe 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -594,14 +594,98 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ"); +static struct mshv_mem_region * +mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) +{ + struct mshv_mem_region *region; + + hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { + if (gfn >= region->start_gfn && + gfn < region->start_gfn + region->nr_pages) + return region; + } + + return NULL; +} + +#ifdef CONFIG_X86_64 +static struct mshv_mem_region * +mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn) +{ + struct mshv_mem_region *region; + + spin_lock(&p->pt_mem_regions_lock); + region = mshv_partition_region_by_gfn(p, gfn); + if (!region || !mshv_region_get(region)) { + spin_unlock(&p->pt_mem_regions_lock); + return NULL; + } + spin_unlock(&p->pt_mem_regions_lock); + + return region; +} + +/** + * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts. + * @vp: Pointer to the virtual processor structure. + * + * This function processes GPA intercepts by identifying the memory region + * corresponding to the intercepted GPA, aligning the page offset, and + * mapping the required pages. It ensures that the region is valid and + * handles faults efficiently by mapping multiple pages at once. + * + * Return: true if the intercept was handled successfully, false otherwise. 
+ */ +static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) +{ + struct mshv_partition *p = vp->vp_partition; + struct mshv_mem_region *region; + struct hv_x64_memory_intercept_message *msg; + bool ret; + u64 gfn; + + msg = (struct hv_x64_memory_intercept_message *) + vp->vp_intercept_msg_page->u.payload; + + gfn = HVPFN_DOWN(msg->guest_physical_address); + + region = mshv_partition_region_by_gfn_get(p, gfn); + if (!region) + return false; + + /* Only movable memory ranges are supported for GPA intercepts */ + if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE) + ret = mshv_region_handle_gfn_fault(region, gfn); + else + ret = false; + + mshv_region_put(region); + + return ret; +} +#else /* CONFIG_X86_64 */ +static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; } +#endif /* CONFIG_X86_64 */ + +static bool mshv_vp_handle_intercept(struct mshv_vp *vp) +{ + switch (vp->vp_intercept_msg_page->header.message_type) { + case HVMSG_GPA_INTERCEPT: + return mshv_handle_gpa_intercept(vp); + } + return false; +} + static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg) { long rc; - if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) - rc = mshv_run_vp_with_root_scheduler(vp); - else - rc = mshv_run_vp_with_hyp_scheduler(vp); + do { + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + rc = mshv_run_vp_with_root_scheduler(vp); + else + rc = mshv_run_vp_with_hyp_scheduler(vp); + } while (rc == 0 && mshv_vp_handle_intercept(vp)); if (rc) return rc; @@ -1059,20 +1143,6 @@ static void mshv_async_hvcall_handler(void *data, u64 *status) *status = partition->async_hypercall_status; } -static struct mshv_mem_region * -mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) -{ - struct mshv_mem_region *region; - - hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { - if (gfn >= region->start_gfn && - gfn < region->start_gfn + region->nr_pages) - return region; - } - - return NULL; -} - /* * NB: caller checks and makes sure mem->size is page aligned * Returns: 0 with regionpp updated on success, or -errno @@ -1097,11 +1167,18 @@ static int mshv_partition_create_region(struct mshv_partition *partition, spin_unlock(&partition->pt_mem_regions_lock); rg = mshv_region_create(mem->guest_pfn, nr_pages, - mem->userspace_addr, mem->flags, - is_mmio); + mem->userspace_addr, mem->flags); if (IS_ERR(rg)) return PTR_ERR(rg); + if (is_mmio) + rg->type = MSHV_REGION_TYPE_MMIO; + else if (mshv_partition_encrypted(partition) || + !mshv_region_movable_init(rg)) + rg->type = MSHV_REGION_TYPE_MEM_PINNED; + else + rg->type = MSHV_REGION_TYPE_MEM_MOVABLE; + rg->partition = partition; *regionpp = rg; @@ -1217,11 +1294,28 @@ mshv_map_user_memory(struct mshv_partition *partition, if (ret) return ret; - if (is_mmio) - ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn, - mmio_pfn, HVPFN_DOWN(mem.size)); - else + switch (region->type) { + case MSHV_REGION_TYPE_MEM_PINNED: ret = mshv_prepare_pinned_region(region); + break; + case MSHV_REGION_TYPE_MEM_MOVABLE: + /* + * For movable memory regions, remap with no access to let + * the hypervisor track dirty pages, enabling pre-copy live + * migration. 
+ */ + ret = hv_call_map_gpa_pages(partition->pt_id, + region->start_gfn, + region->nr_pages, + HV_MAP_GPA_NO_ACCESS, NULL); + break; + case MSHV_REGION_TYPE_MMIO: + ret = hv_call_map_mmio_pages(partition->pt_id, + region->start_gfn, + mmio_pfn, + region->nr_pages); + break; + } if (ret) goto errout; From 723c47a221ee407901055c9d9b4434e68c5d650e Mon Sep 17 00:00:00 2001 From: Praveen K Paladugu Date: Fri, 5 Dec 2025 14:17:06 -0600 Subject: [PATCH 56/58] mshv: Add definitions for MSHV sleep state configuration Add the definitions required to configure sleep states in mshv hypervsior. Signed-off-by: Praveen K Paladugu Co-developed-by: Anatol Belski Signed-off-by: Anatol Belski Reviewed-by: Easwar Hariharan Reviewed-by: Nuno Das Neves Acked-by: Stanislav Kinsburskii Signed-off-by: Wei Liu --- include/hyperv/hvgdk_mini.h | 4 +++- include/hyperv/hvhdk_mini.h | 40 +++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 1d5ce11be8b6..04b18d0e37af 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -465,19 +465,21 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_RESET_DEBUG_SESSION 0x006b #define HVCALL_MAP_STATS_PAGE 0x006c #define HVCALL_UNMAP_STATS_PAGE 0x006d +#define HVCALL_SET_SYSTEM_PROPERTY 0x006f #define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 #define HVCALL_GET_SYSTEM_PROPERTY 0x007b #define HVCALL_MAP_DEVICE_INTERRUPT 0x007c #define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d #define HVCALL_RETARGET_INTERRUPT 0x007e #define HVCALL_NOTIFY_PARTITION_EVENT 0x0087 +#define HVCALL_ENTER_SLEEP_STATE 0x0084 #define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b #define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091 #define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 #define HVCALL_CREATE_PORT 0x0095 #define HVCALL_CONNECT_PORT 0x0096 #define HVCALL_START_VP 0x0099 -#define HVCALL_GET_VP_INDEX_FROM_APIC_ID 0x009a +#define HVCALL_GET_VP_INDEX_FROM_APIC_ID 0x009a #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 #define HVCALL_SIGNAL_EVENT_DIRECT 0x00c0 diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index f2d7b50de7a4..41a29bf8ec14 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -140,6 +140,7 @@ enum hv_snp_status { enum hv_system_property { /* Add more values when needed */ + HV_SYSTEM_PROPERTY_SLEEP_STATE = 3, HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15, HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21, HV_SYSTEM_PROPERTY_CRASHDUMPAREA = 47, @@ -155,6 +156,19 @@ union hv_pfn_range { /* HV_SPA_PAGE_RANGE */ } __packed; }; +enum hv_sleep_state { + HV_SLEEP_STATE_S1 = 1, + HV_SLEEP_STATE_S2 = 2, + HV_SLEEP_STATE_S3 = 3, + HV_SLEEP_STATE_S4 = 4, + HV_SLEEP_STATE_S5 = 5, + /* + * After hypervisor has received this, any follow up sleep + * state registration requests will be rejected. 
+ */ + HV_SLEEP_STATE_LOCK = 6 +}; + enum hv_dynamic_processor_feature_property { /* Add more values when needed */ HV_X64_DYNAMIC_PROCESSOR_FEATURE_MAX_ENCRYPTED_PARTITIONS = 13, @@ -184,6 +198,32 @@ struct hv_output_get_system_property { }; } __packed; +struct hv_sleep_state_info { + u32 sleep_state; /* enum hv_sleep_state */ + u8 pm1a_slp_typ; + u8 pm1b_slp_typ; +} __packed; + +struct hv_input_set_system_property { + u32 property_id; /* enum hv_system_property */ + u32 reserved; + union { + /* More fields to be filled in when needed */ + struct hv_sleep_state_info set_sleep_state_info; + + /* + * Add a reserved field to ensure the union is 8-byte aligned as + * existing members may not be. This is a temporary measure + * until all remaining members are added. + */ + u64 reserved0[8]; + }; +} __packed; + +struct hv_input_enter_sleep_state { /* HV_INPUT_ENTER_SLEEP_STATE */ + u32 sleep_state; /* enum hv_sleep_state */ +} __packed; + struct hv_input_map_stats_page { u32 type; /* enum hv_stats_object_type */ u32 padding; From f0be2600ac55a5845d536c56787daca50dbcb2a1 Mon Sep 17 00:00:00 2001 From: Praveen K Paladugu Date: Fri, 5 Dec 2025 14:17:07 -0600 Subject: [PATCH 57/58] mshv: Use reboot notifier to configure sleep state Configure sleep state information from ACPI in MSHV hypervisor using a reboot notifier. This data allows the hypervisor to correctly power off the host during shutdown. Signed-off-by: Praveen K Paladugu Co-developed-by: Anatol Belski Signed-off-by: Anatol Belski Reviewed-by: Easwar Hariharan Reviewed-by: Stansialv Kinsburskii Acked-by: Ingo Molnar Reviewed-by: Nuno Das Neves Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_init.c | 1 + arch/x86/include/asm/mshyperv.h | 1 + drivers/hv/mshv_common.c | 78 +++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index e28737ec7054..daf97a984b78 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -555,6 +555,7 @@ void __init hyperv_init(void) hv_remap_tsc_clocksource(); hv_root_crash_init(); + hv_sleep_notifiers_register(); } else { hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 10037125099a..62a89debfbce 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -177,6 +177,7 @@ int hyperv_flush_guest_mapping_range(u64 as, int hyperv_fill_flush_guest_mapping_list( struct hv_guest_mapping_flush_list *flush, u64 start_gfn, u64 end_gfn); +void hv_sleep_notifiers_register(void); #ifdef CONFIG_X86_64 void hv_apic_init(void); diff --git a/drivers/hv/mshv_common.c b/drivers/hv/mshv_common.c index aa2be51979fd..ee733ba1575e 100644 --- a/drivers/hv/mshv_common.c +++ b/drivers/hv/mshv_common.c @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include "mshv.h" @@ -138,3 +141,78 @@ int hv_call_get_partition_property(u64 partition_id, return 0; } EXPORT_SYMBOL_GPL(hv_call_get_partition_property); + +/* + * Corresponding sleep states have to be initialized in order for a subsequent + * HVCALL_ENTER_SLEEP_STATE call to succeed. Currently only S5 state as per + * ACPI 6.4 chapter 7.4.2 is relevant, while S1, S2 and S3 can be supported. + * + * In order to pass proper PM values to mshv, ACPI should be initialized and + * should support S5 sleep state when this method is invoked. 
+ */ +static int hv_initialize_sleep_states(void) +{ + u64 status; + unsigned long flags; + struct hv_input_set_system_property *in; + acpi_status acpi_status; + u8 sleep_type_a, sleep_type_b; + + if (!acpi_sleep_state_supported(ACPI_STATE_S5)) { + pr_err("%s: S5 sleep state not supported.\n", __func__); + return -ENODEV; + } + + acpi_status = acpi_get_sleep_type_data(ACPI_STATE_S5, &sleep_type_a, + &sleep_type_b); + if (ACPI_FAILURE(acpi_status)) + return -ENODEV; + + local_irq_save(flags); + in = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(in, 0, sizeof(*in)); + + in->property_id = HV_SYSTEM_PROPERTY_SLEEP_STATE; + in->set_sleep_state_info.sleep_state = HV_SLEEP_STATE_S5; + in->set_sleep_state_info.pm1a_slp_typ = sleep_type_a; + in->set_sleep_state_info.pm1b_slp_typ = sleep_type_b; + + status = hv_do_hypercall(HVCALL_SET_SYSTEM_PROPERTY, in, NULL); + local_irq_restore(flags); + + if (!hv_result_success(status)) { + hv_status_err(status, "\n"); + return hv_result_to_errno(status); + } + + return 0; +} + +/* + * This notifier initializes sleep states in mshv hypervisor which will be + * used during power off. + */ +static int hv_reboot_notifier_handler(struct notifier_block *this, + unsigned long code, void *another) +{ + int ret = 0; + + if (code == SYS_HALT || code == SYS_POWER_OFF) + ret = hv_initialize_sleep_states(); + + return ret ? NOTIFY_DONE : NOTIFY_OK; +} + +static struct notifier_block hv_reboot_notifier = { + .notifier_call = hv_reboot_notifier_handler, +}; + +void hv_sleep_notifiers_register(void) +{ + int ret; + + ret = register_reboot_notifier(&hv_reboot_notifier); + if (ret) + pr_err("%s: cannot register reboot notifier %d\n", __func__, + ret); +} From 615a6e7d83f958e7ef3bc818e818f7c6433b4c2a Mon Sep 17 00:00:00 2001 From: Praveen K Paladugu Date: Fri, 5 Dec 2025 14:17:08 -0600 Subject: [PATCH 58/58] mshv: Cleanly shutdown root partition with MSHV Root partitions running on MSHV currently attempt ACPI power-off, which MSHV intercepts and triggers a Machine Check Exception (MCE), leading to a kernel panic. Root partitions panic with a trace similar to: [ 81.306348] reboot: Power down [ 81.314709] mce: [Hardware Error]: CPU 0: Machine Check Exception: 4 Bank 0: b2000000c0060001 [ 81.314711] mce: [Hardware Error]: TSC 3b8cb60a66 PPIN 11d98332458e4ea9 [ 81.314713] mce: [Hardware Error]: PROCESSOR 0:606a6 TIME 1759339405 SOCKET 0 APIC 0 microcode ffffffff [ 81.314715] mce: [Hardware Error]: Run the above through 'mcelog --ascii' [ 81.314716] mce: [Hardware Error]: Machine check: Processor context corrupt [ 81.314717] Kernel panic - not syncing: Fatal machine check To avoid this, configure the sleep state in the hypervisor and invoke the HVCALL_ENTER_SLEEP_STATE hypercall as the final step in the shutdown sequence. This ensures a clean and safe shutdown of the root partition. 
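
For clarity (illustration only): the resulting power-off path on a root partition is roughly the following, with hv_reboot_notifier_handler()/hv_initialize_sleep_states() and HVCALL_SET_SYSTEM_PROPERTY coming from the two preceding patches and hv_machine_power_off() from this one:

	kernel_power_off()
	  -> reboot notifier chain (SYS_POWER_OFF)
	       hv_reboot_notifier_handler()
	         hv_initialize_sleep_states()   /* HVCALL_SET_SYSTEM_PROPERTY: ACPI S5 SLP_TYP values */
	  -> machine_power_off()
	       machine_ops.power_off == hv_machine_power_off()
	         HVCALL_ENTER_SLEEP_STATE       /* does not return on success */
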
Signed-off-by: Praveen K Paladugu Co-developed-by: Anatol Belski Signed-off-by: Anatol Belski Reviewed-by: Easwar Hariharan Acked-by: Stanislav Kinsburskii Signed-off-by: Wei Liu --- arch/x86/include/asm/mshyperv.h | 1 + arch/x86/kernel/cpu/mshyperv.c | 2 ++ drivers/hv/mshv_common.c | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 62a89debfbce..eef4c3a5ba28 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -178,6 +178,7 @@ int hyperv_fill_flush_guest_mapping_list( struct hv_guest_mapping_flush_list *flush, u64 start_gfn, u64 end_gfn); void hv_sleep_notifiers_register(void); +void hv_machine_power_off(void); #ifdef CONFIG_X86_64 void hv_apic_init(void); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index fac9953a72ef..579fb2c64cfd 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -621,6 +621,8 @@ static void __init ms_hyperv_init_platform(void) #endif #if IS_ENABLED(CONFIG_HYPERV) + if (hv_root_partition()) + machine_ops.power_off = hv_machine_power_off; #if defined(CONFIG_KEXEC_CORE) machine_ops.shutdown = hv_machine_shutdown; #endif diff --git a/drivers/hv/mshv_common.c b/drivers/hv/mshv_common.c index ee733ba1575e..58027b23c206 100644 --- a/drivers/hv/mshv_common.c +++ b/drivers/hv/mshv_common.c @@ -216,3 +216,24 @@ void hv_sleep_notifiers_register(void) pr_err("%s: cannot register reboot notifier %d\n", __func__, ret); } + +/* + * Power off the machine by entering S5 sleep state via Hyper-V hypercall. + * This call does not return if successful. + */ +void hv_machine_power_off(void) +{ + unsigned long flags; + struct hv_input_enter_sleep_state *in; + + local_irq_save(flags); + in = *this_cpu_ptr(hyperv_pcpu_input_arg); + in->sleep_state = HV_SLEEP_STATE_S5; + + (void)hv_do_hypercall(HVCALL_ENTER_SLEEP_STATE, in, NULL); + local_irq_restore(flags); + + /* should never reach here */ + BUG(); + +}