1
0
mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2026-01-11 17:10:13 +00:00
Linus Torvalds 51d90a15fe ARM:
- Support for userspace handling of synchronous external aborts (SEAs),
   allowing the VMM to potentially handle the abort in a non-fatal
   manner.
 
 - Large rework of the VGIC's list register handling with the goal of
   supporting more active/pending IRQs than available list registers in
   hardware. In addition, the VGIC now supports EOImode==1 style
   deactivations for IRQs which may occur on a separate vCPU than the
   one that acked the IRQ.
 
 - Support for FEAT_XNX (user / privileged execute permissions) and
   FEAT_HAF (hardware update to the Access Flag) in the software page
   table walkers and shadow MMU.
 
 - Allow page table destruction to reschedule, fixing long need_resched
   latencies observed when destroying a large VM.
 
 - Minor fixes to KVM and selftests
 
 Loongarch:
 
 - Get VM PMU capability from HW GCFG register.
 
 - Add AVEC basic support.
 
 - Use 64-bit register definition for EIOINTC.
 
 - Add KVM timer test cases for tools/selftests.
 
 RISC/V:
 
 - SBI message passing (MPXY) support for KVM guest
 
 - Give a new, more specific error subcode for the case when in-kernel
   AIA virtualization fails to allocate IMSIC VS-file
 
 - Support KVM_DIRTY_LOG_INITIALLY_SET, enabling dirty log gradually
   in small chunks
 
 - Fix guest page fault within HLV* instructions
 
 - Flush VS-stage TLB after VCPU migration for Andes cores
 
 s390:
 
 - Always allocate ESCA (Extended System Control Area), instead of
   starting with the basic SCA and converting to ESCA with the
   addition of the 65th vCPU.  The price is increased number of
   exits (and worse performance) on z10 and earlier processors;
   ESCA was introduced by z114/z196 in 2010.
 
 - VIRT_XFER_TO_GUEST_WORK support
 
 - Operation exception forwarding support
 
 - Cleanups
 
 x86:
 
 - Skip the costly "zap all SPTEs" on an MMIO generation wrap if MMIO SPTE
   caching is disabled, as there can't be any relevant SPTEs to zap.
 
 - Relocate a misplaced export.
 
 - Fix an async #PF bug where KVM would clear the completion queue when the
   guest transitioned in and out of paging mode, e.g. when handling an SMI and
   then returning to paged mode via RSM.
 
 - Leave KVM's user-return notifier registered even when disabling
   virtualization, as long as kvm.ko is loaded.  On reboot/shutdown, keeping
   the notifier registered is ok; the kernel does not use the MSRs and the
   callback will run cleanly and restore host MSRs if the CPU manages to
   return to userspace before the system goes down.
 
 - Use the checked version of {get,put}_user().
 
 - Fix a long-lurking bug where KVM's lack of catch-up logic for periodic APIC
   timers can result in a hard lockup in the host.
 
 - Revert the periodic kvmclock sync logic now that KVM doesn't use a
   clocksource that's subject to NTP corrections.
 
 - Clean up KVM's handling of MMIO Stale Data and L1TF, and bury the latter
   behind CONFIG_CPU_MITIGATIONS.
 
 - Context switch XCR0, XSS, and PKRU outside of the entry/exit fast path;
   the only reason they were handled in the fast path was to paper over a bug
   in the core #MC code, and that has long since been fixed.
 
 - Add emulator support for AVX MOV instructions, to play nice with emulated
   devices whose guest drivers like to access PCI BARs with large multi-byte
   instructions.
 
 x86 (AMD):
 
 - Fix a few missing "VMCB dirty" bugs.
 
 - Fix the worst of KVM's lack of EFER.LMSLE emulation.
 
 - Add AVIC support for addressing 4k vCPUs in x2AVIC mode.
 
 - Fix incorrect handling of selective CR0 writes when checking intercepts
   during emulation of L2 instructions.
 
 - Fix a currently-benign bug where KVM would clobber SPEC_CTRL[63:32] on
   VMRUN and #VMEXIT.
 
 - Fix a bug where KVM corrupts the guest code stream when re-injecting a soft
   interrupt if the guest patched the underlying code after the VM-Exit, e.g.
   when Linux patches code with a temporary INT3.
 
 - Add KVM_X86_SNP_POLICY_BITS to advertise supported SNP policy bits to
   userspace, and extend KVM "support" to all policy bits that don't require
   any actual support from KVM.
 
 x86 (Intel):
 
 - Use the root role from kvm_mmu_page to construct EPTPs instead of the
   current vCPU state, partly as worthwhile cleanup, but mostly to pave the
   way for tracking per-root TLB flushes, and elide EPT flushes on pCPU
   migration if the root is clean from a previous flush.
 
 - Add a few missing nested consistency checks.
 
 - Rip out support for doing "early" consistency checks via hardware as the
   functionality hasn't been used in years and is no longer useful in general;
   replace it with an off-by-default module param to WARN if hardware fails
   a check that KVM does not perform.
 
 - Fix a currently-benign bug where KVM would drop the guest's SPEC_CTRL[63:32]
   on VM-Enter.
 
 - Misc cleanups.
 
 - Overhaul the TDX code to address systemic races where KVM (acting on behalf
   of userspace) could inadvertently trigger lock contention in the TDX-Module;
   KVM was either working around these in weird, ugly ways, or was simply
   oblivious to them (though even Yan's devilish selftests could only break
   individual VMs, not the host kernel)
 
 - Fix a bug where KVM could corrupt a vCPU's cpu_list when freeing a TDX vCPU,
   if creating said vCPU failed partway through.
 
 - Fix a few sparse warnings (bad annotation, 0 != NULL).
 
 - Use struct_size() to simplify copying TDX capabilities to userspace.
 
 - Fix a bug where TDX would effectively corrupt user-return MSR values if the
   TDX Module rejects VP.ENTER and thus doesn't clobber host MSRs as expected.
 
 Selftests:
 
 - Fix a math goof in mmu_stress_test when running on a single-CPU system/VM.
 
 - Forcefully override ARCH from x86_64 to x86 to play nice with specifying
   ARCH=x86_64 on the command line.
 
 - Extend a bunch of nested VMX to validate nested SVM as well.
 
 - Add support for LA57 in the core VM_MODE_xxx macro, and add a test to
   verify KVM can save/restore nested VMX state when L1 is using 5-level
   paging, but L2 is not.
 
 - Clean up the guest paging code in anticipation of sharing the core logic for
   nested EPT and nested NPT.
 
 guest_memfd:
 
 - Add NUMA mempolicy support for guest_memfd, and clean up a variety of
   rough edges in guest_memfd along the way.
 
 - Define a CLASS to automatically handle get+put when grabbing a guest_memfd
   from a memslot to make it harder to leak references.
 
 - Enhance KVM selftests to make it easier to develop and debug selftests like
   those added for guest_memfd NUMA support, e.g. where test and/or KVM bugs
   often result in hard-to-debug SIGBUS errors.
 
 - Misc cleanups.
 
 Generic:
 
 - Use the recently-added WQ_PERCPU when creating the per-CPU workqueue for
   irqfd cleanup.
 
 - Fix a goof in the dirty ring documentation.
 
 - Fix choice of target for directed yield across different calls to
   kvm_vcpu_on_spin(); the function was always starting from the first
   vCPU instead of continuing the round-robin search.
 -----BEGIN PGP SIGNATURE-----
 
 iQFIBAABCgAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmkvMa8UHHBib256aW5p
 QHJlZGhhdC5jb20ACgkQv/vSX3jHroMlFwf+Ow7zOYUuELSQ+Jn+hOYXiCNrdBDx
 ZamvMU8kLPr7XX0Zog6HgcMm//qyA6k5nSfqCjfsQZrIhRA/gWJ61jz1OX/Jxq18
 pJ9Vz6epnEPYiOtBwz+v8OS8MqDqVNzj2i6W1/cLPQE50c1Hhw64HWS5CSxDQiHW
 A7PVfl5YU12lW1vG3uE0sNESDt4Eh/spNM17iddXdF4ZUOGublserjDGjbc17E7H
 8BX3DkC2plqkJKwtjg0ae62hREkITZZc7RqsnftUkEhn0N0H9+rb6NKUyzIVh9NZ
 bCtCjtrKN9zfZ0Mujnms3ugBOVqNIputu/DtPnnFKXtXWSrHrgGSNv5ewA==
 =PEcw
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
 "ARM:

   - Support for userspace handling of synchronous external aborts
     (SEAs), allowing the VMM to potentially handle the abort in a
     non-fatal manner

   - Large rework of the VGIC's list register handling with the goal of
     supporting more active/pending IRQs than available list registers
     in hardware. In addition, the VGIC now supports EOImode==1 style
     deactivations for IRQs which may occur on a separate vCPU than the
     one that acked the IRQ

   - Support for FEAT_XNX (user / privileged execute permissions) and
     FEAT_HAF (hardware update to the Access Flag) in the software page
     table walkers and shadow MMU

   - Allow page table destruction to reschedule, fixing long
     need_resched latencies observed when destroying a large VM

   - Minor fixes to KVM and selftests

  Loongarch:

   - Get VM PMU capability from HW GCFG register

   - Add AVEC basic support

   - Use 64-bit register definition for EIOINTC

   - Add KVM timer test cases for tools/selftests

  RISC/V:

   - SBI message passing (MPXY) support for KVM guest

   - Give a new, more specific error subcode for the case when in-kernel
     AIA virtualization fails to allocate IMSIC VS-file

   - Support KVM_DIRTY_LOG_INITIALLY_SET, enabling dirty log gradually
     in small chunks

   - Fix guest page fault within HLV* instructions

   - Flush VS-stage TLB after VCPU migration for Andes cores

  s390:

   - Always allocate ESCA (Extended System Control Area), instead of
     starting with the basic SCA and converting to ESCA with the
     addition of the 65th vCPU. The price is increased number of exits
     (and worse performance) on z10 and earlier processors; ESCA was
     introduced by z114/z196 in 2010

   - VIRT_XFER_TO_GUEST_WORK support

   - Operation exception forwarding support

   - Cleanups

  x86:

   - Skip the costly "zap all SPTEs" on an MMIO generation wrap if MMIO
     SPTE caching is disabled, as there can't be any relevant SPTEs to
     zap

   - Relocate a misplaced export

   - Fix an async #PF bug where KVM would clear the completion queue
     when the guest transitioned in and out of paging mode, e.g. when
     handling an SMI and then returning to paged mode via RSM

   - Leave KVM's user-return notifier registered even when disabling
     virtualization, as long as kvm.ko is loaded. On reboot/shutdown,
     keeping the notifier registered is ok; the kernel does not use the
     MSRs and the callback will run cleanly and restore host MSRs if the
     CPU manages to return to userspace before the system goes down

   - Use the checked version of {get,put}_user()

   - Fix a long-lurking bug where KVM's lack of catch-up logic for
     periodic APIC timers can result in a hard lockup in the host

   - Revert the periodic kvmclock sync logic now that KVM doesn't use a
     clocksource that's subject to NTP corrections

   - Clean up KVM's handling of MMIO Stale Data and L1TF, and bury the
     latter behind CONFIG_CPU_MITIGATIONS

   - Context switch XCR0, XSS, and PKRU outside of the entry/exit fast
     path; the only reason they were handled in the fast path was to
     paper over a bug in the core #MC code, and that has long since been
     fixed

   - Add emulator support for AVX MOV instructions, to play nice with
     emulated devices whose guest drivers like to access PCI BARs with
     large multi-byte instructions

  x86 (AMD):

   - Fix a few missing "VMCB dirty" bugs

   - Fix the worst of KVM's lack of EFER.LMSLE emulation

   - Add AVIC support for addressing 4k vCPUs in x2AVIC mode

   - Fix incorrect handling of selective CR0 writes when checking
     intercepts during emulation of L2 instructions

   - Fix a currently-benign bug where KVM would clobber SPEC_CTRL[63:32]
     on VMRUN and #VMEXIT

   - Fix a bug where KVM corrupts the guest code stream when re-injecting
     a soft interrupt if the guest patched the underlying code after the
     VM-Exit, e.g. when Linux patches code with a temporary INT3

   - Add KVM_X86_SNP_POLICY_BITS to advertise supported SNP policy bits
     to userspace, and extend KVM "support" to all policy bits that
     don't require any actual support from KVM

  x86 (Intel):

   - Use the root role from kvm_mmu_page to construct EPTPs instead of
     the current vCPU state, partly as worthwhile cleanup, but mostly to
     pave the way for tracking per-root TLB flushes, and elide EPT
     flushes on pCPU migration if the root is clean from a previous
     flush

   - Add a few missing nested consistency checks

   - Rip out support for doing "early" consistency checks via hardware
     as the functionality hasn't been used in years and is no longer
     useful in general; replace it with an off-by-default module param
     to WARN if hardware fails a check that KVM does not perform

   - Fix a currently-benign bug where KVM would drop the guest's
     SPEC_CTRL[63:32] on VM-Enter

   - Misc cleanups

   - Overhaul the TDX code to address systemic races where KVM (acting
     on behalf of userspace) could inadvertently trigger lock contention
     in the TDX-Module; KVM was either working around these in weird,
     ugly ways, or was simply oblivious to them (though even Yan's
     devilish selftests could only break individual VMs, not the host
     kernel)

   - Fix a bug where KVM could corrupt a vCPU's cpu_list when freeing a
     TDX vCPU, if creating said vCPU failed partway through

   - Fix a few sparse warnings (bad annotation, 0 != NULL)

   - Use struct_size() to simplify copying TDX capabilities to userspace

   - Fix a bug where TDX would effectively corrupt user-return MSR
     values if the TDX Module rejects VP.ENTER and thus doesn't clobber
     host MSRs as expected

  Selftests:

   - Fix a math goof in mmu_stress_test when running on a single-CPU
     system/VM

   - Forcefully override ARCH from x86_64 to x86 to play nice with
     specifying ARCH=x86_64 on the command line

   - Extend a bunch of nested VMX to validate nested SVM as well

   - Add support for LA57 in the core VM_MODE_xxx macro, and add a test
     to verify KVM can save/restore nested VMX state when L1 is using
     5-level paging, but L2 is not

   - Clean up the guest paging code in anticipation of sharing the core
     logic for nested EPT and nested NPT

  guest_memfd:

   - Add NUMA mempolicy support for guest_memfd, and clean up a variety
     of rough edges in guest_memfd along the way

   - Define a CLASS to automatically handle get+put when grabbing a
     guest_memfd from a memslot to make it harder to leak references

   - Enhance KVM selftests to make it easier to develop and debug
     selftests like those added for guest_memfd NUMA support, e.g. where
     test and/or KVM bugs often result in hard-to-debug SIGBUS errors

   - Misc cleanups

  Generic:

   - Use the recently-added WQ_PERCPU when creating the per-CPU
     workqueue for irqfd cleanup

   - Fix a goof in the dirty ring documentation

   - Fix choice of target for directed yield across different calls to
     kvm_vcpu_on_spin(); the function was always starting from the first
     vCPU instead of continuing the round-robin search"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (260 commits)
  KVM: arm64: at: Update AF on software walk only if VM has FEAT_HAFDBS
  KVM: arm64: at: Use correct HA bit in TCR_EL2 when regime is EL2
  KVM: arm64: Document KVM_PGTABLE_PROT_{UX,PX}
  KVM: arm64: Fix spelling mistake "Unexpeced" -> "Unexpected"
  KVM: arm64: Add break to default case in kvm_pgtable_stage2_pte_prot()
  KVM: arm64: Add endian casting to kvm_swap_s[12]_desc()
  KVM: arm64: Fix compilation when CONFIG_ARM64_USE_LSE_ATOMICS=n
  KVM: arm64: selftests: Add test for AT emulation
  KVM: arm64: nv: Expose hardware access flag management to NV guests
  KVM: arm64: nv: Implement HW access flag management in stage-2 SW PTW
  KVM: arm64: Implement HW access flag management in stage-1 SW PTW
  KVM: arm64: Propagate PTW errors up to AT emulation
  KVM: arm64: Add helper for swapping guest descriptor
  KVM: arm64: nv: Use pgtable definitions in stage-2 walk
  KVM: arm64: Handle endianness in read helper for emulated PTW
  KVM: arm64: nv: Stop passing vCPU through void ptr in S2 PTW
  KVM: arm64: Call helper for reading descriptors directly
  KVM: arm64: nv: Advertise support for FEAT_XNX
  KVM: arm64: Teach ptdump about FEAT_XNX permissions
  KVM: s390: Use generic VIRT_XFER_TO_GUEST_WORK functions
  ...
2025-12-05 17:01:20 -08:00

807 lines
22 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/rwsem.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/posix_acl_xattr.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include <linux/sched/mm.h>
#include "messages.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "locking.h"
#include "fs.h"
#include "accessors.h"
#include "ioctl.h"
#include "verity.h"
#include "orphan.h"
/*
* Implementation of the interface defined in struct fsverity_operations.
*
* The main question is how and where to store the verity descriptor and the
* Merkle tree. We store both in dedicated btree items in the filesystem tree,
* together with the rest of the inode metadata. This means we'll need to do
* extra work to encrypt them once encryption is supported in btrfs, but btrfs
* has a lot of careful code around i_size and it seems better to make a new key
* type than try and adjust all of our expectations for i_size.
*
* Note that this differs from the implementation in ext4 and f2fs, where
* this data is stored as if it were in the file, but past EOF. However, btrfs
* does not have a widespread mechanism for caching opaque metadata pages, so we
* do pretend that the Merkle tree pages themselves are past EOF for the
* purposes of caching them (as opposed to creating a virtual inode).
*
* fs verity items are stored under two different key types on disk.
* The descriptor items:
* [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
*
* At offset 0, we store a btrfs_verity_descriptor_item which tracks the
* size of the descriptor item and some extra data for encryption.
* Starting at offset 1, these hold the generic fs verity descriptor.
* The latter are opaque to btrfs, we just read and write them as a blob for
* the higher level verity code. The most common descriptor size is 256 bytes.
*
* The merkle tree items:
* [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
*
* These also start at offset 0, and correspond to the merkle tree bytes.
* So when fsverity asks for page 0 of the merkle tree, we pull up one page
* starting at offset 0 for this key type. These are also opaque to btrfs,
* we're blindly storing whatever fsverity sends down.
*
* Another important consideration is the fact that the Merkle tree data scales
* linearly with the size of the file (with 4K pages/blocks and SHA-256, it's
* ~1/127th the size) so for large files, writing the tree can be a lengthy
* operation. For that reason, we guard the whole enable verity operation
* (between begin_enable_verity and end_enable_verity) with an orphan item.
* Again, because the data can be pretty large, it's quite possible that we
* could run out of space writing it, so we try our best to handle errors by
* stopping and rolling back rather than aborting the victim transaction.
*/
#define MERKLE_START_ALIGN 65536
/*
* Compute the logical file offset where we cache the Merkle tree.
*
* @inode: inode of the verity file
*
* For the purposes of caching the Merkle tree pages, as required by
* fs-verity, it is convenient to do size computations in terms of a file
* offset, rather than in terms of page indices.
*
* Use 64K to be sure it's past the last page in the file, even with 64K pages.
* That rounding operation itself can overflow loff_t, so we do it in u64 and
* check.
*
* Returns the file offset on success, negative error code on failure.
*/
static loff_t merkle_file_pos(const struct inode *inode)
{
	/*
	 * Do the alignment in u64: rounding i_size up to the next
	 * MERKLE_START_ALIGN boundary could overflow a signed loff_t.
	 */
	u64 file_size = inode->i_size;
	u64 tree_start = round_up(file_size, MERKLE_START_ALIGN);

	/* Only hand back an offset that fits within the superblock's limit. */
	if (tree_start <= inode->i_sb->s_maxbytes)
		return tree_start;

	return -EFBIG;
}
/*
* Drop all the items for this inode with this key_type.
*
* @inode: inode to drop items for
* @key_type: type of items to drop (BTRFS_VERITY_DESC_ITEM_KEY or
* BTRFS_VERITY_MERKLE_ITEM_KEY)
*
* Before doing a verity enable we cleanup any existing verity items.
* This is also used to clean up if a verity enable failed half way through.
*
* Returns number of dropped items on success, negative error code on failure.
*/
static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_trans_handle *trans;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;
	int dropped = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * Delete one item per iteration, each in its own small transaction.
	 * Every exit taken after the transaction has been started goes
	 * through end_trans so that the handle is always released.
	 */
	for (;;) {
		/* Reserve 1 unit: the single item this iteration deletes */
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		/*
		 * Search for the largest possible offset of this key type and
		 * walk backwards from there, so we always land on the last
		 * remaining item (if any).
		 */
		key.objectid = btrfs_ino(inode);
		key.type = key_type;
		key.offset = (u64)-1;

		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			goto end_trans;
		if (ret > 0) {
			/* Nothing before the search position: we're done */
			if (path->slots[0] == 0) {
				ret = dropped;
				goto end_trans;
			}
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

		/* Landed on a foreign inode or key type: we're done */
		if (key.objectid != btrfs_ino(inode) || key.type != key_type) {
			ret = dropped;
			goto end_trans;
		}

		/*
		 * One-at-a-time deletion is acceptable because this is not on
		 * the truncate path. Should it ever become hot, walk forward
		 * and delete items in bulk instead.
		 */
		ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
		if (ret)
			goto end_trans;

		dropped++;
		btrfs_release_path(path);
		btrfs_end_transaction(trans);
	}

end_trans:
	btrfs_end_transaction(trans);
	return ret;
}
/*
* Drop all verity items
*
* @inode: inode to drop verity items for
*
* In most contexts where we are dropping verity items, we want to do it for all
* the types of verity items, not a particular one.
*
* Returns: 0 on success, negative error code on failure.
*/
int btrfs_drop_verity_items(struct btrfs_inode *inode)
{
	int dropped;

	/* Descriptor items first; only continue to the Merkle items on success */
	dropped = drop_verity_items(inode, BTRFS_VERITY_DESC_ITEM_KEY);
	if (dropped >= 0)
		dropped = drop_verity_items(inode, BTRFS_VERITY_MERKLE_ITEM_KEY);

	/* Callers want success/failure, not the number of items removed */
	if (dropped < 0)
		return dropped;

	return 0;
}
/*
* Insert and write inode items with a given key type and offset.
*
* @inode: inode to insert for
* @key_type: key type to insert
* @offset: item offset to insert at
* @src: source data to write
* @len: length of source data to write
*
* Write len bytes from src into items of up to 2K length.
* The inserted items will have key (ino, key_type, offset + off) where off is
* consecutively increasing from 0 up to the last item ending at offset + len.
*
* Returns 0 on success and a negative error code on failure.
*/
static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
			   const char *src, u64 len)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_trans_handle *trans;
	BTRFS_PATH_AUTO_FREE(path);
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long chunk;
	unsigned long consumed = 0;
	void *item_data;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	for (; len > 0; len -= chunk) {
		/* Reserve 1 unit: the single item this iteration inserts */
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		key.objectid = btrfs_ino(inode);
		key.type = key_type;
		key.offset = offset;

		/*
		 * Cap each item at 2K, mostly to stay friendly to filesystems
		 * with a small leaf size.
		 */
		chunk = min_t(u64, len, 2048);

		ret = btrfs_insert_empty_item(trans, root, path, &key, chunk);
		if (ret) {
			btrfs_end_transaction(trans);
			return ret;
		}

		/* Fill the freshly inserted item from the source buffer */
		leaf = path->nodes[0];
		item_data = btrfs_item_ptr(leaf, path->slots[0], void);
		write_extent_buffer(leaf, src + consumed,
				    (unsigned long)item_data, chunk);

		/* The next item is keyed right after the bytes just written */
		offset += chunk;
		consumed += chunk;

		btrfs_release_path(path);
		btrfs_end_transaction(trans);
	}

	return 0;
}
/*
* Read inode items of the given key type and offset from the btree.
*
* @inode: inode to read items of
* @key_type: key type to read
* @offset: item offset to read from
* @dest: Buffer to read into. This parameter has slightly tricky
* semantics. If it is NULL, the function will not do any copying
* and will just return the size of all the items up to len bytes.
* If dest_folio is passed, then the function will kmap_local the
* folio and ignore dest, but it must still be non-NULL to avoid the
* counting-only behavior.
* @len: length in bytes to read
* @dest_folio: copy into this folio instead of the dest buffer
*
* Helper function to read items from the btree. This returns the number of
* bytes read or < 0 for errors. We can return short reads if the items don't
* exist on disk or aren't big enough to fill the desired length. Supports
* reading into a provided buffer (dest) or into the page cache
*
* Returns number of bytes read or a negative error code on failure.
*/
static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
char *dest, u64 len, struct folio *dest_folio)
{
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct btrfs_key key;
u64 item_end;
u64 copy_end;
int copied = 0;
u32 copy_offset;
unsigned long copy_bytes;
unsigned long dest_offset = 0;
void *data;
char *kaddr = dest;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/* Set the forward readahead hint on the path when filling a folio */
if (dest_folio)
path->reada = READA_FORWARD;
key.objectid = btrfs_ino(inode);
key.type = key_type;
key.offset = offset;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
goto out;
} else if (ret > 0) {
/*
* No exact match for @offset. Step back one slot so the loop below
* examines the preceding item, which may contain @offset in its
* middle. With no preceding slot in this leaf, report 0 bytes read.
*/
ret = 0;
if (path->slots[0] == 0)
goto out;
path->slots[0]--;
}
while (len > 0) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
/* Stop as soon as we leave this inode's items of the wanted type */
if (key.objectid != btrfs_ino(inode) || key.type != key_type)
break;
/* Key offset one past the last byte covered by this item */
item_end = btrfs_item_size(leaf, path->slots[0]) + key.offset;
if (copied > 0) {
/*
* Once we've copied something, we want all of the items
* to be sequential
*/
if (key.offset != offset)
break;
} else {
/*
* Our initial offset might be in the middle of an
* item. Make sure it all makes sense.
*/
if (key.offset > offset)
break;
if (item_end <= offset)
break;
}
/* dest == NULL: just sum all the item lengths */
if (!dest)
copy_end = item_end;
else
copy_end = min(offset + len, item_end);
/* Number of bytes in this item we want to copy */
copy_bytes = copy_end - offset;
/* Offset from the start of item for copying */
copy_offset = offset - key.offset;
if (dest) {
/*
* When a folio is supplied it replaces dest as the copy target: it
* is mapped from offset 0 for each item and unmapped right after
* the copy, with dest_offset tracking the position inside it.
*/
if (dest_folio)
kaddr = kmap_local_folio(dest_folio, 0);
data = btrfs_item_ptr(leaf, path->slots[0], void);
read_extent_buffer(leaf, kaddr + dest_offset,
(unsigned long)data + copy_offset,
copy_bytes);
if (dest_folio)
kunmap_local(kaddr);
}
/* Advance both the key-space position and the output position */
offset += copy_bytes;
dest_offset += copy_bytes;
len -= copy_bytes;
copied += copy_bytes;
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
/*
* We've reached the last slot in this leaf and we need
* to go to the next leaf.
*/
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
break;
} else if (ret > 0) {
/* Positive return: stop here and report what we have (a short read) */
ret = 0;
break;
}
}
}
out:
/* On success return the byte count; a negative error passes through */
if (!ret)
ret = copied;
return ret;
}
/*
* Delete an fsverity orphan
*
* @trans: transaction to do the delete in
* @inode: inode to orphan
*
* Capture verity orphan specific logic that is repeated in the couple places
* we delete verity orphans. Specifically, handling ENOENT and ignoring inodes
* with 0 links.
*
* Returns zero on success or a negative error code on failure.
*/
static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inode)
{
	int err;

	/*
	 * A link count of zero means the inode was already unlinked or was
	 * created with O_TMPFILE; either way that other operation owns an
	 * orphan item for it. Instead of reference counting orphans we leave
	 * it alone: the orphan logic only takes the verity path when i_nlink
	 * is 1.
	 */
	if (inode->vfs_inode.i_nlink == 0)
		return 0;

	err = btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));

	/* An orphan item that is already gone is not an error */
	if (err == -ENOENT)
		return 0;

	return err;
}
/*
* Rollback in-progress verity if we encounter an error.
*
* @inode: inode verity had an error for
*
* We try to handle recoverable errors while enabling verity by rolling it back
* and just failing the operation, rather than having an fs level error no
* matter what. However, any error in rollback is unrecoverable.
*
* Returns 0 on success, negative error code on failure.
*/
static int rollback_verity(struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_trans_handle *trans;
	int ret;

	btrfs_assert_inode_locked(inode);

	/* Discard cached pages beyond i_size and leave the in-progress state */
	truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size);
	clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);

	ret = btrfs_drop_verity_items(inode);
	if (ret) {
		btrfs_handle_fs_error(root->fs_info, ret,
				"failed to drop verity items in rollback %llu",
				(u64)inode->vfs_inode.i_ino);
		return ret;
	}

	/*
	 * Reserve 2 units:
	 * 1 for updating the inode flag
	 * 1 for deleting the orphan
	 */
	trans = btrfs_start_transaction(root, 2);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret,
			"failed to start transaction in verity rollback %llu",
			(u64)inode->vfs_inode.i_ino);
		return ret;
	}

	/* Clear the verity flag, persist the inode, then remove the orphan */
	inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
	btrfs_sync_inode_flags_to_i_flags(inode);
	ret = btrfs_update_inode(trans, inode);
	if (!ret)
		ret = del_orphan(trans, inode);

	/* A failure in either step above aborts the transaction */
	if (unlikely(ret))
		btrfs_abort_transaction(trans, ret);

	btrfs_end_transaction(trans);
	return ret;
}
/*
* Finalize making the file a valid verity file
*
* @inode: inode to be marked as verity
* @desc: contents of the verity descriptor to write (not NULL)
* @desc_size: size of the verity descriptor
*
* Do the actual work of finalizing verity after successfully writing the Merkle
* tree:
*
* - write out the descriptor items
* - mark the inode with the verity flag
* - delete the orphan item
* - mark the ro compat bit
* - clear the in progress bit
*
* Returns 0 on success, negative error code on failure.
*/
static int finish_verity(struct btrfs_inode *inode, const void *desc,
			 size_t desc_size)
{
	struct btrfs_verity_descriptor_item item;
	struct btrfs_root *root = inode->root;
	struct btrfs_trans_handle *trans;
	int ret;

	/* Item at offset 0: the btrfs header recording the descriptor size */
	memset(&item, 0, sizeof(item));
	btrfs_set_stack_verity_descriptor_size(&item, desc_size);
	ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0,
			      (const char *)&item, sizeof(item));
	if (ret)
		return ret;

	/* Items from offset 1 onwards: the descriptor blob itself */
	ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1,
			      desc, desc_size);
	if (ret)
		return ret;

	/*
	 * Reserve 2 units:
	 * 1 for updating the inode flag
	 * 1 for deleting the orphan
	 */
	trans = btrfs_start_transaction(root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	inode->ro_flags |= BTRFS_INODE_RO_VERITY;
	btrfs_sync_inode_flags_to_i_flags(inode);

	ret = btrfs_update_inode(trans, inode);
	if (!ret)
		ret = del_orphan(trans, inode);

	/* Leave the in-progress state only once everything above succeeded */
	if (!ret) {
		clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
		btrfs_set_fs_compat_ro(root->fs_info, VERITY);
	}

	btrfs_end_transaction(trans);
	return ret;
}
/*
* fsverity op that begins enabling verity.
*
* @filp: file to enable verity on
*
* Begin enabling fsverity for the file. We drop any existing verity items, add
* an orphan and set the in progress bit.
*
* Returns 0 on success, negative error code on failure.
*/
static int btrfs_begin_enable_verity(struct file *filp)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
	struct btrfs_root *root = inode->root;
	struct btrfs_trans_handle *trans;
	int ret;

	btrfs_assert_inode_locked(inode);

	/* Verity on encrypted inodes is not supported */
	if (IS_ENCRYPTED(&inode->vfs_inode))
		return -EOPNOTSUPP;

	/* Another enable operation is already running on this inode */
	if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
		return -EBUSY;

	/*
	 * This should almost never do anything, but theoretically, it's
	 * possible that we failed to enable verity on a file, then were
	 * interrupted or failed while rolling back, failed to cleanup the
	 * orphan, and finally attempt to enable verity again.
	 */
	ret = btrfs_drop_verity_items(inode);
	if (ret)
		return ret;

	/* 1 for the orphan item */
	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * The orphan item guards the whole enable operation; the in-progress
	 * bit is only set once the orphan is in place.
	 */
	ret = btrfs_orphan_add(trans, inode);
	if (!ret)
		set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
	btrfs_end_transaction(trans);

	/*
	 * Propagate a failure to add the orphan instead of reporting success:
	 * without the orphan and the in-progress bit, the caller must not go
	 * on to build the Merkle tree.
	 */
	return ret;
}
/*
 * fsverity op that ends enabling verity.
 *
 * @filp: file we are finishing enabling verity on
 * @desc: verity descriptor to write out (NULL in error conditions)
 * @desc_size: size of the verity descriptor (variable with signatures)
 * @merkle_tree_size: size of the merkle tree in bytes (not used here)
 *
 * A NULL desc means VFS is signaling that enabling verity failed, so we only
 * roll back. Otherwise we try to finish verity and roll back if that fails.
 * A rollback failure is logged but never returned.
 *
 * Returns 0 on success (and when desc is NULL), or the negative error code
 * from finishing verity.
 */
static int btrfs_end_enable_verity(struct file *filp, const void *desc,
				   size_t desc_size, u64 merkle_tree_size)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
	int finish_err = 0;
	int undo_err;

	btrfs_assert_inode_locked(inode);

	if (desc != NULL) {
		finish_err = finish_verity(inode, desc, desc_size);
		if (!finish_err)
			return 0;
	}

	/* Reached when desc is NULL or when finishing verity failed. */
	undo_err = rollback_verity(inode);
	if (undo_err)
		btrfs_err(inode->root->fs_info,
			  "failed to rollback verity items: %d", undo_err);

	return finish_err;
}
/*
 * fsverity op that gets the struct fsverity_descriptor.
 *
 * @inode: inode to get the descriptor of
 * @buf: output buffer for the descriptor contents
 * @buf_size: size of the output buffer. 0 to query the size
 *
 * fsverity reads the descriptor in two passes: the first call passes
 * buf_size = 0 to learn the descriptor size, the second call passes a buffer
 * and actually reads the descriptor off disk.
 *
 * Returns the descriptor size on success or a negative error code on failure:
 * -EUCLEAN if the descriptor item has non-zero reserved fields or a size above
 * INT_MAX, -ERANGE if the buffer is too small, -EIO if the number of bytes
 * read does not match the recorded size, or the error from reading the items.
 */
int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
{
	struct btrfs_inode *binode = BTRFS_I(inode);
	struct btrfs_verity_descriptor_item size_item;
	u64 desc_len;
	int nread;

	/* Fetch the descriptor item that records the descriptor size. */
	memset(&size_item, 0, sizeof(size_item));
	nread = read_key_bytes(binode, BTRFS_VERITY_DESC_ITEM_KEY, 0,
			       (char *)&size_item, sizeof(size_item), NULL);
	if (nread < 0)
		return nread;

	/* Reserved fields must be zero, anything else is treated as corruption. */
	if (unlikely(size_item.reserved[0] != 0 || size_item.reserved[1] != 0))
		return -EUCLEAN;

	/* The size is returned through an int, so it has to fit in one. */
	desc_len = btrfs_stack_verity_descriptor_size(&size_item);
	if (unlikely(desc_len > INT_MAX))
		return -EUCLEAN;

	/* Size query only. */
	if (!buf_size)
		return desc_len;

	if (buf_size < desc_len)
		return -ERANGE;

	/* Read the descriptor contents into the caller's buffer. */
	nread = read_key_bytes(binode, BTRFS_VERITY_DESC_ITEM_KEY, 1,
			       buf, buf_size, NULL);
	if (nread < 0)
		return nread;

	if (nread != desc_len)
		return -EIO;

	return desc_len;
}
/*
* fsverity op that reads and caches a merkle tree page.
*
* @inode: inode to read a merkle tree page for
* @index: page index relative to the start of the merkle tree
* @num_ra_pages: number of pages to readahead. Optional, we ignore it
*
* The Merkle tree is stored in the filesystem btree, but its pages are cached
* with a logical position past EOF in the inode's mapping.
*
* The mapping is looked up first. If a folio is already present it is returned
* when uptodate, or -EIO is returned when it is still not uptodate once we hold
* its lock. If nothing is cached, a new folio is allocated, inserted into the
* mapping, filled from the Merkle items, zero padded up to PAGE_SIZE and marked
* uptodate.
*
* Returns the page we read, or an ERR_PTR on error: the error from
* merkle_file_pos(), -EFBIG if the page would lie beyond s_maxbytes, -ENOMEM if
* the folio allocation fails, -EIO for a cached folio that is not uptodate, or
* the error from inserting the folio or reading the items.
*/
static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
struct folio *folio;
/* Byte offset of the requested page within the Merkle tree. */
u64 off = (u64)index << PAGE_SHIFT;
/* File position the Merkle tree is cached at; negative means error. */
loff_t merkle_pos = merkle_file_pos(inode);
int ret;
if (merkle_pos < 0)
return ERR_PTR(merkle_pos);
/* Refuse a page that would end beyond the superblock's s_maxbytes. */
if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE)
return ERR_PTR(-EFBIG);
/* From here on, index is the page cache index in the inode's mapping. */
index += merkle_pos >> PAGE_SHIFT;
again:
/* Lookup only: FGP_ACCESSED is the sole flag passed, gfp is 0. */
folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
if (!IS_ERR(folio)) {
if (folio_test_uptodate(folio))
goto out;
folio_lock(folio);
/* If it's not uptodate after we have the lock, we got a read error. */
if (!folio_test_uptodate(folio)) {
folio_unlock(folio);
folio_put(folio);
return ERR_PTR(-EIO);
}
folio_unlock(folio);
goto out;
}
/* Nothing cached: allocate a folio with __GFP_FS masked out of the mapping's gfp. */
folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS),
0, NULL);
if (!folio)
return ERR_PTR(-ENOMEM);
ret = filemap_add_folio(inode->i_mapping, folio, index, GFP_NOFS);
if (ret) {
folio_put(folio);
/* Did someone else insert a folio here? */
if (ret == -EEXIST)
goto again;
return ERR_PTR(ret);
}
/*
* Merkle item keys are indexed from byte 0 in the merkle tree.
* They have the form:
*
* [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset in bytes ]
*
* That is why the tree-relative byte offset (off) is passed here rather
* than the page cache index.
*/
ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
folio_address(folio), PAGE_SIZE, folio);
if (ret < 0) {
/*
* NOTE(review): this path drops our reference without calling
* folio_unlock(), unlike the success path below - confirm that is
* intended.
*/
folio_put(folio);
return ERR_PTR(ret);
}
/* A short read leaves the tail of the page undefined, so zero it. */
if (ret < PAGE_SIZE)
folio_zero_segment(folio, ret, PAGE_SIZE);
folio_mark_uptodate(folio);
folio_unlock(folio);
out:
/* Hand back the page of the folio that corresponds to index. */
return folio_file_page(folio, index);
}
/*
 * fsverity op that writes a Merkle tree block into the btree.
 *
 * @inode: inode to write a Merkle tree block for
 * @buf: Merkle tree block to write
 * @pos: the position of the block in the Merkle tree (in bytes)
 * @size: the Merkle tree block size (in bytes)
 *
 * The block is stored under BTRFS_VERITY_MERKLE_ITEM_KEY at @pos. Before
 * writing, the block is checked against the superblock's s_maxbytes using the
 * position returned by merkle_file_pos().
 *
 * Returns 0 on success or negative error code on failure (-EFBIG when the
 * block does not fit below s_maxbytes).
 */
static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
					 u64 pos, unsigned int size)
{
	const loff_t tree_start = merkle_file_pos(inode);
	int ret;

	if (tree_start < 0)
		ret = tree_start;
	else if (tree_start > inode->i_sb->s_maxbytes - pos - size)
		ret = -EFBIG;
	else
		ret = write_key_bytes(BTRFS_I(inode),
				      BTRFS_VERITY_MERKLE_ITEM_KEY,
				      pos, buf, size);

	return ret;
}
/* The fsverity operations implemented by btrfs. */
const struct fsverity_operations btrfs_verityops = {
	/* Enable path: begin, write the Merkle tree blocks, then end. */
	.begin_enable_verity	 = btrfs_begin_enable_verity,
	.write_merkle_tree_block = btrfs_write_merkle_tree_block,
	.end_enable_verity	 = btrfs_end_enable_verity,

	/* Read path: the descriptor and the cached Merkle tree pages. */
	.get_verity_descriptor	 = btrfs_get_verity_descriptor,
	.read_merkle_tree_page	 = btrfs_read_merkle_tree_page,

	/*
	 * Offset of i_verity_info relative to the embedded vfs_inode inside
	 * struct btrfs_inode.
	 */
	.inode_info_offs	 = (int)offsetof(struct btrfs_inode, i_verity_info) -
				   (int)offsetof(struct btrfs_inode, vfs_inode),
};