mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-01-11 09:00:12 +00:00
- The 6 patch series "panic: sys_info: Refactor and fix a potential
issue" from Andy Shevchenko fixes a build issue and does some cleanup in
lib/sys_info.c.
- The 9 patch series "Implement mul_u64_u64_div_u64_roundup()" from
David Laight enhances the 64-bit math code on behalf of a PWM driver and
beefs up the test module for these library functions.
- The 2 patch series "scripts/gdb/symbols: make BPF debug info available
to GDB" from Ilya Leoshkevich makes BPF symbol names, sizes, and line
numbers available to the GDB debugger.
- The 4 patch series "Enable hung_task and lockup cases to dump system
info on demand" from Feng Tang adds a sysctl which can be used to cause
additional info dumping when the hung-task and lockup detectors fire.
- The 6 patch series "lib/base64: add generic encoder/decoder, migrate
users" from Kuan-Wei Chiu adds a general base64 encoder/decoder to lib/
and migrates several users away from their private implementations.
- The 2 patch series "rbtree: inline rb_first() and rb_last()" from Eric
Dumazet makes TCP a little faster.
- The 9 patch series "liveupdate: Rework KHO for in-kernel users" from
Pasha Tatashin reworks the KEXEC Handover interfaces in preparation for
Live Update Orchestrator (LUO), and possibly for other future clients.
- The 13 patch series "kho: simplify state machine and enable dynamic
updates" from Pasha Tatashin increases the flexibility of KEXEC
Handover. Also preparation for LUO.
- The 18 patch series "Live Update Orchestrator" from Pasha Tatashin is
a major new feature targeted at cloud environments. Quoting the [0/N]:
This series introduces the Live Update Orchestrator, a kernel subsystem
designed to facilitate live kernel updates using a kexec-based reboot.
This capability is critical for cloud environments, allowing hypervisors
to be updated with minimal downtime for running virtual machines. LUO
achieves this by preserving the state of selected resources, such as
memory, devices and their dependencies, across the kernel transition.
As a key feature, this series includes support for preserving memfd file
descriptors, which allows critical in-memory data, such as guest RAM or
any other large memory region, to be maintained in RAM across the kexec
reboot.
Mike Rapoport merits a mention here, for his extensive review and
testing work.
- The 3 patch series "kexec: reorganize kexec and kdump sysfs" from
Sourabh Jain moves the kexec and kdump sysfs entries from /sys/kernel/
to /sys/kernel/kexec/ and adds back-compatibility symlinks which can
hopefully be removed one day.
- The 2 patch series "kho: fixes for vmalloc restoration" from Mike
Rapoport fixes a BUG which was being hit during KHO restoration of
vmalloc() regions.
-----BEGIN PGP SIGNATURE-----
iHUEABYKAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCaTSAkQAKCRDdBJ7gKXxA
jrkiAP9QKfsRv46XZaM5raScjY1ayjP+gqb2rgt6BQ/gZvb2+wD/cPAYOR6BiX52
n0pVpQmG5P/KyOmpLztn96ejL4heKwQ=
=JY96
-----END PGP SIGNATURE-----
Merge tag 'mm-nonmm-stable-2025-12-06-11-14' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull non-MM updates from Andrew Morton:
- "panic: sys_info: Refactor and fix a potential issue" (Andy Shevchenko)
fixes a build issue and does some cleanup in lib/sys_info.c
- "Implement mul_u64_u64_div_u64_roundup()" (David Laight)
enhances the 64-bit math code on behalf of a PWM driver and beefs up
the test module for these library functions
- "scripts/gdb/symbols: make BPF debug info available to GDB" (Ilya Leoshkevich)
makes BPF symbol names, sizes, and line numbers available to the GDB
debugger
- "Enable hung_task and lockup cases to dump system info on demand" (Feng Tang)
adds a sysctl which can be used to cause additional info dumping when
the hung-task and lockup detectors fire
- "lib/base64: add generic encoder/decoder, migrate users" (Kuan-Wei Chiu)
adds a general base64 encoder/decoder to lib/ and migrates several
users away from their private implementations
- "rbtree: inline rb_first() and rb_last()" (Eric Dumazet)
makes TCP a little faster
- "liveupdate: Rework KHO for in-kernel users" (Pasha Tatashin)
reworks the KEXEC Handover interfaces in preparation for Live Update
Orchestrator (LUO), and possibly for other future clients
- "kho: simplify state machine and enable dynamic updates" (Pasha Tatashin)
increases the flexibility of KEXEC Handover. Also preparation for LUO
- "Live Update Orchestrator" (Pasha Tatashin)
is a major new feature targeted at cloud environments. Quoting the
cover letter:
This series introduces the Live Update Orchestrator, a kernel
subsystem designed to facilitate live kernel updates using a
kexec-based reboot. This capability is critical for cloud
environments, allowing hypervisors to be updated with minimal
downtime for running virtual machines. LUO achieves this by
preserving the state of selected resources, such as memory,
devices and their dependencies, across the kernel transition.
As a key feature, this series includes support for preserving
memfd file descriptors, which allows critical in-memory data, such
as guest RAM or any other large memory region, to be maintained in
RAM across the kexec reboot.
Mike Rapoport merits a mention here, for his extensive review and
testing work.
- "kexec: reorganize kexec and kdump sysfs" (Sourabh Jain)
moves the kexec and kdump sysfs entries from /sys/kernel/ to
/sys/kernel/kexec/ and adds back-compatibility symlinks which can
hopefully be removed one day
- "kho: fixes for vmalloc restoration" (Mike Rapoport)
fixes a BUG which was being hit during KHO restoration of vmalloc()
regions
* tag 'mm-nonmm-stable-2025-12-06-11-14' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (139 commits)
calibrate: update header inclusion
Reinstate "resource: avoid unnecessary lookups in find_next_iomem_res()"
vmcoreinfo: track and log recoverable hardware errors
kho: fix restoring of contiguous ranges of order-0 pages
kho: kho_restore_vmalloc: fix initialization of pages array
MAINTAINERS: TPM DEVICE DRIVER: update the W-tag
init: replace simple_strtoul with kstrtoul to improve lpj_setup
KHO: fix boot failure due to kmemleak access to non-PRESENT pages
Documentation/ABI: new kexec and kdump sysfs interface
Documentation/ABI: mark old kexec sysfs deprecated
kexec: move sysfs entries to /sys/kernel/kexec
test_kho: always print restore status
kho: free chunks using free_page() instead of kfree()
selftests/liveupdate: add kexec test for multiple and empty sessions
selftests/liveupdate: add simple kexec-based selftest for LUO
selftests/liveupdate: add userspace API selftests
docs: add documentation for memfd preservation via LUO
mm: memfd_luo: allow preserving memfd
liveupdate: luo_file: add private argument to store runtime state
mm: shmem: export some functions to internal.h
...
258 lines
5.8 KiB
C
258 lines
5.8 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* linux/ipc/namespace.c
|
|
* Copyright (C) 2006 Pavel Emelyanov <xemul@openvz.org> OpenVZ, SWsoft Inc.
|
|
*/
|
|
|
|
#include <linux/ipc.h>
|
|
#include <linux/msg.h>
|
|
#include <linux/ipc_namespace.h>
|
|
#include <linux/rcupdate.h>
|
|
#include <linux/nsproxy.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/cred.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/mount.h>
|
|
#include <linux/user_namespace.h>
|
|
#include <linux/proc_ns.h>
|
|
#include <linux/nstree.h>
|
|
#include <linux/sched/task.h>
|
|
|
|
#include "util.h"
|
|
|
|
/*
 * ipc namespaces are torn down from a work item rather than directly in
 * put_ipc_ns(): the work queue is used to avoid the cost of
 * synchronize_rcu in kern_unmount.
 */
static void free_ipc(struct work_struct *unused);
static DECLARE_WORK(free_ipc_work, free_ipc);
|
|
|
|
static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns)
|
|
{
|
|
return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES);
|
|
}
|
|
|
|
static void dec_ipc_namespaces(struct ucounts *ucounts)
|
|
{
|
|
dec_ucount(ucounts, UCOUNT_IPC_NAMESPACES);
|
|
}
|
|
|
|
static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
|
|
struct ipc_namespace *old_ns)
|
|
{
|
|
struct ipc_namespace *ns;
|
|
struct ucounts *ucounts;
|
|
int err;
|
|
|
|
err = -ENOSPC;
|
|
again:
|
|
ucounts = inc_ipc_namespaces(user_ns);
|
|
if (!ucounts) {
|
|
/*
|
|
* IPC namespaces are freed asynchronously, by free_ipc_work.
|
|
* If frees were pending, flush_work will wait, and
|
|
* return true. Fail the allocation if no frees are pending.
|
|
*/
|
|
if (flush_work(&free_ipc_work))
|
|
goto again;
|
|
goto fail;
|
|
}
|
|
|
|
err = -ENOMEM;
|
|
ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT);
|
|
if (ns == NULL)
|
|
goto fail_dec;
|
|
|
|
err = ns_common_init(ns);
|
|
if (err)
|
|
goto fail_free;
|
|
|
|
ns_tree_gen_id(ns);
|
|
ns->user_ns = get_user_ns(user_ns);
|
|
ns->ucounts = ucounts;
|
|
|
|
err = mq_init_ns(ns);
|
|
if (err)
|
|
goto fail_put;
|
|
|
|
err = -ENOMEM;
|
|
if (!setup_mq_sysctls(ns))
|
|
goto fail_mq_mount;
|
|
|
|
if (!setup_ipc_sysctls(ns))
|
|
goto fail_mq_sysctls;
|
|
|
|
err = msg_init_ns(ns);
|
|
if (err)
|
|
goto fail_ipc;
|
|
|
|
sem_init_ns(ns);
|
|
shm_init_ns(ns);
|
|
ns_tree_add_raw(ns);
|
|
|
|
return ns;
|
|
|
|
fail_ipc:
|
|
retire_ipc_sysctls(ns);
|
|
fail_mq_sysctls:
|
|
retire_mq_sysctls(ns);
|
|
fail_mq_mount:
|
|
mntput(ns->mq_mnt);
|
|
fail_put:
|
|
put_user_ns(ns->user_ns);
|
|
ns_common_free(ns);
|
|
fail_free:
|
|
kfree(ns);
|
|
fail_dec:
|
|
dec_ipc_namespaces(ucounts);
|
|
fail:
|
|
return ERR_PTR(err);
|
|
}
|
|
|
|
/*
 * copy_ipcs - pick the ipc namespace for a new nsproxy
 * @flags:   clone flags; only CLONE_NEWIPC is examined
 * @user_ns: owner for a newly created namespace
 * @ns:      the existing ipc namespace
 *
 * With CLONE_NEWIPC set a brand new namespace is created (the result may be
 * an ERR_PTR()); otherwise the result of get_ipc_ns() on @ns is returned.
 */
struct ipc_namespace *copy_ipcs(u64 flags,
				struct user_namespace *user_ns,
				struct ipc_namespace *ns)
{
	if (flags & CLONE_NEWIPC)
		return create_ipc_ns(user_ns, ns);

	return get_ipc_ns(ns);
}
|
|
|
|
/*
|
|
* free_ipcs - free all ipcs of one type
|
|
* @ns: the namespace to remove the ipcs from
|
|
* @ids: the table of ipcs to free
|
|
* @free: the function called to free each individual ipc
|
|
*
|
|
* Called for each kind of ipc when an ipc_namespace exits.
|
|
*/
|
|
void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
|
|
void (*free)(struct ipc_namespace *, struct kern_ipc_perm *))
|
|
{
|
|
struct kern_ipc_perm *perm;
|
|
int next_id;
|
|
int total, in_use;
|
|
|
|
down_write(&ids->rwsem);
|
|
|
|
in_use = ids->in_use;
|
|
|
|
for (total = 0, next_id = 0; total < in_use; next_id++) {
|
|
perm = idr_find(&ids->ipcs_idr, next_id);
|
|
if (perm == NULL)
|
|
continue;
|
|
rcu_read_lock();
|
|
ipc_lock_object(perm);
|
|
free(ns, perm);
|
|
total++;
|
|
}
|
|
up_write(&ids->rwsem);
|
|
}
|
|
|
|
static void free_ipc_ns(struct ipc_namespace *ns)
|
|
{
|
|
/*
|
|
* Caller needs to wait for an RCU grace period to have passed
|
|
* after making the mount point inaccessible to new accesses.
|
|
*/
|
|
mntput(ns->mq_mnt);
|
|
sem_exit_ns(ns);
|
|
msg_exit_ns(ns);
|
|
shm_exit_ns(ns);
|
|
|
|
retire_mq_sysctls(ns);
|
|
retire_ipc_sysctls(ns);
|
|
|
|
dec_ipc_namespaces(ns->ucounts);
|
|
put_user_ns(ns->user_ns);
|
|
ns_common_free(ns);
|
|
kfree(ns);
|
|
}
|
|
|
|
static LLIST_HEAD(free_ipc_list);
|
|
static void free_ipc(struct work_struct *unused)
|
|
{
|
|
struct llist_node *node = llist_del_all(&free_ipc_list);
|
|
struct ipc_namespace *n, *t;
|
|
|
|
llist_for_each_entry_safe(n, t, node, mnt_llist)
|
|
mnt_make_shortterm(n->mq_mnt);
|
|
|
|
/* Wait for any last users to have gone away. */
|
|
synchronize_rcu();
|
|
|
|
llist_for_each_entry_safe(n, t, node, mnt_llist)
|
|
free_ipc_ns(n);
|
|
}
|
|
|
|
/*
|
|
* put_ipc_ns - drop a reference to an ipc namespace.
|
|
* @ns: the namespace to put
|
|
*
|
|
* If this is the last task in the namespace exiting, and
|
|
* it is dropping the refcount to 0, then it can race with
|
|
* a task in another ipc namespace but in a mounts namespace
|
|
* which has this ipcns's mqueuefs mounted, doing some action
|
|
* with one of the mqueuefs files. That can raise the refcount.
|
|
* So dropping the refcount, and raising the refcount when
|
|
* accessing it through the VFS, are protected with mq_lock.
|
|
*
|
|
* (Clearly, a task raising the refcount on its own ipc_ns
|
|
* needn't take mq_lock since it can't race with the last task
|
|
* in the ipcns exiting).
|
|
*/
|
|
void put_ipc_ns(struct ipc_namespace *ns)
|
|
{
|
|
if (ns_ref_put_and_lock(ns, &mq_lock)) {
|
|
mq_clear_sbinfo(ns);
|
|
spin_unlock(&mq_lock);
|
|
|
|
ns_tree_remove(ns);
|
|
if (llist_add(&ns->mnt_llist, &free_ipc_list))
|
|
schedule_work(&free_ipc_work);
|
|
}
|
|
}
|
|
|
|
static struct ns_common *ipcns_get(struct task_struct *task)
|
|
{
|
|
struct ipc_namespace *ns = NULL;
|
|
struct nsproxy *nsproxy;
|
|
|
|
task_lock(task);
|
|
nsproxy = task->nsproxy;
|
|
if (nsproxy)
|
|
ns = get_ipc_ns(nsproxy->ipc_ns);
|
|
task_unlock(task);
|
|
|
|
return ns ? &ns->ns : NULL;
|
|
}
|
|
|
|
/*
 * proc_ns_operations ->put: drop the reference held on the ipc namespace
 * that embeds @ns.
 */
static void ipcns_put(struct ns_common *ns)
{
	/*
	 * put_ipc_ns() returns void, so call it as a plain statement: ISO C
	 * does not allow "return <expression>;" in a void function.
	 */
	put_ipc_ns(to_ipc_ns(ns));
}
|
|
|
|
static int ipcns_install(struct nsset *nsset, struct ns_common *new)
|
|
{
|
|
struct nsproxy *nsproxy = nsset->nsproxy;
|
|
struct ipc_namespace *ns = to_ipc_ns(new);
|
|
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
|
|
!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
|
|
return -EPERM;
|
|
|
|
put_ipc_ns(nsproxy->ipc_ns);
|
|
nsproxy->ipc_ns = get_ipc_ns(ns);
|
|
return 0;
|
|
}
|
|
|
|
static struct user_namespace *ipcns_owner(struct ns_common *ns)
|
|
{
|
|
return to_ipc_ns(ns)->user_ns;
|
|
}
|
|
|
|
const struct proc_ns_operations ipcns_operations = {
|
|
.name = "ipc",
|
|
.get = ipcns_get,
|
|
.put = ipcns_put,
|
|
.install = ipcns_install,
|
|
.owner = ipcns_owner,
|
|
};
|