1
0
mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2026-01-11 09:00:12 +00:00
torvalds-linux/init/init_task.c
Linus Torvalds 2b09f480f0 A large overhaul of the restartable sequences and CID management:
The recent enablement of RSEQ in glibc resulted in regressions which are
   caused by the related overhead. It turned out that the decision to invoke
   the exit to user work was not really a decision. More or less each
   context switch caused that. There is a long list of small issues which
   sums up nicely and results in a 3-4% regression in I/O benchmarks.
 
   The other detail which caused issues due to extra work in context switch
   and task migration is the CID (memory context ID) management. It also
   requires to use a task work to consolidate the CID space, which is
   executed in the context of an arbitrary task and results in sporadic
   uncontrolled exit latencies.
 
   The rewrite addresses this by:
 
   - Removing deprecated and long unsupported functionality
 
   - Moving the related data into dedicated data structures which are
     optimized for fast path processing.
 
   - Caching values so actual decisions can be made
 
   - Replacing the current implementation with a optimized inlined variant.
 
   - Separating fast and slow path for architectures which use the generic
     entry code, so that only fault and error handling goes into the
     TIF_NOTIFY_RESUME handler.
 
   - Rewriting the CID management so that it becomes mostly invisible in the
     context switch path. That moves the work of switching modes into the
     fork/exit path, which is a reasonable tradeoff. That work is only
     required when a process creates more threads than the cpuset it is
     allowed to run on or when enough threads exit after that. An artificial
     thread pool benchmarks which triggers this did not degrade, it actually
     improved significantly.
 
     The main effect in migration heavy scenarios is that runqueue lock held
     time and therefore contention goes down significantly.
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmksaRYTHHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYoencEADA5he8PAFPmSRRPo6+2G5mHzWe8kIU
 5ZViQStWFNAA0qqy8VXryWiJ6qqrO6la9o7K4YOXASUtlkVjquRp1DF7PabqGwuy
 zshbRCXNlT51J8uqanN8VrGVjlf+bMdHDbGoI1SLkUTxG8b+kDD5PXUQE1ARelPP
 Slbg9u+EMrxj6D5MDTPbuW6TqryJEkPtiNScyOz43emp9ww9+WVxenOcRqU4D+Th
 mjWmrGIzkroSf4XReMoD/wg9TPTpUjXnNCwl2viY9JvBpkMfYtU4tJAGK3aNFOWy
 zsAN0O9CaFGrUEFne7qUmtwhNLdtnjx5HN5pe7yZd1EhdTuQKq4jPiiQnwwm8w72
 c0o6m45FNPmPoSyfaZWCkLjbTEUXonT9JF61iN35JVxim8gBDDJjHFKnLxDmLrH3
 X0eESE48ReY2EneDV6Y8RJRo6oG14Fccvc39aTf/2Rw3trpmtt2agvConQzupQIg
 DzANw4jhUUzFRrHrMHACNsqKFXh9ratue/S9DM3xxTpGO/bKdeK7jGIgzNf8O34M
 J0O6Hvk5jMdcWlIJTx21GoGzoSkkXnR49g/71aCcp+MwdY4x9zFz5SWi8LWQRmkx
 xRo6tY27Bma8/SEwMJjPpAUXDTpq6v+j3cPisybL1yGsyt9lh+p8LX7VUtwcoEqe
 6ZelC5Kgw/+/kg==
 =n5KT
 -----END PGP SIGNATURE-----

Merge tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull rseq updates from Thomas Gleixner:
 "A large overhaul of the restartable sequences and CID management:

  The recent enablement of RSEQ in glibc resulted in regressions which
  are caused by the related overhead. It turned out that the decision to
  invoke the exit to user work was not really a decision. More or less
  each context switch caused that. There is a long list of small issues
  which sums up nicely and results in a 3-4% regression in I/O
  benchmarks.

  The other detail which caused issues due to extra work in context
  switch and task migration is the CID (memory context ID) management.
  It also requires to use a task work to consolidate the CID space,
  which is executed in the context of an arbitrary task and results in
  sporadic uncontrolled exit latencies.

  The rewrite addresses this by:

   - Removing deprecated and long unsupported functionality

   - Moving the related data into dedicated data structures which are
     optimized for fast path processing.

   - Caching values so actual decisions can be made

   - Replacing the current implementation with a optimized inlined
     variant.

   - Separating fast and slow path for architectures which use the
     generic entry code, so that only fault and error handling goes into
     the TIF_NOTIFY_RESUME handler.

   - Rewriting the CID management so that it becomes mostly invisible in
     the context switch path. That moves the work of switching modes
     into the fork/exit path, which is a reasonable tradeoff. That work
     is only required when a process creates more threads than the
     cpuset it is allowed to run on or when enough threads exit after
     that. An artificial thread pool benchmarks which triggers this did
     not degrade, it actually improved significantly.

     The main effect in migration heavy scenarios is that runqueue lock
     held time and therefore contention goes down significantly"

* tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
  sched/mmcid: Switch over to the new mechanism
  sched/mmcid: Implement deferred mode change
  irqwork: Move data struct to a types header
  sched/mmcid: Provide CID ownership mode fixup functions
  sched/mmcid: Provide new scheduler CID mechanism
  sched/mmcid: Introduce per task/CPU ownership infrastructure
  sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex
  sched/mmcid: Provide precomputed maximal value
  sched/mmcid: Move initialization out of line
  signal: Move MMCID exit out of sighand lock
  sched/mmcid: Convert mm CID mask to a bitmap
  cpumask: Cache num_possible_cpus()
  sched/mmcid: Use cpumask_weighted_or()
  cpumask: Introduce cpumask_weighted_or()
  sched/mmcid: Prevent pointless work in mm_update_cpus_allowed()
  sched/mmcid: Move scheduler code out of global header
  sched: Fixup whitespace damage
  sched/mmcid: Cacheline align MM CID storage
  sched/mmcid: Use proper data structures
  sched/mmcid: Revert the complex CID management
  ...
2025-12-02 08:48:53 -08:00

266 lines
7.5 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/init_task.h>
#include <linux/export.h>
#include <linux/mqueue.h>
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
#include <linux/sched/ext.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/audit.h>
#include <linux/numa.h>
#include <linux/scs.h>
#include <linux/plist.h>
#include <linux/uaccess.h>
static struct signal_struct init_signals = {
.nr_threads = 1,
.thread_head = LIST_HEAD_INIT(init_task.thread_node),
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(init_signals.wait_chldexit),
.shared_pending = {
.list = LIST_HEAD_INIT(init_signals.shared_pending.list),
.signal = {{0}}
},
.multiprocess = HLIST_HEAD_INIT,
.rlim = INIT_RLIMITS,
#ifdef CONFIG_CGROUPS
.cgroup_threadgroup_rwsem = __RWSEM_INITIALIZER(init_signals.cgroup_threadgroup_rwsem),
#endif
.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
.exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock),
#ifdef CONFIG_POSIX_TIMERS
.posix_timers = HLIST_HEAD_INIT,
.ignored_posix_timers = HLIST_HEAD_INIT,
.cputimer = {
.cputime_atomic = INIT_CPUTIME_ATOMIC,
},
#endif
INIT_CPU_TIMERS(init_signals)
.pids = {
[PIDTYPE_PID] = &init_struct_pid,
[PIDTYPE_TGID] = &init_struct_pid,
[PIDTYPE_PGID] = &init_struct_pid,
[PIDTYPE_SID] = &init_struct_pid,
},
INIT_PREV_CPUTIME(init_signals)
};
static struct sighand_struct init_sighand = {
.count = REFCOUNT_INIT(1),
.action = { { { .sa_handler = SIG_DFL, } }, },
.siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock),
.signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh),
};
#ifdef CONFIG_SHADOW_CALL_STACK
unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = {
[(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC
};
#endif
/* init to 2 - one for init_task, one to ensure it is never freed */
static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
/*
* The initial credentials for the initial task
*/
static struct cred init_cred = {
.usage = ATOMIC_INIT(4),
.uid = GLOBAL_ROOT_UID,
.gid = GLOBAL_ROOT_GID,
.suid = GLOBAL_ROOT_UID,
.sgid = GLOBAL_ROOT_GID,
.euid = GLOBAL_ROOT_UID,
.egid = GLOBAL_ROOT_GID,
.fsuid = GLOBAL_ROOT_UID,
.fsgid = GLOBAL_ROOT_GID,
.securebits = SECUREBITS_DEFAULT,
.cap_inheritable = CAP_EMPTY_SET,
.cap_permitted = CAP_FULL_SET,
.cap_effective = CAP_FULL_SET,
.cap_bset = CAP_FULL_SET,
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
.ucounts = &init_ucounts,
};
/*
* Set up the first task table, touch at your own risk!. Base=0,
* limit=0x1fffff (=2MB)
*/
struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
#ifdef CONFIG_THREAD_INFO_IN_TASK
.thread_info = INIT_THREAD_INFO(init_task),
.stack_refcount = REFCOUNT_INIT(1),
#endif
.__state = 0,
.stack = init_stack,
.usage = REFCOUNT_INIT(2),
.flags = PF_KTHREAD,
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
.policy = SCHED_NORMAL,
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
.cpus_mask = CPU_MASK_ALL,
.max_allowed_capacity = SCHED_CAPACITY_SCALE,
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
.faults_disabled_mapping = NULL,
.restart_block = {
.fn = do_no_restart_syscall,
},
.se = {
.group_node = LIST_HEAD_INIT(init_task.se.group_node),
},
.rt = {
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
.time_slice = RR_TIMESLICE,
},
.tasks = LIST_HEAD_INIT(init_task.tasks),
#ifdef CONFIG_SMP
.pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO),
#endif
#ifdef CONFIG_CGROUP_SCHED
.sched_task_group = &root_task_group,
#endif
#ifdef CONFIG_SCHED_CLASS_EXT
.scx = {
.dsq_list.node = LIST_HEAD_INIT(init_task.scx.dsq_list.node),
.sticky_cpu = -1,
.holding_cpu = -1,
.runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node),
.runnable_at = INITIAL_JIFFIES,
.ddsp_dsq_id = SCX_DSQ_INVALID,
.slice = SCX_SLICE_DFL,
},
#endif
.ptraced = LIST_HEAD_INIT(init_task.ptraced),
.ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),
.real_parent = &init_task,
.parent = &init_task,
.children = LIST_HEAD_INIT(init_task.children),
.sibling = LIST_HEAD_INIT(init_task.sibling),
.group_leader = &init_task,
RCU_POINTER_INITIALIZER(real_cred, &init_cred),
RCU_POINTER_INITIALIZER(cred, &init_cred),
.comm = INIT_TASK_COMM,
.thread = INIT_THREAD,
.fs = &init_fs,
.files = &init_files,
#ifdef CONFIG_IO_URING
.io_uring = NULL,
#endif
.signal = &init_signals,
.sighand = &init_sighand,
.nsproxy = &init_nsproxy,
.pending = {
.list = LIST_HEAD_INIT(init_task.pending.list),
.signal = {{0}}
},
.blocked = {{0}},
.alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock),
.journal_info = NULL,
INIT_CPU_TIMERS(init_task)
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
.timer_slack_ns = 50000, /* 50 usec default slack */
.thread_pid = &init_struct_pid,
.thread_node = LIST_HEAD_INIT(init_signals.thread_head),
#ifdef CONFIG_AUDIT
.loginuid = INVALID_UID,
.sessionid = AUDIT_SID_UNSET,
#endif
#ifdef CONFIG_PERF_EVENTS
.perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex),
.perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list),
#endif
#ifdef CONFIG_PREEMPT_RCU
.rcu_read_lock_nesting = 0,
.rcu_read_unlock_special.s = 0,
.rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry),
.rcu_blocked_node = NULL,
#endif
#ifdef CONFIG_TASKS_RCU
.rcu_tasks_holdout = false,
.rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list),
.rcu_tasks_idle_cpu = -1,
.rcu_tasks_exit_list = LIST_HEAD_INIT(init_task.rcu_tasks_exit_list),
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
.trc_reader_nesting = 0,
.trc_reader_special.s = 0,
.trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list),
.trc_blkd_node = LIST_HEAD_INIT(init_task.trc_blkd_node),
#endif
#ifdef CONFIG_CPUSETS
.mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq,
&init_task.alloc_lock),
#endif
#ifdef CONFIG_RT_MUTEXES
.pi_waiters = RB_ROOT_CACHED,
.pi_top_task = NULL,
#endif
INIT_PREV_CPUTIME(init_task)
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
.vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount),
.vtime.starttime = 0,
.vtime.state = VTIME_SYS,
#endif
#ifdef CONFIG_NUMA_BALANCING
.numa_preferred_nid = NUMA_NO_NODE,
.numa_group = NULL,
.numa_faults = NULL,
#endif
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
.kasan_depth = 1,
#endif
#ifdef CONFIG_KCSAN
.kcsan_ctx = {
.scoped_accesses = {LIST_POISON1, NULL},
},
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
.softirqs_enabled = 1,
#endif
#ifdef CONFIG_LOCKDEP
.lockdep_depth = 0, /* no locks held yet */
.curr_chain_key = INITIAL_CHAIN_KEY,
.lockdep_recursion = 0,
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.ret_stack = NULL,
.tracing_graph_pause = ATOMIC_INIT(0),
#endif
#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION)
.trace_recursion = 0,
#endif
#ifdef CONFIG_LIVEPATCH
.patch_state = KLP_TRANSITION_IDLE,
#endif
#ifdef CONFIG_SECURITY
.security = NULL,
#endif
#ifdef CONFIG_SECCOMP_FILTER
.seccomp = { .filter_count = ATOMIC_INIT(0) },
#endif
#ifdef CONFIG_SCHED_MM_CID
.mm_cid = { .cid = MM_CID_UNSET, },
#endif
};
EXPORT_SYMBOL(init_task);
/*
* Initial thread structure. Alignment of this is handled by a special
* linker map entry.
*/
#ifndef CONFIG_THREAD_INFO_IN_TASK
struct thread_info init_thread_info __init_thread_info = INIT_THREAD_INFO(init_task);
#endif