1
0
mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2026-01-12 09:32:12 +00:00
Gabriele Monaco 7dec062cfc timers/migration: Exclude isolated cpus from hierarchy
The timer migration mechanism allows active CPUs to pull timers from
idle ones to improve the overall idle time. This is however undesired
when CPU intensive workloads run on isolated cores, as the algorithm
would move the timers from housekeeping to isolated cores, negatively
affecting the isolation.

Exclude isolated cores from the timer migration algorithm, extend the
concept of unavailable cores, currently used for offline ones, to
isolated ones:
* A core is unavailable if isolated or offline;
* A core is available if non isolated and online;

A core is considered unavailable as isolated if it belongs to:
* the isolcpus (domain) list
* an isolated cpuset
Except if it is:
* in the nohz_full list (already idle for the hierarchy)
* the nohz timekeeper core (must be available to handle global timers)

CPUs are added to the hierarchy during late boot, excluding isolated
ones, the hierarchy is also adapted when the cpuset isolation changes.

Due to how the timer migration algorithm works, any CPU part of the
hierarchy can have their global timers pulled by remote CPUs and have to
pull remote timers, only skipping pulling remote timers would break the
logic.
For this reason, prevent isolated CPUs from pulling remote global
timers, but also the other way around: any global timer started on an
isolated CPU will run there. This does not break the concept of
isolation (global timers don't come from outside the CPU) and, if
considered inappropriate, can usually be mitigated with other isolation
techniques (e.g. IRQ pinning).

This effect was noticed on a 128 cores machine running oslat on the
isolated cores (1-31,33-63,65-95,97-127). The tool monopolises CPUs,
and the CPU with lowest count in a timer migration hierarchy (here 1
and 65) appears as always active and continuously pulls global timers,
from the housekeeping CPUs. This ends up moving driver work (e.g.
delayed work) to isolated CPUs and causes latency spikes:

before the change:

 # oslat -c 1-31,33-63,65-95,97-127 -D 62s
 ...
  Maximum:     1203 10 3 4 ... 5 (us)

after the change:

 # oslat -c 1-31,33-63,65-95,97-127 -D 62s
 ...
  Maximum:      10 4 3 4 3 ... 5 (us)

The same behaviour was observed on a machine with as few as 20 cores /
40 threads with isocpus set to: 1-9,11-39 with rtla-osnoise-top.

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: John B. Wyatt IV <jwyatt@redhat.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://patch.msgid.link/20251120145653.296659-8-gmonaco@redhat.com
2025-11-20 20:17:32 +01:00

201 lines
6.7 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMER_H
#define _LINUX_TIMER_H
#include <linux/list.h>
#include <linux/ktime.h>
#include <linux/stddef.h>
#include <linux/debugobjects.h>
#include <linux/stringify.h>
#include <linux/timer_types.h>
#ifdef CONFIG_LOCKDEP
/*
* NB: because we have to copy the lockdep_map, setting the lockdep_map key
* (second argument) here is required, otherwise it could be initialised to
* the copy of the lockdep_map later! We use the pointer to and the string
* "<file>:<line>" as the key resp. the name of the lockdep_map.
*/
#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) \
.lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn),
#else
#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
#endif
/*
* @TIMER_DEFERRABLE: A deferrable timer will work normally when the
* system is busy, but will not cause a CPU to come out of idle just
* to service it; instead, the timer will be serviced when the CPU
* eventually wakes up with a subsequent non-deferrable timer.
*
* @TIMER_IRQSAFE: An irqsafe timer is executed with IRQ disabled and
* it's safe to wait for the completion of the running instance from
* IRQ handlers, for example, by calling timer_delete_sync().
*
* Note: The irq disabled callback execution is a special case for
* workqueue locking issues. It's not meant for executing random crap
* with interrupts disabled. Abuse is monitored!
*
* @TIMER_PINNED: A pinned timer will always expire on the CPU on which the
* timer was enqueued. When a particular CPU is required, add_timer_on()
* has to be used. Enqueue via mod_timer() and add_timer() is always done
* on the local CPU.
*/
#define TIMER_CPUMASK 0x0003FFFF
#define TIMER_MIGRATING 0x00040000
#define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING)
#define TIMER_DEFERRABLE 0x00080000
#define TIMER_PINNED 0x00100000
#define TIMER_IRQSAFE 0x00200000
#define TIMER_INIT_FLAGS (TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)
#define TIMER_ARRAYSHIFT 22
#define TIMER_ARRAYMASK 0xFFC00000
#define TIMER_TRACE_FLAGMASK (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)
#define __TIMER_INITIALIZER(_function, _flags) { \
.entry = { .next = TIMER_ENTRY_STATIC }, \
.function = (_function), \
.flags = (_flags), \
__TIMER_LOCKDEP_MAP_INITIALIZER(FILE_LINE) \
}
#define DEFINE_TIMER(_name, _function) \
struct timer_list _name = \
__TIMER_INITIALIZER(_function, 0)
/*
* LOCKDEP and DEBUG timer interfaces.
*/
void timer_init_key(struct timer_list *timer,
void (*func)(struct timer_list *), unsigned int flags,
const char *name, struct lock_class_key *key);
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
extern void timer_init_key_on_stack(struct timer_list *timer,
void (*func)(struct timer_list *),
unsigned int flags, const char *name,
struct lock_class_key *key);
#else
static inline void timer_init_key_on_stack(struct timer_list *timer,
void (*func)(struct timer_list *),
unsigned int flags,
const char *name,
struct lock_class_key *key)
{
timer_init_key(timer, func, flags, name, key);
}
#endif
#ifdef CONFIG_LOCKDEP
#define __timer_init(_timer, _fn, _flags) \
do { \
static struct lock_class_key __key; \
timer_init_key((_timer), (_fn), (_flags), #_timer, &__key);\
} while (0)
#define __timer_init_on_stack(_timer, _fn, _flags) \
do { \
static struct lock_class_key __key; \
timer_init_key_on_stack((_timer), (_fn), (_flags), \
#_timer, &__key); \
} while (0)
#else
#define __timer_init(_timer, _fn, _flags) \
timer_init_key((_timer), (_fn), (_flags), NULL, NULL)
#define __timer_init_on_stack(_timer, _fn, _flags) \
timer_init_key_on_stack((_timer), (_fn), (_flags), NULL, NULL)
#endif
/**
* timer_setup - prepare a timer for first use
* @timer: the timer in question
* @callback: the function to call when timer expires
* @flags: any TIMER_* flags
*
* Regular timer initialization should use either DEFINE_TIMER() above,
* or timer_setup(). For timers on the stack, timer_setup_on_stack() must
* be used and must be balanced with a call to timer_destroy_on_stack().
*/
#define timer_setup(timer, callback, flags) \
__timer_init((timer), (callback), (flags))
#define timer_setup_on_stack(timer, callback, flags) \
__timer_init_on_stack((timer), (callback), (flags))
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
extern void timer_destroy_on_stack(struct timer_list *timer);
#else
static inline void timer_destroy_on_stack(struct timer_list *timer) { }
#endif
#define timer_container_of(var, callback_timer, timer_fieldname) \
container_of(callback_timer, typeof(*var), timer_fieldname)
/**
* timer_pending - is a timer pending?
* @timer: the timer in question
*
* timer_pending will tell whether a given timer is currently pending,
* or not. Callers must ensure serialization wrt. other operations done
* to this timer, eg. interrupt contexts, or other CPUs on SMP.
*
* Returns: 1 if the timer is pending, 0 if not.
*/
static inline int timer_pending(const struct timer_list * timer)
{
return !hlist_unhashed_lockless(&timer->entry);
}
extern void add_timer_on(struct timer_list *timer, int cpu);
extern int mod_timer(struct timer_list *timer, unsigned long expires);
extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
extern int timer_reduce(struct timer_list *timer, unsigned long expires);
/*
* The jiffies value which is added to now, when there is no timer
* in the timer wheel:
*/
#define TIMER_NEXT_MAX_DELTA ((1UL << 30) - 1)
extern void add_timer(struct timer_list *timer);
extern void add_timer_local(struct timer_list *timer);
extern void add_timer_global(struct timer_list *timer);
extern int timer_delete_sync_try(struct timer_list *timer);
extern int timer_delete_sync(struct timer_list *timer);
extern int timer_delete(struct timer_list *timer);
extern int timer_shutdown_sync(struct timer_list *timer);
extern int timer_shutdown(struct timer_list *timer);
extern void timers_init(void);
struct hrtimer;
extern enum hrtimer_restart it_real_fn(struct hrtimer *);
unsigned long __round_jiffies_relative(unsigned long j, int cpu);
unsigned long round_jiffies(unsigned long j);
unsigned long round_jiffies_relative(unsigned long j);
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu);
unsigned long round_jiffies_up(unsigned long j);
unsigned long round_jiffies_up_relative(unsigned long j);
#ifdef CONFIG_HOTPLUG_CPU
int timers_prepare_cpu(unsigned int cpu);
int timers_dead_cpu(unsigned int cpu);
#else
#define timers_prepare_cpu NULL
#define timers_dead_cpu NULL
#endif
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask);
#else
static inline int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
{
return 0;
}
#endif
#endif