1
0
mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2026-01-17 12:00:35 +00:00
Donet Tom 4f745def81 drivers/base/node: optimize memory block registration to reduce boot time
Patch series "drivers/base/node.c: optimization and cleanups", v7.


This patch (of 7)

During node device initialization, `memory blocks` are registered under
each NUMA node.  The `memory blocks` to be registered are identified using
the node's start and end PFNs, which are obtained from the node's pg_data

However, not all PFNs within this range necessarily belong to the same
node—some may belong to other nodes.  Additionally, due to the
discontiguous nature of physical memory, certain sections within a `memory
block` may be absent.

As a result, `memory blocks` that fall between a node's start and end PFNs
may span across multiple nodes, and some sections within those blocks may
be missing.  `Memory blocks` have a fixed size, which is architecture
dependent.

Due to these considerations, the memory block registration is currently
performed as follows:

for_each_online_node(nid):
    start_pfn = pgdat->node_start_pfn;
    end_pfn = pgdat->node_start_pfn + node_spanned_pages;
    for_each_memory_block_between(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn))
        mem_blk = memory_block_id(pfn_to_section_nr(pfn));
        pfn_mb_start=section_nr_to_pfn(mem_blk->start_section_nr)
        pfn_mb_end = pfn_start + memory_block_pfns - 1
        for (pfn = pfn_mb_start; pfn < pfn_mb_end; pfn++):
            if (get_nid_for_pfn(pfn) != nid):
                continue;
            else
                do_register_memory_block_under_node(nid, mem_blk,
                                                        MEMINIT_EARLY);

Here, we derive the start and end PFNs from the node's pg_data, then
determine the memory blocks that may belong to the node.  For each `memory
block` in this range, we inspect all PFNs it contains and check their
associated NUMA node ID.  If a PFN within the block matches the current
node, the memory block is registered under that node.

If CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, get_nid_for_pfn() performs
a binary search in the `memblock regions` to determine the NUMA node ID
for a given PFN.  If it is not enabled, the node ID is retrieved directly
from the struct page.

On large systems, this process can become time-consuming, especially since
we iterate over each `memory block` and all PFNs within it until a match
is found.  When CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, the
additional overhead of the binary search increases the execution time
significantly, potentially leading to soft lockups during boot.

In this patch, we iterate over `memblock region` to identify the `memory
blocks` that belong to the current NUMA node.  `memblock regions` are
contiguous memory ranges, each associated with a single NUMA node, and
they do not span across multiple nodes.

for_each_memory_region(r): // r => region
  if (!node_online(r->nid)):
    continue;
  else
    for_each_memory_block_between(r->base, r->base + r->size - 1):
      do_register_memory_block_under_node(r->nid, mem_blk, MEMINIT_EARLY);

We iterate over all memblock regions, and if the node associated with the
region is online, we calculate the start and end memory blocks based on
the region's start and end PFNs.  We then register all the memory blocks
within that range under the region node.

Test Results on My system with 32TB RAM
=======================================
1. Boot time with CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled.

Without this patch
------------------
Startup finished in 1min 16.528s (kernel)

With this patch
---------------
Startup finished in 17.236s (kernel) - 78% Improvement

2. Boot time with CONFIG_DEFERRED_STRUCT_PAGE_INIT disabled.

Without this patch
------------------
Startup finished in 28.320s (kernel)

With this patch
---------------
Startup finished in 15.621s (kernel) - 46% Improvement

[donettom@linux.ibm.com: restore removed extra line]
  Link: https://lkml.kernel.org/r/20250609140354.467908-1-donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/2a0a05c2dffc62a742bf1dd030098be4ce99be28.1748452241.git.donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/2a0a05c2dffc62a742bf1dd030098be4ce99be28.1748452241.git.donettom@linux.ibm.com
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-07-09 22:41:59 -07:00

221 lines
7.1 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
* include/linux/memory.h - generic memory definition
*
* This is mainly for topological representation. We define the
* basic "struct memory_block" here, which can be embedded in per-arch
* definitions or NUMA information.
*
* Basic handling of the devices is done in drivers/base/memory.c
* and system devices are handled in drivers/base/sys.c.
*
* Memory block are exported via sysfs in the class/memory/devices/
* directory.
*
*/
#ifndef _LINUX_MEMORY_H_
#define _LINUX_MEMORY_H_
#include <linux/node.h>
#include <linux/compiler.h>
#include <linux/mutex.h>
#define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS)
/**
* struct memory_group - a logical group of memory blocks
* @nid: The node id for all memory blocks inside the memory group.
* @memory_blocks: List of all memory blocks belonging to this memory group.
* @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this
* memory group.
* @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this
* memory group.
* @is_dynamic: The memory group type: static vs. dynamic
* @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum
* number of pages we'll have in this static memory group.
* @d.unit_pages: Valid with &memory_group.is_dynamic == true. Unit in pages
* in which memory is added/removed in this dynamic memory group.
* This granularity defines the alignment of a unit in physical
* address space; it has to be at least as big as a single
* memory block.
*
* A memory group logically groups memory blocks; each memory block
* belongs to at most one memory group. A memory group corresponds to
* a memory device, such as a DIMM or a NUMA node, which spans multiple
* memory blocks and might even span multiple non-contiguous physical memory
* ranges.
*
* Modification of members after registration is serialized by memory
* hot(un)plug code.
*/
struct memory_group {
int nid;
struct list_head memory_blocks;
unsigned long present_kernel_pages;
unsigned long present_movable_pages;
bool is_dynamic;
union {
struct {
unsigned long max_pages;
} s;
struct {
unsigned long unit_pages;
} d;
};
};
struct memory_block {
unsigned long start_section_nr;
unsigned long state; /* serialized by the dev->lock */
int online_type; /* for passing data to online routine */
int nid; /* NID for this memory block */
/*
* The single zone of this memory block if all PFNs of this memory block
* that are System RAM (not a memory hole, not ZONE_DEVICE ranges) are
* managed by a single zone. NULL if multiple zones (including nodes)
* apply.
*/
struct zone *zone;
struct device dev;
struct vmem_altmap *altmap;
struct memory_group *group; /* group (if any) for this block */
struct list_head group_next; /* next block inside memory group */
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
atomic_long_t nr_hwpoison;
#endif
};
int arch_get_memory_phys_device(unsigned long start_pfn);
unsigned long memory_block_size_bytes(void);
int set_memory_block_size_order(unsigned int order);
/* These states are exposed to userspace as text strings in sysfs */
#define MEM_ONLINE (1<<0) /* exposed to userspace */
#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */
#define MEM_OFFLINE (1<<2) /* exposed to userspace */
#define MEM_GOING_ONLINE (1<<3)
#define MEM_CANCEL_ONLINE (1<<4)
#define MEM_CANCEL_OFFLINE (1<<5)
#define MEM_PREPARE_ONLINE (1<<6)
#define MEM_FINISH_OFFLINE (1<<7)
struct memory_notify {
/*
* The altmap_start_pfn and altmap_nr_pages fields are designated for
* specifying the altmap range and are exclusively intended for use in
* MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
*/
unsigned long altmap_start_pfn;
unsigned long altmap_nr_pages;
unsigned long start_pfn;
unsigned long nr_pages;
int status_change_nid_normal;
int status_change_nid;
};
struct notifier_block;
struct mem_section;
/*
* Priorities for the hotplug memory callback routines (stored in decreasing
* order in the callback chain)
*/
#define DEFAULT_CALLBACK_PRI 0
#define SLAB_CALLBACK_PRI 1
#define HMAT_CALLBACK_PRI 2
#define CXL_CALLBACK_PRI 5
#define MM_COMPUTE_BATCH_PRI 10
#define CPUSET_CALLBACK_PRI 10
#define MEMTIER_HOTPLUG_PRI 100
#define KSM_CALLBACK_PRI 100
#ifndef CONFIG_MEMORY_HOTPLUG
static inline void memory_dev_init(void)
{
return;
}
static inline int register_memory_notifier(struct notifier_block *nb)
{
return 0;
}
static inline void unregister_memory_notifier(struct notifier_block *nb)
{
}
static inline int memory_notify(unsigned long val, void *v)
{
return 0;
}
static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri)
{
return 0;
}
static inline int memory_block_advise_max_size(unsigned long size)
{
return -ENODEV;
}
static inline unsigned long memory_block_advised_max_size(void)
{
return 0;
}
#else /* CONFIG_MEMORY_HOTPLUG */
extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
int create_memory_block_devices(unsigned long start, unsigned long size,
struct vmem_altmap *altmap,
struct memory_group *group);
void remove_memory_block_devices(unsigned long start, unsigned long size);
extern void memory_dev_init(void);
extern int memory_notify(unsigned long val, void *v);
extern struct memory_block *find_memory_block(unsigned long section_nr);
typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
extern int walk_memory_blocks(unsigned long start, unsigned long size,
void *arg, walk_memory_blocks_func_t func);
extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func);
extern int memory_group_register_static(int nid, unsigned long max_pages);
extern int memory_group_register_dynamic(int nid, unsigned long unit_pages);
extern int memory_group_unregister(int mgid);
struct memory_group *memory_group_find_by_id(int mgid);
typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *);
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
struct memory_group *excluded, void *arg);
struct memory_block *find_memory_block_by_id(unsigned long block_id);
#define hotplug_memory_notifier(fn, pri) ({ \
static __meminitdata struct notifier_block fn##_mem_nb =\
{ .notifier_call = fn, .priority = pri };\
register_memory_notifier(&fn##_mem_nb); \
})
extern int sections_per_block;
static inline unsigned long memory_block_id(unsigned long section_nr)
{
return section_nr / sections_per_block;
}
static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
return memory_block_id(pfn_to_section_nr(pfn));
}
static inline unsigned long phys_to_block_id(unsigned long phys)
{
return pfn_to_block_id(PFN_DOWN(phys));
}
#ifdef CONFIG_NUMA
void memory_block_add_nid(struct memory_block *mem, int nid,
enum meminit_context context);
#endif /* CONFIG_NUMA */
int memory_block_advise_max_size(unsigned long size);
unsigned long memory_block_advised_max_size(void);
#endif /* CONFIG_MEMORY_HOTPLUG */
/*
* Kernel text modification mutex, used for code patching. Users of this lock
* can sleep.
*/
extern struct mutex text_mutex;
#endif /* _LINUX_MEMORY_H_ */