Patch series "drivers/base/node.c: optimization and cleanups", v7.
This patch (of 7):
During node device initialization, `memory blocks` are registered under
each NUMA node. The `memory blocks` to be registered are identified using
the node's start and end PFNs, which are obtained from the node's pg_data.
However, not all PFNs within this range necessarily belong to the same
node; some may belong to other nodes. Additionally, due to the
discontiguous nature of physical memory, certain sections within a `memory
block` may be absent.
As a result, `memory blocks` that fall between a node's start and end PFNs
may span multiple nodes, and some sections within those blocks may be
missing. `Memory blocks` have a fixed size, which is architecture
dependent.
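
For orientation, the PFN/section/block arithmetic used by the registration
path can be sketched in plain userspace C as below. The helper names mirror
pfn_to_section_nr() and memory_block_id() from the kernel, but the shift and
block-geometry constants here are illustrative assumptions; the real values
come from SECTION_SIZE_BITS and memory_block_size_bytes() and vary by
architecture:

    /*
     * Illustrative sketch only: how a PFN maps to a memory section and
     * then to a memory block. The constants are assumptions (4 KiB
     * pages, 128 MiB sections, one section per block), not guarantees.
     */
    #include <stdio.h>

    #define PAGE_SHIFT          12  /* 4 KiB pages (assumed) */
    #define SECTION_SIZE_BITS   27  /* 128 MiB sections (assumed) */
    #define PFN_SECTION_SHIFT   (SECTION_SIZE_BITS - PAGE_SHIFT)
    #define SECTIONS_PER_BLOCK  1   /* assumed block geometry */

    static unsigned long pfn_to_section_nr(unsigned long pfn)
    {
        return pfn >> PFN_SECTION_SHIFT;
    }

    static unsigned long memory_block_id(unsigned long section_nr)
    {
        return section_nr / SECTIONS_PER_BLOCK;
    }

    int main(void)
    {
        unsigned long pfn = 0x12345678;

        printf("pfn %#lx -> section %lu -> block %lu\n", pfn,
               pfn_to_section_nr(pfn),
               memory_block_id(pfn_to_section_nr(pfn)));
        return 0;
    }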
Due to these considerations, the memory block registration is currently
performed as follows:
for_each_online_node(nid):
    start_pfn = pgdat->node_start_pfn;
    end_pfn = pgdat->node_start_pfn + node_spanned_pages;
    for_each_memory_block_between(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn)):
        mem_blk = memory_block_id(pfn_to_section_nr(pfn));
        pfn_mb_start = section_nr_to_pfn(mem_blk->start_section_nr);
        pfn_mb_end = pfn_mb_start + memory_block_pfns - 1;
        for (pfn = pfn_mb_start; pfn < pfn_mb_end; pfn++):
            if (get_nid_for_pfn(pfn) != nid):
                continue;
            else:
                do_register_memory_block_under_node(nid, mem_blk,
                                                    MEMINIT_EARLY);
Here, we derive the start and end PFNs from the node's pg_data, then
determine the memory blocks that may belong to the node. For each `memory
block` in this range, we inspect all PFNs it contains and check their
associated NUMA node ID. If a PFN within the block matches the current
node, the memory block is registered under that node.
If CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, get_nid_for_pfn() performs
a binary search in the `memblock regions` to determine the NUMA node ID
for a given PFN. If it is not enabled, the node ID is retrieved directly
from the struct page.
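
For context, the pre-patch helper behaves roughly as in the sketch below.
This is a paraphrase of the get_nid_for_pfn() logic in drivers/base/node.c
rather than the verbatim source, so treat the exact guard condition as an
assumption:

    /*
     * Paraphrased sketch of the PFN-to-node lookup described above.
     * During early boot with CONFIG_DEFERRED_STRUCT_PAGE_INIT, struct
     * pages may not be initialized yet, so the node id is found by a
     * binary search over the memblock regions (early_pfn_to_nid());
     * otherwise it is read directly from the struct page.
     */
    static int get_nid_for_pfn(unsigned long pfn)
    {
    #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
        if (system_state < SYSTEM_RUNNING)
            return early_pfn_to_nid(pfn);  /* memblock binary search */
    #endif
        return pfn_to_nid(pfn);            /* node id from struct page */
    }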
On large systems, this process can become time-consuming, especially since
we iterate over each `memory block` and all PFNs within it until a match
is found. When CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, the
additional overhead of the binary search increases the execution time
significantly, potentially leading to soft lockups during boot.
In this patch, we instead iterate over the `memblock regions` to identify
the `memory blocks` that belong to the current NUMA node. `memblock
regions` are contiguous memory ranges, each associated with a single NUMA
node, and they never span multiple nodes.
for_each_memory_region(r): // r => region
    if (!node_online(r->nid)):
        continue;
    else:
        for_each_memory_block_between(r->base, r->base + r->size - 1):
            do_register_memory_block_under_node(r->nid, mem_blk, MEMINIT_EARLY);
We iterate over all memblock regions, and if the node associated with a
region is online, we calculate the start and end memory blocks from the
region's start and end PFNs. We then register all the memory blocks
within that range under the region's node.
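
A minimal C sketch of this region walk is shown below, modeled on the
pseudocode above. Helper names such as phys_to_block_id(),
find_memory_block_by_id() and memblock_get_region_node() follow existing
kernel helpers, but the exact wiring is illustrative, not the literal
patch:

    /*
     * Sketch of the region-based registration described above. Each
     * memblock region belongs to exactly one node, so every memory
     * block it covers can be registered without probing individual
     * PFNs. Treat the details as illustrative.
     */
    static void __init register_memory_blocks_under_nodes(void)
    {
        struct memblock_region *r;

        for_each_mem_region(r) {
            const unsigned long start_block_id = phys_to_block_id(r->base);
            const unsigned long end_block_id =
                phys_to_block_id(r->base + r->size - 1);
            const int nid = memblock_get_region_node(r);
            unsigned long block_id;

            if (!node_online(nid))
                continue;

            for (block_id = start_block_id; block_id <= end_block_id;
                 block_id++) {
                struct memory_block *mem = find_memory_block_by_id(block_id);

                if (!mem)
                    continue;
                do_register_memory_block_under_node(nid, mem, MEMINIT_EARLY);
                put_device(&mem->dev);  /* drop the lookup's reference */
            }
        }
    }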
Test Results on My system with 32TB RAM
=======================================
1. Boot time with CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled.
Without this patch
------------------
Startup finished in 1min 16.528s (kernel)
With this patch
---------------
Startup finished in 17.236s (kernel) - 78% Improvement
2. Boot time with CONFIG_DEFERRED_STRUCT_PAGE_INIT disabled.
Without this patch
------------------
Startup finished in 28.320s (kernel)
With this patch
---------------
Startup finished in 15.621s (kernel) - 46% Improvement
[donettom@linux.ibm.com: restore removed extra line]
Link: https://lkml.kernel.org/r/20250609140354.467908-1-donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/2a0a05c2dffc62a742bf1dd030098be4ce99be28.1748452241.git.donettom@linux.ibm.com
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/node.h - generic node definition
 *
 * This is mainly for topological representation. We define the
 * basic 'struct node' here, which can be embedded in per-arch
 * definitions of processors.
 *
 * Basic handling of the devices is done in drivers/base/node.c
 * and system devices are handled in drivers/base/sys.c.
 *
 * Nodes are exported via driverfs in the class/node/devices/
 * directory.
 */
#ifndef _LINUX_NODE_H_
#define _LINUX_NODE_H_

#include <linux/device.h>
#include <linux/list.h>

/**
 * struct access_coordinate - generic performance coordinates container
 *
 * @read_bandwidth:	Read bandwidth in MB/s
 * @write_bandwidth:	Write bandwidth in MB/s
 * @read_latency:	Read latency in nanoseconds
 * @write_latency:	Write latency in nanoseconds
 */
struct access_coordinate {
	unsigned int read_bandwidth;
	unsigned int write_bandwidth;
	unsigned int read_latency;
	unsigned int write_latency;
};

/*
 * ACCESS_COORDINATE_LOCAL correlates to ACCESS CLASS 0
 *	- access_coordinate between target node and nearest initiator node
 * ACCESS_COORDINATE_CPU correlates to ACCESS CLASS 1
 *	- access_coordinate between target node and nearest CPU node
 */
enum access_coordinate_class {
	ACCESS_COORDINATE_LOCAL,
	ACCESS_COORDINATE_CPU,
	ACCESS_COORDINATE_MAX
};

enum cache_indexing {
	NODE_CACHE_DIRECT_MAP,
	NODE_CACHE_INDEXED,
	NODE_CACHE_OTHER,
};

enum cache_write_policy {
	NODE_CACHE_WRITE_BACK,
	NODE_CACHE_WRITE_THROUGH,
	NODE_CACHE_WRITE_OTHER,
};

enum cache_mode {
	NODE_CACHE_ADDR_MODE_RESERVED,
	NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR,
};

/**
 * struct node_cache_attrs - system memory caching attributes
 *
 * @indexing:		The ways memory blocks may be placed in cache
 * @write_policy:	Write back or write through policy
 * @size:		Total size of cache in bytes
 * @line_size:		Number of bytes fetched on a cache miss
 * @level:		The cache hierarchy level
 * @address_mode:	The address mode
 */
struct node_cache_attrs {
	enum cache_indexing indexing;
	enum cache_write_policy write_policy;
	u64 size;
	u16 line_size;
	u8 level;
	u16 address_mode;
};

#ifdef CONFIG_HMEM_REPORTING
void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs);
void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord,
			 enum access_coordinate_class access);
#else
static inline void node_add_cache(unsigned int nid,
				  struct node_cache_attrs *cache_attrs)
{
}

static inline void node_set_perf_attrs(unsigned int nid,
				       struct access_coordinate *coord,
				       enum access_coordinate_class access)
{
}
#endif

struct node {
	struct device dev;
	struct list_head access_list;
#ifdef CONFIG_HMEM_REPORTING
	struct list_head cache_attrs;
	struct device *cache_dev;
#endif
};

struct memory_block;
extern struct node *node_devices[];

#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA)
void register_memory_blocks_under_node(int nid, unsigned long start_pfn,
				       unsigned long end_pfn,
				       enum meminit_context context);
void register_memory_blocks_under_nodes(void);
#else
static inline void register_memory_blocks_under_node(int nid, unsigned long start_pfn,
						     unsigned long end_pfn,
						     enum meminit_context context)
{
}

static inline void register_memory_blocks_under_nodes(void)
{
}
#endif

extern void unregister_node(struct node *node);
#ifdef CONFIG_NUMA
extern void node_dev_init(void);
/* Core of the node registration - only memory hotplug should use this */
extern int __register_one_node(int nid);

/* Registers an online node */
static inline int register_one_node(int nid)
{
	int error = 0;

	if (node_online(nid)) {
		struct pglist_data *pgdat = NODE_DATA(nid);
		unsigned long start_pfn = pgdat->node_start_pfn;
		unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;

		error = __register_one_node(nid);
		if (error)
			return error;
		register_memory_blocks_under_node(nid, start_pfn, end_pfn,
						  MEMINIT_EARLY);
	}

	return error;
}

extern void unregister_one_node(int nid);
extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk);

extern int register_memory_node_under_compute_node(unsigned int mem_nid,
						   unsigned int cpu_nid,
						   enum access_coordinate_class access);
#else
static inline void node_dev_init(void)
{
}
static inline int __register_one_node(int nid)
{
	return 0;
}
static inline int register_one_node(int nid)
{
	return 0;
}
static inline int unregister_one_node(int nid)
{
	return 0;
}
static inline int register_cpu_under_node(unsigned int cpu, unsigned int nid)
{
	return 0;
}
static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
{
	return 0;
}
static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
{
}
#endif

#define to_node(device) container_of(device, struct node, dev)

#endif /* _LINUX_NODE_H_ */