mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-01-18 20:40:22 +00:00
Since commit 3e32cb2e0a12 ("mm: memcontrol: lockless page counters"),
memcg->memsw->watermark and memcg->memsw->failcnt can be accessed
concurrently, as reported by KCSAN:
BUG: KCSAN: data-race in page_counter_try_charge / page_counter_try_charge
read to 0xffff8fb18c4cd190 of 8 bytes by task 1081 on cpu 59:
page_counter_try_charge+0x4d/0x150 mm/page_counter.c:138
try_charge+0x131/0xd50 mm/memcontrol.c:2405
__memcg_kmem_charge_memcg+0x58/0x140
__memcg_kmem_charge+0xcc/0x280
__alloc_pages_nodemask+0x1e1/0x450
alloc_pages_current+0xa6/0x120
pte_alloc_one+0x17/0xd0
__pte_alloc+0x3a/0x1f0
copy_p4d_range+0xc36/0x1990
copy_page_range+0x21d/0x360
dup_mmap+0x5f5/0x7a0
dup_mm+0xa2/0x240
copy_process+0x1b3f/0x3460
_do_fork+0xaa/0xa20
__x64_sys_clone+0x13b/0x170
do_syscall_64+0x91/0xb47
entry_SYSCALL_64_after_hwframe+0x49/0xbe
write to 0xffff8fb18c4cd190 of 8 bytes by task 1153 on cpu 120:
page_counter_try_charge+0x5b/0x150 mm/page_counter.c:139
try_charge+0x131/0xd50 mm/memcontrol.c:2405
mem_cgroup_try_charge+0x159/0x460
mem_cgroup_try_charge_delay+0x3d/0xa0
wp_page_copy+0x14d/0x930
do_wp_page+0x107/0x7b0
__handle_mm_fault+0xce6/0xd40
handle_mm_fault+0xfc/0x2f0
do_page_fault+0x263/0x6f9
page_fault+0x34/0x40
BUG: KCSAN: data-race in page_counter_try_charge / page_counter_try_charge
write to 0xffff88809bbf2158 of 8 bytes by task 11782 on cpu 0:
page_counter_try_charge+0x100/0x170 mm/page_counter.c:129
try_charge+0x185/0xbf0 mm/memcontrol.c:2405
__memcg_kmem_charge_memcg+0x4a/0xe0 mm/memcontrol.c:2837
__memcg_kmem_charge+0xcf/0x1b0 mm/memcontrol.c:2877
__alloc_pages_nodemask+0x26c/0x310 mm/page_alloc.c:4780
read to 0xffff88809bbf2158 of 8 bytes by task 11814 on cpu 1:
page_counter_try_charge+0xef/0x170 mm/page_counter.c:129
try_charge+0x185/0xbf0 mm/memcontrol.c:2405
__memcg_kmem_charge_memcg+0x4a/0xe0 mm/memcontrol.c:2837
__memcg_kmem_charge+0xcf/0x1b0 mm/memcontrol.c:2877
__alloc_pages_nodemask+0x26c/0x310 mm/page_alloc.c:4780
Since watermark could be compared or set to garbage due to a data race
which would change the code logic, fix it by adding a pair of READ_ONCE()
and WRITE_ONCE() in those places.
The "failcnt" counter is tolerant of some degree of inaccuracy and is only
used to report stats, a data race will not be harmful, thus mark it as an
intentional data race using the data_race() macro.
Fixes: 3e32cb2e0a12 ("mm: memcontrol: lockless page counters")
Reported-by: syzbot+f36cfe60b1006a94f9dc@syzkaller.appspotmail.com
Signed-off-by: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Link: http://lkml.kernel.org/r/1581519682-23594-1-git-send-email-cai@lca.pw
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
263 lines
6.6 KiB
C
263 lines
6.6 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Lockless hierarchical page accounting & limiting
|
|
*
|
|
* Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
|
|
*/
|
|
|
|
#include <linux/page_counter.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/string.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/bug.h>
|
|
#include <asm/page.h>
|
|
|
|
static void propagate_protected_usage(struct page_counter *c,
|
|
unsigned long usage)
|
|
{
|
|
unsigned long protected, old_protected;
|
|
unsigned long low, min;
|
|
long delta;
|
|
|
|
if (!c->parent)
|
|
return;
|
|
|
|
min = READ_ONCE(c->min);
|
|
if (min || atomic_long_read(&c->min_usage)) {
|
|
protected = min(usage, min);
|
|
old_protected = atomic_long_xchg(&c->min_usage, protected);
|
|
delta = protected - old_protected;
|
|
if (delta)
|
|
atomic_long_add(delta, &c->parent->children_min_usage);
|
|
}
|
|
|
|
low = READ_ONCE(c->low);
|
|
if (low || atomic_long_read(&c->low_usage)) {
|
|
protected = min(usage, low);
|
|
old_protected = atomic_long_xchg(&c->low_usage, protected);
|
|
delta = protected - old_protected;
|
|
if (delta)
|
|
atomic_long_add(delta, &c->parent->children_low_usage);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* page_counter_cancel - take pages out of the local counter
|
|
* @counter: counter
|
|
* @nr_pages: number of pages to cancel
|
|
*/
|
|
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
|
|
{
|
|
long new;
|
|
|
|
new = atomic_long_sub_return(nr_pages, &counter->usage);
|
|
propagate_protected_usage(counter, new);
|
|
/* More uncharges than charges? */
|
|
WARN_ON_ONCE(new < 0);
|
|
}
|
|
|
|
/**
|
|
* page_counter_charge - hierarchically charge pages
|
|
* @counter: counter
|
|
* @nr_pages: number of pages to charge
|
|
*
|
|
* NOTE: This does not consider any configured counter limits.
|
|
*/
|
|
void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
|
|
{
|
|
struct page_counter *c;
|
|
|
|
for (c = counter; c; c = c->parent) {
|
|
long new;
|
|
|
|
new = atomic_long_add_return(nr_pages, &c->usage);
|
|
propagate_protected_usage(c, new);
|
|
/*
|
|
* This is indeed racy, but we can live with some
|
|
* inaccuracy in the watermark.
|
|
*/
|
|
if (new > READ_ONCE(c->watermark))
|
|
WRITE_ONCE(c->watermark, new);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* page_counter_try_charge - try to hierarchically charge pages
|
|
* @counter: counter
|
|
* @nr_pages: number of pages to charge
|
|
* @fail: points first counter to hit its limit, if any
|
|
*
|
|
* Returns %true on success, or %false and @fail if the counter or one
|
|
* of its ancestors has hit its configured limit.
|
|
*/
|
|
bool page_counter_try_charge(struct page_counter *counter,
|
|
unsigned long nr_pages,
|
|
struct page_counter **fail)
|
|
{
|
|
struct page_counter *c;
|
|
|
|
for (c = counter; c; c = c->parent) {
|
|
long new;
|
|
/*
|
|
* Charge speculatively to avoid an expensive CAS. If
|
|
* a bigger charge fails, it might falsely lock out a
|
|
* racing smaller charge and send it into reclaim
|
|
* early, but the error is limited to the difference
|
|
* between the two sizes, which is less than 2M/4M in
|
|
* case of a THP locking out a regular page charge.
|
|
*
|
|
* The atomic_long_add_return() implies a full memory
|
|
* barrier between incrementing the count and reading
|
|
* the limit. When racing with page_counter_limit(),
|
|
* we either see the new limit or the setter sees the
|
|
* counter has changed and retries.
|
|
*/
|
|
new = atomic_long_add_return(nr_pages, &c->usage);
|
|
if (new > c->max) {
|
|
atomic_long_sub(nr_pages, &c->usage);
|
|
propagate_protected_usage(c, new);
|
|
/*
|
|
* This is racy, but we can live with some
|
|
* inaccuracy in the failcnt which is only used
|
|
* to report stats.
|
|
*/
|
|
data_race(c->failcnt++);
|
|
*fail = c;
|
|
goto failed;
|
|
}
|
|
propagate_protected_usage(c, new);
|
|
/*
|
|
* Just like with failcnt, we can live with some
|
|
* inaccuracy in the watermark.
|
|
*/
|
|
if (new > READ_ONCE(c->watermark))
|
|
WRITE_ONCE(c->watermark, new);
|
|
}
|
|
return true;
|
|
|
|
failed:
|
|
for (c = counter; c != *fail; c = c->parent)
|
|
page_counter_cancel(c, nr_pages);
|
|
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* page_counter_uncharge - hierarchically uncharge pages
|
|
* @counter: counter
|
|
* @nr_pages: number of pages to uncharge
|
|
*/
|
|
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
|
|
{
|
|
struct page_counter *c;
|
|
|
|
for (c = counter; c; c = c->parent)
|
|
page_counter_cancel(c, nr_pages);
|
|
}
|
|
|
|
/**
|
|
* page_counter_set_max - set the maximum number of pages allowed
|
|
* @counter: counter
|
|
* @nr_pages: limit to set
|
|
*
|
|
* Returns 0 on success, -EBUSY if the current number of pages on the
|
|
* counter already exceeds the specified limit.
|
|
*
|
|
* The caller must serialize invocations on the same counter.
|
|
*/
|
|
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
|
|
{
|
|
for (;;) {
|
|
unsigned long old;
|
|
long usage;
|
|
|
|
/*
|
|
* Update the limit while making sure that it's not
|
|
* below the concurrently-changing counter value.
|
|
*
|
|
* The xchg implies two full memory barriers before
|
|
* and after, so the read-swap-read is ordered and
|
|
* ensures coherency with page_counter_try_charge():
|
|
* that function modifies the count before checking
|
|
* the limit, so if it sees the old limit, we see the
|
|
* modified counter and retry.
|
|
*/
|
|
usage = atomic_long_read(&counter->usage);
|
|
|
|
if (usage > nr_pages)
|
|
return -EBUSY;
|
|
|
|
old = xchg(&counter->max, nr_pages);
|
|
|
|
if (atomic_long_read(&counter->usage) <= usage)
|
|
return 0;
|
|
|
|
counter->max = old;
|
|
cond_resched();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* page_counter_set_min - set the amount of protected memory
|
|
* @counter: counter
|
|
* @nr_pages: value to set
|
|
*
|
|
* The caller must serialize invocations on the same counter.
|
|
*/
|
|
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
|
|
{
|
|
struct page_counter *c;
|
|
|
|
WRITE_ONCE(counter->min, nr_pages);
|
|
|
|
for (c = counter; c; c = c->parent)
|
|
propagate_protected_usage(c, atomic_long_read(&c->usage));
|
|
}
|
|
|
|
/**
|
|
* page_counter_set_low - set the amount of protected memory
|
|
* @counter: counter
|
|
* @nr_pages: value to set
|
|
*
|
|
* The caller must serialize invocations on the same counter.
|
|
*/
|
|
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
|
|
{
|
|
struct page_counter *c;
|
|
|
|
WRITE_ONCE(counter->low, nr_pages);
|
|
|
|
for (c = counter; c; c = c->parent)
|
|
propagate_protected_usage(c, atomic_long_read(&c->usage));
|
|
}
|
|
|
|
/**
|
|
* page_counter_memparse - memparse() for page counter limits
|
|
* @buf: string to parse
|
|
* @max: string meaning maximum possible value
|
|
* @nr_pages: returns the result in number of pages
|
|
*
|
|
* Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
|
|
* limited to %PAGE_COUNTER_MAX.
|
|
*/
|
|
int page_counter_memparse(const char *buf, const char *max,
|
|
unsigned long *nr_pages)
|
|
{
|
|
char *end;
|
|
u64 bytes;
|
|
|
|
if (!strcmp(buf, max)) {
|
|
*nr_pages = PAGE_COUNTER_MAX;
|
|
return 0;
|
|
}
|
|
|
|
bytes = memparse(buf, &end);
|
|
if (*end != '\0')
|
|
return -EINVAL;
|
|
|
|
*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
|
|
|
|
return 0;
|
|
}
|