torvalds-linux/include/net/cls_cgroup.h
Yafang Shao 66048f8b3c net/cls_cgroup: Fix task_get_classid() during qdisc run
During recent testing with the netem qdisc, used to inject delays into TCP
traffic, we observed that our CLS BPF program failed to function correctly
because task_get_classid() returned an incorrect classid. The issue
manifests in the following call stack:

        bpf_get_cgroup_classid+5
        cls_bpf_classify+507
        __tcf_classify+90
        tcf_classify+217
        __dev_queue_xmit+798
        bond_dev_queue_xmit+43
        __bond_start_xmit+211
        bond_start_xmit+70
        dev_hard_start_xmit+142
        sch_direct_xmit+161
        __qdisc_run+102             <<<<< Issue location
        __dev_xmit_skb+1015
        __dev_queue_xmit+637
        neigh_hh_output+159
        ip_finish_output2+461
        __ip_finish_output+183
        ip_finish_output+41
        ip_output+120
        ip_local_out+94
        __ip_queue_xmit+394
        ip_queue_xmit+21
        __tcp_transmit_skb+2169
        tcp_write_xmit+959
        __tcp_push_pending_frames+55
        tcp_push+264
        tcp_sendmsg_locked+661
        tcp_sendmsg+45
        inet_sendmsg+67
        sock_sendmsg+98
        sock_write_iter+147
        vfs_write+786
        ksys_write+181
        __x64_sys_write+25
        do_syscall_64+56
        entry_SYSCALL_64_after_hwframe+100

The problem occurs when multiple tasks share a single qdisc. In such cases,
__qdisc_run() may transmit skbs created by different tasks. Consequently,
task_get_classid() retrieves an incorrect classid since it references the
current task's context rather than the skb's originating task.

Given that dev_queue_xmit() always executes with bh disabled, we can use
softirq_count() instead to obtain the correct classid.
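For reference, the distinction this relies on can be sketched from the
preempt-count definitions in include/linux/preempt.h (simplified below);
the check being replaced, in_serving_softirq(), only fires while a softirq
handler is actually executing:

  /* Simplified from include/linux/preempt.h: */
  #define softirq_count()      (preempt_count() & SOFTIRQ_MASK)
  #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)

  /*
   * local_bh_disable() adds SOFTIRQ_DISABLE_OFFSET to preempt_count(),
   * so softirq_count() is non-zero both while a softirq handler is
   * running and while bh is merely disabled. That covers the
   * __qdisc_run() path above, where Task-B dequeues Task-A's skb in
   * process context with bh disabled and in_serving_softirq() alone
   * would not fire.
   */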

Simple steps to reproduce the issue:
1. Add network delay to the network interface, e.g.:
     tc qdisc add dev bond0 root netem delay 1.5ms
2. Create two distinct net_cls cgroups, each running a network-intensive task.
3. Initiate parallel TCP streams from both tasks to external servers.

Under this specific condition, the issue reliably occurs. The kernel
eventually dequeues an SKB that originated from Task-A while executing in
the context of Task-B.

It is worth noting that this patch changes the established behavior in a
slightly different scenario:

  <sock S is created by task A>
  <class ID for task A is changed>
  <skb is created by sock S xmit and classified>

Prior to this patch the skb would be classified with the 'new' task A
classid; now it is classified with the old/original one. The
bpf_get_cgroup_classid_curr() function is a more appropriate choice for
that case.
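To make this concrete, here is a minimal cls BPF skeleton of the kind
affected (a sketch only, not the program from the report; the section name
and policy logic are placeholders). For tc programs, the
bpf_get_cgroup_classid() helper resolves through task_get_classid():

  #include <linux/bpf.h>
  #include <linux/pkt_cls.h>
  #include <bpf/bpf_helpers.h>

  SEC("tc")
  int classify_by_classid(struct __sk_buff *skb)
  {
          /* With this fix, a call made via __qdisc_run() returns the
           * classid recorded on the skb's originating socket rather
           * than that of whichever task happens to be running. */
          __u32 classid = bpf_get_cgroup_classid(skb);

          if (!classid)
                  return TC_ACT_OK;

          /* ... apply per-cgroup policy here ... */
          return TC_ACT_OK;
  }

  char _license[] SEC("license") = "GPL";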

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20250902062933.30087-1-laoar.shao@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-09-14 11:55:04 -07:00

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * cls_cgroup.h		Control Group Classifier
 *
 * Authors:	Thomas Graf <tgraf@suug.ch>
 */

#ifndef _NET_CLS_CGROUP_H
#define _NET_CLS_CGROUP_H

#include <linux/cgroup.h>
#include <linux/hardirq.h>
#include <linux/rcupdate.h>
#include <net/sock.h>
#include <net/inet_sock.h>

#ifdef CONFIG_CGROUP_NET_CLASSID
struct cgroup_cls_state {
	struct cgroup_subsys_state css;
	u32 classid;
};

struct cgroup_cls_state *task_cls_state(struct task_struct *p);

static inline u32 task_cls_classid(struct task_struct *p)
{
	u32 classid;

	if (in_interrupt())
		return 0;

	rcu_read_lock();
	classid = container_of(task_css(p, net_cls_cgrp_id),
			       struct cgroup_cls_state, css)->classid;
	rcu_read_unlock();

	return classid;
}

static inline void sock_update_classid(struct sock_cgroup_data *skcd)
{
	u32 classid;

	classid = task_cls_classid(current);
	sock_cgroup_set_classid(skcd, classid);
}

static inline u32 __task_get_classid(struct task_struct *task)
{
	return task_cls_state(task)->classid;
}

static inline u32 task_get_classid(const struct sk_buff *skb)
{
	u32 classid = __task_get_classid(current);

	/* Due to the nature of the classifier it is required to ignore all
	 * packets originating from softirq context as accessing `current'
	 * would lead to false results.
	 *
	 * This test assumes that all callers of dev_queue_xmit() explicitly
	 * disable bh. Knowing this, it is possible to detect softirq based
	 * calls by looking at the number of nested bh disable calls because
	 * softirqs always disable bh.
	 */
	if (softirq_count()) {
		struct sock *sk = skb_to_full_sk(skb);

		/* If there is a sock_cgroup_classid we'll use that. */
		if (!sk || !sk_fullsock(sk))
			return 0;

		classid = sock_cgroup_classid(&sk->sk_cgrp_data);
	}

	return classid;
}
#else /* !CONFIG_CGROUP_NET_CLASSID */
static inline void sock_update_classid(struct sock_cgroup_data *skcd)
{
}

static inline u32 task_get_classid(const struct sk_buff *skb)
{
	return 0;
}
#endif /* CONFIG_CGROUP_NET_CLASSID */
#endif /* _NET_CLS_CGROUP_H */
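
For context, the primary in-tree consumer of task_get_classid() is the
control-group classifier. A condensed sketch of its classify path,
simplified from net/sched/cls_cgroup.c (indirect-call annotations omitted):

/* Classify an skb by the classid resolved via task_get_classid() and
 * hand the result to the filter's extensions.
 */
static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
			       struct tcf_result *res)
{
	struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
	u32 classid = task_get_classid(skb);

	if (unlikely(!head) || !classid)
		return -1;
	if (!tcf_em_tree_match(skb, &head->ematches, NULL))
		return -1;

	res->classid = classid;
	res->class = 0;
	return tcf_exts_exec(skb, &head->exts, res);
}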