perf: attach/detach PMU specific data

The LBR call stack data has to be saved/restored during context switch
to fix the shorter LBR call stack issue in system-wide mode. Allocate
PMU specific data and attach it to the corresponding task_struct during
LBR call stack monitoring.
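For orientation, the consumer of this per-task data is the PMU's
context-switch callback. A rough sketch of the reader side (hypothetical
driver and helper names, not part of this patch; task->perf_ctx_data is
RCU-protected, see the diff below):

	static void my_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
				      struct task_struct *task, bool sched_in)
	{
		struct perf_ctx_data *cd;

		rcu_read_lock();
		cd = rcu_dereference(task->perf_ctx_data);
		if (cd)
			my_pmu_lbr_swap(cd->data, sched_in); /* hypothetical helper */
		rcu_read_unlock();
	}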

When an LBR call stack event is accounted, the perf_ctx_data for the
related tasks is allocated and attached by attach_perf_ctx_data().
When an LBR call stack event is unaccounted, the perf_ctx_data for the
related tasks is detached and freed by detach_perf_ctx_data().
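For reference, the structure being attached (introduced by an earlier
patch in this series; reproduced here as a sketch, field layout
approximate):

	struct perf_ctx_data {
		struct rcu_head		rcu_head;
		refcount_t		refcount;
		int			global;
		struct kmem_cache	*ctx_cache;
		void			*data;
	};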

The LBR call stack event can be either a per-task event or a system-wide
event; a sketch of how a PMU comes to require this per-task data follows
the list below.
- For a per-task event, perf only allocates the perf_ctx_data for the
  current task. If the allocation fails, perf will error out.
- For a system-wide event, perf has to allocate the perf_ctx_data for
  both the existing tasks and the upcoming tasks.
  The allocation for the existing tasks is done in perf_event_alloc().
  If any allocation fails, perf will error out.
  The allocation for the new tasks will be done in perf_event_fork().
  A global reader/writer semaphore, global_ctx_data_rwsem, is added to
  address the global race.
- The perf_ctx_data is only freed by the last LBR call stack event.
  The number of per-task events is tracked by the refcount of each
  task's perf_ctx_data. Since system-wide events impact all tasks, it is
  not practical to walk the whole task list and update the refcount for
  each system-wide event. Instead, the number of system-wide events is
  tracked by a global variable, global_ctx_data_ref.
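
As promised above, a sketch of how a PMU opts in to this machinery
(hypothetical driver code; the diff below only shows the core side). The
PMU supplies a task_ctx_cache and sets PERF_ATTACH_TASK_DATA from its
event_init()->hw_config() path:

	/* at PMU registration (hypothetical driver) */
	my_pmu.task_ctx_cache = kmem_cache_create("my_pmu_task_ctx",
						  sizeof(struct my_pmu_task_ctx),
						  0, 0, NULL);

	/* in the event_init()/hw_config() path */
	static int my_pmu_hw_config(struct perf_event *event)
	{
		if (my_pmu_needs_lbr_callstack(event))	/* hypothetical */
			event->attach_state |= PERF_ATTACH_TASK_DATA;
		return 0;
	}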

Suggested-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250314172700.438923-3-kan.liang@linux.intel.com

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -676,11 +676,12 @@ struct swevent_hlist {
 #define PERF_ATTACH_GROUP	0x0002
 #define PERF_ATTACH_TASK	0x0004
 #define PERF_ATTACH_TASK_DATA	0x0008
-#define PERF_ATTACH_ITRACE	0x0010
+#define PERF_ATTACH_GLOBAL_DATA	0x0010
 #define PERF_ATTACH_SCHED_CB	0x0020
 #define PERF_ATTACH_CHILD	0x0040
 #define PERF_ATTACH_EXCLUSIVE	0x0080
 #define PERF_ATTACH_CALLCHAIN	0x0100
+#define PERF_ATTACH_ITRACE	0x0200
 
 struct bpf_prog;
 struct perf_cgroup;

diff --git a/kernel/events/core.c b/kernel/events/core.c
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -55,6 +55,7 @@
 #include <linux/pgtable.h>
 #include <linux/buildid.h>
 #include <linux/task_work.h>
+#include <linux/percpu-rwsem.h>
 
 #include "internal.h"
@@ -5217,6 +5218,225 @@ static void unaccount_freq_event(void)
 		atomic_dec(&nr_freq_events);
 }
 
+static struct perf_ctx_data *
+alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
+{
+	struct perf_ctx_data *cd;
+
+	cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+	if (!cd)
+		return NULL;
+
+	cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
+	if (!cd->data) {
+		kfree(cd);
+		return NULL;
+	}
+
+	cd->global = global;
+	cd->ctx_cache = ctx_cache;
+	refcount_set(&cd->refcount, 1);
+
+	return cd;
+}
+
+static void free_perf_ctx_data(struct perf_ctx_data *cd)
+{
+	kmem_cache_free(cd->ctx_cache, cd->data);
+	kfree(cd);
+}
+
+static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
+{
+	struct perf_ctx_data *cd;
+
+	cd = container_of(rcu_head, struct perf_ctx_data, rcu_head);
+	free_perf_ctx_data(cd);
+}
+
+static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
+{
+	call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu);
+}
+
+static int
+attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
+		     bool global)
+{
+	struct perf_ctx_data *cd, *old = NULL;
+
+	cd = alloc_perf_ctx_data(ctx_cache, global);
+	if (!cd)
+		return -ENOMEM;
+
+	for (;;) {
+		if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
+			if (old)
+				perf_free_ctx_data_rcu(old);
+			return 0;
+		}
+
+		if (!old) {
+			/*
+			 * After seeing a dead @old, we raced with
+			 * removal and lost, try again to install @cd.
+			 */
+			continue;
+		}
+
+		if (refcount_inc_not_zero(&old->refcount)) {
+			free_perf_ctx_data(cd); /* unused */
+			return 0;
+		}
+
+		/*
+		 * @old is a dead object, refcount==0 is stable, try and
+		 * replace it with @cd.
+		 */
+	}
+	return 0;
+}
+
+static void __detach_global_ctx_data(void);
+DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem);
+static refcount_t global_ctx_data_ref;
+
+static int
+attach_global_ctx_data(struct kmem_cache *ctx_cache)
+{
+	struct task_struct *g, *p;
+	struct perf_ctx_data *cd;
+	int ret;
+
+	if (refcount_inc_not_zero(&global_ctx_data_ref))
+		return 0;
+
+	guard(percpu_write)(&global_ctx_data_rwsem);
+	if (refcount_inc_not_zero(&global_ctx_data_ref))
+		return 0;
+again:
+	/* Allocate everything */
+	scoped_guard (rcu) {
+		for_each_process_thread(g, p) {
+			cd = rcu_dereference(p->perf_ctx_data);
+			if (cd && !cd->global) {
+				cd->global = 1;
+				if (!refcount_inc_not_zero(&cd->refcount))
+					cd = NULL;
+			}
+			if (!cd) {
+				get_task_struct(p);
+				goto alloc;
+			}
+		}
+	}
+	refcount_set(&global_ctx_data_ref, 1);
+
+	return 0;
+
+alloc:
+	ret = attach_task_ctx_data(p, ctx_cache, true);
+	put_task_struct(p);
+	if (ret) {
+		__detach_global_ctx_data();
+		return ret;
+	}
+
+	goto again;
+}
+
+static int
+attach_perf_ctx_data(struct perf_event *event)
+{
+	struct task_struct *task = event->hw.target;
+	struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache;
+	int ret;
+
+	if (!ctx_cache)
+		return -ENOMEM;
+
+	if (task)
+		return attach_task_ctx_data(task, ctx_cache, false);
+
+	ret = attach_global_ctx_data(ctx_cache);
+	if (ret)
+		return ret;
+
+	event->attach_state |= PERF_ATTACH_GLOBAL_DATA;
+	return 0;
+}
+
+static void
+detach_task_ctx_data(struct task_struct *p)
+{
+	struct perf_ctx_data *cd;
+
+	scoped_guard (rcu) {
+		cd = rcu_dereference(p->perf_ctx_data);
+		if (!cd || !refcount_dec_and_test(&cd->refcount))
+			return;
+	}
+
+	/*
+	 * The old ctx_data may be lost because of the race.
+	 * Nothing is required to do for the case.
+	 * See attach_task_ctx_data().
+	 */
+	if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL))
+		perf_free_ctx_data_rcu(cd);
+}
+
+static void __detach_global_ctx_data(void)
+{
+	struct task_struct *g, *p;
+	struct perf_ctx_data *cd;
+
+again:
+	scoped_guard (rcu) {
+		for_each_process_thread(g, p) {
+			cd = rcu_dereference(p->perf_ctx_data);
+			if (!cd || !cd->global)
+				continue;
+			cd->global = 0;
+			get_task_struct(p);
+			goto detach;
+		}
+	}
+	return;
+
+detach:
+	detach_task_ctx_data(p);
+	put_task_struct(p);
+	goto again;
+}
+
+static void detach_global_ctx_data(void)
+{
+	if (refcount_dec_not_one(&global_ctx_data_ref))
+		return;
+
+	guard(percpu_write)(&global_ctx_data_rwsem);
+	if (!refcount_dec_and_test(&global_ctx_data_ref))
+		return;
+
+	/* remove everything */
+	__detach_global_ctx_data();
+}
+
+static void detach_perf_ctx_data(struct perf_event *event)
+{
+	struct task_struct *task = event->hw.target;
+
+	event->attach_state &= ~PERF_ATTACH_TASK_DATA;
+
+	if (task)
+		return detach_task_ctx_data(task);
+
+	if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) {
+		detach_global_ctx_data();
+		event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA;
+	}
+}
+
 static void unaccount_event(struct perf_event *event)
 {
 	bool dec = false;
@@ -5398,6 +5618,9 @@ static void __free_event(struct perf_event *event)
 	if (is_cgroup_event(event))
 		perf_detach_cgroup(event);
 
+	if (event->attach_state & PERF_ATTACH_TASK_DATA)
+		detach_perf_ctx_data(event);
+
 	if (event->destroy)
 		event->destroy(event);
@@ -8607,10 +8830,58 @@ static void perf_event_task(struct task_struct *task,
 		       task_ctx);
 }
 
+/*
+ * Allocate data for a new task when profiling system-wide
+ * events which require PMU specific data
+ */
+static void
+perf_event_alloc_task_data(struct task_struct *child,
+			   struct task_struct *parent)
+{
+	struct kmem_cache *ctx_cache = NULL;
+	struct perf_ctx_data *cd;
+
+	if (!refcount_read(&global_ctx_data_ref))
+		return;
+
+	scoped_guard (rcu) {
+		cd = rcu_dereference(parent->perf_ctx_data);
+		if (cd)
+			ctx_cache = cd->ctx_cache;
+	}
+
+	if (!ctx_cache)
+		return;
+
+	guard(percpu_read)(&global_ctx_data_rwsem);
+	scoped_guard (rcu) {
+		cd = rcu_dereference(child->perf_ctx_data);
+		if (!cd) {
+			/*
+			 * A system-wide event may be unaccounted while
+			 * the perf_ctx_data is being attached.
+			 */
+			if (!refcount_read(&global_ctx_data_ref))
+				return;
+			goto attach;
+		}
+
+		if (!cd->global) {
+			cd->global = 1;
+			refcount_inc(&cd->refcount);
+		}
+	}
+
+	return;
+attach:
+	attach_task_ctx_data(child, ctx_cache, true);
+}
+
 void perf_event_fork(struct task_struct *task)
 {
 	perf_event_task(task, NULL, 1);
 	perf_event_namespaces(task);
+	perf_event_alloc_task_data(task, current);
 }
 
 /*
@@ -12490,6 +12761,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (IS_ERR(pmu))
 		return (void*)pmu;
 
+	/*
+	 * PERF_ATTACH_TASK_DATA is set by the event_init()->hw_config() path.
+	 * The attach should happen right after perf_init_event(); otherwise
+	 * __free_event() would mistakenly detach a nonexistent perf_ctx_data
+	 * when some other error occurs in between.
+	 */
+	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+		err = attach_perf_ctx_data(event);
+		if (err)
+			return ERR_PTR(err);
+	}
+
 	/*
 	 * Disallow uncore-task events. Similarly, disallow uncore-cgroup
 	 * events (they don't make sense as the cgroup will be different
@@ -13637,6 +13920,12 @@ void perf_event_exit_task(struct task_struct *child)
 	 * At this point we need to send EXIT events to cpu contexts.
 	 */
 	perf_event_task(child, NULL, 0);
+
+	/*
+	 * Detach the perf_ctx_data for the system-wide event.
+	 */
+	guard(percpu_read)(&global_ctx_data_rwsem);
+	detach_task_ctx_data(child);
 }
 
 static void perf_free_event(struct perf_event *event,