perf trace: Reorganize syscalls

Identify struct syscall information in the syscalls table by a machine
type and syscall number, not just system call number. Having the
machine type means that 32-bit system calls can be differentiated from
64-bit ones on a machine capable of both. Having a table for all
machine types and all system call numbers would be too large, so
maintain a sorted array of system calls as they are encountered.

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Howard Chu <howardchu95@gmail.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Acked-by: Arnaldo Carvalho de Melo <acme@kernel.org>
Link: https://lore.kernel.org/r/20250319050741.269828-5-irogers@google.com
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
This commit is contained in:
Ian Rogers 2025-03-18 22:07:31 -07:00 committed by Namhyung Kim
parent af472d3c44
commit 3d94b8441c

View File

@ -66,6 +66,7 @@
#include "syscalltbl.h"
#include "../perf.h"
#include "trace_augment.h"
#include "dwarf-regs.h"
#include <errno.h>
#include <inttypes.h>
@ -86,6 +87,7 @@
#include <linux/ctype.h>
#include <perf/mmap.h>
#include <tools/libc_compat.h>
#ifdef HAVE_LIBTRACEEVENT
#include <event-parse.h>
@ -149,7 +151,10 @@ struct trace {
struct perf_tool tool;
struct syscalltbl *sctbl;
struct {
/** Sorted syscall numbers used by the trace. */
struct syscall *table;
/** Size of table. */
size_t table_size;
struct {
struct evsel *sys_enter,
*sys_exit,
@ -1454,22 +1459,37 @@ static const struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
return __syscall_fmt__find_by_alias(syscall_fmts, nmemb, alias);
}
/*
* is_exit: is this "exit" or "exit_group"?
* is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
* args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
* nonexistent: Just a hole in the syscall table, syscall id not allocated
/**
* struct syscall
*/
struct syscall {
/** @e_machine: The ELF machine associated with the entry. */
int e_machine;
/** @id: id value from the tracepoint, the system call number. */
int id;
struct tep_event *tp_format;
int nr_args;
/**
* @args_size: sum of the sizes of the syscall arguments, anything
* after that is augmented stuff: pathname for openat, etc.
*/
int args_size;
struct {
struct bpf_program *sys_enter,
*sys_exit;
} bpf_prog;
/** @is_exit: is this "exit" or "exit_group"? */
bool is_exit;
/**
* @is_open: is this "open" or "openat"? To associate the fd returned in
* sys_exit with the pathname in sys_enter.
*/
bool is_open;
/**
* @nonexistent: Name lookup failed. Just a hole in the syscall table,
* syscall id not allocated.
*/
bool nonexistent;
bool use_btf;
struct tep_format_field *args;
@ -2107,22 +2127,21 @@ static int syscall__set_arg_fmts(struct syscall *sc)
return 0;
}
static int trace__read_syscall_info(struct trace *trace, int id)
static int syscall__read_info(struct syscall *sc, struct trace *trace)
{
char tp_name[128];
struct syscall *sc;
const char *name = syscalltbl__name(trace->sctbl, id);
const char *name;
int err;
if (trace->syscalls.table == NULL) {
trace->syscalls.table = calloc(trace->sctbl->syscalls.max_id + 1, sizeof(*sc));
if (trace->syscalls.table == NULL)
return -ENOMEM;
}
sc = trace->syscalls.table + id;
if (sc->nonexistent)
return -EEXIST;
if (sc->name) {
/* Info already read. */
return 0;
}
name = syscalltbl__name(trace->sctbl, sc->id);
if (name == NULL) {
sc->nonexistent = true;
return -EEXIST;
@ -2145,15 +2164,16 @@ static int trace__read_syscall_info(struct trace *trace, int id)
*/
if (IS_ERR(sc->tp_format)) {
sc->nonexistent = true;
return PTR_ERR(sc->tp_format);
err = PTR_ERR(sc->tp_format);
sc->tp_format = NULL;
return err;
}
/*
* The tracepoint format contains __syscall_nr field, so it's one more
* than the actual number of syscall arguments.
*/
if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ?
RAW_SYSCALL_ARGS_NUM : sc->tp_format->format.nr_fields - 1))
if (syscall__alloc_arg_fmts(sc, sc->tp_format->format.nr_fields - 1))
return -ENOMEM;
sc->args = sc->tp_format->format.fields;
@ -2442,13 +2462,69 @@ next_arg:
return printed;
}
/*
 * Prepare a table slot for use: wipe every byte of @sc, then record the
 * (e_machine, id) pair that identifies the entry.
 */
static void syscall__init(struct syscall *sc, int e_machine, int id)
{
	memset(sc, 0, sizeof(struct syscall));
	sc->id = id;
	sc->e_machine = e_machine;
}
/*
 * Tear down a syscall entry by handing its arg_fmt member to zfree().
 * Passing a NULL @sc is allowed and does nothing.
 */
static void syscall__exit(struct syscall *sc)
{
	if (sc != NULL)
		zfree(&sc->arg_fmt);
}
/**
 * syscall__cmp - Three-way comparison of two struct syscall entries.
 * @va: pointer to the first struct syscall.
 * @vb: pointer to the second struct syscall.
 *
 * Entries are ordered by e_machine first and, for equal machines, by id.
 * The signature matches the comparator expected by qsort() and bsearch().
 *
 * Return: a negative value if @va sorts before @vb, a positive value if it
 * sorts after, and 0 when both machine and id are equal.
 */
static int syscall__cmp(const void *va, const void *vb)
{
	const struct syscall *a = va, *b = vb;

	/*
	 * Compare instead of subtracting: the difference of two ints can
	 * overflow when the operands have opposite signs, which is undefined
	 * behaviour and would yield an inconsistent ordering.
	 */
	if (a->e_machine != b->e_machine)
		return a->e_machine < b->e_machine ? -1 : 1;

	if (a->id != b->id)
		return a->id < b->id ? -1 : 1;

	return 0;
}
static struct syscall *trace__find_syscall(struct trace *trace, int e_machine, int id)
{
struct syscall key = {
.e_machine = e_machine,
.id = id,
};
struct syscall *sc, *tmp;
if (trace->syscalls.table) {
sc = bsearch(&key, trace->syscalls.table, trace->syscalls.table_size,
sizeof(struct syscall), syscall__cmp);
if (sc)
return sc;
}
tmp = reallocarray(trace->syscalls.table, trace->syscalls.table_size + 1,
sizeof(struct syscall));
if (!tmp)
return NULL;
trace->syscalls.table = tmp;
sc = &trace->syscalls.table[trace->syscalls.table_size++];
syscall__init(sc, e_machine, id);
qsort(trace->syscalls.table, trace->syscalls.table_size, sizeof(struct syscall),
syscall__cmp);
sc = bsearch(&key, trace->syscalls.table, trace->syscalls.table_size,
sizeof(struct syscall), syscall__cmp);
return sc;
}
typedef int (*tracepoint_handler)(struct trace *trace, struct evsel *evsel,
union perf_event *event,
struct perf_sample *sample);
static struct syscall *trace__syscall_info(struct trace *trace,
struct evsel *evsel, int id)
static struct syscall *trace__syscall_info(struct trace *trace, struct evsel *evsel,
int e_machine, int id)
{
struct syscall *sc;
int err = 0;
if (id < 0) {
@ -2473,28 +2549,20 @@ static struct syscall *trace__syscall_info(struct trace *trace,
err = -EINVAL;
if (id > trace->sctbl->syscalls.max_id) {
goto out_cant_read;
}
sc = trace__find_syscall(trace, e_machine, id);
if (sc)
err = syscall__read_info(sc, trace);
if ((trace->syscalls.table == NULL || trace->syscalls.table[id].name == NULL) &&
(err = trace__read_syscall_info(trace, id)) != 0)
goto out_cant_read;
if (trace->syscalls.table && trace->syscalls.table[id].nonexistent)
goto out_cant_read;
return &trace->syscalls.table[id];
out_cant_read:
if (verbose > 0) {
if (err && verbose > 0) {
char sbuf[STRERR_BUFSIZE];
fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, str_error_r(-err, sbuf, sizeof(sbuf)));
if (id <= trace->sctbl->syscalls.max_id && trace->syscalls.table[id].name != NULL)
fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err,
str_error_r(-err, sbuf, sizeof(sbuf)));
if (sc && sc->name)
fprintf(trace->output, "(%s)", sc->name);
fputs(" information\n", trace->output);
}
return NULL;
return err ? NULL : sc;
}
struct syscall_stats {
@ -2643,14 +2711,6 @@ static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sam
return NULL;
}
/*
 * Tear down a syscall entry: passes the entry's arg_fmt member to zfree().
 * A NULL @sc is accepted and ignored.
 */
static void syscall__exit(struct syscall *sc)
{
if (!sc)
return;
zfree(&sc->arg_fmt);
}
static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
union perf_event *event __maybe_unused,
struct perf_sample *sample)
@ -2662,7 +2722,7 @@ static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
int augmented_args_size = 0;
void *augmented_args = NULL;
struct syscall *sc = trace__syscall_info(trace, evsel, id);
struct syscall *sc = trace__syscall_info(trace, evsel, EM_HOST, id);
struct thread_trace *ttrace;
if (sc == NULL)
@ -2736,7 +2796,7 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel,
struct thread_trace *ttrace;
struct thread *thread;
int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
struct syscall *sc = trace__syscall_info(trace, evsel, id);
struct syscall *sc = trace__syscall_info(trace, evsel, EM_HOST, id);
char msg[1024];
void *args, *augmented_args = NULL;
int augmented_args_size;
@ -2811,7 +2871,7 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
struct thread *thread;
int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
int alignment = trace->args_alignment;
struct syscall *sc = trace__syscall_info(trace, evsel, id);
struct syscall *sc = trace__syscall_info(trace, evsel, EM_HOST, id);
struct thread_trace *ttrace;
if (sc == NULL)
@ -3164,7 +3224,7 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel,
if (evsel == trace->syscalls.events.bpf_output) {
int id = perf_evsel__sc_tp_uint(evsel, id, sample);
struct syscall *sc = trace__syscall_info(trace, evsel, id);
struct syscall *sc = trace__syscall_info(trace, evsel, EM_HOST, id);
if (sc) {
fprintf(trace->output, "%s(", sc->name);
@ -3673,7 +3733,7 @@ out_unaugmented:
static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
{
struct syscall *sc = trace__syscall_info(trace, NULL, id);
struct syscall *sc = trace__syscall_info(trace, NULL, EM_HOST, id);
if (sc == NULL)
return;
@ -3684,20 +3744,20 @@ static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id)
{
struct syscall *sc = trace__syscall_info(trace, NULL, id);
struct syscall *sc = trace__syscall_info(trace, NULL, EM_HOST, id);
return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
}
static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id)
{
struct syscall *sc = trace__syscall_info(trace, NULL, id);
struct syscall *sc = trace__syscall_info(trace, NULL, EM_HOST, id);
return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->skel->progs.syscall_unaugmented);
}
static int trace__bpf_sys_enter_beauty_map(struct trace *trace, int key, unsigned int *beauty_array)
{
struct tep_format_field *field;
struct syscall *sc = trace__syscall_info(trace, NULL, key);
struct syscall *sc = trace__syscall_info(trace, NULL, EM_HOST, key);
const struct btf_type *bt;
char *struct_offset, *tmp, name[32];
bool can_augment = false;
@ -3779,13 +3839,14 @@ static int trace__bpf_sys_enter_beauty_map(struct trace *trace, int key, unsigne
return -1;
}
static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *_sc)
{
struct syscall sc = *_sc; /* Copy as trace__syscall_info may invalidate pointer. */
struct tep_format_field *field, *candidate_field;
/*
* We're only interested in syscalls that have a pointer:
*/
for (field = sc->args; field; field = field->next) {
for (field = sc.args; field; field = field->next) {
if (field->flags & TEP_FIELD_IS_POINTER)
goto try_to_find_pair;
}
@ -3795,15 +3856,16 @@ static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace
try_to_find_pair:
for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
int id = syscalltbl__id_at_idx(trace->sctbl, i);
struct syscall *pair = trace__syscall_info(trace, NULL, id);
/* calling trace__syscall_info() may invalidate '_sc' */
struct syscall *pair = trace__syscall_info(trace, NULL, sc.e_machine, id);
struct bpf_program *pair_prog;
bool is_candidate = false;
if (pair == NULL || pair == sc ||
if (pair == NULL || pair->id == sc.id ||
pair->bpf_prog.sys_enter == trace->skel->progs.syscall_unaugmented)
continue;
for (field = sc->args, candidate_field = pair->args;
for (field = sc.args, candidate_field = pair->args;
field && candidate_field; field = field->next, candidate_field = candidate_field->next) {
bool is_pointer = field->flags & TEP_FIELD_IS_POINTER,
candidate_is_pointer = candidate_field->flags & TEP_FIELD_IS_POINTER;
@ -3870,7 +3932,7 @@ try_to_find_pair:
goto next_candidate;
}
pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, sc->name);
pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, sc.name);
return pair_prog;
next_candidate:
continue;
@ -3945,7 +4007,7 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
*/
for (int i = 0; i < trace->sctbl->syscalls.nr_entries; ++i) {
int key = syscalltbl__id_at_idx(trace->sctbl, i);
struct syscall *sc = trace__syscall_info(trace, NULL, key);
struct syscall *sc = trace__syscall_info(trace, NULL, EM_HOST, key);
struct bpf_program *pair_prog;
int prog_fd;
@ -3966,7 +4028,11 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
pair_prog = trace__find_usable_bpf_prog_entry(trace, sc);
if (pair_prog == NULL)
continue;
/*
* Get syscall info again as find usable entry above might
* modify the syscall table and shuffle it.
*/
sc = trace__syscall_info(trace, NULL, EM_HOST, key);
sc->bpf_prog.sys_enter = pair_prog;
/*
@ -4761,7 +4827,10 @@ static size_t syscall__dump_stats(struct trace *trace, FILE *fp,
pct = avg ? 100.0 * stddev_stats(&stats->stats) / avg : 0.0;
avg /= NSEC_PER_MSEC;
sc = &trace->syscalls.table[entry->syscall];
sc = trace__syscall_info(trace, /*evsel=*/NULL, EM_HOST, entry->syscall);
if (!sc)
continue;
printed += fprintf(fp, " %-15s", sc->name);
printed += fprintf(fp, " %8" PRIu64 " %6" PRIu64 " %9.3f %9.3f %9.3f",
n, stats->nr_failures, entry->msecs, min, avg);
@ -5218,12 +5287,10 @@ out:
static void trace__exit(struct trace *trace)
{
int i;
strlist__delete(trace->ev_qualifier);
zfree(&trace->ev_qualifier_ids.entries);
if (trace->syscalls.table) {
for (i = 0; i <= trace->sctbl->syscalls.max_id; i++)
for (size_t i = 0; i < trace->syscalls.table_size; i++)
syscall__exit(&trace->syscalls.table[i]);
zfree(&trace->syscalls.table);
}