perf lock contention: Add -J/--inject-delay option
This is to slow down lock acquistion (on contention locks) deliberately.
A possible use case is to estimate impact on application performance by
optimization of kernel locking behavior. By delaying the lock it can
simulate the worse condition as a control group, and then compare with
the current behavior as a optimized condition.
The syntax is 'time@function' and the time can have unit suffix like
"us" and "ms". For example, I ran a simple test like below.
$ sudo perf lock con -abl -L tasklist_lock -- \
sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
contended total wait max wait avg wait address symbol
92 1.18 ms 199.54 us 12.79 us ffffffff8a806080 tasklist_lock (rwlock)
The contention count was 92 and the average wait time was around 10 us.
But if I add 100 usec of delay to the tasklist_lock,
$ sudo perf lock con -abl -L tasklist_lock -J 100us@tasklist_lock -- \
sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
contended total wait max wait avg wait address symbol
190 15.67 ms 230.10 us 82.46 us ffffffff8a806080 tasklist_lock (rwlock)
The contention count increased and the average wait time was up closed
to 100 usec. If I increase the delay even more,
$ sudo perf lock con -abl -L tasklist_lock -J 1ms@tasklist_lock -- \
sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
contended total wait max wait avg wait address symbol
1002 2.80 s 3.01 ms 2.80 ms ffffffff8a806080 tasklist_lock (rwlock)
Now every sleep process had contention and the wait time was more than 1
msec. This is on my 4 CPU laptop so I guess one CPU has the lock while
other 3 are waiting for it mostly.
For simplicity, it only supports global locks for now.
Committer testing:
root@number:~# grep -m1 'model name' /proc/cpuinfo
model name : AMD Ryzen 9 9950X3D 16-Core Processor
root@number:~# perf lock con -abl -L tasklist_lock -- sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
contended total wait max wait avg wait address symbol
142 453.85 us 25.39 us 3.20 us ffffffffae808080 tasklist_lock (rwlock)
root@number:~# perf lock con -abl -L tasklist_lock -J 100us@tasklist_lock -- sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
contended total wait max wait avg wait address symbol
1040 2.39 s 3.11 ms 2.30 ms ffffffffae808080 tasklist_lock (rwlock)
root@number:~# perf lock con -abl -L tasklist_lock -J 1ms@tasklist_lock -- sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
contended total wait max wait avg wait address symbol
1025 24.72 s 31.01 ms 24.12 ms ffffffffae808080 tasklist_lock (rwlock)
root@number:~#
Suggested-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20250509171950.183591-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
committed by
Arnaldo Carvalho de Melo
parent
4bfe27140e
commit
c42e219942
@@ -216,6 +216,17 @@ CONTENTION OPTIONS
|
||||
--cgroup-filter=<value>::
|
||||
Show lock contention only in the given cgroups (comma separated list).
|
||||
|
||||
-J::
|
||||
--inject-delay=<time@function>::
|
||||
Add delays to the given lock. It's added to the contention-end part so
|
||||
that the (new) owner of the lock will be delayed. But by slowing down
|
||||
the owner, the waiters will also be delayed as well. This is working
|
||||
only with -b/--use-bpf.
|
||||
|
||||
The 'time' is specified in nsec but it can have a unit suffix. Available
|
||||
units are "ms" and "us". Note that it will busy-wait after it gets the
|
||||
lock. Please use it at your own risk.
|
||||
|
||||
|
||||
SEE ALSO
|
||||
--------
|
||||
|
||||
@@ -62,6 +62,8 @@ static const char *output_name = NULL;
|
||||
static FILE *lock_output;
|
||||
|
||||
static struct lock_filter filters;
|
||||
static struct lock_delay *delays;
|
||||
static int nr_delays;
|
||||
|
||||
static enum lock_aggr_mode aggr_mode = LOCK_AGGR_ADDR;
|
||||
|
||||
@@ -2001,6 +2003,8 @@ static int __cmd_contention(int argc, const char **argv)
|
||||
.max_stack = max_stack_depth,
|
||||
.stack_skip = stack_skip,
|
||||
.filters = &filters,
|
||||
.delays = delays,
|
||||
.nr_delays = nr_delays,
|
||||
.save_callstack = needs_callstack(),
|
||||
.owner = show_lock_owner,
|
||||
.cgroups = RB_ROOT,
|
||||
@@ -2504,6 +2508,74 @@ static int parse_cgroup_filter(const struct option *opt __maybe_unused, const ch
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool add_lock_delay(char *spec)
|
||||
{
|
||||
char *at, *pos;
|
||||
struct lock_delay *tmp;
|
||||
unsigned long duration;
|
||||
|
||||
at = strchr(spec, '@');
|
||||
if (at == NULL) {
|
||||
pr_err("lock delay should have '@' sign: %s\n", spec);
|
||||
return false;
|
||||
}
|
||||
if (at == spec) {
|
||||
pr_err("lock delay should have time before '@': %s\n", spec);
|
||||
return false;
|
||||
}
|
||||
|
||||
*at = '\0';
|
||||
duration = strtoul(spec, &pos, 0);
|
||||
if (!strcmp(pos, "ns"))
|
||||
duration *= 1;
|
||||
else if (!strcmp(pos, "us"))
|
||||
duration *= 1000;
|
||||
else if (!strcmp(pos, "ms"))
|
||||
duration *= 1000 * 1000;
|
||||
else if (*pos) {
|
||||
pr_err("invalid delay time: %s@%s\n", spec, at + 1);
|
||||
return false;
|
||||
}
|
||||
|
||||
tmp = realloc(delays, (nr_delays + 1) * sizeof(*delays));
|
||||
if (tmp == NULL) {
|
||||
pr_err("Memory allocation failure\n");
|
||||
return false;
|
||||
}
|
||||
delays = tmp;
|
||||
|
||||
delays[nr_delays].sym = strdup(at + 1);
|
||||
if (delays[nr_delays].sym == NULL) {
|
||||
pr_err("Memory allocation failure\n");
|
||||
return false;
|
||||
}
|
||||
delays[nr_delays].time = duration;
|
||||
|
||||
nr_delays++;
|
||||
return true;
|
||||
}
|
||||
|
||||
static int parse_lock_delay(const struct option *opt __maybe_unused, const char *str,
|
||||
int unset __maybe_unused)
|
||||
{
|
||||
char *s, *tmp, *tok;
|
||||
int ret = 0;
|
||||
|
||||
s = strdup(str);
|
||||
if (s == NULL)
|
||||
return -1;
|
||||
|
||||
for (tok = strtok_r(s, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) {
|
||||
if (!add_lock_delay(tok)) {
|
||||
ret = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
free(s);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int cmd_lock(int argc, const char **argv)
|
||||
{
|
||||
const struct option lock_options[] = {
|
||||
@@ -2580,6 +2652,8 @@ int cmd_lock(int argc, const char **argv)
|
||||
OPT_BOOLEAN(0, "lock-cgroup", &show_lock_cgroups, "show lock stats by cgroup"),
|
||||
OPT_CALLBACK('G', "cgroup-filter", NULL, "CGROUPS",
|
||||
"Filter specific cgroups", parse_cgroup_filter),
|
||||
OPT_CALLBACK('J', "inject-delay", NULL, "TIME@FUNC",
|
||||
"Inject delays to specific locks", parse_lock_delay),
|
||||
OPT_PARENT(lock_options)
|
||||
};
|
||||
|
||||
|
||||
@@ -261,6 +261,27 @@ int lock_contention_prepare(struct lock_contention *con)
|
||||
skel->rodata->has_addr = 1;
|
||||
}
|
||||
|
||||
/* resolve lock name in delays */
|
||||
if (con->nr_delays) {
|
||||
struct symbol *sym;
|
||||
struct map *kmap;
|
||||
|
||||
for (i = 0; i < con->nr_delays; i++) {
|
||||
sym = machine__find_kernel_symbol_by_name(con->machine,
|
||||
con->delays[i].sym,
|
||||
&kmap);
|
||||
if (sym == NULL) {
|
||||
pr_warning("ignore unknown symbol: %s\n",
|
||||
con->delays[i].sym);
|
||||
continue;
|
||||
}
|
||||
|
||||
con->delays[i].addr = map__unmap_ip(kmap, sym->start);
|
||||
}
|
||||
skel->rodata->lock_delay = 1;
|
||||
bpf_map__set_max_entries(skel->maps.lock_delays, con->nr_delays);
|
||||
}
|
||||
|
||||
bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
|
||||
bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
|
||||
bpf_map__set_max_entries(skel->maps.type_filter, ntypes);
|
||||
@@ -352,6 +373,13 @@ int lock_contention_prepare(struct lock_contention *con)
|
||||
bpf_map_update_elem(fd, &con->filters->cgrps[i], &val, BPF_ANY);
|
||||
}
|
||||
|
||||
if (con->nr_delays) {
|
||||
fd = bpf_map__fd(skel->maps.lock_delays);
|
||||
|
||||
for (i = 0; i < con->nr_delays; i++)
|
||||
bpf_map_update_elem(fd, &con->delays[i].addr, &con->delays[i].time, BPF_ANY);
|
||||
}
|
||||
|
||||
if (con->aggr_mode == LOCK_AGGR_CGROUP)
|
||||
read_all_cgroups(&con->cgroups);
|
||||
|
||||
|
||||
@@ -14,6 +14,9 @@
|
||||
/* for collect_zone_lock(). It should be more than the actual zones. */
|
||||
#define MAX_ZONES 10
|
||||
|
||||
/* for do_lock_delay(). Arbitrarily set to 1 million. */
|
||||
#define MAX_LOOP (1U << 20)
|
||||
|
||||
/* lock contention flags from include/trace/events/lock.h */
|
||||
#define LCB_F_SPIN (1U << 0)
|
||||
#define LCB_F_READ (1U << 1)
|
||||
@@ -149,6 +152,13 @@ struct {
|
||||
__uint(max_entries, 1);
|
||||
} slab_caches SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(key_size, sizeof(__u64));
|
||||
__uint(value_size, sizeof(__u64));
|
||||
__uint(max_entries, 1);
|
||||
} lock_delays SEC(".maps");
|
||||
|
||||
struct rw_semaphore___old {
|
||||
struct task_struct *owner;
|
||||
} __attribute__((preserve_access_index));
|
||||
@@ -179,6 +189,7 @@ const volatile int stack_skip;
|
||||
const volatile int lock_owner;
|
||||
const volatile int use_cgroup_v2;
|
||||
const volatile int max_stack;
|
||||
const volatile int lock_delay;
|
||||
|
||||
/* determine the key of lock stat */
|
||||
const volatile int aggr_mode;
|
||||
@@ -387,6 +398,35 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline long delay_callback(__u64 idx, void *arg)
|
||||
{
|
||||
__u64 target = *(__u64 *)arg;
|
||||
|
||||
if (target <= bpf_ktime_get_ns())
|
||||
return 1;
|
||||
|
||||
/* just to kill time */
|
||||
(void)bpf_get_prandom_u32();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void do_lock_delay(__u64 duration)
|
||||
{
|
||||
__u64 target = bpf_ktime_get_ns() + duration;
|
||||
|
||||
bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0);
|
||||
}
|
||||
|
||||
static inline void check_lock_delay(__u64 lock)
|
||||
{
|
||||
__u64 *delay;
|
||||
|
||||
delay = bpf_map_lookup_elem(&lock_delays, &lock);
|
||||
if (delay)
|
||||
do_lock_delay(*delay);
|
||||
}
|
||||
|
||||
static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
|
||||
{
|
||||
__u32 pid;
|
||||
@@ -796,6 +836,9 @@ found:
|
||||
update_contention_data(data, duration, 1);
|
||||
|
||||
out:
|
||||
if (lock_delay)
|
||||
check_lock_delay(pelem->lock);
|
||||
|
||||
pelem->lock = 0;
|
||||
if (need_delete)
|
||||
bpf_map_delete_elem(&tstamp, &pid);
|
||||
|
||||
@@ -18,6 +18,12 @@ struct lock_filter {
|
||||
char **slabs;
|
||||
};
|
||||
|
||||
struct lock_delay {
|
||||
char *sym;
|
||||
unsigned long addr;
|
||||
unsigned long time;
|
||||
};
|
||||
|
||||
struct lock_stat {
|
||||
struct hlist_node hash_entry;
|
||||
struct rb_node rb; /* used for sorting */
|
||||
@@ -140,6 +146,7 @@ struct lock_contention {
|
||||
struct machine *machine;
|
||||
struct hlist_head *result;
|
||||
struct lock_filter *filters;
|
||||
struct lock_delay *delays;
|
||||
struct lock_contention_fails fails;
|
||||
struct rb_root cgroups;
|
||||
void *btf;
|
||||
@@ -149,6 +156,7 @@ struct lock_contention {
|
||||
int aggr_mode;
|
||||
int owner;
|
||||
int nr_filtered;
|
||||
int nr_delays;
|
||||
bool save_callstack;
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user