perf lock contention: Add -J/--inject-delay option

This is to slow down lock acquistion (on contention locks) deliberately.

A possible use case is to estimate impact on application performance by
optimization of kernel locking behavior.  By delaying the lock it can
simulate the worse condition as a control group, and then compare with
the current behavior as a optimized condition.

The syntax is 'time@function' and the time can have unit suffix like
"us" and "ms".  For example, I ran a simple test like below.

  $ sudo perf lock con -abl -L tasklist_lock -- \
    sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
   contended   total wait     max wait     avg wait            address   symbol

          92      1.18 ms    199.54 us     12.79 us   ffffffff8a806080   tasklist_lock (rwlock)

The contention count was 92 and the average wait time was around 10 us.
But if I add 100 usec of delay to the tasklist_lock,

  $ sudo perf lock con -abl -L tasklist_lock -J 100us@tasklist_lock -- \
    sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
   contended   total wait     max wait     avg wait            address   symbol

         190     15.67 ms    230.10 us     82.46 us   ffffffff8a806080   tasklist_lock (rwlock)

The contention count increased and the average wait time was up closed
to 100 usec.  If I increase the delay even more,

  $ sudo perf lock con -abl -L tasklist_lock -J 1ms@tasklist_lock -- \
    sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
   contended   total wait     max wait     avg wait            address   symbol

        1002      2.80 s       3.01 ms      2.80 ms   ffffffff8a806080   tasklist_lock (rwlock)

Now every sleep process had contention and the wait time was more than 1
msec.  This is on my 4 CPU laptop so I guess one CPU has the lock while
other 3 are waiting for it mostly.

For simplicity, it only supports global locks for now.

Committer testing:

  root@number:~# grep -m1 'model name' /proc/cpuinfo
  model name : AMD Ryzen 9 9950X3D 16-Core Processor
  root@number:~# perf lock con -abl -L tasklist_lock -- sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
   contended  total wait   max wait  avg wait           address  symbol

         142   453.85 us   25.39 us   3.20 us  ffffffffae808080  tasklist_lock (rwlock)
  root@number:~# perf lock con -abl -L tasklist_lock -J 100us@tasklist_lock -- sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
   contended  total wait   max wait  avg wait           address  symbol

        1040     2.39 s     3.11 ms   2.30 ms  ffffffffae808080  tasklist_lock (rwlock)
  root@number:~# perf lock con -abl -L tasklist_lock -J 1ms@tasklist_lock -- sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
   contended  total wait   max wait  avg wait           address  symbol

        1025    24.72 s    31.01 ms  24.12 ms  ffffffffae808080  tasklist_lock (rwlock)
  root@number:~#

Suggested-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20250509171950.183591-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
Namhyung Kim
2025-05-09 10:19:50 -07:00
committed by Arnaldo Carvalho de Melo
parent 4bfe27140e
commit c42e219942
5 changed files with 164 additions and 0 deletions

View File

@@ -216,6 +216,17 @@ CONTENTION OPTIONS
--cgroup-filter=<value>::
Show lock contention only in the given cgroups (comma separated list).
-J::
--inject-delay=<time@function>::
Add delays to the given lock. It's added to the contention-end part so
that the (new) owner of the lock will be delayed. But by slowing down
the owner, the waiters will also be delayed as well. This is working
only with -b/--use-bpf.
The 'time' is specified in nsec but it can have a unit suffix. Available
units are "ms" and "us". Note that it will busy-wait after it gets the
lock. Please use it at your own risk.
SEE ALSO
--------

View File

@@ -62,6 +62,8 @@ static const char *output_name = NULL;
static FILE *lock_output;
static struct lock_filter filters;
static struct lock_delay *delays;
static int nr_delays;
static enum lock_aggr_mode aggr_mode = LOCK_AGGR_ADDR;
@@ -2001,6 +2003,8 @@ static int __cmd_contention(int argc, const char **argv)
.max_stack = max_stack_depth,
.stack_skip = stack_skip,
.filters = &filters,
.delays = delays,
.nr_delays = nr_delays,
.save_callstack = needs_callstack(),
.owner = show_lock_owner,
.cgroups = RB_ROOT,
@@ -2504,6 +2508,74 @@ static int parse_cgroup_filter(const struct option *opt __maybe_unused, const ch
return ret;
}
static bool add_lock_delay(char *spec)
{
char *at, *pos;
struct lock_delay *tmp;
unsigned long duration;
at = strchr(spec, '@');
if (at == NULL) {
pr_err("lock delay should have '@' sign: %s\n", spec);
return false;
}
if (at == spec) {
pr_err("lock delay should have time before '@': %s\n", spec);
return false;
}
*at = '\0';
duration = strtoul(spec, &pos, 0);
if (!strcmp(pos, "ns"))
duration *= 1;
else if (!strcmp(pos, "us"))
duration *= 1000;
else if (!strcmp(pos, "ms"))
duration *= 1000 * 1000;
else if (*pos) {
pr_err("invalid delay time: %s@%s\n", spec, at + 1);
return false;
}
tmp = realloc(delays, (nr_delays + 1) * sizeof(*delays));
if (tmp == NULL) {
pr_err("Memory allocation failure\n");
return false;
}
delays = tmp;
delays[nr_delays].sym = strdup(at + 1);
if (delays[nr_delays].sym == NULL) {
pr_err("Memory allocation failure\n");
return false;
}
delays[nr_delays].time = duration;
nr_delays++;
return true;
}
static int parse_lock_delay(const struct option *opt __maybe_unused, const char *str,
int unset __maybe_unused)
{
char *s, *tmp, *tok;
int ret = 0;
s = strdup(str);
if (s == NULL)
return -1;
for (tok = strtok_r(s, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) {
if (!add_lock_delay(tok)) {
ret = -1;
break;
}
}
free(s);
return ret;
}
int cmd_lock(int argc, const char **argv)
{
const struct option lock_options[] = {
@@ -2580,6 +2652,8 @@ int cmd_lock(int argc, const char **argv)
OPT_BOOLEAN(0, "lock-cgroup", &show_lock_cgroups, "show lock stats by cgroup"),
OPT_CALLBACK('G', "cgroup-filter", NULL, "CGROUPS",
"Filter specific cgroups", parse_cgroup_filter),
OPT_CALLBACK('J', "inject-delay", NULL, "TIME@FUNC",
"Inject delays to specific locks", parse_lock_delay),
OPT_PARENT(lock_options)
};

View File

@@ -261,6 +261,27 @@ int lock_contention_prepare(struct lock_contention *con)
skel->rodata->has_addr = 1;
}
/* resolve lock name in delays */
if (con->nr_delays) {
struct symbol *sym;
struct map *kmap;
for (i = 0; i < con->nr_delays; i++) {
sym = machine__find_kernel_symbol_by_name(con->machine,
con->delays[i].sym,
&kmap);
if (sym == NULL) {
pr_warning("ignore unknown symbol: %s\n",
con->delays[i].sym);
continue;
}
con->delays[i].addr = map__unmap_ip(kmap, sym->start);
}
skel->rodata->lock_delay = 1;
bpf_map__set_max_entries(skel->maps.lock_delays, con->nr_delays);
}
bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
bpf_map__set_max_entries(skel->maps.type_filter, ntypes);
@@ -352,6 +373,13 @@ int lock_contention_prepare(struct lock_contention *con)
bpf_map_update_elem(fd, &con->filters->cgrps[i], &val, BPF_ANY);
}
if (con->nr_delays) {
fd = bpf_map__fd(skel->maps.lock_delays);
for (i = 0; i < con->nr_delays; i++)
bpf_map_update_elem(fd, &con->delays[i].addr, &con->delays[i].time, BPF_ANY);
}
if (con->aggr_mode == LOCK_AGGR_CGROUP)
read_all_cgroups(&con->cgroups);

View File

@@ -14,6 +14,9 @@
/* for collect_zone_lock(). It should be more than the actual zones. */
#define MAX_ZONES 10
/* for do_lock_delay(). Arbitrarily set to 1 million. */
#define MAX_LOOP (1U << 20)
/* lock contention flags from include/trace/events/lock.h */
#define LCB_F_SPIN (1U << 0)
#define LCB_F_READ (1U << 1)
@@ -149,6 +152,13 @@ struct {
__uint(max_entries, 1);
} slab_caches SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(key_size, sizeof(__u64));
__uint(value_size, sizeof(__u64));
__uint(max_entries, 1);
} lock_delays SEC(".maps");
struct rw_semaphore___old {
struct task_struct *owner;
} __attribute__((preserve_access_index));
@@ -179,6 +189,7 @@ const volatile int stack_skip;
const volatile int lock_owner;
const volatile int use_cgroup_v2;
const volatile int max_stack;
const volatile int lock_delay;
/* determine the key of lock stat */
const volatile int aggr_mode;
@@ -387,6 +398,35 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
return 0;
}
static inline long delay_callback(__u64 idx, void *arg)
{
__u64 target = *(__u64 *)arg;
if (target <= bpf_ktime_get_ns())
return 1;
/* just to kill time */
(void)bpf_get_prandom_u32();
return 0;
}
static inline void do_lock_delay(__u64 duration)
{
__u64 target = bpf_ktime_get_ns() + duration;
bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0);
}
static inline void check_lock_delay(__u64 lock)
{
__u64 *delay;
delay = bpf_map_lookup_elem(&lock_delays, &lock);
if (delay)
do_lock_delay(*delay);
}
static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
{
__u32 pid;
@@ -796,6 +836,9 @@ found:
update_contention_data(data, duration, 1);
out:
if (lock_delay)
check_lock_delay(pelem->lock);
pelem->lock = 0;
if (need_delete)
bpf_map_delete_elem(&tstamp, &pid);

View File

@@ -18,6 +18,12 @@ struct lock_filter {
char **slabs;
};
struct lock_delay {
char *sym;
unsigned long addr;
unsigned long time;
};
struct lock_stat {
struct hlist_node hash_entry;
struct rb_node rb; /* used for sorting */
@@ -140,6 +146,7 @@ struct lock_contention {
struct machine *machine;
struct hlist_head *result;
struct lock_filter *filters;
struct lock_delay *delays;
struct lock_contention_fails fails;
struct rb_root cgroups;
void *btf;
@@ -149,6 +156,7 @@ struct lock_contention {
int aggr_mode;
int owner;
int nr_filtered;
int nr_delays;
bool save_callstack;
};