tests: arm64: Add SIMD context switch stress test

Add stress test for SIMD register save/restore during context switching.
The test validates correct FPU context handling using either ARM Neon or
SVE2 instructions depending on the build target.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
commit f5d6b9ea0e (parent ed6d497276)
Author:    Nicolas Pitre <npitre@baylibre.com>
Date:      2025-10-22 13:48:02 -04:00
Committed: Anas Nashif
5 changed files with 1008 additions and 0 deletions

tests/arch/arm64/arm64_sve_stress/CMakeLists.txt

@@ -0,0 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
cmake_minimum_required(VERSION 3.20.0)
find_package(Zephyr REQUIRED HINTS $ENV{ZEPHYR_BASE})
project(arm64_sve_stress)
target_sources(app PRIVATE src/main.c)

tests/arch/arm64/arm64_sve_stress/README.md

@@ -0,0 +1,129 @@
<!--
SPDX-License-Identifier: Apache-2.0
SPDX-FileCopyrightText: Copyright (c) 2025 Arm Limited (or its affiliates)
-->
# ARM64 Neon / SVE2 SIMD Context Switch Stress Test
## Overview
This test validates the correctness and resilience of SIMD register save/restore
during frequent thread preemptions in Zephyr RTOS on ARM AArch64 platforms using
Neon or SVE2 instructions.
The test performs mixed workloads (F32 matrix x vector multiplication and
complex number multiplication) across multiple threads, while a high-priority
preemptor thread intentionally clobbers SIMD registers to simulate
context-switch interference.
## Features
- Supports both ARM Neon and SVE2 instruction sets
- Multi-threaded SIMD workloads using Zephyr's kernel threading API
- Preemption stress through a high-priority thread that:
- Forces frequent context switches
- Clobbers SIMD/SVE registers to simulate register save/restore errors
- Data integrity validation:
- Compares SIMD results against scalar reference implementations
- Detects register corruption and computational divergence
- Configurable test size, tolerance, and timing
## Test Architecture
### Thread Types
- **SIMD Worker threads**: Execute either matrix x vector or complex multiplication
tests using SIMD instructions
- **Preemptor Thread**: Runs at higher priority, forces preemptions and clobbers
SIMD registers
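Their relative priorities come straight from `src/main.c` (in Zephyr, a numerically lower value means higher priority):
```c
#define SIMD_TASK_PRIO 2 /* priority for SIMD test threads */
#define PREEMPTOR_PRIO 0 /* higher priority to force preemptions */
```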
### Workload Distribution
- 8 total worker threads (`SIMD_THREAD_CNT = 8`)
- 4 matrix x vector tests (`MATRIX_TEST_CTX = 4`)
- 4 complex multiplication tests (`CMPLX_MUL_TEST_CTX = 4`)
### Test Flow
1. Worker threads initialize their test data with unique per-thread seeds
2. Each worker computes a reference result using scalar operations
3. Workers continuously execute SIMD computations in a loop (sketched below)
4. The high-priority preemptor thread:
- Clobbers all SIMD registers every 1ms
- Forces context switches via timeslicing
- Periodically prints statistics
5. Workers validate SIMD results against reference after each iteration
6. Test tracks context switches that occur during SIMD operations
7. Test runs for `TEST_DURATION_MSEC` (10 seconds by default)
8. Final assertions verify zero failures and successful context switching
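Steps 3-6 condense to the following worker-loop skeleton (taken from the matrix worker in `src/main.c` below, with output clearing and trace markers omitted):
```c
while (!test_complete) {
	uint32_t snap = preempt_snapshot(); /* context-switch count before the kernel runs */

	f32_mat_x_vec_simd(tst->mat, tst->vec, tst->out_simd,
			   TEST_MATRIX_ROWS, TEST_MATRIX_COLS);
	st->switch_during_simd += preempt_cnt_delta(snap);

	/* compare against the scalar reference computed once at start-up */
	if (!vec_within_threshold_f32(tst->out_scal, tst->out_simd,
				      TEST_MATRIX_ROWS, F32REL_THRSH)) {
		st->failures++; /* register corruption or computational divergence */
		break;
	}
	st->success++;
}
```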
## Build and Run
### Prerequisites
- Zephyr RTOS environment set up with an AArch64 toolchain
- Target board with Neon or SVE2 SIMD support (e.g., ARMv8-A, ARMv9 cores)
- FVP model or physical hardware
### Build with west
```bash
west build -b fvp_base_revc_2xaem/v8a tests/arch/arm64/arm64_sve_stress
```
For ARMv9 with SVE:
```bash
west build -b fvp_base_revc_2xaem/v9a tests/arch/arm64/arm64_sve_stress
```
### Run with west
```bash
west build -t run
```
### Run with Twister
```bash
west twister -T tests/arch/arm64/arm64_sve_stress/
```
### Expected Output
```
=== ARM Neon/SVE2 SIMD context switch stress test ===
task 0 succ 5743 fail 0 switch_during_simd 23936
task 1 succ 5744 fail 0 switch_during_simd 24000
task 2 succ 5744 fail 0 switch_during_simd 24084
task 3 succ 5744 fail 0 switch_during_simd 24114
task 4 succ 3559 fail 0 switch_during_simd 5162
task 5 succ 3559 fail 0 switch_during_simd 5166
task 6 succ 3559 fail 0 switch_during_simd 5168
task 7 succ 3556 fail 0 switch_during_simd 5162
...
=== Final Test Results ===
task 0: success=11493 failures=0 switches_during_simd=47968
task 1: success=11494 failures=0 switches_during_simd=47944
task 2: success=11494 failures=0 switches_during_simd=47820
task 3: success=11494 failures=0 switches_during_simd=48164
task 4: success=7122 failures=0 switches_during_simd=10346
task 5: success=7122 failures=0 switches_during_simd=10350
task 6: success=7122 failures=0 switches_during_simd=10352
task 7: success=7118 failures=0 switches_during_simd=10282
Total: success=75059 failures=0 switches=235826
```
If any test fails, the program halts with error diagnostics showing the
mismatched SIMD vs scalar results.
## Test Success Criteria
The test passes if:
1. All SIMD operations complete with zero failures (`total_failures == 0`)
2. At least one successful SIMD operation was completed (`total_success > 0`)
3. At least one context switch occurred during SIMD operations (`total_switches > 0`)
This ensures that:
- SIMD context switching is functioning correctly
- Register clobbering by the preemptor thread is properly handled
- No computational errors occur despite aggressive context switching
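These criteria map one-to-one onto the final `ztest` assertions in `src/main.c`:
```c
zassert_equal(total_failures, 0,
	      "SIMD context switch stress test failed with %d errors", total_failures);
zassert_true(total_success > 0, "No successful SIMD operations completed");
zassert_true(total_switches > 0, "No context switches occurred during SIMD operations");
```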

tests/arch/arm64/arm64_sve_stress/prj.conf

@@ -0,0 +1,22 @@
CONFIG_ZTEST=y
CONFIG_TRACING=y
CONFIG_TRACING_USER=y
CONFIG_FPU=y
CONFIG_FPU_SHARING=y
CONFIG_STACK_SENTINEL=y
CONFIG_INIT_STACKS=y
# Preempt mid-compute
CONFIG_TIMESLICING=y
CONFIG_TIMESLICE_SIZE=1
CONFIG_THREAD_NAME=y
# Optional: more ticks -> more preemption opportunities
CONFIG_SYS_CLOCK_TICKS_PER_SEC=10000
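# Note: CONFIG_TIMESLICE_SIZE is in milliseconds, so the 1 ms slice above
# spans 10 ticks at this rate and same-priority workers rotate every ~1 ms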
# Bigger stacks; inline asm + printk + checks cost some stack
CONFIG_MAIN_STACK_SIZE=4096
CONFIG_THREAD_STACK_INFO=y

tests/arch/arm64/arm64_sve_stress/src/main.c

@@ -0,0 +1,835 @@
/*
* Copyright (c) 2025 Arm Limited (or its affiliates).
*
* SPDX-License-Identifier: Apache-2.0
*
* ARM Neon/SVE2 SIMD context switch stress test
*
* This test validates the correctness and resilience of SIMD register
* save/restore during frequent thread preemptions in Zephyr RTOS.
*
* The test performs mixed workloads (F32 matrix x vector multiplication and
* complex number multiplication) across multiple threads, while a
* high-priority preemptor thread intentionally clobbers SIMD registers to
* simulate context-switch interference.
*/
#include <zephyr/ztest.h>
#include <zephyr/kernel.h>
#include <zephyr/sys/atomic.h>
#include <tracing_user.h>
#include <stdio.h>
#include <math.h>
#include <float.h>
/* use asm reference variant to prevent compiler vectorization */
#define USE_ASM_SCALAR_AS_REF 1
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#endif
#include <arm_neon.h>
/* number of SIMD worker threads */
#define SIMD_THREAD_CNT 8
/* thread stack sizes */
#define WORKER_STACK 4096
#define PREEMPTOR_STACK 4096
#define SIMD_TASK_PRIO 2 /* priority for SIMD test threads */
#define PREEMPTOR_PRIO 0 /* higher priority to force preemptions */
#define F32REL_THRSH 1e-1f /* SIMD test precision tolerance */
/* test duration = max high priority iterations */
#define TEST_DURATION_MSEC 10000
/* period of the context-switch statistics dumps inside the high-priority thread */
#define STAT_UPD_PERIOD_MSEC 2501
/* matrix by vector mult fixed size */
#define TEST_MATRIX_ROWS 64
#define TEST_MATRIX_COLS 127
/* complex vector mult fixed size */
#define TEST_CMPLX_MULT_SZ 2048
/* 50% matrix mult / 50% complex mult threads */
#define MATRIX_TEST_CTX (SIMD_THREAD_CNT/2)
#define CMPLX_MUL_TEST_CTX (SIMD_THREAD_CNT/2)
/* SIMD tests storage */
typedef float float32_t;
typedef struct cfloat32_t {
float32_t re, im;
} cfloat32_t;
__aligned(16) float32_t test_matrix[MATRIX_TEST_CTX][TEST_MATRIX_ROWS * TEST_MATRIX_COLS];
__aligned(16) float32_t test_vector[MATRIX_TEST_CTX][TEST_MATRIX_COLS];
__aligned(16) float32_t test_out_simd[MATRIX_TEST_CTX][TEST_MATRIX_COLS];
__aligned(16) float32_t test_out_scalar[MATRIX_TEST_CTX][TEST_MATRIX_COLS];
__aligned(16) cfloat32_t test_cmplx_mult_a[CMPLX_MUL_TEST_CTX][TEST_CMPLX_MULT_SZ];
__aligned(16) cfloat32_t test_cmplx_mult_b[CMPLX_MUL_TEST_CTX][TEST_CMPLX_MULT_SZ];
__aligned(16) cfloat32_t test_cmplx_mult_out_simd[CMPLX_MUL_TEST_CTX][TEST_CMPLX_MULT_SZ];
__aligned(16) cfloat32_t test_cmplx_mult_out_scalar[CMPLX_MUL_TEST_CTX][TEST_CMPLX_MULT_SZ];
#define dump_buf(a, buf_sz, wrap, format) \
{ \
printf("%s:\n", #a); \
for (int i = 0; i < buf_sz; i++) { \
printf(i % wrap == wrap - 1 ? format",\n" : format", ", \
(double)a[i]); \
} \
printf("\n"); \
}
#define F32_MIN (-FLT_MAX)
__attribute__ ((noinline))
float32_t max_abs_diff_f32(const float32_t *src_A, const float32_t *src_B,
uint32_t block_size)
{
uint32_t blk_cnt = block_size >> 2; /* Process 4 elements per loop */
uint32_t rem_cnt = block_size & 3; /* Remaining elements */
float32x4_t max_vec = vdupq_n_f32(F32_MIN);
const float32_t *p_A = src_A;
const float32_t *p_B = src_B;
while (blk_cnt > 0) {
float32x4_t in_vec_A = vld1q_f32(p_A);
float32x4_t in_vec_B = vld1q_f32(p_B);
in_vec_A = vabdq_f32(in_vec_A, in_vec_B); /* |A - B| */
max_vec = vmaxq_f32(max_vec, in_vec_A);
p_A += 4;
p_B += 4;
blk_cnt--;
}
/* Reduce 4-wide vector to one float */
float32x2_t tmp = vpmax_f32(vget_low_f32(max_vec), vget_high_f32(max_vec));
float32_t max_value = fmaxf(vget_lane_f32(tmp, 0), vget_lane_f32(tmp, 1));
/* Handle remaining elements (less than 4) */
while (rem_cnt > 0) {
float32_t new_val = fabsf(*p_A++ - *p_B++);
if (new_val > max_value) {
max_value = new_val;
}
rem_cnt--;
}
return max_value;
}
/**
* Verify that the SIMD output matches the reference within the given tolerance.
*/
bool vec_within_threshold_f32(const float32_t *__restrict ref,
const float32_t *__restrict vec2,
int length, float32_t threshold)
{
return (max_abs_diff_f32(ref, vec2, length) <= threshold);
}
/**
* f32_mat_x_vec_ref
* Reference row-major matrix × vector multiply:
* y[i] = sum_j A[i*m + j] * x[j], for i in [0..n)
* Arguments:
* A : n×m matrix (row-major)
* x : length m vector
* y : length n output vector
* n,m: sizes in rows/cols
* Scalar reference used for correctness checks.
*/
__attribute__ ((noinline))
void f32_mat_x_vec_ref(const float *__restrict A,
const float *__restrict x, float *__restrict y,
uint64_t n, uint64_t m)
{
for (uint64_t i = 0; i < n; i++) {
float sum = 0;
#ifdef USE_ASM_SCALAR_AS_REF
const float32_t *p_M = &A[i * m];
const float32_t *p_V = x;
uint64_t cnt = m;
__asm__ volatile (
"1: ldr s0, [%[p_M]], #4\n"
" ldr s1, [%[p_V]], #4\n"
" fmadd %s[sum], s0, s1, %s[sum]\n"
" subs %[cnt], %[cnt], #1\n"
" b.ne 1b\n"
: [p_M] "+&r"(p_M), [p_V] "+&r"(p_V),
[sum] "+w"(sum), [cnt] "+&r"(cnt)
:
: "s0", "s1", "memory", "cc");
#else
for (uint64_t j = 0; j < m; ++j) {
sum += A[i * m + j] * x[j];
}
#endif
y[i] = sum;
}
}
/**
* f32_mat_x_vec_simd
* SIMD-accelerated matrix × vector multiply.
*/
#if defined(__ARM_FEATURE_SVE)
__attribute__ ((noinline))
void f32_mat_x_vec_simd(const float *A, const float *x, float *y,
uint64_t rows, uint64_t cols)
{
for (uint64_t i = 0; i < rows; i++) {
svfloat32_t acc = svdup_f32(0.0f);
for (uint64_t j = 0; j < cols; j += svcntw()) {
svbool_t pg = svwhilelt_b32(j, cols);
svfloat32_t a_vec = svld1(pg, &A[i * cols + j]);
svfloat32_t x_vec = svld1(pg, &x[j]);
acc = svmla_m(pg, acc, a_vec, x_vec);
}
/* Reduce across vector lanes into a scalar */
float sum = svaddv(svptrue_b32(), acc);
y[i] = sum;
}
}
#else
__attribute__ ((noinline))
void f32_mat_x_vec_simd(const float *__restrict A, const float *__restrict x,
float *__restrict y, uint64_t n, uint64_t m)
{
for (uint64_t i = 0; i < n; i++) {
float32x4_t acc = vdupq_n_f32(0.0f);
uint64_t j = 0;
for (; j + 4 <= m; j += 4) {
float32x4_t a = vld1q_f32(&A[i * m + j]);
float32x4_t xv = vld1q_f32(&x[j]);
acc = vmlaq_f32(acc, a, xv);
}
/* Horizontal reduction of 4 lanes */
float sum;
#if defined(__aarch64__)
sum = vaddvq_f32(acc); /* AArch64: fast across-lanes sum */
#else
/* Portable AArch32 reduction */
float32x2_t low = vget_low_f32(acc);
float32x2_t high = vget_high_f32(acc);
float32x2_t s2 = vadd_f32(low, high); /* (a0+a2, a1+a3) */
s2 = vpadd_f32(s2, s2); /* (a0+a1+a2+a3, ...) */
sum = vget_lane_f32(s2, 0);
#endif
/* Scalar tail */
for (; j < m; ++j) {
sum += A[i * m + j] * x[j];
}
y[i] = sum;
}
}
#endif
/**
* f32_cmplx_mult_ref
* Element-wise complex multiply reference
* dst[i] = src_A[i] * src_B[i], i.e. (a+bi)(c+di) = (ac-bd) + (ad+bc)i
* Complex numbers stored as {re, im} interleaved.
*/
void f32_cmplx_mult_ref(const cfloat32_t *src_A, const cfloat32_t *src_B,
cfloat32_t *dst, uint64_t size)
{
#ifdef USE_ASM_SCALAR_AS_REF
__asm__ volatile (
"1: subs %[size], %[size], #1\n"
" ldp s0, s1, [%[src_A]], #8\n"
" ldp s2, s3, [%[src_B]], #8\n"
" fmul s4, s3, s1\n"
" fmul s1, s1, s2\n"
" fnmsub s2, s2, s0, s4\n"
" fmadd s1, s3, s0, s1\n"
" stp s2, s1, [%[dst]], #8\n"
" b.ne 1b\n"
: [src_A] "+&r"(src_A), [src_B] "+&r"(src_B),
[dst] "+&r"(dst), [size] "+&r"(size)
:
: "s0", "s1", "s2", "s3", "s4", "memory", "cc");
#else
uint64_t i;
for (i = 0; i < size; i++) {
dst[i].re = (src_A[i].re * src_B[i].re) - (src_A[i].im * src_B[i].im);
dst[i].im = (src_A[i].re * src_B[i].im) + (src_A[i].im * src_B[i].re);
}
#endif
}
/**
* f32_cmplx_mult_simd
* Element-wise complex multiply SIMD
* dst[i] = src_A[i] * src_B[i]
* Complex numbers stored as {re, im} interleaved.
*/
#if defined(__ARM_FEATURE_SVE)
__attribute__ ((noinline))
void f32_cmplx_mult_simd(const cfloat32_t *src_A, const cfloat32_t *src_B,
cfloat32_t *dst, uint64_t size)
{
uint64_t count = 0, count_1 = 0;
uint64_t a1 = 0, b1 = 0, c1 = 0;
__asm__ volatile (
/* Check if there are any elements to process before start of loop */
" ptrue p2.s\n"
" whilelt p0.d, %[count], %[size]\n"
" b.none 3f\n"
" cntd %[count_1]\n"
" whilelt p1.d, %[count_1], %[size]\n"
" b.none 2f\n"
" addvl %[a1], %[src_A], #1\n"
" addvl %[b1], %[src_B], #1\n"
" addvl %[c1], %[dst], #1\n"
"1: dup z0.s, #0\n"
" dup z1.s, #0\n"
/* Load complex elements from a and b arrays */
" ld1d {z10.d}, p0/z, [%[src_A], %[count], lsl #3]\n"
" ld1d {z12.d}, p0/z, [%[src_B], %[count], lsl #3]\n"
" ld1d {z11.d}, p1/z, [%[a1], %[count], lsl #3]\n"
" ld1d {z13.d}, p1/z, [%[b1], %[count], lsl #3]\n"
/* Floating-point complex multiply: FCMLA #0 accumulates re*re / re*im, FCMLA #90 adds -im*im / im*re */
" fcmla z0.s, p2/m, z10.s, z12.s, #0\n"
" fcmla z0.s, p2/m, z10.s, z12.s, #90\n"
" fcmla z1.s, p2/m, z11.s, z13.s, #0\n"
" fcmla z1.s, p2/m, z11.s, z13.s, #90\n"
/* Store result to c array */
" st1d {z0.d}, p0, [%[dst], %[count], lsl #3]\n"
" st1d {z1.d}, p1, [%[c1], %[count], lsl #3]\n"
" incd %[count], all, mul #2\n"
" whilelt p1.d, %[count], %[size]\n"
" b.first 1b\n"
" decd %[count]\n"
" whilelt p0.d, %[count], %[size]\n"
" b.none 3f\n"
"2: dup z0.s, #0\n"
" ld1d {z10.d}, p0/z, [%[src_A], %[count], lsl #3]\n"
" ld1d {z12.d}, p0/z, [%[src_B], %[count], lsl #3]\n"
" fcmla z0.s, p2/m, z10.s, z12.s, #0\n"
" fcmla z0.s, p2/m, z10.s, z12.s, #90\n"
" st1d {z0.d}, p0, [%[dst], %[count], lsl #3]\n"
/* End of operation */
"3:\n"
: [a1] "+&r"(a1), [b1] "+&r"(b1),
[c1] "+&r"(c1), [count] "+&r"(count), [count_1] "+&r"(count_1)
: [src_A] "r"(src_A), [src_B] "r"(src_B),
[dst] "r"(dst), [size] "r"(size)
: "z0", "z1", "z10", "z11", "z12", "z13", "p0", "p1", "p2", "memory", "cc");
}
#else
__attribute__ ((noinline))
void f32_cmplx_mult_simd(const cfloat32_t *src_A, const cfloat32_t *src_B,
cfloat32_t *dst, uint64_t size)
{
float reg0 = 0, reg1 = 0, reg2 = 0, reg3 = 0, reg4 = 0, reg5 = 0;
__asm__ volatile (
/* Check whether there are >=4 elements to process before */
/* starting loop. If not, proceed to loop tail. */
" cbz %[size], 3f\n"
" cmp %[size], #4\n"
" blt 2f\n"
/* Loop begins here: */
"1: movi v0.4s, #0x0\n"
" movi v1.4s, #0x0\n"
/* Load elements from a & b arrays such that real and */
/* imaginary parts are de-interleaved */
" ld2 {v10.4s,v11.4s}, [%[src_A]], #32\n"
" ld2 {v20.4s,v21.4s}, [%[src_B]], #32\n"
/* Perform complex multiplication */
" fmla v0.4s, v10.4s, v20.4s\n"
" fmls v0.4s, v11.4s, v21.4s\n"
" fmla v1.4s, v10.4s, v21.4s\n"
" fmla v1.4s, v11.4s, v20.4s\n"
/* Store the result to c array */
" st2 {v0.4s,v1.4s}, [%[dst]], #32\n"
/* Check whether >=4 elements are left to continue with */
/* loop iterations */
" sub %[size], %[size], #4\n"
" cmp %[size], #4\n"
" bge 1b\n"
/* Loop ends here: */
/* Process loop tail if any */
"2: cbz %[size], 3f\n"
" ldp %s[reg0], %s[reg1], [%[src_A]], #8\n"
" ldp %s[reg2], %s[reg3], [%[src_B]], #8\n"
" fmul %s[reg4], %s[reg0], %s[reg2]\n"
" fmsub %s[reg4], %s[reg1], %s[reg3], %s[reg4]\n"
" fmul %s[reg5], %s[reg0], %s[reg3]\n"
" fmadd %s[reg5], %s[reg1], %s[reg2], %s[reg5]\n"
" stp %s[reg4], %s[reg5], [%[dst]], #8\n"
" sub %[size], %[size], #1\n"
" cbnz %[size], 2b\n"
"3:\n"
: [src_A] "+&r"(src_A), [src_B] "+&r"(src_B),
[dst] "+&r"(dst), [reg0] "+&w"(reg0), [reg1] "+&w"(reg1), [reg2] "+&w"(reg2),
[reg3] "+&w"(reg3), [reg4] "+&w"(reg4), [reg5] "+&w"(reg5), [size] "+&r"(size)
:
: "v0", "v1", "v10", "v11", "v20", "v21", "v0", "v1", "memory", "cc");
}
#endif
void gen_test_data_f32(float32_t *in_out, float32_t scale,
float32_t offs, uint64_t size)
{
for (uint64_t i = 0; i < size; i++) {
in_out[i] = (float32_t)i * scale + offs;
}
}
/* context switch tracker */
atomic_t switches_cnt;
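/*
 * The HINT immediates used below (and as markers throughout the worker
 * loops) are unallocated hint encodings, which AArch64 defines to execute
 * as NOPs; presumably they serve as recognizable markers when inspecting
 * instruction traces (e.g. FVP Tarmac output).
 */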
void sys_trace_thread_switched_in_user(void)
{
__asm__ volatile ("hint #0x31");
}
void sys_trace_thread_switched_out_user(void)
{
atomic_inc(&switches_cnt);
__asm__ volatile ("hint #0x33");
}
static inline uint32_t preempt_snapshot(void)
{
return (uint32_t)atomic_get(&switches_cnt);
}
static inline uint32_t preempt_cnt_delta(uint32_t snap)
{
return (uint32_t)atomic_get(&switches_cnt) - snap;
}
/* Per-thread result counters */
struct worker_stats {
volatile int failures; /* count of mismatches */
volatile int success; /* how many checks performed */
volatile int switch_during_simd;/* how many times preempted during simd */
uint32_t id; /* unique per worker */
} __aligned(64);
struct matrix_test_ctx {
const float32_t *mat;
const float32_t *vec;
float32_t *out_simd;
float32_t *out_scal;
} __aligned(64);
struct cplxmult_test_ctx {
const cfloat32_t *vecA;
const cfloat32_t *vecB;
cfloat32_t *out_simd;
cfloat32_t *out_scal;
} __aligned(64);
struct worker_stats wstats[SIMD_THREAD_CNT];
struct matrix_test_ctx mattst_ctx[MATRIX_TEST_CTX];
struct cplxmult_test_ctx cplxmultst_ctx[CMPLX_MUL_TEST_CTX];
K_THREAD_STACK_ARRAY_DEFINE(worker_stacks, SIMD_THREAD_CNT, WORKER_STACK);
struct k_thread worker_threads[SIMD_THREAD_CNT];
K_THREAD_STACK_DEFINE(preemptor_stack, PREEMPTOR_STACK);
struct k_thread preemptor_thread;
/* Test control */
volatile bool test_complete;
void matmul_loop(void *arg1, void *arg2, void *arg3)
{
ARG_UNUSED(arg3);
struct worker_stats *st = (struct worker_stats *)arg1;
struct matrix_test_ctx *tst = (struct matrix_test_ctx *)arg2;
/*
* unique per-thread seed for vector generation
*/
float32_t offs = 2.0f + (float32_t)st->id;
offs = offs * 0.0123f;
gen_test_data_f32((float32_t *)tst->mat, 0.125f, offs,
TEST_MATRIX_ROWS * TEST_MATRIX_COLS);
gen_test_data_f32((float32_t *)tst->vec, 0.025f, offs, TEST_MATRIX_COLS);
f32_mat_x_vec_ref(tst->mat, tst->vec, tst->out_scal, TEST_MATRIX_ROWS, TEST_MATRIX_COLS);
while (!test_complete) {
__asm__ volatile (
"hint #9\n"
"mov x0, %[id]\n"
:
: [id] "r"(st->id)
: "x0");
/*
* clear output
*/
for (uint64_t i = 0; i < TEST_MATRIX_ROWS; i++) {
tst->out_simd[i] = 0.0f;
}
uint32_t snap = preempt_snapshot();
f32_mat_x_vec_simd(tst->mat, tst->vec, tst->out_simd, TEST_MATRIX_ROWS,
TEST_MATRIX_COLS);
uint32_t cnt = preempt_cnt_delta(snap);
if (cnt) {
st->switch_during_simd += cnt;
}
if (!vec_within_threshold_f32(tst->out_scal, tst->out_simd,
TEST_MATRIX_ROWS, F32REL_THRSH)) {
printf("error in mat x vec test\n");
dump_buf(tst->out_simd, TEST_MATRIX_ROWS, TEST_MATRIX_ROWS, "%f");
dump_buf(tst->out_scal, TEST_MATRIX_ROWS, TEST_MATRIX_ROWS, "%f");
st->failures++;
break;
}
st->success++;
__asm__ volatile ("hint #11");
}
}
void cmplxmul_loop(void *arg1, void *arg2, void *arg3)
{
ARG_UNUSED(arg3);
struct worker_stats *st = (struct worker_stats *)arg1;
struct cplxmult_test_ctx *tst = (struct cplxmult_test_ctx *)arg2;
/*
* unique per-thread seed
*/
float32_t offs = 2.0f + (float32_t)st->id;
gen_test_data_f32((float32_t *)tst->vecA, 0.025f, offs,
TEST_CMPLX_MULT_SZ * 2);
gen_test_data_f32((float32_t *)tst->vecB, 0.01234f, (0.234f * offs),
TEST_CMPLX_MULT_SZ * 2);
f32_cmplx_mult_ref(tst->vecA, tst->vecB, tst->out_scal, TEST_CMPLX_MULT_SZ);
while (!test_complete) {
__asm__ volatile (
"hint #0x21\n"
"mov x0, %[id]\n"
:
: [id] "r"(st->id)
: "x0");
/*
* clear output
*/
for (uint64_t i = 0; i < TEST_CMPLX_MULT_SZ; i++) {
tst->out_simd[i].re = 0.0f;
tst->out_simd[i].im = 0.0f;
}
uint32_t snap = preempt_snapshot();
f32_cmplx_mult_simd(tst->vecA, tst->vecB, tst->out_simd, TEST_CMPLX_MULT_SZ);
st->switch_during_simd += preempt_cnt_delta(snap);
float32_t *ptest_cmplx_mult_out_simd = (float32_t *)tst->out_simd;
float32_t *ptest_cmplx_mult_out_scalar = (float32_t *)tst->out_scal;
if (!vec_within_threshold_f32(ptest_cmplx_mult_out_scalar,
ptest_cmplx_mult_out_simd,
TEST_CMPLX_MULT_SZ * 2,
F32REL_THRSH)) {
printf("error in cmplx test\n");
dump_buf(ptest_cmplx_mult_out_simd, TEST_CMPLX_MULT_SZ,
TEST_CMPLX_MULT_SZ, "%f");
dump_buf(ptest_cmplx_mult_out_scalar, TEST_CMPLX_MULT_SZ,
TEST_CMPLX_MULT_SZ, "%f");
st->failures++;
break;
}
st->success++;
__asm__ volatile ("hint #0x23");
}
}
/**
* simd_reg_clobber
* Simulate register clobbering between context switches
*/
void simd_reg_clobber(void)
{
#if defined(__ARM_FEATURE_SVE)
__asm__ volatile (
" dup z0.s, #0\n" /* include z0 so every vector register is clobbered */
" dup z1.s, #1\n"
" dup z2.s, #2\n"
" dup z3.s, #3\n"
" dup z4.s, #4\n"
" dup z5.s, #5\n"
" dup z6.s, #6\n"
" dup z7.s, #7\n"
" dup z8.s, #8\n"
" dup z9.s, #9\n"
" dup z10.s, #10\n"
" dup z11.s, #11\n"
" dup z12.s, #12\n"
" dup z13.s, #13\n"
" dup z14.s, #14\n"
" dup z15.s, #15\n"
" dup z16.s, #(16+0)\n"
" dup z17.s, #(16+1)\n"
" dup z18.s, #(16+2)\n"
" dup z19.s, #(16+3)\n"
" dup z20.s, #(16+4)\n"
" dup z21.s, #(16+5)\n"
" dup z22.s, #(16+6)\n"
" dup z23.s, #(16+7)\n"
" dup z24.s, #(16+8)\n"
" dup z25.s, #(16+9)\n"
" dup z26.s, #(16+10)\n"
" dup z27.s, #(16+11)\n"
" dup z28.s, #(16+12)\n"
" dup z29.s, #(16+13)\n"
" dup z30.s, #(16+14)\n"
" dup z31.s, #(16+15)\n"
" whilelo p0.s,%x[s] ,%x[e]\n"
" whilelo p1.s,%x[s] ,%x[e]\n"
" whilelo p2.s,%x[s] ,%x[e]\n"
" whilelo p3.s,%x[s] ,%x[e]\n"
" whilelo p4.s,%x[s] ,%x[e]\n"
" whilelo p5.s,%x[s] ,%x[e]\n"
" whilelo p6.s,%x[s] ,%x[e]\n"
" whilelo p7.s,%x[s] ,%x[e]\n"
" whilelo p8.s,%x[s] ,%x[e]\n"
" whilelo p9.s,%x[s] ,%x[e]\n"
" whilelo p10.s,%x[s] ,%x[e]\n"
" whilelo p11.s,%x[s] ,%x[e]\n"
" whilelo p12.s,%x[s] ,%x[e]\n"
" whilelo p13.s,%x[s] ,%x[e]\n"
" whilelo p14.s,%x[s] ,%x[e]\n"
" whilelo p15.s,%x[s] ,%x[e]\n"
:
: [s] "r"(0), [e] "r"(3)
: "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
"memory", "cc");
#else
__asm__ volatile (
" movi v0.4s, #0\n"
" movi v1.4s, #1\n"
" movi v2.4s, #2\n"
" movi v3.4s, #3\n"
" movi v4.4s, #4\n"
" movi v5.4s, #5\n"
" movi v6.4s, #6\n"
" movi v7.4s, #7\n"
" movi v8.4s, #8\n"
" movi v9.4s, #9\n"
" movi v10.4s, #10\n"
" movi v11.4s, #11\n"
" movi v12.4s, #12\n"
" movi v13.4s, #13\n"
" movi v14.4s, #14\n"
" movi v15.4s, #15\n"
" movi v16.4s, #(16+0)\n"
" movi v17.4s, #(16+1)\n"
" movi v18.4s, #(16+2)\n"
" movi v19.4s, #(16+3)\n"
" movi v20.4s, #(16+4)\n"
" movi v21.4s, #(16+5)\n"
" movi v22.4s, #(16+6)\n"
" movi v23.4s, #(16+7)\n"
" movi v24.4s, #(16+8)\n"
" movi v25.4s, #(16+9)\n"
" movi v26.4s, #(16+10)\n"
" movi v27.4s, #(16+11)\n"
" movi v28.4s, #(16+12)\n"
" movi v29.4s, #(16+13)\n"
" movi v30.4s, #(16+14)\n"
" movi v31.4s, #(16+15)\n"
:
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
"memory", "cc");
#endif
}
/* high-priority preemptor: force preemption + SIMD register clobber */
void preemptor_entry(void *a, void *b, void *c)
{
ARG_UNUSED(a);
ARG_UNUSED(b);
ARG_UNUSED(c);
int periodic_cnt = 0;
while (periodic_cnt < TEST_DURATION_MSEC) {
__asm__ volatile ("hint #13");
simd_reg_clobber();
if (++periodic_cnt % STAT_UPD_PERIOD_MSEC == 0) {
for (int i = 0; i < SIMD_THREAD_CNT; ++i) {
TC_PRINT("task %d succ %d fail %d switch_during_simd %d\n",
wstats[i].id, wstats[i].success,
wstats[i].failures, wstats[i].switch_during_simd);
}
}
k_msleep(1);
}
}
void sve_stress_init(void)
{
/* Reset state */
test_complete = false;
atomic_set(&switches_cnt, 0);
for (int i = 0; i < SIMD_THREAD_CNT; ++i) {
wstats[i].id = i;
wstats[i].success = 0;
wstats[i].failures = 0;
wstats[i].switch_during_simd = 0;
}
}
ZTEST(arm64_sve_stress, test_simd_context_switch_stress)
{
TC_PRINT("=== ARM Neon/SVE2 SIMD context switch stress test ===\n");
#if defined(__ARM_FEATURE_SVE)
TC_PRINT("Using SVE instructions\n");
#else
TC_PRINT("Using Neon instructions\n");
#endif
sve_stress_init();
/*
* Spawn workers at equal priority, preemptive
*/
for (int i = 0; i < SIMD_THREAD_CNT; ++i) {
char thrd_name[16];
if (i < MATRIX_TEST_CTX) {
mattst_ctx[i].mat = test_matrix[i];
mattst_ctx[i].vec = test_vector[i];
mattst_ctx[i].out_simd = test_out_simd[i];
mattst_ctx[i].out_scal = test_out_scalar[i];
snprintk(thrd_name, sizeof(thrd_name), "matmul%d", wstats[i].id);
k_thread_create(&worker_threads[i],
worker_stacks[i], WORKER_STACK,
matmul_loop, &wstats[i], &mattst_ctx[i], NULL,
SIMD_TASK_PRIO, 0, K_NO_WAIT);
} else {
int idx = i - MATRIX_TEST_CTX;
cplxmultst_ctx[idx].vecA = test_cmplx_mult_a[idx];
cplxmultst_ctx[idx].vecB = test_cmplx_mult_b[idx];
cplxmultst_ctx[idx].out_simd = test_cmplx_mult_out_simd[idx];
cplxmultst_ctx[idx].out_scal = test_cmplx_mult_out_scalar[idx];
snprintk(thrd_name, sizeof(thrd_name), "cmplxmul%d", wstats[i].id);
k_thread_create(&worker_threads[i],
worker_stacks[i], WORKER_STACK,
cmplxmul_loop, &wstats[i], &cplxmultst_ctx[idx], NULL,
SIMD_TASK_PRIO, 0, K_NO_WAIT);
}
k_thread_name_set(&worker_threads[i], thrd_name);
}
/*
* High-priority preemptor to increase preemption points, SIMD vector content change
* and stats display
*/
k_thread_create(&preemptor_thread, preemptor_stack, PREEMPTOR_STACK,
preemptor_entry, NULL, NULL, NULL, PREEMPTOR_PRIO, 0, K_NO_WAIT);
k_thread_name_set(&preemptor_thread, "simdpreemptor");
/* Wait for test to complete */
k_thread_join(&preemptor_thread, K_FOREVER);
/* Signal threads to stop and wait for them */
test_complete = true;
for (int i = 0; i < SIMD_THREAD_CNT; ++i) {
k_thread_join(&worker_threads[i], K_FOREVER);
}
/* Print final statistics */
TC_PRINT("\n=== Final Test Results ===\n");
int total_failures = 0;
int total_success = 0;
int total_switches = 0;
for (int i = 0; i < SIMD_THREAD_CNT; ++i) {
TC_PRINT("task %d: success=%d failures=%d switches_during_simd=%d\n",
wstats[i].id, wstats[i].success, wstats[i].failures,
wstats[i].switch_during_simd);
total_failures += wstats[i].failures;
total_success += wstats[i].success;
total_switches += wstats[i].switch_during_simd;
}
TC_PRINT("Total: success=%d failures=%d switches=%d\n",
total_success, total_failures, total_switches);
/* Verify test passed */
zassert_equal(total_failures, 0,
"SIMD context switch stress test failed with %d errors", total_failures);
zassert_true(total_success > 0, "No successful SIMD operations completed");
zassert_true(total_switches > 0, "No context switches occurred during SIMD operations");
}
ZTEST_SUITE(arm64_sve_stress, NULL, NULL, NULL, NULL, NULL);

tests/arch/arm64/arm64_sve_stress/testcase.yaml

@@ -0,0 +1,15 @@
tests:
arch.arm64.sve_stress:
arch_allow: arm64
filter: CONFIG_FPU and CONFIG_FPU_SHARING
integration_platforms:
- fvp_base_revc_2xaem/v8a
- fvp_base_revc_2xaem/v9a
tags:
- sve
- neon
- arm64
- context_switch
- stress
timeout: 120
slow: true