author		Heiko Carstens <hca@linux.ibm.com>	2024-02-03 11:45:18 +0100
committer	Heiko Carstens <hca@linux.ibm.com>	2024-02-16 14:30:16 +0100
commit		8c09871a950a3fe686e0e27fd4193179c5f74f37 (patch)
tree		55b39ecbbdcb9e43dc3c24b733795f606068637a /arch/s390
parent		066c40918bb495de8f2e45bd7eec06737a142712 (diff)
s390/fpu: limit save and restore to used registers
The first invocation of kernel_fpu_begin() after switching from user to
kernel context will save all vector registers, even if only parts of the
vector registers are used within the kernel fpu context. Given that save
and restore of all vector registers is quite expensive, change the current
approach in several ways:

- Instead of saving and restoring all user registers, limit this to those
  registers which are actually used within a kernel fpu context.

- On context switch save all remaining user fpu registers, so they can be
  restored when the task is rescheduled.

- Saving user registers within kernel_fpu_begin() is done without
  disabling and enabling interrupts - which also slightly reduces runtime.
  In the worst case (e.g. interrupt context uses the same registers) this
  may lead to the situation that registers are saved several times;
  however the assumption is that this will not happen frequently, so that
  the new method is faster in nearly all cases.

- save_user_fpu_regs() can still be called from all contexts and saves all
  (or all remaining) user registers to a task's ufpu user fpu save area.

Overall this reduces the time required to save and restore the user fpu
context for nearly all cases.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
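As an illustration (a minimal sketch, not part of this patch; the example
function name and the DECLARE_KERNEL_FPU_ONSTACK16() on-stack helper are
assumptions based on the rest of this series), a kernel fpu section that
only touches the floating point control register and vector registers
V0-V15 now only pays for saving exactly those registers:

	/* Sketch: partial kernel fpu usage with the reworked flags. */
	static void example_use_low_vxrs(void)
	{
		/* Assumed on-stack helper sized for 16 vector registers. */
		DECLARE_KERNEL_FPU_ONSTACK16(vxstate);

		kernel_fpu_begin(&vxstate, KERNEL_FPR);	/* fpc + V0-V15 */
		/* ... use fpc and vector registers V0-V15 here ... */
		kernel_fpu_end(&vxstate, KERNEL_FPR);
	}

With this change the first kernel_fpu_begin() after returning from user
space saves only fpc and V0-V15 of the user context to the task's ufpu
save area; the remaining user registers are saved lazily, at the latest on
the next context switch.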
Diffstat (limited to 'arch/s390')
-rw-r--r--	arch/s390/include/asm/entry-common.h	3
-rw-r--r--	arch/s390/include/asm/fpu.h		81
-rw-r--r--	arch/s390/include/asm/processor.h	1
-rw-r--r--	arch/s390/kernel/fpu.c			112
4 files changed, 128 insertions(+), 69 deletions(-)
diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h
index 659e07d7f31a8..7f5004065e8aa 100644
--- a/arch/s390/include/asm/entry-common.h
+++ b/arch/s390/include/asm/entry-common.h
@@ -41,8 +41,7 @@ static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
static __always_inline void arch_exit_to_user_mode(void)
{
- if (test_thread_flag(TIF_FPU))
- __load_user_fpu_regs();
+ load_user_fpu_regs();
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
debug_user_asce(1);
diff --git a/arch/s390/include/asm/fpu.h b/arch/s390/include/asm/fpu.h
index c1b3920092a1c..c84cb33913e29 100644
--- a/arch/s390/include/asm/fpu.h
+++ b/arch/s390/include/asm/fpu.h
@@ -58,10 +58,6 @@ static inline bool cpu_has_vx(void)
return likely(test_facility(129));
}
-void save_user_fpu_regs(void);
-void load_user_fpu_regs(void);
-void __load_user_fpu_regs(void);
-
enum {
KERNEL_FPC_BIT = 0,
KERNEL_VXR_V0V7_BIT,
@@ -83,6 +79,8 @@ enum {
#define KERNEL_VXR (KERNEL_VXR_LOW | KERNEL_VXR_HIGH)
#define KERNEL_FPR (KERNEL_FPC | KERNEL_VXR_LOW)
+void load_fpu_state(struct fpu *state, int flags);
+void save_fpu_state(struct fpu *state, int flags);
void __kernel_fpu_begin(struct kernel_fpu *state, int flags);
void __kernel_fpu_end(struct kernel_fpu *state, int flags);
@@ -162,26 +160,57 @@ static __always_inline void load_fp_regs_vx(__vector128 *vxrs)
__load_fp_regs(fprs, sizeof(__vector128) / sizeof(freg_t));
}
-static inline void _kernel_fpu_begin(struct kernel_fpu *state, int flags)
+static inline void load_user_fpu_regs(void)
+{
+ struct thread_struct *thread = &current->thread;
+
+ if (!thread->ufpu_flags)
+ return;
+ load_fpu_state(&thread->ufpu, thread->ufpu_flags);
+ thread->ufpu_flags = 0;
+}
+
+static __always_inline void __save_user_fpu_regs(struct thread_struct *thread, int flags)
{
- state->hdr.mask = READ_ONCE(current->thread.kfpu_flags);
- if (!test_thread_flag(TIF_FPU)) {
- /* Save user space FPU state and register contents */
- save_user_fpu_regs();
- } else if (state->hdr.mask & flags) {
- /* Save FPU/vector register in-use by the kernel */
+ save_fpu_state(&thread->ufpu, flags);
+ __atomic_or(flags, &thread->ufpu_flags);
+}
+
+static inline void save_user_fpu_regs(void)
+{
+ struct thread_struct *thread = &current->thread;
+ int mask, flags;
+
+ mask = __atomic_or(KERNEL_FPC | KERNEL_VXR, &thread->kfpu_flags);
+ flags = ~READ_ONCE(thread->ufpu_flags) & (KERNEL_FPC | KERNEL_VXR);
+ if (flags)
+ __save_user_fpu_regs(thread, flags);
+ barrier();
+ WRITE_ONCE(thread->kfpu_flags, mask);
+}
+
+static __always_inline void _kernel_fpu_begin(struct kernel_fpu *state, int flags)
+{
+ struct thread_struct *thread = &current->thread;
+ int mask, uflags;
+
+ mask = __atomic_or(flags, &thread->kfpu_flags);
+ state->hdr.mask = mask;
+ uflags = READ_ONCE(thread->ufpu_flags);
+ if ((uflags & flags) != flags)
+ __save_user_fpu_regs(thread, ~uflags & flags);
+ if (mask & flags)
__kernel_fpu_begin(state, flags);
- }
- __atomic_or(flags, &current->thread.kfpu_flags);
}
-static inline void _kernel_fpu_end(struct kernel_fpu *state, int flags)
+static __always_inline void _kernel_fpu_end(struct kernel_fpu *state, int flags)
{
- WRITE_ONCE(current->thread.kfpu_flags, state->hdr.mask);
- if (state->hdr.mask & flags) {
- /* Restore FPU/vector register in-use by the kernel */
+ int mask = state->hdr.mask;
+
+ if (mask & flags)
__kernel_fpu_end(state, flags);
- }
+ barrier();
+ WRITE_ONCE(current->thread.kfpu_flags, mask);
}
void __kernel_fpu_invalid_size(void);
@@ -222,28 +251,16 @@ static __always_inline void kernel_fpu_check_size(int flags, unsigned int size)
static inline void save_kernel_fpu_regs(struct thread_struct *thread)
{
- struct fpu *state = &thread->kfpu;
-
if (!thread->kfpu_flags)
return;
- fpu_stfpc(&state->fpc);
- if (likely(cpu_has_vx()))
- save_vx_regs(state->vxrs);
- else
- save_fp_regs_vx(state->vxrs);
+ save_fpu_state(&thread->kfpu, thread->kfpu_flags);
}
static inline void restore_kernel_fpu_regs(struct thread_struct *thread)
{
- struct fpu *state = &thread->kfpu;
-
if (!thread->kfpu_flags)
return;
- fpu_lfpc(&state->fpc);
- if (likely(cpu_has_vx()))
- load_vx_regs(state->vxrs);
- else
- load_fp_regs_vx(state->vxrs);
+ load_fpu_state(&thread->kfpu, thread->kfpu_flags);
}
static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs)
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index ecce58abf3dba..7cf00cf8fb0bc 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -166,6 +166,7 @@ struct thread_struct {
unsigned int gmap_write_flag; /* gmap fault write indication */
unsigned int gmap_int_code; /* int code of last gmap fault */
unsigned int gmap_pfault; /* signal of a pending guest pfault */
+ int ufpu_flags; /* user fpu flags */
int kfpu_flags; /* kernel fpu flags */
/* Per-thread information related to debugging */
diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c
index 62e9befe7890a..fa90bbdc5ef94 100644
--- a/arch/s390/kernel/fpu.c
+++ b/arch/s390/kernel/fpu.c
@@ -107,45 +107,87 @@ void __kernel_fpu_end(struct kernel_fpu *state, int flags)
}
EXPORT_SYMBOL(__kernel_fpu_end);
-void __load_user_fpu_regs(void)
+void load_fpu_state(struct fpu *state, int flags)
{
- struct fpu *state = &current->thread.ufpu;
-
- fpu_lfpc_safe(&state->fpc);
- if (likely(cpu_has_vx()))
- load_vx_regs(state->vxrs);
- else
- load_fp_regs_vx(state->vxrs);
- clear_thread_flag(TIF_FPU);
-}
+ __vector128 *vxrs = &state->vxrs[0];
+ int mask;
-void load_user_fpu_regs(void)
-{
- raw_local_irq_disable();
- __load_user_fpu_regs();
- raw_local_irq_enable();
+ if (flags & KERNEL_FPC)
+ fpu_lfpc(&state->fpc);
+ if (!cpu_has_vx()) {
+ if (flags & KERNEL_VXR_V0V7)
+ load_fp_regs_vx(state->vxrs);
+ return;
+ }
+ mask = flags & KERNEL_VXR;
+ if (mask == KERNEL_VXR) {
+ fpu_vlm(0, 15, &vxrs[0]);
+ fpu_vlm(16, 31, &vxrs[16]);
+ return;
+ }
+ if (mask == KERNEL_VXR_MID) {
+ fpu_vlm(8, 23, &vxrs[8]);
+ return;
+ }
+ mask = flags & KERNEL_VXR_LOW;
+ if (mask) {
+ if (mask == KERNEL_VXR_LOW)
+ fpu_vlm(0, 15, &vxrs[0]);
+ else if (mask == KERNEL_VXR_V0V7)
+ fpu_vlm(0, 7, &vxrs[0]);
+ else
+ fpu_vlm(8, 15, &vxrs[8]);
+ }
+ mask = flags & KERNEL_VXR_HIGH;
+ if (mask) {
+ if (mask == KERNEL_VXR_HIGH)
+ fpu_vlm(16, 31, &vxrs[16]);
+ else if (mask == KERNEL_VXR_V16V23)
+ fpu_vlm(16, 23, &vxrs[16]);
+ else
+ fpu_vlm(24, 31, &vxrs[24]);
+ }
}
-EXPORT_SYMBOL(load_user_fpu_regs);
-void save_user_fpu_regs(void)
+void save_fpu_state(struct fpu *state, int flags)
{
- unsigned long flags;
- struct fpu *state;
-
- local_irq_save(flags);
-
- if (test_thread_flag(TIF_FPU))
- goto out;
-
- state = &current->thread.ufpu;
+ __vector128 *vxrs = &state->vxrs[0];
+ int mask;
- fpu_stfpc(&state->fpc);
- if (likely(cpu_has_vx()))
- save_vx_regs(state->vxrs);
- else
- save_fp_regs_vx(state->vxrs);
- set_thread_flag(TIF_FPU);
-out:
- local_irq_restore(flags);
+ if (flags & KERNEL_FPC)
+ fpu_stfpc(&state->fpc);
+ if (!cpu_has_vx()) {
+ if (flags & KERNEL_VXR_LOW)
+ save_fp_regs_vx(state->vxrs);
+ return;
+ }
+ mask = flags & KERNEL_VXR;
+ if (mask == KERNEL_VXR) {
+ fpu_vstm(0, 15, &vxrs[0]);
+ fpu_vstm(16, 31, &vxrs[16]);
+ return;
+ }
+ if (mask == KERNEL_VXR_MID) {
+ fpu_vstm(8, 23, &vxrs[8]);
+ return;
+ }
+ mask = flags & KERNEL_VXR_LOW;
+ if (mask) {
+ if (mask == KERNEL_VXR_LOW)
+ fpu_vstm(0, 15, &vxrs[0]);
+ else if (mask == KERNEL_VXR_V0V7)
+ fpu_vstm(0, 7, &vxrs[0]);
+ else
+ fpu_vstm(8, 15, &vxrs[8]);
+ }
+ mask = flags & KERNEL_VXR_HIGH;
+ if (mask) {
+ if (mask == KERNEL_VXR_HIGH)
+ fpu_vstm(16, 31, &vxrs[16]);
+ else if (mask == KERNEL_VXR_V16V23)
+ fpu_vstm(16, 23, &vxrs[16]);
+ else
+ fpu_vstm(24, 31, &vxrs[24]);
+ }
}
-EXPORT_SYMBOL(save_user_fpu_regs);
+EXPORT_SYMBOL(save_fpu_state);
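For reference, a sketch of the flag layout that drives the partial save
and restore above, reconstructed from the masks used in save_fpu_state()
and load_fpu_state(); the authoritative definitions live in
arch/s390/include/asm/fpu.h:

	#define KERNEL_VXR_LOW	(KERNEL_VXR_V0V7   | KERNEL_VXR_V8V15)	/* V0-V15 */
	#define KERNEL_VXR_MID	(KERNEL_VXR_V8V15  | KERNEL_VXR_V16V23)	/* V8-V23 */
	#define KERNEL_VXR_HIGH	(KERNEL_VXR_V16V23 | KERNEL_VXR_V24V31)	/* V16-V31 */
	#define KERNEL_VXR	(KERNEL_VXR_LOW    | KERNEL_VXR_HIGH)	/* V0-V31 */
	#define KERNEL_FPR	(KERNEL_FPC        | KERNEL_VXR_LOW)	/* fpc + V0-V15 */

Each single-range bit (KERNEL_VXR_V0V7, KERNEL_VXR_V8V15, KERNEL_VXR_V16V23,
KERNEL_VXR_V24V31) covers eight vector registers, which is why
save_fpu_state() and load_fpu_state() can handle every contiguous
combination with a single fpu_vstm()/fpu_vlm() each.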