author    Palmer Dabbelt <palmer@rivosinc.com>  2024-03-13 07:30:33 -0700
committer Palmer Dabbelt <palmer@rivosinc.com>  2024-03-13 07:30:33 -0700
commit    2b2ca354674bed0d0222ce1426d2d45b065ac1e8 (patch)
tree      b36b271e9efce452423d4b0c7d26eea976e1a7b8
parent    a13a806dfb8a21e57f4e9777cafb547e51c97bbd (diff)
parent    f413aae96cda059635910c462ede0a8f0385897c (diff)
Merge patch series "riscv: Use Kconfig to set unaligned access speed"
Charlie Jenkins <charlie@rivosinc.com> says:

If the hardware unaligned access speed is known at compile time, it is
possible to avoid running the unaligned access speed probe to speed up
boot time.

* b4-shazam-merge:
  riscv: Set unaligned access speed at compile time
  riscv: Decouple emulated unaligned accesses from access speed
  riscv: Only check online cpus for emulated accesses
  riscv: lib: Introduce has_fast_unaligned_access()

Link: https://lore.kernel.org/r/20240308-disable_misaligned_probe_config-v9-0-a388770ba0ce@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
-rw-r--r--  arch/riscv/Kconfig                          |  58
-rw-r--r--  arch/riscv/include/asm/cpufeature.h         |  31
-rw-r--r--  arch/riscv/kernel/Makefile                  |   4
-rw-r--r--  arch/riscv/kernel/cpufeature.c              | 255
-rw-r--r--  arch/riscv/kernel/sys_hwprobe.c             |  13
-rw-r--r--  arch/riscv/kernel/traps_misaligned.c        |  17
-rw-r--r--  arch/riscv/kernel/unaligned_access_speed.c  | 282
-rw-r--r--  arch/riscv/lib/csum.c                       |   7
8 files changed, 374 insertions(+), 293 deletions(-)
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 0bfcfec67ed572..8ebafe337eac98 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -704,27 +704,61 @@ config THREAD_SIZE_ORDER
affects irq stack size, which is equal to thread stack size.
config RISCV_MISALIGNED
- bool "Support misaligned load/store traps for kernel and userspace"
+ bool
select SYSCTL_ARCH_UNALIGN_ALLOW
- default y
help
- Say Y here if you want the kernel to embed support for misaligned
- load/store for both kernel and userspace. When disable, misaligned
- accesses will generate SIGBUS in userspace and panic in kernel.
+ Embed support for emulating misaligned loads and stores.
+
+choice
+ prompt "Unaligned Accesses Support"
+ default RISCV_PROBE_UNALIGNED_ACCESS
+ help
+ This determines the level of support for unaligned accesses. This
+ information is used by the kernel to perform optimizations. It is also
+ exposed to user space via the hwprobe syscall. The hardware will be
+ probed at boot by default.
+
+config RISCV_PROBE_UNALIGNED_ACCESS
+ bool "Probe for hardware unaligned access support"
+ select RISCV_MISALIGNED
+ help
+ During boot, the kernel will run a series of tests to determine the
+ speed of unaligned accesses. This probing will dynamically determine
+ the speed of unaligned accesses on the underlying system. If unaligned
+ memory accesses trap into the kernel as they are not supported by the
+ system, the kernel will emulate the unaligned accesses to preserve the
+ UABI.
+
+config RISCV_EMULATED_UNALIGNED_ACCESS
+ bool "Emulate unaligned access where system support is missing"
+ select RISCV_MISALIGNED
+ help
+ If unaligned memory accesses trap into the kernel as they are not
+ supported by the system, the kernel will emulate the unaligned
+ accesses to preserve the UABI. When the underlying system does support
+ unaligned accesses, the unaligned accesses are assumed to be slow.
+
+config RISCV_SLOW_UNALIGNED_ACCESS
+ bool "Assume the system supports slow unaligned memory accesses"
+ depends on NONPORTABLE
+ help
+ Assume that the system supports slow unaligned memory accesses. The
+ kernel and userspace programs may not be able to run at all on systems
+ that do not support unaligned memory accesses.
config RISCV_EFFICIENT_UNALIGNED_ACCESS
- bool "Assume the CPU supports fast unaligned memory accesses"
+ bool "Assume the system supports fast unaligned memory accesses"
depends on NONPORTABLE
select DCACHE_WORD_ACCESS if MMU
select HAVE_EFFICIENT_UNALIGNED_ACCESS
help
- Say Y here if you want the kernel to assume that the CPU supports
- efficient unaligned memory accesses. When enabled, this option
- improves the performance of the kernel on such CPUs. However, the
- kernel will run much more slowly, or will not be able to run at all,
- on CPUs that do not support efficient unaligned memory accesses.
+ Assume that the system supports fast unaligned memory accesses. When
+ enabled, this option improves the performance of the kernel on such
+ systems. However, the kernel and userspace programs will run much more
+ slowly, or will not be able to run at all, on systems that do not
+ support efficient unaligned memory accesses.
- If unsure what to do here, say N.
+endchoice
endmenu # "Platform type"
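
[Editor's note: the help text above says the access-speed result is exposed to
user space via the hwprobe syscall. As a minimal sketch (not part of this
series), a riscv Linux program could query it like this; the key and value
constants come from the uapi <asm/hwprobe.h> header, assuming a kernel with
hwprobe support:]

/* Hedged sketch: query unaligned-access performance via hwprobe.
 * Builds only on riscv Linux with the hwprobe uapi header available. */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/hwprobe.h>

int main(void)
{
	struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };

	/* pair_count = 1; cpusetsize = 0 and cpus = NULL ask about all cpus. */
	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0) != 0) {
		perror("riscv_hwprobe");
		return 1;
	}

	switch (pair.value & RISCV_HWPROBE_MISALIGNED_MASK) {
	case RISCV_HWPROBE_MISALIGNED_FAST:
		puts("unaligned accesses: fast");
		break;
	case RISCV_HWPROBE_MISALIGNED_EMULATED:
		puts("unaligned accesses: emulated");
		break;
	case RISCV_HWPROBE_MISALIGNED_SLOW:
		puts("unaligned accesses: slow");
		break;
	default:
		puts("unaligned accesses: unknown/unsupported");
	}
	return 0;
}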
diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
index 5a626ed2c47a89..46061f5e976439 100644
--- a/arch/riscv/include/asm/cpufeature.h
+++ b/arch/riscv/include/asm/cpufeature.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright 2022-2023 Rivos, Inc
+ * Copyright 2022-2024 Rivos, Inc
*/
#ifndef _ASM_CPUFEATURE_H
@@ -28,29 +28,38 @@ struct riscv_isainfo {
DECLARE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo);
-DECLARE_PER_CPU(long, misaligned_access_speed);
-
/* Per-cpu ISA extensions. */
extern struct riscv_isainfo hart_isa[NR_CPUS];
void riscv_user_isa_enable(void);
-#ifdef CONFIG_RISCV_MISALIGNED
-bool unaligned_ctl_available(void);
-bool check_unaligned_access_emulated(int cpu);
+#if defined(CONFIG_RISCV_MISALIGNED)
+bool check_unaligned_access_emulated_all_cpus(void);
void unaligned_emulation_finish(void);
+bool unaligned_ctl_available(void);
+DECLARE_PER_CPU(long, misaligned_access_speed);
#else
static inline bool unaligned_ctl_available(void)
{
return false;
}
+#endif
+
+#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
+DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
-static inline bool check_unaligned_access_emulated(int cpu)
+static __always_inline bool has_fast_unaligned_accesses(void)
{
- return false;
+ return static_branch_likely(&fast_unaligned_access_speed_key);
+}
+#else
+static __always_inline bool has_fast_unaligned_accesses(void)
+{
+ if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+ return true;
+ else
+ return false;
}
-
-static inline void unaligned_emulation_finish(void) {}
#endif
unsigned long riscv_get_elf_hwcap(void);
@@ -135,6 +144,4 @@ static __always_inline bool riscv_cpu_has_extension_unlikely(int cpu, const unsi
return __riscv_isa_extension_available(hart_isa[cpu].isa, ext);
}
-DECLARE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
-
#endif
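
[Editor's note: the two has_fast_unaligned_accesses() variants above pick
between a runtime static branch (when probing at boot) and a compile-time
answer (when the speed is fixed by Kconfig). A rough userspace analog of that
pattern, with the hypothetical FAST_UNALIGNED_FIXED macro standing in for
CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS, is:]

/* Sketch only: mirrors the header's two paths, not kernel code.
 * FAST_UNALIGNED_FIXED is a hypothetical stand-in for the Kconfig option. */
#include <stdbool.h>

static bool probed_fast;	/* runtime result, analog of the static key */

static inline bool has_fast_unaligned(void)
{
#ifdef FAST_UNALIGNED_FIXED
	return true;		/* compile-time constant: callers' branches fold away */
#else
	return probed_fast;	/* decided by a startup probe */
#endif
}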
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index f71910718053d8..c8085126a6f989 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -38,7 +38,6 @@ extra-y += vmlinux.lds
obj-y += head.o
obj-y += soc.o
obj-$(CONFIG_RISCV_ALTERNATIVE) += alternative.o
-obj-y += copy-unaligned.o
obj-y += cpu.o
obj-y += cpufeature.o
obj-y += entry.o
@@ -62,6 +61,9 @@ obj-y += tests/
obj-$(CONFIG_MMU) += vdso.o vdso/
obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
+obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o
+obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) += copy-unaligned.o
+
obj-$(CONFIG_FPU) += fpu.o
obj-$(CONFIG_RISCV_ISA_V) += vector.o
obj-$(CONFIG_RISCV_ISA_V) += kernel_mode_vector.o
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 0c7688fa83766f..afeae3ff43dc1f 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -11,7 +11,6 @@
#include <linux/cpu.h>
#include <linux/cpuhotplug.h>
#include <linux/ctype.h>
-#include <linux/jump_label.h>
#include <linux/log2.h>
#include <linux/memory.h>
#include <linux/module.h>
@@ -21,20 +20,12 @@
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/hwcap.h>
-#include <asm/hwprobe.h>
#include <asm/patch.h>
#include <asm/processor.h>
#include <asm/vector.h>
-#include "copy-unaligned.h"
-
#define NUM_ALPHA_EXTS ('z' - 'a' + 1)
-#define MISALIGNED_ACCESS_JIFFIES_LG2 1
-#define MISALIGNED_BUFFER_SIZE 0x4000
-#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
-#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
-
unsigned long elf_hwcap __read_mostly;
/* Host ISA bitmap */
@@ -43,11 +34,6 @@ static DECLARE_BITMAP(riscv_isa, RISCV_ISA_EXT_MAX) __read_mostly;
/* Per-cpu ISA extensions. */
struct riscv_isainfo hart_isa[NR_CPUS];
-/* Performance information */
-DEFINE_PER_CPU(long, misaligned_access_speed);
-
-static cpumask_t fast_misaligned_access;
-
/**
* riscv_isa_extension_base() - Get base extension word
*
@@ -707,247 +693,6 @@ unsigned long riscv_get_elf_hwcap(void)
return hwcap;
}
-static int check_unaligned_access(void *param)
-{
- int cpu = smp_processor_id();
- u64 start_cycles, end_cycles;
- u64 word_cycles;
- u64 byte_cycles;
- int ratio;
- unsigned long start_jiffies, now;
- struct page *page = param;
- void *dst;
- void *src;
- long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
-
- if (check_unaligned_access_emulated(cpu))
- return 0;
-
- /* Make an unaligned destination buffer. */
- dst = (void *)((unsigned long)page_address(page) | 0x1);
- /* Unalign src as well, but differently (off by 1 + 2 = 3). */
- src = dst + (MISALIGNED_BUFFER_SIZE / 2);
- src += 2;
- word_cycles = -1ULL;
- /* Do a warmup. */
- __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
- preempt_disable();
- start_jiffies = jiffies;
- while ((now = jiffies) == start_jiffies)
- cpu_relax();
-
- /*
- * For a fixed amount of time, repeatedly try the function, and take
- * the best time in cycles as the measurement.
- */
- while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
- start_cycles = get_cycles64();
- /* Ensure the CSR read can't reorder WRT to the copy. */
- mb();
- __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
- /* Ensure the copy ends before the end time is snapped. */
- mb();
- end_cycles = get_cycles64();
- if ((end_cycles - start_cycles) < word_cycles)
- word_cycles = end_cycles - start_cycles;
- }
-
- byte_cycles = -1ULL;
- __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
- start_jiffies = jiffies;
- while ((now = jiffies) == start_jiffies)
- cpu_relax();
-
- while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
- start_cycles = get_cycles64();
- mb();
- __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
- mb();
- end_cycles = get_cycles64();
- if ((end_cycles - start_cycles) < byte_cycles)
- byte_cycles = end_cycles - start_cycles;
- }
-
- preempt_enable();
-
- /* Don't divide by zero. */
- if (!word_cycles || !byte_cycles) {
- pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
- cpu);
-
- return 0;
- }
-
- if (word_cycles < byte_cycles)
- speed = RISCV_HWPROBE_MISALIGNED_FAST;
-
- ratio = div_u64((byte_cycles * 100), word_cycles);
- pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
- cpu,
- ratio / 100,
- ratio % 100,
- (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
-
- per_cpu(misaligned_access_speed, cpu) = speed;
-
- /*
- * Set the value of fast_misaligned_access of a CPU. These operations
- * are atomic to avoid race conditions.
- */
- if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
- cpumask_set_cpu(cpu, &fast_misaligned_access);
- else
- cpumask_clear_cpu(cpu, &fast_misaligned_access);
-
- return 0;
-}
-
-static void check_unaligned_access_nonboot_cpu(void *param)
-{
- unsigned int cpu = smp_processor_id();
- struct page **pages = param;
-
- if (smp_processor_id() != 0)
- check_unaligned_access(pages[cpu]);
-}
-
-DEFINE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
-
-static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
-{
- if (cpumask_weight(mask) == weight)
- static_branch_enable_cpuslocked(&fast_misaligned_access_speed_key);
- else
- static_branch_disable_cpuslocked(&fast_misaligned_access_speed_key);
-}
-
-static void set_unaligned_access_static_branches_except_cpu(int cpu)
-{
- /*
- * Same as set_unaligned_access_static_branches, except excludes the
- * given CPU from the result. When a CPU is hotplugged into an offline
- * state, this function is called before the CPU is set to offline in
- * the cpumask, and thus the CPU needs to be explicitly excluded.
- */
-
- cpumask_t fast_except_me;
-
- cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
- cpumask_clear_cpu(cpu, &fast_except_me);
-
- modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
-}
-
-static void set_unaligned_access_static_branches(void)
-{
- /*
- * This will be called after check_unaligned_access_all_cpus so the
- * result of unaligned access speed for all CPUs will be available.
- *
- * To avoid the number of online cpus changing between reading
- * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
- * held before calling this function.
- */
-
- cpumask_t fast_and_online;
-
- cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
-
- modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
-}
-
-static int lock_and_set_unaligned_access_static_branch(void)
-{
- cpus_read_lock();
- set_unaligned_access_static_branches();
- cpus_read_unlock();
-
- return 0;
-}
-
-arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
-
-static int riscv_online_cpu(unsigned int cpu)
-{
- static struct page *buf;
-
- /* We are already set since the last check */
- if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
- goto exit;
-
- buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
- if (!buf) {
- pr_warn("Allocation failure, not measuring misaligned performance\n");
- return -ENOMEM;
- }
-
- check_unaligned_access(buf);
- __free_pages(buf, MISALIGNED_BUFFER_ORDER);
-
-exit:
- set_unaligned_access_static_branches();
-
- return 0;
-}
-
-static int riscv_offline_cpu(unsigned int cpu)
-{
- set_unaligned_access_static_branches_except_cpu(cpu);
-
- return 0;
-}
-
-/* Measure unaligned access on all CPUs present at boot in parallel. */
-static int check_unaligned_access_all_cpus(void)
-{
- unsigned int cpu;
- unsigned int cpu_count = num_possible_cpus();
- struct page **bufs = kzalloc(cpu_count * sizeof(struct page *),
- GFP_KERNEL);
-
- if (!bufs) {
- pr_warn("Allocation failure, not measuring misaligned performance\n");
- return 0;
- }
-
- /*
- * Allocate separate buffers for each CPU so there's no fighting over
- * cache lines.
- */
- for_each_cpu(cpu, cpu_online_mask) {
- bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
- if (!bufs[cpu]) {
- pr_warn("Allocation failure, not measuring misaligned performance\n");
- goto out;
- }
- }
-
- /* Check everybody except 0, who stays behind to tend jiffies. */
- on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
-
- /* Check core 0. */
- smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
-
- /*
- * Setup hotplug callbacks for any new CPUs that come online or go
- * offline.
- */
- cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
- riscv_online_cpu, riscv_offline_cpu);
-
-out:
- unaligned_emulation_finish();
- for_each_cpu(cpu, cpu_online_mask) {
- if (bufs[cpu])
- __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
- }
-
- kfree(bufs);
- return 0;
-}
-
-arch_initcall(check_unaligned_access_all_cpus);
-
void riscv_user_isa_enable(void)
{
if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICBOZ))
diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index a7c56b41efd24d..8cae41a502dd4a 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -147,6 +147,7 @@ static bool hwprobe_ext0_has(const struct cpumask *cpus, unsigned long ext)
return (pair.value & ext);
}
+#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
static u64 hwprobe_misaligned(const struct cpumask *cpus)
{
int cpu;
@@ -169,6 +170,18 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
return perf;
}
+#else
+static u64 hwprobe_misaligned(const struct cpumask *cpus)
+{
+ if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS))
+ return RISCV_HWPROBE_MISALIGNED_FAST;
+
+ if (IS_ENABLED(CONFIG_RISCV_EMULATED_UNALIGNED_ACCESS) && unaligned_ctl_available())
+ return RISCV_HWPROBE_MISALIGNED_EMULATED;
+
+ return RISCV_HWPROBE_MISALIGNED_SLOW;
+}
+#endif
static void hwprobe_one_pair(struct riscv_hwprobe *pair,
const struct cpumask *cpus)
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index 8ded225e8c5b13..2adb7c3e4dd5bf 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -413,7 +413,9 @@ int handle_misaligned_load(struct pt_regs *regs)
perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr);
+#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
*this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_EMULATED;
+#endif
if (!unaligned_enabled)
return -1;
@@ -596,7 +598,7 @@ int handle_misaligned_store(struct pt_regs *regs)
return 0;
}
-bool check_unaligned_access_emulated(int cpu)
+static bool check_unaligned_access_emulated(int cpu)
{
long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu);
unsigned long tmp_var, tmp_val;
@@ -623,7 +625,7 @@ bool check_unaligned_access_emulated(int cpu)
return misaligned_emu_detected;
}
-void unaligned_emulation_finish(void)
+bool check_unaligned_access_emulated_all_cpus(void)
{
int cpu;
@@ -632,13 +634,12 @@ void unaligned_emulation_finish(void)
* accesses emulated since tasks requesting such control can run on any
* CPU.
*/
- for_each_present_cpu(cpu) {
- if (per_cpu(misaligned_access_speed, cpu) !=
- RISCV_HWPROBE_MISALIGNED_EMULATED) {
- return;
- }
- }
+ for_each_online_cpu(cpu)
+ if (!check_unaligned_access_emulated(cpu))
+ return false;
+
unaligned_ctl = true;
+ return true;
}
bool unaligned_ctl_available(void)
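
[Editor's note: check_unaligned_access_emulated() above probes by issuing a
misaligned access and checking whether the trap handler ran. As a loose
userspace analog of probing-by-trapping, assuming POSIX signals, one can do a
deliberately misaligned load and watch for SIGBUS; note that on riscv Linux
with misaligned emulation enabled the access simply succeeds:]

/* Hedged sketch: misaligned-load probe via SIGBUS. The misaligned
 * dereference is technically undefined behavior in C; it is done on
 * purpose here, which is why the pointer is volatile. */
#include <setjmp.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>

static sigjmp_buf env;

static void on_sigbus(int sig)
{
	siglongjmp(env, 1);
}

int main(void)
{
	_Alignas(4) char buf[16] = { 0 };
	volatile uint32_t *p = (uint32_t *)(buf + 1);	/* misaligned by 1 */

	signal(SIGBUS, on_sigbus);
	if (sigsetjmp(env, 1)) {
		puts("misaligned load trapped (SIGBUS)");
		return 0;
	}
	(void)*p;
	puts("misaligned load completed (native or kernel-emulated)");
	return 0;
}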
diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
new file mode 100644
index 00000000000000..52264ea4f0bd75
--- /dev/null
+++ b/arch/riscv/kernel/unaligned_access_speed.c
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2024 Rivos Inc.
+ */
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/jump_label.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/types.h>
+#include <asm/cpufeature.h>
+#include <asm/hwprobe.h>
+
+#include "copy-unaligned.h"
+
+#define MISALIGNED_ACCESS_JIFFIES_LG2 1
+#define MISALIGNED_BUFFER_SIZE 0x4000
+#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
+#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
+
+DEFINE_PER_CPU(long, misaligned_access_speed);
+
+#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
+static cpumask_t fast_misaligned_access;
+static int check_unaligned_access(void *param)
+{
+ int cpu = smp_processor_id();
+ u64 start_cycles, end_cycles;
+ u64 word_cycles;
+ u64 byte_cycles;
+ int ratio;
+ unsigned long start_jiffies, now;
+ struct page *page = param;
+ void *dst;
+ void *src;
+ long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
+
+ if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
+ return 0;
+
+ /* Make an unaligned destination buffer. */
+ dst = (void *)((unsigned long)page_address(page) | 0x1);
+ /* Unalign src as well, but differently (off by 1 + 2 = 3). */
+ src = dst + (MISALIGNED_BUFFER_SIZE / 2);
+ src += 2;
+ word_cycles = -1ULL;
+ /* Do a warmup. */
+ __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+ preempt_disable();
+ start_jiffies = jiffies;
+ while ((now = jiffies) == start_jiffies)
+ cpu_relax();
+
+ /*
+ * For a fixed amount of time, repeatedly try the function, and take
+ * the best time in cycles as the measurement.
+ */
+ while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+ start_cycles = get_cycles64();
+ /* Ensure the CSR read can't reorder WRT to the copy. */
+ mb();
+ __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+ /* Ensure the copy ends before the end time is snapped. */
+ mb();
+ end_cycles = get_cycles64();
+ if ((end_cycles - start_cycles) < word_cycles)
+ word_cycles = end_cycles - start_cycles;
+ }
+
+ byte_cycles = -1ULL;
+ __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+ start_jiffies = jiffies;
+ while ((now = jiffies) == start_jiffies)
+ cpu_relax();
+
+ while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+ start_cycles = get_cycles64();
+ mb();
+ __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+ mb();
+ end_cycles = get_cycles64();
+ if ((end_cycles - start_cycles) < byte_cycles)
+ byte_cycles = end_cycles - start_cycles;
+ }
+
+ preempt_enable();
+
+ /* Don't divide by zero. */
+ if (!word_cycles || !byte_cycles) {
+ pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
+ cpu);
+
+ return 0;
+ }
+
+ if (word_cycles < byte_cycles)
+ speed = RISCV_HWPROBE_MISALIGNED_FAST;
+
+ ratio = div_u64((byte_cycles * 100), word_cycles);
+ pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
+ cpu,
+ ratio / 100,
+ ratio % 100,
+ (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
+
+ per_cpu(misaligned_access_speed, cpu) = speed;
+
+ /*
+ * Set the value of fast_misaligned_access of a CPU. These operations
+ * are atomic to avoid race conditions.
+ */
+ if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
+ cpumask_set_cpu(cpu, &fast_misaligned_access);
+ else
+ cpumask_clear_cpu(cpu, &fast_misaligned_access);
+
+ return 0;
+}
+
+static void check_unaligned_access_nonboot_cpu(void *param)
+{
+ unsigned int cpu = smp_processor_id();
+ struct page **pages = param;
+
+ if (smp_processor_id() != 0)
+ check_unaligned_access(pages[cpu]);
+}
+
+DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
+
+static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
+{
+ if (cpumask_weight(mask) == weight)
+ static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
+ else
+ static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
+}
+
+static void set_unaligned_access_static_branches_except_cpu(int cpu)
+{
+ /*
+ * Same as set_unaligned_access_static_branches, except excludes the
+ * given CPU from the result. When a CPU is hotplugged into an offline
+ * state, this function is called before the CPU is set to offline in
+ * the cpumask, and thus the CPU needs to be explicitly excluded.
+ */
+
+ cpumask_t fast_except_me;
+
+ cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
+ cpumask_clear_cpu(cpu, &fast_except_me);
+
+ modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
+}
+
+static void set_unaligned_access_static_branches(void)
+{
+ /*
+ * This will be called after check_unaligned_access_all_cpus so the
+ * result of unaligned access speed for all CPUs will be available.
+ *
+ * To avoid the number of online cpus changing between reading
+ * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
+ * held before calling this function.
+ */
+
+ cpumask_t fast_and_online;
+
+ cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
+
+ modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
+}
+
+static int lock_and_set_unaligned_access_static_branch(void)
+{
+ cpus_read_lock();
+ set_unaligned_access_static_branches();
+ cpus_read_unlock();
+
+ return 0;
+}
+
+arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
+
+static int riscv_online_cpu(unsigned int cpu)
+{
+ static struct page *buf;
+
+ /* We are already set since the last check */
+ if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
+ goto exit;
+
+ buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
+ if (!buf) {
+ pr_warn("Allocation failure, not measuring misaligned performance\n");
+ return -ENOMEM;
+ }
+
+ check_unaligned_access(buf);
+ __free_pages(buf, MISALIGNED_BUFFER_ORDER);
+
+exit:
+ set_unaligned_access_static_branches();
+
+ return 0;
+}
+
+static int riscv_offline_cpu(unsigned int cpu)
+{
+ set_unaligned_access_static_branches_except_cpu(cpu);
+
+ return 0;
+}
+
+/* Measure unaligned access speed on all CPUs present at boot in parallel. */
+static int check_unaligned_access_speed_all_cpus(void)
+{
+ unsigned int cpu;
+ unsigned int cpu_count = num_possible_cpus();
+ struct page **bufs = kzalloc(cpu_count * sizeof(struct page *),
+ GFP_KERNEL);
+
+ if (!bufs) {
+ pr_warn("Allocation failure, not measuring misaligned performance\n");
+ return 0;
+ }
+
+ /*
+ * Allocate separate buffers for each CPU so there's no fighting over
+ * cache lines.
+ */
+ for_each_cpu(cpu, cpu_online_mask) {
+ bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
+ if (!bufs[cpu]) {
+ pr_warn("Allocation failure, not measuring misaligned performance\n");
+ goto out;
+ }
+ }
+
+ /* Check everybody except 0, who stays behind to tend jiffies. */
+ on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
+
+ /* Check core 0. */
+ smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
+
+ /*
+ * Setup hotplug callbacks for any new CPUs that come online or go
+ * offline.
+ */
+ cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
+ riscv_online_cpu, riscv_offline_cpu);
+
+out:
+ for_each_cpu(cpu, cpu_online_mask) {
+ if (bufs[cpu])
+ __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
+ }
+
+ kfree(bufs);
+ return 0;
+}
+
+static int check_unaligned_access_all_cpus(void)
+{
+ bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
+
+ if (!all_cpus_emulated)
+ return check_unaligned_access_speed_all_cpus();
+
+ return 0;
+}
+#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
+static int check_unaligned_access_all_cpus(void)
+{
+ check_unaligned_access_emulated_all_cpus();
+
+ return 0;
+}
+#endif
+
+arch_initcall(check_unaligned_access_all_cpus);
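
[Editor's note: the probe above times a word-wise misaligned copy against a
byte-wise copy and keeps the best run within a fixed jiffies window. A
simplified userspace analog of that measurement, using clock_gettime()
instead of the cycle CSR and plain C loops instead of the assembly copy
routines, might look like the following; the misaligned word accesses are
deliberate, and an optimizer may vectorize the byte loop, so treat the
numbers as illustrative only:]

/* Simplified userspace analog of the boot-time probe. Not the kernel code. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define BUF_SZ	0x4000
#define COPY_SZ	(BUF_SZ / 2 - 0x80)
#define RUNS	200

static unsigned char area[BUF_SZ + 8];

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static void copy_words(void *dst, const void *src, size_t n)
{
	unsigned long *d = dst;
	const unsigned long *s = src;

	/* dst/src are odd addresses, so every access here is misaligned. */
	for (size_t i = 0; i < n / sizeof(unsigned long); i++)
		d[i] = s[i];
}

static void copy_bytes(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	for (size_t i = 0; i < n; i++)
		d[i] = s[i];
}

static uint64_t best_of(void (*copy)(void *, const void *, size_t),
			void *dst, const void *src)
{
	uint64_t best = UINT64_MAX;

	for (int i = 0; i < RUNS; i++) {
		uint64_t t0 = now_ns();

		copy(dst, src, COPY_SZ);

		uint64_t dt = now_ns() - t0;

		if (dt < best)
			best = dt;
	}
	return best;
}

int main(void)
{
	void *dst = area + 1;			/* unaligned destination */
	void *src = area + BUF_SZ / 2 + 3;	/* unaligned differently */

	copy_words(dst, src, COPY_SZ);		/* warmup, as the probe does */

	uint64_t w = best_of(copy_words, dst, src);
	uint64_t b = best_of(copy_bytes, dst, src);

	printf("word: %llu ns, byte: %llu ns -> unaligned accesses look %s\n",
	       (unsigned long long)w, (unsigned long long)b,
	       w < b ? "fast" : "slow");
	return 0;
}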
diff --git a/arch/riscv/lib/csum.c b/arch/riscv/lib/csum.c
index af3df5274ccbae..7178e0acfa2284 100644
--- a/arch/riscv/lib/csum.c
+++ b/arch/riscv/lib/csum.c
@@ -3,7 +3,7 @@
* Checksum library
*
* Influenced by arch/arm64/lib/csum.c
- * Copyright (C) 2023 Rivos Inc.
+ * Copyright (C) 2023-2024 Rivos Inc.
*/
#include <linux/bitops.h>
#include <linux/compiler.h>
@@ -318,10 +318,7 @@ unsigned int do_csum(const unsigned char *buff, int len)
* branches. The largest chunk of overlap was delegated into the
* do_csum_common function.
*/
- if (static_branch_likely(&fast_misaligned_access_speed_key))
- return do_csum_no_alignment(buff, len);
-
- if (((unsigned long)buff & OFFSET_MASK) == 0)
+ if (has_fast_unaligned_accesses() || (((unsigned long)buff & OFFSET_MASK) == 0))
return do_csum_no_alignment(buff, len);
return do_csum_with_alignment(buff, len);
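
[Editor's note: the new condition takes the no-alignment path either when
unaligned accesses are known to be fast or when the buffer happens to be
word-aligned anyway. The alignment test is the usual low-bits mask trick; a
standalone illustration, with OFFSET_MASK defined here to match what the
csum.c macro expresses:]

/* Standalone sketch of the alignment check used above: a pointer is
 * word-aligned iff its low address bits are all zero. */
#include <stdint.h>

#define OFFSET_MASK (sizeof(unsigned long) - 1)

static inline int word_aligned(const void *p)
{
	return ((uintptr_t)p & OFFSET_MASK) == 0;
}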