-rw-r--r--  src/lib.rs     14
-rw-r--r--  src/memcmp.s  201
-rw-r--r--  src/memcpy.s  265
-rw-r--r--  src/memset.s  120
4 files changed, 592 insertions(+), 8 deletions(-)
diff --git a/src/lib.rs b/src/lib.rs
index 1b4a90c..ce9ec2c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,8 +1,6 @@
-#[cfg(test)]
-mod tests {
- #[test]
- fn it_works() {
- let result = 2 + 2;
- assert_eq!(result, 4);
- }
-}
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+#![no_std]
+core::arch::global_asm!(include_str!("memcmp.s"));
+core::arch::global_asm!(include_str!("memcpy.s"));
+core::arch::global_asm!(include_str!("memset.s"));
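Note: the lib.rs hunk above assembles the three source files directly into the crate with global_asm!, so the symbols memcmp, bcmp, memcpy, memmove and memset are emitted with their usual C signatures. The snippet below is a minimal sketch of how a no_std consumer might declare and call them; it is illustrative only and not part of the patch.

    // Hedged sketch: standard C prototypes for the symbols defined by the
    // assembly files included above.
    extern "C" {
        fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32;
        fn memcpy(dst: *mut u8, src: *const u8, n: usize) -> *mut u8;
        fn memset(dst: *mut u8, c: i32, n: usize) -> *mut u8;
    }

    // Example use: compare two byte slices through the assembly memcmp.
    fn buffers_equal(a: &[u8], b: &[u8]) -> bool {
        a.len() == b.len()
            && unsafe { memcmp(a.as_ptr(), b.as_ptr(), a.len()) } == 0
    }
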
diff --git a/src/memcmp.s b/src/memcmp.s
new file mode 100644
index 0000000..7b7a972
--- /dev/null
+++ b/src/memcmp.s
@@ -0,0 +1,201 @@
+/* memcmp - compare memory
+ *
+ * Copyright (c) 2013-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ */
+
+ src1 .req x0
+ src2 .req x1
+ limit .req x2
+ result .req w0
+ data1 .req x3
+ data1w .req w3
+ data2 .req x4
+ data2w .req w4
+ data3 .req x5
+ data3w .req w5
+ data4 .req x6
+ data4w .req w6
+ tmp .req x6
+ src1end .req x7
+ src2end .req x8
+
+ .section ".text", "ax", %progbits
+ .globl memcmp
+ .globl bcmp
+memcmp:
+bcmp:
+ cmp limit, 16
+ b.lo .Lless16
+ ldp data1, data3, [src1]
+ ldp data2, data4, [src2]
+ ccmp data1, data2, 0, ne
+ ccmp data3, data4, 0, eq
+ b.ne .Lreturn2
+
+ add src1end, src1, limit
+ add src2end, src2, limit
+ cmp limit, 32
+ b.ls .Llast_bytes
+ cmp limit, 160
+ b.hs .Lloop_align
+ sub limit, limit, 32
+
+ .p2align 4
+.Lloop32:
+ ldp data1, data3, [src1, 16]
+ ldp data2, data4, [src2, 16]
+ cmp data1, data2
+ ccmp data3, data4, 0, eq
+ b.ne .Lreturn2
+ cmp limit, 16
+ b.ls .Llast_bytes
+
+ ldp data1, data3, [src1, 32]
+ ldp data2, data4, [src2, 32]
+ cmp data1, data2
+ ccmp data3, data4, 0, eq
+ b.ne .Lreturn2
+ add src1, src1, 32
+ add src2, src2, 32
+.Llast64:
+ subs limit, limit, 32
+ b.hi .Lloop32
+
+ /* Compare last 1-16 bytes using unaligned access. */
+.Llast_bytes:
+ ldp data1, data3, [src1end, -16]
+ ldp data2, data4, [src2end, -16]
+.Lreturn2:
+ cmp data1, data2
+ csel data1, data1, data3, ne
+ csel data2, data2, data4, ne
+
+ /* Compare data bytes and set return value to 0, -1 or 1. */
+.Lreturn:
+#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+
+ .p2align 4
+.Lless16:
+ add src1end, src1, limit
+ add src2end, src2, limit
+ tbz limit, 3, .Lless8
+ ldr data1, [src1]
+ ldr data2, [src2]
+ ldr data3, [src1end, -8]
+ ldr data4, [src2end, -8]
+ b .Lreturn2
+
+ .p2align 4
+.Lless8:
+ tbz limit, 2, .Lless4
+ ldr data1w, [src1]
+ ldr data2w, [src2]
+ ldr data3w, [src1end, -4]
+ ldr data4w, [src2end, -4]
+ b .Lreturn2
+
+.Lless4:
+ tbz limit, 1, .Lless2
+ ldrh data1w, [src1]
+ ldrh data2w, [src2]
+ cmp data1w, data2w
+ b.ne .Lreturn
+.Lless2:
+ mov result, 0
+ tbz limit, 0, .Lreturn_zero
+ ldrb data1w, [src1end, -1]
+ ldrb data2w, [src2end, -1]
+ sub result, data1w, data2w
+.Lreturn_zero:
+ ret
+
+.Lloop_align:
+ ldp data1, data3, [src1, 16]
+ ldp data2, data4, [src2, 16]
+ cmp data1, data2
+ ccmp data3, data4, 0, eq
+ b.ne .Lreturn2
+
+ /* Align src2 and adjust src1, src2 and limit. */
+ and tmp, src2, 15
+ sub tmp, tmp, 16
+ sub src2, src2, tmp
+ add limit, limit, tmp
+ sub src1, src1, tmp
+ sub limit, limit, 64 + 16
+
+ .p2align 4
+.Lloop64_:
+ ldr q0, [src1, 16]
+ ldr q1, [src2, 16]
+ subs limit, limit, 64
+ ldr q2, [src1, 32]
+ ldr q3, [src2, 32]
+ eor v0.16b, v0.16b, v1.16b
+ eor v1.16b, v2.16b, v3.16b
+ ldr q2, [src1, 48]
+ ldr q3, [src2, 48]
+ umaxp v0.16b, v0.16b, v1.16b
+ ldr q4, [src1, 64]!
+ ldr q5, [src2, 64]!
+ eor v1.16b, v2.16b, v3.16b
+ eor v2.16b, v4.16b, v5.16b
+ umaxp v1.16b, v1.16b, v2.16b
+ umaxp v0.16b, v0.16b, v1.16b
+ umaxp v0.16b, v0.16b, v0.16b
+ fmov tmp, d0
+ ccmp tmp, 0, 0, hi
+ b.eq .Lloop64_
+
+ /* If equal, process last 1-64 bytes using scalar loop. */
+ add limit, limit, 64 + 16
+ cbz tmp, .Llast64
+
+ /* Determine the 8-byte aligned offset of the first difference. */
+#ifdef __AARCH64EB__
+ rev16 tmp, tmp
+#endif
+ rev tmp, tmp
+ clz tmp, tmp
+ bic tmp, tmp, 7
+ sub tmp, tmp, 48
+ ldr data1, [src1, tmp]
+ ldr data2, [src2, tmp]
+#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ mov result, 1
+ cmp data1, data2
+ cneg result, result, lo
+ ret
+
+ .unreq src1
+ .unreq src2
+ .unreq limit
+ .unreq result
+ .unreq data1
+ .unreq data1w
+ .unreq data2
+ .unreq data2w
+ .unreq data3
+ .unreq data3w
+ .unreq data4
+ .unreq data4w
+ .unreq tmp
+ .unreq src1end
+ .unreq src2end
+
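Note: the scalar return path in memcmp.s (.Lreturn) turns a word-sized mismatch into the -1/0/1 result without a byte loop. On little-endian the two 8-byte words are byte-reversed so that the first differing byte becomes the most significant one, and a single unsigned compare then yields the sign via cset/cneg. The following Rust sketch mirrors that trick for illustration; it is not part of the patch.

    // Sketch of the .Lreturn trick (little-endian case): rev + unsigned
    // compare is equivalent to a byte-wise memcmp of two 8-byte chunks.
    fn cmp_chunks_le(data1: u64, data2: u64) -> i32 {
        let (a, b) = (data1.swap_bytes(), data2.swap_bytes()); // rev data1 / rev data2
        match a.cmp(&b) {
            core::cmp::Ordering::Less => -1,   // cneg taken on "lo"
            core::cmp::Ordering::Equal => 0,   // cset leaves 0 when equal
            core::cmp::Ordering::Greater => 1, // cset sets 1 when ne
        }
    }
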
diff --git a/src/memcpy.s b/src/memcpy.s
new file mode 100644
index 0000000..db26904
--- /dev/null
+++ b/src/memcpy.s
@@ -0,0 +1,265 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+ dstin .req x0
+ src .req x1
+ count .req x2
+ dst .req x3
+ srcend .req x4
+ dstend .req x5
+ A_l .req x6
+ A_lw .req w6
+ A_h .req x7
+ B_l .req x8
+ B_lw .req w8
+ B_h .req x9
+ C_l .req x10
+ C_lw .req w10
+ C_h .req x11
+ D_l .req x12
+ D_h .req x13
+ E_l .req x14
+ E_h .req x15
+ F_l .req x16
+ F_h .req x17
+ G_l .req count
+ G_h .req dst
+ H_l .req src
+ H_h .req srcend
+ tmp1 .req x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ .section ".text", "ax", %progbits
+ .global memcpy
+ .global memmove
+memcpy:
+memmove:
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi .Lcopy_long
+ cmp count, 32
+ b.hi .Lcopy32_128
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo .Lcopy16
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+.Lcopy16:
+ tbz count, 3, .Lcopy8
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+.Lcopy8:
+ tbz count, 2, .Lcopy4
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+.Lcopy4:
+ cbz count, .Lcopy0
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+.Lcopy0:
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+.Lcopy32_128:
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi .Lcopy128
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+.Lcopy128:
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls .Lcopy96
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+.Lcopy96:
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+.Lcopy_long:
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, .Lcopy0
+ cmp tmp1, count
+ b.lo .Lcopy_long_backwards
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+ ldp D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls .Lcopy64_from_end
+
+.Lloop64:
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi .Lloop64
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+.Lcopy64_from_end:
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+.Lcopy_long_backwards:
+ ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls .Lcopy64_from_start
+
+.Lloop64_backwards:
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi .Lloop64_backwards
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+.Lcopy64_from_start:
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+ ret
+
+ .unreq dstin
+ .unreq src
+ .unreq count
+ .unreq dst
+ .unreq srcend
+ .unreq dstend
+ .unreq A_l
+ .unreq A_lw
+ .unreq A_h
+ .unreq B_l
+ .unreq B_lw
+ .unreq B_h
+ .unreq C_l
+ .unreq C_lw
+ .unreq C_h
+ .unreq D_l
+ .unreq D_h
+ .unreq E_l
+ .unreq E_h
+ .unreq F_l
+ .unreq F_h
+ .unreq G_l
+ .unreq G_h
+ .unreq H_l
+ .unreq H_h
+ .unreq tmp1
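Note: the comment block in memcpy.s describes the small-copy strategy: each size class issues a fixed pair of accesses anchored at the start and at the end of the region, with all loads performed before any store, so no byte loop is needed. The Rust sketch below illustrates the 8-15 byte case (.Lcopy16); it is an assumption-laden illustration, not part of the patch.

    // Sketch of the .Lcopy16 idea: two unaligned 8-byte accesses, one at the
    // start and one at the end, cover any length from 8 to 16 bytes. Both
    // loads happen before the stores, mirroring the assembly.
    unsafe fn copy_8_to_16(dst: *mut u8, src: *const u8, n: usize) {
        debug_assert!((8..=16).contains(&n));
        let head = (src as *const u64).read_unaligned();
        let tail = (src.add(n - 8) as *const u64).read_unaligned();
        (dst as *mut u64).write_unaligned(head);
        (dst.add(n - 8) as *mut u64).write_unaligned(tail);
    }
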
diff --git a/src/memset.s b/src/memset.s
new file mode 100644
index 0000000..4d99ca9
--- /dev/null
+++ b/src/memset.s
@@ -0,0 +1,120 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+ dstin .req x0
+ val .req x1
+ valw .req w1
+ count .req x2
+ dst .req x3
+ dstend .req x4
+ zva_val .req x5
+
+ .section ".text", "ax", %progbits
+ .globl memset
+memset:
+ dup v0.16B, valw
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi .Lset_long
+ cmp count, 16
+ b.hs .Lset_medium
+ mov val, v0.D[0]
+
+ /* Set 0..15 bytes. */
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+ .p2align 4
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
+
+ /* Set 17..96 bytes. */
+.Lset_medium:
+ str q0, [dstin]
+ tbnz count, 6, .Lset96
+ str q0, [dstend, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+.Lset96:
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 4
+.Lset_long:
+ and valw, valw, 255
+ bic dst, dstin, 15
+ str q0, [dstin]
+ cmp count, 160
+ ccmp valw, 0, 0, hs
+ b.ne .Lno_zva
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne .Lno_zva
+#endif
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+.Lzva_loop:
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi .Lzva_loop
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+.Lno_zva:
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub dst, dst, 16 /* Dst is biased by -32. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+.Lno_zva_loop:
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hi .Lno_zva_loop
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .unreq dstin
+ .unreq val
+ .unreq valw
+ .unreq count
+ .unreq dst
+ .unreq dstend
+ .unreq zva_val
+
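Note: memset.s takes the DC ZVA path only for large all-zero fills, and only after reading DCZID_EL0: bits [3:0] encode log2 of the zeroing block size in 4-byte words, and bit 4 (DZP) marks zeroing as prohibited, which is why the code masks five bits and requires the value 4 (DZP clear, 64-byte block). The sketch below decodes that register value the same way; it is illustrative only and not part of the patch.

    // Sketch: decode DCZID_EL0 as the SKIP_ZVA_CHECK guard above does.
    // zva_val == 4 in the assembly means "DZP clear, 2^4 words = 64 bytes".
    fn zva_block_bytes(dczid_el0: u64) -> Option<usize> {
        let dzp = dczid_el0 & 0x10 != 0;           // bit 4: DC ZVA prohibited
        let log2_words = (dczid_el0 & 0xf) as u32; // bits [3:0]: log2(words)
        if dzp { None } else { Some(4usize << log2_words) } // words are 4 bytes
    }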