-rw-r--r-- | src/lib.rs   |  14
-rw-r--r-- | src/memcmp.s | 201
-rw-r--r-- | src/memcpy.s | 265
-rw-r--r-- | src/memset.s | 120
4 files changed, 592 insertions, 8 deletions
diff --git a/src/lib.rs b/src/lib.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,8 +1,6 @@
-#[cfg(test)]
-mod tests {
-    #[test]
-    fn it_works() {
-        let result = 2 + 2;
-        assert_eq!(result, 4);
-    }
-}
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+#![no_std]
+core::arch::global_asm!(include_str!("memcmp.s"));
+core::arch::global_asm!(include_str!("memcpy.s"));
+core::arch::global_asm!(include_str!("memset.s"));
diff --git a/src/memcmp.s b/src/memcmp.s
new file mode 100644
index 0000000..7b7a972
--- /dev/null
+++ b/src/memcmp.s
@@ -0,0 +1,201 @@
+/* memcmp - compare memory
+ *
+ * Copyright (c) 2013-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ */
+
+	src1	.req x0
+	src2	.req x1
+	limit	.req x2
+	result	.req w0
+	data1	.req x3
+	data1w	.req w3
+	data2	.req x4
+	data2w	.req w4
+	data3	.req x5
+	data3w	.req w5
+	data4	.req x6
+	data4w	.req w6
+	tmp	.req x6
+	src1end	.req x7
+	src2end	.req x8
+
+	.section ".text", "ax", %progbits
+	.globl memcmp
+	.globl bcmp
+memcmp:
+bcmp:
+	cmp	limit, 16
+	b.lo	.Lless16
+	ldp	data1, data3, [src1]
+	ldp	data2, data4, [src2]
+	ccmp	data1, data2, 0, ne
+	ccmp	data3, data4, 0, eq
+	b.ne	.Lreturn2
+
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	cmp	limit, 32
+	b.ls	.Llast_bytes
+	cmp	limit, 160
+	b.hs	.Lloop_align
+	sub	limit, limit, 32
+
+	.p2align 4
+.Lloop32:
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
+	cmp	data1, data2
+	ccmp	data3, data4, 0, eq
+	b.ne	.Lreturn2
+	cmp	limit, 16
+	b.ls	.Llast_bytes
+
+	ldp	data1, data3, [src1, 32]
+	ldp	data2, data4, [src2, 32]
+	cmp	data1, data2
+	ccmp	data3, data4, 0, eq
+	b.ne	.Lreturn2
+	add	src1, src1, 32
+	add	src2, src2, 32
+.Llast64:
+	subs	limit, limit, 32
+	b.hi	.Lloop32
+
+	/* Compare last 1-16 bytes using unaligned access. */
+.Llast_bytes:
+	ldp	data1, data3, [src1end, -16]
+	ldp	data2, data4, [src2end, -16]
+.Lreturn2:
+	cmp	data1, data2
+	csel	data1, data1, data3, ne
+	csel	data2, data2, data4, ne
+
+	/* Compare data bytes and set return value to 0, -1 or 1. */
+.Lreturn:
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	cmp	data1, data2
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+
+	.p2align 4
+.Lless16:
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	tbz	limit, 3, .Lless8
+	ldr	data1, [src1]
+	ldr	data2, [src2]
+	ldr	data3, [src1end, -8]
+	ldr	data4, [src2end, -8]
+	b	.Lreturn2
+
+	.p2align 4
+.Lless8:
+	tbz	limit, 2, .Lless4
+	ldr	data1w, [src1]
+	ldr	data2w, [src2]
+	ldr	data3w, [src1end, -4]
+	ldr	data4w, [src2end, -4]
+	b	.Lreturn2
+
+.Lless4:
+	tbz	limit, 1, .Lless2
+	ldrh	data1w, [src1]
+	ldrh	data2w, [src2]
+	cmp	data1w, data2w
+	b.ne	.Lreturn
+.Lless2:
+	mov	result, 0
+	tbz	limit, 0, .Lreturn_zero
+	ldrb	data1w, [src1end, -1]
+	ldrb	data2w, [src2end, -1]
+	sub	result, data1w, data2w
+.Lreturn_zero:
+	ret
+
+.Lloop_align:
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
+	cmp	data1, data2
+	ccmp	data3, data4, 0, eq
+	b.ne	.Lreturn2
+
+	/* Align src2 and adjust src1, src2 and limit. */
+	and	tmp, src2, 15
+	sub	tmp, tmp, 16
+	sub	src2, src2, tmp
+	add	limit, limit, tmp
+	sub	src1, src1, tmp
+	sub	limit, limit, 64 + 16
+
+	.p2align 4
+.Lloop64_:
+	ldr	q0, [src1, 16]
+	ldr	q1, [src2, 16]
+	subs	limit, limit, 64
+	ldr	q2, [src1, 32]
+	ldr	q3, [src2, 32]
+	eor	v0.16b, v0.16b, v1.16b
+	eor	v1.16b, v2.16b, v3.16b
+	ldr	q2, [src1, 48]
+	ldr	q3, [src2, 48]
+	umaxp	v0.16b, v0.16b, v1.16b
+	ldr	q4, [src1, 64]!
+	ldr	q5, [src2, 64]!
+	eor	v1.16b, v2.16b, v3.16b
+	eor	v2.16b, v4.16b, v5.16b
+	umaxp	v1.16b, v1.16b, v2.16b
+	umaxp	v0.16b, v0.16b, v1.16b
+	umaxp	v0.16b, v0.16b, v0.16b
+	fmov	tmp, d0
+	ccmp	tmp, 0, 0, hi
+	b.eq	.Lloop64_
+
+	/* If equal, process last 1-64 bytes using scalar loop. */
+	add	limit, limit, 64 + 16
+	cbz	tmp, .Llast64
+
+	/* Determine the 8-byte aligned offset of the first difference. */
+#ifdef __AARCH64EB__
+	rev16	tmp, tmp
+#endif
+	rev	tmp, tmp
+	clz	tmp, tmp
+	bic	tmp, tmp, 7
+	sub	tmp, tmp, 48
+	ldr	data1, [src1, tmp]
+	ldr	data2, [src2, tmp]
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	mov	result, 1
+	cmp	data1, data2
+	cneg	result, result, lo
+	ret
+
+	.unreq src1
+	.unreq src2
+	.unreq limit
+	.unreq result
+	.unreq data1
+	.unreq data1w
+	.unreq data2
+	.unreq data2w
+	.unreq data3
+	.unreq data3w
+	.unreq data4
+	.unreq data4w
+	.unreq tmp
+	.unreq src1end
+	.unreq src2end
+
diff --git a/src/memcpy.s b/src/memcpy.s
new file mode 100644
index 0000000..db26904
--- /dev/null
+++ b/src/memcpy.s
@@ -0,0 +1,265 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+	dstin	.req x0
+	src	.req x1
+	count	.req x2
+	dst	.req x3
+	srcend	.req x4
+	dstend	.req x5
+	A_l	.req x6
+	A_lw	.req w6
+	A_h	.req x7
+	B_l	.req x8
+	B_lw	.req w8
+	B_h	.req x9
+	C_l	.req x10
+	C_lw	.req w10
+	C_h	.req x11
+	D_l	.req x12
+	D_h	.req x13
+	E_l	.req x14
+	E_h	.req x15
+	F_l	.req x16
+	F_h	.req x17
+	G_l	.req count
+	G_h	.req dst
+	H_l	.req src
+	H_h	.req srcend
+	tmp1	.req x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point. It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies. The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+	.section ".text", "ax", %progbits
+	.global memcpy
+	.global memmove
+memcpy:
+memmove:
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	.Lcopy_long
+	cmp	count, 32
+	b.hi	.Lcopy32_128
+
+	/* Small copies: 0..32 bytes. */
+	cmp	count, 16
+	b.lo	.Lcopy16
+	ldp	A_l, A_h, [src]
+	ldp	D_l, D_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	/* Copy 8-15 bytes. */
+.Lcopy16:
+	tbz	count, 3, .Lcopy8
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+
+	.p2align 3
+	/* Copy 4-7 bytes. */
+.Lcopy8:
+	tbz	count, 2, .Lcopy4
+	ldr	A_lw, [src]
+	ldr	B_lw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	B_lw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes using a branchless sequence. */
+.Lcopy4:
+	cbz	count, .Lcopy0
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	C_lw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	C_lw, [dstend, -1]
+.Lcopy0:
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes. */
+.Lcopy32_128:
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	cmp	count, 64
+	b.hi	.Lcopy128
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy 65..128 bytes. */
+.Lcopy128:
+	ldp	E_l, E_h, [src, 32]
+	ldp	F_l, F_h, [src, 48]
+	cmp	count, 96
+	b.ls	.Lcopy96
+	ldp	G_l, G_h, [srcend, -64]
+	ldp	H_l, H_h, [srcend, -48]
+	stp	G_l, G_h, [dstend, -64]
+	stp	H_l, H_h, [dstend, -48]
+.Lcopy96:
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	E_l, E_h, [dstin, 32]
+	stp	F_l, F_h, [dstin, 48]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy more than 128 bytes. */
+.Lcopy_long:
+	/* Use backwards copy if there is an overlap. */
+	sub	tmp1, dstin, src
+	cbz	tmp1, .Lcopy0
+	cmp	tmp1, count
+	b.lo	.Lcopy_long_backwards
+
+	/* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+	ldp	D_l, D_h, [src]
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large. */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count. */
+	b.ls	.Lcopy64_from_end
+
+.Lloop64:
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	.Lloop64
+
+	/* Write the last iteration and copy 64 bytes from the end. */
+.Lcopy64_from_end:
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 4
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment. */
+.Lcopy_long_backwards:
+	ldp	D_l, D_h, [srcend, -16]
+	and	tmp1, dstend, 15
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	.Lcopy64_from_start
+
+.Lloop64_backwards:
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	.Lloop64_backwards
+
+	/* Write the last iteration and copy 64 bytes from the start. */
+.Lcopy64_from_start:
+	ldp	G_l, G_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	G_l, G_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+	ret
+
+	.unreq dstin
+	.unreq src
+	.unreq count
+	.unreq dst
+	.unreq srcend
+	.unreq dstend
+	.unreq A_l
+	.unreq A_lw
+	.unreq A_h
+	.unreq B_l
+	.unreq B_lw
+	.unreq B_h
+	.unreq C_l
+	.unreq C_lw
+	.unreq C_h
+	.unreq D_l
+	.unreq D_h
+	.unreq E_l
+	.unreq E_h
+	.unreq F_l
+	.unreq F_h
+	.unreq G_l
+	.unreq G_h
+	.unreq H_l
+	.unreq H_h
+	.unreq tmp1
diff --git a/src/memset.s b/src/memset.s
new file mode 100644
index 0000000..4d99ca9
--- /dev/null
+++ b/src/memset.s
@@ -0,0 +1,120 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+	dstin	.req x0
+	val	.req x1
+	valw	.req w1
+	count	.req x2
+	dst	.req x3
+	dstend	.req x4
+	zva_val	.req x5
+
+	.section ".text", "ax", %progbits
+	.globl memset
+memset:
+	dup	v0.16B, valw
+	add	dstend, dstin, count
+
+	cmp	count, 96
+	b.hi	.Lset_long
+	cmp	count, 16
+	b.hs	.Lset_medium
+	mov	val, v0.D[0]
+
+	/* Set 0..15 bytes. */
+	tbz	count, 3, 1f
+	str	val, [dstin]
+	str	val, [dstend, -8]
+	ret
+	.p2align 4
+1:	tbz	count, 2, 2f
+	str	valw, [dstin]
+	str	valw, [dstend, -4]
+	ret
+2:	cbz	count, 3f
+	strb	valw, [dstin]
+	tbz	count, 1, 3f
+	strh	valw, [dstend, -2]
+3:	ret
+
+	/* Set 17..96 bytes. */
+.Lset_medium:
+	str	q0, [dstin]
+	tbnz	count, 6, .Lset96
+	str	q0, [dstend, -16]
+	tbz	count, 5, 1f
+	str	q0, [dstin, 16]
+	str	q0, [dstend, -32]
+1:	ret
+
+	.p2align 4
+	/* Set 64..96 bytes. Write 64 bytes from the start and
+	   32 bytes from the end. */
+.Lset96:
+	str	q0, [dstin, 16]
+	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.p2align 4
+.Lset_long:
+	and	valw, valw, 255
+	bic	dst, dstin, 15
+	str	q0, [dstin]
+	cmp	count, 160
+	ccmp	valw, 0, 0, hs
+	b.ne	.Lno_zva
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31
+	cmp	zva_val, 4	/* ZVA size is 64 bytes. */
+	b.ne	.Lno_zva
+#endif
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	sub	count, dstend, dst	/* Count is now 64 too large. */
+	sub	count, count, 128	/* Adjust count and bias for loop. */
+
+	.p2align 4
+.Lzva_loop:
+	add	dst, dst, 64
+	dc	zva, dst
+	subs	count, count, 64
+	b.hi	.Lzva_loop
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+.Lno_zva:
+	sub	count, dstend, dst	/* Count is 16 too large. */
+	sub	dst, dst, 16		/* Dst is biased by -32. */
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop. */
+.Lno_zva_loop:
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.hi	.Lno_zva_loop
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.unreq dstin
+	.unreq val
+	.unreq valw
+	.unreq count
+	.unreq dst
+	.unreq dstend
+	.unreq zva_val
+
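
The new `src/lib.rs` does nothing beyond splicing the three assembly files into the crate with `global_asm!`, so the `memcmp`/`bcmp`, `memcpy`/`memmove`, and `memset` symbols become available to anything that links the crate. As a rough illustration (not part of this change), a dependent `no_std` crate could reach them through ordinary `extern "C"` declarations; the `exercise_mem_routines` function and buffer sizes below are hypothetical.

```rust
// Illustrative caller only: the extern declarations mirror the C library
// signatures these routines implement; nothing here is added by the commit.
use core::ffi::c_void;

extern "C" {
    fn memset(s: *mut c_void, c: i32, n: usize) -> *mut c_void;
    fn memcpy(dst: *mut c_void, src: *const c_void, n: usize) -> *mut c_void;
    fn memcmp(s1: *const c_void, s2: *const c_void, n: usize) -> i32;
}

fn exercise_mem_routines() {
    let mut a = [0u8; 64];
    let mut b = [0u8; 64];
    unsafe {
        // Fill `a` with 0xAB, copy it into `b`, then expect the compare to be 0.
        memset(a.as_mut_ptr().cast(), 0xAB, a.len());
        memcpy(b.as_mut_ptr().cast(), a.as_ptr().cast(), b.len());
        assert_eq!(memcmp(a.as_ptr().cast(), b.as_ptr().cast(), a.len()), 0);
    }
}
```

In practice a crate like this is presumably linked for the side effect of defining these symbols, since the compiler emits calls to `memcpy`, `memset`, and `memcmp` on its own for copies, initializations, and comparisons; explicit calls as above are mainly useful for testing.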