-rw-r--r-- | src/lib.rs   |  14
-rw-r--r-- | src/memcmp.s | 201
-rw-r--r-- | src/memcpy.s | 265
-rw-r--r-- | src/memset.s | 120
4 files changed, 592 insertions, 8 deletions
diff --git a/src/lib.rs b/src/lib.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,8 +1,6 @@
-#[cfg(test)]
-mod tests {
-    #[test]
-    fn it_works() {
-        let result = 2 + 2;
-        assert_eq!(result, 4);
-    }
-}
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+#![no_std]
+core::arch::global_asm!(include_str!("memcmp.s"));
+core::arch::global_asm!(include_str!("memcpy.s"));
+core::arch::global_asm!(include_str!("memset.s"));
diff --git a/src/memcmp.s b/src/memcmp.s
new file mode 100644
index 0000000..7b7a972
--- /dev/null
+++ b/src/memcmp.s
@@ -0,0 +1,201 @@
+/* memcmp - compare memory
+ *
+ * Copyright (c) 2013-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ */
+
+	src1	.req x0
+	src2	.req x1
+	limit	.req x2
+	result	.req w0
+	data1	.req x3
+	data1w	.req w3
+	data2	.req x4
+	data2w	.req w4
+	data3	.req x5
+	data3w	.req w5
+	data4	.req x6
+	data4w	.req w6
+	tmp	.req x6
+	src1end	.req x7
+	src2end	.req x8
+
+	.section ".text", "ax", %progbits
+	.globl memcmp
+	.globl bcmp
+memcmp:
+bcmp:
+	cmp	limit, 16
+	b.lo	.Lless16
+	ldp	data1, data3, [src1]
+	ldp	data2, data4, [src2]
+	ccmp	data1, data2, 0, ne
+	ccmp	data3, data4, 0, eq
+	b.ne	.Lreturn2
+
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	cmp	limit, 32
+	b.ls	.Llast_bytes
+	cmp	limit, 160
+	b.hs	.Lloop_align
+	sub	limit, limit, 32
+
+	.p2align 4
+.Lloop32:
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
+	cmp	data1, data2
+	ccmp	data3, data4, 0, eq
+	b.ne	.Lreturn2
+	cmp	limit, 16
+	b.ls	.Llast_bytes
+
+	ldp	data1, data3, [src1, 32]
+	ldp	data2, data4, [src2, 32]
+	cmp	data1, data2
+	ccmp	data3, data4, 0, eq
+	b.ne	.Lreturn2
+	add	src1, src1, 32
+	add	src2, src2, 32
+.Llast64:
+	subs	limit, limit, 32
+	b.hi	.Lloop32
+
+	/* Compare last 1-16 bytes using unaligned access. */
+.Llast_bytes:
+	ldp	data1, data3, [src1end, -16]
+	ldp	data2, data4, [src2end, -16]
+.Lreturn2:
+	cmp	data1, data2
+	csel	data1, data1, data3, ne
+	csel	data2, data2, data4, ne
+
+	/* Compare data bytes and set return value to 0, -1 or 1. */
+.Lreturn:
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	cmp	data1, data2
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+
+	.p2align 4
+.Lless16:
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	tbz	limit, 3, .Lless8
+	ldr	data1, [src1]
+	ldr	data2, [src2]
+	ldr	data3, [src1end, -8]
+	ldr	data4, [src2end, -8]
+	b	.Lreturn2
+
+	.p2align 4
+.Lless8:
+	tbz	limit, 2, .Lless4
+	ldr	data1w, [src1]
+	ldr	data2w, [src2]
+	ldr	data3w, [src1end, -4]
+	ldr	data4w, [src2end, -4]
+	b	.Lreturn2
+
+.Lless4:
+	tbz	limit, 1, .Lless2
+	ldrh	data1w, [src1]
+	ldrh	data2w, [src2]
+	cmp	data1w, data2w
+	b.ne	.Lreturn
+.Lless2:
+	mov	result, 0
+	tbz	limit, 0, .Lreturn_zero
+	ldrb	data1w, [src1end, -1]
+	ldrb	data2w, [src2end, -1]
+	sub	result, data1w, data2w
+.Lreturn_zero:
+	ret
+
+.Lloop_align:
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
+	cmp	data1, data2
+	ccmp	data3, data4, 0, eq
+	b.ne	.Lreturn2
+
+	/* Align src2 and adjust src1, src2 and limit. */
+	and	tmp, src2, 15
+	sub	tmp, tmp, 16
+	sub	src2, src2, tmp
+	add	limit, limit, tmp
+	sub	src1, src1, tmp
+	sub	limit, limit, 64 + 16
+
+	.p2align 4
+.Lloop64_:
+	ldr	q0, [src1, 16]
+	ldr	q1, [src2, 16]
+	subs	limit, limit, 64
+	ldr	q2, [src1, 32]
+	ldr	q3, [src2, 32]
+	eor	v0.16b, v0.16b, v1.16b
+	eor	v1.16b, v2.16b, v3.16b
+	ldr	q2, [src1, 48]
+	ldr	q3, [src2, 48]
+	umaxp	v0.16b, v0.16b, v1.16b
+	ldr	q4, [src1, 64]!
+	ldr	q5, [src2, 64]!
+	eor	v1.16b, v2.16b, v3.16b
+	eor	v2.16b, v4.16b, v5.16b
+	umaxp	v1.16b, v1.16b, v2.16b
+	umaxp	v0.16b, v0.16b, v1.16b
+	umaxp	v0.16b, v0.16b, v0.16b
+	fmov	tmp, d0
+	ccmp	tmp, 0, 0, hi
+	b.eq	.Lloop64_
+
+	/* If equal, process last 1-64 bytes using scalar loop. */
+	add	limit, limit, 64 + 16
+	cbz	tmp, .Llast64
+
+	/* Determine the 8-byte aligned offset of the first difference. */
+#ifdef __AARCH64EB__
+	rev16	tmp, tmp
+#endif
+	rev	tmp, tmp
+	clz	tmp, tmp
+	bic	tmp, tmp, 7
+	sub	tmp, tmp, 48
+	ldr	data1, [src1, tmp]
+	ldr	data2, [src2, tmp]
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	mov	result, 1
+	cmp	data1, data2
+	cneg	result, result, lo
+	ret
+
+	.unreq src1
+	.unreq src2
+	.unreq limit
+	.unreq result
+	.unreq data1
+	.unreq data1w
+	.unreq data2
+	.unreq data2w
+	.unreq data3
+	.unreq data3w
+	.unreq data4
+	.unreq data4w
+	.unreq tmp
+	.unreq src1end
+	.unreq src2end
+
diff --git a/src/memcpy.s b/src/memcpy.s
new file mode 100644
index 0000000..db26904
--- /dev/null
+++ b/src/memcpy.s
@@ -0,0 +1,265 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+	dstin	.req x0
+	src	.req x1
+	count	.req x2
+	dst	.req x3
+	srcend	.req x4
+	dstend	.req x5
+	A_l	.req x6
+	A_lw	.req w6
+	A_h	.req x7
+	B_l	.req x8
+	B_lw	.req w8
+	B_h	.req x9
+	C_l	.req x10
+	C_lw	.req w10
+	C_h	.req x11
+	D_l	.req x12
+	D_h	.req x13
+	E_l	.req x14
+	E_h	.req x15
+	F_l	.req x16
+	F_h	.req x17
+	G_l	.req count
+	G_h	.req dst
+	H_l	.req src
+	H_h	.req srcend
+	tmp1	.req x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point. It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies. The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+	.section ".text", "ax", %progbits
+	.global memcpy
+	.global memmove
+memcpy:
+memmove:
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	.Lcopy_long
+	cmp	count, 32
+	b.hi	.Lcopy32_128
+
+	/* Small copies: 0..32 bytes. */
+	cmp	count, 16
+	b.lo	.Lcopy16
+	ldp	A_l, A_h, [src]
+	ldp	D_l, D_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	/* Copy 8-15 bytes. */
+.Lcopy16:
+	tbz	count, 3, .Lcopy8
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+
+	.p2align 3
+	/* Copy 4-7 bytes. */
+.Lcopy8:
+	tbz	count, 2, .Lcopy4
+	ldr	A_lw, [src]
+	ldr	B_lw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	B_lw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes using a branchless sequence. */
+.Lcopy4:
+	cbz	count, .Lcopy0
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	C_lw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	C_lw, [dstend, -1]
+.Lcopy0:
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes. */
+.Lcopy32_128:
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	cmp	count, 64
+	b.hi	.Lcopy128
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy 65..128 bytes. */
+.Lcopy128:
+	ldp	E_l, E_h, [src, 32]
+	ldp	F_l, F_h, [src, 48]
+	cmp	count, 96
+	b.ls	.Lcopy96
+	ldp	G_l, G_h, [srcend, -64]
+	ldp	H_l, H_h, [srcend, -48]
+	stp	G_l, G_h, [dstend, -64]
+	stp	H_l, H_h, [dstend, -48]
+.Lcopy96:
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	E_l, E_h, [dstin, 32]
+	stp	F_l, F_h, [dstin, 48]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy more than 128 bytes. */
+.Lcopy_long:
+	/* Use backwards copy if there is an overlap. */
+	sub	tmp1, dstin, src
+	cbz	tmp1, .Lcopy0
+	cmp	tmp1, count
+	b.lo	.Lcopy_long_backwards
+
+	/* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+	ldp	D_l, D_h, [src]
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large. */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count. */
+	b.ls	.Lcopy64_from_end
+
+.Lloop64:
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	.Lloop64
+
+	/* Write the last iteration and copy 64 bytes from the end. */
+.Lcopy64_from_end:
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 4
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment. */
+.Lcopy_long_backwards:
+	ldp	D_l, D_h, [srcend, -16]
+	and	tmp1, dstend, 15
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	.Lcopy64_from_start
+
+.Lloop64_backwards:
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	.Lloop64_backwards
+
+	/* Write the last iteration and copy 64 bytes from the start. */
+.Lcopy64_from_start:
+	ldp	G_l, G_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	G_l, G_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+	ret
+
+	.unreq dstin
+	.unreq src
+	.unreq count
+	.unreq dst
+	.unreq srcend
+	.unreq dstend
+	.unreq A_l
+	.unreq A_lw
+	.unreq A_h
+	.unreq B_l
+	.unreq B_lw
+	.unreq B_h
+	.unreq C_l
+	.unreq C_lw
+	.unreq C_h
+	.unreq D_l
+	.unreq D_h
+	.unreq E_l
+	.unreq E_h
+	.unreq F_l
+	.unreq F_h
+	.unreq G_l
+	.unreq G_h
+	.unreq H_l
+	.unreq H_h
+	.unreq tmp1
diff --git a/src/memset.s b/src/memset.s
new file mode 100644
index 0000000..4d99ca9
--- /dev/null
+++ b/src/memset.s
@@ -0,0 +1,120 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+	dstin	.req x0
+	val	.req x1
+	valw	.req w1
+	count	.req x2
+	dst	.req x3
+	dstend	.req x4
+	zva_val	.req x5
+
+	.section ".text", "ax", %progbits
+	.globl memset
+memset:
+	dup	v0.16B, valw
+	add	dstend, dstin, count
+
+	cmp	count, 96
+	b.hi	.Lset_long
+	cmp	count, 16
+	b.hs	.Lset_medium
+	mov	val, v0.D[0]
+
+	/* Set 0..15 bytes. */
+	tbz	count, 3, 1f
+	str	val, [dstin]
+	str	val, [dstend, -8]
+	ret
+	.p2align 4
+1:	tbz	count, 2, 2f
+	str	valw, [dstin]
+	str	valw, [dstend, -4]
+	ret
+2:	cbz	count, 3f
+	strb	valw, [dstin]
+	tbz	count, 1, 3f
+	strh	valw, [dstend, -2]
+3:	ret
+
+	/* Set 17..96 bytes. */
+.Lset_medium:
+	str	q0, [dstin]
+	tbnz	count, 6, .Lset96
+	str	q0, [dstend, -16]
+	tbz	count, 5, 1f
+	str	q0, [dstin, 16]
+	str	q0, [dstend, -32]
+1:	ret
+
+	.p2align 4
+	/* Set 64..96 bytes. Write 64 bytes from the start and
+	   32 bytes from the end. */
+.Lset96:
+	str	q0, [dstin, 16]
+	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.p2align 4
+.Lset_long:
+	and	valw, valw, 255
+	bic	dst, dstin, 15
+	str	q0, [dstin]
+	cmp	count, 160
+	ccmp	valw, 0, 0, hs
+	b.ne	.Lno_zva
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31
+	cmp	zva_val, 4	/* ZVA size is 64 bytes. */
+	b.ne	.Lno_zva
+#endif
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	sub	count, dstend, dst	/* Count is now 64 too large. */
+	sub	count, count, 128	/* Adjust count and bias for loop. */
+
+	.p2align 4
+.Lzva_loop:
+	add	dst, dst, 64
+	dc	zva, dst
+	subs	count, count, 64
+	b.hi	.Lzva_loop
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+.Lno_zva:
+	sub	count, dstend, dst	/* Count is 16 too large. */
+	sub	dst, dst, 16		/* Dst is biased by -32. */
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop. */
+.Lno_zva_loop:
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.hi	.Lno_zva_loop
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.unreq dstin
+	.unreq val
+	.unreq valw
+	.unreq count
+	.unreq dst
+	.unreq dstend
+	.unreq zva_val
+
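
The new `src/lib.rs` does nothing beyond splicing the three assembly files into the crate with `global_asm!`, so the `memcmp`/`bcmp`, `memcpy`/`memmove`, and `memset` symbols become available to anything that links the crate. As a rough illustration (not part of this change), a dependent `no_std` crate could reach them through ordinary `extern "C"` declarations; the `exercise_mem_routines` function and buffer sizes below are hypothetical.

```rust
// Illustrative caller only: the extern declarations mirror the C library
// signatures these routines implement; nothing here is added by the commit.
use core::ffi::c_void;

extern "C" {
    fn memset(s: *mut c_void, c: i32, n: usize) -> *mut c_void;
    fn memcpy(dst: *mut c_void, src: *const c_void, n: usize) -> *mut c_void;
    fn memcmp(s1: *const c_void, s2: *const c_void, n: usize) -> i32;
}

fn exercise_mem_routines() {
    let mut a = [0u8; 64];
    let mut b = [0u8; 64];
    unsafe {
        // Fill `a` with 0xAB, copy it into `b`, then expect the compare to be 0.
        memset(a.as_mut_ptr().cast(), 0xAB, a.len());
        memcpy(b.as_mut_ptr().cast(), a.as_ptr().cast(), b.len());
        assert_eq!(memcmp(a.as_ptr().cast(), b.as_ptr().cast(), a.len()), 0);
    }
}
```

In practice a crate like this is presumably linked for the side effect of defining these symbols, since the compiler emits calls to `memcpy`, `memset`, and `memcmp` on its own for copies, initializations, and comparisons; explicit calls as above are mainly useful for testing.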