/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector AES block cipher extension ('Zvkned')

#include <linux/linkage.h>

.text
.option arch, +zvkned

#include "aes-macros.S"

// Names for the argument registers a0-a4.  They correspond, in order, to the
// parameters of the C prototypes given in the comments above the functions in
// this file:
//
//	KEYP	|key|, pointer to the struct crypto_aes_ctx
//	INP	|in|, pointer to the input data
//	OUTP	|out|, pointer to the output data
//	LEN	|len|, length of the data in bytes (not used by the single-block
//		functions, whose prototypes have no |len|)
//	IVP	|iv|, pointer to the 16-byte IV (only the CBC and CBC-CTS
//		functions take one)
#define KEYP		a0
#define INP		a1
#define OUTP		a2
#define LEN		a3
#define IVP		a4

// Single-block body for one key length: load the block at INP into v16, pass
// it through aes_crypt, store the result to OUTP, and return to the caller.
//
// \enc and \keylen are forwarded to aes_crypt unchanged.  The encryption entry
// point in this file passes \enc = 1 and the decryption entry point passes 0.
//
// NOTE(review): this macro doesn't configure the vector unit itself, so the
// amount of data moved by the load and store depends on the configuration in
// effect on entry -- presumably established by aes_begin in aes-macros.S;
// confirm there.
.macro	__aes_crypt_zvkned	enc, keylen
	vle32.v		v16, (INP)
	aes_crypt	v16, \enc, \keylen
	vse32.v		v16, (OUTP)
	ret
.endm

// Complete body of a single-block function.  \enc is handed down to
// __aes_crypt_zvkned.
//
// aes_begin receives KEYP plus the forward labels 128f and 192f.  The code that
// immediately follows it is the AES-256 variant; the AES-128 and AES-192
// variants sit at labels 128 and 192.  Each variant ends in 'ret', so none of
// them falls through into the next.
//
// NOTE(review): aes_begin is defined in aes-macros.S; presumably it inspects
// the key length and branches to the matching label -- confirm there.
.macro	aes_crypt_zvkned	enc
	aes_begin	KEYP, 128f, 192f
	__aes_crypt_zvkned	\enc, 256
128:
	__aes_crypt_zvkned	\enc, 128
192:
	__aes_crypt_zvkned	\enc, 192
.endm

// void aes_encrypt_zvkned(const struct crypto_aes_ctx *key,
//			   const u8 in[16], u8 out[16]);
//
// Encrypts a single block: reads it from |in| (a1) and writes the result to
// |out| (a2), using the key at |key| (a0).  The body is the aes_crypt_zvkned
// macro with its \enc argument set to 1.
SYM_FUNC_START(aes_encrypt_zvkned)
	aes_crypt_zvkned	1
SYM_FUNC_END(aes_encrypt_zvkned)

// void aes_decrypt_zvkned(const struct crypto_aes_ctx *key,
//			   const u8 in[16], u8 out[16]);
//
// Single-block decryption.  Same prototype and calling convention as the
// encryption function; the only difference is that aes_crypt_zvkned is expanded
// with \enc = 0 instead of 1.
SYM_FUNC_START(aes_decrypt_zvkned)
	aes_crypt_zvkned	0
SYM_FUNC_END(aes_decrypt_zvkned)

// ECB loop for one key length.  \enc and \keylen go straight to aes_crypt.
//
// The message is worked through in batches: each round asks vsetvli for as many
// 32-bit words as it will grant for an LMUL=8 group, runs that batch through
// aes_crypt in v16, and then steps the pointers past it.
//
// t0 counts the words still to be processed and t1 holds the size of the
// current batch (first in words, then in bytes).  INP and OUTP are advanced;
// LEN is only read.
.macro	__aes_ecb_crypt	enc, keylen
	srli		t0, LEN, 2		// Bytes to words; a multiple of 4
.Lecb_batch\@:
	vsetvli		t1, t0, e32, m8, ta, ma	// t1 = words in this batch
	vle32.v		v16, (INP)
	aes_crypt	v16, \enc, \keylen
	vse32.v		v16, (OUTP)
	sub		t0, t0, t1		// That many fewer words remain
	slli		t1, t1, 2		// Batch size in bytes
	add		INP, INP, t1
	add		OUTP, OUTP, t1
	bnez		t0, .Lecb_batch\@	// Go again while data is left

	ret
.endm

// Complete body of an ECB function.  \enc is handed down to __aes_ecb_crypt.
//
// aes_begin receives KEYP plus the forward labels 128f and 192f.  The code that
// immediately follows it is the AES-256 loop; the AES-128 and AES-192 loops sit
// at labels 128 and 192.  Each expansion of __aes_ecb_crypt ends in 'ret', so
// none of them falls through into the next.
//
// NOTE(review): aes_begin is defined in aes-macros.S; presumably it inspects
// the key length and branches to the matching label -- confirm there.
.macro	aes_ecb_crypt	enc
	aes_begin	KEYP, 128f, 192f
	__aes_ecb_crypt	\enc, 256
128:
	__aes_ecb_crypt	\enc, 128
192:
	__aes_ecb_crypt	\enc, 192
.endm

// void aes_ecb_encrypt_zvkned(const struct crypto_aes_ctx *key,
//			       const u8 *in, u8 *out, size_t len);
//
// AES-ECB encryption of |len| bytes from |in| to |out|.
//
// |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).  The loop in
// __aes_ecb_crypt tests for completion only after processing a batch, which is
// why a zero length is not supported.
SYM_FUNC_START(aes_ecb_encrypt_zvkned)
	aes_ecb_crypt	1
SYM_FUNC_END(aes_ecb_encrypt_zvkned)

// void aes_ecb_decrypt_zvkned(const struct crypto_aes_ctx *key,
//			       const u8 *in, u8 *out, size_t len);
//
// AES-ECB decryption.  Same prototype, calling convention, and |len|
// requirements as the encryption function; the only difference is that
// aes_ecb_crypt is expanded with \enc = 0 instead of 1.
SYM_FUNC_START(aes_ecb_decrypt_zvkned)
	aes_ecb_crypt	0
SYM_FUNC_END(aes_ecb_decrypt_zvkned)

// CBC encryption loop for one key length.
//
// v16 carries the chaining value: it starts out as the IV and after every round
// holds the ciphertext block that was just produced.  Blocks are handled
// strictly one at a time, since each one depends on the ciphertext before it.
//
// INP, OUTP and LEN are consumed as the loop advances (16 bytes per round), and
// the final chaining value is written back through IVP before returning.
.macro	aes_cbc_encrypt	keylen
	vle32.v		v16, (IVP)		// Chaining value = IV
.Lcbc_encrypt_block\@:
	vle32.v		v17, (INP)		// Fetch the next plaintext block
	addi		INP, INP, 16
	addi		LEN, LEN, -16		// One block fewer to go
	vxor.vv		v16, v16, v17		// Mix in the chaining value
	aes_encrypt	v16, \keylen		// v16 = new ciphertext block
	vse32.v		v16, (OUTP)		// Write it out
	addi		OUTP, OUTP, 16
	bnez		LEN, .Lcbc_encrypt_block\@

	vse32.v		v16, (IVP)		// Last ciphertext block is the next IV
	ret
.endm

// CBC decryption loop for one key length.
//
// Unlike encryption, several blocks can be decrypted at once.  Each round takes
// a batch of ciphertext into the v20 group and builds, in the v16 group, the
// matching "previous ciphertext" vector: the carried-in block (IV or last block
// of the preceding batch) in elements 0..3, followed by the batch shifted up by
// one block.  The batch's own last block is parked in v24 so that it can become
// the carried-in block of the next round.
//
// LEN is converted to a count of 32-bit words and counted down to zero.  t0 is
// the batch size in words; t1 is scratch.  INP and OUTP are advanced, and the
// last ciphertext block is written back through IVP before returning.
.macro	aes_cbc_decrypt	keylen
	vle32.v		v16, (IVP)		// Carried-in block = IV
	srli		LEN, LEN, 2		// Bytes to 32-bit words
.Lcbc_decrypt_batch\@:
	vsetvli		t0, LEN, e32, m4, ta, ma // t0 = words in this batch
	sub		LEN, LEN, t0		// Account for them right away
	addi		t1, t0, -4		// Word index of the batch's last block
	vle32.v		v20, (INP)		// Fetch the ciphertext batch
	vslidedown.vx	v24, v20, t1		// Park its last block in v24
	vslideup.vi	v16, v20, 4		// v16 = carried-in block, then batch
	aes_decrypt	v20, \keylen		// Raw AES decryption of the batch
	vxor.vv		v20, v20, v16		// Undo the chaining
	vse32.v		v20, (OUTP)		// Write the plaintext batch
	vmv.v.v		v16, v24		// Parked block carries into next round
	slli		t1, t0, 2		// Batch size in bytes
	add		INP, INP, t1
	add		OUTP, OUTP, t1
	bnez		LEN, .Lcbc_decrypt_batch\@

	vsetivli	zero, 4, e32, m1, ta, ma // Back to a single block
	vse32.v		v16, (IVP)		// Last ciphertext block is the next IV
	ret
.endm

// void aes_cbc_encrypt_zvkned(const struct crypto_aes_ctx *key,
//			       const u8 *in, u8 *out, size_t len, u8 iv[16]);
//
// AES-CBC encryption of |len| bytes from |in| to |out|.  |iv| is read as the
// initial chaining value and, on return, overwritten with the last ciphertext
// block.
//
// |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).
//
// The AES-256 loop follows aes_begin directly; the AES-128 and AES-192 loops
// are at labels 128 and 192.  Each expansion of aes_cbc_encrypt ends in 'ret'.
SYM_FUNC_START(aes_cbc_encrypt_zvkned)
	aes_begin	KEYP, 128f, 192f
	aes_cbc_encrypt	256
128:
	aes_cbc_encrypt	128
192:
	aes_cbc_encrypt	192
SYM_FUNC_END(aes_cbc_encrypt_zvkned)

// void aes_cbc_decrypt_zvkned(const struct crypto_aes_ctx *key,
//			       const u8 *in, u8 *out, size_t len, u8 iv[16]);
//
// AES-CBC decryption.  Same prototype and calling convention as the encryption
// function.  |iv| is read as the initial chaining value and, on return,
// overwritten with the last ciphertext block of the input.
//
// The AES-256 loop follows aes_begin directly; the AES-128 and AES-192 loops
// are at labels 128 and 192.  Each expansion of aes_cbc_decrypt ends in 'ret'.
SYM_FUNC_START(aes_cbc_decrypt_zvkned)
	aes_begin	KEYP, 128f, 192f
	aes_cbc_decrypt	256
128:
	aes_cbc_decrypt	128
192:
	aes_cbc_decrypt	192
SYM_FUNC_END(aes_cbc_decrypt_zvkned)

// CBC-CTS encryption body for one key length.
//
// On entry v16 must already hold the IV (aes_cbc_cts_crypt loads it before
// expanding this macro).  INP and OUTP point to the start of the message and of
// the output buffer, and LEN is the message length in bytes.
//
// t0 and v17 are used as scratch; INP, OUTP and LEN are modified.  Nothing is
// written back through IVP.  The macro ends in 'ret'.
//
// NOTE(review): the code appears to require LEN >= 16, since the first block is
// always processed as a full block; confirm that callers guarantee this.
.macro	aes_cbc_cts_encrypt	keylen

	// CBC-encrypt all blocks except the last.  But don't store the
	// second-to-last block to the output buffer yet, since it will be
	// handled specially in the ciphertext stealing step.  Exception: if the
	// message is single-block, still encrypt the last (and only) block.
	//
	// The loop is entered at label 2, so the store at label 1 only happens
	// for blocks that are followed by at least two more blocks.
	li		t0, 16
	j		2f
1:
	vse32.v		v16, (OUTP)	// Store ciphertext block
	addi		OUTP, OUTP, 16
2:
	vle32.v		v17, (INP)	// Load plaintext block
	vxor.vv		v16, v16, v17	// XOR with IV or prev ciphertext block
	aes_encrypt	v16, \keylen	// Encrypt
	addi		INP, INP, 16
	addi		LEN, LEN, -16
	bgt		LEN, t0, 1b	// Repeat if more than one block remains

	// Special case: if the message is a single block, just do CBC.
	beqz		LEN, .Lcts_encrypt_done\@

	// Encrypt the last two blocks using ciphertext stealing as follows:
	//	C[n-1] = Encrypt(Encrypt(P[n-1] ^ C[n-2]) ^ P[n])
	//	C[n] = Encrypt(P[n-1] ^ C[n-2])[0..LEN]
	//
	// C[i] denotes the i'th ciphertext block, and likewise P[i] the i'th
	// plaintext block.  Block n, the last block, may be partial; its length
	// is 1 <= LEN <= 16.  If there are only 2 blocks, C[n-2] means the IV.
	//
	// v16 already contains Encrypt(P[n-1] ^ C[n-2]).
	// INP points to P[n].  OUTP points to where C[n-1] should go.
	// To support in-place encryption, load P[n] before storing C[n].
	//
	// The tail-undisturbed ('tu') setting matters: with vl = LEN bytes, the
	// XOR below changes only the first LEN bytes of v16 and leaves the rest
	// as they were, which is the same as XOR'ing with a zero-padded P[n].
	addi		t0, OUTP, 16	// Get pointer to where C[n] should go
	vsetvli		zero, LEN, e8, m1, tu, ma
	vle8.v		v17, (INP)	// Load P[n]
	vse8.v		v16, (t0)	// Store C[n]
	vxor.vv		v16, v16, v17	// v16 = Encrypt(P[n-1] ^ C[n-2]) ^ P[n]
	vsetivli	zero, 4, e32, m1, ta, ma
	aes_encrypt	v16, \keylen
.Lcts_encrypt_done\@:
	vse32.v		v16, (OUTP)	// Store C[n-1] (or C[n] in single-block case)
	ret
.endm

// Scratch registers used only by aes_cbc_cts_decrypt.
#define LEN32		t4 // Length of remaining full blocks in 32-bit words
#define LEN_MOD16	t5 // Length of message in bytes mod 16

// CBC-CTS decryption body for one key length.
//
// On entry v16 must already hold the IV (aes_cbc_cts_crypt loads it before
// expanding this macro).  INP and OUTP point to the start of the ciphertext and
// of the output buffer, and LEN is the message length in bytes.  LEN itself is
// only read; the derived counts live in LEN32 and LEN_MOD16.
//
// t0, t1, LEN32 and LEN_MOD16 are used as scratch, as are the vector register
// groups starting at v16, v20 and v24, and v28.  INP and OUTP are advanced.
// Nothing is written back through IVP.  The macro ends in 'ret'.
//
// NOTE(review): the code appears to require LEN >= 16 (with LEN < 16 the full-
// block loop would be entered with a zero word count); confirm that callers
// guarantee this.
.macro	aes_cbc_cts_decrypt	keylen
	andi		LEN32, LEN, ~15
	srli		LEN32, LEN32, 2
	andi		LEN_MOD16, LEN, 15

	// Save C[n-2] in v28 so that it's available later during the ciphertext
	// stealing step.  If there are fewer than three blocks, C[n-2] means
	// the IV, otherwise it means the third-to-last ciphertext block.
	//
	// A message has at least three blocks exactly when LEN >= 33, and in
	// that case (LEN - 33) rounded down to a multiple of 16 is the byte
	// offset of C[n-2] from the start of the ciphertext.
	vmv.v.v		v28, v16	// IV
	addi		t0, LEN, -33
	bltz		t0, .Lcts_decrypt_loop\@
	andi		t0, t0, ~15
	add		t0, t0, INP
	vle32.v		v28, (t0)

	// CBC-decrypt all full blocks.  For the last full block, or the last 2
	// full blocks if the message is block-aligned, this doesn't write the
	// correct output blocks (unless the message is only a single block),
	// because it XORs the wrong values with the raw AES plaintexts.  But we
	// fix this after this loop without redoing the AES decryptions.  This
	// approach allows more of the AES decryptions to be parallelized.
.Lcts_decrypt_loop\@:
	vsetvli		t0, LEN32, e32, m4, ta, ma
	addi		t1, t0, -4
	vle32.v		v20, (INP)	// Load next set of ciphertext blocks
	vmv.v.v		v24, v16	// Get IV or last ciphertext block of prev set
	vslideup.vi	v24, v20, 4	// Setup prev ciphertext blocks
	vslidedown.vx	v16, v20, t1	// Save last ciphertext block of this set
	aes_decrypt	v20, \keylen	// Decrypt this set of blocks
	vxor.vv		v24, v24, v20	// XOR prev ciphertext blocks with decrypted blocks
	vse32.v		v24, (OUTP)	// Store this set of plaintext blocks
	sub		LEN32, LEN32, t0
	slli		t0, t0, 2	// Words to bytes
	add		INP, INP, t0
	add		OUTP, OUTP, t0
	bnez		LEN32, .Lcts_decrypt_loop\@

	// t1 still holds the word index of the last block within the final set,
	// as computed at the top of the last loop iteration.
	vsetivli	zero, 4, e32, m4, ta, ma
	vslidedown.vx	v20, v20, t1	// Extract raw plaintext of last full block
	addi		t0, OUTP, -16	// Get pointer to last full plaintext block
	bnez		LEN_MOD16, .Lcts_decrypt_non_block_aligned\@

	// Special case: if the message is a single block, just do CBC.
	li		t1, 16
	beq		LEN, t1, .Lcts_decrypt_done\@

	// Block-aligned message.  Just fix up the last 2 blocks.  We need:
	//
	//	P[n-1] = Decrypt(C[n]) ^ C[n-2]
	//	P[n] = Decrypt(C[n-1]) ^ C[n]
	//
	// We have C[n] in v16, Decrypt(C[n]) in v20, and C[n-2] in v28.
	// Together with Decrypt(C[n-1]) ^ C[n-2] from the output buffer, this
	// is everything needed to fix the output without re-decrypting blocks.
	addi		t1, OUTP, -32	// Get pointer to where P[n-1] should go
	vxor.vv		v20, v20, v28	// Decrypt(C[n]) ^ C[n-2] == P[n-1]
	vle32.v		v24, (t1)	// Decrypt(C[n-1]) ^ C[n-2]
	vse32.v		v20, (t1)	// Store P[n-1]
	vxor.vv		v20, v24, v16	// Decrypt(C[n-1]) ^ C[n-2] ^ C[n] == P[n] ^ C[n-2]
	j		.Lcts_decrypt_finish\@

.Lcts_decrypt_non_block_aligned\@:
	// Decrypt the last two blocks using ciphertext stealing as follows:
	//
	//	P[n-1] = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16]) ^ C[n-2]
	//	P[n] = (Decrypt(C[n-1]) ^ C[n])[0..LEN_MOD16]
	//
	// We already have Decrypt(C[n-1]) in v20 and C[n-2] in v28.
	//
	// The tail-undisturbed ('tu') setting is what builds the concatenation:
	// the partial load replaces only the first LEN_MOD16 bytes of v20 and
	// leaves the remaining bytes of Decrypt(C[n-1]) in place.
	vmv.v.v		v16, v20	// v16 = Decrypt(C[n-1])
	vsetvli		zero, LEN_MOD16, e8, m1, tu, ma
	vle8.v		v20, (INP)	// v20 = C[n] || Decrypt(C[n-1])[LEN_MOD16..16]
	vxor.vv		v16, v16, v20	// v16 = Decrypt(C[n-1]) ^ C[n]
	vse8.v		v16, (OUTP)	// Store P[n]
	vsetivli	zero, 4, e32, m1, ta, ma
	aes_decrypt	v20, \keylen	// v20 = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16])
.Lcts_decrypt_finish\@:
	vxor.vv		v20, v20, v28	// XOR with C[n-2]
	vse32.v		v20, (t0)	// Store last full plaintext block
.Lcts_decrypt_done\@:
	ret
.endm

// CBC-CTS body for one key length: load the IV into v16, then pick the
// direction based on a5, the register that carries the |enc| argument of
// aes_cbc_cts_crypt_zvkned.  A zero value selects the decryption body; anything
// else runs the encryption body.
//
// aes_cbc_cts_encrypt ends in 'ret', so execution never falls through from it
// into the decryption body placed right after it.
.macro	aes_cbc_cts_crypt	keylen
	vle32.v		v16, (IVP)	// Load IV
	beqz		a5, .Lcts_decrypt\@	// enc == false: decrypt
	aes_cbc_cts_encrypt \keylen
.Lcts_decrypt\@:
	aes_cbc_cts_decrypt \keylen
.endm

// void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key,
//			         const u8 *in, u8 *out, size_t len,
//				 const u8 iv[16], bool enc);
//
// Encrypts or decrypts a message with the CS3 variant of AES-CBC-CTS.
// This is the variant that unconditionally swaps the last two blocks.
//
// |enc| selects encryption (true) or decryption (false).  |len| doesn't need
// to be a multiple of 16; the last block may be partial.  Unlike the plain CBC
// functions, |iv| is only read and is not updated on return.
//
// NOTE(review): the code appears to require |len| >= 16; confirm that callers
// guarantee this.
//
// The AES-256 body follows aes_begin directly; the AES-128 and AES-192 bodies
// are at labels 128 and 192.  Every path through aes_cbc_cts_crypt ends in
// 'ret'.
SYM_FUNC_START(aes_cbc_cts_crypt_zvkned)
	aes_begin	KEYP, 128f, 192f
	aes_cbc_cts_crypt 256
128:
	aes_cbc_cts_crypt 128
192:
	aes_cbc_cts_crypt 192
SYM_FUNC_END(aes_cbc_cts_crypt_zvkned)