xref: /linux/arch/mips/crypto/chacha-core.S (revision 3a2f58f3)
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

/*
 * Mask applied to a byte count to keep only the part made of whole 32-bit
 * words inside one block (0x3c keeps bits 2..5, i.e. 0, 4, ..., 60).
 */
#define MASK_U32		0x3c
/* Bytes handled per main-loop iteration; IN and OUT advance by this much. */
#define CHACHA20_BLOCK_SIZE	64
/* Stack frame size in bytes; chacha_crypt_arch stores $s0-$s7 in it. */
#define STACK_SIZE		32

/*
 * Register assignment for the sixteen 32-bit working words X0..X15.
 * X0-X9 use $t0-$t9 and X10 uses $v1. X11-X15 use $s6..$s2, which
 * chacha_crypt_arch saves on entry and restores on exit.
 * (X12-X15 are re-mapped further down for hchacha_block_arch.)
 */
#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
/* Token-pasting helpers: T(1) -> T1 -> $s0, X(12) -> X12 -> its register. */
#define T(n)	T ## n
#define X(n)	X ## n

/* Input arguments */
#define STATE		$a0
#define OUT		$a1
#define IN		$a2
#define BYTES		$a3

/* Output argument */
/* NONCE[0] (state word 12) is kept in a register while running, not in
 * memory, so the value in memory is not touched inside the loop.
 * It must be incremented on every loop iteration.
 */
#define NONCE_0		$v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used to handle the last bytes, which are not a multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

/* Shares $s7 with SAVED_CA: SAVED_CA is only loaded while handling the final,
 * partial block, i.e. after IS_UNALIGNED has been tested for the last time.
 */
#define IS_UNALIGNED	$s7

/*
 * Endianness helpers.
 *
 * MSB/LSB       byte offsets used with lwl/swl (MSB) and lwr/swr (LSB) for
 *               unaligned word accesses.
 * CPU_TO_LE32   converts a register from CPU to little-endian byte order:
 *               a full byte swap (wsbh + rotate by 16) on big-endian, and
 *               nothing on little-endian.
 * ROTR / ROTx   used by the tail-byte code: after CPU_TO_LE32 and ROTR the
 *               first keystream byte sits in the low byte of the register,
 *               and each further "ROTx reg, 8" brings the next one there.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define	CPU_TO_LE32(n) \
	wsbh	n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif

/* Expand macro x once per state word index, in ascending order 0..15. */
#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

/*
 * Expand macro x once per state word index, in descending order 15..0.
 * Used with the STORE_* macros so that entering the expansion at the label
 * of a given word processes that word and every lower-numbered one.
 */
#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

/*
 * PLUS_ONE(x) yields the literal x + 1 for x in 0..15. The preprocessor
 * cannot do arithmetic while pasting tokens, so the results are tabulated.
 */
#define PLUS_ONE_0	 1
#define PLUS_ONE_1	 2
#define PLUS_ONE_2	 3
#define PLUS_ONE_3	 4
#define PLUS_ONE_4	 5
#define PLUS_ONE_5	 6
#define PLUS_ONE_6	 7
#define PLUS_ONE_7	 8
#define PLUS_ONE_8	 9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
/* Two-level paste so that arguments such as PLUS_ONE(x) are expanded first. */
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)

/*
 * STORE_UNALIGNED(x) - finish word x and XOR it into an unaligned buffer.
 *
 * Emits the label .Lchacha_mips_xor_unaligned_<x+1>_b, then:
 *   X<x> += state word x (NONCE_0 instead of memory for word 12),
 *   X<x>  = CPU_TO_LE32(X<x>),
 *   OUT[x] = X<x> ^ IN[x], using lwl/lwr and swl/swr for the accesses.
 * Clobbers T0 (except for word 12) and T1.
 *
 * The label carries x + 1 because, expanded through FOR_EACH_WORD_REV,
 * entering at label n handles words n-1 down to 0.
 */
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

/*
 * STORE_ALIGNED(x) - finish word x and XOR it into a word-aligned buffer.
 *
 * Emits the label .Lchacha_mips_xor_aligned_<x+1>_b, then:
 *   X<x> += state word x (NONCE_0 instead of memory for word 12),
 *   X<x>  = CPU_TO_LE32(X<x>),
 *   OUT[x] = X<x> ^ IN[x], using plain lw/sw.
 * Clobbers T0 (except for word 12) and T1.
 *
 * The label carries x + 1 because, expanded through FOR_EACH_WORD_REV,
 * entering at label n handles words n-1 down to 0.
 */
#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);

/* Jump table macro (aligned variant).
 * Used for setup and handling the last bytes, which are not a multiple of 4.
 * X15 is free to store Xn.
 *
 * Entry x branches to .Lchacha_mips_xor_aligned_<x>_b and, in the branch
 * delay slot, computes SAVED_X = X<x> + SAVED_CA (NONCE_0 for word 12),
 * i.e. the keystream word that covers the trailing bytes.
 *
 * Every jumptable entry must be equal in size: exactly two instructions,
 * which is why the pair is wrapped in noreorder. The dispatch code relies
 * on this 8-byte stride.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/* Jump table macro (unaligned variant).
 * Entry x branches to .Lchacha_mips_xor_unaligned_<x>_b and, in the branch
 * delay slot, computes SAVED_X = X<x> + SAVED_CA (NONCE_0 for word 12).
 * As with the aligned table, each entry must be exactly two instructions.
 */
#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/*
 * AXR - one Add / Xor / Rotate step applied to four word triples at once.
 *
 * For each of the four lanes (A,K,V), (B,L,W), (C,M,Y), (D,N,Z):
 *   X(A) += X(K);  X(V) ^= X(A);  X(V) = rotl(X(V), S);
 * The arguments are word indices 0..15; S is the rotate amount in bits.
 * The four lanes are interleaved instruction by instruction.
 */
#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotl	X(V), S;    \
	rotl	X(W), S;    \
	rotl	X(Y), S;    \
	rotl	X(Z), S;

/*
 * chacha_crypt_arch - XOR a buffer with the ChaCha keystream.
 *
 * Register arguments (see the STATE/OUT/IN/BYTES aliases):
 *   $a0  STATE  pointer to the sixteen 32-bit state words
 *   $a1  OUT    destination buffer
 *   $a2  IN     source buffer
 *   $a3  BYTES  number of bytes to process
 * The number of rounds is read from 16($sp) on entry.
 * NOTE(review): presumably the fifth C argument, passed on the stack by the
 * o32 calling convention - confirm against the C prototype.
 *
 * State word 12 is held in NONCE_0 while running. It is incremented once per
 * block generated (including a final partial block) and written back to
 * 48(STATE) on exit. With BYTES == 0 the function returns immediately and
 * writes nothing.
 *
 * IN and OUT are handled with lw/sw when both are word aligned and with
 * lwl/lwr/swl/swr otherwise. $s0-$s7 are saved in a STACK_SIZE frame and
 * restored on exit. $at is used explicitly as a scratch register, hence
 * ".set noat".
 */
.text
.set	reorder
.set	noat
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load number of rounds (read before $sp is moved) */
	lw	$at, 16($sp)

	addiu	$sp, -STACK_SIZE

	/* Return if BYTES == 0; the $s registers are still untouched here. */
	beqz	BYTES, .Lchacha_mips_end

	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	/* First block: skip the pointer/counter advance at the loop head. */
	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	/* Advance to the next block. */
	addiu	IN,  CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	/* Load the working words; word 12 comes from NONCE_0, not memory. */
	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)

	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha_xor_rounds:
	/* Each pass performs two rounds; $at counts the rounds still to do. */
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha_xor_rounds

	/* BYTES now holds what is left after this block (negative if partial). */
	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Is data src/dst unaligned? Jump */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Set number rounds here to fill delayslot.
	 * (Reloads the round count for the next block from the caller's frame.)
	 */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	/* Also entered part-way, from the aligned jump table. */
	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle last bytes */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to right location in state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS (as a byte count: 4 * words) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Calculate lower half jump table offset.
	 * Entries are 8 bytes, so the offset is $at * 2: bits 5..0 of $at
	 * are placed in bits 6..1 of T0.
	 */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read value from STATE (the word covering the trailing bytes) */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
	/* Set number rounds here to fill delayslot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	/* Also entered part-way, from the unaligned jump table. */
	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to right location in state.
	 * NOTE(review): this store looks redundant - every path from here
	 * reaches .Lchacha_mips_xor_done, which stores NONCE_0 again after
	 * the increment below.
	 */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* Fall through to byte handling */
	bgez	BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Place this here to fill delay slot.
	 * Also the target of jump table entry 0 (no complete word left).
	 */
	addiu	NONCE_0, 1
	.set reorder

.Lchacha_mips_xor_bytes:
	/* $at   = bytes already written as whole words in this block,
	 * BYTES = minus the number of trailing bytes (-1, -2 or -3),
	 * SAVED_X = keystream word covering those bytes.
	 */
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	CPU_TO_LE32(SAVED_X)
	ROTR(SAVED_X)
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS (as a byte count: 4 * words) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Calculate lower half jump table offset (8 bytes per entry) */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read value from STATE (the word covering the trailing bytes) */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/*
 * hchacha_block_arch - run the ChaCha rounds on a state and output words
 * 0-3 and 12-15 of the result.
 *
 * Input arguments
 * STATE	$a0	pointer to the sixteen 32-bit input words
 * OUT		$a1	receives eight 32-bit words (32 bytes)
 * NROUND	$a2	number of rounds; two are performed per loop pass
 *
 * Unlike chacha_crypt_arch, the input words are not added back into the
 * result, and words are stored in CPU byte order with plain sw.
 */

/*
 * Re-map X12..X15 onto registers that need no saving here, so that X11 ($s6)
 * is the only register this function has to preserve. X15 reuses the STATE
 * register, so word 15 must be the last one loaded.
 */
#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE

.set noat
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11(s6) */
	sw	X11, 0($sp)

	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)
	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	/* Overwrites the STATE pointer; nothing is read through it afterwards. */
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	/* Each pass performs two rounds; $a2 counts the rounds still to do. */
	addiu	$a2, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore used register (word 11 is not part of the output) */
	lw	X11, 0($sp)

	/* Output: words 0-3 followed by words 12-15 */
	sw	X0,  0(OUT)
	sw	X1,  4(OUT)
	sw	X2,  8(OUT)
	sw	X3,  12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set at