/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

/* Byte-count mask selecting the whole 32-bit words inside one block
 * (bits 2..5, i.e. 0, 4, ..., 60).
 */
#define MASK_U32		0x3c
/* Bytes handled per pass of the block loop: 16 words of 4 bytes. */
#define CHACHA20_BLOCK_SIZE	64
/* Stack frame size in bytes; chacha_crypt_arch fills it with s0-s7. */
#define STACK_SIZE		32

/* X0..X15: registers holding the 16 working state words. */
#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
/* Map an index to its register alias; X(n) is what AXR uses. */
#define T(n)	T ## n
#define X(n)	X ## n

/* Input arguments */
#define STATE		$a0
#define OUT		$a1
#define IN		$a2
#define BYTES		$a3

/* Output argument */
/* NONCE[0] (state word at byte offset 48) is kept in a register and not in
 * memory while blocks are being processed.
 * We don't want to touch the original value in memory until we are done.
 * Must be incremented every loop iteration.
 */
#define NONCE_0		$v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used to handle the last bytes which are not a multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

/* Non-zero when IN or OUT is not 4-byte aligned.
 * Shares $s7 with SAVED_CA: SAVED_CA is only loaded while handling a final
 * partial block, i.e. after the last alignment test has been made.
 */
#define IS_UNALIGNED	$s7

/* Endianness helpers.
 *  MSB/LSB        byte offsets added to a word address for the
 *                 lwl/swl (MSB) and lwr/swr (LSB) unaligned accesses.
 *  CPU_TO_LE32(n) converts register n to little-endian byte order; on
 *                 big-endian this is a byte swap within each halfword (wsbh)
 *                 followed by a halfword swap (rotr 16), on little-endian
 *                 it expands to nothing.
 *  ROTR(n)        big-endian only: after CPU_TO_LE32, rotate so that the
 *                 first output byte sits in the low 8 bits (used by sb).
 *  ROTx           rotate mnemonic that brings the next output byte into the
 *                 low 8 bits in the trailing-byte code.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define	CPU_TO_LE32(n) \
	wsbh	n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif

/* Expand macro x once per state word, index 0 up to 15.
 * Used to emit the jump tables in ascending word order.
 */
#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

/* Expand macro x once per state word, index 15 down to 0.
 * Used to emit the STORE_* sequences so that entering the sequence part-way
 * through processes the remaining lower-numbered words.
 */
#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

/* PLUS_ONE(x) yields x + 1 as a literal token for x in 0..15, via a lookup
 * of PLUS_ONE_<x>. The STORE_* macros use it to build label names.
 */
#define PLUS_ONE_0	 1
#define PLUS_ONE_1	 2
#define PLUS_ONE_2	 3
#define PLUS_ONE_3	 4
#define PLUS_ONE_4	 5
#define PLUS_ONE_5	 6
#define PLUS_ONE_6	 7
#define PLUS_ONE_7	 8
#define PLUS_ONE_8	 9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
/* Two-level paste so that the arguments (e.g. PLUS_ONE(x)) are expanded
 * before being joined into one token.
 */
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)

/* Produce and store output word x for unaligned IN/OUT.
 *
 * Defines the label .Lchacha_mips_xor_unaligned_<x+1>_b in front of the code,
 * then:
 *   T0 = STATE[x]          (skipped for x == 12, where NONCE_0 is used)
 *   T1 = IN[x]             (lwl/lwr pair, any alignment)
 *   Xx = Xx + T0 (or + NONCE_0 for x == 12)
 *   Xx = CPU_TO_LE32(Xx) ^ T1
 *   OUT[x] = Xx            (swl/swr pair, any alignment)
 * Clobbers T0, T1 and Xx.
 */
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

/* Produce and store output word x when IN and OUT are word aligned.
 *
 * Defines the label .Lchacha_mips_xor_aligned_<x+1>_b in front of the code,
 * then:
 *   T0 = STATE[x]          (skipped for x == 12, where NONCE_0 is used)
 *   T1 = IN[x]
 *   Xx = Xx + T0 (or + NONCE_0 for x == 12)
 *   Xx = CPU_TO_LE32(Xx) ^ T1
 *   OUT[x] = Xx
 * Clobbers T0, T1 and Xx.
 */
#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not a multiple of 4.
 * X15 is free to store Xn.
 * Every jumptable entry must be equal in size: each one is a branch plus the
 * instruction in its delay slot, emitted under noreorder.
 *
 * Entry x branches to .Lchacha_mips_xor_aligned_<x>_b and, in the delay slot,
 * sets SAVED_X = Xx + SAVED_CA (Xx + NONCE_0 for x == 12).
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/* Jump table entry x for unaligned IN/OUT.
 * Branches to .Lchacha_mips_xor_unaligned_<x>_b and, in the delay slot, sets
 * SAVED_X = Xx + SAVED_CA (Xx + NONCE_0 for x == 12).
 * Every entry is a branch plus one delay-slot instruction under noreorder,
 * so all entries have the same size.
 */
#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/* Four interleaved add-xor-rotate steps on the working state.
 * For each of the four lanes (A,K,V), (B,L,W), (C,M,Y), (D,N,Z):
 *   X(A) += X(K);  X(V) ^= X(A);  X(V) = rotate-left(X(V), S)
 * The arguments are word indices 0..15; S is the rotate amount in bits.
 */
#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotl	X(V), S;    \
	rotl	X(W), S;    \
	rotl	X(Y), S;    \
	rotl	X(Z), S;

/*
 * chacha_crypt_arch - XOR a buffer with the keystream generated from STATE.
 *
 * Arguments:
 *   STATE ($a0)  pointer to the 16-word state. Only word 12 (byte offset 48,
 *                NONCE_0) is written back; it is incremented once per block
 *                processed, including a trailing partial block.
 *   OUT   ($a1)  destination buffer
 *   IN    ($a2)  source buffer
 *   BYTES ($a3)  number of bytes to process; 0 returns immediately
 *   16($sp)      number of rounds, read from the caller's frame on entry
 *                (NOTE(review): presumably the fifth argument of the o32
 *                calling convention - confirm against the C prototype)
 *
 * The round loop subtracts 2 per pass and exits only at exactly zero, so the
 * round count must be a positive even number.
 *
 * Registers: $at, $v0, $v1, $t0-$t9, $a1-$a3 are modified. $s0-$s7 are saved
 * in the STACK_SIZE-byte frame and restored before returning.
 *
 * Flow per block:
 *   1. Load the state into X0..X15 (X12 comes from NONCE_0) and run the
 *      rounds.
 *   2. Full block: FOR_EACH_WORD_REV(STORE_*) adds the original state, XORs
 *      with IN and writes OUT, using the aligned or unaligned variant.
 *   3. Final partial block: a jump table enters the STORE_* sequence at the
 *      first whole word that is present and leaves SAVED_X holding the
 *      keystream word used for the remaining 1-3 bytes.
 */
.text
.set	reorder
.set	noat
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load number of rounds */
	lw	$at, 16($sp)

	addiu	$sp, -STACK_SIZE

	/* Return bytes = 0. Nothing has been saved yet, so the exit label only
	 * has to release the frame.
	 */
	beqz	BYTES, .Lchacha_mips_end

	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	/* Advance to the next block. */
	addiu	IN,  CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)

	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	/* Word 12 comes from the live register copy, not from memory. */
	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha_xor_rounds:
	/* Each pass performs two rounds; $at counts the rounds left. */
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha_xor_rounds

	/* BYTES now holds (bytes left after this block); negative means this
	 * block is only partially used.
	 */
	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Is data src/dst unaligned? Jump */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Reload the number of rounds for the next block. The unaligned path
	 * repeats this load at its own entry point.
	 */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Account for the block just produced. */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle last bytes. Only reachable with BYTES < 0 when the
	 * STORE_ALIGNED sequence was entered through the jump table.
	 */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to right location in state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS, as a byte offset (0, 4, ..., 60) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Calculate lower half jump table offset: the byte offset is inserted
	 * one bit up, i.e. doubled, because each table entry is two
	 * instructions long.
	 */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read value from STATE. This overwrites IS_UNALIGNED (same register),
	 * which is no longer needed on the final block.
	 */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
	/* Reload the number of rounds for the next block. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to right location in state.
	 * NOTE(review): every path from here reaches .Lchacha_mips_xor_done,
	 * which stores NONCE_0 again after the increment below, so this store
	 * looks redundant - confirm before removing.
	 */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* BYTES >= 0: done. Otherwise fall through to byte handling. */
	bgez	BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Delay slot of the branch above, executed whether or not it is taken.
	 * It is also the landing point of jump table entry 0 (no whole word
	 * to store), so that path counts its block as well.
	 */
	addiu	NONCE_0, 1
	.set reorder

.Lchacha_mips_xor_bytes:
	/* $at is the byte offset of the first incomplete word, BYTES is minus
	 * the number of trailing bytes (1..3) and SAVED_X is the keystream
	 * word covering them.
	 */
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	CPU_TO_LE32(SAVED_X)
	ROTR(SAVED_X)
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS, as a byte offset (0, 4, ..., 60) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Calculate lower half jump table offset: the byte offset is inserted
	 * one bit up, i.e. doubled, because each table entry is two
	 * instructions long.
	 */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read value from STATE. This overwrites IS_UNALIGNED (same register),
	 * which is no longer needed on the final block.
	 */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/* Register setup for hchacha_block_arch.
 *
 * Input arguments
 * STATE	$a0
 * OUT		$a1
 * NROUND	$a2
 *
 * X12..X15 are remapped away from $s2-$s5 so that X11 ($s6) is the only
 * register the routine has to save. X15 aliases STATE, so word 15 must be
 * the last word loaded from the state.
 */

#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE

/*
 * hchacha_block_arch - run the rounds over a 16-word state and emit 8 words.
 *
 * Arguments:
 *   STATE ($a0)  pointer to the 16 input words; memory is only read
 *   OUT   ($a1)  receives working words 0-3 followed by words 12-15
 *                (8 words, 32 bytes)
 *   $a2          number of rounds; decremented by 2 per loop pass and the
 *                loop exits only at exactly zero, so it must be a positive
 *                even number
 *
 * The input state is not added back after the rounds and no byte-order
 * conversion is applied; words are stored as they sit in the registers.
 * Both buffers are accessed with lw/sw, so they are presumably 4-byte
 * aligned - confirm against the callers.
 *
 * Registers: $at, $v0, $v1, $t0-$t9, $a0, $a2, $a3 are modified. X11 ($s6)
 * is saved in the STACK_SIZE-byte frame and restored before returning.
 */
.set noat
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11(s6) */
	sw	X11, 0($sp)

	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)
	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	/* X15 is STATE itself: this load replaces the pointer, so it is last. */
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	/* Each pass performs two rounds. */
	addiu	$a2, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore used register */
	lw	X11, 0($sp)

	/* Output is words 0-3 and 12-15 of the working state. */
	sw	X0,  0(OUT)
	sw	X1,  4(OUT)
	sw	X2,  8(OUT)
	sw	X3,  12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set at
