1#! /usr/bin/env perl
2# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# Poly1305 hash for MIPS64.
18#
19# May 2016
20#
21# Numbers are cycles per processed byte with poly1305_blocks alone.
22#
23#		IALU/gcc
24# R1x000	5.64/+120%	(big-endian)
25# Octeon II	3.80/+280%	(little-endian)
26
27######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
32#
# Symbolic register names under the NUBI layout. Each variable holds the
# assembler operand text "$N" for its hardware register number N.
($zero, $at, $t0, $t1, $t2) =
    map { sprintf('$%d', $_) } 0 .. 2, 24, 25;
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) =
    map { sprintf('$%d', $_) } 4 .. 11;
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11) =
    map { sprintf('$%d', $_) } 12 .. 23;
($gp, $tp, $sp, $fp, $ra) =
    map { sprintf('$%d', $_) } 3, 28 .. 31;
37#
38# The return value is placed in $a0. Following coding rules facilitate
39# interoperability:
40#
41# - never ever touch $tp, "thread pointer", former $gp [o32 can be
42#   excluded from the rule, because it's specified volatile];
43# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
44#   old code];
45# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
46#
47# For reference here is register layout for N32/64 MIPS ABIs:
48#
49# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
50# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
51# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
52# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
53# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
54#
55# <appro@openssl.org>
56#
57######################################################################
58
# The flavour is the first command-line argument. Only flavours that match
# the check below are accepted (n32, 64, nubi64), so the default has to be
# one of them: the former default "o32" made an argument-less invocation
# die unconditionally.
$flavour = shift || "64";

die "MIPS64 only" unless ($flavour =~ /64|n32/i);

# Register that carries the return value: $a0 for NUBI flavours, otherwise
# $t0 (register 2 in the layout above, i.e. the former $v0).
$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
# Operand of the .mask directive in poly1305_blocks_internal: bits 12-17
# for NUBI flavours, bits 16-17 otherwise.
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

# Incoming arguments shared by the generated routines.
($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
# Input words and scratch registers.
($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
68
# Assembly preamble and poly1305_init.
#
# The preamble pulls in mips_arch.h and defines MSB/LSB, the byte offsets
# used by the ldl/ldr unaligned-load pairs for either endianness.
#
# poly1305_init, as emitted below:
#   - stores zero to the three 64-bit words at offsets 0, 8 and 16 of ctx
#     (the hash value);
#   - if inp is zero, skips the key setup and goes straight to the return;
#   - otherwise loads 16 bytes from inp (plain ld on MIPS64R6, ldl/ldr
#     pairs elsewhere) and byte-swaps both words on big-endian builds;
#   - masks the words with 0x0ffffffc0fffffff and 0x0ffffffc0ffffffc and
#     stores them at offsets 24 and 32 of ctx;
#   - stores s1 = r1 + (r1 >> 2) at offset 40 of ctx;
#   - returns 0 in the flavour-dependent return register ($v0 variable).
$code.=<<___;
#include "mips_arch.h"

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)
	ld	$in1,8($inp)
#else
	ldl	$in0,0+MSB($inp)
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	 dsbh	$in1,$in1
	dshd	$in0,$in0
	 dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	 and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	 dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	 dsll	$tmp3,24
	and	$tmp2,$tmp0
	 and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	 and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	 dsrl	$in1,8
	dsll	$tmp2,8
	 dsll	$tmp4,8
	and	$in0,$tmp0
	 and	$in1,$tmp0
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	or	$in0,$tmp1
	 or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	 dsrl	$tmp3,$in1,32
	dsll	$in0,32
	 dsll	$in1,32
	or	$in0,$tmp1
	 or	$in1,$tmp3
# endif
#endif
	li	$tmp0,1
	dsll	$tmp0,32
	daddiu	$tmp0,-63
	dsll	$tmp0,28
	daddiu	$tmp0,-1		# 0ffffffc0fffffff

	and	$in0,$tmp0
	daddiu	$tmp0,-3		# 0ffffffc0ffffffc
	and	$in1,$tmp0

	sd	$in0,24($ctx)
	dsrl	$tmp0,$in1,2
	sd	$in1,32($ctx)
	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$v0,0			# return 0
	jr	$ra
.end	poly1305_init
___
{
# poly1305_blocks / poly1305_blocks_internal.
#
# poly1305_blocks shifts len right by 4 (number of complete 16-byte
# blocks) and returns immediately when the result is zero; otherwise it
# branches to poly1305_blocks_internal, which:
#   - allocates a 6*8-byte frame and saves $s4/$s5, plus $s0-$s3 for NUBI
#     flavours;
#   - loads the hash value (ctx offsets 0, 8, 16) and the key material
#     (ctx offsets 24, 32, 40);
#   - per block: loads 16 input bytes (byte-swapped on big-endian builds),
#     adds them and padbit to the hash, multiplies by the key and performs
#     the reduction;
#   - stores the hash value back, restores the saved registers and returns.
#
# Working registers. The value loaded from ctx offset 40 (r1 + (r1 >> 2),
# see poly1305_init) is deliberately named $rs1 rather than $s1: a lexical
# $s1 would shadow the global register name $s1 for the rest of this scope,
# so the NUBI-only "sd/ld $s1,8($sp)" lines would save and restore the
# register of $s5 a second time and leave the real $s1 (which holds h1 and
# is clobbered) unpreserved.
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);

$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4			# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	.set	noreorder
	dsubu	$sp,6*8
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
# Here $s0-$s3 are the global register names; they are saved only for NUBI
# flavours, matching the wider NUBI value of $SAVED_REGS_MASK.
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sd	$s3,24($sp)
	sd	$s2,16($sp)
	sd	$s1,8($sp)
	sd	$s0,0($sp)
___
$code.=<<___;
	.set	reorder

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
#else
	ldl	$in0,0+MSB($inp)	# load input
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
	daddiu	$len,-1
	daddiu	$inp,16
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	 dsbh	$in1,$in1
	dshd	$in0,$in0
	 dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	 and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	 dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	 dsll	$tmp3,24
	and	$tmp2,$tmp0
	 and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	 and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	 dsrl	$in1,8
	dsll	$tmp2,8
	 dsll	$tmp4,8
	and	$in0,$tmp0
	 and	$in1,$tmp0
	or	$tmp1,$tmp2
	 or	$tmp3,$tmp4
	or	$in0,$tmp1
	 or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	 dsrl	$tmp3,$in1,32
	dsll	$in0,32
	 dsll	$in1,32
	or	$in0,$tmp1
	 or	$in1,$tmp3
# endif
#endif
	daddu	$h0,$in0		# accumulate input
	daddu	$h1,$in1
	sltu	$tmp0,$h0,$in0
	sltu	$tmp1,$h1,$in1
	daddu	$h1,$tmp0

	dmultu	($r0,$h0)		# h0*r0
	 daddu	$h2,$padbit
	 sltu	$tmp0,$h1,$tmp0
	mflo	($d0,$r0,$h0)
	mfhi	($d1,$r0,$h0)

	dmultu	($rs1,$h1)		# h1*5*r1
	 daddu	$tmp0,$tmp1
	 daddu	$h2,$tmp0
	mflo	($tmp0,$rs1,$h1)
	mfhi	($tmp1,$rs1,$h1)

	dmultu	($r1,$h0)		# h0*r1
	 daddu	$d0,$tmp0
	 daddu	$d1,$tmp1
	mflo	($tmp2,$r1,$h0)
	mfhi	($d2,$r1,$h0)
	 sltu	$tmp0,$d0,$tmp0
	 daddu	$d1,$tmp0

	dmultu	($r0,$h1)		# h1*r0
	 daddu	$d1,$tmp2
	 sltu	$tmp2,$d1,$tmp2
	mflo	($tmp0,$r0,$h1)
	mfhi	($tmp1,$r0,$h1)
	 daddu	$d2,$tmp2

	dmultu	($rs1,$h2)		# h2*5*r1
	 daddu	$d1,$tmp0
	 daddu	$d2,$tmp1
	mflo	($tmp2,$rs1,$h2)

	dmultu	($r0,$h2)		# h2*r0
	 sltu	$tmp0,$d1,$tmp0
	 daddu	$d2,$tmp0
	mflo	($tmp3,$r0,$h2)

	daddu	$d1,$tmp2
	daddu	$d2,$tmp3
	sltu	$tmp2,$d1,$tmp2
	daddu	$d2,$tmp2

	li	$tmp0,-4		# final reduction
	and	$tmp0,$d2
	dsrl	$tmp1,$d2,2
	andi	$h2,$d2,3
	daddu	$tmp0,$tmp1
	daddu	$h0,$d0,$tmp0
	sltu	$tmp0,$h0,$tmp0
	daddu	$h1,$d1,$tmp0
	sltu	$tmp0,$h1,$tmp0
	daddu	$h2,$h2,$tmp0

	bnez	$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	.set	noreorder
	ld	$s5,40($sp)		# epilogue
	ld	$s4,32($sp)
___
# Mirror of the NUBI-only prologue: restore the global $s0-$s3 registers.
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	ld	$s3,24($sp)
	ld	$s2,16($sp)
	ld	$s1,8($sp)
	ld	$s0,0($sp)
___
$code.=<<___;
	jr	$ra
	daddu	$sp,6*8
.end	poly1305_blocks_internal
___
}
{
# poly1305_emit: the three arguments arrive in $a0, $a1 and $a2 and are
# referred to below as ctx, mac and nonce.
#
# The emitted routine:
#   - loads the three 64-bit hash words from ctx offsets 0, 8 and 16;
#   - adds 5 to the low two words with carry propagation into the third,
#     then turns bit 2 of the third word into an all-ones/all-zeros mask
#     and uses it to select, without branching, either the sum or the
#     original low two words;
#   - loads four 32-bit nonce words, combines them into two 64-bit values
#     and adds them to the selected value with carry between the words;
#   - stores the resulting 16 bytes to mac one byte at a time, least
#     significant byte first.
# The trailing .rdata section carries the identification string.
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)
	ld	$tmp2,16($ctx)

	daddiu	$in0,$tmp0,5		# compare to modulus
	sltiu	$tmp3,$in0,5
	daddu	$in1,$tmp1,$tmp3
	sltu	$tmp3,$in1,$tmp3
	daddu	$tmp2,$tmp2,$tmp3

	dsrl	$tmp2,2			# see if it carried/borrowed
	dsubu	$tmp2,$zero,$tmp2
	nor	$tmp3,$zero,$tmp2

	and	$in0,$tmp2
	and	$tmp0,$tmp3
	and	$in1,$tmp2
	and	$tmp1,$tmp3
	or	$in0,$tmp0
	or	$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	dsll	$tmp1,32
	dsll	$tmp3,32
	or	$tmp0,$tmp1
	or	$tmp2,$tmp3

	daddu	$in0,$tmp0		# accumulate nonce
	daddu	$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	daddu	$in1,$tmp0

	dsrl	$tmp0,$in0,8		# write mac value
	dsrl	$tmp1,$in0,16
	dsrl	$tmp2,$in0,24
	sb	$in0,0($mac)
	dsrl	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	dsrl	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	dsrl	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	dsrl	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	dsrl	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	dsrl	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	dsrl	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	dsrl	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	dsrl	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	dsrl	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	dsrl	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
}
433
# The last remaining command-line argument, if any, names the output file;
# without one the generated code is written to the inherited STDOUT.
$output = pop;
if ($output) {
    # Three-argument open keeps characters in the file name from being
    # read as part of the open mode, and a failure is reported instead of
    # silently sending the code to the original STDOUT.
    open(STDOUT, ">", $output) or die "can't open $output: $!";
}
print $code;
# Buffered write errors only surface at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";
437
438