;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
;;
;; Licensed under the OpenSSL license (the "License").  You may not use
;; this file except in compliance with the License.  You can obtain a copy
;; in the file LICENSE in the source distribution or at
;; https://www.openssl.org/source/license.html
;;
;;====================================================================
;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
;; project.
;;
;; Rights for redistribution and usage in source and binary forms are
;; granted according to the OpenSSL license. Warranty of any kind is
;; disclaimed.
;;====================================================================
;; The compiler-generated multiply-and-add SPLOOP runs at 12*n cycles,
;; n being the number of 32-bit words, and addition at 8*n. The
;; corresponding 4x unrolled SPLOOP-free loops run at ~8*n and ~5*n.
;; The assembler SPLOOPs below spin at ... 2*n cycles [plus epilogue].
;;====================================================================
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	bn_mul_add_words,_bn_mul_add_words
	.asg	bn_mul_words,_bn_mul_words
	.asg	bn_sqr_words,_bn_sqr_words
	.asg	bn_add_words,_bn_add_words
	.asg	bn_sub_words,_bn_sub_words
	.asg	bn_div_words,_bn_div_words
	.asg	bn_sqr_comba8,_bn_sqr_comba8
	.asg	bn_mul_comba8,_bn_mul_comba8
	.asg	bn_sqr_comba4,_bn_sqr_comba4
	.asg	bn_mul_comba4,_bn_mul_comba4
	.endif

;; Aliases per the C6000 calling convention: B3 holds the return
;; address, A4/B4/A6/B6/A8/B8 carry the first six arguments, and A4
;; doubles as the return-value register.
	.asg	B3,RA
	.asg	A4,ARG0
	.asg	B4,ARG1
	.asg	A6,ARG2
	.asg	B6,ARG3
	.asg	A8,ARG4
	.asg	B8,ARG5
	.asg	A4,RET
	.asg	A15,FP
	.asg	B14,DP
	.asg	B15,SP

	.global	_bn_mul_add_words
_bn_mul_add_words:
	.asmfunc
	MV	ARG2,B0
  [!B0]	BNOP	RA
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	ZERO	A19		; high part of accumulator
|| [B0]	MV	ARG0,A2
|| [B0]	MV	ARG3,A3
	NOP	3

	SPLOOP	2		; 2*n+10
;;====================================================================
	LDW	*ARG1++,B7	; ap[i]
	NOP	3
	LDW	*ARG0++,A7	; rp[i]
	MPY32U	B7,A3,A17:A16
	NOP	3		; [2,0] in epilogue
	ADDU	A16,A7,A21:A20
	ADDU	A19,A21:A20,A19:A18
||	MV.S	A17,A23
	SPKERNEL 2,1		; leave slot for "return value"
||	STW	A18,*A2++	; rp[i]
||	ADD	A19,A23,A19
;;====================================================================
	BNOP	RA,4
	MV	A19,RET		; return value
	.endasmfunc
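;;====================================================================
;; For reference, a minimal C model of the contract the SPLOOP above
;; implements (a sketch assuming 32-bit BN_ULONG and a 64-bit
;; unsigned long long; not part of the build):
;;
;;	BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap,
;;	                          int num, BN_ULONG w)
;;	{
;;		unsigned long long c = 0;	/* carry accumulator */
;;		while (num--) {
;;			c += (unsigned long long)w * *ap++ + *rp;
;;			*rp++ = (BN_ULONG)c;	/* low 32 bits */
;;			c >>= 32;		/* high 32 bits carry on */
;;		}
;;		return (BN_ULONG)c;		/* final carry */
;;	}
;;====================================================================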

	.global	_bn_mul_words
_bn_mul_words:
	.asmfunc
	MV	ARG2,B0
  [!B0]	BNOP	RA
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	ZERO	A19		; high part of accumulator
	NOP	3

	SPLOOP	2		; 2*n+10
;;====================================================================
	LDW	*ARG1++,A7	; ap[i]
	NOP	4
	MPY32U	A7,ARG3,A17:A16
	NOP	4		; [2,0] in epilogue
	ADDU	A19,A16,A19:A18
||	MV.S	A17,A21
	SPKERNEL 2,1		; leave slot for "return value"
||	STW	A18,*ARG0++	; rp[i]
||	ADD.L	A19,A21,A19
;;====================================================================
	BNOP	RA,4
	MV	A19,RET		; return value
	.endasmfunc
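;;====================================================================
;; Reference C model (a sketch under the same assumptions as above;
;; identical to bn_mul_add_words minus the rp[i] load):
;;
;;	BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap,
;;	                      int num, BN_ULONG w)
;;	{
;;		unsigned long long c = 0;
;;		while (num--) {
;;			c += (unsigned long long)w * *ap++;
;;			*rp++ = (BN_ULONG)c;
;;			c >>= 32;
;;		}
;;		return (BN_ULONG)c;
;;	}
;;====================================================================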

	.global	_bn_sqr_words
_bn_sqr_words:
	.asmfunc
	MV	ARG2,B0
  [!B0]	BNOP	RA
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	MV	ARG0,B2
|| [B0]	ADD	4,ARG0,ARG0
	NOP	3

	SPLOOP	2		; 2*n+10
;;====================================================================
	LDW	*ARG1++,B7	; ap[i]
	NOP	4
	MPY32U	B7,B7,B1:B0
	NOP	3		; [2,0] in epilogue
	STW	B0,*B2++(8)	; rp[2*i]
	MV	B1,A1
	SPKERNEL 2,0		; fully overlap BNOP RA,5
||	STW	A1,*ARG0++(8)	; rp[2*i+1]
;;====================================================================
	BNOP	RA,5
	.endasmfunc
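;;====================================================================
;; Reference C model (a sketch; each input word yields a 64-bit
;; square spread over two result words):
;;
;;	void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num)
;;	{
;;		while (num--) {
;;			unsigned long long t =
;;				(unsigned long long)*ap * *ap;
;;			ap++;
;;			*rp++ = (BN_ULONG)t;		/* rp[2*i]   */
;;			*rp++ = (BN_ULONG)(t >> 32);	/* rp[2*i+1] */
;;		}
;;	}
;;====================================================================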

	.global	_bn_add_words
_bn_add_words:
	.asmfunc
	MV	ARG3,B0
  [!B0]	BNOP	RA
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	ZERO	A1		; carry flag
|| [B0]	MV	ARG0,A3
	NOP	3

	SPLOOP	2		; 2*n+6
;;====================================================================
	LDW	*ARG2++,A7	; bp[i]
||	LDW	*ARG1++,B7	; ap[i]
	NOP	4
	ADDU	A7,B7,A9:A8
	ADDU	A1,A9:A8,A1:A0
	SPKERNEL 0,0		; fully overlap BNOP RA,5
||	STW	A0,*A3++	; write result
||	MV	A1,RET		; keep carry flag in RET
;;====================================================================
	BNOP	RA,5
	.endasmfunc
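;;====================================================================
;; Reference C model (a sketch; the 33-bit sum keeps the carry in
;; bit 32, just as the paired ADDUs above do):
;;
;;	BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap,
;;	                      const BN_ULONG *bp, int num)
;;	{
;;		unsigned long long c = 0;
;;		while (num--) {
;;			c += (unsigned long long)*ap++ + *bp++;
;;			*rp++ = (BN_ULONG)c;
;;			c >>= 32;		/* carry is 0 or 1 */
;;		}
;;		return (BN_ULONG)c;
;;	}
;;====================================================================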

	.global	_bn_sub_words
_bn_sub_words:
	.asmfunc
	MV	ARG3,B0
  [!B0]	BNOP	RA
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	ZERO	A2		; borrow flag
|| [B0]	MV	ARG0,A3
	NOP	3

	SPLOOP	2		; 2*n+6
;;====================================================================
	LDW	*ARG2++,A7	; bp[i]
||	LDW	*ARG1++,B7	; ap[i]
	NOP	4
	SUBU	B7,A7,A1:A0
  [A2]	SUB	A1:A0,1,A1:A0
	SPKERNEL 0,1		; leave slot for "return borrow flag"
||	STW	A0,*A3++	; write result
||	AND	1,A1,A2		; pass on borrow flag
;;====================================================================
	BNOP	RA,4
	AND	1,A1,RET	; return borrow flag
	.endasmfunc
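;;====================================================================
;; Reference C model (a sketch; the borrow propagates exactly like
;; the A2 predicate above):
;;
;;	BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap,
;;	                      const BN_ULONG *bp, int num)
;;	{
;;		BN_ULONG borrow = 0;
;;		while (num--) {
;;			BN_ULONG a = *ap++, b = *bp++;
;;			*rp++ = a - b - borrow;
;;			borrow = (a < b) || (a == b && borrow);
;;		}
;;		return borrow;
;;	}
;;====================================================================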

	.global	_bn_div_words
_bn_div_words:
	.asmfunc
	LMBD	1,A6,A0		; leading zero bits in dv
	LMBD	1,A4,A1		; leading zero bits in hi
||	MVK	32,B0
	CMPLTU	A1,A0,A2
||	ADD	A0,B0,B0
  [ A2]	BNOP	RA
||[ A2]	MVK	-1,A4		; return overflow
||[!A2]	MV	A4,A3		; reassign hi
  [!A2]	MV	B4,A4		; reassign lo, will be quotient
||[!A2]	MVC	B0,ILC
  [!A2]	SHL	A6,A0,A6	; normalize dv
||	MVK	1,A1

  [!A2]	CMPLTU	A3,A6,A1	; hi<dv?
||[!A2]	SHL	A4,1,A5:A4	; lo<<1
  [!A1]	SUB	A3,A6,A3	; hi-=dv
||[!A1]	OR	1,A4,A4
  [!A2]	SHRU	A3,31,A1	; upper bit
||[!A2]	ADDAH	A5,A3,A3	; hi<<1|lo>>31

	SPLOOP	3
  [!A1]	CMPLTU	A3,A6,A1	; hi<dv?
||[ A1]	ZERO	A1
||	SHL	A4,1,A5:A4	; lo<<1
  [!A1]	SUB	A3,A6,A3	; hi-=dv
||[!A1]	OR	1,A4,A4		; quotient
	SHRU	A3,31,A1	; upper bit
||	ADDAH	A5,A3,A3	; hi<<1|lo>>31
	SPKERNEL

	BNOP	RA,5
	.endasmfunc
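;;====================================================================
;; Reference C model (a sketch: plain restoring binary division with
;; a 33-bit compare instead of the normalization trick used above):
;;
;;	BN_ULONG bn_div_words(BN_ULONG hi, BN_ULONG lo, BN_ULONG dv)
;;	{
;;		BN_ULONG q = 0;
;;		int i;
;;		if (hi >= dv)
;;			return (BN_ULONG)-1;	/* quotient overflows */
;;		for (i = 0; i < 32; i++) {
;;			BN_ULONG top = hi >> 31;
;;			hi = (hi << 1) | (lo >> 31);
;;			lo <<= 1;
;;			q <<= 1;
;;			if (top || hi >= dv) {	/* top:hi >= dv? */
;;				hi -= dv;
;;				q |= 1;
;;			}
;;		}
;;		return q;
;;	}
;;====================================================================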

;;====================================================================
;; Not really the Comba algorithm, just straightforward NxM
;; multiplication. Dedicated, fully unrolled real Comba
;; implementations are asymptotically 2x faster, but naturally a
;; larger undertaking. The purpose of this exercise was rather to
;; master nested SPLOOPs; a reference C model follows.
;;====================================================================
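;; For reference, a C sketch of what the nested SPLOOPs compute: the
;; first outer pass is effectively bn_mul_words, every later pass is
;; bn_mul_add_words shifted one word up (not part of the build):
;;
;;	void bn_mul_comba8(BN_ULONG *rp, const BN_ULONG *ap,
;;	                   const BN_ULONG *bp)
;;	{
;;		int i, j;
;;		unsigned long long c;
;;		for (i = 0; i < 8; i++) {
;;			c = 0;
;;			for (j = 0; j < 8; j++) {
;;				c += (unsigned long long)ap[i] * bp[j]
;;				   + (i ? rp[i + j] : 0);
;;				rp[i + j] = (BN_ULONG)c;
;;				c >>= 32;
;;			}
;;			rp[i + 8] = (BN_ULONG)c; /* high word of pass */
;;		}
;;	}
;;====================================================================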
	.global	_bn_sqr_comba8
	.global	_bn_mul_comba8
_bn_sqr_comba8:
	MV	ARG1,ARG2
_bn_mul_comba8:
	.asmfunc
	MVK	8,B0		; N, RILC
||	MVK	8,A0		; M, outer loop counter
||	MV	ARG1,A5		; copy ap
||	MV	ARG0,B4		; copy rp
||	ZERO	B19		; high part of accumulator
	MVC	B0,RILC
||	SUB	B0,2,B1		; N-2, initial ILC
||	SUB	B0,1,B2		; const B2=N-1
||	LDW	*A5++,B6	; ap[0]
||	MV	A0,A3		; const A3=M
sploopNxM?:			; for best performance arrange M<=N
   [A0]	SPLOOPD	2		; 2*n+10
||	MVC	B1,ILC
||	ADDAW	B4,B0,B5
||	ZERO	B7
||	LDW	*A5++,A9	; pre-fetch ap[1]
||	ZERO	A1
||	SUB	A0,1,A0
;;====================================================================
;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
;; This is because of Advisory 15 from TI publication SPRZ247I.
	LDW	*ARG2++,A7	; bp[i]
	NOP	3
   [A1]	LDW	*B5++,B7	; rp[i]
	MPY32U	A7,B6,B17:B16
	NOP	3
	ADDU	B16,B7,B21:B20
	ADDU	B19,B21:B20,B19:B18
||	MV.S	B17,B23
	SPKERNEL
||	STW	B18,*B4++	; rp[i]
||	ADD.S	B19,B23,B19
;;====================================================================
outer?:				; m*2*(n+1)+10
	SUBAW	ARG2,A3,ARG2	; rewind bp to bp[0]
	SPMASKR
||	CMPGT	A0,1,A2		; done pre-fetching ap[i+1]?
	MVD	A9,B6		; move through .M unit(*)
   [A2]	LDW	*A5++,A9	; pre-fetch ap[i+1]
	SUBAW	B5,B2,B5	; rewind rp to rp[1]
	MVK	1,A1
   [A0]	BNOP.S1	outer?,4
|| [A0]	SUB.L	A0,1,A0
	STW	B19,*B4--[B2]	; rewind rp to rp[1]
||	ZERO.S	B19		; high part of accumulator
;; end of outer?
	BNOP	RA,5		; return
	.endasmfunc
;; (*)	It should be noted that B6 is used as input to MPY32U in the
;;	chronologically next cycle in the *preceding* SPLOOP iteration.
;;	Normally such an arrangement would require DINT, but at this
;;	point the SPLOOP is draining and interrupts are disabled
;;	implicitly.

	.global	_bn_sqr_comba4
	.global	_bn_mul_comba4
_bn_sqr_comba4:
	MV	ARG1,ARG2
_bn_mul_comba4:
	.asmfunc
	.if	0
	BNOP	sploopNxM?,3
	;; The above-mentioned m*2*(n+1)+10 does not apply in the n=m=4
	;; case because of the low-counter effect: the prologue phase
	;; finishes before the SPKERNEL instruction is reached. As a
	;; result it's 25% slower than expected...
	MVK	4,B0		; N, RILC
||	MVK	4,A0		; M, outer loop counter
||	MV	ARG1,A5		; copy ap
||	MV	ARG0,B4		; copy rp
||	ZERO	B19		; high part of accumulator
	MVC	B0,RILC
||	SUB	B0,2,B1		; first ILC
||	SUB	B0,1,B2		; const B2=N-1
||	LDW	*A5++,B6	; ap[0]
||	MV	A0,A3		; const A3=M
	.else
	;; This alternative is an exercise in fully unrolled Comba
	;; algorithm implementation that operates at n*(n+1)+12, or
	;; as little as 32 cycles... A reference C model of the
	;; column-wise accumulation follows.
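	;;
	;; (A sketch, not part of the build: products are summed one
	;; output column at a time, low halves into column k and high
	;; halves into column k+1, exactly as the parallel ADDUs below
	;; do with the A- and B-side accumulators.)
	;;
	;;	void bn_mul_comba4(BN_ULONG *rp, const BN_ULONG *ap,
	;;	                   const BN_ULONG *bp)
	;;	{
	;;		unsigned long long acc = 0, hi, p;
	;;		int i, k;
	;;		for (k = 0; k < 7; k++) {
	;;			hi = 0;
	;;			for (i = 0; i <= k; i++) {
	;;				if (i > 3 || k - i > 3)
	;;					continue;
	;;				p = (unsigned long long)ap[i] * bp[k - i];
	;;				acc += (BN_ULONG)p;
	;;				hi  += (BN_ULONG)(p >> 32);
	;;			}
	;;			rp[k] = (BN_ULONG)acc;
	;;			acc = (acc >> 32) + hi;	/* carry to column k+1 */
	;;		}
	;;		rp[7] = (BN_ULONG)acc;
	;;	}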
	LDW	*ARG1[0],B16	; a[0]
||	LDW	*ARG2[0],A16	; b[0]
	LDW	*ARG1[1],B17	; a[1]
||	LDW	*ARG2[1],A17	; b[1]
	LDW	*ARG1[2],B18	; a[2]
||	LDW	*ARG2[2],A18	; b[2]
	LDW	*ARG1[3],B19	; a[3]
||	LDW	*ARG2[3],A19	; b[3]
	NOP
	MPY32U	A16,B16,A1:A0	; a[0]*b[0]
	MPY32U	A17,B16,A23:A22	; a[0]*b[1]
	MPY32U	A16,B17,A25:A24	; a[1]*b[0]
	MPY32U	A16,B18,A27:A26	; a[2]*b[0]
	STW	A0,*ARG0[0]
||	MPY32U	A17,B17,A29:A28	; a[1]*b[1]
	MPY32U	A18,B16,A31:A30	; a[0]*b[2]
||	ADDU	A22,A1,A1:A0
	MV	A23,B0
||	MPY32U	A19,B16,A21:A20	; a[3]*b[0]
||	ADDU	A24,A1:A0,A1:A0
	ADDU	A25,B0,B1:B0
||	STW	A0,*ARG0[1]
||	MPY32U	A18,B17,A23:A22	; a[2]*b[1]
||	ADDU	A26,A1,A9:A8
	ADDU	A27,B1,B9:B8
||	MPY32U	A17,B18,A25:A24	; a[1]*b[2]
||	ADDU	A28,A9:A8,A9:A8
	ADDU	A29,B9:B8,B9:B8
||	MPY32U	A16,B19,A27:A26	; a[0]*b[3]
||	ADDU	A30,A9:A8,A9:A8
	ADDU	A31,B9:B8,B9:B8
||	ADDU	B0,A9:A8,A9:A8
	STW	A8,*ARG0[2]
||	ADDU	A20,A9,A1:A0
	ADDU	A21,B9,B1:B0
||	MPY32U	A19,B17,A21:A20	; a[3]*b[1]
||	ADDU	A22,A1:A0,A1:A0
	ADDU	A23,B1:B0,B1:B0
||	MPY32U	A18,B18,A23:A22	; a[2]*b[2]
||	ADDU	A24,A1:A0,A1:A0
	ADDU	A25,B1:B0,B1:B0
||	MPY32U	A17,B19,A25:A24	; a[1]*b[3]
||	ADDU	A26,A1:A0,A1:A0
	ADDU	A27,B1:B0,B1:B0
||	ADDU	B8,A1:A0,A1:A0
	STW	A0,*ARG0[3]
||	MPY32U	A19,B18,A27:A26	; a[3]*b[2]
||	ADDU	A20,A1,A9:A8
	ADDU	A21,B1,B9:B8
||	MPY32U	A18,B19,A29:A28	; a[2]*b[3]
||	ADDU	A22,A9:A8,A9:A8
	ADDU	A23,B9:B8,B9:B8
||	MPY32U	A19,B19,A31:A30	; a[3]*b[3]
||	ADDU	A24,A9:A8,A9:A8
	ADDU	A25,B9:B8,B9:B8
||	ADDU	B0,A9:A8,A9:A8
	STW	A8,*ARG0[4]
||	ADDU	A26,A9,A1:A0
	ADDU	A27,B9,B1:B0
||	ADDU	A28,A1:A0,A1:A0
	ADDU	A29,B1:B0,B1:B0
||	BNOP	RA
||	ADDU	B8,A1:A0,A1:A0
	STW	A0,*ARG0[5]
||	ADDU	A30,A1,A9:A8
	ADD	A31,B1,B8
	ADDU	B0,A9:A8,A9:A8	; removed || to avoid cross-path stall below
	ADD	B8,A9,A9
||	STW	A8,*ARG0[6]
	STW	A9,*ARG0[7]
	.endif
	.endasmfunc
