1#ifdef __linux__
2#include <asm/regdef.h>
3#else
4#include <asm.h>
5#include <regdef.h>
6#endif
7
.text

# AT ($28, the assembler temporary) is used as an ordinary scratch
# register below, and the hot loops are scheduled by hand — so tell
# the assembler to keep its hands off.
.set	noat
.set	noreorder
12
#-----------------------------------------------------------------------
# int bn_mul_mont(rp, ap, bp, np, n0, num)
#
# Montgomery multiplication for OpenSSL's BIGNUM code (see the
# CRYPTOGAMS identification string at the end of the file).
#
# In:   a0 = rp, result vector, num 64-bit words
#       a1 = ap, first operand
#       a2 = bp, second operand
#       a3 = np, modulus
#       a4 = &n0 (note: passed by REFERENCE, dereferenced below)
#       a5 = num, word count (32-bit int, sign-extended on entry)
# Out:  v0 = 1 on success, 0 if num < 4 (caller falls back elsewhere)
#
# NOTE(review): the arithmetic contract is the standard bn_mul_mont
# one (rp = ap*bp*R^-1 mod np, R = 2^(64*num)) — confirm against the
# OpenSSL caller; only the mechanics below are verified from this file.
#
# A temporary vector tp[num+2] is carved out of the stack below the
# 48-byte register-save frame; sp is aligned down to a 4096-byte
# boundary. fp anchors the fixed frame so the epilogue can unwind.
#
# Register roles in the main loops:
#   t5      = bp[i], current multiplier word
#   s5      = m = tp[0]*n0 mod 2^64, the Montgomery reduction factor
#   t0/t1   = lo/hi accumulator of the ap[j]*bp[i] column
#   t2/t3   = lo/hi accumulator of the np[j]*m  column
#   t8/t9   = mulq/umulh results for the next ap word (pipelined)
#   t10/t11 = mulq/umulh results for the next np word (pipelined)
#   s4 = inner index j, s3 = outer index i, t7 = &tp[j]
#-----------------------------------------------------------------------
.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)	# fixed register-save frame
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp		# fp anchors the frame; sp moves further below
	.mask	0x0400f000,-48	# saved: ra, s3-s5, fp
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	a5,a5		# num is a 32-bit int
	mov	0,v0		# failure return value
	cmplt	a5,4,AT
	bne	AT,.Lexit	# num<4 is not supported, bail out with 0

	ldq	t1,0(a1)	# ap[0]
	s8addq	a5,16,AT	# AT = 8*num+16 = sizeof(tp[num+2])
	ldq	t4,8(a1)
	subq	sp,AT,sp	# carve tp out of the stack
	ldq	t5,0(a2)	# bp[0]
	lda	AT,-4096(zero)	# mov	-4096,AT
	ldq	a4,0(a4)	# n0 is passed by reference
	and	sp,AT,sp	# align sp down to a 4096-byte boundary

	mulq	t1,t5,t0	# ap[0]*bp[0] lo
	ldq	t3,0(a3)	# np[0]
	umulh	t1,t5,t1	# ap[0]*bp[0] hi
	ldq	t6,8(a3)

	mulq	t0,a4,s5	# m = tp[0]*n0 mod 2^64

	mulq	t3,s5,t2	# np[0]*m lo
	umulh	t3,s5,t3	# np[0]*m hi

	addq	t2,t0,t2	# lo word is discarded; only its carry matters
	cmpult	t2,t0,AT
	addq	t3,AT,t3

	mulq	t4,t5,t8	# start ap[1]*bp[0]
	mov	2,s4		# j=2; words 0 and 1 are already in flight
	umulh	t4,t5,t9
	mov	sp,t7		# t7 = tp

	mulq	t6,s5,t10	# start np[1]*m
	s8addq	s4,a1,t4	# &ap[2]
	umulh	t6,s5,t11
	s8addq	s4,a3,t6	# &np[2]
.align	4
# First outer iteration (i=0): tp[] = ap[]*bp[0] + np[]*m.
# Hand-scheduled (.set noreorder); multiplies for word j are issued
# while the word j-1 sums are folded together.
.L1st:
	.set	noreorder
	ldq	t4,0(t4)	# ap[j]
	addl	s4,1,s4		# j++
	ldq	t6,0(t6)	# np[j]
	lda	t7,8(t7)

	addq	t8,t1,t0	# lo(ap[j-1]*bp[0]) + previous hi
	mulq	t4,t5,t8	# start ap[j]*bp[0]
	cmpult	t0,t1,AT
	addq	t10,t3,t2	# lo(np[j-1]*m) + previous hi

	mulq	t6,s5,t10	# start np[j]*m
	addq	t9,AT,t1
	cmpult	t2,t3,v0
	cmplt	s4,a5,t12	# j<num?

	umulh	t4,t5,t9
	addq	t11,v0,t3
	addq	t2,t0,t2	# tp[j-1] = ap-column + np-column
	s8addq	s4,a1,t4	# &ap[j+1]

	umulh	t6,s5,t11
	cmpult	t2,t0,v0
	addq	t3,v0,t3	# propagate carry into the hi accumulator
	s8addq	s4,a3,t6	# &np[j+1]

	stq	t2,-8(t7)	# commit tp[j-1]
	nop
	unop
	bne	t12,.L1st
	.set	reorder

	# epilogue of the first iteration: fold in the last column sums
	addq	t8,t1,t0
	addq	t10,t3,t2
	cmpult	t0,t1,AT
	cmpult	t2,t3,v0
	addq	t9,AT,t1
	addq	t11,v0,t3

	addq	t2,t0,t2
	cmpult	t2,t0,v0
	addq	t3,v0,t3

	stq	t2,0(t7)	# tp[num-1]

	addq	t3,t1,t3	# top word = np-column hi + ap-column hi
	cmpult	t3,t1,AT
	stq	t3,8(t7)	# tp[num]
	stq	AT,16(t7)	# tp[num+1], overflow bit

	mov	1,s3		# outer index i=1
.align	4
# Remaining outer iterations: tp[] = (tp[] + ap[]*bp[i] + np[]*m)/2^64.
.Louter:
	s8addq	s3,a2,t5	# &bp[i]
	ldq	t1,0(a1)	# ap[0]
	ldq	t4,8(a1)	# ap[1]
	ldq	t5,0(t5)	# bp[i]
	ldq	t3,0(a3)	# np[0]
	ldq	t6,8(a3)	# np[1]
	ldq	t12,0(sp)	# tp[0]

	mulq	t1,t5,t0	# ap[0]*bp[i]
	umulh	t1,t5,t1

	addq	t0,t12,t0	# + tp[0]
	cmpult	t0,t12,AT
	addq	t1,AT,t1

	mulq	t0,a4,s5	# m = tp[0]*n0 mod 2^64

	mulq	t3,s5,t2	# np[0]*m
	umulh	t3,s5,t3

	addq	t2,t0,t2	# lo word discarded; only its carry matters
	cmpult	t2,t0,AT
	mov	2,s4		# inner index j=2
	addq	t3,AT,t3

	mulq	t4,t5,t8	# start ap[1]*bp[i]
	mov	sp,t7		# t7 = tp
	umulh	t4,t5,t9

	mulq	t6,s5,t10	# start np[1]*m
	s8addq	s4,a1,t4	# &ap[2]
	umulh	t6,s5,t11
.align	4
# Inner loop: same pipeline as .L1st but also adds tp[j] back in.
# The #U0/#U1/#L0/#L1 tags annotate the hand-assigned issue slots
# within each aligned 4-instruction group; do not reorder.
.Linner:
	.set	noreorder
	ldq	t12,8(t7)	#L0
	nop			#U1
	ldq	t4,0(t4)	#L1
	s8addq	s4,a3,t6	#U0

	ldq	t6,0(t6)	#L0
	nop			#U1
	addq	t8,t1,t0	#L1
	lda	t7,8(t7)

	mulq	t4,t5,t8	#U1
	cmpult	t0,t1,AT	#L0
	addq	t10,t3,t2	#L1
	addl	s4,1,s4

	mulq	t6,s5,t10	#U1
	addq	t9,AT,t1	#L0
	addq	t0,t12,t0	#L1
	cmpult	t2,t3,v0	#U0

	umulh	t4,t5,t9	#U1
	cmpult	t0,t12,AT	#L0
	addq	t2,t0,t2	#L1
	addq	t11,v0,t3	#U0

	umulh	t6,s5,t11	#U1
	s8addq	s4,a1,t4	#L0
	cmpult	t2,t0,v0	#L1
	cmplt	s4,a5,t12	#U0	# borrow t12

	addq	t1,AT,t1	#L0
	addq	t3,v0,t3	#U1
	stq	t2,-8(t7)	#L1
	bne	t12,.Linner	#U0
	.set	reorder

	# epilogue of an outer iteration: last column plus the two
	# carried words tp[num] and tp[num+1]
	ldq	t12,8(t7)	# tp[num]
	addq	t8,t1,t0
	addq	t10,t3,t2
	cmpult	t0,t1,AT
	cmpult	t2,t3,v0
	addq	t9,AT,t1
	addq	t11,v0,t3

	addq	t0,t12,t0	# + tp[num]
	cmpult	t0,t12,AT
	addq	t1,AT,t1

	ldq	t12,16(t7)	# tp[num+1]
	addq	t2,t0,s4
	cmpult	s4,t0,v0
	addq	t3,v0,t3

	addq	t3,t1,t2
	stq	s4,0(t7)	# tp[num-1]
	cmpult	t2,t1,t3
	addq	t2,t12,t2	# + tp[num+1]
	cmpult	t2,t12,AT
	addl	s3,1,s3		# i++
	addq	t3,AT,t3
	stq	t2,8(t7)	# new tp[num]
	cmplt	s3,a5,t12	# borrow t12
	stq	t3,16(t7)	# new tp[num+1], overflow bit
	bne	t12,.Louter

	# Final reduction: rp[] = tp[] - np[] with a rippled borrow.
	# Whether rp keeps this difference or tp itself is decided in
	# .Lcopy from the overflow word and the borrow out.
	s8addq	a5,sp,t12	# &tp[num]
	mov	a0,a2		# put rp aside
	mov	sp,t7
	mov	sp,a1
	mov	0,t1		# clear borrow bit

.align	4
.Lsub:	ldq	t0,0(t7)
	ldq	t2,0(a3)
	lda	t7,8(t7)
	lda	a3,8(a3)
	subq	t0,t2,t2	# tp[i]-np[i]
	cmpult	t0,t2,AT	# borrow out of the subtraction
	subq	t2,t1,t0	# - borrow in
	cmpult	t2,t0,t1
	or	t1,AT,t1	# combined borrow out
	stq	t0,0(a0)	# stash difference in rp for now
	cmpult	t7,t12,v0
	lda	a0,8(a0)
	bne	v0,.Lsub

	subq	t3,t1,t1	# handle upmost overflow bit
				# t1==0 <=> overflow word cancels the borrow,
				# i.e. the subtracted result is the right one
	mov	sp,t7
	mov	a2,a0		# restore rp

.align	4
# Select tp (t1!=0) or the difference already in rp (t1==0), word by
# word, zeroing tp as we go — presumably so no intermediates are left
# on the stack.
.Lcopy:	ldq	t4,0(t7)	# conditional copy
	ldq	t6,0(a0)
	lda	t7,8(t7)
	lda	a0,8(a0)
	cmoveq	t1,t6,t4	# if (t1==0) keep the subtracted value
	stq	zero,-8(t7)	# zap tp
	cmpult	t7,t12,AT
	stq	t4,-8(a0)
	bne	AT,.Lcopy
	mov	1,v0		# success

.Lexit:
	.set	noreorder
	mov	fp,sp		# discard the tp area in one move
	/*ldq	ra,0(sp)*/	# no calls were made, ra is still intact
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
# Identification string, embedded as plain data after the function.
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro@openssl.org>"
.align	2
272