/*
 * mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

8 #include "mpi-priv.h"
9
10 static int is_sse = -1;
11 extern unsigned long s_mpi_is_sse2();
12
13 /*
14 * ebp - 36: caller's esi
15 * ebp - 32: caller's edi
16 * ebp - 28:
17 * ebp - 24:
18 * ebp - 20:
19 * ebp - 16:
20 * ebp - 12:
21 * ebp - 8:
22 * ebp - 4:
23 * ebp + 0: caller's ebp
24 * ebp + 4: return address
25 * ebp + 8: a argument
26 * ebp + 12: a_len argument
27 * ebp + 16: b argument
28 * ebp + 20: c argument
29 * registers:
30 * eax:
31 * ebx: carry
32 * ecx: a_len
33 * edx:
34 * esi: a ptr
35 * edi: c ptr
36 */
s_mpv_mul_d(const mp_digit * a,mp_size a_len,mp_digit b,mp_digit * c)37 __declspec(naked) void s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
38 {
39 __asm {
40 mov eax, is_sse
41 cmp eax, 0
42 je s_mpv_mul_d_x86
43 jg s_mpv_mul_d_sse2
44 call s_mpi_is_sse2
45 mov is_sse, eax
46 cmp eax, 0
47 jg s_mpv_mul_d_sse2
48 s_mpv_mul_d_x86:
49 push ebp
50 mov ebp,esp
51 sub esp,28
52 push edi
53 push esi
54 push ebx
55 mov ebx,0 ; carry = 0
56 mov ecx,[ebp+12] ; ecx = a_len
57 mov edi,[ebp+20]
58 cmp ecx,0
59 je L_2 ; jmp if a_len == 0
60 mov esi,[ebp+8] ; esi = a
61 cld
62 L_1:
63 lodsd ; eax = [ds:esi]; esi += 4
64 mov edx,[ebp+16] ; edx = b
65 mul edx ; edx:eax = Phi:Plo = a_i * b
66
67 add eax,ebx ; add carry (ebx) to edx:eax
68 adc edx,0
69 mov ebx,edx ; high half of product becomes next carry
70
71 stosd ; [es:edi] = ax; edi += 4;
72 dec ecx ; --a_len
73 jnz L_1 ; jmp if a_len != 0
74 L_2:
75 mov [edi],ebx ; *c = carry
76 pop ebx
77 pop esi
78 pop edi
79 leave
80 ret
81 nop
82 s_mpv_mul_d_sse2:
83 push ebp
84 mov ebp, esp
85 push edi
86 push esi
87 psubq mm2, mm2 ; carry = 0
88 mov ecx, [ebp+12] ; ecx = a_len
89 movd mm1, [ebp+16] ; mm1 = b
90 mov edi, [ebp+20]
91 cmp ecx, 0
92 je L_6 ; jmp if a_len == 0
93 mov esi, [ebp+8] ; esi = a
94 cld
95 L_5:
96 movd mm0, [esi] ; mm0 = *a++
97 add esi, 4
98 pmuludq mm0, mm1 ; mm0 = b * *a++
99 paddq mm2, mm0 ; add the carry
100 movd [edi], mm2 ; store the 32bit result
101 add edi, 4
102 psrlq mm2, 32 ; save the carry
103 dec ecx ; --a_len
104 jnz L_5 ; jmp if a_len != 0
105 L_6:
106 movd [edi], mm2 ; *c = carry
107 emms
108 pop esi
109 pop edi
110 leave
111 ret
112 nop
113 }
114 }
115
116 /*
117 * ebp - 36: caller's esi
118 * ebp - 32: caller's edi
119 * ebp - 28:
120 * ebp - 24:
121 * ebp - 20:
122 * ebp - 16:
123 * ebp - 12:
124 * ebp - 8:
125 * ebp - 4:
126 * ebp + 0: caller's ebp
127 * ebp + 4: return address
128 * ebp + 8: a argument
129 * ebp + 12: a_len argument
130 * ebp + 16: b argument
131 * ebp + 20: c argument
132 * registers:
133 * eax:
134 * ebx: carry
135 * ecx: a_len
136 * edx:
137 * esi: a ptr
138 * edi: c ptr
139 */
s_mpv_mul_d_add(const mp_digit * a,mp_size a_len,mp_digit b,mp_digit * c)140 __declspec(naked) void s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
141 {
142 __asm {
143 mov eax, is_sse
144 cmp eax, 0
145 je s_mpv_mul_d_add_x86
146 jg s_mpv_mul_d_add_sse2
147 call s_mpi_is_sse2
148 mov is_sse, eax
149 cmp eax, 0
150 jg s_mpv_mul_d_add_sse2
151 s_mpv_mul_d_add_x86:
152 push ebp
153 mov ebp,esp
154 sub esp,28
155 push edi
156 push esi
157 push ebx
158 mov ebx,0 ; carry = 0
159 mov ecx,[ebp+12] ; ecx = a_len
160 mov edi,[ebp+20]
161 cmp ecx,0
162 je L_11 ; jmp if a_len == 0
163 mov esi,[ebp+8] ; esi = a
164 cld
165 L_10:
166 lodsd ; eax = [ds:esi]; esi += 4
167 mov edx,[ebp+16] ; edx = b
168 mul edx ; edx:eax = Phi:Plo = a_i * b
169
170 add eax,ebx ; add carry (ebx) to edx:eax
171 adc edx,0
172 mov ebx,[edi] ; add in current word from *c
173 add eax,ebx
174 adc edx,0
175 mov ebx,edx ; high half of product becomes next carry
176
177 stosd ; [es:edi] = ax; edi += 4;
178 dec ecx ; --a_len
179 jnz L_10 ; jmp if a_len != 0
180 L_11:
181 mov [edi],ebx ; *c = carry
182 pop ebx
183 pop esi
184 pop edi
185 leave
186 ret
187 nop
188 s_mpv_mul_d_add_sse2:
189 push ebp
190 mov ebp, esp
191 push edi
192 push esi
193 psubq mm2, mm2 ; carry = 0
194 mov ecx, [ebp+12] ; ecx = a_len
195 movd mm1, [ebp+16] ; mm1 = b
196 mov edi, [ebp+20]
197 cmp ecx, 0
198 je L_16 ; jmp if a_len == 0
199 mov esi, [ebp+8] ; esi = a
200 cld
201 L_15:
202 movd mm0, [esi] ; mm0 = *a++
203 add esi, 4
204 pmuludq mm0, mm1 ; mm0 = b * *a++
205 paddq mm2, mm0 ; add the carry
206 movd mm0, [edi]
207 paddq mm2, mm0 ; add the carry
208 movd [edi], mm2 ; store the 32bit result
209 add edi, 4
210 psrlq mm2, 32 ; save the carry
211 dec ecx ; --a_len
212 jnz L_15 ; jmp if a_len != 0
213 L_16:
214 movd [edi], mm2 ; *c = carry
215 emms
216 pop esi
217 pop edi
218 leave
219 ret
220 nop
221 }
222 }
223
224 /*
225 * ebp - 36: caller's esi
226 * ebp - 32: caller's edi
227 * ebp - 28:
228 * ebp - 24:
229 * ebp - 20:
230 * ebp - 16:
231 * ebp - 12:
232 * ebp - 8:
233 * ebp - 4:
234 * ebp + 0: caller's ebp
235 * ebp + 4: return address
236 * ebp + 8: a argument
237 * ebp + 12: a_len argument
238 * ebp + 16: b argument
239 * ebp + 20: c argument
240 * registers:
241 * eax:
242 * ebx: carry
243 * ecx: a_len
244 * edx:
245 * esi: a ptr
246 * edi: c ptr
247 */
s_mpv_mul_d_add_prop(const mp_digit * a,mp_size a_len,mp_digit b,mp_digit * c)248 __declspec(naked) void s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
249 {
250 __asm {
251 mov eax, is_sse
252 cmp eax, 0
253 je s_mpv_mul_d_add_prop_x86
254 jg s_mpv_mul_d_add_prop_sse2
255 call s_mpi_is_sse2
256 mov is_sse, eax
257 cmp eax, 0
258 jg s_mpv_mul_d_add_prop_sse2
259 s_mpv_mul_d_add_prop_x86:
260 push ebp
261 mov ebp,esp
262 sub esp,28
263 push edi
264 push esi
265 push ebx
266 mov ebx,0 ; carry = 0
267 mov ecx,[ebp+12] ; ecx = a_len
268 mov edi,[ebp+20]
269 cmp ecx,0
270 je L_21 ; jmp if a_len == 0
271 cld
272 mov esi,[ebp+8] ; esi = a
273 L_20:
274 lodsd ; eax = [ds:esi]; esi += 4
275 mov edx,[ebp+16] ; edx = b
276 mul edx ; edx:eax = Phi:Plo = a_i * b
277
278 add eax,ebx ; add carry (ebx) to edx:eax
279 adc edx,0
280 mov ebx,[edi] ; add in current word from *c
281 add eax,ebx
282 adc edx,0
283 mov ebx,edx ; high half of product becomes next carry
284
285 stosd ; [es:edi] = ax; edi += 4;
286 dec ecx ; --a_len
287 jnz L_20 ; jmp if a_len != 0
288 L_21:
289 cmp ebx,0 ; is carry zero?
290 jz L_23
291 mov eax,[edi] ; add in current word from *c
292 add eax,ebx
293 stosd ; [es:edi] = ax; edi += 4;
294 jnc L_23
295 L_22:
296 mov eax,[edi] ; add in current word from *c
297 adc eax,0
298 stosd ; [es:edi] = ax; edi += 4;
299 jc L_22
300 L_23:
301 pop ebx
302 pop esi
303 pop edi
304 leave
305 ret
306 nop
307 s_mpv_mul_d_add_prop_sse2:
308 push ebp
309 mov ebp, esp
310 push edi
311 push esi
312 push ebx
313 psubq mm2, mm2 ; carry = 0
314 mov ecx, [ebp+12] ; ecx = a_len
315 movd mm1, [ebp+16] ; mm1 = b
316 mov edi, [ebp+20]
317 cmp ecx, 0
318 je L_26 ; jmp if a_len == 0
319 mov esi, [ebp+8] ; esi = a
320 cld
321 L_25:
322 movd mm0, [esi] ; mm0 = *a++
323 movd mm3, [edi] ; fetch the sum
324 add esi, 4
325 pmuludq mm0, mm1 ; mm0 = b * *a++
326 paddq mm2, mm0 ; add the carry
327 paddq mm2, mm3 ; add *c++
328 movd [edi], mm2 ; store the 32bit result
329 add edi, 4
330 psrlq mm2, 32 ; save the carry
331 dec ecx ; --a_len
332 jnz L_25 ; jmp if a_len != 0
333 L_26:
334 movd ebx, mm2
335 cmp ebx, 0 ; is carry zero?
336 jz L_28
337 mov eax, [edi]
338 add eax, ebx
339 stosd
340 jnc L_28
341 L_27:
342 mov eax, [edi] ; add in current word from *c
343 adc eax, 0
344 stosd ; [es:edi] = ax; edi += 4;
345 jc L_27
346 L_28:
347 emms
348 pop ebx
349 pop esi
350 pop edi
351 leave
352 ret
353 nop
354 }
355 }
356
357 /*
358 * ebp - 20: caller's esi
359 * ebp - 16: caller's edi
360 * ebp - 12:
361 * ebp - 8: carry
362 * ebp - 4: a_len local
363 * ebp + 0: caller's ebp
364 * ebp + 4: return address
365 * ebp + 8: pa argument
366 * ebp + 12: a_len argument
367 * ebp + 16: ps argument
368 * ebp + 20:
369 * registers:
370 * eax:
371 * ebx: carry
372 * ecx: a_len
373 * edx:
374 * esi: a ptr
375 * edi: c ptr
376 */
s_mpv_sqr_add_prop(const mp_digit * a,mp_size a_len,mp_digit * sqrs)377 __declspec(naked) void s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs)
378 {
379 __asm {
380 mov eax, is_sse
381 cmp eax, 0
382 je s_mpv_sqr_add_prop_x86
383 jg s_mpv_sqr_add_prop_sse2
384 call s_mpi_is_sse2
385 mov is_sse, eax
386 cmp eax, 0
387 jg s_mpv_sqr_add_prop_sse2
388 s_mpv_sqr_add_prop_x86:
389 push ebp
390 mov ebp,esp
391 sub esp,12
392 push edi
393 push esi
394 push ebx
395 mov ebx,0 ; carry = 0
396 mov ecx,[ebp+12] ; a_len
397 mov edi,[ebp+16] ; edi = ps
398 cmp ecx,0
399 je L_31 ; jump if a_len == 0
400 cld
401 mov esi,[ebp+8] ; esi = pa
402 L_30:
403 lodsd ; eax = [ds:si]; si += 4;
404 mul eax
405
406 add eax,ebx ; add "carry"
407 adc edx,0
408 mov ebx,[edi]
409 add eax,ebx ; add low word from result
410 mov ebx,[edi+4]
411 stosd ; [es:di] = eax; di += 4;
412 adc edx,ebx ; add high word from result
413 mov ebx,0
414 mov eax,edx
415 adc ebx,0
416 stosd ; [es:di] = eax; di += 4;
417 dec ecx ; --a_len
418 jnz L_30 ; jmp if a_len != 0
419 L_31:
420 cmp ebx,0 ; is carry zero?
421 jz L_34
422 mov eax,[edi] ; add in current word from *c
423 add eax,ebx
424 stosd ; [es:edi] = ax; edi += 4;
425 jnc L_34
426 L_32:
427 mov eax,[edi] ; add in current word from *c
428 adc eax,0
429 stosd ; [es:edi] = ax; edi += 4;
430 jc L_32
431 L_34:
432 pop ebx
433 pop esi
434 pop edi
435 leave
436 ret
437 nop
438 s_mpv_sqr_add_prop_sse2:
439 push ebp
440 mov ebp, esp
441 push edi
442 push esi
443 push ebx
444 psubq mm2, mm2 ; carry = 0
445 mov ecx, [ebp+12] ; ecx = a_len
446 mov edi, [ebp+16]
447 cmp ecx, 0
448 je L_36 ; jmp if a_len == 0
449 mov esi, [ebp+8] ; esi = a
450 cld
451 L_35:
452 movd mm0, [esi] ; mm0 = *a
453 movd mm3, [edi] ; fetch the sum
454 add esi, 4
455 pmuludq mm0, mm0 ; mm0 = sqr(a)
456 paddq mm2, mm0 ; add the carry
457 paddq mm2, mm3 ; add the low word
458 movd mm3, [edi+4]
459 movd [edi], mm2 ; store the 32bit result
460 psrlq mm2, 32
461 paddq mm2, mm3 ; add the high word
462 movd [edi+4], mm2 ; store the 32bit result
463 psrlq mm2, 32 ; save the carry.
464 add edi, 8
465 dec ecx ; --a_len
466 jnz L_35 ; jmp if a_len != 0
467 L_36:
468 movd ebx, mm2
469 cmp ebx, 0 ; is carry zero?
470 jz L_38
471 mov eax, [edi]
472 add eax, ebx
473 stosd
474 jnc L_38
475 L_37:
476 mov eax, [edi] ; add in current word from *c
477 adc eax, 0
478 stosd ; [es:edi] = ax; edi += 4;
479 jc L_37
480 L_38:
481 emms
482 pop ebx
483 pop esi
484 pop edi
485 leave
486 ret
487 nop
488 }
489 }
490
491 /*
492 * Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
493 * so its high bit is 1. This code is from NSPR.
494 *
495 * Dump of assembler code for function s_mpv_div_2dx1d:
496 *
497 * esp + 0: Caller's ebx
498 * esp + 4: return address
499 * esp + 8: Nhi argument
500 * esp + 12: Nlo argument
501 * esp + 16: divisor argument
502 * esp + 20: qp argument
503 * esp + 24: rp argument
504 * registers:
505 * eax:
506 * ebx: carry
507 * ecx: a_len
508 * edx:
509 * esi: a ptr
510 * edi: c ptr
511 */
512 __declspec(naked) mp_err
s_mpv_div_2dx1d(mp_digit Nhi,mp_digit Nlo,mp_digit divisor,mp_digit * qp,mp_digit * rp)513 s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
514 mp_digit *qp, mp_digit *rp)
515 {
516 __asm {
517 push ebx
518 mov edx,[esp+8]
519 mov eax,[esp+12]
520 mov ebx,[esp+16]
521 div ebx
522 mov ebx,[esp+20]
523 mov [ebx],eax
524 mov ebx,[esp+24]
525 mov [ebx],edx
526 xor eax,eax ; return zero
527 pop ebx
528 ret
529 nop
530 }
531 }
532