1 /*
2  *  mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions.
3  *
4  * This Source Code Form is subject to the terms of the Mozilla Public
5  * License, v. 2.0. If a copy of the MPL was not distributed with this
6  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 
8 #include "mpi-priv.h"
9 
/*
 * Cached answer to "may the MMX/SSE2 code paths be used?", consulted by the
 * dispatch preamble of the s_mpv_ multiply/square routines in this file:
 *
 *    < 0   not probed yet (initial state); the next routine that runs calls
 *          s_mpi_is_sse2() and stores its return value here
 *      0   probed: use the plain x86 integer code path
 *    > 0   probed: use the MMX/SSE2 code path
 */
static int is_sse = -1;

/*
 * CPU feature probe, defined outside this file.  Declared with a real
 * prototype: in C an empty parameter list leaves the arguments unspecified.
 */
extern unsigned long s_mpi_is_sse2(void);
12 
13 /*
14  *   ebp - 36:  caller's esi
15  *   ebp - 32:  caller's edi
16  *   ebp - 28:
17  *   ebp - 24:
18  *   ebp - 20:
19  *   ebp - 16:
20  *   ebp - 12:
21  *   ebp - 8:
22  *   ebp - 4:
23  *   ebp + 0:   caller's ebp
24  *   ebp + 4:   return address
25  *   ebp + 8:   a   argument
26  *   ebp + 12:  a_len   argument
27  *   ebp + 16:  b   argument
28  *   ebp + 20:  c   argument
29  *   registers:
30  *      eax:
31  *  ebx:    carry
32  *  ecx:    a_len
33  *  edx:
34  *  esi:    a ptr
35  *  edi:    c ptr
36  */
s_mpv_mul_d(const mp_digit * a,mp_size a_len,mp_digit b,mp_digit * c)37 __declspec(naked) void s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
38 {
39     __asm {
40     mov    eax, is_sse
41     cmp    eax, 0
42     je     s_mpv_mul_d_x86
43     jg     s_mpv_mul_d_sse2
44     call   s_mpi_is_sse2
45     mov    is_sse, eax
46     cmp    eax, 0
47     jg     s_mpv_mul_d_sse2
48 s_mpv_mul_d_x86:
49     push   ebp
50     mov    ebp,esp
51     sub    esp,28
52     push   edi
53     push   esi
54     push   ebx
55     mov    ebx,0        ; carry = 0
56     mov    ecx,[ebp+12]     ; ecx = a_len
57     mov    edi,[ebp+20]
58     cmp    ecx,0
59     je     L_2          ; jmp if a_len == 0
60     mov    esi,[ebp+8]      ; esi = a
61     cld
62 L_1:
63     lodsd           ; eax = [ds:esi]; esi += 4
64     mov    edx,[ebp+16]     ; edx = b
65     mul    edx          ; edx:eax = Phi:Plo = a_i * b
66 
67     add    eax,ebx      ; add carry (ebx) to edx:eax
68     adc    edx,0
69     mov    ebx,edx      ; high half of product becomes next carry
70 
71     stosd           ; [es:edi] = ax; edi += 4;
72     dec    ecx          ; --a_len
73     jnz    L_1          ; jmp if a_len != 0
74 L_2:
75     mov    [edi],ebx        ; *c = carry
76     pop    ebx
77     pop    esi
78     pop    edi
79     leave
80     ret
81     nop
82 s_mpv_mul_d_sse2:
83     push   ebp
84     mov    ebp, esp
85     push   edi
86     push   esi
87     psubq  mm2, mm2     ; carry = 0
88     mov    ecx, [ebp+12]    ; ecx = a_len
89     movd   mm1, [ebp+16]    ; mm1 = b
90     mov    edi, [ebp+20]
91     cmp    ecx, 0
92     je     L_6          ; jmp if a_len == 0
93     mov    esi, [ebp+8]     ; esi = a
94     cld
95 L_5:
96     movd   mm0, [esi]       ; mm0 = *a++
97     add    esi, 4
98     pmuludq mm0, mm1        ; mm0 = b * *a++
99     paddq  mm2, mm0     ; add the carry
100     movd   [edi], mm2       ; store the 32bit result
101     add    edi, 4
102     psrlq  mm2, 32      ; save the carry
103     dec    ecx          ; --a_len
104     jnz    L_5          ; jmp if a_len != 0
105 L_6:
106     movd   [edi], mm2       ; *c = carry
107     emms
108     pop    esi
109     pop    edi
110     leave
111     ret
112     nop
113     }
114 }
115 
116 /*
117  *   ebp - 36:  caller's esi
118  *   ebp - 32:  caller's edi
119  *   ebp - 28:
120  *   ebp - 24:
121  *   ebp - 20:
122  *   ebp - 16:
123  *   ebp - 12:
124  *   ebp - 8:
125  *   ebp - 4:
126  *   ebp + 0:   caller's ebp
127  *   ebp + 4:   return address
128  *   ebp + 8:   a   argument
129  *   ebp + 12:  a_len   argument
130  *   ebp + 16:  b   argument
131  *   ebp + 20:  c   argument
132  *   registers:
133  *      eax:
134  *  ebx:    carry
135  *  ecx:    a_len
136  *  edx:
137  *  esi:    a ptr
138  *  edi:    c ptr
139  */
s_mpv_mul_d_add(const mp_digit * a,mp_size a_len,mp_digit b,mp_digit * c)140 __declspec(naked) void s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
141 {
142     __asm {
143     mov    eax, is_sse
144     cmp    eax, 0
145     je     s_mpv_mul_d_add_x86
146     jg     s_mpv_mul_d_add_sse2
147     call   s_mpi_is_sse2
148     mov    is_sse, eax
149     cmp    eax, 0
150     jg     s_mpv_mul_d_add_sse2
151 s_mpv_mul_d_add_x86:
152     push   ebp
153     mov    ebp,esp
154     sub    esp,28
155     push   edi
156     push   esi
157     push   ebx
158     mov    ebx,0        ; carry = 0
159     mov    ecx,[ebp+12]     ; ecx = a_len
160     mov    edi,[ebp+20]
161     cmp    ecx,0
162     je     L_11         ; jmp if a_len == 0
163     mov    esi,[ebp+8]      ; esi = a
164     cld
165 L_10:
166     lodsd           ; eax = [ds:esi]; esi += 4
167     mov    edx,[ebp+16]     ; edx = b
168     mul    edx          ; edx:eax = Phi:Plo = a_i * b
169 
170     add    eax,ebx      ; add carry (ebx) to edx:eax
171     adc    edx,0
172     mov    ebx,[edi]        ; add in current word from *c
173     add    eax,ebx
174     adc    edx,0
175     mov    ebx,edx      ; high half of product becomes next carry
176 
177     stosd           ; [es:edi] = ax; edi += 4;
178     dec    ecx          ; --a_len
179     jnz    L_10         ; jmp if a_len != 0
180 L_11:
181     mov    [edi],ebx        ; *c = carry
182     pop    ebx
183     pop    esi
184     pop    edi
185     leave
186     ret
187     nop
188 s_mpv_mul_d_add_sse2:
189     push   ebp
190     mov    ebp, esp
191     push   edi
192     push   esi
193     psubq  mm2, mm2     ; carry = 0
194     mov    ecx, [ebp+12]    ; ecx = a_len
195     movd   mm1, [ebp+16]    ; mm1 = b
196     mov    edi, [ebp+20]
197     cmp    ecx, 0
198     je     L_16         ; jmp if a_len == 0
199     mov    esi, [ebp+8]     ; esi = a
200     cld
201 L_15:
202     movd   mm0, [esi]       ; mm0 = *a++
203     add    esi, 4
204     pmuludq mm0, mm1        ; mm0 = b * *a++
205     paddq  mm2, mm0     ; add the carry
206     movd   mm0, [edi]
207     paddq  mm2, mm0     ; add the carry
208     movd   [edi], mm2       ; store the 32bit result
209     add    edi, 4
210     psrlq  mm2, 32      ; save the carry
211     dec    ecx          ; --a_len
212     jnz    L_15         ; jmp if a_len != 0
213 L_16:
214     movd   [edi], mm2       ; *c = carry
215     emms
216     pop    esi
217     pop    edi
218     leave
219     ret
220     nop
221     }
222 }
223 
224 /*
225  *   ebp - 36:  caller's esi
226  *   ebp - 32:  caller's edi
227  *   ebp - 28:
228  *   ebp - 24:
229  *   ebp - 20:
230  *   ebp - 16:
231  *   ebp - 12:
232  *   ebp - 8:
233  *   ebp - 4:
234  *   ebp + 0:   caller's ebp
235  *   ebp + 4:   return address
236  *   ebp + 8:   a   argument
237  *   ebp + 12:  a_len   argument
238  *   ebp + 16:  b   argument
239  *   ebp + 20:  c   argument
240  *   registers:
241  *      eax:
242  *  ebx:    carry
243  *  ecx:    a_len
244  *  edx:
245  *  esi:    a ptr
246  *  edi:    c ptr
247  */
s_mpv_mul_d_add_prop(const mp_digit * a,mp_size a_len,mp_digit b,mp_digit * c)248 __declspec(naked) void s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
249 {
250     __asm {
251     mov    eax, is_sse
252     cmp    eax, 0
253     je     s_mpv_mul_d_add_prop_x86
254     jg     s_mpv_mul_d_add_prop_sse2
255     call   s_mpi_is_sse2
256     mov    is_sse, eax
257     cmp    eax, 0
258     jg     s_mpv_mul_d_add_prop_sse2
259 s_mpv_mul_d_add_prop_x86:
260     push   ebp
261     mov    ebp,esp
262     sub    esp,28
263     push   edi
264     push   esi
265     push   ebx
266     mov    ebx,0        ; carry = 0
267     mov    ecx,[ebp+12]     ; ecx = a_len
268     mov    edi,[ebp+20]
269     cmp    ecx,0
270     je     L_21         ; jmp if a_len == 0
271     cld
272     mov    esi,[ebp+8]      ; esi = a
273 L_20:
274     lodsd           ; eax = [ds:esi]; esi += 4
275     mov    edx,[ebp+16]     ; edx = b
276     mul    edx          ; edx:eax = Phi:Plo = a_i * b
277 
278     add    eax,ebx      ; add carry (ebx) to edx:eax
279     adc    edx,0
280     mov    ebx,[edi]        ; add in current word from *c
281     add    eax,ebx
282     adc    edx,0
283     mov    ebx,edx      ; high half of product becomes next carry
284 
285     stosd           ; [es:edi] = ax; edi += 4;
286     dec    ecx          ; --a_len
287     jnz    L_20         ; jmp if a_len != 0
288 L_21:
289     cmp    ebx,0        ; is carry zero?
290     jz     L_23
291     mov    eax,[edi]        ; add in current word from *c
292     add    eax,ebx
293     stosd           ; [es:edi] = ax; edi += 4;
294     jnc    L_23
295 L_22:
296     mov    eax,[edi]        ; add in current word from *c
297     adc    eax,0
298     stosd           ; [es:edi] = ax; edi += 4;
299     jc     L_22
300 L_23:
301     pop    ebx
302     pop    esi
303     pop    edi
304     leave
305     ret
306     nop
307 s_mpv_mul_d_add_prop_sse2:
308     push   ebp
309     mov    ebp, esp
310     push   edi
311     push   esi
312     push   ebx
313     psubq  mm2, mm2     ; carry = 0
314     mov    ecx, [ebp+12]    ; ecx = a_len
315     movd   mm1, [ebp+16]    ; mm1 = b
316     mov    edi, [ebp+20]
317     cmp    ecx, 0
318     je     L_26         ; jmp if a_len == 0
319     mov    esi, [ebp+8]     ; esi = a
320     cld
321 L_25:
322     movd   mm0, [esi]       ; mm0 = *a++
323     movd   mm3, [edi]       ; fetch the sum
324     add    esi, 4
325     pmuludq mm0, mm1        ; mm0 = b * *a++
326     paddq  mm2, mm0     ; add the carry
327     paddq  mm2, mm3     ; add *c++
328     movd   [edi], mm2       ; store the 32bit result
329     add    edi, 4
330     psrlq  mm2, 32      ; save the carry
331     dec    ecx          ; --a_len
332     jnz    L_25         ; jmp if a_len != 0
333 L_26:
334     movd   ebx, mm2
335     cmp    ebx, 0       ; is carry zero?
336     jz     L_28
337     mov    eax, [edi]
338     add    eax, ebx
339     stosd
340     jnc    L_28
341 L_27:
342     mov    eax, [edi]       ; add in current word from *c
343     adc    eax, 0
344     stosd           ; [es:edi] = ax; edi += 4;
345     jc     L_27
346 L_28:
347     emms
348     pop    ebx
349     pop    esi
350     pop    edi
351     leave
352     ret
353     nop
354     }
355 }
356 
357 /*
358  *   ebp - 20:  caller's esi
359  *   ebp - 16:  caller's edi
360  *   ebp - 12:
361  *   ebp - 8:   carry
362  *   ebp - 4:   a_len   local
363  *   ebp + 0:   caller's ebp
364  *   ebp + 4:   return address
365  *   ebp + 8:   pa  argument
366  *   ebp + 12:  a_len   argument
367  *   ebp + 16:  ps  argument
368  *   ebp + 20:
369  *   registers:
370  *      eax:
371  *  ebx:    carry
372  *  ecx:    a_len
373  *  edx:
374  *  esi:    a ptr
375  *  edi:    c ptr
376  */
s_mpv_sqr_add_prop(const mp_digit * a,mp_size a_len,mp_digit * sqrs)377 __declspec(naked) void s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs)
378 {
379     __asm {
380      mov    eax, is_sse
381      cmp    eax, 0
382      je     s_mpv_sqr_add_prop_x86
383      jg     s_mpv_sqr_add_prop_sse2
384      call   s_mpi_is_sse2
385      mov    is_sse, eax
386      cmp    eax, 0
387      jg     s_mpv_sqr_add_prop_sse2
388 s_mpv_sqr_add_prop_x86:
389      push   ebp
390      mov    ebp,esp
391      sub    esp,12
392      push   edi
393      push   esi
394      push   ebx
395      mov    ebx,0       ; carry = 0
396      mov    ecx,[ebp+12]    ; a_len
397      mov    edi,[ebp+16]    ; edi = ps
398      cmp    ecx,0
399      je     L_31        ; jump if a_len == 0
400      cld
401      mov    esi,[ebp+8]     ; esi = pa
402 L_30:
403      lodsd          ; eax = [ds:si]; si += 4;
404      mul    eax
405 
406      add    eax,ebx     ; add "carry"
407      adc    edx,0
408      mov    ebx,[edi]
409      add    eax,ebx     ; add low word from result
410      mov    ebx,[edi+4]
411      stosd          ; [es:di] = eax; di += 4;
412      adc    edx,ebx     ; add high word from result
413      mov    ebx,0
414      mov    eax,edx
415      adc    ebx,0
416      stosd          ; [es:di] = eax; di += 4;
417      dec    ecx         ; --a_len
418      jnz    L_30        ; jmp if a_len != 0
419 L_31:
420     cmp    ebx,0        ; is carry zero?
421     jz     L_34
422     mov    eax,[edi]        ; add in current word from *c
423     add    eax,ebx
424     stosd           ; [es:edi] = ax; edi += 4;
425     jnc    L_34
426 L_32:
427     mov    eax,[edi]        ; add in current word from *c
428     adc    eax,0
429     stosd           ; [es:edi] = ax; edi += 4;
430     jc     L_32
431 L_34:
432     pop    ebx
433     pop    esi
434     pop    edi
435     leave
436     ret
437     nop
438 s_mpv_sqr_add_prop_sse2:
439     push   ebp
440     mov    ebp, esp
441     push   edi
442     push   esi
443     push   ebx
444     psubq  mm2, mm2     ; carry = 0
445     mov    ecx, [ebp+12]    ; ecx = a_len
446     mov    edi, [ebp+16]
447     cmp    ecx, 0
448     je     L_36     ; jmp if a_len == 0
449     mov    esi, [ebp+8]     ; esi = a
450     cld
451 L_35:
452     movd   mm0, [esi]       ; mm0 = *a
453     movd   mm3, [edi]       ; fetch the sum
454     add    esi, 4
455     pmuludq mm0, mm0        ; mm0 = sqr(a)
456     paddq  mm2, mm0     ; add the carry
457     paddq  mm2, mm3     ; add the low word
458     movd   mm3, [edi+4]
459     movd   [edi], mm2       ; store the 32bit result
460     psrlq  mm2, 32
461     paddq  mm2, mm3     ; add the high word
462     movd   [edi+4], mm2     ; store the 32bit result
463     psrlq  mm2, 32      ; save the carry.
464     add    edi, 8
465     dec    ecx          ; --a_len
466     jnz    L_35         ; jmp if a_len != 0
467 L_36:
468     movd   ebx, mm2
469     cmp    ebx, 0       ; is carry zero?
470     jz     L_38
471     mov    eax, [edi]
472     add    eax, ebx
473     stosd
474     jnc    L_38
475 L_37:
476     mov    eax, [edi]       ; add in current word from *c
477     adc    eax, 0
478     stosd           ; [es:edi] = ax; edi += 4;
479     jc     L_37
480 L_38:
481     emms
482     pop    ebx
483     pop    esi
484     pop    edi
485     leave
486     ret
487     nop
488     }
489 }
490 
491 /*
492  *  Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
493  *  so its high bit is 1.   This code is from NSPR.
494  *
495  *  Dump of assembler code for function s_mpv_div_2dx1d:
496  *
497  *   esp +  0:   Caller's ebx
498  *   esp +  4:  return address
499  *   esp +  8:  Nhi argument
500  *   esp + 12:  Nlo argument
501  *   esp + 16:  divisor argument
502  *   esp + 20:  qp  argument
503  *   esp + 24:   rp argument
504  *   registers:
505  *      eax:
506  *  ebx:    carry
507  *  ecx:    a_len
508  *  edx:
509  *  esi:    a ptr
510  *  edi:    c ptr
511  */
512 __declspec(naked) mp_err
s_mpv_div_2dx1d(mp_digit Nhi,mp_digit Nlo,mp_digit divisor,mp_digit * qp,mp_digit * rp)513     s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
514                     mp_digit *qp, mp_digit *rp)
515 {
516     __asm {
517        push   ebx
518        mov    edx,[esp+8]
519        mov    eax,[esp+12]
520        mov    ebx,[esp+16]
521        div    ebx
522        mov    ebx,[esp+20]
523        mov    [ebx],eax
524        mov    ebx,[esp+24]
525        mov    [ebx],edx
526        xor    eax,eax       ; return zero
527        pop    ebx
528        ret
529        nop
530     }
531 }
532