#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

.data
.align 4
#
# is_sse caches the answer to "may the SSE2 code paths be used?".
# Every routine below reads it on entry and branches on its sign:
#   negative (initial value -1): not probed yet; the routine calls
#                                _s_mpi_is_sse2 and stores the result here
#   zero:                        use the plain x86 integer instructions
#   positive:                    use the SSE2 (pmuludq) instructions
#
.type	is_sse,@object
.size	is_sse,4
is_sse: .long	-1

#
# GET var,reg / PUT reg,var: load / store a 32-bit variable by name.
#
# Two addressing styles were written for these macros; only the absolute
# one is active:
#   - absolute (active): the variable is addressed directly by its symbol.
#     NOTE(review): in a position-independent build this presumably costs
#     a text relocation -- confirm against the build flags.
#   - GOT-relative (disabled, kept below for reference): uses
#     \var@GOTOFF(%ebx) and is therefore only valid while %ebx holds the
#     GOT address.  The routines visible in this file never load %ebx
#     before using GET, so enabling it relies on the caller's %ebx.
#
# This file seems to be exclusively linux right now (solaris uses
# mpi_i86pc.s and windows uses mpi_x86_asm.c).
#
#.ifndef NO_PIC
#.macro GET   var,reg
#    movl   \var@GOTOFF(%ebx),\reg
#.endm
#.macro PUT   reg,var
#    movl   \reg,\var@GOTOFF(%ebx)
#.endm
#.else
.macro GET   var,reg
    movl   \var,\reg
.endm
.macro PUT   reg,var
    movl   \reg,\var
.endm
#.endif

.text

#
# _s_mpv_mul_d(a, a_len, b, c)
#
# Multiplies the a_len-word vector a by the single word b and stores the
# (a_len + 1)-word result at c: a_len low words followed by the final
# carry word.  With a_len == 0 only a zero carry word is stored at c[0].
#
# On entry the routine dispatches on is_sse (probing it through
# _s_mpi_is_sse2 the first time) to the x86 or the SSE2 implementation.
# The SSE2 implementation works in MMX registers and executes emms
# before returning.
#
# Frame (both implementations):
#  ebp - 12:	caller's ebx (x86 implementation only)
#  ebp - 8:	caller's esi
#  ebp - 4:	caller's edi
#  ebp + 0:	caller's ebp
#  ebp + 4:	return address
#  ebp + 8:	a	argument
#  ebp + 12:	a_len	argument
#  ebp + 16:	b	argument
#  ebp + 20:	c	argument
# Registers, x86 implementation:
#	eax:	low half of a_i * b
#	ebx:	carry
#	ecx:	a_len (words still to do)
#	edx:	high half of a_i * b
#	esi:	a ptr
#	edi:	c ptr
# Registers, SSE2 implementation:
#	mm0:	a_i, then a_i * b
#	mm1:	b
#	mm2:	running sum / carry
#	ecx, esi, edi as above
#
.globl	_s_mpv_mul_d
.type	_s_mpv_mul_d,@function
_s_mpv_mul_d:
    GET    is_sse,%eax
    test   %eax,%eax
    je     _s_mpv_mul_d_x86		# cached answer: no SSE2
    jg     _s_mpv_mul_d_sse2		# cached answer: SSE2 usable
    # Not probed yet.  Pad the stack so that, if our caller kept %esp
    # 16-byte aligned at its call, the callee sees the same alignment.
    sub    $12,%esp
    call   _s_mpi_is_sse2
    add    $12,%esp
    PUT    %eax,is_sse			# remember the answer
    test   %eax,%eax
    jg     _s_mpv_mul_d_sse2
_s_mpv_mul_d_x86:
    push   %ebp
    mov    %esp,%ebp
    push   %edi
    push   %esi
    push   %ebx
    xor    %ebx,%ebx		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    mov    20(%ebp),%edi	# edi = c
    test   %ecx,%ecx
    je     2f			# jmp if a_len == 0
    mov    8(%ebp),%esi		# esi = a
    cld				# lodsl/stosl must step upwards
1:
    lodsl			# eax = [ds:esi]; esi += 4
    mull   16(%ebp)		# edx:eax = Phi:Plo = a_i * b

    add    %ebx,%eax		# add carry (%ebx) to edx:eax
    adc    $0,%edx
    mov    %edx,%ebx		# high half of product becomes next carry

    stosl			# [es:edi] = eax; edi += 4;
    dec    %ecx			# --a_len
    jnz    1b			# jmp if a_len != 0
2:
    mov    %ebx,0(%edi)		# *c = carry
    pop    %ebx
    pop    %esi
    pop    %edi
    leave
    ret
    nop
_s_mpv_mul_d_sse2:
    push   %ebp
    mov    %esp,%ebp
    push   %edi
    push   %esi
    psubq  %mm2,%mm2		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    movd   16(%ebp),%mm1	# mm1 = b
    mov    20(%ebp),%edi	# edi = c
    test   %ecx,%ecx
    je     6f			# jmp if a_len == 0
    mov    8(%ebp),%esi		# esi = a
5:
    movd   0(%esi),%mm0		# mm0 = *a++
    add    $4,%esi
    pmuludq %mm1,%mm0		# mm0 = b * a_i (64-bit product)
    paddq  %mm0,%mm2		# add the carry
    movd   %mm2,0(%edi)		# store the low 32 bits of the sum
    add    $4,%edi
    psrlq  $32,%mm2		# high 32 bits become the next carry
    dec    %ecx			# --a_len
    jnz    5b			# jmp if a_len != 0
6:
    movd   %mm2,0(%edi)		# *c = carry
    emms			# leave MMX state before returning
    pop    %esi
    pop    %edi
    leave
    ret
    nop
.size	_s_mpv_mul_d,.-_s_mpv_mul_d

#
# _s_mpv_mul_d_add(a, a_len, b, c)
#
# Multiply-accumulate: for each of the a_len words, c_i = c_i + a_i * b
# plus the carry from the previous word.  The final carry is then STORED
# (not added) into the word that follows, c[a_len].  With a_len == 0 only
# a zero word is stored at c[0].
#
# On entry the routine dispatches on is_sse (probing it through
# _s_mpi_is_sse2 the first time) to the x86 or the SSE2 implementation.
# The SSE2 implementation works in MMX registers and executes emms
# before returning.
#
# Frame (both implementations):
#  ebp - 12:	caller's ebx (x86 implementation only)
#  ebp - 8:	caller's esi
#  ebp - 4:	caller's edi
#  ebp + 0:	caller's ebp
#  ebp + 4:	return address
#  ebp + 8:	a	argument
#  ebp + 12:	a_len	argument
#  ebp + 16:	b	argument
#  ebp + 20:	c	argument
# Registers, x86 implementation:
#	eax:	low half of a_i * b
#	ebx:	carry
#	ecx:	a_len (words still to do)
#	edx:	high half of a_i * b
#	esi:	a ptr
#	edi:	c ptr
# Registers, SSE2 implementation:
#	mm0:	a_i * b, then the current word of c
#	mm1:	b
#	mm2:	running sum / carry
#	ecx, esi, edi as above
#
.globl	_s_mpv_mul_d_add
.type	_s_mpv_mul_d_add,@function
_s_mpv_mul_d_add:
    GET    is_sse,%eax
    test   %eax,%eax
    je     _s_mpv_mul_d_add_x86		# cached answer: no SSE2
    jg     _s_mpv_mul_d_add_sse2	# cached answer: SSE2 usable
    # Not probed yet.  Pad the stack so that, if our caller kept %esp
    # 16-byte aligned at its call, the callee sees the same alignment.
    sub    $12,%esp
    call   _s_mpi_is_sse2
    add    $12,%esp
    PUT    %eax,is_sse			# remember the answer
    test   %eax,%eax
    jg     _s_mpv_mul_d_add_sse2
_s_mpv_mul_d_add_x86:
    push   %ebp
    mov    %esp,%ebp
    push   %edi
    push   %esi
    push   %ebx
    xor    %ebx,%ebx		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    mov    20(%ebp),%edi	# edi = c
    test   %ecx,%ecx
    je     11f			# jmp if a_len == 0
    mov    8(%ebp),%esi		# esi = a
    cld				# lodsl/stosl must step upwards
10:
    lodsl			# eax = [ds:esi]; esi += 4
    mull   16(%ebp)		# edx:eax = Phi:Plo = a_i * b

    add    %ebx,%eax		# add carry (%ebx) to edx:eax
    adc    $0,%edx
    add    0(%edi),%eax		# add in current word from *c
    adc    $0,%edx
    mov    %edx,%ebx		# high half of the sum becomes next carry

    stosl			# [es:edi] = eax; edi += 4;
    dec    %ecx			# --a_len
    jnz    10b			# jmp if a_len != 0
11:
    mov    %ebx,0(%edi)		# *c = carry
    pop    %ebx
    pop    %esi
    pop    %edi
    leave
    ret
    nop
_s_mpv_mul_d_add_sse2:
    push   %ebp
    mov    %esp,%ebp
    push   %edi
    push   %esi
    psubq  %mm2,%mm2		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    movd   16(%ebp),%mm1	# mm1 = b
    mov    20(%ebp),%edi	# edi = c
    test   %ecx,%ecx
    je     16f			# jmp if a_len == 0
    mov    8(%ebp),%esi		# esi = a
15:
    movd   0(%esi),%mm0		# mm0 = *a++
    add    $4,%esi
    pmuludq %mm1,%mm0		# mm0 = b * a_i (64-bit product)
    paddq  %mm0,%mm2		# carry += product
    movd   0(%edi),%mm0		# mm0 = current word of c
    paddq  %mm0,%mm2		# add in current word from *c
    movd   %mm2,0(%edi)		# store the low 32 bits of the sum
    add    $4,%edi
    psrlq  $32,%mm2		# high 32 bits become the next carry
    dec    %ecx			# --a_len
    jnz    15b			# jmp if a_len != 0
16:
    movd   %mm2,0(%edi)		# *c = carry
    emms			# leave MMX state before returning
    pop    %esi
    pop    %edi
    leave
    ret
    nop
.size	_s_mpv_mul_d_add,.-_s_mpv_mul_d_add

#
# _s_mpv_mul_d_add_prop(a, a_len, b, c)
#
# Multiply-accumulate with carry propagation: for each of the a_len
# words, c_i = c_i + a_i * b plus the carry from the previous word.  A
# non-zero final carry is then ADDED into c[a_len], and any overflow from
# that addition keeps rippling into the following words.
#
# NOTE: the ripple has no length bound of its own; it stops at the first
# word that does not overflow, so the caller has to supply a c that is
# long enough for the carry to be absorbed.
#
# On entry the routine dispatches on is_sse (probing it through
# _s_mpi_is_sse2 the first time) to the x86 or the SSE2 implementation.
# The SSE2 implementation works in MMX registers and executes emms
# before returning.
#
# Frame (both implementations):
#  ebp - 12:	caller's ebx
#  ebp - 8:	caller's esi
#  ebp - 4:	caller's edi
#  ebp + 0:	caller's ebp
#  ebp + 4:	return address
#  ebp + 8:	a	argument
#  ebp + 12:	a_len	argument
#  ebp + 16:	b	argument
#  ebp + 20:	c	argument
# Registers, x86 implementation:
#	eax:	low half of a_i * b / word being propagated
#	ebx:	carry
#	ecx:	a_len (words still to do)
#	edx:	high half of a_i * b
#	esi:	a ptr
#	edi:	c ptr
# Registers, SSE2 implementation:
#	mm0:	a_i, then a_i * b
#	mm1:	b
#	mm2:	running sum / carry
#	mm3:	current word of c
#	eax, ebx, ecx, esi, edi as above (eax/ebx only in the ripple)
#
.globl	_s_mpv_mul_d_add_prop
.type	_s_mpv_mul_d_add_prop,@function
_s_mpv_mul_d_add_prop:
    GET    is_sse,%eax
    test   %eax,%eax
    je     _s_mpv_mul_d_add_prop_x86	# cached answer: no SSE2
    jg     _s_mpv_mul_d_add_prop_sse2	# cached answer: SSE2 usable
    # Not probed yet.  Pad the stack so that, if our caller kept %esp
    # 16-byte aligned at its call, the callee sees the same alignment.
    sub    $12,%esp
    call   _s_mpi_is_sse2
    add    $12,%esp
    PUT    %eax,is_sse			# remember the answer
    test   %eax,%eax
    jg     _s_mpv_mul_d_add_prop_sse2
_s_mpv_mul_d_add_prop_x86:
    push   %ebp
    mov    %esp,%ebp
    push   %edi
    push   %esi
    push   %ebx
    xor    %ebx,%ebx		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    mov    20(%ebp),%edi	# edi = c
    test   %ecx,%ecx
    je     21f			# jmp if a_len == 0
    cld				# lodsl/stosl must step upwards
    mov    8(%ebp),%esi		# esi = a
20:
    lodsl			# eax = [ds:esi]; esi += 4
    mull   16(%ebp)		# edx:eax = Phi:Plo = a_i * b

    add    %ebx,%eax		# add carry (%ebx) to edx:eax
    adc    $0,%edx
    add    0(%edi),%eax		# add in current word from *c
    adc    $0,%edx
    mov    %edx,%ebx		# high half of the sum becomes next carry

    stosl			# [es:edi] = eax; edi += 4;
    dec    %ecx			# --a_len
    jnz    20b			# jmp if a_len != 0
21:
    test   %ebx,%ebx		# is carry zero?
    jz     23f
    mov    0(%edi),%eax		# add carry into the next word of c
    add    %ebx,%eax
    stosl			# [es:edi] = eax; edi += 4; (flags untouched)
    jnc    23f
22:				# CF is set whenever control reaches here
    mov    0(%edi),%eax		# add in current word from *c
    adc    $0,%eax		# += 1
    stosl			# [es:edi] = eax; edi += 4; (flags untouched)
    jc     22b			# keep rippling while words overflow
23:
    pop    %ebx
    pop    %esi
    pop    %edi
    leave
    ret
    nop
_s_mpv_mul_d_add_prop_sse2:
    push   %ebp
    mov    %esp,%ebp
    push   %edi
    push   %esi
    push   %ebx
    psubq  %mm2,%mm2		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    movd   16(%ebp),%mm1	# mm1 = b
    mov    20(%ebp),%edi	# edi = c
    test   %ecx,%ecx
    je     26f			# jmp if a_len == 0
    mov    8(%ebp),%esi		# esi = a
    cld				# the ripple below uses stosl
25:
    movd   0(%esi),%mm0		# mm0 = *a++
    movd   0(%edi),%mm3		# fetch the sum
    add    $4,%esi
    pmuludq %mm1,%mm0		# mm0 = b * a_i (64-bit product)
    paddq  %mm0,%mm2		# add the carry
    paddq  %mm3,%mm2		# add *c++
    movd   %mm2,0(%edi)		# store the low 32 bits of the sum
    add    $4,%edi
    psrlq  $32,%mm2		# high 32 bits become the next carry
    dec    %ecx			# --a_len
    jnz    25b			# jmp if a_len != 0
26:
    movd   %mm2,%ebx		# ebx = final carry
    test   %ebx,%ebx		# is carry zero?
    jz     28f
    mov    0(%edi),%eax		# add carry into the next word of c
    add    %ebx,%eax
    stosl			# [es:edi] = eax; edi += 4; (flags untouched)
    jnc    28f
27:				# CF is set whenever control reaches here
    mov    0(%edi),%eax		# add in current word from *c
    adc    $0,%eax		# += 1
    stosl			# [es:edi] = eax; edi += 4; (flags untouched)
    jc     27b			# keep rippling while words overflow
28:
    emms			# leave MMX state before returning
    pop    %ebx
    pop    %esi
    pop    %edi
    leave
    ret
    nop
.size	_s_mpv_mul_d_add_prop,.-_s_mpv_mul_d_add_prop

#
# _s_mpv_sqr_add_prop(pa, a_len, ps)
#
# For each of the a_len words of pa, adds the 64-bit square a_i * a_i
# (plus the carry from the previous step) into the next TWO words of ps,
# so ps advances two words per word of pa.  A non-zero final carry is
# then added into the following word of ps, and any overflow from that
# addition keeps rippling into the words after it.
#
# NOTE: the ripple has no length bound of its own; it stops at the first
# word that does not overflow, so the caller has to supply a ps that is
# long enough for the carry to be absorbed.
#
# On entry the routine dispatches on is_sse (probing it through
# _s_mpi_is_sse2 the first time) to the x86 or the SSE2 implementation.
# The SSE2 implementation works in MMX registers and executes emms
# before returning.
#
# Frame (both implementations):
#  ebp - 12:	caller's ebx
#  ebp - 8:	caller's esi
#  ebp - 4:	caller's edi
#  ebp + 0:	caller's ebp
#  ebp + 4:	return address
#  ebp + 8:	pa	argument
#  ebp + 12:	a_len	argument
#  ebp + 16:	ps	argument
# Registers, x86 implementation:
#	eax:	low half of a_i * a_i / word being stored
#	ebx:	carry; scratch for words loaded from ps
#	ecx:	a_len (words still to do)
#	edx:	high half of a_i * a_i
#	esi:	pa ptr
#	edi:	ps ptr
# Registers, SSE2 implementation:
#	mm0:	a_i, then a_i * a_i
#	mm2:	running sum / carry
#	mm3:	current word of ps
#	eax, ebx, ecx, esi, edi as above (eax/ebx only in the ripple)
#

.globl	_s_mpv_sqr_add_prop
.type	_s_mpv_sqr_add_prop,@function
_s_mpv_sqr_add_prop:
    GET    is_sse,%eax
    test   %eax,%eax
    je     _s_mpv_sqr_add_prop_x86	# cached answer: no SSE2
    jg     _s_mpv_sqr_add_prop_sse2	# cached answer: SSE2 usable
    # Not probed yet.  Pad the stack so that, if our caller kept %esp
    # 16-byte aligned at its call, the callee sees the same alignment.
    sub    $12,%esp
    call   _s_mpi_is_sse2
    add    $12,%esp
    PUT    %eax,is_sse			# remember the answer
    test   %eax,%eax
    jg     _s_mpv_sqr_add_prop_sse2
_s_mpv_sqr_add_prop_x86:
    push   %ebp
    mov    %esp,%ebp
    push   %edi
    push   %esi
    push   %ebx
    xor    %ebx,%ebx		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    mov    16(%ebp),%edi	# edi = ps
    test   %ecx,%ecx
    je     31f			# jump if a_len == 0
    cld				# lodsl/stosl must step upwards
    mov    8(%ebp),%esi		# esi = pa
30:
    lodsl			# eax = [ds:esi]; esi += 4;
    mull   %eax			# edx:eax = a_i * a_i

    add    %ebx,%eax		# add "carry"
    adc    $0,%edx
    mov    0(%edi),%ebx
    add    %ebx,%eax		# add low word from result
    mov    4(%edi),%ebx		# fetch high word before stosl moves edi
    stosl			# [es:edi] = eax; edi += 4; (flags untouched)
    adc    %ebx,%edx		# add high word from result (+ CF from low)
    movl   $0,%ebx		# must stay a mov: CF is still needed below
    mov    %edx,%eax
    adc    $0,%ebx		# ebx = carry out of the high word
    stosl			# [es:edi] = eax; edi += 4;
    dec    %ecx			# --a_len
    jnz    30b			# jmp if a_len != 0
31:
    test   %ebx,%ebx		# is carry zero?
    jz     34f
    mov    0(%edi),%eax		# add carry into the next word of ps
    add    %ebx,%eax
    stosl			# [es:edi] = eax; edi += 4; (flags untouched)
    jnc    34f
32:				# CF is set whenever control reaches here
    mov    0(%edi),%eax		# add in current word from *ps
    adc    $0,%eax		# += 1
    stosl			# [es:edi] = eax; edi += 4; (flags untouched)
    jc     32b			# keep rippling while words overflow
34:
    pop    %ebx
    pop    %esi
    pop    %edi
    leave
    ret
    nop
_s_mpv_sqr_add_prop_sse2:
    push   %ebp
    mov    %esp,%ebp
    push   %edi
    push   %esi
    push   %ebx
    psubq  %mm2,%mm2		# carry = 0
    mov    12(%ebp),%ecx	# ecx = a_len
    mov    16(%ebp),%edi	# edi = ps
    test   %ecx,%ecx
    je     36f			# jmp if a_len == 0
    mov    8(%ebp),%esi		# esi = pa
    cld				# the ripple below uses stosl
35:
    movd   0(%esi),%mm0		# mm0 = *a
    movd   0(%edi),%mm3		# fetch the sum
    add    $4,%esi
    pmuludq %mm0,%mm0		# mm0 = sqr(a)
    paddq  %mm0,%mm2		# add the carry
    paddq  %mm3,%mm2		# add the low word
    movd   4(%edi),%mm3		# fetch the high word of the sum
    movd   %mm2,0(%edi)		# store the low 32 bits
    psrlq  $32,%mm2
    paddq  %mm3,%mm2		# add the high word
    movd   %mm2,4(%edi)		# store the next 32 bits
    psrlq  $32,%mm2		# save the carry.
    add    $8,%edi
    dec    %ecx			# --a_len
    jnz    35b			# jmp if a_len != 0
36:
    movd   %mm2,%ebx		# ebx = final carry
    test   %ebx,%ebx		# is carry zero?
    jz     38f
    mov    0(%edi),%eax		# add carry into the next word of ps
    add    %ebx,%eax
    stosl			# [es:edi] = eax; edi += 4; (flags untouched)
    jnc    38f
37:				# CF is set whenever control reaches here
    mov    0(%edi),%eax		# add in current word from *ps
    adc    $0,%eax		# += 1
    stosl			# [es:edi] = eax; edi += 4; (flags untouched)
    jc     37b			# keep rippling while words overflow
38:
    emms			# leave MMX state before returning
    pop    %ebx
    pop    %esi
    pop    %edi
    leave
    ret
    nop
.size	_s_mpv_sqr_add_prop,.-_s_mpv_sqr_add_prop

#
# Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
# so its high bit is 1.   This code is from NSPR.
#
# mp_err _s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
# 		          mp_digit *qp, mp_digit *rp)
#
# Stores the 32-bit quotient through qp and the remainder through rp and
# always returns zero.
#
# NOTE: the divide instruction raises a divide-error exception when the
# quotient does not fit in 32 bits, i.e. when Nhi >= divisor (this
# includes divisor == 0).  Nothing here checks for that; the caller has
# to guarantee Nhi < divisor.
#
# Stack, after the push of ebx:
#  esp +  0:	caller's ebx
#  esp +  4:	return address
#  esp +  8:	Nhi	argument
#  esp + 12:	Nlo	argument
#  esp + 16:	divisor	argument
#  esp + 20:	qp	argument
#  esp + 24:	rp	argument
# Registers:
#	eax:	Nlo, then the quotient, finally the return value
#	ebx:	divisor, then qp, then rp (caller's value saved/restored)
#	edx:	Nhi, then the remainder
#

.globl	_s_mpv_div_2dx1d
.type	_s_mpv_div_2dx1d,@function
_s_mpv_div_2dx1d:
    push   %ebx
    mov    8(%esp),%edx		# edx = Nhi
    mov    12(%esp),%eax	# eax = Nlo
    mov    16(%esp),%ebx	# ebx = divisor
    div    %ebx			# eax = edx:eax / ebx, edx = remainder
    mov    20(%esp),%ebx
    mov    %eax,0(%ebx)		# *qp = quotient
    mov    24(%esp),%ebx
    mov    %edx,0(%ebx)		# *rp = remainder
    xor    %eax,%eax		# return zero
    pop    %ebx
    ret
    nop
.size	_s_mpv_div_2dx1d,.-_s_mpv_div_2dx1d

# Nothing in this file needs an executable stack; say so, so that linking
# this object does not make the whole image's stack executable.
.section .note.GNU-stack,"",@progbits
.previous
