1# MMX assist routines for sumsq
2# Copyright 2001 Phil Karn, KA9Q
3# May be used under the terms of the GNU Public License (GPL)
4
5	.text
6
# Evaluate sum of squares of signed 16-bit input samples
#  long long sumsq_mmx_assist(signed short *in,int cnt);
#
# In:   8(%ebp)  = in  (pointer to the samples)
#       12(%ebp) = cnt (number of samples)
# Out:  %edx:%eax = 64-bit sum of squares
# Note: samples are consumed in whole groups of 8. A trailing group of
#       fewer than 8 samples is not included in the sum, and cnt < 8
#       yields 0.
# Saves and restores %ebp, %esi, %ecx and %ebx. Uses %mm0 and %mm6 and
# executes emms before returning.
	.global sumsq_mmx_assist
	.type sumsq_mmx_assist,@function
	.align 16
sumsq_mmx_assist:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %ecx
	pushl %ebx

	movl 8(%ebp),%esi	# esi = in
	movl 12(%ebp),%ecx	# ecx = cnt
	xor %eax,%eax		# edx:eax = 64-bit running sum
	xor %edx,%edx

	# Each dword produced by pmaddwd is the sum of two squares and is at
	# most 2 * 32768**2 = 2**31, which still fits in an unsigned 32-bit
	# word. Adding two such dwords with paddd could reach 2**32 and wrap
	# to zero (e.g. S0=S1=S4=S5=-32768), so every dword is added into the
	# 64-bit accumulator on its own.
1:	subl $8,%ecx
	jl 2f			# fewer than 8 samples left: done
	movq (%esi),%mm0	# S0 S1 S2 S3
	pmaddwd %mm0,%mm0	# (S0^2+S1^2) (S2^2+S3^2)
	movq 8(%esi),%mm6	# S4 S5 S6 S7
	pmaddwd %mm6,%mm6	# (S4^2+S5^2) (S6^2+S7^2)

	movd %mm0,%ebx		# S0^2+S1^2
	addl %ebx,%eax
	adcl $0,%edx		# propagate carry into the high word
	psrlq $32,%mm0
	movd %mm0,%ebx		# S2^2+S3^2
	addl %ebx,%eax
	adcl $0,%edx

	movd %mm6,%ebx		# S4^2+S5^2
	addl %ebx,%eax
	adcl $0,%edx
	psrlq $32,%mm6
	movd %mm6,%ebx		# S6^2+S7^2
	addl %ebx,%eax
	adcl $0,%edx

	addl $16,%esi		# advance past the 8 samples just summed
	jmp 1b

2:	emms			# leave the FPU usable for the caller
	popl %ebx
	popl %ecx
	popl %esi
	popl %ebp
	ret
	.size sumsq_mmx_assist,.-sumsq_mmx_assist
48
# Sum of squares of signed 16-bit samples, 32-bit result
#  long sumsq_wd_mmx_assist(signed short *in,int cnt);
#  Fast variant: the two running sums are kept in 32-bit lanes of %mm2
#  and are never widened, so it is only safe for small numbers of small
#  input values.
#  Samples are consumed in whole groups of 8; a trailing group of fewer
#  than 8 samples is not included in the result.
	.global sumsq_wd_mmx_assist
	.type sumsq_wd_mmx_assist,@function
	.align 16
sumsq_wd_mmx_assist:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi

	pxor	%mm2,%mm2		# mm2 = two 32-bit partial sums, both zero
	movl	12(%ebp),%ecx		# ecx = cnt
	movl	8(%ebp),%esi		# esi = in
	jmp	.Lsumsq_wd_check

.Lsumsq_wd_block:
	movq	(%esi),%mm0		# S0 S1 S2 S3
	movq	8(%esi),%mm1		# S4 S5 S6 S7
	addl	$16,%esi		# step over the 8 samples just loaded
	pmaddwd	%mm0,%mm0		# (S0*S0+S1*S1) (S2*S2+S3*S3)
	pmaddwd	%mm1,%mm1		# (S4*S4+S5*S5) (S6*S6+S7*S7)
	paddd	%mm0,%mm2
	paddd	%mm1,%mm2		# fold both products into the partial sums
.Lsumsq_wd_check:
	subl	$8,%ecx
	jge	.Lsumsq_wd_block	# another full group of 8 is available

	movd	%mm2,%eax		# low partial sum
	psrlq	$32,%mm2
	movd	%mm2,%edx		# high partial sum
	emms
	addl	%edx,%eax		# eax = total
	popl	%esi
	popl	%ebp
	ret
84