;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%macro SCALARPRODUCT 0
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
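; What the function computes, as a rough C sketch (the vector loop below does
; the same thing 8 (mmxext) or 16 (sse2) elements per iteration; "i" and "res"
; are illustrative names only, and products wrap like the packed 16-bit math):
;     int32_t res = 0;
;     for (int i = 0; i < order; i++) {
;         res   += v1[i] * v2[i];   // scalar product, accumulated in m6
;         v1[i] += mul * v3[i];     // multiply-add written back into v1
;     }
;     return res;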
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1               ; order in bytes (int16_t elements)
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7           ; m7 = mul broadcast to all words
%else
    pshufw  m7, m7, 0           ; m7 = mul broadcast to all words
%endif
    pxor    m6, m6              ; m6 = scalar product accumulator
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq                  ; negative byte offset counted up toward zero
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4              ; v1*v2, adjacent products summed into dwords
    pmaddwd m1, m5
    pmullw  m2, m7              ; mul*v3 (low 16 bits)
    pmullw  m3, m7
    paddd   m6, m0              ; accumulate scalar product
    paddd   m6, m1
    paddw   m2, m4              ; v1 += mul*v3
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
    HADDD   m6, m0              ; horizontal sum of the dword accumulators
    movd   eax, m6
    RET
%endmacro

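; The macro body is assembled twice: with 8-byte MMX registers (mmxext) and
; with 16-byte XMM registers (sse2); mmsize selects how mul is broadcast.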
INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

INIT_XMM sse4
; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
;                                     int order, int mul)
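; Same operation as above, but v2 holds 32-bit values; a rough C sketch
; (illustrative names, products kept to their low 32 bits as with pmulld):
;     int32_t res = 0;
;     for (int i = 0; i < order; i++) {
;         res   += v1[i] * v2[i];
;         v1[i] += mul * v3[i];
;     }
;     return res;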
cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
    SPLATW  m7, m7              ; m7 = mul broadcast to all words
    pxor    m6, m6              ; m6 = scalar product accumulator
    add v1q, orderq
    lea v2q, [v2q + 2*orderq]   ; v2 elements are twice as wide
    add v3q, orderq
    neg orderq
.loop:
    mova    m3, [v1q + orderq]
    movu    m0, [v2q + 2*orderq]
    pmovsxwd m4, m3             ; sign-extend low 4 words of v1 to dwords
    movu    m1, [v2q + 2*orderq + mmsize]
    movhlps m5, m3
    movu    m2, [v3q + orderq]
    pmovsxwd m5, m5             ; sign-extend high 4 words of v1 to dwords
    pmullw  m2, m7              ; mul*v3 (low 16 bits)
    pmulld  m0, m4              ; v1*v2 as 32-bit products
    pmulld  m1, m5
    paddw   m2, m3              ; v1 += mul*v3
    paddd   m6, m0              ; accumulate scalar product
    paddd   m6, m1
    mova    [v1q + orderq], m2
    add     orderq, 16          ; 8 int16 elements of v1 per iteration
    jl .loop
    HADDD   m6, m0
    movd   eax, m6
    RET

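; SCALARPRODUCT_LOOP %1: one unrolled loop body for the ssse3 version below.
; %1 is the byte misalignment (0..14) of v2/v3 within a 16-byte line; for a
; nonzero %1 the unaligned vectors are reassembled from aligned loads with
; palignr, which is why a separate copy of the loop exists per offset.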
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1          ; rebuild the two unaligned v2 vectors
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1          ; rebuild the two unaligned v3 vectors
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0  [v1q + orderq]
    %define t1  [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0              ; load v1 once; x86-32 reads it from memory twice
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
    pmaddwd m0, t0              ; v1*v2
    pmaddwd m1, t1
    pmullw  m2, m7              ; mul*v3
    pmullw  m3, m7
    paddw   m2, t0              ; v1 += mul*v3
    paddw   m3, t1
    paddd   m6, m0              ; accumulate scalar product
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
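; v2 and v3 are aligned down to a 16-byte boundary and the remaining offset is
; handled with palignr inside SCALARPRODUCT_LOOP; this relies on v2 and v3
; having the same offset within a 16-byte line, and on v1 being 16-byte
; aligned (it is always accessed with mova).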
    shl orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7           ; m7 = mul broadcast to all words
    pxor    m6, m6
    mov    r4d, v2d
    and    r4d, 15              ; r4d = misalignment of v2 in bytes
    and    v2q, ~15             ; align v2 and v3 down to 16 bytes
    and    v3q, ~15
    mova    m4, [v2q + orderq]  ; preload the last aligned chunks
    mova    m5, [v3q + orderq]
    ; a linear chain of compares is faster than a branch tree or a jump table,
    ; because the branches taken are cyclic (i.e. predictable)
    cmp    r4d, 0
    je .loop0
    cmp    r4d, 2
    je .loop2
    cmp    r4d, 4
    je .loop4
    cmp    r4d, 6
    je .loop6
    cmp    r4d, 8
    je .loop8
    cmp    r4d, 10
    je .loop10
    cmp    r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    HADDD   m6, m0
    movd   eax, m6
    RET