;******************************************************************************
;* linear least squares model
;*
;* Copyright (c) 2013 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
22
23%include "x86util.asm"
24
25SECTION .text
26
27%define MAX_VARS 32
28%define MAX_VARS_ALIGN (MAX_VARS+4)
29%define COVAR_STRIDE MAX_VARS_ALIGN*8
30%define COVAR(x,y) [covarq + (x)*8 + (y)*COVAR_STRIDE]
31
32struc LLSModel
33    .covariance:  resq MAX_VARS_ALIGN*MAX_VARS_ALIGN
34    .coeff:       resq MAX_VARS*MAX_VARS
35    .variance:    resq MAX_VARS
36    .indep_count: resd 1
37endstruc
38
39%macro ADDPD_MEM 2
40%if cpuflag(avx)
41    vaddpd %2, %2, %1
42%else
43    addpd  %2, %1
44%endif
45    mova   %1, %2
46%endmacro
47
48INIT_XMM sse2
49%define movdqa movaps
50cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
51    %define covarq ctxq
52    mov     id, [ctxq + LLSModel.indep_count]
53    lea   varq, [varq + iq*8]
54    neg     iq
55    mov covar2q, covarq
56.loopi:
57    ; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal
58    mova    m1, [varq + iq*8]
59    mova    m3, [varq + iq*8 + 16]
60    pshufd  m4, m1, q1010
61    pshufd  m5, m1, q3232
62    pshufd  m6, m3, q1010
63    pshufd  m7, m3, q3232
64    mulpd   m0, m1, m4
65    mulpd   m1, m1, m5
66    lea covarq, [covar2q + 16]
67    ADDPD_MEM COVAR(-2,0), m0
68    ADDPD_MEM COVAR(-2,1), m1
69    lea     jq, [iq + 2]
70    cmp     jd, -2
71    jg .skip4x4
72.loop4x4:
73    ; Compute all 16 pairwise products of a 4x4 block
74    mulpd   m0, m4, m3
75    mulpd   m1, m5, m3
76    mulpd   m2, m6, m3
77    mulpd   m3, m3, m7
78    ADDPD_MEM COVAR(0,0), m0
79    ADDPD_MEM COVAR(0,1), m1
80    ADDPD_MEM COVAR(0,2), m2
81    ADDPD_MEM COVAR(0,3), m3
82    mova    m3, [varq + jq*8 + 16]
83    mulpd   m0, m4, m3
84    mulpd   m1, m5, m3
85    mulpd   m2, m6, m3
86    mulpd   m3, m3, m7
87    ADDPD_MEM COVAR(2,0), m0
88    ADDPD_MEM COVAR(2,1), m1
89    ADDPD_MEM COVAR(2,2), m2
90    ADDPD_MEM COVAR(2,3), m3
91    mova    m3, [varq + jq*8 + 32]
92    add covarq, 32
93    add     jq, 4
94    cmp     jd, -2
95    jle .loop4x4
96.skip4x4:
97    test    jd, jd
98    jg .skip2x4
99    mulpd   m4, m3
100    mulpd   m5, m3
101    mulpd   m6, m3
102    mulpd   m7, m3
103    ADDPD_MEM COVAR(0,0), m4
104    ADDPD_MEM COVAR(0,1), m5
105    ADDPD_MEM COVAR(0,2), m6
106    ADDPD_MEM COVAR(0,3), m7
107.skip2x4:
108    add     iq, 4
109    add covar2q, 4*COVAR_STRIDE+32
110    cmp     id, -2
111    jle .loopi
112    test    id, id
113    jg .ret
114    mov     jq, iq
115    %define covarq covar2q
116.loop2x1:
117    movsd   m0, [varq + iq*8]
118    movlhps m0, m0
119    mulpd   m0, [varq + jq*8]
120    ADDPD_MEM COVAR(0,0), m0
121    inc     iq
122    add covarq, COVAR_STRIDE
123    test    id, id
124    jle .loop2x1
125.ret:
126    REP_RET
127
128%macro UPDATE_LLS 0
129cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
130    %define covarq ctxq
131    mov  countd, [ctxq + LLSModel.indep_count]
132    lea count2d, [countq-2]
133    xor     id, id
134.loopi:
135    ; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal
136    mova    ymm1, [varq + iq*8]
137    vbroadcastsd ymm4, [varq + iq*8]
138    vbroadcastsd ymm5, [varq + iq*8 + 8]
139    vbroadcastsd ymm6, [varq + iq*8 + 16]
140    vbroadcastsd ymm7, [varq + iq*8 + 24]
141    vextractf128 xmm3, ymm1, 1
142%if cpuflag(fma3)
143    mova ymm0, COVAR(iq  ,0)
144    mova xmm2, COVAR(iq+2,2)
145    fmaddpd ymm0, ymm1, ymm4, ymm0
146    fmaddpd xmm2, xmm3, xmm6, xmm2
147    fmaddpd ymm1, ymm5, ymm1, COVAR(iq  ,1)
148    fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
149    mova COVAR(iq  ,0), ymm0
150    mova COVAR(iq  ,1), ymm1
151    mova COVAR(iq+2,2), xmm2
152    mova COVAR(iq+2,3), xmm3
153%else
154    vmulpd  ymm0, ymm1, ymm4
155    vmulpd  ymm1, ymm1, ymm5
156    vmulpd  xmm2, xmm3, xmm6
157    vmulpd  xmm3, xmm3, xmm7
158    ADDPD_MEM COVAR(iq  ,0), ymm0
159    ADDPD_MEM COVAR(iq  ,1), ymm1
160    ADDPD_MEM COVAR(iq+2,2), xmm2
161    ADDPD_MEM COVAR(iq+2,3), xmm3
162%endif ; cpuflag(fma3)
163    lea     jd, [iq + 4]
164    cmp     jd, count2d
165    jg .skip4x4
166.loop4x4:
167    ; Compute all 16 pairwise products of a 4x4 block
168    mova    ymm3, [varq + jq*8]
169%if cpuflag(fma3)
170    mova ymm0, COVAR(jq, 0)
171    mova ymm1, COVAR(jq, 1)
172    mova ymm2, COVAR(jq, 2)
173    fmaddpd ymm0, ymm3, ymm4, ymm0
174    fmaddpd ymm1, ymm3, ymm5, ymm1
175    fmaddpd ymm2, ymm3, ymm6, ymm2
176    fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
177    mova COVAR(jq, 0), ymm0
178    mova COVAR(jq, 1), ymm1
179    mova COVAR(jq, 2), ymm2
180    mova COVAR(jq, 3), ymm3
181%else
182    vmulpd  ymm0, ymm3, ymm4
183    vmulpd  ymm1, ymm3, ymm5
184    vmulpd  ymm2, ymm3, ymm6
185    vmulpd  ymm3, ymm3, ymm7
186    ADDPD_MEM COVAR(jq,0), ymm0
187    ADDPD_MEM COVAR(jq,1), ymm1
188    ADDPD_MEM COVAR(jq,2), ymm2
189    ADDPD_MEM COVAR(jq,3), ymm3
190%endif ; cpuflag(fma3)
191    add     jd, 4
192    cmp     jd, count2d
193    jle .loop4x4
194.skip4x4:
195    cmp     jd, countd
196    jg .skip2x4
197    mova    xmm3, [varq + jq*8]
198%if cpuflag(fma3)
199    mova xmm0, COVAR(jq, 0)
200    mova xmm1, COVAR(jq, 1)
201    mova xmm2, COVAR(jq, 2)
202    fmaddpd xmm0, xmm3, xmm4, xmm0
203    fmaddpd xmm1, xmm3, xmm5, xmm1
204    fmaddpd xmm2, xmm3, xmm6, xmm2
205    fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
206    mova COVAR(jq, 0), xmm0
207    mova COVAR(jq, 1), xmm1
208    mova COVAR(jq, 2), xmm2
209    mova COVAR(jq, 3), xmm3
210%else
211    vmulpd  xmm0, xmm3, xmm4
212    vmulpd  xmm1, xmm3, xmm5
213    vmulpd  xmm2, xmm3, xmm6
214    vmulpd  xmm3, xmm3, xmm7
215    ADDPD_MEM COVAR(jq,0), xmm0
216    ADDPD_MEM COVAR(jq,1), xmm1
217    ADDPD_MEM COVAR(jq,2), xmm2
218    ADDPD_MEM COVAR(jq,3), xmm3
219%endif ; cpuflag(fma3)
220.skip2x4:
221    add     id, 4
222    add covarq, 4*COVAR_STRIDE
223    cmp     id, count2d
224    jle .loopi
225    cmp     id, countd
226    jg .ret
227    mov     jd, id
228.loop2x1:
229    vmovddup xmm0, [varq + iq*8]
230%if cpuflag(fma3)
231    mova xmm1, [varq + jq*8]
232    fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0)
233    mova COVAR(jq,0), xmm0
234%else
235    vmulpd   xmm0, [varq + jq*8]
236    ADDPD_MEM COVAR(jq,0), xmm0
237%endif ; cpuflag(fma3)
238    inc     id
239    add covarq, COVAR_STRIDE
240    cmp     id, countd
241    jle .loop2x1
242.ret:
243    REP_RET
244%endmacro ; UPDATE_LLS
245
246%if HAVE_AVX_EXTERNAL
247INIT_YMM avx
248UPDATE_LLS
249%endif
250%if HAVE_FMA3_EXTERNAL
251INIT_YMM fma3
252UPDATE_LLS
253%endif
254
255INIT_XMM sse2
256cglobal evaluate_lls, 3,4,2, ctx, var, order, i
257    ; This function is often called on the same buffer as update_lls, but with
258    ; an offset. They can't both be aligned.
259    ; Load halves rather than movu to avoid store-forwarding stalls, since the
260    ; input was initialized immediately prior to this function using scalar math.
261    %define coefsq ctxq
262    mov     id, orderd
263    imul    orderd, MAX_VARS
264    lea     coefsq, [ctxq + LLSModel.coeff + orderq*8]
265    movsd   m0, [varq]
266    movhpd  m0, [varq + 8]
267    mulpd   m0, [coefsq]
268    lea coefsq, [coefsq + iq*8]
269    lea   varq, [varq + iq*8]
270    neg     iq
271    add     iq, 2
272.loop:
273    movsd   m1, [varq + iq*8]
274    movhpd  m1, [varq + iq*8 + 8]
275    mulpd   m1, [coefsq + iq*8]
276    addpd   m0, m1
277    add     iq, 2
278    jl .loop
279    jg .skip1
280    movsd   m1, [varq + iq*8]
281    mulsd   m1, [coefsq + iq*8]
282    addpd   m0, m1
283.skip1:
284    movhlps m1, m0
285    addsd   m0, m1
286%if ARCH_X86_32
287    movsd  r0m, m0
288    fld   qword r0m
289%endif
290    RET
291