;******************************************************************************
;* linear least squares model
;*
;* Copyright (c) 2013 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

; Geometry of the model. A covariance row holds MAX_VARS_ALIGN doubles
; (8 bytes each), so COVAR_STRIDE is the byte distance between two rows.
; COVAR(x,y) addresses the double x elements to the right of and y rows below
; whatever covarq currently points at; covarq is re-%define'd per function.
%define MAX_VARS        32
%define MAX_VARS_ALIGN  (MAX_VARS+4)
%define COVAR_STRIDE    (MAX_VARS_ALIGN*8)
%define COVAR(x,y)      [covarq + (x)*8 + (y)*COVAR_STRIDE]

; Field offsets used by the code below. .covariance is the first member, which
; is why the functions can use the context pointer itself as the matrix base.
; NOTE(review): must mirror the C-side model declaration - verify against it.
struc LLSModel
    .covariance:    resq MAX_VARS_ALIGN*MAX_VARS_ALIGN
    .coeff:         resq MAX_VARS*MAX_VARS
    .variance:      resq MAX_VARS
    .indep_count:   resd 1
endstruc

; ADDPD_MEM mem, reg
;   reg += mem, then mem = reg (read-modify-write accumulate).
;   On return reg holds the updated sum, not the original product.
%macro ADDPD_MEM 2
%if cpuflag(avx)
    vaddpd      %2, %2, %1
%else
    addpd       %2, %1
%endif
    mova        %1, %2
%endmacro

;------------------------------------------------------------------------------
; update_lls(ctx, var)                                                    [sse2]
;
; Accumulates pairwise products var[a]*var[b] into the covariance matrix at
; the start of ctx. The number of variables is read from ctx.indep_count.
;
; Registers:
;   varq     points at var + indep_count*8, so it is indexed with negative
;            offsets
;   iq / jq  negative element indices counting up towards zero; index 0 refers
;            to var[indep_count]
;   covar2q  walks down the diagonal (4 rows and 4 columns per main-loop pass)
;   covarq   alias of ctxq: working pointer inside the current row band
;   m4..m7   the four row values of the current band, each duplicated into
;            both lanes
;   m3       the pair of column values currently being multiplied
;
; The main loop always runs at least once and works on 4 rows at a time, and
; all loads/stores use the aligned-move macro.
; NOTE(review): this presumably relies on the +4 padding of MAX_VARS_ALIGN and
; on var being 16-byte aligned and readable a few elements past indep_count -
; confirm against the callers.
;------------------------------------------------------------------------------
INIT_XMM sse2
; NOTE(review): presumably makes register copies emitted by the 3-operand
; emulation use the float-domain move; the define stays active for the rest of
; the file - confirm intent.
%define movdqa movaps
cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
    %define covarq ctxq
    mov         id, [ctxq + LLSModel.indep_count]
    lea         varq, [varq + iq*8]             ; varq = one-past-style end pointer
    neg         iq                              ; iq = -indep_count
    mov         covar2q, covarq                 ; diagonal cursor starts at the matrix base
.row_band:
    ; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal
    mova        m1, [varq + iq*8]               ; first pair of this band
    mova        m3, [varq + iq*8 + 16]          ; second pair of this band
    pshufd      m4, m1, q1010                   ; low  double of m1 in both lanes
    pshufd      m5, m1, q3232                   ; high double of m1 in both lanes
    pshufd      m6, m3, q1010                   ; low  double of m3 in both lanes
    pshufd      m7, m3, q3232                   ; high double of m3 in both lanes
    mulpd       m0, m1, m4
    mulpd       m1, m1, m5
    lea         covarq, [covar2q + 16]          ; work 2 columns right of the diagonal
    ADDPD_MEM   COVAR(-2,0), m0                 ; i.e. [covar2q], row 0
    ADDPD_MEM   COVAR(-2,1), m1                 ; i.e. [covar2q], row 1
    lea         jq, [iq + 2]
    cmp         jd, -2
    jg          .cols_tail
.cols_by_4:
    ; Compute all 16 pairwise products of a 4x4 block
    ; m3 enters holding the column pair at index j.
    mulpd       m0, m4, m3
    mulpd       m1, m5, m3
    mulpd       m2, m6, m3
    mulpd       m3, m3, m7
    ADDPD_MEM   COVAR(0,0), m0
    ADDPD_MEM   COVAR(0,1), m1
    ADDPD_MEM   COVAR(0,2), m2
    ADDPD_MEM   COVAR(0,3), m3
    mova        m3, [varq + jq*8 + 16]          ; column pair at index j+2
    mulpd       m0, m4, m3
    mulpd       m1, m5, m3
    mulpd       m2, m6, m3
    mulpd       m3, m3, m7
    ADDPD_MEM   COVAR(2,0), m0
    ADDPD_MEM   COVAR(2,1), m1
    ADDPD_MEM   COVAR(2,2), m2
    ADDPD_MEM   COVAR(2,3), m3
    mova        m3, [varq + jq*8 + 32]          ; preload the pair for the next pass
    add         covarq, 32                      ; 4 columns to the right
    add         jq, 4
    cmp         jd, -2
    jle         .cols_by_4
.cols_tail:
    ; One remaining column pair (already in m3) when j has not passed zero.
    test        jd, jd
    jg          .next_band
    mulpd       m4, m3
    mulpd       m5, m3
    mulpd       m6, m3
    mulpd       m7, m3
    ADDPD_MEM   COVAR(0,0), m4
    ADDPD_MEM   COVAR(0,1), m5
    ADDPD_MEM   COVAR(0,2), m6
    ADDPD_MEM   COVAR(0,3), m7
.next_band:
    add         iq, 4
    add         covar2q, 4*COVAR_STRIDE+32      ; 4 rows down, 4 columns right
    cmp         id, -2
    jle         .row_band
    test        id, id
    jg          .done
    ; Leftover rows: one row per pass, always against the column pair at the
    ; index where the leftover started (jq stays fixed).
    mov         jq, iq
    %define covarq covar2q
.rows_tail:
    movsd       m0, [varq + iq*8]
    movlhps     m0, m0                          ; duplicate the row value into both lanes
    mulpd       m0, [varq + jq*8]
    ADDPD_MEM   COVAR(0,0), m0
    inc         iq
    add         covarq, COVAR_STRIDE            ; next row (advances covar2q)
    test        id, id
    jle         .rows_tail
.done:
    REP_RET

%if HAVE_AVX_EXTERNAL
;------------------------------------------------------------------------------
; update_lls(ctx, var)                                                     [avx]
;
; Same accumulation as the sse2 version, 4 doubles per vector.
;
; Registers:
;   countd       ctx.indep_count. Declared as a third argument register in the
;                cglobal line but overwritten from the model before any use.
;   count2d      count - 2, the bound for full 4-wide passes
;   id / jd      non-negative row / column element indices
;   covarq       alias of ctxq; advanced one row band at a time, columns are
;                addressed with absolute element indices (iq / jq)
;   ymm4..ymm7   the four row values of the current band, broadcast to all lanes
;
; NOTE(review): the 32-byte loads/stores use the aligned-move macro, so var
; and the covariance rows are presumably expected to be 32-byte aligned -
; confirm against the callers.
;------------------------------------------------------------------------------
INIT_YMM avx
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
    %define covarq ctxq
    mov         countd, [ctxq + LLSModel.indep_count]
    lea         count2d, [countq-2]
    xor         id, id
.row_band:
    ; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal
    mova        ymm1, [varq + iq*8]
    vbroadcastsd ymm4, [varq + iq*8]
    vbroadcastsd ymm5, [varq + iq*8 + 8]
    vbroadcastsd ymm6, [varq + iq*8 + 16]
    vbroadcastsd ymm7, [varq + iq*8 + 24]
    vextractf128 xmm3, ymm1, 1                  ; upper two values of the band
    vmulpd      ymm0, ymm1, ymm4
    vmulpd      ymm1, ymm1, ymm5
    vmulpd      xmm2, xmm3, xmm6                ; rows 2 and 3 only need the upper pair
    vmulpd      xmm3, xmm3, xmm7
    ADDPD_MEM   COVAR(iq  ,0), ymm0
    ADDPD_MEM   COVAR(iq  ,1), ymm1
    ADDPD_MEM   COVAR(iq+2,2), xmm2
    ADDPD_MEM   COVAR(iq+2,3), xmm3
    lea         jd, [iq + 4]
    cmp         jd, count2d
    jg          .cols_tail
.cols_by_4:
    ; Compute all 16 pairwise products of a 4x4 block
    mova        ymm3, [varq + jq*8]
    vmulpd      ymm0, ymm3, ymm4
    vmulpd      ymm1, ymm3, ymm5
    vmulpd      ymm2, ymm3, ymm6
    vmulpd      ymm3, ymm3, ymm7
    ADDPD_MEM   COVAR(jq,0), ymm0
    ADDPD_MEM   COVAR(jq,1), ymm1
    ADDPD_MEM   COVAR(jq,2), ymm2
    ADDPD_MEM   COVAR(jq,3), ymm3
    add         jd, 4
    cmp         jd, countd_minus_guard
    jle         .cols_by_4
.cols_tail:
    ; One remaining column pair, handled with 128-bit operations.
    cmp         jd, countd
    jg          .next_band
    mova        xmm3, [varq + jq*8]
    vmulpd      xmm0, xmm3, xmm4
    vmulpd      xmm1, xmm3, xmm5
    vmulpd      xmm2, xmm3, xmm6
    vmulpd      xmm3, xmm3, xmm7
    ADDPD_MEM   COVAR(jq,0), xmm0
    ADDPD_MEM   COVAR(jq,1), xmm1
    ADDPD_MEM   COVAR(jq,2), xmm2
    ADDPD_MEM   COVAR(jq,3), xmm3
.next_band:
    add         id, 4
    add         covarq, 4*COVAR_STRIDE          ; 4 rows down; columns stay absolute
    cmp         id, count2d
    jle         .row_band
    cmp         id, countd
    jg          .done
    ; Leftover rows: one row per pass against the column pair at the index
    ; where the leftover started (jd stays fixed).
    mov         jd, id
.rows_tail:
    vmovddup    xmm0, [varq + iq*8]             ; row value in both lanes
    vmulpd      xmm0, [varq + jq*8]
    ADDPD_MEM   COVAR(jq,0), xmm0
    inc         id
    add         covarq, COVAR_STRIDE
    cmp         id, countd
    jle         .rows_tail
.done:
    REP_RET
%endif

;------------------------------------------------------------------------------
; evaluate_lls(ctx, var, order)                                           [sse2]
;
; Dot product of var[] with coefficient row `order` of ctx.coeff (rows are
; MAX_VARS doubles apart), covering elements 0..order.
;
; Result: the sum is left in the low double of m0; on x86-32 it is additionally
; spilled to the first argument slot and loaded onto the x87 stack.
;
; The first pair (elements 0,1) and at least one pass of the pair loop
; (elements 2,3) are always executed, so elements 0..3 of both arrays are read
; even when order < 3.
; NOTE(review): presumably the unused coefficients are zero in that case -
; confirm against the code that fills ctx.coeff.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal evaluate_lls, 3,4,2, ctx, var, order, i
    ; This function is often called on the same buffer as update_lls, but with
    ; an offset. They can't both be aligned.
    ; Load halves rather than movu to avoid store-forwarding stalls, since the
    ; input was initialized immediately prior to this function using scalar math.
    %define coefsq ctxq
    mov         id, orderd
    imul        orderd, MAX_VARS                ; element offset of the selected row
    lea         coefsq, [ctxq + LLSModel.coeff + orderq*8]
    movsd       m0, [varq]
    movhpd      m0, [varq + 8]
    mulpd       m0, [coefsq]                    ; products for elements 0 and 1
    lea         coefsq, [coefsq + iq*8]         ; rebase both pointers to element `order`
    lea         varq, [varq + iq*8]
    neg         iq
    add         iq, 2                           ; iq = 2 - order: first pass hits element 2
.pairs:
    movsd       m1, [varq + iq*8]
    movhpd      m1, [varq + iq*8 + 8]
    mulpd       m1, [coefsq + iq*8]
    addpd       m0, m1
    add         iq, 2
    jl          .pairs                          ; iq < 0: another full pair remains
    jg          .reduce                         ; iq > 0: nothing left
    ; iq == 0: exactly one element (index `order`) remains.
    movsd       m1, [varq + iq*8]               ; memory-form movsd clears the high lane
    mulsd       m1, [coefsq + iq*8]
    addpd       m0, m1
.reduce:
    movhlps     m1, m0                          ; fold the high lane onto the low lane
    addsd       m0, m1
%if ARCH_X86_32
    movsd       r0m, m0
    fld         qword r0m
%endif
    RET