/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 21 Nov 2018
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef DSP_ARCH_X86_SSE_GRAPHICS_EFFECTS_H_
#define DSP_ARCH_X86_SSE_GRAPHICS_EFFECTS_H_

#ifndef DSP_ARCH_X86_SSE2_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_X86_SSE2_IMPL */

/*
 * In-register transpose of a 4x4 float matrix (clobbers xmm4 as scratch).
 *
 * Input:  xmm0 = a0 a1 a2 a3      Output: xmm0 = a0 b0 c0 d0
 *         xmm1 = b0 b1 b2 b3              xmm1 = a1 b1 c1 d1
 *         xmm2 = c0 c1 c2 c3              xmm2 = a2 b2 c2 d2
 *         xmm3 = d0 d1 d2 d3              xmm3 = a3 b3 c3 d3
 *
 * The effect cores below compute H, S, L, A as four per-component vectors
 * (xmm0=H, xmm1=S, xmm2=L, xmm3=A), so after this transpose each register
 * holds one complete HSLA pixel ready to be stored.
 */
#define X4_TRANSPOSE \
    __ASM_EMIT("movaps      %%xmm2, %%xmm4")            /* xmm4 = c0 c1 c2 c3 */ \
    __ASM_EMIT("punpckldq   %%xmm3, %%xmm2")            /* xmm2 = c0 d0 c1 d1 */ \
    __ASM_EMIT("punpckhdq   %%xmm3, %%xmm4")            /* xmm4 = c2 d2 c3 d3 */ \
    __ASM_EMIT("movaps      %%xmm0, %%xmm3")            /* xmm3 = a0 a1 a2 a3 */ \
    __ASM_EMIT("punpckldq   %%xmm1, %%xmm0")            /* xmm0 = a0 b0 a1 b1 */ \
    __ASM_EMIT("punpckhdq   %%xmm1, %%xmm3")            /* xmm3 = a2 b2 a3 b3 */ \
    __ASM_EMIT("movaps      %%xmm0, %%xmm1")            /* xmm1 = a0 b0 a1 b1 */ \
    __ASM_EMIT("punpcklqdq  %%xmm2, %%xmm0")            /* xmm0 = a0 b0 c0 d0 */ \
    __ASM_EMIT("punpckhqdq  %%xmm2, %%xmm1")            /* xmm1 = a1 b1 c1 d1 */ \
    __ASM_EMIT("movaps      %%xmm3, %%xmm2")            /* xmm2 = a2 b2 a3 b3 */ \
    __ASM_EMIT("punpcklqdq  %%xmm4, %%xmm2")            /* xmm2 = a2 b2 c2 d2 */ \
    __ASM_EMIT("punpckhqdq  %%xmm4, %%xmm3")            /* xmm3 = a3 b3 c3 d3 */ \

namespace sse2
{
    /* Packed constant: four copies of 1.0f */
    static const float EFF_HSLA_HUE_XC[] __lsp_aligned16 =
    {
        1.0f, 1.0f, 1.0f, 1.0f
    };

    /*
     * Hue-shift effect core: converts four source values into four HSLA pixels.
     * Input:  xmm0 = v (4 values), xmm6 = T = 1 - thresh, xmm7 = KT = 1 / thresh
     * Reads:  eff->h/s/l/a at 0x00(%[eff]), the constant 1.0f at %[XC]
     * Output: xmm0..xmm3 = four HSLA pixels (after X4_TRANSPOSE)
     *
     * Step 1: fold v into [0, 1]: V = (v < 0) ? 1 + v : 1 - v
     * Step 2: NH = (V >= T) ? EH + T : EH + V;  A = (V >= T) ? (V - T) * KT : 0
     * Step 3: wrap hue into [0, 1): H = (NH < 1) ? NH : NH - 1
     */
    #define EFF_HSLA_HUE_CORE \
        /* xmm0 = v, xmm6 = T, xmm7 = KT */ \
        __ASM_EMIT("movaps      0x00 + %[XC], %%xmm5")  /* xmm5 = 1 */ \
        __ASM_EMIT("xorps       %%xmm4, %%xmm4")        /* xmm4 = 0 */ \
        __ASM_EMIT("movaps      %%xmm5, %%xmm1")        /* xmm1 = 1 */ \
        __ASM_EMIT("cmpps       $6, %%xmm0, %%xmm4")    /* xmm4 = [0 > v] */ \
        __ASM_EMIT("movups      0x00(%[eff]), %%xmm2")  /* xmm2 = h s l a */ \
        __ASM_EMIT("subps       %%xmm0, %%xmm1")        /* xmm1 = 1 - v */ \
        __ASM_EMIT("shufps      $0x00, %%xmm2, %%xmm2") /* xmm2 = EH (effect hue splat) */ \
        __ASM_EMIT("addps       %%xmm5, %%xmm0")        /* xmm0 = 1 + v */ \
        __ASM_EMIT("andps       %%xmm4, %%xmm0")        /* xmm0 = (1+v) & [0 > v] */ \
        __ASM_EMIT("andnps      %%xmm1, %%xmm4")        /* xmm4 = (1-v) & [0 <= v] */ \
        __ASM_EMIT("orps        %%xmm4, %%xmm0")        /* xmm0 = V = ((1+v) & [0 > v]) | ((1-v) & [0 <= v]) */ \
        /* xmm0 = V */ \
        /* xmm2 = EH */ \
        /* xmm6 = T */ \
        /* xmm7 = KT */ \
        __ASM_EMIT("movaps      %%xmm0, %%xmm1")        /* xmm1 = V */ \
        __ASM_EMIT("xorps       %%xmm4, %%xmm4")        /* xmm4 = 0 */ \
        __ASM_EMIT("subps       %%xmm6, %%xmm1")        /* xmm1 = V - T */ \
        __ASM_EMIT("addps       %%xmm2, %%xmm0")        /* xmm0 = EH + V */ \
        __ASM_EMIT("movaps      %%xmm1, %%xmm3")        /* xmm3 = V - T */ \
        __ASM_EMIT("addps       %%xmm6, %%xmm2")        /* xmm2 = EH + T */ \
        __ASM_EMIT("mulps       %%xmm7, %%xmm3")        /* xmm3 = (V-T)*KT */ \
        __ASM_EMIT("cmpps       $2, %%xmm1, %%xmm4")    /* xmm4 = [(V - T) >= 0] */ \
        __ASM_EMIT("andps       %%xmm4, %%xmm3")        /* xmm3 = A = ((V-T)*KT) & [(V - T) >= 0] */ \
        __ASM_EMIT("andps       %%xmm4, %%xmm2")        /* xmm2 = (EH + T) & [(V - T) >= 0] */ \
        __ASM_EMIT("andnps      %%xmm0, %%xmm4")        /* xmm4 = (EH + V) & [(V - T) < 0] */ \
        __ASM_EMIT("orps        %%xmm2, %%xmm4")        /* xmm4 = NH = ((EH + T) & [(V - T) >= 0]) | ((EH + V) & [(V - T) < 0]) */ \
        __ASM_EMIT("movaps      0x00 + %[XC], %%xmm0")  /* xmm0 = 1 */ \
        __ASM_EMIT("movaps      %%xmm4, %%xmm1")        /* xmm1 = NH */ \
        __ASM_EMIT("movaps      %%xmm0, %%xmm5")        /* xmm5 = 1 */ \
        __ASM_EMIT("cmpps       $6, %%xmm4, %%xmm0")    /* xmm0 = [1 > NH] */ \
        __ASM_EMIT("subps       %%xmm5, %%xmm4")        /* xmm4 = NH - 1 */ \
        __ASM_EMIT("movups      0x00(%[eff]), %%xmm2")  /* xmm2 = h s l a */ \
        __ASM_EMIT("andps       %%xmm0, %%xmm1")        /* xmm1 = NH & [1 > NH] */ \
        __ASM_EMIT("andnps      %%xmm4, %%xmm0")        /* xmm0 = (NH-1) & [1 <= NH] */ \
        __ASM_EMIT("orps        %%xmm1, %%xmm0")        /* xmm0 = H = (NH & [1 > NH]) | ((NH-1) & [1 <= NH]) */ \
        __ASM_EMIT("movaps      %%xmm2, %%xmm1")        /* xmm1 = h s l a */ \
        __ASM_EMIT("shufps      $0x55, %%xmm1, %%xmm1") /* xmm1 = S */ \
        __ASM_EMIT("shufps      $0xaa, %%xmm2, %%xmm2") /* xmm2 = L */ \
        \
        X4_TRANSPOSE

    /**
     * Hue-shift HSLA effect: produces one HSLA pixel (4 floats) per source value.
     *
     * @param dst   destination buffer, count*4 floats (H S L A per pixel)
     * @param v     source values, one float per pixel
     * @param eff   effect parameters; h/s/l/a are read at offset 0x00,
     *              the threshold at offset 0x10
     * @param count number of pixels to process
     */
    void eff_hsla_hue(float *dst, const float *v, const dsp::hsla_hue_eff_t *eff, size_t count)
    {
        ARCH_X86_ASM(
            /* Precompute loop invariants from eff->thresh */
            __ASM_EMIT("movaps      0x00 + %[XC], %%xmm6")      /* xmm6 = 1 */
            __ASM_EMIT("movss       0x10(%[eff]), %%xmm4")      /* xmm4 = t 0 0 0 */
            __ASM_EMIT("movaps      %%xmm6, %%xmm7")            /* xmm7 = 1 */
            __ASM_EMIT("shufps      $0x00, %%xmm4, %%xmm4")     /* xmm4 = t */
            __ASM_EMIT("subps       %%xmm4, %%xmm6")            /* xmm6 = T = 1 - t */
            __ASM_EMIT("divps       %%xmm4, %%xmm7")            /* xmm7 = KT = 1 / t */

            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jb          2f")

            //-----------------------------------------------------------------
            // 4x blocks
            __ASM_EMIT("1:")

            __ASM_EMIT("movups      0x00(%[src]), %%xmm0")      /* xmm0 = v */
            EFF_HSLA_HUE_CORE

            /* Store 4 transposed HSLA pixels */
            __ASM_EMIT("movups      %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("movups      %%xmm1, 0x10(%[dst])")
            __ASM_EMIT("movups      %%xmm2, 0x20(%[dst])")
            __ASM_EMIT("movups      %%xmm3, 0x30(%[dst])")

            __ASM_EMIT("add         $0x10, %[src]")
            __ASM_EMIT("add         $0x40, %[dst]")
            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jae         1b")

            __ASM_EMIT("2:")
            __ASM_EMIT("add         $4, %[count]")
            __ASM_EMIT("jle         10f")

            //-----------------------------------------------------------------
            // 1x - 3x block
            // Load last variable-sized chunk: the odd element goes to lane 2
            // (via movlhps), an even pair to lanes 0-1; the stores below use
            // the matching order, so pixel order in dst is preserved.
            __ASM_EMIT("test        $1, %[count]")
            __ASM_EMIT("jz          4f")
            __ASM_EMIT("movss       0x00(%[src]), %%xmm0")
            __ASM_EMIT("add         $0x04, %[src]")
            __ASM_EMIT("movlhps     %%xmm0, %%xmm0")
            __ASM_EMIT("4:")
            __ASM_EMIT("test        $2, %[count]")
            __ASM_EMIT("jz          6f")
            __ASM_EMIT("movlps      0x00(%[src]), %%xmm0")
            __ASM_EMIT("6:")

            EFF_HSLA_HUE_CORE

            // Store last chunk (xmm2 = pixel from lane 2 = the odd element)
            __ASM_EMIT("test        $1, %[count]")
            __ASM_EMIT("jz          8f")
            __ASM_EMIT("movups      %%xmm2, 0x00(%[dst])")
            __ASM_EMIT("add         $0x10, %[dst]")
            __ASM_EMIT("8:")
            __ASM_EMIT("test        $2, %[count]")
            __ASM_EMIT("jz          10f")
            __ASM_EMIT("movups      %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("movups      %%xmm1, 0x10(%[dst])")

            __ASM_EMIT("10:")

            : [dst] "+r" (dst), [src] "+r" (v), [count] "+r" (count)
            : [eff] "r" (eff),
              [XC] "o" (EFF_HSLA_HUE_XC)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
        /* Reference (scalar) implementation:

        float value, hue, alpha;
        float t     = 1.0f - eff->thresh;
        float kt    = 1.0f / eff->thresh;

        for (size_t i=0; i<count; ++i, dst += 4)
        {
            value   = v[i];
            value   = (0 > value) ? 1.0f + value : 1.0f - value;

            if ((value - t) >= 0)
            {
                hue         = eff->h + t;
                alpha       = ((value - t) * kt);
            }
            else
            {
                hue         = eff->h + value;
                alpha       = 0.0f;
            }

            dst[0]      = (hue < 1.0f) ? hue : hue - 1.0f;
            dst[1]      = eff->s;
            dst[2]      = eff->l;
            dst[3]      = alpha;
        }*/
    }

    #undef EFF_HSLA_HUE_CORE

    /* Packed constant: four copies of 1.0f */
    static const float EFF_HSLA_ALPHA_XC[] __lsp_aligned16 =
    {
        1.0f, 1.0f, 1.0f, 1.0f
    };

    /*
     * Alpha effect core: H, S, L are taken from eff as-is; the source value,
     * folded into [0, 1] (A = (v < 0) ? 1 + v : 1 - v), becomes the alpha.
     * Input:  xmm3 = v (4 values)
     * Output: xmm0..xmm3 = four HSLA pixels (after X4_TRANSPOSE)
     * Clobbers xmm4-xmm6.
     */
    #define EFF_HSLA_ALPHA_CORE \
        /* xmm3 = v */ \
        __ASM_EMIT("xorps       %%xmm4, %%xmm4")        /* xmm4 = 0 */ \
        __ASM_EMIT("movaps      0x00 + %[XC], %%xmm5")  /* xmm5 = 1 */ \
        __ASM_EMIT("cmpps       $6, %%xmm3, %%xmm4")    /* xmm4 = [0 > v] */ \
        __ASM_EMIT("movups      0x00(%[eff]), %%xmm0")  /* xmm0 = hsla */ \
        __ASM_EMIT("movaps      %%xmm3, %%xmm6")        /* xmm6 = v */ \
        __ASM_EMIT("movaps      %%xmm0, %%xmm1")        /* xmm1 = hsla */ \
        __ASM_EMIT("addps       %%xmm5, %%xmm3")        /* xmm3 = 1 + v */ \
        __ASM_EMIT("movaps      %%xmm1, %%xmm2")        /* xmm2 = hsla */ \
        __ASM_EMIT("subps       %%xmm6, %%xmm5")        /* xmm5 = 1 - v */ \
        __ASM_EMIT("shufps      $0x00, %%xmm0, %%xmm0") /* xmm0 = H */ \
        __ASM_EMIT("andps       %%xmm4, %%xmm3")        /* xmm3 = (1+v) & [0 > v] */ \
        __ASM_EMIT("shufps      $0x55, %%xmm1, %%xmm1") /* xmm1 = S */ \
        __ASM_EMIT("andnps      %%xmm5, %%xmm4")        /* xmm4 = (1-v) & [0 <= v] */ \
        __ASM_EMIT("shufps      $0xaa, %%xmm2, %%xmm2") /* xmm2 = L */ \
        __ASM_EMIT("orps        %%xmm4, %%xmm3")        /* xmm3 = A = ((1+v) & [0 > v]) | ((1-v) & [0 <= v]) */ \
        \
        X4_TRANSPOSE

    /* Reference (scalar) implementation:

        value   = v[i];
        value   = (0.0f > value) ? 1.0f + value : 1.0f - value;

        dst[0]  = eff->h;
        dst[1]  = eff->s;
        dst[2]  = eff->l;
        dst[3]  = value;        // Fill alpha channel
    */

    /**
     * Alpha HSLA effect: copies eff->h/s/l to each pixel and derives the
     * alpha channel from the source value.
     *
     * @param dst   destination buffer, count*4 floats (H S L A per pixel)
     * @param v     source values, one float per pixel
     * @param eff   effect parameters; h/s/l/a are read at offset 0x00
     * @param count number of pixels to process
     */
    void eff_hsla_alpha(float *dst, const float *v, const dsp::hsla_alpha_eff_t *eff, size_t count)
    {
        ARCH_X86_ASM(
            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jb          2f")

            //-----------------------------------------------------------------
            // 4x blocks
            __ASM_EMIT("1:")

            __ASM_EMIT("movups      0x00(%[src]), %%xmm3")      /* xmm3 = v */
            EFF_HSLA_ALPHA_CORE

            /* Store 4 transposed HSLA pixels */
            __ASM_EMIT("movups      %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("movups      %%xmm1, 0x10(%[dst])")
            __ASM_EMIT("movups      %%xmm2, 0x20(%[dst])")
            __ASM_EMIT("movups      %%xmm3, 0x30(%[dst])")

            __ASM_EMIT("add         $0x10, %[src]")
            __ASM_EMIT("add         $0x40, %[dst]")
            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jae         1b")

            __ASM_EMIT("2:")
            __ASM_EMIT("add         $4, %[count]")
            __ASM_EMIT("jle         10f")

            //-----------------------------------------------------------------
            // 1x - 3x block
            // Load last variable-sized chunk: odd element to lane 2, pair to
            // lanes 0-1 (same scheme as eff_hsla_hue)
            __ASM_EMIT("test        $1, %[count]")
            __ASM_EMIT("jz          4f")
            __ASM_EMIT("movss       0x00(%[src]), %%xmm3")
            __ASM_EMIT("add         $0x04, %[src]")
            __ASM_EMIT("movlhps     %%xmm3, %%xmm3")
            __ASM_EMIT("4:")
            __ASM_EMIT("test        $2, %[count]")
            __ASM_EMIT("jz          6f")
            __ASM_EMIT("movlps      0x00(%[src]), %%xmm3")
            __ASM_EMIT("6:")

            EFF_HSLA_ALPHA_CORE

            // Store last chunk (xmm2 = pixel from lane 2 = the odd element)
            __ASM_EMIT("test        $1, %[count]")
            __ASM_EMIT("jz          8f")
            __ASM_EMIT("movups      %%xmm2, 0x00(%[dst])")
            __ASM_EMIT("add         $0x10, %[dst]")
            __ASM_EMIT("8:")
            __ASM_EMIT("test        $2, %[count]")
            __ASM_EMIT("jz          10f")
            __ASM_EMIT("movups      %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("movups      %%xmm1, 0x10(%[dst])")

            __ASM_EMIT("10:")

            : [dst] "+r" (dst), [src] "+r" (v), [count] "+r" (count)
            : [eff] "r" (eff),
              [XC] "o" (EFF_HSLA_ALPHA_XC)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }
    #undef EFF_HSLA_ALPHA_CORE

    /* Packed constants: 1.0f x4 at offset 0x00, sign-bit abs() mask x4 at 0x10 */
    static const uint32_t EFF_HSLA_SAT_XC[] __lsp_aligned16 =
    {
        0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000,
        0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
    };

    /*
     * Saturation effect core.
     * Input:  xmm1 = v (4 values), xmm6 = T = thresh, xmm7 = KT = 1 / thresh
     * V = |v|; S = s * (V > T ? V : T); A = (V > T) ? 0 : (T - V) * KT
     * (the asm mask is the strict [0 > T-V]; at V == T both branches produce
     * identical values, so it matches the scalar reference's ">=")
     * Output: xmm0..xmm3 = four HSLA pixels (after X4_TRANSPOSE)
     */
    #define EFF_HSLA_SAT_CORE \
        /* xmm1 = v, xmm6 = T, xmm7 = KT */ \
        __ASM_EMIT("movaps      %%xmm6, %%xmm0")        /* xmm0 = T */ \
        __ASM_EMIT("andps       0x10 + %[XC], %%xmm1")  /* xmm1 = V = fabsf(v) */ \
        __ASM_EMIT("xorps       %%xmm5, %%xmm5")        /* xmm5 = 0 */ \
        __ASM_EMIT("subps       %%xmm1, %%xmm0")        /* xmm0 = T-V */ \
        __ASM_EMIT("cmpps       $6, %%xmm0, %%xmm5")    /* xmm5 = [0 > T-V], i.e. [V > T] */ \
        __ASM_EMIT("mulps       %%xmm7, %%xmm0")        /* xmm0 = (T-V)*KT */ \
        __ASM_EMIT("movaps      %%xmm5, %%xmm3")        /* xmm3 = [V > T] */ \
        __ASM_EMIT("movups      0x00(%[eff]), %%xmm2")  /* xmm2 = hsla */ \
        __ASM_EMIT("andps       %%xmm5, %%xmm1")        /* xmm1 = V & [V > T] */ \
        __ASM_EMIT("andnps      %%xmm0, %%xmm3")        /* xmm3 = A = (T-V)*KT & [V <= T] */ \
        __ASM_EMIT("andnps      %%xmm6, %%xmm5")        /* xmm5 = T & [V <= T] */ \
        __ASM_EMIT("movaps      %%xmm2, %%xmm0")        /* xmm0 = hsla */ \
        __ASM_EMIT("orps        %%xmm5, %%xmm1")        /* xmm1 = KS = (V & [V > T]) | (T & [V <= T]) */ \
        __ASM_EMIT("movaps      %%xmm2, %%xmm4")        /* xmm4 = hsla */ \
        __ASM_EMIT("shufps      $0x00, %%xmm0, %%xmm0") /* xmm0 = H */ \
        __ASM_EMIT("shufps      $0x55, %%xmm4, %%xmm4") /* xmm4 = s */ \
        __ASM_EMIT("shufps      $0xaa, %%xmm2, %%xmm2") /* xmm2 = L */ \
        __ASM_EMIT("mulps       %%xmm4, %%xmm1")        /* xmm1 = S = s * KS */ \
        \
        X4_TRANSPOSE

    /* Reference (scalar) implementation:

        kt      = 1.0f / eff->thresh;
        value   = (value >= 0.0f) ? value : -value;

        if (0 >= (eff->thresh - value))
        {
            dst[1]  = eff->s * value;
            dst[3]  = 0.0f;
        }
        else
        {
            dst[1]  = eff->s * eff->thresh;
            dst[3]  = (eff->thresh - value) * kt;
        }

        dst[0]  = eff->h;
        dst[2]  = eff->l;
    */

    /**
     * Saturation HSLA effect: scales eff->s by the source value and fades
     * alpha in below the threshold.
     *
     * @param dst   destination buffer, count*4 floats (H S L A per pixel)
     * @param v     source values, one float per pixel
     * @param eff   effect parameters; h/s/l/a are read at offset 0x00,
     *              the threshold at offset 0x10
     * @param count number of pixels to process
     */
    void eff_hsla_sat(float *dst, const float *v, const dsp::hsla_sat_eff_t *eff, size_t count)
    {
        ARCH_X86_ASM(
            /* Precompute loop invariants from eff->thresh */
            __ASM_EMIT("movss       0x10(%[eff]), %%xmm6")      /* xmm6 = t 0 0 0 */
            __ASM_EMIT("movaps      0x00 + %[XC], %%xmm7")      /* xmm7 = 1 */
            __ASM_EMIT("shufps      $0x00, %%xmm6, %%xmm6")     /* xmm6 = T */
            __ASM_EMIT("divps       %%xmm6, %%xmm7")            /* xmm7 = KT = 1 / t */

            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jb          2f")

            //-----------------------------------------------------------------
            // 4x blocks
            __ASM_EMIT("1:")

            __ASM_EMIT("movups      0x00(%[src]), %%xmm1")      /* xmm1 = v */
            EFF_HSLA_SAT_CORE

            /* Store 4 transposed HSLA pixels */
            __ASM_EMIT("movups      %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("movups      %%xmm1, 0x10(%[dst])")
            __ASM_EMIT("movups      %%xmm2, 0x20(%[dst])")
            __ASM_EMIT("movups      %%xmm3, 0x30(%[dst])")

            __ASM_EMIT("add         $0x10, %[src]")
            __ASM_EMIT("add         $0x40, %[dst]")
            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jae         1b")

            __ASM_EMIT("2:")
            __ASM_EMIT("add         $4, %[count]")
            __ASM_EMIT("jle         10f")

            //-----------------------------------------------------------------
            // 1x - 3x block
            // Load last variable-sized chunk: odd element to lane 2, pair to
            // lanes 0-1 (same scheme as eff_hsla_hue)
            __ASM_EMIT("test        $1, %[count]")
            __ASM_EMIT("jz          4f")
            __ASM_EMIT("movss       0x00(%[src]), %%xmm1")
            __ASM_EMIT("add         $0x04, %[src]")
            __ASM_EMIT("movlhps     %%xmm1, %%xmm1")
            __ASM_EMIT("4:")
            __ASM_EMIT("test        $2, %[count]")
            __ASM_EMIT("jz          6f")
            __ASM_EMIT("movlps      0x00(%[src]), %%xmm1")
            __ASM_EMIT("6:")

            EFF_HSLA_SAT_CORE

            // Store last chunk (xmm2 = pixel from lane 2 = the odd element)
            __ASM_EMIT("test        $1, %[count]")
            __ASM_EMIT("jz          8f")
            __ASM_EMIT("movups      %%xmm2, 0x00(%[dst])")
            __ASM_EMIT("add         $0x10, %[dst]")
            __ASM_EMIT("8:")
            __ASM_EMIT("test        $2, %[count]")
            __ASM_EMIT("jz          10f")
            __ASM_EMIT("movups      %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("movups      %%xmm1, 0x10(%[dst])")

            __ASM_EMIT("10:")

            : [dst] "+r" (dst), [src] "+r" (v), [count] "+r" (count)
            : [eff] "r" (eff),
              [XC] "o" (EFF_HSLA_SAT_XC)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    #undef EFF_HSLA_SAT_CORE

    /* Packed constants: 1.0f x4 at offset 0x00, sign-bit abs() mask x4 at 0x10 */
    static const uint32_t EFF_HSLA_LIGHT_XC[] __lsp_aligned16 =
    {
        0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000,
        0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
    };

    /*
     * Lightness effect core — same structure as EFF_HSLA_SAT_CORE, but the
     * coefficient scales eff->l instead of eff->s.
     * Input:  xmm2 = v (4 values), xmm6 = T = thresh, xmm7 = KT = 1 / thresh
     * V = |v|; L = l * (V > T ? V : T); A = (V > T) ? 0 : (T - V) * KT
     * Output: xmm0..xmm3 = four HSLA pixels (after X4_TRANSPOSE)
     */
    #define EFF_HSLA_LIGHT_CORE \
        /* xmm2 = v, xmm6 = T, xmm7 = KT */ \
        __ASM_EMIT("movaps      %%xmm6, %%xmm0")        /* xmm0 = T */ \
        __ASM_EMIT("andps       0x10 + %[XC], %%xmm2")  /* xmm2 = V = fabsf(v) */ \
        __ASM_EMIT("xorps       %%xmm5, %%xmm5")        /* xmm5 = 0 */ \
        __ASM_EMIT("subps       %%xmm2, %%xmm0")        /* xmm0 = T-V */ \
        __ASM_EMIT("cmpps       $6, %%xmm0, %%xmm5")    /* xmm5 = [0 > T-V], i.e. [V > T] */ \
        __ASM_EMIT("mulps       %%xmm7, %%xmm0")        /* xmm0 = (T-V)*KT */ \
        __ASM_EMIT("movaps      %%xmm5, %%xmm3")        /* xmm3 = [V > T] */ \
        __ASM_EMIT("movups      0x00(%[eff]), %%xmm1")  /* xmm1 = hsla */ \
        __ASM_EMIT("andps       %%xmm5, %%xmm2")        /* xmm2 = V & [V > T] */ \
        __ASM_EMIT("andnps      %%xmm0, %%xmm3")        /* xmm3 = A = (T-V)*KT & [V <= T] */ \
        __ASM_EMIT("andnps      %%xmm6, %%xmm5")        /* xmm5 = T & [V <= T] */ \
        __ASM_EMIT("movaps      %%xmm1, %%xmm0")        /* xmm0 = hsla */ \
        __ASM_EMIT("orps        %%xmm5, %%xmm2")        /* xmm2 = KL = (V & [V > T]) | (T & [V <= T]) */ \
        __ASM_EMIT("movaps      %%xmm1, %%xmm4")        /* xmm4 = hsla */ \
        __ASM_EMIT("shufps      $0x00, %%xmm0, %%xmm0") /* xmm0 = H */ \
        __ASM_EMIT("shufps      $0xaa, %%xmm4, %%xmm4") /* xmm4 = l */ \
        __ASM_EMIT("shufps      $0x55, %%xmm1, %%xmm1") /* xmm1 = S */ \
        __ASM_EMIT("mulps       %%xmm4, %%xmm2")        /* xmm2 = L = l * KL */ \
        \
        X4_TRANSPOSE

    /* Reference (scalar) implementation:

        value   = (value >= 0.0f) ? value : -value;

        if (0 >= (eff->thresh - value))
        {
            dst[2]  = eff->l * value;
            dst[3]  = 0.0f;
        }
        else
        {
            dst[2]  = eff->l * eff->thresh;
            dst[3]  = (eff->thresh - value) * kt;
        }

        dst[0]  = eff->h;
        dst[1]  = eff->s;
    */

    /**
     * Lightness HSLA effect: scales eff->l by the source value and fades
     * alpha in below the threshold.
     *
     * @param dst   destination buffer, count*4 floats (H S L A per pixel)
     * @param v     source values, one float per pixel
     * @param eff   effect parameters; h/s/l/a are read at offset 0x00,
     *              the threshold at offset 0x10
     * @param count number of pixels to process
     */
    void eff_hsla_light(float *dst, const float *v, const dsp::hsla_light_eff_t *eff, size_t count)
    {
        ARCH_X86_ASM(
            /* Precompute loop invariants from eff->thresh */
            __ASM_EMIT("movss       0x10(%[eff]), %%xmm6")      /* xmm6 = t 0 0 0 */
            __ASM_EMIT("movaps      0x00 + %[XC], %%xmm7")      /* xmm7 = 1 */
            __ASM_EMIT("shufps      $0x00, %%xmm6, %%xmm6")     /* xmm6 = T */
            __ASM_EMIT("divps       %%xmm6, %%xmm7")            /* xmm7 = KT = 1 / t */

            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jb          2f")

            //-----------------------------------------------------------------
            // 4x blocks
            __ASM_EMIT("1:")

            __ASM_EMIT("movups      0x00(%[src]), %%xmm2")      /* xmm2 = v */
            EFF_HSLA_LIGHT_CORE

            /* Store 4 transposed HSLA pixels */
            __ASM_EMIT("movups      %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("movups      %%xmm1, 0x10(%[dst])")
            __ASM_EMIT("movups      %%xmm2, 0x20(%[dst])")
            __ASM_EMIT("movups      %%xmm3, 0x30(%[dst])")

            __ASM_EMIT("add         $0x10, %[src]")
            __ASM_EMIT("add         $0x40, %[dst]")
            __ASM_EMIT("sub         $4, %[count]")
            __ASM_EMIT("jae         1b")

            __ASM_EMIT("2:")
            __ASM_EMIT("add         $4, %[count]")
            __ASM_EMIT("jle         10f")

            //-----------------------------------------------------------------
            // 1x - 3x block
            // Load last variable-sized chunk: odd element to lane 2, pair to
            // lanes 0-1 (same scheme as eff_hsla_hue)
            __ASM_EMIT("test        $1, %[count]")
            __ASM_EMIT("jz          4f")
            __ASM_EMIT("movss       0x00(%[src]), %%xmm2")
            __ASM_EMIT("add         $0x04, %[src]")
            __ASM_EMIT("movlhps     %%xmm2, %%xmm2")
            __ASM_EMIT("4:")
            __ASM_EMIT("test        $2, %[count]")
            __ASM_EMIT("jz          6f")
            __ASM_EMIT("movlps      0x00(%[src]), %%xmm2")
            __ASM_EMIT("6:")

            EFF_HSLA_LIGHT_CORE

            // Store last chunk (xmm2 = pixel from lane 2 = the odd element)
            __ASM_EMIT("test        $1, %[count]")
            __ASM_EMIT("jz          8f")
            __ASM_EMIT("movups      %%xmm2, 0x00(%[dst])")
            __ASM_EMIT("add         $0x10, %[dst]")
            __ASM_EMIT("8:")
            __ASM_EMIT("test        $2, %[count]")
            __ASM_EMIT("jz          10f")
            __ASM_EMIT("movups      %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("movups      %%xmm1, 0x10(%[dst])")

            __ASM_EMIT("10:")

            : [dst] "+r" (dst), [src] "+r" (v), [count] "+r" (count)
            : [eff] "r" (eff),
              [XC] "o" (EFF_HSLA_LIGHT_XC)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    #undef EFF_HSLA_LIGHT_CORE

}

#undef X4_TRANSPOSE

#endif /* DSP_ARCH_X86_SSE_GRAPHICS_EFFECTS_H_ */