/*
 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
 *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
 *
 * This file is part of lsp-plugins
 * Created on: 15 Nov 2018
 *
 * lsp-plugins is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * lsp-plugins is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
 */

#ifndef INCLUDE_DSP_ARCH_X86_SSE2_GRAPHICS_H_
#define INCLUDE_DSP_ARCH_X86_SSE2_GRAPHICS_H_

#ifndef DSP_ARCH_X86_SSE2_IMPL
    #error "This header should not be included directly"
#endif /* DSP_ARCH_X86_SSE2_IMPL */

/*
 * SSE2 implementations of color-space conversion routines.
 * All pixel buffers are processed 4 pixels (4x4 floats) at a time via the
 * macros below; each macro keeps the data transposed from AoS (per-pixel)
 * layout into SoA (per-component) layout and back using HSLA_TRANSPOSE.
 * Only xmm0-xmm7 are used, so intermediate vectors are spilled into
 * 16-byte-aligned scratch structures passed via the [HSLM]/[RGBM] operands.
 */
namespace sse2
{
    /* Expand a scalar into a 4-lane vector constant */
    #define FVEC4(x) x, x, x, x

    /* Constant table for hsla_to_rgba; offsets referenced as 0x00..0x50 + %[XC] */
    static const float HSL_RGB[] __lsp_aligned16 =
    {
        FVEC4(0.5f),            // 1/2
        FVEC4(0.333333333333f), // 1/3
        FVEC4(1.0f),            // 1
        FVEC4(6.0f),            // 6
        FVEC4(0.166666666667f), // 1/6
        FVEC4(0.666666666667f)  // 2/3
    };

    /* Constant table for rgba_to_hsla; offsets referenced as 0x00..0x50 + %[XC] */
    static const float RGB_HSL[] __lsp_aligned16 =
    {
        FVEC4(4.0f),
        FVEC4(2.0f),
        FVEC4(6.0f),
        FVEC4(1.0f),
        FVEC4(0.5f),
        FVEC4(0.166666666667f)  // 1/6
    };

    /* Constant table for rgba_to_bgra32: float -> 8-bit scale factor */
    static const float RGBA_TO_BGRA32[] __lsp_aligned16 =
    {
        FVEC4(255.0f)
    };

    #undef FVEC4

    /*
     * 4x4 in-register matrix transpose of xmm0..xmm3 (xmm4 is used as scratch).
     * Converts four packed pixels (one pixel per register) into four component
     * vectors (one component per register), and vice versa - the operation is
     * its own inverse.
     */
    #define HSLA_TRANSPOSE \
        __ASM_EMIT("movaps %%xmm2, %%xmm4") \
        __ASM_EMIT("punpckldq %%xmm3, %%xmm2") \
        __ASM_EMIT("punpckhdq %%xmm3, %%xmm4") \
        __ASM_EMIT("movaps %%xmm0, %%xmm3") \
        __ASM_EMIT("punpckldq %%xmm1, %%xmm0") \
        __ASM_EMIT("punpckhdq %%xmm1, %%xmm3") \
        __ASM_EMIT("movaps %%xmm0, %%xmm1") \
        __ASM_EMIT("punpcklqdq %%xmm2, %%xmm0") \
        __ASM_EMIT("punpckhqdq %%xmm2, %%xmm1") \
        __ASM_EMIT("movaps %%xmm3, %%xmm2") \
        __ASM_EMIT("punpcklqdq %%xmm4, %%xmm2") \
        __ASM_EMIT("punpckhqdq %%xmm4, %%xmm3")

    /*
     * Convert 4 HSLA pixels (xmm0..xmm3, one pixel per register) to 4 RGBA
     * pixels in place. Implements the classic two-temporary HSL->RGB algorithm
     * (see the reference C code after hsla_to_rgba below). Requires:
     *   %[XC]   - the HSL_RGB constant table,
     *   %[HSLM] - a 16-byte-aligned 12-vector scratch area (see hsla_to_rgba).
     */
    #define HSLA_TO_RGBA_CORE \
        /* Transpose */\
        HSLA_TRANSPOSE \
        \
        /* xmm0 = h0 h1 h2 h3 = H */ \
        /* xmm1 = s0 s1 s2 s3 = S */ \
        /* xmm2 = l0 l1 l2 l3 = L */ \
        /* xmm3 = a0 a1 a2 a3 = A */ \
        /* Calc temp1 (T1) and temp2 (T2) */ \
        __ASM_EMIT("movaps %%xmm1, %%xmm6") /* xmm6 = S */ \
        __ASM_EMIT("movaps %%xmm2, %%xmm7") /* xmm7 = L */ \
        __ASM_EMIT("addps %%xmm2, %%xmm6") /* xmm6 = L + S */ \
        __ASM_EMIT("mulps %%xmm1, %%xmm7") /* xmm7 = L * S */ \
        __ASM_EMIT("subps %%xmm7, %%xmm6") /* xmm6 = L + S - L * S */ \
        __ASM_EMIT("addps %%xmm2, %%xmm7") /* xmm7 = L + L * S */ \
        __ASM_EMIT("movaps %%xmm2, %%xmm5") /* xmm5 = L */ \
        __ASM_EMIT("movaps 0x00 + %[XC], %%xmm4") /* xmm4 = 0.5 */ \
        __ASM_EMIT("addps %%xmm2, %%xmm5") /* xmm5 = L + L */ \
        __ASM_EMIT("cmpps $2, %%xmm2, %%xmm4") /* xmm4 = [L >= 0.5f] */ \
        __ASM_EMIT("andps %%xmm4, %%xmm6") /* xmm6 = [L >= 0.5f] & (L+S - L*S) */ \
        __ASM_EMIT("andnps %%xmm7, %%xmm4") /* xmm4 = [L < 0.5f] & (L + L*S) */ \
        __ASM_EMIT("orps %%xmm6, %%xmm4") /* xmm4 = T2 = ([L < 0.5f] & (L + L*S)) | ([L >= 0.5f] & (L+S - L*S)) */ \
        __ASM_EMIT("movaps %%xmm0, %%xmm1") /* xmm1 = TG = H */ \
        __ASM_EMIT("subps %%xmm4, %%xmm5") /* xmm5 = T1 = L + L - T2 */ \
        __ASM_EMIT("movaps %%xmm0, %%xmm2") /* xmm2 = H */ \
        \
        /* Per-channel hue offsets: TR = H + 1/3, TG = H, TB = H - 1/3 */ \
        __ASM_EMIT("movaps 0x10 + %[XC], %%xmm6") /* xmm6 = 1/3 */ \
        __ASM_EMIT("addps %%xmm6, %%xmm0") /* xmm0 = H + 1/3 */ \
        __ASM_EMIT("subps %%xmm6, %%xmm2") /* xmm2 = H - 1/3 */ \
        \
        /* Wrap TR into [0, 1): subtract 1 where it overflowed */ \
        __ASM_EMIT("movaps 0x20 + %[XC], %%xmm7") /* xmm7 = 1 */ \
        __ASM_EMIT("movaps %%xmm0, %%xmm6") /* xmm6 = H + 1/3 */ \
        __ASM_EMIT("subps %%xmm7, %%xmm6") /* xmm6 = H + 1/3 - 1 */ \
        __ASM_EMIT("cmpps $5, %%xmm0, %%xmm7") /* xmm7 = [(H + 1/3) <= 1] */ \
        __ASM_EMIT("andps %%xmm7, %%xmm0") /* xmm0 = (H + 1/3) & [(H + 1/3) <= 1] */ \
        __ASM_EMIT("andnps %%xmm6, %%xmm7") /* xmm7 = (H + 1/3 - 1) & [(H + 1/3) > 1] */ \
        __ASM_EMIT("orps %%xmm7, %%xmm0") /* xmm0 = TR = ((H + 1/3) & [(H + 1/3) <= 1]) | ((H + 1/3 - 1) & [(H + 1/3) > 1]) */ \
        \
        /* Wrap TB into [0, 1): add 1 where it underflowed */ \
        __ASM_EMIT("movaps 0x20 + %[XC], %%xmm7") /* xmm7 = 1 */ \
        __ASM_EMIT("movaps %%xmm2, %%xmm6") /* xmm6 = H - 1/3 */ \
        __ASM_EMIT("addps %%xmm7, %%xmm6") /* xmm6 = H - 1/3 + 1 */ \
        __ASM_EMIT("xorps %%xmm7, %%xmm7") /* xmm7 = 0 */ \
        __ASM_EMIT("cmpps $2, %%xmm2, %%xmm7") /* xmm7 = [(H - 1/3) >= 0] */ \
        __ASM_EMIT("andps %%xmm7, %%xmm2") /* xmm2 = (H - 1/3) & [(H - 1/3) >= 0] */ \
        __ASM_EMIT("andnps %%xmm6, %%xmm7") /* xmm7 = (H - 1/3 + 1) & [(H - 1/3) < 0] */ \
        __ASM_EMIT("orps %%xmm7, %%xmm2") /* xmm2 = TB = ((H - 1/3) & [(H - 1/3) >= 0]) | ((H - 1/3 + 1) & [(H - 1/3) < 0]) */ \
        \
        __ASM_EMIT("movaps %%xmm4, %%xmm6") /* xmm6 = T2 */ \
        __ASM_EMIT("subps %%xmm5, %%xmm6") /* xmm6 = T2 - T1 */ \
        __ASM_EMIT("mulps 0x30 + %[XC], %%xmm6") /* xmm6 = K = (T2 - T1)*6.0 */ \
        \
        /* xmm0 = TR */ \
        /* xmm1 = TG */ \
        /* xmm2 = TB */ \
        /* xmm3 = A */ \
        /* xmm4 = T2 */ \
        /* xmm5 = T1 */ \
        /* xmm6 = K */ \
        /* Spill intermediates: not enough registers (only xmm0-xmm7) */ \
        __ASM_EMIT("movaps %%xmm0, 0x00(%[HSLM])") /* TR */ \
        __ASM_EMIT("movaps %%xmm1, 0x10(%[HSLM])") /* TG */ \
        __ASM_EMIT("movaps %%xmm2, 0x20(%[HSLM])") /* TB */ \
        __ASM_EMIT("movaps %%xmm3, 0x30(%[HSLM])") /* A */ \
        __ASM_EMIT("movaps %%xmm4, 0x40(%[HSLM])") /* T2 */ \
        __ASM_EMIT("movaps %%xmm5, 0x50(%[HSLM])") /* T1 */ \
        \
        /* Precompute both candidate ramps for each channel: */ \
        /* KTx = T1 + K*Tx (rising edge), RTx = T1 + K*(2/3 - Tx) (falling edge) */ \
        __ASM_EMIT("movaps 0x50 + %[XC], %%xmm3") /* xmm3 = 2/3 */ \
        __ASM_EMIT("movaps %%xmm5, %%xmm7") /* xmm7 = T1 */ \
        __ASM_EMIT("mulps %%xmm6, %%xmm0") /* xmm0 = k*TR */ \
        __ASM_EMIT("mulps %%xmm6, %%xmm3") /* xmm3 = K * 2/3 */ \
        __ASM_EMIT("mulps %%xmm6, %%xmm1") /* xmm1 = k*TG */ \
        __ASM_EMIT("movaps %%xmm3, %%xmm4") /* xmm4 = K * 2/3 */ \
        __ASM_EMIT("mulps %%xmm6, %%xmm2") /* xmm2 = k*TB */ \
        __ASM_EMIT("movaps %%xmm3, %%xmm5") /* xmm5 = K * 2/3 */ \
        __ASM_EMIT("subps %%xmm0, %%xmm3") /* xmm3 = K * (2/3 - TR) */ \
        __ASM_EMIT("subps %%xmm1, %%xmm4") /* xmm4 = K * (2/3 - TG) */ \
        __ASM_EMIT("subps %%xmm2, %%xmm5") /* xmm5 = K * (2/3 - TB) */ \
        __ASM_EMIT("addps %%xmm7, %%xmm0") /* xmm0 = KTR = k*TR + T1 */ \
        __ASM_EMIT("addps %%xmm7, %%xmm1") /* xmm1 = KTG = k*TG + T1 */ \
        __ASM_EMIT("addps %%xmm7, %%xmm2") /* xmm2 = KTB = k*TB + T1 */ \
        __ASM_EMIT("addps %%xmm7, %%xmm3") /* xmm3 = RTR = K * (2/3 - TR) + T1 */ \
        __ASM_EMIT("addps %%xmm7, %%xmm4") /* xmm4 = RTG = K * (2/3 - TG) + T1 */ \
        __ASM_EMIT("addps %%xmm7, %%xmm5") /* xmm5 = RTB = K * (2/3 - TB) + T1 */ \
        \
        __ASM_EMIT("movaps %%xmm0, 0x60(%[HSLM])") /* KTR */ \
        __ASM_EMIT("movaps %%xmm1, 0x70(%[HSLM])") /* KTG */ \
        __ASM_EMIT("movaps %%xmm2, 0x80(%[HSLM])") /* KTB */ \
        __ASM_EMIT("movaps %%xmm3, 0x90(%[HSLM])") /* RTR */ \
        __ASM_EMIT("movaps %%xmm4, 0xa0(%[HSLM])") /* RTG */ \
        __ASM_EMIT("movaps %%xmm5, 0xb0(%[HSLM])") /* RTB */ \
        \
        /* Now we have enough data to process */ \
        __ASM_EMIT("movaps 0x00(%[HSLM]), %%xmm0") /* xmm0 = TR */ \
        __ASM_EMIT("movaps 0x10(%[HSLM]), %%xmm1") /* xmm1 = TG */ \
        __ASM_EMIT("movaps 0x20(%[HSLM]), %%xmm2") /* xmm2 = TB */ \
        __ASM_EMIT("movaps 0x30(%[HSLM]), %%xmm3") /* xmm3 = A */ \
        \
        /* Process red: branchless select between KTR / T2 / RTR / T1 by range of TR */ \
        __ASM_EMIT("movaps %%xmm0, %%xmm5") /* xmm5 = TR */ \
        __ASM_EMIT("movaps %%xmm0, %%xmm6") /* xmm6 = TR */ \
        __ASM_EMIT("cmpps $1, 0x00 + %[XC], %%xmm0") /* xmm0 = [ TR < 0.5 ] */ \
        __ASM_EMIT("cmpps $1, 0x40 + %[XC], %%xmm5") /* xmm5 = [ TR < 1/6 ] */ \
        __ASM_EMIT("cmpps $1, 0x50 + %[XC], %%xmm6") /* xmm6 = [ TR < 2/3 ] */ \
        __ASM_EMIT("movaps %%xmm5, %%xmm7") /* xmm7 = [ TR < 1/6 ] */ \
        __ASM_EMIT("andnps %%xmm0, %%xmm7") /* xmm7 = [ TR >= 1/6 ] & [ TR < 0.5 ] */ \
        __ASM_EMIT("andnps %%xmm6, %%xmm0") /* xmm0 = [ TR >= 0.5 ] & [ TR < 2/3 ] */ \
        __ASM_EMIT("andps 0x60(%[HSLM]), %%xmm5") /* xmm5 = KTR & [ TR < 1/6 ] */ \
        __ASM_EMIT("andps 0x40(%[HSLM]), %%xmm7") /* xmm7 = T2 & [ TR >= 1/6 ] & [ TR < 0.5 ] */ \
        __ASM_EMIT("andnps 0x50(%[HSLM]), %%xmm6") /* xmm6 = T1 & [ TR >= 2/3 ] */ \
        __ASM_EMIT("andps 0x90(%[HSLM]), %%xmm0") /* xmm0 = RTR & [ TR >= 0.5 ] & [ TR < 2/3 ] */ \
        __ASM_EMIT("orps %%xmm7, %%xmm6") \
        __ASM_EMIT("orps %%xmm5, %%xmm0") \
        __ASM_EMIT("orps %%xmm6, %%xmm0") \
        \
        /* Process green: same selection with KTG / T2 / RTG / T1 */ \
        __ASM_EMIT("movaps %%xmm1, %%xmm5") /* xmm5 = TG */ \
        __ASM_EMIT("movaps %%xmm1, %%xmm6") /* xmm6 = TG */ \
        __ASM_EMIT("cmpps $1, 0x00 + %[XC], %%xmm1") /* xmm1 = [ TG < 0.5 ] */ \
        __ASM_EMIT("cmpps $1, 0x40 + %[XC], %%xmm5") /* xmm5 = [ TG < 1/6 ] */ \
        __ASM_EMIT("cmpps $1, 0x50 + %[XC], %%xmm6") /* xmm6 = [ TG < 2/3 ] */ \
        __ASM_EMIT("movaps %%xmm5, %%xmm7") /* xmm7 = [ TG < 1/6 ] */ \
        __ASM_EMIT("andnps %%xmm1, %%xmm7") /* xmm7 = [ TG >= 1/6 ] & [ TG < 0.5 ] */ \
        __ASM_EMIT("andnps %%xmm6, %%xmm1") /* xmm1 = [ TG >= 0.5 ] & [ TG < 2/3 ] */ \
        __ASM_EMIT("andps 0x70(%[HSLM]), %%xmm5") /* xmm5 = KTG & [ TG < 1/6 ] */ \
        __ASM_EMIT("andps 0x40(%[HSLM]), %%xmm7") /* xmm7 = T2 & [ TG >= 1/6 ] & [ TG < 0.5 ] */ \
        __ASM_EMIT("andnps 0x50(%[HSLM]), %%xmm6") /* xmm6 = T1 & [ TG >= 2/3 ] */ \
        __ASM_EMIT("andps 0xa0(%[HSLM]), %%xmm1") /* xmm1 = RTG & [ TG >= 0.5 ] & [ TG < 2/3 ] */ \
        __ASM_EMIT("orps %%xmm7, %%xmm6") \
        __ASM_EMIT("orps %%xmm5, %%xmm1") \
        __ASM_EMIT("orps %%xmm6, %%xmm1") \
        \
        /* Process blue: same selection with KTB / T2 / RTB / T1 */ \
        __ASM_EMIT("movaps %%xmm2, %%xmm5") /* xmm5 = TB */ \
        __ASM_EMIT("movaps %%xmm2, %%xmm6") /* xmm6 = TB */ \
        __ASM_EMIT("cmpps $1, 0x00 + %[XC], %%xmm2") /* xmm2 = [ TB < 0.5 ] */ \
        __ASM_EMIT("cmpps $1, 0x40 + %[XC], %%xmm5") /* xmm5 = [ TB < 1/6 ] */ \
        __ASM_EMIT("cmpps $1, 0x50 + %[XC], %%xmm6") /* xmm6 = [ TB < 2/3 ] */ \
        __ASM_EMIT("movaps %%xmm5, %%xmm7") /* xmm7 = [ TB < 1/6 ] */ \
        __ASM_EMIT("andnps %%xmm2, %%xmm7") /* xmm7 = [ TB >= 1/6 ] & [ TB < 0.5 ] */ \
        __ASM_EMIT("andnps %%xmm6, %%xmm2") /* xmm2 = [ TB >= 0.5 ] & [ TB < 2/3 ] */ \
        __ASM_EMIT("andps 0x80(%[HSLM]), %%xmm5") /* xmm5 = KTB & [ TB < 1/6 ] */ \
        __ASM_EMIT("andps 0x40(%[HSLM]), %%xmm7") /* xmm7 = T2 & [ TB >= 1/6 ] & [ TB < 0.5 ] */ \
        __ASM_EMIT("andnps 0x50(%[HSLM]), %%xmm6") /* xmm6 = T1 & [ TB >= 2/3 ] */ \
        __ASM_EMIT("andps 0xb0(%[HSLM]), %%xmm2") /* xmm2 = RTB & [ TB >= 0.5 ] & [ TB < 2/3 ] */ \
        __ASM_EMIT("orps %%xmm7, %%xmm6") \
        __ASM_EMIT("orps %%xmm5, %%xmm2") \
        __ASM_EMIT("orps %%xmm6, %%xmm2") \
        \
        /* Transpose final result back */ \
        HSLA_TRANSPOSE

    /*
     * Convert a buffer of HSLA pixels to RGBA pixels.
     *
     * @param dst   destination buffer (count pixels, 4 floats per pixel)
     * @param src   source buffer of HSLA pixels (4 floats per pixel);
     *              may be unaligned (movups loads/stores are used)
     * @param count number of pixels to process
     *
     * Note: dst and src advance by 0x40 bytes per 4 pixels, i.e. each pixel
     * is exactly 16 bytes. For the 1..3-pixel tail, registers not loaded from
     * memory hold garbage but the matching stores are skipped as well.
     */
    void hsla_to_rgba(float *dst, const float *src, size_t count)
    {
        /* Aligned scratch area matching the 0x00..0xb0(%[HSLM]) offsets
           used by HSLA_TO_RGBA_CORE */
        #pragma pack(push, 1)
        struct {
            float tr[4], tg[4], tb[4], a[4];
            float t2[4], t1[4];
            float ktr[4], ktg[4], ktb[4];
            float rtr[4], rtg[4], rtb[4];
        } hslm __lsp_aligned16;
        #pragma pack(pop)

        ARCH_X86_ASM
        (
            __ASM_EMIT("sub $4, %[count]")
            __ASM_EMIT("jb 2f")

            //-----------------------------------------------------------------
            // 4x blocks
            __ASM_EMIT("1:")
            __ASM_EMIT("movups 0x00(%[src]), %%xmm0") // xmm0 = h0 s0 l0 a0
            __ASM_EMIT("movups 0x10(%[src]), %%xmm1") // xmm1 = h1 s1 l1 a1
            __ASM_EMIT("movups 0x20(%[src]), %%xmm2") // xmm2 = h2 s2 l2 a2
            __ASM_EMIT("movups 0x30(%[src]), %%xmm3") // xmm3 = h3 s3 l3 a3

            HSLA_TO_RGBA_CORE

            // Store result
            __ASM_EMIT("movups %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("movups %%xmm1, 0x10(%[dst])")
            __ASM_EMIT("movups %%xmm2, 0x20(%[dst])")
            __ASM_EMIT("movups %%xmm3, 0x30(%[dst])")

            // Repeat loop
            __ASM_EMIT("add $0x40, %[src]")
            __ASM_EMIT("add $0x40, %[dst]")
            __ASM_EMIT("sub $4, %[count]")
            __ASM_EMIT("jae 1b")

            __ASM_EMIT("2:")
            __ASM_EMIT("add $4, %[count]")
            __ASM_EMIT("jle 10f")

            //-----------------------------------------------------------------
            // 1x - 3x block
            // Load last variable-sized chunk
            __ASM_EMIT("test $1, %[count]")
            __ASM_EMIT("jz 4f")
            __ASM_EMIT("movups 0x00(%[src]), %%xmm0")
            __ASM_EMIT("add $0x10, %[src]")
            __ASM_EMIT("4:")
            __ASM_EMIT("test $2, %[count]")
            __ASM_EMIT("jz 6f")
            __ASM_EMIT("movups 0x00(%[src]), %%xmm1")
            __ASM_EMIT("movups 0x10(%[src]), %%xmm2")
            __ASM_EMIT("6:")

            HSLA_TO_RGBA_CORE

            // Store last chunk (mirrors the load pattern above)
            __ASM_EMIT("test $1, %[count]")
            __ASM_EMIT("jz 8f")
            __ASM_EMIT("movups %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("add $0x10, %[dst]")
            __ASM_EMIT("8:")
            __ASM_EMIT("test $2, %[count]")
            __ASM_EMIT("jz 10f")
            __ASM_EMIT("movups %%xmm1, 0x00(%[dst])")
            __ASM_EMIT("movups %%xmm2, 0x10(%[dst])")

            __ASM_EMIT("10:")

            : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
            : [XC] "o" (HSL_RGB), [HSLM] "r" (&hslm)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
        /* Reference scalar algorithm implemented by HSLA_TO_RGBA_CORE:

        //Set the temporary values
        if  (HSL_RGB_0_5 > L)
            temp2 = (L + S) - (L * S)
        else
            temp2 = L + (L * S);

        temp1 = L + L - temp2;

        tempr = H + HSL_RGB_1_3;
        tempg = H;
        tempb = H - HSL_RGB_1_3;

        if (tempr > 1.0f)
            tempr -= 1.0f;
        if (tempb < 0.0f)
            tempb += 1.0f;

        k = (temp2 - temp1) * 6.0f;

        //Red
        if (tempr < HSL_RGB_0_5)
            R = (tempr < HSL_RGB_1_6) ? temp1 + k * tempr : temp2;
        else
            R = (tempr < HSL_RGB_2_3) ? temp1 + k * (HSL_RGB_2_3 - tempr) : temp1;

        //Green
        if (tempg < HSL_RGB_0_5)
            G = (tempg < HSL_RGB_1_6) ? temp1 + k * tempg : temp2;
        else
            G = (tempg < HSL_RGB_2_3) ? temp1 + k * (HSL_RGB_2_3 - tempg) : temp1;

        //Blue
        if (tempb < HSL_RGB_0_5)
            B = (tempb < HSL_RGB_1_6) ? temp1 + k * tempb : temp2;
        else
            B = (tempb < HSL_RGB_2_3) ? temp1 + k * (HSL_RGB_2_3 - tempb) : temp1;
        */
    }

    #undef HSLA_TO_RGBA_CORE

    /*
     * Convert 4 RGBA pixels (xmm0..xmm3, one pixel per register) to 4 HSLA
     * pixels in place (see the reference C code before rgba_to_hsla below).
     * Requires:
     *   %[XC]   - the RGB_HSL constant table,
     *   %[RGBM] - a 16-byte-aligned 6-vector scratch area (see rgba_to_hsla).
     */
    #define RGBA_TO_HSLA_CORE \
        /* Transpose */\
        HSLA_TRANSPOSE \
        \
        /* xmm0 = r0 r1 r2 r3 = R */ \
        /* xmm1 = g0 g1 g2 g3 = G */ \
        /* xmm2 = b0 b1 b2 b3 = B */ \
        /* xmm3 = a0 a1 a2 a3 = A */ \
        __ASM_EMIT("movaps %%xmm0, %%xmm6") \
        __ASM_EMIT("movaps %%xmm1, %%xmm7") \
        __ASM_EMIT("minps %%xmm2, %%xmm6") \
        __ASM_EMIT("maxps %%xmm0, %%xmm7") \
        __ASM_EMIT("minps %%xmm1, %%xmm6") /* xmm6 = CMIN */ \
        __ASM_EMIT("maxps %%xmm2, %%xmm7") /* xmm7 = CMAX */ \
        __ASM_EMIT("movaps %%xmm0, 0x00(%[RGBM])") /* R */ \
        __ASM_EMIT("movaps %%xmm1, 0x10(%[RGBM])") /* G */ \
        __ASM_EMIT("movaps %%xmm2, 0x20(%[RGBM])") /* B */ \
        __ASM_EMIT("movaps %%xmm3, 0x30(%[RGBM])") /* A */ \
        __ASM_EMIT("movaps %%xmm6, 0x40(%[RGBM])") /* CMIN */ \
        __ASM_EMIT("movaps %%xmm7, 0x50(%[RGBM])") /* CMAX */ \
        __ASM_EMIT("movaps %%xmm7, %%xmm5") /* xmm5 = CMAX */ \
        __ASM_EMIT("subps %%xmm6, %%xmm7") /* xmm7 = D = CMAX - CMIN */ \
        \
        /* Compute all three hue candidates (one is picked below) */ \
        __ASM_EMIT("movaps %%xmm0, %%xmm3") /* xmm3 = R */ \
        __ASM_EMIT("subps %%xmm1, %%xmm0") /* xmm0 = R - G */ \
        __ASM_EMIT("subps %%xmm2, %%xmm1") /* xmm1 = G - B */ \
        __ASM_EMIT("divps %%xmm7, %%xmm0") /* xmm0 = (R-G)/D */ \
        __ASM_EMIT("subps %%xmm3, %%xmm2") /* xmm2 = B - R */ \
        __ASM_EMIT("addps 0x00 + %[XC], %%xmm0") /* xmm0 = HB = (R-G)/D + 4 */ \
        __ASM_EMIT("divps %%xmm7, %%xmm2") /* xmm2 = (B-R)/D */ \
        __ASM_EMIT("xorps %%xmm3, %%xmm3") /* xmm3 = 0 */ \
        __ASM_EMIT("divps %%xmm7, %%xmm1") /* xmm1 = (G-B)/D */ \
        __ASM_EMIT("addps 0x10 + %[XC], %%xmm2") /* xmm2 = HG = (B-R)/D + 2 */ \
        __ASM_EMIT("cmpps $6, %%xmm1, %%xmm3") /* xmm3 = (G-B)/D < 0 */ \
        __ASM_EMIT("andps 0x20 + %[XC], %%xmm3") /* xmm3 = [ (G-B)/D < 0 ] & 6 */ \
        __ASM_EMIT("addps %%xmm3, %%xmm1") /* xmm1 = HR = (G-B)/D + [ (G-B)/D < 0 ] & 6 */ \
        \
        /* xmm0 = HB */ \
        /* xmm1 = HR */ \
        /* xmm2 = HG */ \
        /* xmm5 = CMAX */ \
        /* xmm6 = CMIN */ \
        /* xmm7 = D */ \
        /* Select hue candidate by which channel equals CMAX (R first, then G, else B) */ \
        __ASM_EMIT("movaps %%xmm5, %%xmm6") /* xmm6 = CMAX */ \
        __ASM_EMIT("cmpps $0, 0x00(%[RGBM]), %%xmm5") /* xmm5 = [ R == CMAX ] */ \
        __ASM_EMIT("cmpps $0, 0x10(%[RGBM]), %%xmm6") /* xmm6 = [ G == CMAX ] */ \
        __ASM_EMIT("movaps %%xmm5, %%xmm3") /* xmm3 = [ R == CMAX ] */ \
        __ASM_EMIT("movaps %%xmm6, %%xmm4") /* xmm4 = [ G == CMAX ] */ \
        __ASM_EMIT("andps %%xmm5, %%xmm1") /* xmm1 = HR & [ R == CMAX ] */ \
        __ASM_EMIT("andnps %%xmm0, %%xmm3") /* xmm3 = HB & [ R != CMAX ] */ \
        __ASM_EMIT("andnps %%xmm2, %%xmm5") /* xmm5 = HG & [ R != CMAX ] */ \
        __ASM_EMIT("andnps %%xmm3, %%xmm4") /* xmm4 = HB & [ R != CMAX ] & [ G != CMAX ] */ \
        __ASM_EMIT("andps %%xmm6, %%xmm5") /* xmm5 = HG & [ R != CMAX ] & [ G == CMAX ] */ \
        __ASM_EMIT("orps %%xmm4, %%xmm1") /* xmm1 = (HR & [ R == CMAX ]) | (HB & [ R != CMAX ] & [ G != CMAX ]) */ \
        __ASM_EMIT("xorps %%xmm6, %%xmm6") /* xmm6 = 0 */ \
        __ASM_EMIT("orps %%xmm5, %%xmm1") /* xmm1 = (HR & [ R == CMAX ]) | (HG & [ R != CMAX ] & [ G == CMAX ]) | (HB & [ R != CMAX ] & [ G != CMAX ]) */ \
        __ASM_EMIT("cmpps $4, %%xmm7, %%xmm6") /* xmm6 = [ D != 0 ] */ \
        __ASM_EMIT("andps %%xmm6, %%xmm1") /* xmm1 = [ D != 0 ] & ((HR & [ R == CMAX ]) | (HG & [ R != CMAX ] & [ G == CMAX ]) | (HB & [ R != CMAX ] & [ G != CMAX ])) */ \
        \
        /* Lightness and saturation */ \
        __ASM_EMIT("movaps 0x40(%[RGBM]), %%xmm2") /* xmm2 = CMIN */ \
        __ASM_EMIT("movaps 0x30 + %[XC], %%xmm6") /* xmm6 = 1 */ \
        __ASM_EMIT("movaps %%xmm1, %%xmm0") /* xmm0 = h */ \
        __ASM_EMIT("addps 0x50(%[RGBM]), %%xmm2") /* xmm2 = CMAX + CMIN */ \
        __ASM_EMIT("movaps %%xmm6, %%xmm5") /* xmm5 = 1 */ \
        __ASM_EMIT("movaps %%xmm7, %%xmm1") /* xmm1 = D */ \
        __ASM_EMIT("mulps 0x40 + %[XC], %%xmm2") /* xmm2 = L = 0.5 * (CMAX+CMIN) */ \
        __ASM_EMIT("xorps %%xmm4, %%xmm4") /* xmm4 = 0 */ \
        __ASM_EMIT("movaps %%xmm2, %%xmm3") /* xmm3 = L */ \
        __ASM_EMIT("subps %%xmm2, %%xmm5") /* xmm5 = 1 - L */ \
        __ASM_EMIT("cmpps $4, %%xmm2, %%xmm4") /* xmm4 = [ L != 0 ] */ \
        __ASM_EMIT("cmpps $4, 0x30 + %[XC], %%xmm3") /* xmm3 = [ L != 1 ] */ \
        __ASM_EMIT("divps %%xmm2, %%xmm1") /* xmm1 = D / L */ \
        __ASM_EMIT("divps %%xmm5, %%xmm7") /* xmm7 = D / (1-L) */ \
        __ASM_EMIT("cmpps $6, %%xmm2, %%xmm6") /* xmm6 = [ L < 1 ] */ \
        __ASM_EMIT("andps %%xmm4, %%xmm1") /* xmm1 = [ L != 0 ] & (D/L) */ \
        __ASM_EMIT("andps %%xmm3, %%xmm7") /* xmm7 = [ L != 1 ] & (D/(1-L)) */ \
        __ASM_EMIT("andps %%xmm6, %%xmm1") /* xmm1 = [ L != 0 ] & [ L < 1 ] & (D/L) */ \
        __ASM_EMIT("mulps 0x50 + %[XC], %%xmm0") /* xmm0 = H = h * 1/6 */ \
        __ASM_EMIT("andnps %%xmm7, %%xmm6") /* xmm6 = [ L >= 1 ] & (D/(1-L)) */ \
        __ASM_EMIT("orps %%xmm6, %%xmm1") /* xmm1 = s = ([ L != 0 ] & [ L < 1 ] & (D/L)) | ([ L != 1 ] & (D/(1-L))) */ \
        __ASM_EMIT("movaps 0x30(%[RGBM]), %%xmm3") /* xmm3 = A */ \
        __ASM_EMIT("mulps 0x40 + %[XC], %%xmm1") /* xmm1 = S = s * 0.5 */ \
        \
        /* Transpose back */ \
        HSLA_TRANSPOSE

    /* Reference scalar algorithm implemented by RGBA_TO_HSLA_CORE:

    float cmax = (R < G) ? ((B < G) ? G : B) : ((B < R) ? R : B);
    float cmin = (R < G) ? ((B < R) ? B : R) : ((B < G) ? B : G);
    float d = cmax - cmin;

    H = 0.0f;
    S = 0.0f;
    L = HSL_RGB_0_5 * (cmax + cmin);

    // Calculate hue
    if (R == cmax)
    {
        H = (G - B) / d;
        if (H < 0.0f)
            H += 6.0f;
    }
    else if (G == cmax)
        H = (B - R) / d + 2.0f;
    else
        H = (R - G) / d + 4.0f;

    // Calculate saturation
    if (L < 1.0f)
        S = d / L;
    else if (L > 1.0f)
        S = d / (1.0f - L);

    // Normalize hue and saturation
    H *= HSL_RGB_1_6;
    S *= HSL_RGB_0_5;
    */

    /*
     * Convert a buffer of RGBA pixels to HSLA pixels.
     *
     * @param dst   destination buffer (count pixels, 4 floats per pixel)
     * @param src   source buffer of RGBA pixels (4 floats per pixel);
     *              may be unaligned
     * @param count number of pixels to process
     */
    void rgba_to_hsla(float *dst, const float *src, size_t count)
    {
        /* Aligned scratch area matching the 0x00..0x50(%[RGBM]) offsets
           used by RGBA_TO_HSLA_CORE */
        #pragma pack(push, 1)
        struct {
            float r[4], g[4], b[4], a[4];
            float cmin[4], cmax[4];
        } rgbm __lsp_aligned16;
        #pragma pack(pop)

        ARCH_X86_ASM
        (
            __ASM_EMIT("sub $4, %[count]")
            __ASM_EMIT("jb 2f")

            //-----------------------------------------------------------------
            // 4x blocks
            __ASM_EMIT("1:")
            __ASM_EMIT("movups 0x00(%[src]), %%xmm0") // xmm0 = r0 g0 b0 a0
            __ASM_EMIT("movups 0x10(%[src]), %%xmm1") // xmm1 = r1 g1 b1 a1
            __ASM_EMIT("movups 0x20(%[src]), %%xmm2") // xmm2 = r2 g2 b2 a2
            __ASM_EMIT("movups 0x30(%[src]), %%xmm3") // xmm3 = r3 g3 b3 a3

            RGBA_TO_HSLA_CORE

            // Store result
            __ASM_EMIT("movups %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("movups %%xmm1, 0x10(%[dst])")
            __ASM_EMIT("movups %%xmm2, 0x20(%[dst])")
            __ASM_EMIT("movups %%xmm3, 0x30(%[dst])")

            // Repeat loop
            __ASM_EMIT("add $0x40, %[src]")
            __ASM_EMIT("add $0x40, %[dst]")
            __ASM_EMIT("sub $4, %[count]")
            __ASM_EMIT("jae 1b")

            __ASM_EMIT("2:")

            __ASM_EMIT("add $4, %[count]")
            __ASM_EMIT("jle 10f")

            //-----------------------------------------------------------------
            // 1x - 3x block
            // Load last variable-sized chunk
            __ASM_EMIT("test $1, %[count]")
            __ASM_EMIT("jz 4f")
            __ASM_EMIT("movups 0x00(%[src]), %%xmm0")
            __ASM_EMIT("add $0x10, %[src]")
            __ASM_EMIT("4:")
            __ASM_EMIT("test $2, %[count]")
            __ASM_EMIT("jz 6f")
            __ASM_EMIT("movups 0x00(%[src]), %%xmm1")
            __ASM_EMIT("movups 0x10(%[src]), %%xmm2")
            __ASM_EMIT("6:")

            RGBA_TO_HSLA_CORE

            // Store last chunk (mirrors the load pattern above)
            __ASM_EMIT("test $1, %[count]")
            __ASM_EMIT("jz 8f")
            __ASM_EMIT("movups %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("add $0x10, %[dst]")
            __ASM_EMIT("8:")
            __ASM_EMIT("test $2, %[count]")
            __ASM_EMIT("jz 10f")
            __ASM_EMIT("movups %%xmm1, 0x00(%[dst])")
            __ASM_EMIT("movups %%xmm2, 0x10(%[dst])")

            __ASM_EMIT("10:")

            : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
            : [XC] "o" (RGB_HSL), [RGBM] "r" (&rgbm)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    #undef RGBA_TO_HSLA_CORE

    /*
     * Convert 4 float RGBA pixels (xmm0..xmm3) into 16 packed BGRA32 bytes
     * in xmm0. The alpha channel is inverted (A = 255 - a*255), each component
     * is scaled by the inverted alpha, clamped below at 0 and converted to
     * bytes; R and B are swapped by the xor-swap at the top.
     * Requires %[XC] = the RGBA_TO_BGRA32 constant table (255.0f).
     * NOTE(review): negative products are masked to 0 here; the upper clamp
     * to 255 relies on packssdw/packuswb saturation.
     */
    #define RGBA_TO_RGBA32_CORE \
        HSLA_TRANSPOSE \
        \
        /* xmm0 = r */ \
        /* xmm1 = g */ \
        /* xmm2 = b */ \
        /* xmm3 = a */ \
        __ASM_EMIT("movaps 0x00 + %[XC], %%xmm7") /* xmm7 = 255 */ \
        __ASM_EMIT("xorps %%xmm0, %%xmm2") /* xmm2 = b^r */ \
        __ASM_EMIT("xorps %%xmm4, %%xmm4") /* xmm4 = 0 */ \
        __ASM_EMIT("xorps %%xmm2, %%xmm0") /* xmm0 = r^b^r = b */ \
        __ASM_EMIT("xorps %%xmm0, %%xmm2") /* xmm2 = b^r^b = r */ \
        __ASM_EMIT("xorps %%xmm5, %%xmm5") /* xmm5 = 0 */ \
        __ASM_EMIT("mulps %%xmm7, %%xmm3") /* xmm3 = a * 255 */ \
        __ASM_EMIT("subps %%xmm3, %%xmm7") /* xmm7 = A = 255 - a*255 */ \
        __ASM_EMIT("movaps %%xmm7, %%xmm3") /* xmm3 = A */ \
        __ASM_EMIT("mulps %%xmm7, %%xmm0") /* xmm0 = B = b * A */ \
        __ASM_EMIT("mulps %%xmm3, %%xmm1") /* xmm1 = G = g * A */ \
        __ASM_EMIT("mulps %%xmm7, %%xmm2") /* xmm2 = R = r * A */ \
        __ASM_EMIT("cmpps $2, %%xmm0, %%xmm4") /* xmm4 = [ B >= 0 ] */ \
        __ASM_EMIT("cmpps $2, %%xmm1, %%xmm5") /* xmm5 = [ G >= 0 ] */ \
        __ASM_EMIT("xorps %%xmm6, %%xmm6") /* xmm6 = 0 */ \
        __ASM_EMIT("xorps %%xmm7, %%xmm7") /* xmm7 = 0 */ \
        __ASM_EMIT("cmpps $2, %%xmm2, %%xmm6") /* xmm6 = [ R >= 0 ] */ \
        __ASM_EMIT("cmpps $2, %%xmm3, %%xmm7") /* xmm7 = [ A >= 0 ] */ \
        __ASM_EMIT("andps %%xmm4, %%xmm0") /* xmm0 = B & [ B >= 0 ] */ \
        __ASM_EMIT("andps %%xmm5, %%xmm1") /* xmm1 = G & [ G >= 0 ] */ \
        __ASM_EMIT("andps %%xmm6, %%xmm2") /* xmm2 = R & [ R >= 0 ] */ \
        __ASM_EMIT("andps %%xmm7, %%xmm3") /* xmm3 = A & [ A >= 0 ] */ \
        \
        HSLA_TRANSPOSE \
        /* xmm0 = b0 g0 r0 a0 */ \
        /* xmm1 = b1 g1 r1 a1 */ \
        /* xmm2 = b2 g2 r2 a2 */ \
        /* xmm3 = b3 g3 r3 a3 */ \
        \
        /* float -> int32 -> saturated int16 -> saturated uint8 */ \
        __ASM_EMIT("cvtps2dq %%xmm0, %%xmm0") /* xmm0 = int(b0 g0 r0 a0) */ \
        __ASM_EMIT("cvtps2dq %%xmm1, %%xmm1") /* xmm1 = int(b1 g1 r1 a1) */ \
        __ASM_EMIT("cvtps2dq %%xmm2, %%xmm2") /* xmm2 = int(b2 g2 r2 a2) */ \
        __ASM_EMIT("cvtps2dq %%xmm3, %%xmm3") /* xmm3 = int(b3 g3 r3 a3) */ \
        __ASM_EMIT("packssdw %%xmm1, %%xmm0") /* xmm0 = b0 g0 r0 a0 b1 g1 r1 a1 */ \
        __ASM_EMIT("packssdw %%xmm3, %%xmm2") /* xmm2 = b2 g2 r2 a2 b3 g3 r3 a3 */ \
        __ASM_EMIT("packuswb %%xmm2, %%xmm0") /* xmm0 = b0 g0 r0 a0 b1 g1 r1 a1 b2 g2 r2 a2 b3 g3 r3 a3 */

    /*
     * Convert a buffer of float RGBA pixels to packed 32-bit BGRA pixels.
     *
     * @param dst   destination buffer (count pixels, 4 bytes per pixel)
     * @param src   source buffer of RGBA pixels (4 floats per pixel)
     * @param count number of pixels to process
     *
     * Temporarily forces the MXCSR rounding mode to round-toward-zero
     * (bits 13-14 = 0b11) for cvtps2dq and restores the caller's mode
     * before returning; mxcsr[0] holds the saved word, mxcsr[1] the
     * modified copy.
     */
    void rgba_to_bgra32(void *dst, const float *src, size_t count)
    {
        uint32_t mxcsr[2];
        uint32_t tmp;

        ARCH_X86_ASM
        (
            // Set rounding mode to zero
            __ASM_EMIT("stmxcsr %[mxcsr]")
            __ASM_EMIT("movl %[mxcsr], %[tmp]")
            __ASM_EMIT("or $0x6000, %[tmp]")
            __ASM_EMIT("movl %[tmp], 0x04 + %[mxcsr]")
            __ASM_EMIT("ldmxcsr 0x04 + %[mxcsr]")

            __ASM_EMIT("sub $4, %[count]")
            __ASM_EMIT("jb 2f")

            //-----------------------------------------------------------------
            // 4x blocks
            __ASM_EMIT("1:")
            __ASM_EMIT("movups 0x00(%[src]), %%xmm0") // xmm0 = r0 g0 b0 a0
            __ASM_EMIT("movups 0x10(%[src]), %%xmm1") // xmm1 = r1 g1 b1 a1
            __ASM_EMIT("movups 0x20(%[src]), %%xmm2") // xmm2 = r2 g2 b2 a2
            __ASM_EMIT("movups 0x30(%[src]), %%xmm3") // xmm3 = r3 g3 b3 a3

            RGBA_TO_RGBA32_CORE

            // Store result (4 pixels = 16 bytes)
            __ASM_EMIT("movdqu %%xmm0, 0x00(%[dst])")

            // Repeat loop
            __ASM_EMIT("add $0x40, %[src]")
            __ASM_EMIT("add $0x10, %[dst]")
            __ASM_EMIT("sub $4, %[count]")
            __ASM_EMIT("jae 1b")

            __ASM_EMIT("2:")
            __ASM_EMIT("add $4, %[count]")
            __ASM_EMIT("jle 10f")

            //-----------------------------------------------------------------
            // 1x - 3x block
            // Load last variable-sized chunk
            __ASM_EMIT("test $2, %[count]")
            __ASM_EMIT("jz 4f")
            __ASM_EMIT("movups 0x00(%[src]), %%xmm0")
            __ASM_EMIT("movups 0x10(%[src]), %%xmm1")
            __ASM_EMIT("add $0x20, %[src]")
            __ASM_EMIT("4:")
            __ASM_EMIT("test $1, %[count]")
            __ASM_EMIT("jz 6f")
            __ASM_EMIT("movups 0x00(%[src]), %%xmm2")
            __ASM_EMIT("6:")

            RGBA_TO_RGBA32_CORE

            // Store last chunk: low 8 bytes = 2 pixels, next 4 bytes = 1 pixel
            __ASM_EMIT("test $2, %[count]")
            __ASM_EMIT("jz 8f")
            __ASM_EMIT("movlps %%xmm0, 0x00(%[dst])")
            __ASM_EMIT("add $0x08, %[dst]")
            __ASM_EMIT("8:")
            __ASM_EMIT("test $1, %[count]")
            __ASM_EMIT("jz 10f")
            __ASM_EMIT("movhlps %%xmm0, %%xmm0")
            __ASM_EMIT("movss %%xmm0, 0x00(%[dst])")

            __ASM_EMIT("10:")

            // Restore rounding mode
            __ASM_EMIT("ldmxcsr %[mxcsr]")

            : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count),
              [tmp] "=&r" (tmp)
            : [XC] "o" (RGBA_TO_BGRA32), [mxcsr] "o" (mxcsr)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

    #undef RGBA_TO_RGBA32_CORE

    #undef HSLA_TRANSPOSE

    /*
     * Swap the R and B bytes of packed 32-bit pixels (RGBA32 <-> BGRA32).
     * The G and A bytes are kept in place via the X_CMASK masks; R and B are
     * exchanged with 16-bit shifts within each 32-bit lane.
     *
     * @param dst   destination buffer (count pixels, 4 bytes per pixel)
     * @param src   source buffer (count pixels, 4 bytes per pixel)
     * @param count number of pixels to process
     */
    void rgba32_to_bgra32(void *dst, const void *src, size_t count)
    {
        size_t off;

        ARCH_X86_ASM
        (
            __ASM_EMIT("movdqa %[MASK], %%xmm6") // xmm6 = 00 ff 00 ff
            __ASM_EMIT("xor %[off], %[off]") // off = 0
            __ASM_EMIT("movdqa %%xmm6, %%xmm7") // xmm7 = 00 ff 00 ff
            __ASM_EMIT("pslld $8, %%xmm6") // xmm6 = ff 00 ff 00

            // 8-element blocks
            __ASM_EMIT("sub $8, %[count]")
            __ASM_EMIT("jb 2f")
            __ASM_EMIT("1:")
            __ASM_EMIT("movdqu 0x00(%[src], %[off]), %%xmm0") // xmm0 = A1 R1 G1 B1
            __ASM_EMIT("movdqu 0x10(%[src], %[off]), %%xmm1") // xmm1 = A2 R2 G2 B2
            __ASM_EMIT("movdqa %%xmm0, %%xmm2") // xmm2 = A1 R1 G1 B1
            __ASM_EMIT("movdqa %%xmm1, %%xmm3") // xmm3 = A2 R2 G2 B2
            __ASM_EMIT("pand %%xmm7, %%xmm0") // xmm0 = 00 R1 00 B1
            __ASM_EMIT("pand %%xmm6, %%xmm2") // xmm2 = A1 00 G1 00
            __ASM_EMIT("pand %%xmm7, %%xmm1") // xmm1 = 00 R2 00 B2
            __ASM_EMIT("pand %%xmm6, %%xmm3") // xmm3 = A2 00 G2 00
            __ASM_EMIT("movdqa %%xmm0, %%xmm4") // xmm4 = 00 R1 00 B1
            __ASM_EMIT("movdqa %%xmm1, %%xmm5") // xmm5 = 00 R2 00 B2
            __ASM_EMIT("pslld $16, %%xmm0") // xmm0 = 00 B1 00 00
            __ASM_EMIT("pslld $16, %%xmm1") // xmm1 = 00 B2 00 00
            __ASM_EMIT("psrld $16, %%xmm4") // xmm4 = 00 00 00 R1
            __ASM_EMIT("psrld $16, %%xmm5") // xmm5 = 00 00 00 R2
            __ASM_EMIT("orpd %%xmm2, %%xmm0") // xmm0 = A1 B1 G1 00
            __ASM_EMIT("orpd %%xmm3, %%xmm1") // xmm1 = A2 B2 G2 00
            __ASM_EMIT("orpd %%xmm4, %%xmm0") // xmm0 = A1 B1 G1 R1
            __ASM_EMIT("orpd %%xmm5, %%xmm1") // xmm1 = A2 B2 G2 R2
            __ASM_EMIT("movdqu %%xmm0, 0x00(%[dst], %[off])")
            __ASM_EMIT("movdqu %%xmm1, 0x10(%[dst], %[off])")
            __ASM_EMIT("add $0x20, %[off]")
            __ASM_EMIT("sub $8, %[count]")
            __ASM_EMIT("jae 1b")

            // 4-element block
            __ASM_EMIT("2:")
            __ASM_EMIT("add $4, %[count]")
            __ASM_EMIT("jl 4f")
            __ASM_EMIT("movdqu 0x00(%[src], %[off]), %%xmm0") // xmm0 = A1 R1 G1 B1
            __ASM_EMIT("movdqa %%xmm0, %%xmm2") // xmm2 = A1 R1 G1 B1
            __ASM_EMIT("pand %%xmm7, %%xmm0") // xmm0 = 00 R1 00 B1
            __ASM_EMIT("pand %%xmm6, %%xmm2") // xmm2 = A1 00 G1 00
            __ASM_EMIT("movdqa %%xmm0, %%xmm4") // xmm4 = 00 R1 00 B1
            __ASM_EMIT("pslld $16, %%xmm0") // xmm0 = 00 B1 00 00
            __ASM_EMIT("psrld $16, %%xmm4") // xmm4 = 00 00 00 R1
            __ASM_EMIT("orpd %%xmm2, %%xmm0") // xmm0 = A1 B1 G1 00
            __ASM_EMIT("orpd %%xmm4, %%xmm0") // xmm0 = A1 B1 G1 R1
            __ASM_EMIT("movdqu %%xmm0, 0x00(%[dst], %[off])")
            __ASM_EMIT("add $0x10, %[off]")
            __ASM_EMIT("sub $4, %[count]")

            // Tail: remaining 1..3 pixels one at a time
            __ASM_EMIT("4:")
            __ASM_EMIT("add $3, %[count]")
            __ASM_EMIT("jl 6f")
            __ASM_EMIT("5:")
            __ASM_EMIT("movd 0x00(%[src], %[off]), %%xmm0") // xmm0 = AA RR GG BB
            __ASM_EMIT("movdqa %%xmm0, %%xmm1") // xmm1 = AA RR GG BB
            __ASM_EMIT("pand %%xmm7, %%xmm0") // xmm0 = 00 RR 00 BB
            __ASM_EMIT("pand %%xmm6, %%xmm1") // xmm1 = AA 00 GG 00
            __ASM_EMIT("movdqa %%xmm0, %%xmm2") // xmm2 = 00 RR 00 BB
            __ASM_EMIT("pslld $16, %%xmm0") // xmm0 = 00 BB 00 00
            __ASM_EMIT("psrld $16, %%xmm2") // xmm2 = 00 00 00 RR
            __ASM_EMIT("orpd %%xmm1, %%xmm0") // xmm0 = AA BB GG 00
            __ASM_EMIT("orpd %%xmm2, %%xmm0") // xmm0 = AA BB GG RR
            __ASM_EMIT("movd %%xmm0, 0x00(%[dst], %[off])")
            __ASM_EMIT("add $4, %[off]")
            __ASM_EMIT("dec %[count]")
            __ASM_EMIT("jge 5b")

            // End
            __ASM_EMIT("6:")

            : [dst] "+r"(dst), [src] "+r"(src), [count] "+r" (count),
              [off] "=&r" (off)
            : [MASK] "m" (X_CMASK)
            : "cc", "memory",
              "%xmm0", "%xmm1", "%xmm2", "%xmm3",
              "%xmm4", "%xmm5", "%xmm6", "%xmm7"
        );
    }

}

#endif /* INCLUDE_DSP_ARCH_X86_SSE2_GRAPHICS_H_ */