/*****************************************************************************
 * i420_yuy2.h : YUV to YUV conversion module for vlc
 *****************************************************************************
 * Copyright (C) 2000, 2001 VLC authors and VideoLAN
 * $Id: 66a89508e994d5aec506a3407d7ce78c56db4ddd $
 *
 * Authors: Samuel Hocevar <sam@zoy.org>
 *          Damien Fouilleul <damien@videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#ifdef MODULE_NAME_IS_i420_yuy2_mmx

#if defined(CAN_COMPILE_MMX)

/* MMX assembly */

#define MMX_CALL(MMX_INSTRUCTIONS)                                    \
    do {                                                              \
    __asm__ __volatile__(                                             \
        ".p2align 3 \n\t                                              \
movd       (%0), %%mm1  # Load 4 Cb       00 00 00 00 u3 u2 u1 u0 \n\
movd       (%1), %%mm2  # Load 4 Cr       00 00 00 00 v3 v2 v1 v0 \n\
movq       (%2), %%mm0  # Load 8 Y        y7 y6 y5 y4 y3 y2 y1 y0 \n\
movq       (%3), %%mm3  # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
"                                                                     \
        :                                                             \
        : "r" (p_u), "r" (p_v),                                       \
          "r" (p_y1), "r" (p_y2)                                      \
        : "mm0", "mm1", "mm2", "mm3");                                \
    __asm__ __volatile__(                                             \
        ".p2align 3 \n\t"                                             \
        MMX_INSTRUCTIONS                                              \
        :                                                             \
        : "r" (p_line1), "r" (p_line2)                                \
        : "mm0", "mm1", "mm2", "mm3", "mm4", "memory");               \
    p_line1 += 16; p_line2 += 16;                                     \
    p_y1 += 8; p_y2 += 8;                                             \
    p_u += 4; p_v += 4;                                               \
    } while(0)

#define MMX_END __asm__ __volatile__ ( "emms" )

#define MMX_YUV420_YUYV "                                         \n\
punpcklbw %%mm2, %%mm1  #                 v3 u3 v2 u2 v1 u1 v0 u0 \n\
movq      %%mm0, %%mm2  #                 y7 y6 y5 y4 y3 y2 y1 y0 \n\
punpcklbw %%mm1, %%mm2  #                 v1 y3 u1 y2 v0 y1 u0 y0 \n\
movq      %%mm2, (%0)   # Store low YUYV                          \n\
punpckhbw %%mm1, %%mm0  #                 v3 y7 u3 y6 v2 y5 u2 y4 \n\
movq      %%mm0, 8(%0)  # Store high YUYV                         \n\
movq      %%mm3, %%mm4  #                 Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
punpcklbw %%mm1, %%mm4  #                 v1 Y3 u1 Y2 v0 Y1 u0 Y0 \n\
movq      %%mm4, (%1)   # Store low YUYV                          \n\
punpckhbw %%mm1, %%mm3  #                 v3 Y7 u3 Y6 v2 Y5 u2 Y4 \n\
movq      %%mm3, 8(%1)  # Store high YUYV                         \n\
"

#define MMX_YUV420_YVYU "                                         \n\
punpcklbw %%mm1, %%mm2  #                 u3 v3 u2 v2 u1 v1 u0 v0 \n\
movq      %%mm0, %%mm1  #                 y7 y6 y5 y4 y3 y2 y1 y0 \n\
punpcklbw %%mm2, %%mm1  #                 u1 y3 v1 y2 u0 y1 v0 y0 \n\
movq      %%mm1, (%0)   # Store low YVYU                          \n\
punpckhbw %%mm2, %%mm0  #                 u3 y7 v3 y6 u2 y5 v2 y4 \n\
movq      %%mm0, 8(%0)  # Store high YVYU                         \n\
movq      %%mm3, %%mm4  #                 Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
punpcklbw %%mm2, %%mm4  #                 u1 Y3 v1 Y2 u0 Y1 v0 Y0 \n\
movq      %%mm4, (%1)   # Store low YVYU                          \n\
punpckhbw %%mm2, %%mm3  #                 u3 Y7 v3 Y6 u2 Y5 v2 Y4 \n\
movq      %%mm3, 8(%1)  # Store high YVYU                         \n\
"

#define MMX_YUV420_UYVY "                                         \n\
punpcklbw %%mm2, %%mm1  #                 v3 u3 v2 u2 v1 u1 v0 u0 \n\
movq      %%mm1, %%mm2  #                 v3 u3 v2 u2 v1 u1 v0 u0 \n\
punpcklbw %%mm0, %%mm2  #                 y3 v1 y2 u1 y1 v0 y0 u0 \n\
movq      %%mm2, (%0)   # Store low UYVY                          \n\
movq      %%mm1, %%mm2  #                 v3 u3 v2 u2 v1 u1 v0 u0 \n\
punpckhbw %%mm0, %%mm2  #                 y7 v3 y6 u3 y5 v2 y4 u2 \n\
movq      %%mm2, 8(%0)  # Store high UYVY                         \n\
movq      %%mm1, %%mm4  #                 v3 u3 v2 u2 v1 u1 v0 u0 \n\
punpcklbw %%mm3, %%mm4  #                 Y3 v1 Y2 u1 Y1 v0 Y0 u0 \n\
movq      %%mm4, (%1)   # Store low UYVY                          \n\
punpckhbw %%mm3, %%mm1  #                 Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\
movq      %%mm1, 8(%1)  # Store high UYVY                         \n\
"
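
/* Usage sketch (an assumption, after the callers in i420_yuy2.c, which own
 * the loop; p_y1/p_y2, p_u/p_v and p_line1/p_line2 are the plane and output
 * pointers the macros already expect). Each MMX_CALL converts an 8x2 pixel
 * block and advances every pointer, so one pass covers two output lines:
 *
 *     for( i_x = i_width / 8 ; i_x-- ; )
 *         MMX_CALL( MMX_YUV420_YUYV );
 *     MMX_END;    (once per picture, before any floating-point code)
 */
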
#elif defined(HAVE_MMX_INTRINSICS)

/* MMX intrinsics */

#include <mmintrin.h>

#define MMX_CALL(MMX_INSTRUCTIONS)          \
    do {                                    \
        __m64 mm0, mm1, mm2, mm3, mm4;      \
        MMX_INSTRUCTIONS                    \
        p_line1 += 16; p_line2 += 16;       \
        p_y1 += 8; p_y2 += 8;               \
        p_u += 4; p_v += 4;                 \
    } while(0)

#define MMX_END _mm_empty()

#define MMX_YUV420_YUYV                     \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);     \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm0 = (__m64)*(uint64_t*)p_y1;          \
    mm3 = (__m64)*(uint64_t*)p_y2;          \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm0;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);       \
    *(uint64_t*)p_line1 = (uint64_t)mm2;    \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);       \
    *(uint64_t*)(p_line1+8) = (uint64_t)mm0;\
    mm4 = mm3;                              \
    mm4 = _mm_unpacklo_pi8(mm4, mm1);       \
    *(uint64_t*)p_line2 = (uint64_t)mm4;    \
    mm3 = _mm_unpackhi_pi8(mm3, mm1);       \
    *(uint64_t*)(p_line2+8) = (uint64_t)mm3;

#define MMX_YUV420_YVYU                     \
    mm2 = _mm_cvtsi32_si64(*(int*)p_u);     \
    mm1 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm0 = (__m64)*(uint64_t*)p_y1;          \
    mm3 = (__m64)*(uint64_t*)p_y2;          \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm0;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);       \
    *(uint64_t*)p_line1 = (uint64_t)mm2;    \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);       \
    *(uint64_t*)(p_line1+8) = (uint64_t)mm0;\
    mm4 = mm3;                              \
    mm4 = _mm_unpacklo_pi8(mm4, mm1);       \
    *(uint64_t*)p_line2 = (uint64_t)mm4;    \
    mm3 = _mm_unpackhi_pi8(mm3, mm1);       \
    *(uint64_t*)(p_line2+8) = (uint64_t)mm3;

#define MMX_YUV420_UYVY                     \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);     \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm0 = (__m64)*(uint64_t*)p_y1;          \
    mm3 = (__m64)*(uint64_t*)p_y2;          \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm1;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm0);       \
    *(uint64_t*)p_line1 = (uint64_t)mm2;    \
    mm2 = mm1;                              \
    mm2 = _mm_unpackhi_pi8(mm2, mm0);       \
    *(uint64_t*)(p_line1+8) = (uint64_t)mm2;\
    mm4 = mm1;                              \
    mm4 = _mm_unpacklo_pi8(mm4, mm3);       \
    *(uint64_t*)p_line2 = (uint64_t)mm4;    \
    mm1 = _mm_unpackhi_pi8(mm1, mm3);       \
    *(uint64_t*)(p_line2+8) = (uint64_t)mm1;

#endif
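
/* Why MMX_END exists: the MMX registers alias the x87 floating-point stack,
 * so emms/_mm_empty() must run after the last MMX_CALL of a picture and
 * before any floating-point code. A minimal ordering sketch (the loop shape
 * is an assumption; both branches above expose the same names):
 *
 *     while( i_blocks-- )
 *         MMX_CALL( MMX_YUV420_UYVY );
 *     MMX_END;    (x87/float math is only safe again after this)
 */
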
#elif defined( MODULE_NAME_IS_i420_yuy2_sse2 )

#if defined(CAN_COMPILE_SSE2)

/* SSE2 assembly */

#define SSE2_CALL(SSE2_INSTRUCTIONS)                                  \
    do {                                                              \
    __asm__ __volatile__(                                             \
        ".p2align 3 \n\t                                              \
movq        (%0), %%xmm1  # Load 8 Cb     u7 u6 u5 u4 u3 u2 u1 u0 \n\
movq        (%1), %%xmm2  # Load 8 Cr     v7 v6 v5 v4 v3 v2 v1 v0 \n\
"                                                                     \
        :                                                             \
        : "r" (p_u), "r" (p_v)                                        \
        : "xmm1", "xmm2");                                            \
    __asm__ __volatile__(                                             \
        ".p2align 3 \n\t"                                             \
        SSE2_INSTRUCTIONS                                             \
        :                                                             \
        : "r" (p_line1), "r" (p_line2),                               \
          "r" (p_y1), "r" (p_y2)                                      \
        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");          \
    p_line1 += 32; p_line2 += 32;                                     \
    p_y1 += 16; p_y2 += 16;                                           \
    p_u += 8; p_v += 8;                                               \
    } while(0)

#define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )

#define SSE2_YUV420_YUYV_ALIGNED "                                    \n\
movdqa      (%2), %%xmm0  # Load 16 Y     y15 y14 y13 .. y2 y1 y0 \n\
movdqa      (%3), %%xmm3  # Load 16 Y     Y15 Y14 Y13 .. Y2 Y1 Y0 \n\
punpcklbw %%xmm2, %%xmm1  #               v7 u7 v6 u6 .. v0 u0    \n\
movdqa    %%xmm0, %%xmm2  #               y15 y14 y13 .. y2 y1 y0 \n\
punpcklbw %%xmm1, %%xmm2  #               v3 y7 u3 y6 .. u0 y0    \n\
movntdq   %%xmm2, (%0)    # Store low YUYV                        \n\
punpckhbw %%xmm1, %%xmm0  #               v7 y15 u7 y14 .. u4 y8  \n\
movntdq   %%xmm0, 16(%0)  # Store high YUYV                       \n\
movdqa    %%xmm3, %%xmm4  #               Y15 Y14 Y13 .. Y2 Y1 Y0 \n\
punpcklbw %%xmm1, %%xmm4  #               v3 Y7 u3 Y6 .. u0 Y0    \n\
movntdq   %%xmm4, (%1)    # Store low YUYV                        \n\
punpckhbw %%xmm1, %%xmm3  #               v7 Y15 u7 Y14 .. u4 Y8  \n\
movntdq   %%xmm3, 16(%1)  # Store high YUYV                       \n\
"

#define SSE2_YUV420_YUYV_UNALIGNED "                                  \n\
movdqu      (%2), %%xmm0  # Load 16 Y     y15 y14 y13 .. y2 y1 y0 \n\
movdqu      (%3), %%xmm3  # Load 16 Y     Y15 Y14 Y13 .. Y2 Y1 Y0 \n\
prefetchnta (%0)          # Tell CPU not to cache output YUYV data \n\
prefetchnta (%1)          # Tell CPU not to cache output YUYV data \n\
punpcklbw %%xmm2, %%xmm1  #               v7 u7 v6 u6 .. v0 u0    \n\
movdqa    %%xmm0, %%xmm2  #               y15 y14 y13 .. y2 y1 y0 \n\
punpcklbw %%xmm1, %%xmm2  #               v3 y7 u3 y6 .. u0 y0    \n\
movdqu    %%xmm2, (%0)    # Store low YUYV                        \n\
punpckhbw %%xmm1, %%xmm0  #               v7 y15 u7 y14 .. u4 y8  \n\
movdqu    %%xmm0, 16(%0)  # Store high YUYV                       \n\
movdqa    %%xmm3, %%xmm4  #               Y15 Y14 Y13 .. Y2 Y1 Y0 \n\
punpcklbw %%xmm1, %%xmm4  #               v3 Y7 u3 Y6 .. u0 Y0    \n\
movdqu    %%xmm4, (%1)    # Store low YUYV                        \n\
punpckhbw %%xmm1, %%xmm3  #               v7 Y15 u7 Y14 .. u4 Y8  \n\
movdqu    %%xmm3, 16(%1)  # Store high YUYV                       \n\
"

#define SSE2_YUV420_YVYU_ALIGNED "                                    \n\
movdqa      (%2), %%xmm0  # Load 16 Y     y15 y14 y13 .. y2 y1 y0 \n\
movdqa      (%3), %%xmm3  # Load 16 Y     Y15 Y14 Y13 .. Y2 Y1 Y0 \n\
punpcklbw %%xmm1, %%xmm2  #               u7 v7 u6 v6 .. u0 v0    \n\
movdqa    %%xmm0, %%xmm1  #               y15 y14 y13 .. y2 y1 y0 \n\
punpcklbw %%xmm2, %%xmm1  #               u3 y7 v3 y6 .. v0 y0    \n\
movntdq   %%xmm1, (%0)    # Store low YVYU                        \n\
punpckhbw %%xmm2, %%xmm0  #               u7 y15 v7 y14 .. v4 y8  \n\
movntdq   %%xmm0, 16(%0)  # Store high YVYU                       \n\
movdqa    %%xmm3, %%xmm4  #               Y15 Y14 Y13 .. Y2 Y1 Y0 \n\
punpcklbw %%xmm2, %%xmm4  #               u3 Y7 v3 Y6 .. v0 Y0    \n\
movntdq   %%xmm4, (%1)    # Store low YVYU                        \n\
punpckhbw %%xmm2, %%xmm3  #               u7 Y15 v7 Y14 .. v4 Y8  \n\
movntdq   %%xmm3, 16(%1)  # Store high YVYU                       \n\
"

#define SSE2_YUV420_YVYU_UNALIGNED "                                  \n\
movdqu      (%2), %%xmm0  # Load 16 Y     y15 y14 y13 .. y2 y1 y0 \n\
movdqu      (%3), %%xmm3  # Load 16 Y     Y15 Y14 Y13 .. Y2 Y1 Y0 \n\
prefetchnta (%0)          # Tell CPU not to cache output YVYU data \n\
prefetchnta (%1)          # Tell CPU not to cache output YVYU data \n\
punpcklbw %%xmm1, %%xmm2  #               u7 v7 u6 v6 .. u0 v0    \n\
movdqa    %%xmm0, %%xmm1  #               y15 y14 y13 .. y2 y1 y0 \n\
punpcklbw %%xmm2, %%xmm1  #               u3 y7 v3 y6 .. v0 y0    \n\
movdqu    %%xmm1, (%0)    # Store low YVYU                        \n\
punpckhbw %%xmm2, %%xmm0  #               u7 y15 v7 y14 .. v4 y8  \n\
movdqu    %%xmm0, 16(%0)  # Store high YVYU                       \n\
movdqa    %%xmm3, %%xmm4  #               Y15 Y14 Y13 .. Y2 Y1 Y0 \n\
punpcklbw %%xmm2, %%xmm4  #               u3 Y7 v3 Y6 .. v0 Y0    \n\
movdqu    %%xmm4, (%1)    # Store low YVYU                        \n\
punpckhbw %%xmm2, %%xmm3  #               u7 Y15 v7 Y14 .. v4 Y8  \n\
movdqu    %%xmm3, 16(%1)  # Store high YVYU                       \n\
"

#define SSE2_YUV420_UYVY_ALIGNED "                                    \n\
movdqa      (%2), %%xmm0  # Load 16 Y     y15 y14 y13 .. y2 y1 y0 \n\
movdqa      (%3), %%xmm3  # Load 16 Y     Y15 Y14 Y13 .. Y2 Y1 Y0 \n\
punpcklbw %%xmm2, %%xmm1  #               v7 u7 v6 u6 .. v0 u0    \n\
movdqa    %%xmm1, %%xmm2  #               v7 u7 v6 u6 .. v0 u0    \n\
punpcklbw %%xmm0, %%xmm2  #               y7 v3 y6 u3 .. y0 u0    \n\
movntdq   %%xmm2, (%0)    # Store low UYVY                        \n\
movdqa    %%xmm1, %%xmm2  #               v7 u7 v6 u6 .. v0 u0    \n\
punpckhbw %%xmm0, %%xmm2  #               y15 v7 y14 u7 .. y8 u4  \n\
movntdq   %%xmm2, 16(%0)  # Store high UYVY                       \n\
movdqa    %%xmm1, %%xmm4  #               v7 u7 v6 u6 .. v0 u0    \n\
punpcklbw %%xmm3, %%xmm4  #               Y7 v3 Y6 u3 .. Y0 u0    \n\
movntdq   %%xmm4, (%1)    # Store low UYVY                        \n\
punpckhbw %%xmm3, %%xmm1  #               Y15 v7 Y14 u7 .. Y8 u4  \n\
movntdq   %%xmm1, 16(%1)  # Store high UYVY                       \n\
"

#define SSE2_YUV420_UYVY_UNALIGNED "                                  \n\
movdqu      (%2), %%xmm0  # Load 16 Y     y15 y14 y13 .. y2 y1 y0 \n\
movdqu      (%3), %%xmm3  # Load 16 Y     Y15 Y14 Y13 .. Y2 Y1 Y0 \n\
prefetchnta (%0)          # Tell CPU not to cache output UYVY data \n\
prefetchnta (%1)          # Tell CPU not to cache output UYVY data \n\
punpcklbw %%xmm2, %%xmm1  #               v7 u7 v6 u6 .. v0 u0    \n\
movdqa    %%xmm1, %%xmm2  #               v7 u7 v6 u6 .. v0 u0    \n\
punpcklbw %%xmm0, %%xmm2  #               y7 v3 y6 u3 .. y0 u0    \n\
movdqu    %%xmm2, (%0)    # Store low UYVY                        \n\
movdqa    %%xmm1, %%xmm2  #               v7 u7 v6 u6 .. v0 u0    \n\
punpckhbw %%xmm0, %%xmm2  #               y15 v7 y14 u7 .. y8 u4  \n\
movdqu    %%xmm2, 16(%0)  # Store high UYVY                       \n\
movdqa    %%xmm1, %%xmm4  #               v7 u7 v6 u6 .. v0 u0    \n\
punpcklbw %%xmm3, %%xmm4  #               Y7 v3 Y6 u3 .. Y0 u0    \n\
movdqu    %%xmm4, (%1)    # Store low UYVY                        \n\
punpckhbw %%xmm3, %%xmm1  #               Y15 v7 Y14 u7 .. Y8 u4  \n\
movdqu    %%xmm1, 16(%1)  # Store high UYVY                       \n\
"
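
/* Variant selection sketch (an assumption, after the callers in
 * i420_yuy2.c; i_src_pitch/i_dst_pitch are illustrative names): the
 * _ALIGNED variants store with movntdq and load with movdqa, both of which
 * fault on addresses that are not 16-byte aligned, so a caller tests the
 * pitches and pointers once per picture and then sticks to one variant:
 *
 *     if( 0 == (15 & (i_src_pitch | i_dst_pitch
 *                    | (intptr_t)p_line1 | (intptr_t)p_y1)) )
 *         SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
 *     else
 *         SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
 */
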
#elif defined(HAVE_SSE2_INTRINSICS)

/* SSE2 intrinsics */

#include <emmintrin.h>

#define SSE2_CALL(SSE2_INSTRUCTIONS)            \
    do {                                        \
        __m128i xmm0, xmm1, xmm2, xmm3, xmm4;   \
        SSE2_INSTRUCTIONS                       \
        p_line1 += 32; p_line2 += 32;           \
        p_y1 += 16; p_y2 += 16;                 \
        p_u += 8; p_v += 8;                     \
    } while(0)

#define SSE2_END _mm_sfence()

#define SSE2_YUV420_YUYV_ALIGNED                        \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm0 = _mm_load_si128((__m128i *)p_y1);             \
    xmm3 = _mm_load_si128((__m128i *)p_y2);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm0;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);        \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm0);     \
    xmm4 = xmm3;                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);        \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm3);

#define SSE2_YUV420_YUYV_UNALIGNED                      \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);            \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);            \
    _mm_prefetch(p_line1, _MM_HINT_NTA);                \
    _mm_prefetch(p_line2, _MM_HINT_NTA);                \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm0;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);        \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm0);     \
    xmm4 = xmm3;                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);        \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm3);

#define SSE2_YUV420_YVYU_ALIGNED                        \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm0 = _mm_load_si128((__m128i *)p_y1);             \
    xmm3 = _mm_load_si128((__m128i *)p_y2);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm0;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);        \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm0);     \
    xmm4 = xmm3;                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);        \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm3);

#define SSE2_YUV420_YVYU_UNALIGNED                      \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);            \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);            \
    _mm_prefetch(p_line1, _MM_HINT_NTA);                \
    _mm_prefetch(p_line2, _MM_HINT_NTA);                \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm0;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);        \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm0);     \
    xmm4 = xmm3;                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);        \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm3);

#define SSE2_YUV420_UYVY_ALIGNED                        \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm0 = _mm_load_si128((__m128i *)p_y1);             \
    xmm3 = _mm_load_si128((__m128i *)p_y2);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm1;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);               \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);        \
    xmm2 = xmm1;                                        \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);               \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm2);     \
    xmm4 = xmm1;                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm3);               \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);        \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);               \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm1);

#define SSE2_YUV420_UYVY_UNALIGNED                      \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);            \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);            \
    _mm_prefetch(p_line1, _MM_HINT_NTA);                \
    _mm_prefetch(p_line2, _MM_HINT_NTA);                \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm1;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);               \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);        \
    xmm2 = xmm1;                                        \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);               \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm2);     \
    xmm4 = xmm1;                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm3);               \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);        \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);               \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm1);

#endif
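
/* Note on SSE2_END: both SSE2 paths end a picture with sfence, because the
 * non-temporal stores (movntdq / _mm_stream_si128) are weakly ordered; the
 * fence makes them globally visible before the buffer is handed on. A
 * minimal sketch (the loop shape is an assumption):
 *
 *     while( i_blocks-- )
 *         SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 *     SSE2_END;
 */
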
#endif

/* Used in both accelerated and C modules */

#define C_YUV420_YVYU( )                                    \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;     \
    *(p_line1)++ = *(p_line2)++ = *(p_v)++;                 \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;     \
    *(p_line1)++ = *(p_line2)++ = *(p_u)++;

#define C_YUV420_Y211( )                                    \
    *(p_line1)++ = *(p_y1); p_y1 += 2;                      \
    *(p_line2)++ = *(p_y2); p_y2 += 2;                      \
    *(p_line1)++ = *(p_line2)++ = *(p_u) - 0x80; p_u += 2;  \
    *(p_line1)++ = *(p_y1); p_y1 += 2;                      \
    *(p_line2)++ = *(p_y2); p_y2 += 2;                      \
    *(p_line1)++ = *(p_line2)++ = *(p_v) - 0x80; p_v += 2;

#define C_YUV420_YUYV( )                                    \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;     \
    *(p_line1)++ = *(p_line2)++ = *(p_u)++;                 \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;     \
    *(p_line1)++ = *(p_line2)++ = *(p_v)++;
#define C_YUV420_UYVY( )                                    \
    *(p_line1)++ = *(p_line2)++ = *(p_u)++;                 \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;     \
    *(p_line1)++ = *(p_line2)++ = *(p_v)++;                 \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;
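
/* Usage sketch for the scalar fallback (an assumption, after the C paths in
 * i420_yuy2.c): C_YUV420_YUYV/YVYU/UYVY each emit one 2x2 pixel block,
 * i.e. 4 bytes on each of the two output lines, so a line pair takes
 * i_width / 2 expansions; C_YUV420_Y211 subsamples by two and covers a
 * 4x2 block per expansion:
 *
 *     for( i_x = i_width / 2 ; i_x-- ; )
 *     {
 *         C_YUV420_YUYV( );
 *     }
 */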