;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA

; 2 * 11585, for use with pmulhrsw:
;   pmulhrsw(x, 23170) = (x * 23170 + (1 << 14)) >> 15
;                      = (x * 11585 + (1 << 13)) >> 14
pw_11585x2: times 8 dw 23170
; Rounding term (1 << 13) added before every "psrad ..., 14" below.
pd_8192:    times 4 dd 8192

; TRANSFORM_COEFFS a, b
; Emits two 8-word tables for use as pmaddwd operands on interleaved
; (x, y) word pairs:
;   pw_<a>_<b>  = { a,  b } x 4   ->  x * a + y * b
;   pw_<b>_m<a> = { b, -a } x 4   ->  x * b - y * a
%macro TRANSFORM_COEFFS 2
pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
%endmacro

TRANSFORM_COEFFS 11585,  11585
TRANSFORM_COEFFS 15137,   6270
TRANSFORM_COEFFS 16069,   3196
TRANSFORM_COEFFS  9102,  13623

; STORE_OUTPUT index, result
; Sign-extends the eight 16-bit lanes of m<result> to 32 bits and stores them
; as two aligned 16-byte vectors at byte offset 4 * index from outputq.
; Clobbers: m11 (sign mask), m12 (high half), m<result> (low half).
%macro STORE_OUTPUT 2 ; index, result
  ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
  ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
  ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
  ; _mm_store_si128((__m128i *)(dst_ptr), out0);
  ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
  pxor        m11, m11
  pcmpgtw     m11, m%2                  ; m11 = (0 > x) ? 0xffff : 0
  mova        m12, m%2
  punpcklwd   m%2, m11                  ; lanes 0..3 widened to dwords
  punpckhwd   m12, m11                  ; lanes 4..7 widened to dwords
  mova        [outputq + 4*%1 +  0], m%2
  mova        [outputq + 4*%1 + 16], m12
%endmacro

SECTION .text

;-----------------------------------------------------------------------------
; fdct8x8(input, output, stride)                       (x86-64 only, SSSE3)
;
; 8x8 forward DCT: a column pass, a 16-bit transpose, a row pass, a second
; transpose, then a final halving, written out as 32-bit values.
;
; In:   inputq  = 8 rows of 8 x 16-bit values, read with aligned loads;
;                 consecutive rows are 2 * stride bytes apart
;       outputq = 64 x 32-bit results, written with aligned stores,
;                 32 bytes per row
;       stride  = row pitch in 16-bit elements.
;                 NOTE(review): the C prototype is not visible in this file;
;                 stride is assumed to be declared as a 32-bit int - confirm.
; Regs: r3 = 2 * stride (bytes per row), r4 = 4 * stride (bytes per two rows)
;       m8  = pd_8192 rounding term (reused as scratch in the final rounding)
;       m12 = pw_11585x2            (reused as scratch by STORE_OUTPUT)
; Clobbers: inputq (advanced by six rows), r3, r4, m0-m12
;-----------------------------------------------------------------------------
%if ARCH_X86_64
INIT_XMM ssse3
cglobal fdct8x8, 3, 5, 13, input, output, stride

  ; The upper 32 bits of a register carrying a 32-bit integer argument are
  ; undefined in the 64-bit calling conventions, so widen stride before it
  ; is used in 64-bit address arithmetic.
  movsxd      strideq, strided

  mova        m8, [GLOBAL(pd_8192)]
  mova        m12, [GLOBAL(pw_11585x2)]

  lea         r3, [2 * strideq]
  lea         r4, [4 * strideq]
  mova        m0, [inputq]
  mova        m1, [inputq + r3]
  lea         inputq, [inputq + r4]
  mova        m2, [inputq]
  mova        m3, [inputq + r3]
  lea         inputq, [inputq + r4]
  mova        m4, [inputq]
  mova        m5, [inputq + r3]
  lea         inputq, [inputq + r4]
  mova        m6, [inputq]
  mova        m7, [inputq + r3]

  ; left shift by 2 to increase forward transformation precision
  psllw       m0, 2
  psllw       m1, 2
  psllw       m2, 2
  psllw       m3, 2
  psllw       m4, 2
  psllw       m5, 2
  psllw       m6, 2
  psllw       m7, 2

  ; column transform
  ; stage 1
  paddw       m10, m0, m7
  psubw       m0, m7

  paddw       m9, m1, m6
  psubw       m1, m6

  paddw       m7, m2, m5
  psubw       m2, m5

  paddw       m6, m3, m4
  psubw       m3, m4

  ; stage 2
  paddw       m5, m9, m7
  psubw       m9, m7

  paddw       m4, m10, m6
  psubw       m10, m6

  paddw       m7, m1, m2
  psubw       m1, m2

  ; stage 3
  paddw       m6, m4, m5
  psubw       m4, m5

  ; x * 11585 with rounding and >> 14 (see pw_11585x2)
  pmulhrsw    m1, m12
  pmulhrsw    m7, m12

  ; sin(pi / 8), cos(pi / 8)
  punpcklwd   m2, m10, m9
  punpckhwd   m10, m9
  pmaddwd     m5, m2, [GLOBAL(pw_15137_6270)]
  pmaddwd     m2, [GLOBAL(pw_6270_m15137)]
  pmaddwd     m9, m10, [GLOBAL(pw_15137_6270)]
  pmaddwd     m10, [GLOBAL(pw_6270_m15137)]
  paddd       m5, m8
  paddd       m2, m8
  paddd       m9, m8
  paddd       m10, m8
  psrad       m5, 14
  psrad       m2, 14
  psrad       m9, 14
  psrad       m10, 14
  packssdw    m5, m9
  packssdw    m2, m10

  pmulhrsw    m6, m12
  pmulhrsw    m4, m12

  paddw       m9, m3, m1
  psubw       m3, m1

  paddw       m10, m0, m7
  psubw       m0, m7

  ; stage 4
  ; sin(pi / 16), cos(pi / 16)
  punpcklwd   m1, m10, m9
  punpckhwd   m10, m9
  pmaddwd     m7, m1, [GLOBAL(pw_16069_3196)]
  pmaddwd     m1, [GLOBAL(pw_3196_m16069)]
  pmaddwd     m9, m10, [GLOBAL(pw_16069_3196)]
  pmaddwd     m10, [GLOBAL(pw_3196_m16069)]
  paddd       m7, m8
  paddd       m1, m8
  paddd       m9, m8
  paddd       m10, m8
  psrad       m7, 14
  psrad       m1, 14
  psrad       m9, 14
  psrad       m10, 14
  packssdw    m7, m9
  packssdw    m1, m10

  ; sin(3 * pi / 16), cos(3 * pi / 16)
  punpcklwd   m11, m0, m3
  punpckhwd   m0, m3
  pmaddwd     m9, m11, [GLOBAL(pw_9102_13623)]
  pmaddwd     m11, [GLOBAL(pw_13623_m9102)]
  pmaddwd     m3, m0, [GLOBAL(pw_9102_13623)]
  pmaddwd     m0, [GLOBAL(pw_13623_m9102)]
  paddd       m9, m8
  paddd       m11, m8
  paddd       m3, m8
  paddd       m0, m8
  psrad       m9, 14
  psrad       m11, 14
  psrad       m3, 14
  psrad       m0, 14
  packssdw    m9, m3
  packssdw    m11, m0

  ; transpose
  ; stage 1
  punpcklwd   m0, m6, m7
  punpcklwd   m3, m5, m11
  punpckhwd   m6, m7
  punpckhwd   m5, m11
  punpcklwd   m7, m4, m9
  punpcklwd   m10, m2, m1
  punpckhwd   m4, m9
  punpckhwd   m2, m1

  ; stage 2
  punpckldq   m9, m0, m3
  punpckldq   m1, m6, m5
  punpckhdq   m0, m3
  punpckhdq   m6, m5
  punpckldq   m3, m7, m10
  punpckldq   m5, m4, m2
  punpckhdq   m7, m10
  punpckhdq   m4, m2

  ; stage 3
  punpcklqdq  m10, m9, m3
  punpckhqdq  m9, m3
  punpcklqdq  m2, m0, m7
  punpckhqdq  m0, m7
  punpcklqdq  m3, m1, m5
  punpckhqdq  m1, m5
  punpcklqdq  m7, m6, m4
  punpckhqdq  m6, m4

  ; row transform
  ; stage 1
  paddw       m5, m10, m6
  psubw       m10, m6

  paddw       m4, m9, m7
  psubw       m9, m7

  paddw       m6, m2, m1
  psubw       m2, m1

  paddw       m7, m0, m3
  psubw       m0, m3

  ; stage 2
  paddw       m1, m5, m7
  psubw       m5, m7

  paddw       m3, m4, m6
  psubw       m4, m6

  paddw       m7, m9, m2
  psubw       m9, m2

  ; stage 3
  ; Unlike the column pass, the sum/difference scaled by 11585 is formed with
  ; pmaddwd on interleaved pairs, so it is accumulated in 32 bits.
  punpcklwd   m6, m1, m3
  punpckhwd   m1, m3
  pmaddwd     m2, m6, [GLOBAL(pw_11585_11585)]
  pmaddwd     m6, [GLOBAL(pw_11585_m11585)]
  pmaddwd     m3, m1, [GLOBAL(pw_11585_11585)]
  pmaddwd     m1, [GLOBAL(pw_11585_m11585)]
  paddd       m2, m8
  paddd       m6, m8
  paddd       m3, m8
  paddd       m1, m8
  psrad       m2, 14
  psrad       m6, 14
  psrad       m3, 14
  psrad       m1, 14
  packssdw    m2, m3
  packssdw    m6, m1

  pmulhrsw    m7, m12
  pmulhrsw    m9, m12

  ; sin(pi / 8), cos(pi / 8)
  punpcklwd   m3, m5, m4
  punpckhwd   m5, m4
  pmaddwd     m1, m3, [GLOBAL(pw_15137_6270)]
  pmaddwd     m3, [GLOBAL(pw_6270_m15137)]
  pmaddwd     m4, m5, [GLOBAL(pw_15137_6270)]
  pmaddwd     m5, [GLOBAL(pw_6270_m15137)]
  paddd       m1, m8
  paddd       m3, m8
  paddd       m4, m8
  paddd       m5, m8
  psrad       m1, 14
  psrad       m3, 14
  psrad       m4, 14
  psrad       m5, 14
  packssdw    m1, m4
  packssdw    m3, m5

  paddw       m4, m0, m9
  psubw       m0, m9

  paddw       m5, m10, m7
  psubw       m10, m7

  ; stage 4
  ; sin(pi / 16), cos(pi / 16)
  punpcklwd   m9, m5, m4
  punpckhwd   m5, m4
  pmaddwd     m7, m9, [GLOBAL(pw_16069_3196)]
  pmaddwd     m9, [GLOBAL(pw_3196_m16069)]
  pmaddwd     m4, m5, [GLOBAL(pw_16069_3196)]
  pmaddwd     m5, [GLOBAL(pw_3196_m16069)]
  paddd       m7, m8
  paddd       m9, m8
  paddd       m4, m8
  paddd       m5, m8
  psrad       m7, 14
  psrad       m9, 14
  psrad       m4, 14
  psrad       m5, 14
  packssdw    m7, m4
  packssdw    m9, m5

  ; sin(3 * pi / 16), cos(3 * pi / 16)
  punpcklwd   m4, m10, m0
  punpckhwd   m10, m0
  pmaddwd     m5, m4, [GLOBAL(pw_9102_13623)]
  pmaddwd     m4, [GLOBAL(pw_13623_m9102)]
  pmaddwd     m0, m10, [GLOBAL(pw_9102_13623)]
  pmaddwd     m10, [GLOBAL(pw_13623_m9102)]
  paddd       m5, m8
  paddd       m4, m8
  paddd       m0, m8
  paddd       m10, m8
  psrad       m5, 14
  psrad       m4, 14
  psrad       m0, 14
  psrad       m10, 14
  packssdw    m5, m0
  packssdw    m4, m10

  ; transpose
  ; stage 1
  punpcklwd   m0, m2, m7
  punpcklwd   m10, m1, m4
  punpckhwd   m2, m7
  punpckhwd   m1, m4
  punpcklwd   m7, m6, m5
  punpcklwd   m4, m3, m9
  punpckhwd   m6, m5
  punpckhwd   m3, m9

  ; stage 2
  punpckldq   m5, m0, m10
  punpckldq   m9, m2, m1
  punpckhdq   m0, m10
  punpckhdq   m2, m1
  punpckldq   m10, m7, m4
  punpckldq   m1, m6, m3
  punpckhdq   m7, m4
  punpckhdq   m6, m3

  ; stage 3
  punpcklqdq  m4, m5, m10
  punpckhqdq  m5, m10
  punpcklqdq  m3, m0, m7
  punpckhqdq  m0, m7
  punpcklqdq  m10, m9, m1
  punpckhqdq  m9, m1
  punpcklqdq  m7, m2, m6
  punpckhqdq  m2, m6

  ; Final scaling: x = (x - (x >> 15)) >> 1. The sign mask is -1 for negative
  ; lanes, so negative values get +1 before the arithmetic shift; barring
  ; 16-bit wraparound this halves each lane with truncation toward zero.
  ; m8 (pd_8192) is no longer needed and is reused as a scratch register.
  psraw       m1, m4, 15
  psraw       m6, m5, 15
  psraw       m8, m3, 15
  psraw       m11, m0, 15

  psubw       m4, m1
  psubw       m5, m6
  psubw       m3, m8
  psubw       m0, m11

  psraw       m4, 1
  psraw       m5, 1
  psraw       m3, 1
  psraw       m0, 1

  psraw       m1, m10, 15
  psraw       m6, m9, 15
  psraw       m8, m7, 15
  psraw       m11, m2, 15

  psubw       m10, m1
  psubw       m9, m6
  psubw       m7, m8
  psubw       m2, m11

  psraw       m10, 1
  psraw       m9, 1
  psraw       m7, 1
  psraw       m2, 1

  ; One output row (8 x 32-bit) per call; index is in 32-bit elements.
  STORE_OUTPUT  0,  4
  STORE_OUTPUT  8,  5
  STORE_OUTPUT 16,  3
  STORE_OUTPUT 24,  0
  STORE_OUTPUT 32, 10
  STORE_OUTPUT 40,  9
  STORE_OUTPUT 48,  7
  STORE_OUTPUT 56,  2

  RET
%endif