1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <emmintrin.h> // SSE2
14 #include <smmintrin.h> /* SSE4.1 */
15 #include <immintrin.h> /* AVX2 */
16
17 #include "aom/aom_integer.h"
18 #include "aom_dsp/x86/mem_sse2.h"
19 #include "av1/common/onyxc_int.h"
20 #include "av1/common/txb_common.h"
21 #include "aom_dsp/x86/synonyms.h"
22 #include "aom_dsp/x86/synonyms_avx2.h"
23
// Build the padded 8-bit "levels" buffer used by transform-block coefficient
// coding: levels[r * stride + c] = saturating |coeff[r * width + c]| packed
// down to 8 bits, with TX_PAD_TOP zero rows above, TX_PAD_BOTTOM zero rows
// below, and TX_PAD_HOR zero bytes to the right of each row.
//
// coeff  : width*height 32-bit coefficients in row-major order (read-only).
// width  : transform block width; dispatched below as 4 / 8 / 16 / >=16.
// height : transform block height (rows processed per iteration depend on
//          the width case: 4, 4, 2, and 1 respectively).
// levels : destination; must point TX_PAD_TOP rows *into* an allocation large
//          enough for the top/bottom padding and the 32-byte overwrite of the
//          padding loops below (the caller's buffer is assumed to be sized
//          for this — standard txb_common layout; not checkable from here).
void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
                              const int height, uint8_t *const levels) {
  const int stride = width + TX_PAD_HOR;
  const __m256i y_zeros = _mm256_setzero_si256();

  // Zero the TX_PAD_TOP padding rows above `levels`, 32 bytes per store.
  // The loop may run past pre_len up to the next 32-byte multiple, spilling
  // into the live rows that are fully rewritten below — harmless.
  const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
  uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
  uint8_t *pre_buf_end = pre_buf + pre_len;
  do {
    yy_storeu_256(pre_buf, y_zeros);
    pre_buf += 32;
  } while (pre_buf < pre_buf_end);

  // Zero the TX_PAD_BOTTOM padding rows below the block. The start pointer is
  // pulled back to a 32-byte multiple of the length ((len + 31) & ~31) so the
  // whole pad is covered; the extra leading bytes land in the last live rows,
  // which are written afterwards by the main loops below.
  const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
  uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride;
  uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));

  do {
    yy_storeu_256(bottom_buf, y_zeros);
    bottom_buf += 32;
  } while (bottom_buf < bottom_buf_end);

  int i = 0;
  uint8_t *ls = levels;
  const tran_low_t *cf = coeff;
  if (width == 4) {
    // 4-wide: 4 rows (16 coeffs) per iteration. stride = 4 + TX_PAD_HOR, so
    // packing 16 abs values against y_zeros yields value/padding byte pairs;
    // one 32-byte store covers 4 full padded rows, horizontal padding
    // included. |packs| saturates, so levels cap at INT8_MAX like the C ref.
    do {
      const __m256i c0 = yy_loadu_256(cf);
      const __m256i c1 = yy_loadu_256(cf + 8);
      const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
      const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);
      // packs operates per 128-bit lane; the shuffle + permute with 0xd8
      // (= [0,2,1,3]) restore the natural row order across lanes.
      const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
      const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
      yy_storeu_256(ls, res);
      ls += 32;
      cf += 16;
      i += 4;
    } while (i < height);
  } else if (width == 8) {
    // 8-wide: 4 rows (32 coeffs) per iteration. 32 abs values are packed into
    // one 256-bit register, re-ordered, then stored 8 bytes per row, with the
    // 4 horizontal padding bytes zeroed via a 32-bit scalar store.
    do {
      const __m256i coeffA = yy_loadu_256(cf);
      const __m256i coeffB = yy_loadu_256(cf + 8);
      const __m256i coeffC = yy_loadu_256(cf + 16);
      const __m256i coeffD = yy_loadu_256(cf + 24);
      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
      const __m256i absAB = _mm256_abs_epi16(coeffAB);
      const __m256i absCD = _mm256_abs_epi16(coeffCD);
      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
      // Undo the per-lane interleaving of the two packs (0xd8 = [0,2,1,3]).
      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
      const __m128i res0 = _mm256_castsi256_si128(res);
      const __m128i res1 = _mm256_extracti128_si256(res, 1);
      xx_storel_64(ls, res0);
      *(int32_t *)(ls + width) = 0;  // zero TX_PAD_HOR bytes of this row
      xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
      *(int32_t *)(ls + width + stride) = 0;
      xx_storel_64(ls + stride * 2, res1);
      *(int32_t *)(ls + width + stride * 2) = 0;
      xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
      *(int32_t *)(ls + width + stride * 3) = 0;
      cf += 32;
      ls += stride << 2;  // advance 4 rows
      i += 4;
    } while (i < height);
  } else if (width == 16) {
    // 16-wide: 2 rows (32 coeffs) per iteration; one 128-bit store per row
    // plus a 32-bit scalar store for the horizontal padding.
    do {
      const __m256i coeffA = yy_loadu_256(cf);
      const __m256i coeffB = yy_loadu_256(cf + 8);
      const __m256i coeffC = yy_loadu_256(cf + 16);
      const __m256i coeffD = yy_loadu_256(cf + 24);
      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
      const __m256i absAB = _mm256_abs_epi16(coeffAB);
      const __m256i absCD = _mm256_abs_epi16(coeffCD);
      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
      // Undo the per-lane interleaving of the two packs (0xd8 = [0,2,1,3]).
      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
      xx_storeu_128(ls, _mm256_castsi256_si128(res));
      xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
      cf += 32;
      *(int32_t *)(ls + width) = 0;  // zero TX_PAD_HOR bytes per row
      *(int32_t *)(ls + stride + width) = 0;
      ls += stride << 1;  // advance 2 rows
      i += 2;
    } while (i < height);
  } else {
    // width == 32 (the only remaining AV1 transform width): 1 row per
    // iteration, one full 256-bit store plus the horizontal-padding zero.
    do {
      const __m256i coeffA = yy_loadu_256(cf);
      const __m256i coeffB = yy_loadu_256(cf + 8);
      const __m256i coeffC = yy_loadu_256(cf + 16);
      const __m256i coeffD = yy_loadu_256(cf + 24);
      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
      const __m256i absAB = _mm256_abs_epi16(coeffAB);
      const __m256i absCD = _mm256_abs_epi16(coeffCD);
      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
      // Undo the per-lane interleaving of the two packs (0xd8 = [0,2,1,3]).
      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
      yy_storeu_256(ls, res);
      cf += 32;
      *(int32_t *)(ls + width) = 0;  // zero TX_PAD_HOR bytes of this row
      ls += stride;
      i += 1;
    } while (i < height);
  }
}
131