1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
#include <assert.h>
#include <string.h>
#include <emmintrin.h>  // SSE2
#include <smmintrin.h>  /* SSE4.1 */
#include <immintrin.h>  /* AVX2 */

#include "aom/aom_integer.h"
#include "aom_dsp/x86/mem_sse2.h"
#include "av1/common/onyxc_int.h"
#include "av1/common/txb_common.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"
23 
av1_txb_init_levels_avx2(const tran_low_t * const coeff,const int width,const int height,uint8_t * const levels)24 void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
25                               const int height, uint8_t *const levels) {
26   const int stride = width + TX_PAD_HOR;
27   const __m256i y_zeros = _mm256_setzero_si256();
28 
29   const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
30   uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride;
31   uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
32 
33   do {
34     yy_storeu_256(bottom_buf, y_zeros);
35     bottom_buf += 32;
36   } while (bottom_buf < bottom_buf_end);
37 
38   int i = 0;
39   uint8_t *ls = levels;
40   const tran_low_t *cf = coeff;
41   if (width == 4) {
42     do {
43       const __m256i c0 = yy_loadu_256(cf);
44       const __m256i c1 = yy_loadu_256(cf + 8);
45       const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
46       const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);
47       const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
48       const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
49       yy_storeu_256(ls, res);
50       ls += 32;
51       cf += 16;
52       i += 4;
53     } while (i < height);
54   } else if (width == 8) {
55     do {
56       const __m256i coeffA = yy_loadu_256(cf);
57       const __m256i coeffB = yy_loadu_256(cf + 8);
58       const __m256i coeffC = yy_loadu_256(cf + 16);
59       const __m256i coeffD = yy_loadu_256(cf + 24);
60       const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
61       const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
62       const __m256i absAB = _mm256_abs_epi16(coeffAB);
63       const __m256i absCD = _mm256_abs_epi16(coeffCD);
64       const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
65       const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
66       const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
67       const __m128i res0 = _mm256_castsi256_si128(res);
68       const __m128i res1 = _mm256_extracti128_si256(res, 1);
69       xx_storel_64(ls, res0);
70       *(int32_t *)(ls + width) = 0;
71       xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
72       *(int32_t *)(ls + width + stride) = 0;
73       xx_storel_64(ls + stride * 2, res1);
74       *(int32_t *)(ls + width + stride * 2) = 0;
75       xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
76       *(int32_t *)(ls + width + stride * 3) = 0;
77       cf += 32;
78       ls += stride << 2;
79       i += 4;
80     } while (i < height);
81   } else if (width == 16) {
82     do {
83       const __m256i coeffA = yy_loadu_256(cf);
84       const __m256i coeffB = yy_loadu_256(cf + 8);
85       const __m256i coeffC = yy_loadu_256(cf + 16);
86       const __m256i coeffD = yy_loadu_256(cf + 24);
87       const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
88       const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
89       const __m256i absAB = _mm256_abs_epi16(coeffAB);
90       const __m256i absCD = _mm256_abs_epi16(coeffCD);
91       const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
92       const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
93       const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
94       xx_storeu_128(ls, _mm256_castsi256_si128(res));
95       xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
96       cf += 32;
97       *(int32_t *)(ls + width) = 0;
98       *(int32_t *)(ls + stride + width) = 0;
99       ls += stride << 1;
100       i += 2;
101     } while (i < height);
102   } else {
103     do {
104       const __m256i coeffA = yy_loadu_256(cf);
105       const __m256i coeffB = yy_loadu_256(cf + 8);
106       const __m256i coeffC = yy_loadu_256(cf + 16);
107       const __m256i coeffD = yy_loadu_256(cf + 24);
108       const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
109       const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
110       const __m256i absAB = _mm256_abs_epi16(coeffAB);
111       const __m256i absCD = _mm256_abs_epi16(coeffCD);
112       const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
113       const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
114       const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
115       yy_storeu_256(ls, res);
116       cf += 32;
117       *(int32_t *)(ls + width) = 0;
118       ls += stride;
119       i += 1;
120     } while (i < height);
121   }
122 }
123