/*
* Copyright(c) 2020 Intel Corporation
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
*/

#include <immintrin.h>

#include "transpose_avx2.h"

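/*
 * Returns the sum of absolute values of the "AC" coefficients of a one-level
 * 2-D lifting wavelet transform of an 8x8 block, i.e. every coefficient
 * outside the 4x4 low-pass (LL) quadrant. The interior predict step averages
 * the two neighbouring even samples; the right edge falls back to a plain
 * difference. For hbd != 0, `input` is a CONVERT_TO_SHORTPTR-packed pointer
 * to 16-bit pixels.
 */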
int svt_av1_haar_ac_sad_8x8_uint8_input_avx2(uint8_t *input, int stride, int hbd) {
    DECLARE_ALIGNED(32, int32_t, output[64]);

    int32_t *out_ptr = output;
    int      i;

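    /* Load the 8x8 block into the 32-bit scratch buffer, row by row,
     * pre-scaled by 4 (<< 2) so the rounded right-shifts in the two
     * lifting passes below keep extra precision. */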
    if (hbd) {
        /* High bit-depth path: widen 8 uint16_t pixels per row to 32 bits. */
        uint16_t *x16 = CONVERT_TO_SHORTPTR(input);
        for (i = 0; i < 8; i++) {
            __m256i inp = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)x16));
            _mm256_storeu_si256((__m256i *)out_ptr, _mm256_slli_epi32(inp, 2));

            x16 += stride;
            out_ptr += 8;
        }
    } else {
        /* 8-bit path: widen 8 uint8_t pixels per row to 32 bits. */
        for (i = 0; i < 8; i++) {
            __m256i inp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)input));
            _mm256_storeu_si256((__m256i *)out_ptr, _mm256_slli_epi32(inp, 2));

            input += stride;
            out_ptr += 8;
        }
    }

    /* With a gather scale of 8 bytes, indices 0..7 fetch every other int32_t,
     * i.e. the even (or odd) samples of two consecutive 8-element rows. */
    const __m256i indices = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    const __m256i one     = _mm256_set1_epi32(1); /* rounding offset for >> 1 */
    const __m256i two     = _mm256_set1_epi32(2); /* rounding offset for >> 2 */

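    /* Pass 1: one lifting step along each row, two rows per iteration
     * (lanes 0..3 hold row 2i, lanes 4..7 hold row 2i + 1).
     * Predict (high-pass): d[n] = x[2n + 1] - ((x[2n] + x[2n + 2] + 1) >> 1),
     * falling back to d[3] = x[7] - x[6] at the right edge.
     * Update (low-pass):   s[n] = 2 * x[2n] + ((d[n - 1] + d[n] + 1) >> 1),
     * with d[-1] taken as d[0] at the left edge. */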
    for (i = 0; i < 4; i++) {
        __m256i x_2n   = _mm256_i32gather_epi32(output + i * 16, indices, 8);     /* x[2n]     */
        __m256i x_2np1 = _mm256_i32gather_epi32(output + i * 16 + 1, indices, 8); /* x[2n + 1] */
        __m256i x_2np2 = _mm256_i32gather_epi32(output + i * 16 + 2, indices, 8); /* x[2n + 2] */

        __m256i a = _mm256_slli_epi32(x_2n, 1);
        __m256i b = _mm256_add_epi32(_mm256_add_epi32(x_2n, x_2np2), one);
        b         = _mm256_sub_epi32(x_2np1, _mm256_srli_epi32(b, 1));

        /* Lanes 3 and 7 have no x[2n + 2]; replace them with the edge difference. */
        int32_t idx_3 = output[i * 16 + 7] - output[i * 16 + 6];
        int32_t idx_7 = output[i * 16 + 15] - output[i * 16 + 14];
        b             = _mm256_insert_epi32(b, idx_3, 3);
        b             = _mm256_insert_epi32(b, idx_7, 7);

        /* r = { d0, d0, d1, d2 | d4, d4, d5, d6 }: d[n - 1] with d[-1] = d[0]. */
        __m256i r = _mm256_shuffle_epi32(b, (1 << 4) + (2 << 6));

        a = _mm256_add_epi32(a,
                             _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(r, b), one), 1));

        /* De-interleave: low-pass then high-pass halves for each of the two rows. */
        _mm_storeu_si128((__m128i *)(output + i * 16), _mm256_castsi256_si128(a));
        _mm_storeu_si128((__m128i *)(output + i * 16 + 4), _mm256_castsi256_si128(b));
        _mm_storeu_si128((__m128i *)(output + i * 16 + 8), _mm256_extracti128_si256(a, 0x1));
        _mm_storeu_si128((__m128i *)(output + i * 16 + 12), _mm256_extracti128_si256(b, 0x1));
    }

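    /* Transpose so the column pass below can reuse the row-wise gather pattern. */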
    transpose_32bit_8x8_avx2((const __m256i *const)output, (__m256i *const)output);

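    /* Pass 2: the same lifting structure down each column (rows of the
     * transposed data), with a different normalization:
     * Predict: d[n] = (2 * x[2n + 1] - x[2n] - x[2n + 2] + 2) >> 2,
     * edge:    d[3] = (x[7] - x[6] + 1) >> 1.
     * Update:  s[n] = x[2n] + ((d[n - 1] + d[n] + 1) >> 1).
     * Only absolute values are kept, since the result is a SAD. */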
    __m256i sum[8];
    for (i = 0; i < 4; i++) {
        __m256i x_2n   = _mm256_i32gather_epi32(output + i * 16, indices, 8);
        __m256i x_2np1 = _mm256_i32gather_epi32(output + i * 16 + 1, indices, 8);
        __m256i x_2np2 = _mm256_i32gather_epi32(output + i * 16 + 2, indices, 8);

        __m256i a = x_2n;
        __m256i b = _mm256_sub_epi32(_mm256_slli_epi32(x_2np1, 1), x_2n);
        b         = _mm256_add_epi32(_mm256_sub_epi32(b, x_2np2), two);
        b         = _mm256_srai_epi32(b, 2);

        /* Edge lanes again fall back to a plain (rounded) difference. */
        int32_t idx_3 = (output[i * 16 + 7] - output[i * 16 + 6] + 1) >> 1;
        int32_t idx_7 = (output[i * 16 + 15] - output[i * 16 + 14] + 1) >> 1;
        b             = _mm256_insert_epi32(b, idx_3, 3);
        b             = _mm256_insert_epi32(b, idx_7, 7);

        __m256i r = _mm256_shuffle_epi32(b, (1 << 4) + (2 << 6));

        a = _mm256_add_epi32(a,
                             _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(r, b), one), 1));

        sum[2 * i]     = _mm256_abs_epi32(a); /* smooth (low-pass) coefficients */
        sum[2 * i + 1] = _mm256_abs_epi32(b); /* detail (high-pass) coefficients */
    }

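    /* Accumulate every subband except sum[0] and sum[2], which hold the 4x4
     * low-pass/low-pass (LL) quadrant: the "AC" SAD deliberately excludes it. */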
    __m256i sum13, sum45, sum67;

    sum13 = _mm256_add_epi32(sum[1], sum[3]);
    sum45 = _mm256_add_epi32(sum[4], sum[5]);
    sum67 = _mm256_add_epi32(sum[6], sum[7]);
    sum13 = _mm256_add_epi32(sum13, _mm256_add_epi32(sum45, sum67));

    /* Horizontal reduction of the eight partial sums to a single scalar. */
    __m128i sum_128 = _mm_add_epi32(_mm256_castsi256_si128(sum13),
                                    _mm256_extracti128_si256(sum13, 1));

    sum_128 = _mm_hadd_epi32(sum_128, sum_128);
    sum_128 = _mm_hadd_epi32(sum_128, sum_128);

    return _mm_cvtsi128_si32(sum_128);
}