1 /*
2 * Copyright(c) 2020 Intel Corporation
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10 */
11
12 #include "transpose_avx2.h"
13
svt_av1_haar_ac_sad_8x8_uint8_input_avx2(uint8_t * input,int stride,int hbd)14 int svt_av1_haar_ac_sad_8x8_uint8_input_avx2(uint8_t *input, int stride, int hbd) {
15 DECLARE_ALIGNED(32, int32_t, output[64]);
16
17 int32_t *out_ptr = output;
18 int i;
19
20 if (hbd) {
21 uint16_t *x16 = CONVERT_TO_SHORTPTR(input);
22 for (i = 0; i < 8; i++) {
23 __m256i inp = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)x16));
24 _mm256_storeu_si256((__m256i *)out_ptr, _mm256_slli_epi32(inp, 2));
25
26 x16 += stride;
27 out_ptr += 8;
28 }
29 } else {
30 for (i = 0; i < 8; i++) {
31 __m256i inp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)input));
32 _mm256_storeu_si256((__m256i *)out_ptr, _mm256_slli_epi32(inp, 2));
33
34 input += stride;
35 out_ptr += 8;
36 }
37 }
38
39 const __m256i indices = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
40 const __m256i one = _mm256_set1_epi32(1);
41 const __m256i two = _mm256_set1_epi32(2);
42
43 for (i = 0; i < 4; i++) {
44 __m256i x_2n = _mm256_i32gather_epi32(output + i * 16, indices, 8);
45 __m256i x_2np1 = _mm256_i32gather_epi32(output + i * 16 + 1, indices, 8);
46 __m256i x_2np2 = _mm256_i32gather_epi32(output + i * 16 + 2, indices, 8);
47
48 __m256i a = _mm256_slli_epi32(x_2n, 1);
49 __m256i b = _mm256_add_epi32(_mm256_add_epi32(x_2n, x_2np2), one);
50 b = _mm256_sub_epi32(x_2np1, _mm256_srli_epi32(b, 1));
51
52 int32_t idx_3 = output[i * 16 + 7] - output[i * 16 + 6];
53 int32_t idx_7 = output[i * 16 + 15] - output[i * 16 + 14];
54 b = _mm256_insert_epi32(b, idx_3, 3);
55 b = _mm256_insert_epi32(b, idx_7, 7);
56
57 __m256i r = _mm256_shuffle_epi32(b, (1 << 4) + (2 << 6));
58
59 a = _mm256_add_epi32(a,
60 _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(r, b), one), 1));
61
62 _mm_storeu_si128((__m128i *)(output + i * 16), _mm256_castsi256_si128(a));
63 _mm_storeu_si128((__m128i *)(output + i * 16 + 4), _mm256_castsi256_si128(b));
64 _mm_storeu_si128((__m128i *)(output + i * 16 + 8), _mm256_extracti128_si256(a, 0x1));
65 _mm_storeu_si128((__m128i *)(output + i * 16 + 12), _mm256_extracti128_si256(b, 0x1));
66 }
67
68 transpose_32bit_8x8_avx2((const __m256i *const)output, (__m256i *const)output);
69
70 __m256i sum[8];
71 for (i = 0; i < 4; i++) {
72 __m256i x_2n = _mm256_i32gather_epi32(output + i * 16, indices, 8);
73 __m256i x_2np1 = _mm256_i32gather_epi32(output + i * 16 + 1, indices, 8);
74 __m256i x_2np2 = _mm256_i32gather_epi32(output + i * 16 + 2, indices, 8);
75
76 __m256i a = x_2n;
77 __m256i b = _mm256_sub_epi32(_mm256_slli_epi32(x_2np1, 1), x_2n);
78 b = _mm256_add_epi32(_mm256_sub_epi32(b, x_2np2), two);
79 b = _mm256_srai_epi32(b, 2);
80
81 int32_t idx_3 = (output[i * 16 + 7] - output[i * 16 + 6] + 1) >> 1;
82 int32_t idx_7 = (output[i * 16 + 15] - output[i * 16 + 14] + 1) >> 1;
83 b = _mm256_insert_epi32(b, idx_3, 3);
84 b = _mm256_insert_epi32(b, idx_7, 7);
85
86 __m256i r = _mm256_shuffle_epi32(b, (1 << 4) + (2 << 6));
87
88 a = _mm256_add_epi32(a,
89 _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(r, b), one), 1));
90
91 sum[2 * i] = _mm256_abs_epi32(a);
92 sum[2 * i + 1] = _mm256_abs_epi32(b);
93 }
94
95 __m256i sum13, sum45, sum67;
96
97 sum13 = _mm256_add_epi32(sum[1], sum[3]);
98 sum45 = _mm256_add_epi32(sum[4], sum[5]);
99 sum67 = _mm256_add_epi32(sum[6], sum[7]);
100 sum13 = _mm256_add_epi32(sum13, _mm256_add_epi32(sum45, sum67));
101
102 __m128i sum_128 = _mm_add_epi32(_mm256_castsi256_si128(sum13),
103 _mm256_extracti128_si256(sum13, 1));
104
105 sum_128 = _mm_hadd_epi32(sum_128, sum_128);
106 sum_128 = _mm_hadd_epi32(sum_128, sum_128);
107
108 return _mm_cvtsi128_si32(sum_128);
109 }
110