/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>
#include "./aom_dsp_rtcd.h"

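// Kernel signature shared by the block-variance helpers: each call
// accumulates the sum of squared differences (*sse) and the signed sum of
// differences (*sum) between one src tile and one ref tile.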
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse, int *sum);

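// Forward declaration; the kernel itself lives in a separate AVX2
// implementation file. Since variance_avx2() below steps 16 rows per
// iteration, each call here is expected to cover a 32-wide, 16-row strip.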
void aom_get32x32var_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride, unsigned int *sse,
                          int *sum);

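// Accumulates sse/sum over a w x h block by tiling it into block_size-wide,
// 16-row strips and dispatching each strip to var_fn.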
static void variance_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride, int w, int h,
                          unsigned int *sse, int *sum, get_var_avx2 var_fn,
                          int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += 16) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j],
             ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

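// Each wrapper below computes variance as sse - sum^2 / (w * h), with the
// division implemented as a right shift by log2(w * h): 8 for 16x16, 9 for
// 32x16, 10 for 32x32, 11 for 64x32 and 12 for 64x64. Widening sum to
// int64_t keeps sum * sum from overflowing, and _mm256_zeroupper() avoids
// AVX-to-SSE transition penalties once control returns to SSE-only code.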
unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  unsigned int variance;
  variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                aom_get16x16var_avx2, 16);

  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
  _mm256_zeroupper();
  return variance;
}

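// MSE is the raw sum of squared errors: the mean term is not subtracted,
// so the kernel's sum output is simply discarded.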
unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  int sum;
  aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
  _mm256_zeroupper();
  return *sse;
}

unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  unsigned int variance;
  variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
                aom_get32x32var_avx2, 32);

  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
  _mm256_zeroupper();
  return variance;
}

unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  unsigned int variance;
  variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
                aom_get32x32var_avx2, 32);

  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
  _mm256_zeroupper();
  return variance;
}

unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  unsigned int variance;
  variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
                aom_get32x32var_avx2, 32);

  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
  _mm256_zeroupper();
  return variance;
}

unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  unsigned int variance;
  variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
                aom_get32x32var_avx2, 32);

  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
  _mm256_zeroupper();
  return variance;
}

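// Sub-pixel kernels, defined in a separate translation unit. x_offset and
// y_offset (sub-pel phases, 0-7) select the bilinear interpolation applied
// to src before the difference against dst is taken.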
unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse);

unsigned int aom_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sseptr);

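// 64-wide blocks are split into two independent 32-wide halves; the partial
// sums and SSEs are combined before the variance correction (>> 12, since
// 64 * 64 = 4096).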
unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  unsigned int sse1;
  const int se1 = aom_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1);
  unsigned int sse2;
  const int se2 =
      aom_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
                                      dst + 32, dst_stride, 64, &sse2);
  const int se = se1 + se2;
  unsigned int variance;
  *sse = sse1 + sse2;

  variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
  _mm256_zeroupper();
  return variance;
}

unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  const int se = aom_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);

  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
  _mm256_zeroupper();
  return variance;
}

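// The avg variants additionally average the second predictor sec into the
// filtered source before differencing (compound prediction).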
unsigned int aom_sub_pixel_avg_variance64x64_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
  unsigned int sse1;
  const int se1 = aom_sub_pixel_avg_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1);
  unsigned int sse2;
  const int se2 = aom_sub_pixel_avg_variance32xh_avx2(
      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
      64, 64, &sse2);
  const int se = se1 + se2;
  unsigned int variance;

  *sse = sse1 + sse2;

  variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
  _mm256_zeroupper();
  return variance;
}

unsigned int aom_sub_pixel_avg_variance32x32_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
  // Process 32 elements in parallel.
  const int se = aom_sub_pixel_avg_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);

  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
  _mm256_zeroupper();
  return variance;
}