1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <immintrin.h>
13 #include "./aom_dsp_rtcd.h"
14 
// Signature shared by the per-block SSE/sum kernels (e.g.
// aom_get16x16var_avx2): for one fixed-size block, writes the sum of squared
// differences between src and ref to *sse and the signed sum of differences
// to *sum.
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse, int *sum);

// Forward declaration of the 32-wide AVX2 kernel (defined in a companion
// implementation file) so it can be passed to variance_avx2() below.
void aom_get32x32var_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride, unsigned int *sse,
                          int *sum);
22 
// Accumulate the SSE and signed sum of a w x h region by tiling it with
// calls to var_fn. The column loop advances by block_size (the kernel's
// width); the row loop advances 16 at a time, so var_fn is expected to
// cover 16 rows per call — NOTE(review): confirm against the get*var
// kernel implementations.
static void variance_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride, int w, int h,
                          unsigned int *sse, int *sum, get_var_avx2 var_fn,
                          int block_size) {
  int row, col;

  *sse = 0;
  *sum = 0;

  for (row = 0; row < h; row += 16) {
    for (col = 0; col < w; col += block_size) {
      unsigned int block_sse;
      int block_sum;
      var_fn(src + src_stride * row + col, src_stride,
             ref + ref_stride * row + col, ref_stride, &block_sse, &block_sum);
      *sse += block_sse;
      *sum += block_sum;
    }
  }
}
43 
aom_variance16x16_avx2(const uint8_t * src,int src_stride,const uint8_t * ref,int ref_stride,unsigned int * sse)44 unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride,
45                                     const uint8_t *ref, int ref_stride,
46                                     unsigned int *sse) {
47   int sum;
48   unsigned int variance;
49   variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
50                 aom_get16x16var_avx2, 16);
51 
52   variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
53   _mm256_zeroupper();
54   return variance;
55 }
56 
// Mean squared error of a 16x16 block: the SSE from the 16x16 kernel,
// with the signed sum discarded. Also written to *sse.
unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  int unused_sum;
  aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &unused_sum);
  _mm256_zeroupper();
  return *sse;
}
65 
aom_variance32x16_avx2(const uint8_t * src,int src_stride,const uint8_t * ref,int ref_stride,unsigned int * sse)66 unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride,
67                                     const uint8_t *ref, int ref_stride,
68                                     unsigned int *sse) {
69   int sum;
70   unsigned int variance;
71   variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
72                 aom_get32x32var_avx2, 32);
73 
74   variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
75   _mm256_zeroupper();
76   return variance;
77 }
78 
aom_variance32x32_avx2(const uint8_t * src,int src_stride,const uint8_t * ref,int ref_stride,unsigned int * sse)79 unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride,
80                                     const uint8_t *ref, int ref_stride,
81                                     unsigned int *sse) {
82   int sum;
83   unsigned int variance;
84   variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
85                 aom_get32x32var_avx2, 32);
86 
87   variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
88   _mm256_zeroupper();
89   return variance;
90 }
91 
aom_variance64x64_avx2(const uint8_t * src,int src_stride,const uint8_t * ref,int ref_stride,unsigned int * sse)92 unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride,
93                                     const uint8_t *ref, int ref_stride,
94                                     unsigned int *sse) {
95   int sum;
96   unsigned int variance;
97   variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
98                 aom_get32x32var_avx2, 32);
99 
100   variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
101   _mm256_zeroupper();
102   return variance;
103 }
104 
aom_variance64x32_avx2(const uint8_t * src,int src_stride,const uint8_t * ref,int ref_stride,unsigned int * sse)105 unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride,
106                                     const uint8_t *ref, int ref_stride,
107                                     unsigned int *sse) {
108   int sum;
109   unsigned int variance;
110   variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
111                 aom_get32x32var_avx2, 32);
112 
113   variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
114   _mm256_zeroupper();
115   return variance;
116 }
117 
// Sub-pixel variance kernel for a 32-wide column of `height` rows, defined
// in a companion implementation file. Writes the SSE to *sse and returns the
// signed sum of differences. x_offset/y_offset select the sub-pixel filter
// phase — presumably in 1/8-pel units as elsewhere in aom_dsp; confirm
// against the kernel implementation.
unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse);

// Compound (averaging) variant: additionally averages the filtered
// prediction with the `sec` buffer before computing SSE/sum.
unsigned int aom_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sseptr);
127 
// Sub-pixel variance of a 64x64 block, computed as two side-by-side
// 32x64 halves whose SSE and signed sums are combined.
unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  unsigned int sse_left, sse_right;
  const int se_left = aom_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse_left);
  const int se_right = aom_sub_pixel_variance32xh_avx2(
      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, 64,
      &sse_right);
  const int se = se_left + se_right;

  *sse = sse_left + sse_right;

  // 64 * 64 = 4096 pixels, so se^2 is divided by 4096 via >> 12.
  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
  _mm256_zeroupper();
  return variance;
}
148 
// Sub-pixel variance of a 32x32 block via a single 32xh kernel call.
unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  const int se = aom_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);

  // 32 * 32 = 1024 pixels, so se^2 is divided by 1024 via >> 10.
  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
  _mm256_zeroupper();
  return variance;
}
161 
// Compound sub-pixel variance of a 64x64 block: two side-by-side 32x64
// halves, each averaged with the corresponding half of `sec` (stride 64).
unsigned int aom_sub_pixel_avg_variance64x64_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
  unsigned int sse_left, sse_right;
  const int se_left = aom_sub_pixel_avg_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64,
      &sse_left);
  const int se_right = aom_sub_pixel_avg_variance32xh_avx2(
      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
      64, 64, &sse_right);
  const int se = se_left + se_right;

  *sse = sse_left + sse_right;

  // 64 * 64 = 4096 pixels, so se^2 is divided by 4096 via >> 12.
  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
  _mm256_zeroupper();
  return variance;
}
181 
// Compound sub-pixel variance of a 32x32 block via a single averaging
// 32xh kernel call (sec stride 32).
unsigned int aom_sub_pixel_avg_variance32x32_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
  // Process 32 elements in parallel.
  const int se = aom_sub_pixel_avg_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);

  // 32 * 32 = 1024 pixels, so se^2 is divided by 1024 via >> 10.
  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
  _mm256_zeroupper();
  return variance;
}
193