1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <emmintrin.h>
13 
14 #include "aom_dsp/aom_dsp_common.h"
15 #include "aom_mem/aom_mem.h"
16 #include "aom_ports/mem.h"
17 
/*
 * Quantizes a block of 32-bit transform coefficients using SSE2.
 *
 * qcoeff_ptr and dqcoeff_ptr are zero-filled for all `count` entries first.
 * Coefficients are then handled in groups of four ((int)count / 4 groups, in
 * memory order); entries past the last full group are only zero-filled.
 *
 * Element 0 of zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr and
 * dequant_ptr is applied to coefficient 0; element 1 is applied to every
 * other coefficient.
 *
 * On return *eob_ptr is one more than the largest iscan[] value among the
 * coefficients that quantized to a non-zero level, or 0 if there were none.
 *
 * coeff_ptr is read with _mm_load_si128, which requires 16-byte alignment.
 * scan is unused.
 */
void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
                                const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  const int num_groups = (int)count / 4;
  int live_groups = num_groups;
  int max_scan_pos = -1;
  int group, lane;
  __m128i zbin_vec[2];
  __m128i neg_zbin_vec[2];

  (void)scan;

  // Entry [0] is used for the first group of four: lane 0 carries zbin_ptr[0]
  // and the other three lanes carry zbin_ptr[1]. Entry [1] is used for every
  // later group and carries zbin_ptr[1] in all lanes.
  zbin_vec[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1],
                              (int)zbin_ptr[1], (int)zbin_ptr[0]);
  zbin_vec[1] = _mm_set1_epi32((int)zbin_ptr[1]);
  neg_zbin_vec[0] = _mm_sub_epi32(_mm_setzero_si128(), zbin_vec[0]);
  neg_zbin_vec[1] = _mm_sub_epi32(_mm_setzero_si128(), zbin_vec[1]);

  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

  // Pre-scan: walk backwards from the last group and drop every trailing
  // group whose four values all lie strictly between -zbin and +zbin. Stop at
  // the first group that has at least one value outside that range.
  for (group = num_groups - 1; group >= 0; group--) {
    const __m128i vals =
        _mm_load_si128((const __m128i *)(coeff_ptr + group * 4));
    const __m128i below_zbin = _mm_cmplt_epi32(vals, zbin_vec[group != 0]);
    const __m128i above_nzbin =
        _mm_cmpgt_epi32(vals, neg_zbin_vec[group != 0]);
    const int inside_mask =
        _mm_movemask_epi8(_mm_and_si128(below_zbin, above_nzbin));
    if (inside_mask != 0xffff) break;
    live_groups--;
  }

  // Quantization: only the groups kept by the pre-scan are visited.
  for (group = 0; group < live_groups; group++) {
    const __m128i raw =
        _mm_load_si128((const __m128i *)(coeff_ptr + group * 4));
    // sign_vec is all-ones for negative lanes and zero otherwise, so
    // (x ^ sign) - sign yields the absolute value.
    const __m128i sign_vec = _mm_srai_epi32(raw, 31);
    const __m128i mag_vec =
        _mm_sub_epi32(_mm_xor_si128(raw, sign_vec), sign_vec);
    const __m128i zbin = zbin_vec[group != 0];
    // Lane is selected when |coeff| >= zbin (greater-than OR equal).
    const __m128i at_or_above = _mm_or_si128(_mm_cmpgt_epi32(mag_vec, zbin),
                                             _mm_cmpeq_epi32(mag_vec, zbin));
    // movemask yields one bit per byte, i.e. four bits per 32-bit lane.
    const int lane_mask = _mm_movemask_epi8(at_or_above);
    int mag[4];
    int sign[4];

    _mm_storeu_si128((__m128i *)mag, mag_vec);
    _mm_storeu_si128((__m128i *)sign, sign_vec);

    for (lane = 0; lane < 4; lane++) {
      const int pos = 4 * group + lane;
      const int is_ac = pos != 0;
      int64_t rounded, scaled;
      uint32_t level;

      // Testing the lowest of the lane's four mask bits is enough.
      if (!(lane_mask & (1 << (4 * lane)))) continue;

      rounded = mag[lane] + round_ptr[is_ac];
      scaled = ((rounded * quant_ptr[is_ac]) >> 16) + rounded;
      level = (uint32_t)((scaled * quant_shift_ptr[is_ac]) >> 16);
      // Re-apply the sign of the input coefficient.
      qcoeff_ptr[pos] = (int)(level ^ sign[lane]) - sign[lane];
      dqcoeff_ptr[pos] = qcoeff_ptr[pos] * dequant_ptr[is_ac];
      if (level && iscan[pos] > max_scan_pos) max_scan_pos = iscan[pos];
    }
  }
  *eob_ptr = max_scan_pos + 1;
}
91 
/*
 * 32x32 variant of the SSE2 quantizer for 32-bit transform coefficients.
 *
 * Compared with the plain formula, zbin and round are halved with rounding
 * (ROUND_POWER_OF_TWO(x, 1)), the final right shift is 15 instead of 16, and
 * the dequantized value is divided by 2.
 *
 * qcoeff_ptr and dqcoeff_ptr are zero-filled for all n_coeffs entries first.
 * A vectorised pre-scan over n_coeffs / 4 groups of four then records the
 * index of every coefficient that is >= zbin or <= -zbin; only those
 * coefficients are quantized.
 *
 * Element 0 of zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr and
 * dequant_ptr is applied to coefficient 0; element 1 is applied to every
 * other coefficient.
 *
 * On return *eob_ptr is one more than the largest iscan[] value among the
 * coefficients that quantized to a non-zero level, or 0 if there were none.
 *
 * coeff_ptr is read with _mm_load_si128, which requires 16-byte alignment.
 * scan is unused.
 *
 * NOTE(review): the index buffer holds 1024 entries and is not bounds
 * checked, so this appears to rely on callers passing n_coeffs <= 1024.
 */
void aom_highbd_quantize_b_32x32_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  // Single shift amount behind the halved zbin/round, the >> 15 and the / 2.
  const int scale_bits = 1;
  const int zbin_first = ROUND_POWER_OF_TWO(zbin_ptr[0], scale_bits);
  const int zbin_rest = ROUND_POWER_OF_TWO(zbin_ptr[1], scale_bits);
  const intptr_t num_groups = n_coeffs / 4;
  int selected[1024];
  int num_selected = 0;
  int max_scan_pos = -1;
  int group, lane, n;
  __m128i zbin_vec[2];
  __m128i neg_zbin_vec[2];

  (void)scan;

  // Entry [0] is used for the first group of four (lane 0 gets the
  // coefficient-0 threshold); entry [1] is used for every later group.
  zbin_vec[0] = _mm_set_epi32(zbin_rest, zbin_rest, zbin_rest, zbin_first);
  zbin_vec[1] = _mm_set1_epi32(zbin_rest);
  neg_zbin_vec[0] = _mm_sub_epi32(_mm_setzero_si128(), zbin_vec[0]);
  neg_zbin_vec[1] = _mm_sub_epi32(_mm_setzero_si128(), zbin_vec[1]);

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  // Pre-scan: collect the indices of coefficients that are not strictly
  // between -zbin and +zbin.
  for (group = 0; group < num_groups; group++) {
    const __m128i vals =
        _mm_load_si128((const __m128i *)(coeff_ptr + group * 4));
    const __m128i below_zbin = _mm_cmplt_epi32(vals, zbin_vec[group != 0]);
    const __m128i above_nzbin =
        _mm_cmpgt_epi32(vals, neg_zbin_vec[group != 0]);
    // One mask bit per byte: each 32-bit lane owns a nibble, set when the
    // value is strictly inside the (-zbin, zbin) range.
    const int inside_mask =
        _mm_movemask_epi8(_mm_and_si128(below_zbin, above_nzbin));
    for (lane = 0; lane < 4; lane++) {
      if (!(inside_mask & (0xf << (4 * lane)))) {
        selected[num_selected++] = group * 4 + lane;
      }
    }
  }

  // Quantization: scalar pass over the selected coefficients only.
  // num_selected may be zero.
  for (n = 0; n < num_selected; n++) {
    const int pos = selected[n];
    const int is_ac = pos != 0;
    const int coeff = coeff_ptr[pos];
    const int sign = AOMSIGN(coeff);
    const int mag = (coeff ^ sign) - sign;
    const int64_t rounded =
        mag + ROUND_POWER_OF_TWO(round_ptr[is_ac], scale_bits);
    const int64_t scaled = ((rounded * quant_ptr[is_ac]) >> 16) + rounded;
    const uint32_t level =
        (uint32_t)((scaled * quant_shift_ptr[is_ac]) >> (16 - scale_bits));
    // Re-apply the sign of the input coefficient.
    qcoeff_ptr[pos] = (int)(level ^ sign) - sign;
    dqcoeff_ptr[pos] =
        qcoeff_ptr[pos] * dequant_ptr[is_ac] / (1 << scale_bits);
    if (level && iscan[pos] > max_scan_pos) max_scan_pos = iscan[pos];
  }
  *eob_ptr = max_scan_pos + 1;
}
149 
/*
 * 64x64 variant of the SSE2 quantizer for 32-bit transform coefficients.
 *
 * Compared with the plain formula, zbin and round are divided by 4 with
 * rounding (ROUND_POWER_OF_TWO(x, 2)), the final right shift is 14 instead
 * of 16, and the dequantized value is divided by 4.
 *
 * qcoeff_ptr and dqcoeff_ptr are zero-filled for all n_coeffs entries first.
 * A vectorised pre-scan over n_coeffs / 4 groups of four then records the
 * index of every coefficient that is >= zbin or <= -zbin; only those
 * coefficients are quantized.
 *
 * Element 0 of zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr and
 * dequant_ptr is applied to coefficient 0; element 1 is applied to every
 * other coefficient.
 *
 * On return *eob_ptr is one more than the largest iscan[] value among the
 * coefficients that quantized to a non-zero level, or 0 if there were none.
 *
 * coeff_ptr is read with _mm_load_si128, which requires 16-byte alignment.
 * scan is unused.
 *
 * NOTE(review): the index buffer holds 1024 entries and is not bounds
 * checked, so this appears to rely on callers passing n_coeffs <= 1024.
 */
void aom_highbd_quantize_b_64x64_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  // Single shift amount behind the scaled zbin/round, the >> 14 and the / 4.
  const int scale_bits = 2;
  const int zbin_first = ROUND_POWER_OF_TWO(zbin_ptr[0], scale_bits);
  const int zbin_rest = ROUND_POWER_OF_TWO(zbin_ptr[1], scale_bits);
  const intptr_t num_groups = n_coeffs / 4;
  int selected[1024];
  int num_selected = 0;
  int max_scan_pos = -1;
  int group, lane, n;
  __m128i zbin_vec[2];
  __m128i neg_zbin_vec[2];

  (void)scan;

  // Entry [0] is used for the first group of four (lane 0 gets the
  // coefficient-0 threshold); entry [1] is used for every later group.
  zbin_vec[0] = _mm_set_epi32(zbin_rest, zbin_rest, zbin_rest, zbin_first);
  zbin_vec[1] = _mm_set1_epi32(zbin_rest);
  neg_zbin_vec[0] = _mm_sub_epi32(_mm_setzero_si128(), zbin_vec[0]);
  neg_zbin_vec[1] = _mm_sub_epi32(_mm_setzero_si128(), zbin_vec[1]);

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  // Pre-scan: collect the indices of coefficients that are not strictly
  // between -zbin and +zbin.
  for (group = 0; group < num_groups; group++) {
    const __m128i vals =
        _mm_load_si128((const __m128i *)(coeff_ptr + group * 4));
    const __m128i below_zbin = _mm_cmplt_epi32(vals, zbin_vec[group != 0]);
    const __m128i above_nzbin =
        _mm_cmpgt_epi32(vals, neg_zbin_vec[group != 0]);
    // One mask bit per byte: each 32-bit lane owns a nibble, set when the
    // value is strictly inside the (-zbin, zbin) range.
    const int inside_mask =
        _mm_movemask_epi8(_mm_and_si128(below_zbin, above_nzbin));
    for (lane = 0; lane < 4; lane++) {
      if (!(inside_mask & (0xf << (4 * lane)))) {
        selected[num_selected++] = group * 4 + lane;
      }
    }
  }

  // Quantization: scalar pass over the selected coefficients only.
  // num_selected may be zero.
  for (n = 0; n < num_selected; n++) {
    const int pos = selected[n];
    const int is_ac = pos != 0;
    const int coeff = coeff_ptr[pos];
    const int sign = AOMSIGN(coeff);
    const int mag = (coeff ^ sign) - sign;
    const int64_t rounded =
        mag + ROUND_POWER_OF_TWO(round_ptr[is_ac], scale_bits);
    const int64_t scaled = ((rounded * quant_ptr[is_ac]) >> 16) + rounded;
    const uint32_t level =
        (uint32_t)((scaled * quant_shift_ptr[is_ac]) >> (16 - scale_bits));
    // Re-apply the sign of the input coefficient.
    qcoeff_ptr[pos] = (int)(level ^ sign) - sign;
    dqcoeff_ptr[pos] =
        qcoeff_ptr[pos] * dequant_ptr[is_ac] / (1 << scale_bits);
    if (level && iscan[pos] > max_scan_pos) max_scan_pos = iscan[pos];
  }
  *eob_ptr = max_scan_pos + 1;
}
207