1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <emmintrin.h>
13 #include <xmmintrin.h>
14
15 #include "./vp9_rtcd.h"
16 #include "vpx/vpx_integer.h"
17 #include "vpx_dsp/vpx_dsp_common.h"
18 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
19
// Quantizes and dequantizes a block of transform coefficients, 16 at a time,
// treating every coefficient as a signed 16-bit lane.
//
//   coeff_ptr    input coefficients (read through load_tran_low()).
//   n_coeffs     number of coefficients. The first 16 are always processed;
//                further groups of 16 are processed while the group's start
//                index is below n_coeffs (presumably callers always pass a
//                multiple of 16 -- confirm against callers).
//   skip_block   must be 0 (asserted); otherwise unused.
//   round_ptr, quant_ptr, dequant_ptr
//                8 x int16_t each, read with an aligned 128-bit load. The
//                vector is used as loaded for coefficients 0..7; for all
//                later coefficients its upper four lanes are duplicated into
//                both halves.
//   qcoeff_ptr   output: sign(c) * (((|c| + round) * quant) >> 16), with a
//                saturating add and a signed high multiply.
//   dqcoeff_ptr  output: low 16 bits of qcoeff * dequant.
//   eob_ptr      output: largest (iscan[i] + 1) over positions whose
//                dequantized value is non-zero, or 0 if there are none.
//   scan         unused.
//   iscan        per-position scan indices, read with aligned 128-bit loads.
void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t *round_ptr,
                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                          uint16_t *eob_ptr, const int16_t *scan,
                          const int16_t *iscan) {
  const __m128i zero = _mm_setzero_si128();
  __m128i round, quant, dequant;
  __m128i thr;
  __m128i eob_max;
  intptr_t idx;

  (void)scan;
  (void)skip_block;
  assert(!skip_block);

  round = _mm_load_si128((const __m128i *)round_ptr);
  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);

  // First group: DC and first 15 AC. This group is never skipped.
  {
    const __m128i in0 = load_tran_low(coeff_ptr);
    const __m128i in1 = load_tran_low(coeff_ptr + 8);
    // All-ones in lanes holding a negative input, zero elsewhere.
    const __m128i sign0 = _mm_srai_epi16(in0, 15);
    const __m128i sign1 = _mm_srai_epi16(in1, 15);
    // |x| == (x ^ sign) - sign.
    __m128i q0 = _mm_sub_epi16(_mm_xor_si128(in0, sign0), sign0);
    __m128i q1 = _mm_sub_epi16(_mm_xor_si128(in1, sign1), sign1);
    __m128i dq0, dq1;
    __m128i nz0, nz1;
    __m128i pos0, pos1;

    // Lanes 0..7 use the tables as loaded; everything after that uses the
    // upper half of each table copied into both halves.
    q0 = _mm_mulhi_epi16(_mm_adds_epi16(q0, round), quant);
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    q1 = _mm_mulhi_epi16(_mm_adds_epi16(q1, round), quant);

    // Put the original signs back: (m ^ sign) - sign.
    q0 = _mm_sub_epi16(_mm_xor_si128(q0, sign0), sign0);
    q1 = _mm_sub_epi16(_mm_xor_si128(q1, sign1), sign1);
    store_tran_low(q0, qcoeff_ptr);
    store_tran_low(q1, qcoeff_ptr + 8);

    dq0 = _mm_mullo_epi16(q0, dequant);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    dq1 = _mm_mullo_epi16(q1, dequant);
    store_tran_low(dq0, dqcoeff_ptr);
    store_tran_low(dq1, dqcoeff_ptr + 8);

    // All-ones where the dequantized value is non-zero.
    nz0 = _mm_cmpeq_epi16(_mm_cmpeq_epi16(dq0, zero), zero);
    nz1 = _mm_cmpeq_epi16(_mm_cmpeq_epi16(dq1, zero), zero);
    pos0 = _mm_load_si128((const __m128i *)iscan);
    pos1 = _mm_load_si128((const __m128i *)(iscan + 8));
    // Subtracting the all-ones mask (-1) adds one in the non-zero lanes,
    // turning indices into counts; the AND then clears the zero lanes.
    pos0 = _mm_and_si128(_mm_sub_epi16(pos0, nz0), nz0);
    pos1 = _mm_and_si128(_mm_sub_epi16(pos1, nz1), nz1);
    eob_max = _mm_max_epi16(pos0, pos1);
  }

  // Threshold for the remaining groups: half of the broadcast dequant value.
  thr = _mm_srai_epi16(dequant, 1);

  // Remaining groups of 16 (AC only).
  for (idx = 16; idx < n_coeffs; idx += 16) {
    const __m128i in0 = load_tran_low(coeff_ptr + idx);
    const __m128i in1 = load_tran_low(coeff_ptr + idx + 8);
    const __m128i sign0 = _mm_srai_epi16(in0, 15);
    const __m128i sign1 = _mm_srai_epi16(in1, 15);
    const __m128i abs0 = _mm_sub_epi16(_mm_xor_si128(in0, sign0), sign0);
    const __m128i abs1 = _mm_sub_epi16(_mm_xor_si128(in1, sign1), sign1);
    // Non-zero if any of the 16 magnitudes is strictly above the threshold.
    const int above_thr = _mm_movemask_epi8(_mm_cmpgt_epi16(abs0, thr)) |
                          _mm_movemask_epi8(_mm_cmpgt_epi16(abs1, thr));
    __m128i q0, q1;
    __m128i dq0, dq1;
    __m128i nz0, nz1;
    __m128i pos0, pos1;

    if (!above_thr) {
      // Nothing exceeds the threshold: write zeros to all 16 outputs and
      // leave the running eob untouched.
      store_zero_tran_low(qcoeff_ptr + idx);
      store_zero_tran_low(qcoeff_ptr + idx + 8);
      store_zero_tran_low(dqcoeff_ptr + idx);
      store_zero_tran_low(dqcoeff_ptr + idx + 8);
      continue;
    }

    q0 = _mm_mulhi_epi16(_mm_adds_epi16(abs0, round), quant);
    q1 = _mm_mulhi_epi16(_mm_adds_epi16(abs1, round), quant);

    // Put the original signs back.
    q0 = _mm_sub_epi16(_mm_xor_si128(q0, sign0), sign0);
    q1 = _mm_sub_epi16(_mm_xor_si128(q1, sign1), sign1);
    store_tran_low(q0, qcoeff_ptr + idx);
    store_tran_low(q1, qcoeff_ptr + idx + 8);

    dq0 = _mm_mullo_epi16(q0, dequant);
    dq1 = _mm_mullo_epi16(q1, dequant);
    store_tran_low(dq0, dqcoeff_ptr + idx);
    store_tran_low(dq1, dqcoeff_ptr + idx + 8);

    // Same eob candidate computation as for the first group.
    nz0 = _mm_cmpeq_epi16(_mm_cmpeq_epi16(dq0, zero), zero);
    nz1 = _mm_cmpeq_epi16(_mm_cmpeq_epi16(dq1, zero), zero);
    pos0 = _mm_load_si128((const __m128i *)(iscan + idx));
    pos1 = _mm_load_si128((const __m128i *)(iscan + idx + 8));
    pos0 = _mm_and_si128(_mm_sub_epi16(pos0, nz0), nz0);
    pos1 = _mm_and_si128(_mm_sub_epi16(pos1, nz1), nz1);
    eob_max = _mm_max_epi16(eob_max, _mm_max_epi16(pos0, pos1));
  }

  // Horizontal max over the 8 lanes: fold the upper 64 bits onto the lower,
  // then words 2..3 onto words 0..1, then swap words 0 and 1. Afterwards
  // lanes 0 and 1 both hold the overall maximum.
  eob_max = _mm_max_epi16(eob_max, _mm_shuffle_epi32(eob_max, 0xe));
  eob_max = _mm_max_epi16(eob_max, _mm_shufflelo_epi16(eob_max, 0xe));
  eob_max = _mm_max_epi16(eob_max, _mm_shufflelo_epi16(eob_max, 0x1));
  *eob_ptr = _mm_extract_epi16(eob_max, 1);
}
203