/*
* Copyright(c) 2018 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
5
6 #include "EbDefinitions.h"
7 #include <emmintrin.h>
8 #include "smmintrin.h"
9
10 #define SAO_EO_TYPES 4
11 #define SAO_EO_CATEGORIES 4
12 #define SAO_BO_INTERVALS 32
13
14
15 static EB_S16/*EB_U16*/ maskTable[8][8] =
16 {
17 { -1, 0, 0, 0, 0, 0, 0, 0 },
18 { -1, -1, 0, 0, 0, 0, 0, 0 },
19 { -1, -1, -1, 0, 0, 0, 0, 0 },
20 { -1, -1, -1, -1, 0, 0, 0, 0 },
21 { -1, -1, -1, -1, -1, 0, 0, 0 },
22 { -1, -1, -1, -1, -1, -1, 0, 0 },
23 { -1, -1, -1, -1, -1, -1, -1, 0 },
24 { -1, -1, -1, -1, -1, -1, -1, -1 }
25 };
26
27
/*
 * Accumulates edge-offset statistics for 16 samples along one direction.
 *
 * The two neighbours of every sample are read from (ptr + offset) and
 * (ptr - offset).  For each neighbour a sign is formed: -1 where the sample
 * is smaller than the neighbour, +1 where it is larger, 0 where equal.  The
 * sum of the two signs (-2, -1, +1, +2) selects the category slot 0..3;
 * a sum of 0 is not counted.
 *
 *  eoDiff   four accumulators; slot k receives, in its two 64-bit halves,
 *           the byte sums of `diff` over the samples that fell in category k
 *  eoCount  four accumulators of sixteen 8-bit counters; a counter is
 *           incremented when the sample in that column falls in category k
 *  ptr      address of the 16 centre samples (unaligned loads are used)
 *  offset   distance in bytes from the centre sample to each neighbour
 *  x0       the 16 centre samples, already XORed with 0x80 so that the
 *           signed byte compares order them like unsigned values (the
 *           callers in this file pass them in that form)
 *  diff     per-sample value to accumulate, treated as unsigned bytes by
 *           the sum-of-absolute-differences instruction
 *  mask     byte mask; columns whose mask byte is zero get sign sum 0 and
 *           therefore contribute to no category
 */
static void countEdge(__m128i *eoDiff, __m128i *eoCount, EB_BYTE ptr, EB_S32 offset, __m128i x0, __m128i diff, __m128i mask)
{
    // Sign sum matched by category slot 0, 1, 2, 3 respectively
    static const signed char signSum[4] = { -2, -1, 1, 2 };

    const __m128i bias = _mm_set1_epi8(-128);
    const __m128i zero = _mm_setzero_si128();
    __m128i nbrA, nbrB;
    __m128i signA, signB;
    __m128i cat;
    int k;

    // Bring both neighbours into the same biased domain as x0
    nbrA = _mm_xor_si128(_mm_loadu_si128((__m128i *)(ptr + offset)), bias);
    nbrB = _mm_xor_si128(_mm_loadu_si128((__m128i *)(ptr - offset)), bias);

    // Compare results are 0 / -1, so (lt - gt) is -1, 0 or +1 per byte
    signA = _mm_sub_epi8(_mm_cmplt_epi8(x0, nbrA), _mm_cmpgt_epi8(x0, nbrA));
    signB = _mm_sub_epi8(_mm_cmplt_epi8(x0, nbrB), _mm_cmpgt_epi8(x0, nbrB));
    cat = _mm_and_si128(_mm_add_epi8(signA, signB), mask);

    for (k = 0; k < 4; k++)
    {
        // 0xFF in every column that belongs to category k
        const __m128i hit = _mm_cmpeq_epi8(cat, _mm_set1_epi8(signSum[k]));

        // Subtracting -1 increments the 8-bit counter of that column
        eoCount[k] = _mm_sub_epi8(eoCount[k], hit);

        // SAD against zero sums the selected bytes of each 8-byte half
        eoDiff[k] = _mm_add_epi64(eoDiff[k], _mm_sad_epu8(_mm_and_si128(diff, hit), zero));
    }
}
60
61
/*
 * Adds up to two samples to the band-offset statistics.
 *
 * cat01 and diff01 each pack two per-sample bytes: bits 0..7 belong to the
 * first sample and bits 8..15 to the second.  The byte taken from cat01 is
 * used as the band index, the byte taken from diff01 is interpreted as a
 * signed 8-bit difference.
 *
 *  boDiff       per-band sum of differences, indexed by band
 *  boCount      per-band sample count, indexed by band
 *  cat01        two packed band indices
 *  diff01       two packed signed differences
 *  validSample  two flags; sample n is accumulated only if validSample[n]
 *               is non-zero
 */
static void countBand(EB_S32 *boDiff, EB_U16 *boCount, EB_S32 cat01, EB_S32 diff01, EB_U8 *validSample)
{
    const EB_U8 band[2]  = { (EB_U8)cat01,  (EB_U8)(cat01 >> 8)  };
    const EB_S8 delta[2] = { (EB_S8)diff01, (EB_S8)(diff01 >> 8) };
    int n;

    for (n = 0; n < 2; n++)
    {
        if (!validSample[n])
        {
            continue;
        }

        boCount[band[n]]++;
        boDiff[band[n]] += delta[n];
    }
}
81
/*
 * Fills the validity flags for the next pair of samples.
 *
 *  colCountDiv2 > 0   every flag in the buffer is set to 1
 *  colCountDiv2 == 0  validSample[0] is 1 only if colCount is odd,
 *                     validSample[1] is 0
 *  colCountDiv2 < 0   every flag in the buffer is cleared
 *
 *  validSample   flag buffer to fill; nothing is done when it is NULL
 *  size          number of bytes of validSample to fill; the == 0 case
 *                writes entries 0 and 1 regardless of this value
 *  colCount      column count whose lowest bit decides the odd-sample case
 *  colCountDiv2  signed pair counter selecting one of the three cases above
 */
static void updateValidSamples(EB_U8 *validSample, EB_U8 size, EB_S32 colCount, EB_S32 colCountDiv2)
{
    if (!validSample)
    {
        return;
    }

    if (colCountDiv2 < 0)
    {
        EB_MEMSET(validSample, 0, size);
        return;
    }

    EB_MEMSET(validSample, 1, size);

    if (colCountDiv2 == 0)
    {
        // Only a trailing odd sample remains in this pair
        validSample[0] = (EB_U8)(colCount & 1);
        validSample[1] = 0;
    }
}
96
GatherSaoStatisticsLcu_BT_SSE2(EB_U8 * inputSamplePtr,EB_U32 inputStride,EB_U8 * reconSamplePtr,EB_U32 reconStride,EB_U32 lcuWidth,EB_U32 lcuHeight,EB_S32 * boDiff,EB_U16 * boCount,EB_S32 eoDiff[SAO_EO_TYPES][SAO_EO_CATEGORIES+1],EB_U16 eoCount[SAO_EO_TYPES][SAO_EO_CATEGORIES+1])97 EB_EXTERN EB_ERRORTYPE GatherSaoStatisticsLcu_BT_SSE2(
98 EB_U8 *inputSamplePtr, // input parameter, source Picture Ptr
99 EB_U32 inputStride, // input parameter, source stride
100 EB_U8 *reconSamplePtr, // input parameter, deblocked Picture Ptr
101 EB_U32 reconStride, // input parameter, deblocked stride
102 EB_U32 lcuWidth, // input parameter, LCU width
103 EB_U32 lcuHeight, // input parameter, LCU height
104 EB_S32 *boDiff, // output parameter, used to store Band Offset diff, boDiff[SAO_BO_INTERVALS]
105 EB_U16 *boCount, // output parameter, used to store Band Offset count, boCount[SAO_BO_INTERVALS]
106 EB_S32 eoDiff[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1], // output parameter, used to store Edge Offset diff, eoDiff[SAO_EO_TYPES] [SAO_EO_CATEGORIES]
107 EB_U16 eoCount[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1]) // output parameter, used to store Edge Offset count, eoCount[SAO_EO_TYPES] [SAO_EO_CATEGORIES]
108 {
109 EB_S32 colCount, colCountDiv2, rowCount;
110 #define EPIL16EXACTBYTES 2
111 EB_U8 validSample[EPIL16EXACTBYTES];
112 EB_U8 resetSize = sizeof(validSample[0]) * EPIL16EXACTBYTES;
113 EB_S32 i, j;
114
115 __m128i eoDiffX[SAO_EO_TYPES][SAO_EO_CATEGORIES];
116 __m128i eoCountX[SAO_EO_TYPES][SAO_EO_CATEGORIES];
117
118 lcuWidth -= 2;
119 lcuHeight -= 2;
120 inputSamplePtr += inputStride + 1;
121 reconSamplePtr += reconStride + 1;
122
123 colCount = lcuWidth;
124
125 for (i = 0; i < SAO_BO_INTERVALS / 8; i++)
126 {
127 _mm_storeu_si128((__m128i *)(boCount + 8 * i), _mm_setzero_si128());
128 }
129 for (i = 0; i < SAO_BO_INTERVALS / 4; i++)
130 {
131 _mm_storeu_si128((__m128i *)(boDiff + 4 * i), _mm_setzero_si128());
132 }
133
134 for (i = 0; i < SAO_EO_TYPES; i++)
135 {
136 for (j = 0; j < SAO_EO_CATEGORIES; j++)
137 {
138 eoDiffX[i][j] = _mm_setzero_si128();
139 eoCountX[i][j] = _mm_setzero_si128();
140 }
141 }
142
143 do
144 {
145 __m128i mask = _mm_setzero_si128();
146 EB_BYTE ptr = reconSamplePtr;
147 EB_BYTE qtr = inputSamplePtr;
148 EB_S32 idx;
149
150 rowCount = lcuHeight;
151
152 idx = (colCount >> 1) - 1;
153 if (idx > 7)
154 {
155 idx = 7;
156 }
157
158 if (idx >= 0)
159 mask = _mm_loadu_si128((__m128i *)maskTable[idx]);
160 do
161 {
162 __m128i x0, y0;
163 __m128i cat, diff;
164 x0 = _mm_loadu_si128((__m128i *)ptr);
165 y0 = _mm_loadu_si128((__m128i *)qtr);
166
167 // Band offset
168 cat = _mm_srli_epi16(_mm_and_si128(x0, _mm_set1_epi8(-8)), 3);
169 cat = _mm_and_si128(cat, mask);
170 x0 = _mm_xor_si128(x0, _mm_set1_epi8(-128));
171 y0 = _mm_xor_si128(y0, _mm_set1_epi8(-128));
172 diff = _mm_subs_epi8(y0, x0);
173 diff = _mm_and_si128(diff, mask);
174
175 // Conditionally add the valid samples when counting BO diffs and counts.
176 // Because when executing the right edge (colCount != 16) of a LCU, the redundant
177 // samples will be loaded for calculation with the valid ones, as the use SSE2
178 // intrinsics typically need to handle __m128i data type (128 bits, or 16 bytes).
179 //
180 // Note: the redundant samples wouldn't cause memory out of bound access, because
181 // the contents of the memory (page frame and page table are ready, although some
182 // bits the thread shouldn't touch) are loaded into the intrinsic temp variables,
183 // and later into XMM registers of CPU for calculation.
184 colCountDiv2 = colCount >> 1;
185
186 // Note: can not use variable as the 2nd argument of _mm_extract_epi16(), whose
187 // selector must be an integer constant in the range 0..7. So handle the bi-bytes
188 // one by one...
189 updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
190 colCountDiv2--;
191 countBand(boDiff, boCount, _mm_extract_epi16(cat, 0), _mm_extract_epi16(diff, 0), validSample);
192
193 updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
194 colCountDiv2--;
195 countBand(boDiff, boCount, _mm_extract_epi16(cat, 1), _mm_extract_epi16(diff, 1), validSample);
196
197 updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
198 colCountDiv2--;
199 countBand(boDiff, boCount, _mm_extract_epi16(cat, 2), _mm_extract_epi16(diff, 2), validSample);
200
201 updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
202 colCountDiv2--;
203 countBand(boDiff, boCount, _mm_extract_epi16(cat, 3), _mm_extract_epi16(diff, 3), validSample);
204
205 updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
206 colCountDiv2--;
207 countBand(boDiff, boCount, _mm_extract_epi16(cat, 4), _mm_extract_epi16(diff, 4), validSample);
208
209 updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
210 colCountDiv2--;
211 countBand(boDiff, boCount, _mm_extract_epi16(cat, 5), _mm_extract_epi16(diff, 5), validSample);
212
213 updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
214 colCountDiv2--;
215 countBand(boDiff, boCount, _mm_extract_epi16(cat, 6), _mm_extract_epi16(diff, 6), validSample);
216
217 updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
218 colCountDiv2--;
219 countBand(boDiff, boCount, _mm_extract_epi16(cat, 7), _mm_extract_epi16(diff, 7), validSample);
220
221 // Edge offset
222
223 // Add 128 to difference to make it an unsigned integer to allow use of _mm_sad_epu8 intrinsic
224 // This difference will be subtracted from the end result
225 diff = _mm_xor_si128(diff, _mm_set1_epi8(-128));
226
227 countEdge(eoDiffX[0], eoCountX[0], ptr, 1, x0, diff, mask);
228 countEdge(eoDiffX[1], eoCountX[1], ptr, reconStride, x0, diff, mask);
229 countEdge(eoDiffX[2], eoCountX[2], ptr, reconStride + 1, x0, diff, mask);
230 countEdge(eoDiffX[3], eoCountX[3], ptr, reconStride - 1, x0, diff, mask);
231
232 ptr += reconStride;
233 qtr += inputStride;
234 } while (--rowCount);
235
236 reconSamplePtr += 16;
237 inputSamplePtr += 16;
238
239 colCount -= 16;
240 } while (colCount > 0);
241
242 for (i = 0; i < SAO_EO_TYPES; i++)
243 {
244 for (j = 0; j < SAO_EO_CATEGORIES; j++)
245 {
246 __m128i x0;
247 EB_U32 *p;
248 EB_U16/*EB_U32*/ count;
249
250 // Note: accumulation of counts over 8 bits is ok since the maximum count is 62*4 = 248
251 x0 = _mm_sad_epu8(eoCountX[i][j], _mm_setzero_si128());
252 count = (EB_U16)(_mm_extract_epi32(x0, 0) + _mm_extract_epi32(x0, 2));
253 eoCount[i][j] = count;
254
255 // Note: subtracting 128 that was previously added in main loop
256 p = (EB_U32 *)&eoDiffX[i][j];
257 eoDiff[i][j] = p[0] + p[2] - 128 * count;
258 }
259 }
260
261 return EB_ErrorNone;
262 }
263
264
GatherSaoStatisticsLcu_OnlyEo_90_45_135_BT_SSE2(EB_U8 * inputSamplePtr,EB_U32 inputStride,EB_U8 * reconSamplePtr,EB_U32 reconStride,EB_U32 lcuWidth,EB_U32 lcuHeight,EB_S32 eoDiff[SAO_EO_TYPES][SAO_EO_CATEGORIES+1],EB_U16 eoCount[SAO_EO_TYPES][SAO_EO_CATEGORIES+1])265 EB_EXTERN EB_ERRORTYPE GatherSaoStatisticsLcu_OnlyEo_90_45_135_BT_SSE2(
266 EB_U8 *inputSamplePtr, // input parameter, source Picture Ptr
267 EB_U32 inputStride, // input parameter, source stride
268 EB_U8 *reconSamplePtr, // input parameter, deblocked Picture Ptr
269 EB_U32 reconStride, // input parameter, deblocked stride
270 EB_U32 lcuWidth, // input parameter, LCU width
271 EB_U32 lcuHeight, // input parameter, LCU height
272 EB_S32 eoDiff[SAO_EO_TYPES][SAO_EO_CATEGORIES+1], // output parameter, used to store Edge Offset diff, eoDiff[SAO_EO_TYPES] [SAO_EO_CATEGORIES]
273 EB_U16 eoCount[SAO_EO_TYPES][SAO_EO_CATEGORIES+1]) // output parameter, used to store Edge Offset count, eoCount[SAO_EO_TYPES] [SAO_EO_CATEGORIES]
274 {
275 EB_S32 colCount, rowCount;
276 EB_S32 i, j;
277
278 __m128i eoDiffX[SAO_EO_TYPES][SAO_EO_CATEGORIES];
279 __m128i eoCountX[SAO_EO_TYPES][SAO_EO_CATEGORIES];
280
281 lcuWidth -= 2;
282 lcuHeight -= 2;
283 inputSamplePtr += inputStride + 1;
284 reconSamplePtr += reconStride + 1;
285
286 colCount = lcuWidth;
287
288 for (i = 1; i < SAO_EO_TYPES; i++)
289 {
290 for (j = 0; j < SAO_EO_CATEGORIES; j++)
291 {
292 eoDiffX[i][j] = _mm_setzero_si128();
293 eoCountX[i][j] = _mm_setzero_si128();
294 }
295 }
296
297 do
298 {
299 __m128i mask;
300 EB_BYTE ptr = reconSamplePtr;
301 EB_BYTE qtr = inputSamplePtr;
302 EB_S32 idx;
303
304 rowCount = lcuHeight;
305
306 idx = (colCount >> 1) - 1;
307 mask = (idx >= 0 && idx < 8) ? _mm_loadu_si128((__m128i *)maskTable[idx]) : _mm_loadu_si128((__m128i *)maskTable[7]);
308 do
309 {
310 __m128i x0, y0;
311 __m128i diff;
312 x0 = _mm_loadu_si128((__m128i *)ptr);
313 y0 = _mm_loadu_si128((__m128i *)qtr);
314
315 x0 = _mm_xor_si128(x0, _mm_set1_epi8(-128));
316 y0 = _mm_xor_si128(y0, _mm_set1_epi8(-128));
317 diff = _mm_subs_epi8(y0, x0);
318 diff = _mm_and_si128(diff, mask);
319
320 // Edge offset
321
322 // Add 128 to difference to make it an unsigned integer to allow use of _mm_sad_epu8 intrinsic
323 // This difference will be subtracted from the end result
324 diff = _mm_xor_si128(diff, _mm_set1_epi8(-128));
325
326 countEdge(eoDiffX[1], eoCountX[1], ptr, reconStride, x0, diff, mask);
327 countEdge(eoDiffX[2], eoCountX[2], ptr, reconStride+1, x0, diff, mask);
328 countEdge(eoDiffX[3], eoCountX[3], ptr, reconStride-1, x0, diff, mask);
329
330 ptr += reconStride;
331 qtr += inputStride;
332 }
333 while (--rowCount);
334
335 reconSamplePtr += 16;
336 inputSamplePtr += 16;
337
338 colCount -= 16;
339 }
340 while (colCount > 0);
341
342 for (i = 1; i < SAO_EO_TYPES; i++)
343 {
344 for (j = 0; j < SAO_EO_CATEGORIES; j++)
345 {
346 __m128i x0;
347 EB_U32 *p;
348 EB_U16/*EB_U32*/ count;
349
350 // Note: accumulation of counts over 8 bits is ok since the maximum count is 62*4 = 248
351 x0 = _mm_sad_epu8(eoCountX[i][j], _mm_setzero_si128());
352 count =(EB_U16)(_mm_extract_epi32(x0, 0) + _mm_extract_epi32(x0, 2));
353 eoCount[i][j] = count;
354
355 // Note: subtracting 128 that was previously added in main loop
356 p = (EB_U32 *)&eoDiffX[i][j];
357 eoDiff[i][j] = p[0] + p[2] - 128 * count;
358 }
359 }
360
361 return EB_ErrorNone;
362 }
363