1 /*
2 * Copyright(c) 2018 Intel Corporation
3 * SPDX - License - Identifier: BSD - 2 - Clause - Patent
4 */
5 
6 #include "EbDefinitions.h"
7 #include <emmintrin.h>
8 #include "smmintrin.h"
9 
/* Number of edge-offset directions tracked per LCU; the kernels in this file
 * index them as 0 = horizontal neighbours (offset 1), 1 = vertical (stride),
 * 2 and 3 = the two diagonals (stride + 1, stride - 1). */
#define SAO_EO_TYPES 4
/* Number of edge categories accumulated per direction (see countEdge). */
#define SAO_EO_CATEGORIES 4
/* Number of band-offset bands; the band index is the 8-bit sample value >> 3. */
#define SAO_BO_INTERVALS 32
13 
14 
/*
 * Column masks for a partially filled 16-byte vector.
 * Row r enables the first (r + 1) 16-bit lanes, i.e. the first 2 * (r + 1)
 * 8-bit samples, and clears the rest. Each row is loaded as one __m128i, so
 * the granularity of the mask is a pair of samples, not a single sample.
 */
static EB_S16 maskTable[8][8] =
{
  { -1, 0, 0, 0, 0, 0, 0, 0 },
  { -1, -1, 0, 0, 0, 0, 0, 0 },
  { -1, -1, -1, 0, 0, 0, 0, 0 },
  { -1, -1, -1, -1, 0, 0, 0, 0 },
  { -1, -1, -1, -1, -1, 0, 0, 0 },
  { -1, -1, -1, -1, -1, -1, 0, 0 },
  { -1, -1, -1, -1, -1, -1, -1, 0 },
  { -1, -1, -1, -1, -1, -1, -1, -1 }
};
26 
27 
/*
 * Accumulates edge-offset statistics for 16 samples along one direction.
 *
 * eoDiff  : four accumulators (one per category); each 64-bit lane receives
 *           the byte sum of `diff` over the samples that fall in the category.
 * eoCount : four accumulators (one per category); each 8-bit lane is
 *           incremented when the sample in that lane falls in the category.
 * ptr     : address of the current 16 reconstructed samples.
 * offset  : distance in bytes to the "next" neighbour; the "previous"
 *           neighbour is read at ptr - offset.
 * x0      : the 16 current samples, already XOR-ed with 0x80 by the caller so
 *           that signed byte compares order them like unsigned values.
 * diff    : per-sample difference, biased by the caller so it is unsigned.
 * mask    : lanes set to zero are forced to category 0 and never counted.
 *
 * The per-sample category is sign(x0 - next) + sign(x0 - prev):
 *   -2 -> slot 0 (below both neighbours), -1 -> slot 1,
 *   +1 -> slot 2,                         +2 -> slot 3 (above both).
 */
static void countEdge(__m128i *eoDiff, __m128i *eoCount, EB_BYTE ptr, EB_S32 offset, __m128i x0, __m128i diff, __m128i mask)
{
    static const signed char categoryValue[4] = { -2, -1, 1, 2 };
    const __m128i signFlip = _mm_set1_epi8(-128);
    const __m128i zero = _mm_setzero_si128();
    __m128i next, prev;
    __m128i signNext, signPrev;
    __m128i category;
    int slot;

    /* Fetch both neighbours and move them into the signed-compare domain. */
    next = _mm_xor_si128(_mm_loadu_si128((__m128i *)(ptr + offset)), signFlip);
    prev = _mm_xor_si128(_mm_loadu_si128((__m128i *)(ptr - offset)), signFlip);

    /* (x0 < n ? -1 : 0) - (x0 > n ? -1 : 0) yields -1, 0 or +1 per byte. */
    signNext = _mm_sub_epi8(_mm_cmplt_epi8(x0, next), _mm_cmpgt_epi8(x0, next));
    signPrev = _mm_sub_epi8(_mm_cmplt_epi8(x0, prev), _mm_cmpgt_epi8(x0, prev));
    category = _mm_and_si128(_mm_add_epi8(signNext, signPrev), mask);

    for (slot = 0; slot < 4; slot++)
    {
        /* 0xFF in every lane whose category matches this slot. */
        const __m128i hit = _mm_cmpeq_epi8(category, _mm_set1_epi8(categoryValue[slot]));

        /* Subtracting -1 adds one to the 8-bit lane counter. */
        eoCount[slot] = _mm_sub_epi8(eoCount[slot], hit);
        /* SAD against zero sums the selected (unsigned) diff bytes. */
        eoDiff[slot] = _mm_add_epi64(eoDiff[slot], _mm_sad_epu8(_mm_and_si128(diff, hit), zero));
    }
}
60 
61 
/*
 * Adds up to two samples to the band-offset statistics.
 *
 * cat01 / diff01 each pack two 8-bit values: bits 0..7 belong to the first
 * sample and bits 8..15 to the second. The band value indexes boCount/boDiff;
 * the diff byte is interpreted as signed. A sample is accumulated only when
 * the matching entry of validSample (index 0 or 1) is non-zero.
 */
static void countBand(EB_S32 *boDiff, EB_U16 *boCount, EB_S32 cat01, EB_S32 diff01, EB_U8 *validSample)
{
    EB_S32 sample;

    for (sample = 0; sample < 2; sample++)
    {
        const EB_S32 shift = 8 * sample;
        const EB_U8 band = (EB_U8)(cat01 >> shift);
        const EB_S8 delta = (EB_S8)(diff01 >> shift);

        if (!validSample[sample])
            continue;

        boCount[band]++;
        boDiff[band] += delta;
    }
}
81 
/*
 * Sets the validity flags for one pair of samples.
 *
 * validSample  : flag array to fill; nothing happens when it is null.
 * size         : number of flag bytes to reset.
 * colCount     : remaining column count; only its parity is used here.
 * colCountDiv2 : number of complete sample pairs still remaining, counted
 *                from the pair being processed.
 *
 * Result:
 *   colCountDiv2 > 0  -> all flags set to 1 (both samples valid);
 *   colCountDiv2 == 0 -> flag 0 is 1 only when colCount is odd, flag 1 is 0;
 *   colCountDiv2 < 0  -> all flags cleared.
 */
static void updateValidSamples(EB_U8 *validSample, EB_U8 size, EB_S32 colCount, EB_S32 colCountDiv2)
{
    if (!validSample)
        return;

    if (colCountDiv2 > 0)
    {
        EB_MEMSET(validSample, 1, size);
    }
    else if (colCountDiv2 < 0)
    {
        EB_MEMSET(validSample, 0, size);
    }
    else
    {
        /* Boundary pair: only a single trailing sample may remain. */
        EB_MEMSET(validSample, 1, size);
        validSample[0] = (colCount & 1) ? 1 : 0;
        validSample[1] = 0;
    }
}
96 
GatherSaoStatisticsLcu_BT_SSE2(EB_U8 * inputSamplePtr,EB_U32 inputStride,EB_U8 * reconSamplePtr,EB_U32 reconStride,EB_U32 lcuWidth,EB_U32 lcuHeight,EB_S32 * boDiff,EB_U16 * boCount,EB_S32 eoDiff[SAO_EO_TYPES][SAO_EO_CATEGORIES+1],EB_U16 eoCount[SAO_EO_TYPES][SAO_EO_CATEGORIES+1])97 EB_EXTERN EB_ERRORTYPE GatherSaoStatisticsLcu_BT_SSE2(
98     EB_U8                   *inputSamplePtr,        // input parameter, source Picture Ptr
99     EB_U32                   inputStride,           // input parameter, source stride
100     EB_U8                   *reconSamplePtr,        // input parameter, deblocked Picture Ptr
101     EB_U32                   reconStride,           // input parameter, deblocked stride
102     EB_U32                   lcuWidth,              // input parameter, LCU width
103     EB_U32                   lcuHeight,             // input parameter, LCU height
104     EB_S32                  *boDiff,                // output parameter, used to store Band Offset diff, boDiff[SAO_BO_INTERVALS]
105     EB_U16                  *boCount,										// output parameter, used to store Band Offset count, boCount[SAO_BO_INTERVALS]
106     EB_S32                   eoDiff[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1],     // output parameter, used to store Edge Offset diff, eoDiff[SAO_EO_TYPES] [SAO_EO_CATEGORIES]
107     EB_U16                   eoCount[SAO_EO_TYPES][SAO_EO_CATEGORIES + 1])    // output parameter, used to store Edge Offset count, eoCount[SAO_EO_TYPES] [SAO_EO_CATEGORIES]
108 {
109     EB_S32 colCount, colCountDiv2, rowCount;
110 #define EPIL16EXACTBYTES 2
111     EB_U8 validSample[EPIL16EXACTBYTES];
112     EB_U8 resetSize = sizeof(validSample[0]) * EPIL16EXACTBYTES;
113     EB_S32 i, j;
114 
115     __m128i eoDiffX[SAO_EO_TYPES][SAO_EO_CATEGORIES];
116     __m128i eoCountX[SAO_EO_TYPES][SAO_EO_CATEGORIES];
117 
118     lcuWidth -= 2;
119     lcuHeight -= 2;
120     inputSamplePtr += inputStride + 1;
121     reconSamplePtr += reconStride + 1;
122 
123     colCount = lcuWidth;
124 
125     for (i = 0; i < SAO_BO_INTERVALS / 8; i++)
126     {
127         _mm_storeu_si128((__m128i *)(boCount + 8 * i), _mm_setzero_si128());
128     }
129     for (i = 0; i < SAO_BO_INTERVALS / 4; i++)
130     {
131         _mm_storeu_si128((__m128i *)(boDiff + 4 * i), _mm_setzero_si128());
132     }
133 
134     for (i = 0; i < SAO_EO_TYPES; i++)
135     {
136         for (j = 0; j < SAO_EO_CATEGORIES; j++)
137         {
138             eoDiffX[i][j] = _mm_setzero_si128();
139             eoCountX[i][j] = _mm_setzero_si128();
140         }
141     }
142 
143     do
144     {
145         __m128i mask = _mm_setzero_si128();
146         EB_BYTE ptr = reconSamplePtr;
147         EB_BYTE qtr = inputSamplePtr;
148         EB_S32 idx;
149 
150         rowCount = lcuHeight;
151 
152         idx = (colCount >> 1) - 1;
153         if (idx > 7)
154         {
155             idx = 7;
156         }
157 
158         if (idx >= 0)
159             mask = _mm_loadu_si128((__m128i *)maskTable[idx]);
160         do
161         {
162             __m128i x0, y0;
163             __m128i cat, diff;
164             x0 = _mm_loadu_si128((__m128i *)ptr);
165             y0 = _mm_loadu_si128((__m128i *)qtr);
166 
167             // Band offset
168             cat = _mm_srli_epi16(_mm_and_si128(x0, _mm_set1_epi8(-8)), 3);
169             cat = _mm_and_si128(cat, mask);
170             x0 = _mm_xor_si128(x0, _mm_set1_epi8(-128));
171             y0 = _mm_xor_si128(y0, _mm_set1_epi8(-128));
172             diff = _mm_subs_epi8(y0, x0);
173             diff = _mm_and_si128(diff, mask);
174 
175             // Conditionally add the valid samples when counting BO diffs and counts.
176             // Because when executing the right edge (colCount != 16) of a LCU, the redundant
177             // samples will be loaded for calculation with the valid ones, as the use SSE2
178             // intrinsics typically need to handle __m128i data type (128 bits, or 16 bytes).
179             //
180             // Note: the redundant samples wouldn't cause memory out of bound access, because
181             // the contents of the memory (page frame and page table are ready, although some
182             // bits the thread shouldn't touch) are loaded into the intrinsic temp variables,
183             // and later into XMM registers of CPU for calculation.
184             colCountDiv2 = colCount >> 1;
185 
186             // Note: can not use variable as the 2nd argument of _mm_extract_epi16(), whose
187             // selector must be an integer constant in the range 0..7. So handle the bi-bytes
188             // one by one...
189             updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
190             colCountDiv2--;
191             countBand(boDiff, boCount, _mm_extract_epi16(cat, 0), _mm_extract_epi16(diff, 0), validSample);
192 
193             updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
194             colCountDiv2--;
195             countBand(boDiff, boCount, _mm_extract_epi16(cat, 1), _mm_extract_epi16(diff, 1), validSample);
196 
197             updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
198             colCountDiv2--;
199             countBand(boDiff, boCount, _mm_extract_epi16(cat, 2), _mm_extract_epi16(diff, 2), validSample);
200 
201             updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
202             colCountDiv2--;
203             countBand(boDiff, boCount, _mm_extract_epi16(cat, 3), _mm_extract_epi16(diff, 3), validSample);
204 
205             updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
206             colCountDiv2--;
207             countBand(boDiff, boCount, _mm_extract_epi16(cat, 4), _mm_extract_epi16(diff, 4), validSample);
208 
209             updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
210             colCountDiv2--;
211             countBand(boDiff, boCount, _mm_extract_epi16(cat, 5), _mm_extract_epi16(diff, 5), validSample);
212 
213             updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
214             colCountDiv2--;
215             countBand(boDiff, boCount, _mm_extract_epi16(cat, 6), _mm_extract_epi16(diff, 6), validSample);
216 
217             updateValidSamples(validSample, resetSize, colCount, colCountDiv2);
218             colCountDiv2--;
219             countBand(boDiff, boCount, _mm_extract_epi16(cat, 7), _mm_extract_epi16(diff, 7), validSample);
220 
221             // Edge offset
222 
223             // Add 128 to difference to make it an unsigned integer to allow use of _mm_sad_epu8 intrinsic
224             // This difference will be subtracted from the end result
225             diff = _mm_xor_si128(diff, _mm_set1_epi8(-128));
226 
227             countEdge(eoDiffX[0], eoCountX[0], ptr, 1, x0, diff, mask);
228             countEdge(eoDiffX[1], eoCountX[1], ptr, reconStride, x0, diff, mask);
229             countEdge(eoDiffX[2], eoCountX[2], ptr, reconStride + 1, x0, diff, mask);
230             countEdge(eoDiffX[3], eoCountX[3], ptr, reconStride - 1, x0, diff, mask);
231 
232             ptr += reconStride;
233             qtr += inputStride;
234         } while (--rowCount);
235 
236         reconSamplePtr += 16;
237         inputSamplePtr += 16;
238 
239         colCount -= 16;
240     } while (colCount > 0);
241 
242     for (i = 0; i < SAO_EO_TYPES; i++)
243     {
244         for (j = 0; j < SAO_EO_CATEGORIES; j++)
245         {
246             __m128i x0;
247             EB_U32 *p;
248             EB_U16/*EB_U32*/ count;
249 
250             // Note: accumulation of counts over 8 bits is ok since the maximum count is 62*4 = 248
251             x0 = _mm_sad_epu8(eoCountX[i][j], _mm_setzero_si128());
252             count = (EB_U16)(_mm_extract_epi32(x0, 0) + _mm_extract_epi32(x0, 2));
253             eoCount[i][j] = count;
254 
255             // Note: subtracting 128 that was previously added in main loop
256             p = (EB_U32 *)&eoDiffX[i][j];
257             eoDiff[i][j] = p[0] + p[2] - 128 * count;
258         }
259     }
260 
261     return EB_ErrorNone;
262 }
263 
264 
GatherSaoStatisticsLcu_OnlyEo_90_45_135_BT_SSE2(EB_U8 * inputSamplePtr,EB_U32 inputStride,EB_U8 * reconSamplePtr,EB_U32 reconStride,EB_U32 lcuWidth,EB_U32 lcuHeight,EB_S32 eoDiff[SAO_EO_TYPES][SAO_EO_CATEGORIES+1],EB_U16 eoCount[SAO_EO_TYPES][SAO_EO_CATEGORIES+1])265 EB_EXTERN EB_ERRORTYPE GatherSaoStatisticsLcu_OnlyEo_90_45_135_BT_SSE2(
266 	EB_U8                   *inputSamplePtr,        // input parameter, source Picture Ptr
267 	EB_U32                   inputStride,           // input parameter, source stride
268 	EB_U8                   *reconSamplePtr,        // input parameter, deblocked Picture Ptr
269 	EB_U32                   reconStride,           // input parameter, deblocked stride
270 	EB_U32                   lcuWidth,              // input parameter, LCU width
271 	EB_U32                   lcuHeight,             // input parameter, LCU height
272 	EB_S32                   eoDiff[SAO_EO_TYPES][SAO_EO_CATEGORIES+1],     // output parameter, used to store Edge Offset diff, eoDiff[SAO_EO_TYPES] [SAO_EO_CATEGORIES]
273 	EB_U16                   eoCount[SAO_EO_TYPES][SAO_EO_CATEGORIES+1])    // output parameter, used to store Edge Offset count, eoCount[SAO_EO_TYPES] [SAO_EO_CATEGORIES]
274 {
275   EB_S32 colCount, rowCount;
276   EB_S32 i, j;
277 
278   __m128i eoDiffX[SAO_EO_TYPES][SAO_EO_CATEGORIES];
279   __m128i eoCountX[SAO_EO_TYPES][SAO_EO_CATEGORIES];
280 
281   lcuWidth -= 2;
282   lcuHeight -= 2;
283   inputSamplePtr += inputStride + 1;
284   reconSamplePtr += reconStride + 1;
285 
286   colCount = lcuWidth;
287 
288   for (i = 1; i < SAO_EO_TYPES; i++)
289   {
290     for (j = 0; j < SAO_EO_CATEGORIES; j++)
291     {
292       eoDiffX[i][j] = _mm_setzero_si128();
293       eoCountX[i][j] = _mm_setzero_si128();
294     }
295   }
296 
297   do
298   {
299     __m128i mask;
300     EB_BYTE ptr = reconSamplePtr;
301     EB_BYTE qtr = inputSamplePtr;
302     EB_S32 idx;
303 
304     rowCount = lcuHeight;
305 
306     idx = (colCount >> 1) - 1;
307 	mask = (idx >= 0 && idx < 8) ? _mm_loadu_si128((__m128i *)maskTable[idx]) : _mm_loadu_si128((__m128i *)maskTable[7]);
308     do
309     {
310       __m128i x0, y0;
311       __m128i diff;
312       x0 = _mm_loadu_si128((__m128i *)ptr);
313       y0 = _mm_loadu_si128((__m128i *)qtr);
314 
315       x0 = _mm_xor_si128(x0, _mm_set1_epi8(-128));
316       y0 = _mm_xor_si128(y0, _mm_set1_epi8(-128));
317       diff = _mm_subs_epi8(y0, x0);
318       diff = _mm_and_si128(diff, mask);
319 
320       // Edge offset
321 
322       // Add 128 to difference to make it an unsigned integer to allow use of _mm_sad_epu8 intrinsic
323       // This difference will be subtracted from the end result
324       diff = _mm_xor_si128(diff, _mm_set1_epi8(-128));
325 
326       countEdge(eoDiffX[1], eoCountX[1], ptr, reconStride, x0, diff, mask);
327       countEdge(eoDiffX[2], eoCountX[2], ptr, reconStride+1, x0, diff, mask);
328       countEdge(eoDiffX[3], eoCountX[3], ptr, reconStride-1, x0, diff, mask);
329 
330       ptr += reconStride;
331       qtr += inputStride;
332     }
333     while (--rowCount);
334 
335     reconSamplePtr += 16;
336     inputSamplePtr += 16;
337 
338     colCount -= 16;
339   }
340   while (colCount > 0);
341 
342   for (i = 1; i < SAO_EO_TYPES; i++)
343   {
344     for (j = 0; j < SAO_EO_CATEGORIES; j++)
345     {
346       __m128i x0;
347       EB_U32 *p;
348       EB_U16/*EB_U32*/ count;
349 
350       // Note: accumulation of counts over 8 bits is ok since the maximum count is 62*4 = 248
351       x0 = _mm_sad_epu8(eoCountX[i][j], _mm_setzero_si128());
352       count =(EB_U16)(_mm_extract_epi32(x0, 0) + _mm_extract_epi32(x0, 2));
353       eoCount[i][j] = count;
354 
355       // Note: subtracting 128 that was previously added in main loop
356       p = (EB_U32 *)&eoDiffX[i][j];
357       eoDiff[i][j] = p[0] + p[2] - 128 * count;
358     }
359   }
360 
361   return EB_ErrorNone;
362 }
363