1 /*
2 * Copyright(c) 2018 Intel Corporation
 * SPDX-License-Identifier: BSD-2-Clause-Patent
4 */
5 
6 #include "EbAvcStyleMcp_SSSE3.h"
7 
8 #include "EbDefinitions.h"
9 
10 #include "emmintrin.h"
11 #include "tmmintrin.h"
12 
13 
// 4-tap AVC-style luma interpolation coefficients. Taps sum to 32, so the
// filtered value is (c0*p[-1] + c1*p[0] + c2*p[1] + c3*p[2] + 16) >> 5.
// The table is indexed by (fracPos << 5) - 32; each fracPos owns two
// 16-byte rows:
//   fracPos 1 (rows 0-1): taps {-1, 25,  9, -1}
//   fracPos 2 (rows 2-3): taps {-2, 18, 18, -2}
//   fracPos 3 (rows 4-5): taps {-1,  9, 25, -1}
// Within each row the two taps are interleaved (c0,c1,c0,c1,...) so a row
// can be fed to _mm_maddubs_epi16() against byte-interleaved pixel pairs
// produced by _mm_unpacklo/hi_epi8().
EB_EXTERN EB_ALIGN(16) const EB_S8 EbHevcAvcStyleLumaIFCoeff8_SSSE3[]= {
    -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25,
     9, -1,  9, -1,  9, -1,  9, -1,  9, -1,  9, -1,  9, -1,  9, -1,
    -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18,
    18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2,
    -1,  9, -1,  9, -1,  9, -1,  9, -1,  9, -1,  9, -1,  9, -1,  9,
    25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1
};
22 
23 
// Forward declaration of the plain 2-D copy kernel (defined elsewhere in
// this translation unit / module): copies an areaWidth x areaHeight region
// from src to dst using the given strides and bytesPerSample.
void PictureCopyKernel_SSSE3(
	EB_BYTE                  src,
	EB_U32                   srcStride,
	EB_BYTE                  dst,
	EB_U32                   dstStride,
	EB_U32                   areaWidth,
	EB_U32                   areaHeight,
	EB_U32                   bytesPerSample);
32 
/*
 * Horizontal AVC-style 4-tap luma interpolation (SSSE3).
 *
 * For each output pixel:
 *     dst[x] = clip_u8((c0*s[x-1] + c1*s[x] + c2*s[x+1] + c3*s[x+2] + 16) >> 5)
 * where the tap set {c0..c3} is selected by fracPos from
 * EbHevcAvcStyleLumaIFCoeff8_SSSE3.
 *
 * refPic    - top-left source pixel of the block (not pre-shifted).
 * srcStride - source stride in bytes.
 * dst       - destination pointer.
 * dstStride - destination stride in bytes.
 * puWidth   - block width; 16x path when a multiple of 16, otherwise the
 *             8-wide path is used (width is assumed to be a multiple of 8).
 * puHeight  - block height in rows.
 * tempBuf   - unused by the pure horizontal pass.
 * fracPos   - horizontal fractional position, expected in {1, 2, 3}.
 *
 * NOTE(review): the unaligned loads read s[x-1] .. s[x+2], i.e. 1 byte left
 * and 2 bytes right of the block; this assumes the picture buffer is padded
 * accordingly -- confirm at the call sites.
 */
void AvcStyleLumaInterpolationFilterHorizontal_SSSE3_INTRIN(
    EB_BYTE refPic,
    EB_U32 srcStride,
    EB_BYTE dst,
    EB_U32 dstStride,
    EB_U32 puWidth,
    EB_U32 puHeight,
    EB_BYTE tempBuf,
    EB_U32 fracPos)
{
    /* Declarations first: keeps the function valid under C90 and silences
     * -Wdeclaration-after-statement (the original cast tempBuf to void
     * before these declarations). */
    __m128i IFOffset, IFCoeff_1_0, IFCoeff_3_2, sum_clip_U8;
    EB_U32 width_cnt, height_cnt;
    const EB_U32 IFShift = 5; /* taps sum to 32 -> >> 5 normalization */

    (void)tempBuf; /* no intermediate buffer needed for horizontal-only */

    /* Each fracPos owns two consecutive 16-byte coefficient rows. */
    fracPos <<= 5;
    IFOffset = _mm_set1_epi16(0x0010); /* rounding offset = 32 / 2 */
    IFCoeff_1_0 = _mm_load_si128((__m128i *)(EbHevcAvcStyleLumaIFCoeff8_SSSE3 + fracPos - 32));
    IFCoeff_3_2 = _mm_load_si128((__m128i *)(EbHevcAvcStyleLumaIFCoeff8_SSSE3 + fracPos - 16));

    if (!(puWidth & 15)) { /* width is a multiple of 16 */
        __m128i ref0, ref1, ref2, ref3, ref01_lo, ref01_hi, ref23_lo, ref23_hi, sum_lo, sum_hi;

        for (height_cnt = 0; height_cnt < puHeight; ++height_cnt) {
            for (width_cnt = 0; width_cnt < puWidth; width_cnt += 16) {
                /* Four shifted views of the same 16 pixels: x-1, x, x+1, x+2. */
                ref0 = _mm_loadu_si128((__m128i *)(refPic + width_cnt - 1));
                ref1 = _mm_loadu_si128((__m128i *)(refPic + width_cnt));
                ref2 = _mm_loadu_si128((__m128i *)(refPic + width_cnt + 1));
                ref3 = _mm_loadu_si128((__m128i *)(refPic + width_cnt + 2));

                /* Interleave pixel pairs so maddubs multiplies each pair by
                 * its interleaved (c0,c1) / (c2,c3) coefficients. */
                ref01_lo = _mm_unpacklo_epi8(ref0, ref1);
                ref01_hi = _mm_unpackhi_epi8(ref0, ref1);
                ref23_lo = _mm_unpacklo_epi8(ref2, ref3);
                ref23_hi = _mm_unpackhi_epi8(ref2, ref3);

                /* (c0*p0 + c1*p1) + (c2*p2 + c3*p3), round, shift. */
                sum_lo = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(ref01_lo, IFCoeff_1_0), _mm_maddubs_epi16(ref23_lo, IFCoeff_3_2)), IFOffset), IFShift);
                sum_hi = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(ref01_hi, IFCoeff_1_0), _mm_maddubs_epi16(ref23_hi, IFCoeff_3_2)), IFOffset), IFShift);
                sum_clip_U8 = _mm_packus_epi16(sum_lo, sum_hi); /* clip to [0,255] */
                _mm_storeu_si128((__m128i *)(dst + width_cnt), sum_clip_U8);
            }
            refPic += srcStride;
            dst += dstStride;
        }
    }
    else { /* 8-wide path */
        __m128i sum01, sum23, sum;

        for (height_cnt = 0; height_cnt < puHeight; ++height_cnt) {
            for (width_cnt = 0; width_cnt < puWidth; width_cnt += 8) {
                sum01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPic + width_cnt - 1)),
                                                            _mm_loadl_epi64((__m128i *)(refPic + width_cnt))), IFCoeff_1_0);

                sum23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPic + width_cnt + 1)),
                                                            _mm_loadl_epi64((__m128i *)(refPic + width_cnt + 2))), IFCoeff_3_2);

                sum = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(sum01, sum23), IFOffset), IFShift);
                sum_clip_U8 = _mm_packus_epi16(sum, sum); /* clip; low 8 bytes valid */

                _mm_storel_epi64((__m128i *)(dst + width_cnt), sum_clip_U8);
            }
            refPic += srcStride;
            dst += dstStride;
        }
    }
}
99 
/*
 * Vertical AVC-style 4-tap luma interpolation (SSSE3).
 *
 * For each output pixel:
 *     dst[y] = clip_u8((c0*s[y-1] + c1*s[y] + c2*s[y+1] + c3*s[y+2] + 16) >> 5)
 * with the tap set selected by fracPos from EbHevcAvcStyleLumaIFCoeff8_SSSE3.
 * refPic is rewound by one row up front so the four loads at offsets
 * 0, srcStride, 2*srcStride, 3*srcStride cover rows y-1 .. y+2.
 *
 * refPic    - top-left source pixel of the block (not pre-shifted).
 * srcStride - source stride in bytes.
 * dst       - destination pointer.
 * dstStride - destination stride in bytes.
 * puWidth   - block width; 16x path when a multiple of 16, otherwise the
 *             8-wide path is used (width is assumed to be a multiple of 8).
 * puHeight  - block height in rows.
 * tempBuf   - unused by the pure vertical pass.
 * fracPos   - vertical fractional position, expected in {1, 2, 3}.
 *
 * NOTE(review): rows refPic[-srcStride] through refPic[(puHeight+1)*srcStride]
 * are read; this assumes the picture buffer is padded vertically -- confirm
 * at the call sites.
 */
void AvcStyleLumaInterpolationFilterVertical_SSSE3_INTRIN(
    EB_BYTE refPic,
    EB_U32 srcStride,
    EB_BYTE dst,
    EB_U32 dstStride,
    EB_U32 puWidth,
    EB_U32 puHeight,
    EB_BYTE tempBuf,
    EB_U32 fracPos)
{
    /* Declarations first: keeps the function valid under C90 and silences
     * -Wdeclaration-after-statement (the original cast tempBuf to void
     * before these declarations). The original also kept a redundant copy
     * of srcStride (srcStrideSkip); it is removed here. */
    __m128i IFOffset, IFCoeff_1_0, IFCoeff_3_2, sum_clip_U8;
    EB_U32 width_cnt, height_cnt;
    const EB_U32 IFShift = 5; /* taps sum to 32 -> >> 5 normalization */
    EB_BYTE refPicTemp, dstTemp;

    (void)tempBuf; /* no intermediate buffer needed for vertical-only */

    /* Each fracPos owns two consecutive 16-byte coefficient rows. */
    fracPos <<= 5;
    refPic -= srcStride; /* start one row above: filter window is y-1..y+2 */
    IFOffset = _mm_set1_epi16(0x0010); /* rounding offset = 32 / 2 */
    IFCoeff_1_0 = _mm_load_si128((__m128i *)(EbHevcAvcStyleLumaIFCoeff8_SSSE3 + fracPos - 32));
    IFCoeff_3_2 = _mm_load_si128((__m128i *)(EbHevcAvcStyleLumaIFCoeff8_SSSE3 + fracPos - 16));

    if (!(puWidth & 15)) { /* width is a multiple of 16 */
        __m128i sum_lo, sum_hi, ref0, refs, ref2s, ref3s;

        /* Column-major outer loop: process one 16-wide stripe top to bottom. */
        for (width_cnt = 0; width_cnt < puWidth; width_cnt += 16) {

            refPicTemp = refPic;
            dstTemp = dst;

            for (height_cnt = 0; height_cnt < puHeight; ++height_cnt) {
                /* Four vertically adjacent rows: y-1, y, y+1, y+2. */
                ref0 = _mm_loadu_si128((__m128i *)(refPicTemp));
                refs = _mm_loadu_si128((__m128i *)(refPicTemp + srcStride));
                ref2s = _mm_loadu_si128((__m128i *)(refPicTemp + 2 * srcStride));
                ref3s = _mm_loadu_si128((__m128i *)(refPicTemp + 3 * srcStride));

                /* Interleave row pairs so maddubs applies (c0,c1) / (c2,c3). */
                sum_lo = _mm_add_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(ref0, refs), IFCoeff_1_0),
                    _mm_maddubs_epi16(_mm_unpacklo_epi8(ref2s, ref3s), IFCoeff_3_2));

                sum_hi = _mm_add_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(ref0, refs), IFCoeff_1_0),
                    _mm_maddubs_epi16(_mm_unpackhi_epi8(ref2s, ref3s), IFCoeff_3_2));

                sum_lo = _mm_srai_epi16(_mm_add_epi16(sum_lo, IFOffset), IFShift);
                sum_hi = _mm_srai_epi16(_mm_add_epi16(sum_hi, IFOffset), IFShift);
                sum_clip_U8 = _mm_packus_epi16(sum_lo, sum_hi); /* clip to [0,255] */
                _mm_storeu_si128((__m128i *)(dstTemp), sum_clip_U8);
                dstTemp += dstStride;
                refPicTemp += srcStride;
            }
            refPic += 16;
            dst += 16;
        }
    }
    else { /* 8-wide path */
        __m128i sum, sum01, sum23;

        for (width_cnt = 0; width_cnt < puWidth; width_cnt += 8) {

            refPicTemp = refPic;
            dstTemp = dst;

            for (height_cnt = 0; height_cnt < puHeight; ++height_cnt) {
                sum01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPicTemp)),
                                                            _mm_loadl_epi64((__m128i *)(refPicTemp + srcStride))), IFCoeff_1_0);

                sum23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPicTemp + 2 * srcStride)),
                                                            _mm_loadl_epi64((__m128i *)(refPicTemp + 3 * srcStride))), IFCoeff_3_2);

                sum = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(sum01, sum23), IFOffset), IFShift);
                sum_clip_U8 = _mm_packus_epi16(sum, sum); /* clip; low 8 bytes valid */
                _mm_storel_epi64((__m128i *)(dstTemp), sum_clip_U8);

                dstTemp += dstStride;
                refPicTemp += srcStride;
            }
            refPic += 8;
            dst += 8;
        }
    }
}
181