1 /*
2 * Copyright(c) 2018 Intel Corporation
 * SPDX-License-Identifier: BSD-2-Clause-Patent
4 */
5
6 #include "EbAvcStyleMcp_SSSE3.h"
7
8 #include "EbDefinitions.h"
9
10 #include "emmintrin.h"
11 #include "tmmintrin.h"
12
13
14 EB_EXTERN EB_ALIGN(16) const EB_S8 EbHevcAvcStyleLumaIFCoeff8_SSSE3[]= {
15 -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25,
16 9, -1, 9, -1, 9, -1, 9, -1, 9, -1, 9, -1, 9, -1, 9, -1,
17 -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18,
18 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2, 18, -2,
19 -1, 9, -1, 9, -1, 9, -1, 9, -1, 9, -1, 9, -1, 9, -1, 9,
20 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1, 25, -1
21 };
22
23
24 void PictureCopyKernel_SSSE3(
25 EB_BYTE src,
26 EB_U32 srcStride,
27 EB_BYTE dst,
28 EB_U32 dstStride,
29 EB_U32 areaWidth,
30 EB_U32 areaHeight,
31 EB_U32 bytesPerSample);
32
AvcStyleLumaInterpolationFilterHorizontal_SSSE3_INTRIN(EB_BYTE refPic,EB_U32 srcStride,EB_BYTE dst,EB_U32 dstStride,EB_U32 puWidth,EB_U32 puHeight,EB_BYTE tempBuf,EB_U32 fracPos)33 void AvcStyleLumaInterpolationFilterHorizontal_SSSE3_INTRIN(
34 EB_BYTE refPic,
35 EB_U32 srcStride,
36 EB_BYTE dst,
37 EB_U32 dstStride,
38 EB_U32 puWidth,
39 EB_U32 puHeight,
40 EB_BYTE tempBuf,
41 EB_U32 fracPos)
42 {
43 (void)tempBuf;
44 __m128i IFOffset, IFCoeff_1_0, IFCoeff_3_2, sum_clip_U8;
45 EB_U32 width_cnt, height_cnt;
46 EB_U32 IFShift = 5;
47
48 fracPos <<= 5;
49 IFOffset = _mm_set1_epi16(0x0010);
50 IFCoeff_1_0 = _mm_load_si128((__m128i *)(EbHevcAvcStyleLumaIFCoeff8_SSSE3 + fracPos - 32));
51 IFCoeff_3_2 = _mm_load_si128((__m128i *)(EbHevcAvcStyleLumaIFCoeff8_SSSE3 + fracPos - 16));
52
53 if (!(puWidth & 15)) { // 16x
54 __m128i ref0, ref1, ref2, ref3, ref01_lo, ref01_hi, ref23_lo, ref23_hi, sum_lo, sum_hi;
55
56 for (height_cnt = 0; height_cnt < puHeight; ++height_cnt){
57 for (width_cnt = 0; width_cnt < puWidth; width_cnt += 16) {
58 ref0 = _mm_loadu_si128((__m128i *)(refPic + width_cnt - 1));
59 ref1 = _mm_loadu_si128((__m128i *)(refPic + width_cnt));
60 ref2 = _mm_loadu_si128((__m128i *)(refPic + width_cnt + 1));
61 ref3 = _mm_loadu_si128((__m128i *)(refPic + width_cnt + 2));
62
63 ref01_lo = _mm_unpacklo_epi8(ref0, ref1);
64 ref01_hi = _mm_unpackhi_epi8(ref0, ref1);
65 ref23_lo = _mm_unpacklo_epi8(ref2, ref3);
66 ref23_hi = _mm_unpackhi_epi8(ref2, ref3);
67
68 sum_lo = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(ref01_lo, IFCoeff_1_0), _mm_maddubs_epi16(ref23_lo, IFCoeff_3_2)), IFOffset), IFShift);
69 sum_hi = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(ref01_hi, IFCoeff_1_0), _mm_maddubs_epi16(ref23_hi, IFCoeff_3_2)), IFOffset), IFShift);
70 sum_clip_U8 = _mm_packus_epi16(sum_lo, sum_hi);
71 _mm_storeu_si128((__m128i *)(dst + width_cnt), sum_clip_U8);
72 }
73 refPic += srcStride;
74 dst += dstStride;
75 }
76 }
77 else { //8x
78 __m128i sum01, sum23, sum;
79
80 for (height_cnt = 0; height_cnt < puHeight; ++height_cnt){
81 for (width_cnt = 0; width_cnt < puWidth; width_cnt += 8) {
82 sum01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPic + width_cnt - 1)),
83 _mm_loadl_epi64((__m128i *)(refPic + width_cnt))), IFCoeff_1_0);
84
85 sum23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPic + width_cnt + 1)),
86 _mm_loadl_epi64((__m128i *)(refPic + width_cnt + 2))), IFCoeff_3_2);
87
88 sum = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(sum01, sum23), IFOffset), IFShift);
89 sum_clip_U8 = _mm_packus_epi16(sum, sum);
90
91 _mm_storel_epi64((__m128i *)(dst + width_cnt), sum_clip_U8);
92 }
93 refPic += srcStride;
94 dst += dstStride;
95 }
96
97 }
98 }
99
AvcStyleLumaInterpolationFilterVertical_SSSE3_INTRIN(EB_BYTE refPic,EB_U32 srcStride,EB_BYTE dst,EB_U32 dstStride,EB_U32 puWidth,EB_U32 puHeight,EB_BYTE tempBuf,EB_U32 fracPos)100 void AvcStyleLumaInterpolationFilterVertical_SSSE3_INTRIN(
101 EB_BYTE refPic,
102 EB_U32 srcStride,
103 EB_BYTE dst,
104 EB_U32 dstStride,
105 EB_U32 puWidth,
106 EB_U32 puHeight,
107 EB_BYTE tempBuf,
108 EB_U32 fracPos)
109 {
110 (void)tempBuf;
111 __m128i IFOffset, IFCoeff_1_0, IFCoeff_3_2, sum_clip_U8;
112 EB_U32 width_cnt, height_cnt;
113 EB_U32 IFShift = 5;
114 EB_U32 srcStrideSkip = srcStride;
115 EB_BYTE refPicTemp, dstTemp;
116
117 fracPos <<= 5;
118 refPic -= srcStride;
119 IFOffset = _mm_set1_epi16(0x0010);
120 IFCoeff_1_0 = _mm_load_si128((__m128i *)(EbHevcAvcStyleLumaIFCoeff8_SSSE3 + fracPos - 32));
121 IFCoeff_3_2 = _mm_load_si128((__m128i *)(EbHevcAvcStyleLumaIFCoeff8_SSSE3 + fracPos - 16));
122 if (!(puWidth & 15)) { //16x
123
124 __m128i sum_lo, sum_hi, ref0, refs, ref2s, ref3s;
125
126 for (width_cnt = 0; width_cnt < puWidth; width_cnt += 16) {
127
128 refPicTemp = refPic;
129 dstTemp = dst;
130
131 for (height_cnt = 0; height_cnt < puHeight; ++height_cnt) {
132 ref0 = _mm_loadu_si128((__m128i *)(refPicTemp));
133 refs = _mm_loadu_si128((__m128i *)(refPicTemp + srcStride));
134 ref2s = _mm_loadu_si128((__m128i *)(refPicTemp + 2 * srcStride));
135 ref3s = _mm_loadu_si128((__m128i *)(refPicTemp + 3 * srcStride));
136
137 sum_lo = _mm_add_epi16(_mm_maddubs_epi16(_mm_unpacklo_epi8(ref0, refs), IFCoeff_1_0),
138 _mm_maddubs_epi16(_mm_unpacklo_epi8(ref2s, ref3s), IFCoeff_3_2));
139
140 sum_hi = _mm_add_epi16(_mm_maddubs_epi16(_mm_unpackhi_epi8(ref0, refs), IFCoeff_1_0),
141 _mm_maddubs_epi16(_mm_unpackhi_epi8(ref2s, ref3s), IFCoeff_3_2));
142
143 sum_lo = _mm_srai_epi16(_mm_add_epi16(sum_lo, IFOffset), IFShift);
144 sum_hi = _mm_srai_epi16(_mm_add_epi16(sum_hi, IFOffset), IFShift);
145 sum_clip_U8 = _mm_packus_epi16(sum_lo, sum_hi);
146 _mm_storeu_si128((__m128i *)(dstTemp), sum_clip_U8);
147 dstTemp += dstStride;
148 refPicTemp += srcStrideSkip;
149 }
150 refPic += 16;
151 dst += 16;
152 }
153 }
154 else { //8x
155 __m128i sum, sum01, sum23;
156
157 for (width_cnt = 0; width_cnt < puWidth; width_cnt += 8) {
158
159 refPicTemp = refPic;
160 dstTemp = dst;
161
162 for (height_cnt = 0; height_cnt < puHeight; ++height_cnt) {
163 sum01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPicTemp)),
164 _mm_loadl_epi64((__m128i *)(refPicTemp + srcStride))), IFCoeff_1_0);
165
166 sum23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(refPicTemp + 2 * srcStride)),
167 _mm_loadl_epi64((__m128i *)(refPicTemp + 3 * srcStride))), IFCoeff_3_2);
168
169 sum = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(sum01, sum23), IFOffset), IFShift);
170 sum_clip_U8 = _mm_packus_epi16(sum, sum);
171 _mm_storel_epi64((__m128i *)(dstTemp), sum_clip_U8);
172
173 dstTemp += dstStride;
174 refPicTemp += srcStrideSkip;
175 }
176 refPic += 8;
177 dst += 8;
178 }
179 }
180 }
181