1 // Copyright (c) 2018 Intel Corporation
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in all
11 // copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 // SOFTWARE.
20
21 /*
22 //
23 // Purpose:
24 // Inverse DCT transform with nonzero elements only in
25 // top left 1x1, 2x2 or 4x4 quadrant + de-quantization and level shift
26 //
27 */
28
29 #include "precomp.h"
30
31 #ifndef __OWNJ_H__
32 #include "ownj.h"
33 #endif
34
35 #if (_IPP >= _IPP_W7) || (_IPP32E >= _IPP32E_M7)
36
37 #include <emmintrin.h>
38
39 extern void mfxdct_8x8_inv_2x2_16s(Ipp16s* pSrc, Ipp16s* pDst);
40 extern void mfxdct_8x8_inv_4x4_16s(Ipp16s* pSrc, Ipp16s* pDst);
41
/* Eight copies of 128: the JPEG level-shift constant added to each 16-bit
   inverse-DCT output sample before saturating pack to unsigned 8-bit. */
static const __ALIGN16 Ipp16u iSA[8] = { 128, 128, 128, 128, 128, 128, 128, 128 };
43
44
/*
 * Inverse DCT for an 8x8 block whose only nonzero coefficient is the DC
 * term, combined with de-quantization and +128 level shift.
 *
 * pSrc           - quantized coefficients; only pSrc[0] (DC) is used
 * pDst           - 8x8 output block of Ipp8u samples
 * dstStep        - row stride of pDst in bytes
 * pQuantInvTable - de-quantization multipliers; only entry 0 is used
 *
 * The 8x8 inverse DCT of a DC-only block is flat: every sample equals
 * DC/8, so one value is computed, clamped, and replicated to all 64 cells.
 */
extern void mfxdct_quant_inv8x8_1x1_ls(
  const Ipp16s* pSrc,
  Ipp8u*        pDst,
  int           dstStep,
  const Ipp16u* pQuantInvTable)
{
  __ALIGN16 __m128i _iS0;

  /* De-quantize, scale by 1/8 and level-shift. Keep full int precision:
     the product can exceed the Ipp16s range, and truncating to 16 bits
     before the clamp would wrap and saturate to the wrong rail. */
  int val = ((pSrc[0] * pQuantInvTable[0]) >> 3) + 128;
  pDst[0] = (Ipp8u)(val > 255 ? 255 : (val < 0 ? 0 : val));

  /* Broadcast the single sample to 16 bytes and store 8 bytes per row. */
  _iS0 = _mm_set1_epi8((char)pDst[0]);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 0*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 1*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 2*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 3*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 4*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 5*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 6*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 7*dstStep), _iS0);

  return;
} /* mfxdct_quant_inv8x8_1x1_ls() */
68
69
/*
 * Inverse DCT for an 8x8 block with nonzero coefficients only in the top-left
 * 2x2 quadrant, combined with de-quantization and +128 level shift.
 *
 * pSrc           - quantized coefficients (only rows 0-1, columns 0-1 matter)
 * pDst           - 8x8 output block of Ipp8u samples
 * dstStep        - row stride of pDst in bytes
 * pQuantInvTable - 64-entry de-quantization multiplier table
 */
extern void mfxdct_quant_inv8x8_2x2_ls(
  const Ipp16s* pSrc,
  Ipp8u*        pDst,
  int           dstStep,
  const Ipp16u* pQuantInvTable)
{
  Ipp16s* buf;
  /* Scratch for 64 coefficients, over-allocated so the working pointer can
     be rounded up to a cache-line boundary (aligned SSE stores/loads). */
  Ipp16s ptr[64*sizeof(Ipp16s) + CPU_CACHE_LINE-1];

  __ALIGN16 __m128i _iS0, _iS1, _iS2, _iSA;

  buf = (Ipp16s*)IPP_ALIGNED_PTR(&ptr[0],CPU_CACHE_LINE);

  /* De-quantize the first two coefficient rows (the only nonzero ones).
     Bitwise OR of both addresses: any low bit set means at least one
     pointer is not 16-byte aligned and unaligned loads must be used. */
  if((IPP_UINT_PTR(pSrc) | IPP_UINT_PTR(pQuantInvTable)) & 0x0F) {
    _iS0 = _mm_loadu_si128((__m128i*)(pSrc + 0));
    _iS1 = _mm_loadu_si128((__m128i*)(pQuantInvTable + 0));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 0), _iS2);

    _iS0 = _mm_loadu_si128((__m128i*)(pSrc + 8));
    _iS1 = _mm_loadu_si128((__m128i*)(pQuantInvTable + 8));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 8), _iS2);
  } else { /* Both source and quant table are 16-byte aligned */
    _iS0 = _mm_load_si128((__m128i*)(pSrc + 0));
    _iS1 = _mm_load_si128((__m128i*)(pQuantInvTable + 0));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 0), _iS2);

    _iS0 = _mm_load_si128((__m128i*)(pSrc + 8));
    _iS1 = _mm_load_si128((__m128i*)(pQuantInvTable + 8));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 8), _iS2);
  }

  /* In-place reduced inverse DCT (2x2 nonzero quadrant). */
  mfxdct_8x8_inv_2x2_16s(buf, buf);

  /* Level-shift each row by +128 (saturating) and pack to unsigned bytes. */
  _iSA = _mm_load_si128((__m128i*)(iSA + 0));

  _iS0 = _mm_load_si128((__m128i*)(buf + 0));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 0*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 8));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 1*dstStep), _iS2);

  _iS0 = _mm_load_si128((__m128i*)(buf + 16));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 2*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 24));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 3*dstStep), _iS2);

  _iS0 = _mm_load_si128((__m128i*)(buf + 32));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 4*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 40));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 5*dstStep), _iS2);

  _iS0 = _mm_load_si128((__m128i*)(buf + 48));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 6*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 56));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 7*dstStep), _iS2);

  return;
} /* mfxdct_quant_inv8x8_2x2_ls() */
152
153
/*
 * Inverse DCT for an 8x8 block with nonzero coefficients only in the top-left
 * 4x4 quadrant, combined with de-quantization and +128 level shift.
 *
 * pSrc           - quantized coefficients (only rows 0-3, columns 0-3 matter)
 * pDst           - 8x8 output block of Ipp8u samples
 * dstStep        - row stride of pDst in bytes
 * pQuantInvTable - 64-entry de-quantization multiplier table
 */
extern void mfxdct_quant_inv8x8_4x4_ls(
  const Ipp16s* pSrc,
  Ipp8u*        pDst,
  int           dstStep,
  const Ipp16u* pQuantInvTable)
{
  Ipp16s* buf;
  /* Scratch for 64 coefficients, over-allocated so the working pointer can
     be rounded up to a cache-line boundary (aligned SSE stores/loads). */
  Ipp16s ptr[64*sizeof(Ipp16s) + CPU_CACHE_LINE-1];

  __ALIGN16 __m128i _iS0, _iS1, _iS2, _iSA;

  buf = (Ipp16s*)IPP_ALIGNED_PTR(&ptr[0],CPU_CACHE_LINE);

  /* De-quantize the first four coefficient rows (the only nonzero ones).
     Bitwise OR of both addresses: any low bit set means at least one
     pointer is not 16-byte aligned and unaligned loads must be used. */
  if((IPP_UINT_PTR(pSrc) | IPP_UINT_PTR(pQuantInvTable)) & 0x0F) {
    _iS0 = _mm_loadu_si128((__m128i*)(pSrc + 0));
    _iS1 = _mm_loadu_si128((__m128i*)(pQuantInvTable + 0));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 0), _iS2);

    _iS0 = _mm_loadu_si128((__m128i*)(pSrc + 8));
    _iS1 = _mm_loadu_si128((__m128i*)(pQuantInvTable + 8));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 8), _iS2);

    _iS0 = _mm_loadu_si128((__m128i*)(pSrc + 16));
    _iS1 = _mm_loadu_si128((__m128i*)(pQuantInvTable + 16));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 16), _iS2);

    _iS0 = _mm_loadu_si128((__m128i*)(pSrc + 24));
    _iS1 = _mm_loadu_si128((__m128i*)(pQuantInvTable + 24));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 24), _iS2);
  } else { /* Both source and quant table are 16-byte aligned */
    _iS0 = _mm_load_si128((__m128i*)(pSrc + 0));
    _iS1 = _mm_load_si128((__m128i*)(pQuantInvTable + 0));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 0), _iS2);

    _iS0 = _mm_load_si128((__m128i*)(pSrc + 8));
    _iS1 = _mm_load_si128((__m128i*)(pQuantInvTable + 8));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 8), _iS2);

    _iS0 = _mm_load_si128((__m128i*)(pSrc + 16));
    _iS1 = _mm_load_si128((__m128i*)(pQuantInvTable + 16));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 16), _iS2);

    _iS0 = _mm_load_si128((__m128i*)(pSrc + 24));
    _iS1 = _mm_load_si128((__m128i*)(pQuantInvTable + 24));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 24), _iS2);
  }

  /* In-place reduced inverse DCT (4x4 nonzero quadrant). */
  mfxdct_8x8_inv_4x4_16s(buf, buf);

  /* Level-shift each row by +128 (saturating) and pack to unsigned bytes. */
  _iSA = _mm_load_si128((__m128i*)(iSA + 0));

  _iS0 = _mm_load_si128((__m128i*)(buf + 0));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 0*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 8));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 1*dstStep), _iS2);

  _iS0 = _mm_load_si128((__m128i*)(buf + 16));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 2*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 24));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 3*dstStep), _iS2);

  _iS0 = _mm_load_si128((__m128i*)(buf + 32));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 4*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 40));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 5*dstStep), _iS2);

  _iS0 = _mm_load_si128((__m128i*)(buf + 48));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 6*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 56));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 7*dstStep), _iS2);

  return;
} /* mfxdct_quant_inv8x8_4x4_ls() */
254
255 #endif
256