// Copyright (c) 2018 Intel Corporation
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

/*
//
//  Purpose:
//    Inverse DCT transform with nonzero elements only in the
//    top-left 1x1, 2x2 or 4x4 quadrant, plus de-quantization and level shift
//
*/

#include "precomp.h"

#ifndef __OWNJ_H__
#include "ownj.h"
#endif

#if (_IPP >= _IPP_W7) || (_IPP32E >= _IPP32E_M7)

#include <emmintrin.h>

extern void mfxdct_8x8_inv_2x2_16s(Ipp16s* pSrc, Ipp16s* pDst);
extern void mfxdct_8x8_inv_4x4_16s(Ipp16s* pSrc, Ipp16s* pDst);

/* Eight copies of the +128 level-shift constant, added with signed
   saturation before packing the results to unsigned 8-bit samples. */
static const __ALIGN16 Ipp16u iSA[8] = { 128, 128, 128, 128, 128, 128, 128, 128 };

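/*
// For reference, each fast path in this file is equivalent to the following
// scalar sketch (illustration only, kept out of the build; idct_8x8_ref is a
// hypothetical reference inverse 8x8 DCT, not a function of this library):
*/
#if 0
static void dct_quant_inv8x8_ref(
  const Ipp16s* pSrc, Ipp8u* pDst, int dstStep, const Ipp16u* pQuantInvTable)
{
  Ipp16s blk[64];
  int i, j;
  for (i = 0; i < 64; i++)                  /* de-quantize */
    blk[i] = (Ipp16s)(pSrc[i] * pQuantInvTable[i]);
  idct_8x8_ref(blk, blk);                   /* hypothetical reference IDCT */
  for (i = 0; i < 8; i++) {                 /* level shift and clamp */
    for (j = 0; j < 8; j++) {
      int v = blk[i*8 + j] + 128;
      pDst[i*dstStep + j] = (Ipp8u)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}
#endif
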
extern void mfxdct_quant_inv8x8_1x1_ls(
  const Ipp16s* pSrc,
        Ipp8u*  pDst,
        int     dstStep,
  const Ipp16u* pQuantInvTable)
{
  __ALIGN16 __m128i _iS0;

  /* DC-only block: the inverse DCT is a constant, so de-quantize the DC
     coefficient, scale by 1/8, level-shift and clamp it once. A wide
     intermediate keeps the clamp from seeing a truncated 16-bit value. */
  int val = ((pSrc[0] * pQuantInvTable[0]) >> 3) + 128;
  pDst[0] = (Ipp8u)(val > 255 ? 255 : (val < 0 ? 0 : val));

  /* Broadcast the resulting sample to all eight rows of the block. */
  _iS0 = _mm_set1_epi8((char)pDst[0]);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 0*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 1*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 2*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 3*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 4*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 5*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 6*dstStep), _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 7*dstStep), _iS0);

  return;
} /* mfxdct_quant_inv8x8_1x1_ls() */
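
/*
// Worked example for the DC-only path (illustrative values): with
// pSrc[0] = 4 and pQuantInvTable[0] = 16, val = ((4 * 16) >> 3) + 128 = 136,
// so every one of the 64 output samples is 136.
*/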


extern void mfxdct_quant_inv8x8_2x2_ls(
  const Ipp16s* pSrc,
        Ipp8u*  pDst,
        int     dstStep,
  const Ipp16u* pQuantInvTable)
{
  Ipp16s*   buf;
  Ipp16s    ptr[64*sizeof(Ipp16s) + CPU_CACHE_LINE-1]; /* scratch with room for alignment */

  __ALIGN16 __m128i _iS0, _iS1, _iS2, _iSA;

  buf = (Ipp16s*)IPP_ALIGNED_PTR(&ptr[0],CPU_CACHE_LINE);

  /* De-quantize the first two rows, the only ones that can carry nonzero
     coefficients in the 2x2 case. */
  if((IPP_UINT_PTR(pSrc) | IPP_UINT_PTR(pQuantInvTable)) & 0x0F) {  /* Source or quant table is not 16-byte aligned */
    _iS0 = _mm_loadu_si128((__m128i*)(pSrc +  0));
    _iS1 = _mm_loadu_si128((__m128i*)(pQuantInvTable +  0));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 0), _iS2);

    _iS0 = _mm_loadu_si128((__m128i*)(pSrc +  8));
    _iS1 = _mm_loadu_si128((__m128i*)(pQuantInvTable +  8));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 8), _iS2);
  } else {
    _iS0 = _mm_load_si128((__m128i*)(pSrc +  0));
    _iS1 = _mm_load_si128((__m128i*)(pQuantInvTable +  0));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 0), _iS2);

    _iS0 = _mm_load_si128((__m128i*)(pSrc +  8));
    _iS1 = _mm_load_si128((__m128i*)(pQuantInvTable +  8));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 8), _iS2);
  }

  mfxdct_8x8_inv_2x2_16s(buf, buf);

  _iSA = _mm_load_si128((__m128i*)(iSA + 0));

  /* For each row: add the +128 level shift with signed saturation, pack to
     unsigned bytes with clamping to [0,255], and store eight samples. */
  _iS0 = _mm_load_si128((__m128i*)(buf + 0));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 0*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 8));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 1*dstStep), _iS2);

  _iS0 = _mm_load_si128((__m128i*)(buf + 16));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 2*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 24));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 3*dstStep), _iS2);

  _iS0 = _mm_load_si128((__m128i*)(buf + 32));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 4*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 40));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 5*dstStep), _iS2);

  _iS0 = _mm_load_si128((__m128i*)(buf + 48));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 6*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 56));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 7*dstStep), _iS2);

  return;
} /* mfxdct_quant_inv8x8_2x2_ls() */
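
/*
// Note on the row epilogue used above and in the 4x4 path: _mm_adds_epi16
// applies the +128 level shift with signed saturation, and _mm_packus_epi16
// narrows to unsigned bytes with clamping to [0, 255]. A scalar sketch of
// the same per-row operation (illustration only, kept out of the build):
*/
#if 0
static void level_shift_row_ref(const Ipp16s* row, Ipp8u* dst)
{
  int j;
  for (j = 0; j < 8; j++) {
    int v = row[j] + 128;                                 /* level shift */
    dst[j] = (Ipp8u)(v < 0 ? 0 : (v > 255 ? 255 : v));    /* clamp, narrow */
  }
}
#endif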


extern void mfxdct_quant_inv8x8_4x4_ls(
  const Ipp16s* pSrc,
        Ipp8u*  pDst,
        int     dstStep,
  const Ipp16u* pQuantInvTable)
{
  Ipp16s*   buf;
  Ipp16s    ptr[64*sizeof(Ipp16s) + CPU_CACHE_LINE-1]; /* scratch with room for alignment */

  __ALIGN16 __m128i _iS0, _iS1, _iS2, _iSA;

  buf = (Ipp16s*)IPP_ALIGNED_PTR(&ptr[0],CPU_CACHE_LINE);

  /* De-quantize the first four rows, the only ones that can carry nonzero
     coefficients in the 4x4 case. */
  if((IPP_UINT_PTR(pSrc) | IPP_UINT_PTR(pQuantInvTable)) & 0x0F) {  /* pSrc or pQuantInvTable is not 16-byte aligned */
    _iS0 = _mm_loadu_si128((__m128i*)(pSrc +  0));
    _iS1 = _mm_loadu_si128((__m128i*)(pQuantInvTable +  0));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 0), _iS2);

    _iS0 = _mm_loadu_si128((__m128i*)(pSrc +  8));
    _iS1 = _mm_loadu_si128((__m128i*)(pQuantInvTable +  8));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 8), _iS2);

    _iS0 = _mm_loadu_si128((__m128i*)(pSrc +  16));
    _iS1 = _mm_loadu_si128((__m128i*)(pQuantInvTable +  16));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 16), _iS2);

    _iS0 = _mm_loadu_si128((__m128i*)(pSrc +  24));
    _iS1 = _mm_loadu_si128((__m128i*)(pQuantInvTable +  24));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 24), _iS2);
  } else { /* Everything is aligned */
    _iS0 = _mm_load_si128((__m128i*)(pSrc +  0));
    _iS1 = _mm_load_si128((__m128i*)(pQuantInvTable +  0));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 0), _iS2);

    _iS0 = _mm_load_si128((__m128i*)(pSrc +  8));
    _iS1 = _mm_load_si128((__m128i*)(pQuantInvTable +  8));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 8), _iS2);

    _iS0 = _mm_load_si128((__m128i*)(pSrc +  16));
    _iS1 = _mm_load_si128((__m128i*)(pQuantInvTable +  16));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 16), _iS2);

    _iS0 = _mm_load_si128((__m128i*)(pSrc +  24));
    _iS1 = _mm_load_si128((__m128i*)(pQuantInvTable +  24));
    _iS2 = _mm_mullo_epi16(_iS0, _iS1);
    _mm_store_si128((__m128i*)((Ipp16s*)buf + 24), _iS2);
  }

  mfxdct_8x8_inv_4x4_16s(buf, buf);

  _iSA = _mm_load_si128((__m128i*)(iSA + 0));

  /* Level shift, clamp and store each of the eight rows, as in the 2x2 path. */
  _iS0 = _mm_load_si128((__m128i*)(buf + 0));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 0*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 8));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 1*dstStep), _iS2);

  _iS0 = _mm_load_si128((__m128i*)(buf + 16));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 2*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 24));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 3*dstStep), _iS2);

  _iS0 = _mm_load_si128((__m128i*)(buf + 32));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 4*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 40));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 5*dstStep), _iS2);

  _iS0 = _mm_load_si128((__m128i*)(buf + 48));
  _iS0 = _mm_adds_epi16(_iS0,_iSA);
  _iS2 = _mm_packus_epi16(_iS0, _iS0);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 6*dstStep), _iS2);

  _iS1 = _mm_load_si128((__m128i*)(buf + 56));
  _iS1 = _mm_adds_epi16(_iS1,_iSA);
  _iS2 = _mm_packus_epi16(_iS1, _iS1);
  _mm_storel_epi64((__m128i*)((Ipp8u*)pDst + 7*dstStep), _iS2);

  return;
} /* mfxdct_quant_inv8x8_4x4_ls() */
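
/*
// Usage sketch: a decoder would typically pick the narrowest path that still
// covers the last nonzero coefficient of the block. The dispatch below is a
// hypothetical, conservative illustration (lastNonZero is assumed to be the
// zig-zag index of the last nonzero coefficient; it is not computed here):
*/
#if 0
static void idct_dispatch_example(
  const Ipp16s* pSrc, Ipp8u* pDst, int dstStep,
  const Ipp16u* pQuantInvTable, int lastNonZero)
{
  if (lastNonZero == 0)         /* DC only */
    mfxdct_quant_inv8x8_1x1_ls(pSrc, pDst, dstStep, pQuantInvTable);
  else if (lastNonZero < 3)     /* zig-zag indices 0..2 lie in the top-left 2x2 */
    mfxdct_quant_inv8x8_2x2_ls(pSrc, pDst, dstStep, pQuantInvTable);
  else if (lastNonZero < 10)    /* zig-zag indices 0..9 lie in the top-left 4x4 */
    mfxdct_quant_inv8x8_4x4_ls(pSrc, pDst, dstStep, pQuantInvTable);
  /* otherwise fall back to the library's full 8x8 inverse DCT path */
}
#endif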

#endif