1 /* Copyright  (C) 2010-2020 The RetroArch team
2  *
3  * ---------------------------------------------------------------------------------------
4  * The following license statement only applies to this file (scaler_int.c).
5  * ---------------------------------------------------------------------------------------
6  *
7  * Permission is hereby granted, free of charge,
8  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation the rights to
10  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include <gfx/scaler/scaler_int.h>
24 
25 #include <retro_inline.h>
26 
27 #ifdef SCALER_NO_SIMD
28 #undef __SSE2__
29 #endif
30 
31 #if defined(__SSE2__)
32 #include <emmintrin.h>
33 #ifdef _WIN32
34 #include <intrin.h>
35 #endif
36 #endif
37 
38 /* ARGB8888 scaler is split in two:
39  *
40  * First, horizontal scaler is applied.
41  * Here, all 8-bit channels are expanded to 16-bit. Values are then shifted 7
42  * to left to occupy 15 bits.
43  *
44  * The sign bit is kept empty as we have to do signed multiplication for the
45  * filter.
46  *
47  * A mulhi [(a * b) >> 16] is applied which loses some precision, but is
48  * very efficient for SIMD.
49  * It is accurate enough for 8-bit purposes.
50  *
51  * The fixed point 1.0 for filter is (1 << 14). After horizontal scale,
52  * the output is kept with 16-bit channels, and will now have 13 bits
53  * of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2.
54  *
55  * Vertical scaler takes the 13 bit channels, and performs the
56  * same mulhi steps.
57  * Another 2 bits of precision is lost, which ends up as 11 bits.
58  * Scaling is now complete. Channels are shifted right by 3, and saturated
59  * into 8-bit values.
60  *
 * The C versions of the scalers perform the exact same operations as the
 * SIMD code for testing purposes.
63  */
64 
scaler_argb8888_vert(const struct scaler_ctx * ctx,void * output_,int stride)65 void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
66 {
67    int h, w, y;
68    const uint64_t      *input = ctx->scaled.frame;
69    uint32_t           *output = (uint32_t*)output_;
70 
71    const int16_t *filter_vert = ctx->vert.filter;
72 
73    for (h = 0; h < ctx->out_height; h++,
74          filter_vert += ctx->vert.filter_stride, output += stride >> 2)
75    {
76       const uint64_t *input_base = input + ctx->vert.filter_pos[h]
77          * (ctx->scaled.stride >> 3);
78 
79       for (w = 0; w < ctx->out_width; w++)
80       {
81          const uint64_t *input_base_y = input_base + w;
82 #if defined(__SSE2__)
83          __m128i final;
84          __m128i res = _mm_setzero_si128();
85 
86          for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2,
87                input_base_y += (ctx->scaled.stride >> 2))
88          {
89             __m128i coeff = _mm_set_epi64x(filter_vert[y + 1] * 0x0001000100010001ll, filter_vert[y + 0] * 0x0001000100010001ll);
90             __m128i col   = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]);
91 
92             res           = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
93          }
94 
95          for (; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
96          {
97             __m128i coeff = _mm_set_epi64x(0, filter_vert[y] * 0x0001000100010001ll);
98             __m128i col   = _mm_set_epi64x(0, input_base_y[0]);
99 
100             res           = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
101          }
102 
103          res       = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
104          res       = _mm_srai_epi16(res, (7 - 2 - 2));
105 
106          final     = _mm_packus_epi16(res, res);
107 
108          output[w] = _mm_cvtsi128_si32(final);
109 #else
110          int16_t res_a = 0;
111          int16_t res_r = 0;
112          int16_t res_g = 0;
113          int16_t res_b = 0;
114 
115          for (y = 0; y < ctx->vert.filter_len; y++,
116                input_base_y += (ctx->scaled.stride >> 3))
117          {
118             uint64_t col   = *input_base_y;
119 
120             int16_t a      = (col >> 48) & 0xffff;
121             int16_t r      = (col >> 32) & 0xffff;
122             int16_t g      = (col >> 16) & 0xffff;
123             int16_t b      = (col >>  0) & 0xffff;
124 
125             int16_t coeff  = filter_vert[y];
126 
127             res_a         += (a * coeff) >> 16;
128             res_r         += (r * coeff) >> 16;
129             res_g         += (g * coeff) >> 16;
130             res_b         += (b * coeff) >> 16;
131          }
132 
133          res_a           >>= (7 - 2 - 2);
134          res_r           >>= (7 - 2 - 2);
135          res_g           >>= (7 - 2 - 2);
136          res_b           >>= (7 - 2 - 2);
137 
138          output[w]         =
139             (clamp_8bit(res_a) << 24) |
140             (clamp_8bit(res_r) << 16) |
141             (clamp_8bit(res_g) << 8)  |
142             (clamp_8bit(res_b) << 0);
143 #endif
144       }
145    }
146 }
147 
scaler_argb8888_horiz(const struct scaler_ctx * ctx,const void * input_,int stride)148 void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
149 {
150    int h, w, x;
151    const uint32_t *input = (uint32_t*)input_;
152    uint64_t *output      = ctx->scaled.frame;
153 
154    for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2,
155          output += ctx->scaled.stride >> 3)
156    {
157       const int16_t *filter_horiz = ctx->horiz.filter;
158 
159       for (w = 0; w < ctx->scaled.width; w++,
160             filter_horiz += ctx->horiz.filter_stride)
161       {
162          const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w];
163 #if defined(__SSE2__)
164          __m128i res = _mm_setzero_si128();
165 #ifndef __x86_64__
166          union
167          {
168             uint32_t *u32;
169             uint64_t *u64;
170          } u;
171 #endif
172          for (x = 0; (x + 1) < ctx->horiz.filter_len; x += 2)
173          {
174             __m128i coeff = _mm_set_epi64x(filter_horiz[x + 1] * 0x0001000100010001ll, filter_horiz[x + 0] * 0x0001000100010001ll);
175 
176             __m128i col   = _mm_unpacklo_epi8(_mm_set_epi64x(0,
177                      ((uint64_t)input_base_x[x + 1] << 32) | input_base_x[x + 0]), _mm_setzero_si128());
178 
179             col           = _mm_slli_epi16(col, 7);
180             res           = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
181          }
182 
183          for (; x < ctx->horiz.filter_len; x++)
184          {
185             __m128i coeff = _mm_set_epi64x(0, filter_horiz[x] * 0x0001000100010001ll);
186             __m128i col   = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, 0, input_base_x[x]), _mm_setzero_si128());
187 
188             col           = _mm_slli_epi16(col, 7);
189             res           = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
190          }
191 
192          res              = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
193 
194 #ifdef __x86_64__
195          output[w]        = _mm_cvtsi128_si64(res);
196 #else /* 32-bit doesn't have si64. Do it in two steps. */
197          u.u64    = output + w;
198          u.u32[0] = _mm_cvtsi128_si32(res);
199          u.u32[1] = _mm_cvtsi128_si32(_mm_srli_si128(res, 4));
200 #endif
201 #else
202          int16_t res_a = 0;
203          int16_t res_r = 0;
204          int16_t res_g = 0;
205          int16_t res_b = 0;
206 
207          for (x = 0; x < ctx->horiz.filter_len; x++)
208          {
209             uint32_t col   = input_base_x[x];
210 
211             int16_t a      = (col >> (24 - 7)) & (0xff << 7);
212             int16_t r      = (col >> (16 - 7)) & (0xff << 7);
213             int16_t g      = (col >> ( 8 - 7)) & (0xff << 7);
214             int16_t b      = (col << ( 0 + 7)) & (0xff << 7);
215 
216             int16_t coeff  = filter_horiz[x];
217 
218             res_a         += (a * coeff) >> 16;
219             res_r         += (r * coeff) >> 16;
220             res_g         += (g * coeff) >> 16;
221             res_b         += (b * coeff) >> 16;
222          }
223 
224          output[w]         = (
225                (uint64_t)res_a  << 48)  |
226                ((uint64_t)res_r << 32)  |
227                ((uint64_t)res_g << 16)  |
228                ((uint64_t)res_b << 0);
229 #endif
230       }
231    }
232 }
233 
/* Point (single-sample) scaler for 32-bit pixels.
 *
 * ctx        : unused by this function.
 * output_    : destination buffer, written as 32-bit pixels.
 * input_     : source buffer, read as 32-bit pixels.
 * out_width  : destination width in pixels.
 * out_height : destination height in pixels.
 * in_width   : source width in pixels.
 * in_height  : source height in pixels.
 * out_stride : destination row pitch in bytes.
 * in_stride  : source row pitch in bytes.
 *
 * Source coordinates are tracked in 16.16 fixed point. Sampling starts at
 * half a step minus half a pixel (clamped to 0) and advances by
 * in / out per destination pixel; each destination pixel copies exactly
 * one source pixel.
 *
 * Nothing is written when out_width or out_height is not positive.
 */
void scaler_argb8888_point_special(const struct scaler_ctx *ctx,
      void *output_, const void *input_,
      int out_width, int out_height,
      int in_width, int in_height,
      int out_stride, int in_stride)
{
   int h, w;
   int64_t x_pos, x_step, y_pos, y_step;
   const uint32_t *input = (const uint32_t*)input_;
   uint32_t *output      = (uint32_t*)output_;

   (void)ctx;

   /* An empty destination has nothing to fill; returning early also
    * keeps the divisions below away from a zero divisor. */
   if (out_width <= 0 || out_height <= 0)
      return;

   /* 64-bit intermediates: (1 << 16) * in_width no longer fits in a
    * 32-bit int once a source dimension reaches 32768. */
   x_pos  = (int64_t)(1 << 15) * in_width  / out_width  - (1 << 15);
   x_step = (int64_t)(1 << 16) * in_width  / out_width;
   y_pos  = (int64_t)(1 << 15) * in_height / out_height - (1 << 15);
   y_step = (int64_t)(1 << 16) * in_height / out_height;

   /* When upscaling, the half-pixel offset is negative; start at 0. */
   if (x_pos < 0)
      x_pos = 0;
   if (y_pos < 0)
      y_pos = 0;

   for (h = 0; h < out_height; h++, y_pos += y_step, output += out_stride >> 2)
   {
      int64_t           x = x_pos;
      const uint32_t *inp = input + (y_pos >> 16) * (in_stride >> 2);

      for (w = 0; w < out_width; w++, x += x_step)
         output[w] = inp[x >> 16];
   }
}
262