1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #include "common.hpp"
41 
42 namespace CAROTENE_NS {
43 
44 #ifdef CAROTENE_NEON
45 
// Generates `convertScale(size, srcBase, srcStride, dstBase, dstStride, alpha, beta)`
// for a T1 -> T2 element conversion: dst[i] = saturate_cast<T2>(src[i] * alpha + beta).
//
//   T1/T2     - source/destination element types.
//   SIMD_SIZE - elements handled per vector iteration (must be a power of two;
//               used to mask the width down to a SIMD-aligned count `w`).
//   CVTINIT   - statements run once before the row loop (typically broadcasts
//               alpha/beta into NEON registers); may reference alpha, beta.
//   CVTROW    - braced block forming the body of the row loop; may reference
//               _src, _dst, w and anything declared by CVTINIT.
//
// The first loop covers columns [0, w) with SIMD; the second is a scalar tail
// for the remaining (size.width - w) columns of every row.
// NOTE(review): the row-collapse test compares the byte strides to size.width
// without multiplying by sizeof(T1)/sizeof(T2), so it only fires for tightly
// packed single-byte layouts -- confirm this conservatism is intentional.
// NOTE(review): SIMD paths add 0.5 to beta and truncate, while the scalar tail
// uses saturate_cast rounding -- results can differ for negative products;
// presumably accepted upstream, verify if exact parity matters.
#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW)                            \
    void convertScale(const Size2D &_size,                                       \
                      const T1 * srcBase, ptrdiff_t srcStride,                   \
                      T2 * dstBase, ptrdiff_t dstStride,                         \
                      f64 alpha, f64 beta)                                       \
    {                                                                            \
        internal::assertSupportedConfiguration();                                \
        Size2D size(_size);                                                      \
        if (srcStride == dstStride &&                                            \
            srcStride == (ptrdiff_t)(size.width))                                \
        {                                                                        \
            size.width *= size.height;                                           \
            size.height = 1;                                                     \
        }                                                                        \
        const ptrdiff_t sstep = srcStride / sizeof(T1);                          \
        const ptrdiff_t dstep = dstStride / sizeof(T2);                          \
        const size_t w = size.width & ~(SIMD_SIZE-1);                            \
        if (size.width >= SIMD_SIZE)                                             \
        {                                                                        \
            const T1* _src = srcBase;                                            \
            T2* _dst = dstBase;                                                  \
            CVTINIT                                                              \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep )  \
                CVTROW                                                           \
        }                                                                        \
        if(w < size.width)                                                       \
        {                                                                        \
            const T1* _src = srcBase;                                            \
            T2* _dst = dstBase;                                                  \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep )  \
                for(size_t i = w; i < size.width; i++ )                          \
                    _dst[i] = internal::saturate_cast<T2>(_src[i]*alpha + beta); \
        }                                                                        \
    }
80 
// Same generator as CVTS_FUNC but for the homogeneous case T1 -> T1
// (source and destination share one element type):
//   dst[i] = saturate_cast<T1>(src[i] * alpha + beta).
// CVTSINIT/CVTSROW play the same roles as CVTINIT/CVTROW above: one-time
// setup before the row loop, and the per-row SIMD body covering [0, w).
// A scalar tail loop finishes the (size.width - w) leftover columns.
// NOTE(review): as in CVTS_FUNC, the row-collapse stride test omits the
// sizeof(T1) factor -- only byte-sized tightly packed images collapse.
#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW)                             \
    void convertScale(const Size2D &_size,                                       \
                      const T1 * srcBase, ptrdiff_t srcStride,                   \
                      T1 * dstBase, ptrdiff_t dstStride,                         \
                      f64 alpha, f64 beta)                                       \
    {                                                                            \
        internal::assertSupportedConfiguration();                                \
        Size2D size(_size);                                                      \
        if (srcStride == dstStride &&                                            \
            srcStride == (ptrdiff_t)(size.width))                                \
        {                                                                        \
            size.width *= size.height;                                           \
            size.height = 1;                                                     \
        }                                                                        \
        const ptrdiff_t sstep = srcStride / sizeof(T1);                          \
        const ptrdiff_t dstep = dstStride / sizeof(T1);                          \
        const size_t w = size.width & ~(SIMD_SIZE-1);                            \
        if (size.width >= SIMD_SIZE)                                             \
        {                                                                        \
            const T1* _src = srcBase;                                            \
            T1* _dst = dstBase;                                                  \
            CVTSINIT                                                             \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep )  \
                CVTSROW                                                          \
        }                                                                        \
        if(w < size.width)                                                       \
        {                                                                        \
            const T1* _src = srcBase;                                            \
            T1* _dst = dstBase;                                                  \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep )  \
                for(size_t i = w; i < size.width; i++ )                          \
                    _dst[i] = internal::saturate_cast<T1>(_src[i]*alpha + beta); \
        }                                                                        \
    }
115 
116 #else
117 
// Non-NEON build: emit a stub that only asserts the unsupported configuration.
// Parameters are intentionally unnamed (unused); CVTINIT/CVTROW are discarded,
// so the NEON intrinsics/asm passed at the invocation sites never compile here.
#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW)                            \
    void convertScale(const Size2D &,                                            \
                      const T1 *, ptrdiff_t,                                     \
                      T2 *, ptrdiff_t,                                           \
                      f64, f64)                                                  \
    {                                                                            \
        internal::assertSupportedConfiguration();                                \
    }
126 
// Non-NEON stub for the homogeneous T1 -> T1 overload; see CVTS_FUNC stub above.
#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW)                             \
    void convertScale(const Size2D &,                                            \
                      const T1 *, ptrdiff_t,                                     \
                      T1 *, ptrdiff_t,                                           \
                      f64, f64)                                                  \
    {                                                                            \
        internal::assertSupportedConfiguration();                                \
    }
135 
136 #endif
137 
138 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// u8 -> u8 convertScale, ARM32 GCC path: hand-scheduled inline assembly.
// alpha and (beta + 0.5) are broadcast once per image into q0/q1; the
// `register ... asm("qN")` pinning keeps them live across loop iterations
// without reloads (q0/q1 are read-only here and absent from the clobber list).
// NOTE(review): `register ... asm` is deprecated in C++17 but still honoured
// by GCC for ARM32 -- confirm against the project's compiler settings.
CVTS_FUNC1(u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Pipeline per 16 pixels: load u8x16 -> widen to 4x u32x4 -> f32
        // -> *alpha, +(beta+0.5) -> truncate to s32 -> saturating narrow
        // s32 -> u16 (vqmovun) -> u8 (vqmovn) -> store 16 results.
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]                              \n\t"
            "vmovl.u8 q3, d4                                       \n\t"
            "vmovl.u8 q4, d5                                       \n\t"
            "vmovl.u16 q5, d6                                      \n\t"
            "vmovl.u16 q6, d7                                      \n\t"
            "vmovl.u16 q7, d8                                      \n\t"
            "vmovl.u16 q8, d9                                      \n\t"
            "vcvt.f32.u32 q9, q5                                   \n\t"
            "vcvt.f32.u32 q10, q6                                  \n\t"
            "vcvt.f32.u32 q11, q7                                  \n\t"
            "vcvt.f32.u32 q12, q8                                  \n\t"
            "vmul.f32 q13, q9, q0                                  \n\t"
            "vmul.f32 q14, q10, q0                                 \n\t"
            "vmul.f32 q15, q11, q0                                 \n\t"
            "vmul.f32 q2, q12, q0                                  \n\t"
            "vadd.f32 q3, q13, q1                                  \n\t"
            "vadd.f32 q4, q14, q1                                  \n\t"
            "vadd.f32 q5, q15, q1                                  \n\t"
            "vadd.f32 q6, q2, q1                                   \n\t"
            "vcvt.s32.f32 q7, q3                                   \n\t"
            "vcvt.s32.f32 q8, q4                                   \n\t"
            "vcvt.s32.f32 q9, q5                                   \n\t"
            "vcvt.s32.f32 q10, q6                                  \n\t"
            "vqmovun.s32 d22, q7                                   \n\t"
            "vqmovun.s32 d23, q8                                   \n\t"
            "vqmovun.s32 d24, q9                                   \n\t"
            "vqmovun.s32 d25, q10                                  \n\t"
            "vqmovn.u16 d26, q11                                   \n\t"
            "vqmovn.u16 d27, q12                                   \n\t"
            "vst1.8 {d26-d27}, [%[dst1]]                           \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
185 #else
186 CVTS_FUNC1(u8, 16,
187     float32x4_t vscale = vdupq_n_f32((f32)alpha);
188     float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
189 {
190     for (size_t i = 0; i < w; i += 16)
191     {
192         internal::prefetch(_src + i);
193         uint8x16_t vline = vld1q_u8(_src + i);
194         uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
195         uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
196         uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
197         uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
198         uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
199         uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
200         float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
201         float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
202         float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
203         float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
204         vline1_f32 = vmulq_f32(vline1_f32, vscale);
205         vline2_f32 = vmulq_f32(vline2_f32, vscale);
206         vline3_f32 = vmulq_f32(vline3_f32, vscale);
207         vline4_f32 = vmulq_f32(vline4_f32, vscale);
208         vline1_f32 = vaddq_f32(vline1_f32, vshift);
209         vline2_f32 = vaddq_f32(vline2_f32, vshift);
210         vline3_f32 = vaddq_f32(vline3_f32, vshift);
211         vline4_f32 = vaddq_f32(vline4_f32, vshift);
212         int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
213         int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
214         int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
215         int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
216         uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
217         uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
218         vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
219     }
220 })
221 #endif
222 
223 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// u8 -> s8 convertScale, ARM32 GCC path. Identical pipeline to the u8 -> u8
// version above except the final narrowing is signed (vqmovn.s32/vqmovn.s16),
// saturating the scaled values into [-128, 127].
CVTS_FUNC(u8, s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // load u8x16 -> widen -> f32 -> *alpha + (beta+0.5) -> s32
        // -> signed saturating narrow s32 -> s16 -> s8 -> store.
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]                              \n\t"
            "vmovl.u8 q3, d4                                       \n\t"
            "vmovl.u8 q4, d5                                       \n\t"
            "vmovl.u16 q5, d6                                      \n\t"
            "vmovl.u16 q6, d7                                      \n\t"
            "vmovl.u16 q7, d8                                      \n\t"
            "vmovl.u16 q8, d9                                      \n\t"
            "vcvt.f32.u32 q9, q5                                   \n\t"
            "vcvt.f32.u32 q10, q6                                  \n\t"
            "vcvt.f32.u32 q11, q7                                  \n\t"
            "vcvt.f32.u32 q12, q8                                  \n\t"
            "vmul.f32 q13, q9, q0                                  \n\t"
            "vmul.f32 q14, q10, q0                                 \n\t"
            "vmul.f32 q15, q11, q0                                 \n\t"
            "vmul.f32 q2, q12, q0                                  \n\t"
            "vadd.f32 q3, q13, q1                                  \n\t"
            "vadd.f32 q4, q14, q1                                  \n\t"
            "vadd.f32 q5, q15, q1                                  \n\t"
            "vadd.f32 q6, q2, q1                                   \n\t"
            "vcvt.s32.f32 q7, q3                                   \n\t"
            "vcvt.s32.f32 q8, q4                                   \n\t"
            "vcvt.s32.f32 q9, q5                                   \n\t"
            "vcvt.s32.f32 q10, q6                                  \n\t"
            "vqmovn.s32 d22, q7                                    \n\t"
            "vqmovn.s32 d23, q8                                    \n\t"
            "vqmovn.s32 d24, q9                                    \n\t"
            "vqmovn.s32 d25, q10                                   \n\t"
            "vqmovn.s16 d26, q11                                   \n\t"
            "vqmovn.s16 d27, q12                                   \n\t"
            "vst1.8 {d26-d27}, [%[dst1]]                           \n\t"
            : //no output
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
270 #else
271 CVTS_FUNC(u8, s8, 16,
272     float32x4_t vscale = vdupq_n_f32((f32)alpha);
273     float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
274 {
275     for (size_t i = 0; i < w; i += 16)
276     {
277         internal::prefetch(_src + i);
278         uint8x16_t vline = vld1q_u8(_src + i);
279         uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
280         uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
281         uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
282         uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
283         uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
284         uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
285         float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
286         float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
287         float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
288         float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
289         vline1_f32 = vmulq_f32(vline1_f32, vscale);
290         vline2_f32 = vmulq_f32(vline2_f32, vscale);
291         vline3_f32 = vmulq_f32(vline3_f32, vscale);
292         vline4_f32 = vmulq_f32(vline4_f32, vscale);
293         vline1_f32 = vaddq_f32(vline1_f32, vshift);
294         vline2_f32 = vaddq_f32(vline2_f32, vshift);
295         vline3_f32 = vaddq_f32(vline3_f32, vshift);
296         vline4_f32 = vaddq_f32(vline4_f32, vshift);
297         int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
298         int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
299         int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
300         int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
301         int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
302         int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
303         vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16)));
304     }
305 })
306 #endif
307 
308 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// u8 -> u16 convertScale, ARM32 GCC path. Same widen/scale/shift/convert
// pipeline; narrowing stops at u16 (vqmovun.s32), so two 8-element q-register
// stores are issued per 16 input pixels.
CVTS_FUNC(u8, u16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // load u8x16 -> widen -> f32 -> *alpha + (beta+0.5) -> s32
        // -> unsigned saturating narrow s32 -> u16 -> two stores.
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]                              \n\t"
            "vmovl.u8 q3, d4                                       \n\t"
            "vmovl.u8 q4, d5                                       \n\t"
            "vmovl.u16 q5, d6                                      \n\t"
            "vmovl.u16 q6, d7                                      \n\t"
            "vmovl.u16 q7, d8                                      \n\t"
            "vmovl.u16 q8, d9                                      \n\t"
            "vcvt.f32.u32 q9, q5                                   \n\t"
            "vcvt.f32.u32 q10, q6                                  \n\t"
            "vcvt.f32.u32 q11, q7                                  \n\t"
            "vcvt.f32.u32 q12, q8                                  \n\t"
            "vmul.f32 q13, q9, q0                                  \n\t"
            "vmul.f32 q14, q10, q0                                 \n\t"
            "vmul.f32 q15, q11, q0                                 \n\t"
            "vmul.f32 q2, q12, q0                                  \n\t"
            "vadd.f32 q3, q13, q1                                  \n\t"
            "vadd.f32 q4, q14, q1                                  \n\t"
            "vadd.f32 q5, q15, q1                                  \n\t"
            "vadd.f32 q6, q2, q1                                   \n\t"
            "vcvt.s32.f32 q7, q3                                   \n\t"
            "vcvt.s32.f32 q8, q4                                   \n\t"
            "vcvt.s32.f32 q9, q5                                   \n\t"
            "vcvt.s32.f32 q10, q6                                  \n\t"
            "vqmovun.s32 d22, q7                                   \n\t"
            "vqmovun.s32 d23, q8                                   \n\t"
            "vqmovun.s32 d24, q9                                   \n\t"
            "vqmovun.s32 d25, q10                                  \n\t"
            "vst1.16 {d22-d23}, [%[dst1]]                          \n\t"
            "vst1.16 {d24-d25}, [%[dst2]]                          \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
355 #else
356 CVTS_FUNC(u8, u16, 16,
357     float32x4_t vscale = vdupq_n_f32((f32)alpha);
358     float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
359 {
360     for (size_t i = 0; i < w; i += 16)
361     {
362         internal::prefetch(_src + i);
363         uint8x16_t vline = vld1q_u8(_src + i);
364         uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
365         uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
366         uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
367         uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
368         uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
369         uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
370         float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
371         float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
372         float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
373         float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
374         vline1_f32 = vmulq_f32(vline1_f32, vscale);
375         vline2_f32 = vmulq_f32(vline2_f32, vscale);
376         vline3_f32 = vmulq_f32(vline3_f32, vscale);
377         vline4_f32 = vmulq_f32(vline4_f32, vscale);
378         vline1_f32 = vaddq_f32(vline1_f32, vshift);
379         vline2_f32 = vaddq_f32(vline2_f32, vshift);
380         vline3_f32 = vaddq_f32(vline3_f32, vshift);
381         vline4_f32 = vaddq_f32(vline4_f32, vshift);
382         int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
383         int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
384         int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
385         int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
386         vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));
387         vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));
388     }
389 })
390 #endif
391 
392 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
// u8 -> s16 convertScale, ARM32 GCC path. Same pipeline as u8 -> u16 but the
// final narrowing is signed (vqmovn.s32), saturating into [-32768, 32767].
CVTS_FUNC(u8, s16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // load u8x16 -> widen -> f32 -> *alpha + (beta+0.5) -> s32
        // -> signed saturating narrow s32 -> s16 -> two stores.
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]                              \n\t"
            "vmovl.u8 q3, d4                                       \n\t"
            "vmovl.u8 q4, d5                                       \n\t"
            "vmovl.u16 q5, d6                                      \n\t"
            "vmovl.u16 q6, d7                                      \n\t"
            "vmovl.u16 q7, d8                                      \n\t"
            "vmovl.u16 q8, d9                                      \n\t"
            "vcvt.f32.u32 q9, q5                                   \n\t"
            "vcvt.f32.u32 q10, q6                                  \n\t"
            "vcvt.f32.u32 q11, q7                                  \n\t"
            "vcvt.f32.u32 q12, q8                                  \n\t"
            "vmul.f32 q13, q9, q0                                  \n\t"
            "vmul.f32 q14, q10, q0                                 \n\t"
            "vmul.f32 q15, q11, q0                                 \n\t"
            "vmul.f32 q2, q12, q0                                  \n\t"
            "vadd.f32 q3, q13, q1                                  \n\t"
            "vadd.f32 q4, q14, q1                                  \n\t"
            "vadd.f32 q5, q15, q1                                  \n\t"
            "vadd.f32 q6, q2, q1                                   \n\t"
            "vcvt.s32.f32 q7, q3                                   \n\t"
            "vcvt.s32.f32 q8, q4                                   \n\t"
            "vcvt.s32.f32 q9, q5                                   \n\t"
            "vcvt.s32.f32 q10, q6                                  \n\t"
            "vqmovn.s32 d22, q7                                    \n\t"
            "vqmovn.s32 d23, q8                                    \n\t"
            "vqmovn.s32 d24, q9                                    \n\t"
            "vqmovn.s32 d25, q10                                   \n\t"
            "vst1.16 {d22-d23}, [%[dst1]]                          \n\t"
            "vst1.16 {d24-d25}, [%[dst2]]                          \n\t"
            : //no output
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
439 #else
440 CVTS_FUNC(u8, s16, 16,
441     float32x4_t vscale = vdupq_n_f32((f32)alpha);
442     float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
443 {
444     for (size_t i = 0; i < w; i += 16)
445     {
446         internal::prefetch(_src + i);
447         uint8x16_t vline = vld1q_u8(_src + i);
448         uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
449         uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
450         uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
451         uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
452         uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
453         uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
454         float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
455         float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
456         float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
457         float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
458         vline1_f32 = vmulq_f32(vline1_f32, vscale);
459         vline2_f32 = vmulq_f32(vline2_f32, vscale);
460         vline3_f32 = vmulq_f32(vline3_f32, vscale);
461         vline4_f32 = vmulq_f32(vline4_f32, vscale);
462         vline1_f32 = vaddq_f32(vline1_f32, vshift);
463         vline2_f32 = vaddq_f32(vline2_f32, vshift);
464         vline3_f32 = vaddq_f32(vline3_f32, vshift);
465         vline4_f32 = vaddq_f32(vline4_f32, vshift);
466         int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
467         int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
468         int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
469         int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
470         vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)));
471         vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)));
472     }
473 })
474 #endif
475 
476 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// u8 -> s32 convertScale, inline-asm variant. Unlike the narrower outputs,
// the four s32 q-registers are stored directly (no saturating narrow step),
// producing four 4-element stores per 16 input pixels.
// NOTE(review): this guard additionally requires GCC 4.x (< 4.7) and excludes
// clang -- presumably a workaround for an old-GCC issue with the intrinsic
// version below; confirm before changing. It also omits the defined(__arm__)
// check used by the other asm guards in this file.
CVTS_FUNC(u8, s32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // load u8x16 -> widen -> f32 -> *alpha + (beta+0.5) -> s32 -> store x4.
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]]                              \n\t"
            "vmovl.u8 q3, d4                                       \n\t"
            "vmovl.u8 q4, d5                                       \n\t"
            "vmovl.u16 q5, d6                                      \n\t"
            "vmovl.u16 q6, d7                                      \n\t"
            "vmovl.u16 q7, d8                                      \n\t"
            "vmovl.u16 q8, d9                                      \n\t"
            "vcvt.f32.u32 q9, q5                                   \n\t"
            "vcvt.f32.u32 q10, q6                                  \n\t"
            "vcvt.f32.u32 q11, q7                                  \n\t"
            "vcvt.f32.u32 q12, q8                                  \n\t"
            "vmul.f32 q13, q9, q0                                  \n\t"
            "vmul.f32 q14, q10, q0                                 \n\t"
            "vmul.f32 q15, q11, q0                                 \n\t"
            "vmul.f32 q2, q12, q0                                  \n\t"
            "vadd.f32 q3, q13, q1                                  \n\t"
            "vadd.f32 q4, q14, q1                                  \n\t"
            "vadd.f32 q5, q15, q1                                  \n\t"
            "vadd.f32 q6, q2, q1                                   \n\t"
            "vcvt.s32.f32 q7, q3                                   \n\t"
            "vcvt.s32.f32 q8, q4                                   \n\t"
            "vcvt.s32.f32 q9, q5                                   \n\t"
            "vcvt.s32.f32 q10, q6                                  \n\t"
            "vst1.32 {d14-d15}, [%[dst1]]                          \n\t"
            "vst1.32 {d16-d17}, [%[dst2]]                          \n\t"
            "vst1.32 {d18-d19}, [%[dst3]]                          \n\t"
            "vst1.32 {d20-d21}, [%[dst4]]                          \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10",
            "d11","d12","d13","d14","d15","d16","d17",
            "d18","d19","d20","d21","d22","d23","d24",
            "d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
526 #else
// Widening conversion with scaling, u8 -> s32: dst[i] = src[i] * alpha + beta,
// 16 elements per iteration. The 0.5f folded into vshift biases the truncating
// (round-toward-zero) f32->s32 conversion below so that non-negative results
// round to nearest. NOTE(review): results that end up negative (possible when
// alpha or beta is negative) are biased by the same +0.5 and truncate toward
// zero — this matches the inline-assembly variant of this kernel.
CVTS_FUNC(u8, s32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Load 16 u8 and widen in two steps (u8 -> u16 -> u32), producing four
        // 4-lane u32 vectors.
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        // Convert to f32 and apply the affine transform: v * alpha + (beta + 0.5).
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate back to s32 and store all 16 results.
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
        vst1q_s32(_dst + i + 0,  vline1_s32);
        vst1q_s32(_dst + i + 4,  vline2_s32);
        vst1q_s32(_dst + i + 8,  vline3_s32);
        vst1q_s32(_dst + i + 12, vline4_s32);
    }
})
563 #endif
564 
565 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
566 CVTS_FUNC(u8, f32, 16,
567     register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
568     register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
569 {
570     for (size_t i = 0; i < w; i += 16)
571     {
572         internal::prefetch(_src + i);
573         __asm__ (
574             "vld1.8 {d4-d5}, [%[src]]                              \n\t"
575             "vmovl.u8 q3, d4                                       \n\t"
576             "vmovl.u8 q4, d5                                       \n\t"
577             "vmovl.u16 q5, d6                                      \n\t"
578             "vmovl.u16 q6, d7                                      \n\t"
579             "vmovl.u16 q7, d8                                      \n\t"
580             "vmovl.u16 q8, d9                                      \n\t"
581             "vcvt.f32.u32 q9, q5                                   \n\t"
582             "vcvt.f32.u32 q10, q6                                  \n\t"
583             "vcvt.f32.u32 q11, q7                                  \n\t"
584             "vcvt.f32.u32 q12, q8                                  \n\t"
585             "vmul.f32 q13, q9, q0                                  \n\t"
586             "vmul.f32 q14, q10, q0                                 \n\t"
587             "vmul.f32 q15, q11, q0                                 \n\t"
588             "vmul.f32 q2, q12, q0                                  \n\t"
589             "vadd.f32 q3, q13, q1                                  \n\t"
590             "vadd.f32 q4, q14, q1                                  \n\t"
591             "vadd.f32 q5, q15, q1                                  \n\t"
592             "vadd.f32 q6, q2, q1                                   \n\t"
593             "vst1.32 {d6-d7}, [%[dst1]]                            \n\t"
594             "vst1.32 {d8-d9}, [%[dst2]]                            \n\t"
595             "vst1.32 {d10-d11}, [%[dst3]]                          \n\t"
596             "vst1.32 {d12-d13}, [%[dst4]]                          \n\t"
597             : /*no output*/
598             : [src] "r" (_src + i),
599               [dst1] "r" (_dst + i + 0),
600               [dst2] "r" (_dst + i + 4),
601               [dst3] "r" (_dst + i + 8),
602               [dst4] "r" (_dst + i + 12),
603               "w"  (vscale), "w" (vshift)
604             : "d4","d5","d6","d7","d8","d9","d10",
605             "d11","d12","d13","d14","d15","d16","d17",
606             "d18","d19","d20","d21","d22","d23","d24",
607             "d25","d26","d27","d28","d29","d30","d31"
608         );
609     }
610 })
611 #else
// Widening conversion with scaling, u8 -> f32: dst[i] = src[i] * alpha + beta,
// 16 elements per iteration. No +0.5 rounding bias here — the destination is
// floating point, so no integer rounding takes place.
CVTS_FUNC(u8, f32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Load 16 u8 and widen in two steps (u8 -> u16 -> u32).
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        // Convert to f32 and apply the affine transform: v * alpha + beta.
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Store all 16 f32 results.
        vst1q_f32(_dst + i + 0,  vline1_f32);
        vst1q_f32(_dst + i + 4,  vline2_f32);
        vst1q_f32(_dst + i + 8,  vline3_f32);
        vst1q_f32(_dst + i + 12, vline4_f32);
    }
})
644 #endif
645 
646 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
647 CVTS_FUNC(s8, u8, 16,
648     register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
649     register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
650 {
651     for (size_t i = 0; i < w; i += 16)
652     {
653         internal::prefetch(_src + i);
654         __asm__ (
655             "vld1.8 {d4-d5}, [%[src]]                              \n\t"
656             "vmovl.s8 q3, d4                                       \n\t"
657             "vmovl.s8 q4, d5                                       \n\t"
658             "vmovl.s16 q5, d6                                      \n\t"
659             "vmovl.s16 q6, d7                                      \n\t"
660             "vmovl.s16 q7, d8                                      \n\t"
661             "vmovl.s16 q8, d9                                      \n\t"
662             "vcvt.f32.s32 q9, q5                                   \n\t"
663             "vcvt.f32.s32 q10, q6                                  \n\t"
664             "vcvt.f32.s32 q11, q7                                  \n\t"
665             "vcvt.f32.s32 q12, q8                                  \n\t"
666             "vmul.f32 q13, q9, q0                                  \n\t"
667             "vmul.f32 q14, q10, q0                                 \n\t"
668             "vmul.f32 q15, q11, q0                                 \n\t"
669             "vmul.f32 q2, q12, q0                                  \n\t"
670             "vadd.f32 q3, q13, q1                                  \n\t"
671             "vadd.f32 q4, q14, q1                                  \n\t"
672             "vadd.f32 q5, q15, q1                                  \n\t"
673             "vadd.f32 q6, q2, q1                                   \n\t"
674             "vcvt.s32.f32 q7, q3                                   \n\t"
675             "vcvt.s32.f32 q8, q4                                   \n\t"
676             "vcvt.s32.f32 q9, q5                                   \n\t"
677             "vcvt.s32.f32 q10, q6                                  \n\t"
678             "vqmovun.s32 d22, q7                                   \n\t"
679             "vqmovun.s32 d23, q8                                   \n\t"
680             "vqmovun.s32 d24, q9                                   \n\t"
681             "vqmovun.s32 d25, q10                                  \n\t"
682             "vqmovn.u16 d26, q11                                   \n\t"
683             "vqmovn.u16 d27, q12                                   \n\t"
684             "vst1.8 {d26-d27}, [%[dst1]]                           \n\t"
685             : /*no output*/
686             : [src] "r" (_src + i),
687               [dst1] "r" (_dst + i + 0),
688               "w"  (vscale), "w" (vshift)
689             : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
690         );
691     }
692 })
693 #else
// Conversion with scaling, s8 -> u8: dst[i] = saturate_u8(src[i] * alpha + beta),
// 16 elements per iteration. The +0.5f folded into vshift biases the truncating
// f32->s32 conversion toward nearest; vqmovun/vqmovn then clamp to [0, 255].
CVTS_FUNC(s8, u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Load 16 s8 and sign-extend in two steps (s8 -> s16 -> s32).
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        // Convert to f32 and apply the affine transform: v * alpha + (beta + 0.5).
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate back to s32.
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        // Saturating narrow s32 -> u16 (negatives clamp to 0), then u16 -> u8
        // (clamps at 255), and store 16 u8 results.
        uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
        uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
        vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
    }
})
729 #endif
730 
731 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
732 CVTS_FUNC1(s8, 16,
733     register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
734     register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
735 {
736     for (size_t i = 0; i < w; i += 16)
737     {
738         internal::prefetch(_src + i);
739         __asm__ (
740             "vld1.8 {d4-d5}, [%[src]]                              \n\t"
741             "vmovl.s8 q3, d4                                       \n\t"
742             "vmovl.s8 q4, d5                                       \n\t"
743             "vmovl.s16 q5, d6                                      \n\t"
744             "vmovl.s16 q6, d7                                      \n\t"
745             "vmovl.s16 q7, d8                                      \n\t"
746             "vmovl.s16 q8, d9                                      \n\t"
747             "vcvt.f32.s32 q9, q5                                   \n\t"
748             "vcvt.f32.s32 q10, q6                                  \n\t"
749             "vcvt.f32.s32 q11, q7                                  \n\t"
750             "vcvt.f32.s32 q12, q8                                  \n\t"
751             "vmul.f32 q13, q9, q0                                  \n\t"
752             "vmul.f32 q14, q10, q0                                 \n\t"
753             "vmul.f32 q15, q11, q0                                 \n\t"
754             "vmul.f32 q2, q12, q0                                  \n\t"
755             "vadd.f32 q3, q13, q1                                  \n\t"
756             "vadd.f32 q4, q14, q1                                  \n\t"
757             "vadd.f32 q5, q15, q1                                  \n\t"
758             "vadd.f32 q6, q2, q1                                   \n\t"
759             "vcvt.s32.f32 q7, q3                                   \n\t"
760             "vcvt.s32.f32 q8, q4                                   \n\t"
761             "vcvt.s32.f32 q9, q5                                   \n\t"
762             "vcvt.s32.f32 q10, q6                                  \n\t"
763             "vqmovn.s32 d22, q7                                    \n\t"
764             "vqmovn.s32 d23, q8                                    \n\t"
765             "vqmovn.s32 d24, q9                                    \n\t"
766             "vqmovn.s32 d25, q10                                   \n\t"
767             "vqmovn.s16 d26, q11                                   \n\t"
768             "vqmovn.s16 d27, q12                                   \n\t"
769             "vst1.8 {d26-d27}, [%[dst1]]                           \n\t"
770             : /*no output*/
771             : [src] "r" (_src + i),
772               [dst1] "r" (_dst + i + 0),
773               "w"  (vscale), "w" (vshift)
774             : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
775         );
776     }
777 })
778 #else
// In-type conversion with scaling, s8 -> s8: dst[i] = saturate_s8(src[i] * alpha + beta),
// 16 elements per iteration. The +0.5f folded into vshift biases the truncating
// f32->s32 conversion toward nearest; vqmovn then clamps to [-128, 127].
CVTS_FUNC1(s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Load 16 s8 and sign-extend in two steps (s8 -> s16 -> s32).
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        // Convert to f32 and apply the affine transform: v * alpha + (beta + 0.5).
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate back to s32.
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        // Saturating narrow s32 -> s16 -> s8 and store 16 s8 results.
        int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
        int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
        vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));
    }
})
814 #endif
815 
816 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
817 CVTS_FUNC(s8, u16, 16,
818     register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
819     register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
820 {
821     for (size_t i = 0; i < w; i += 16)
822     {
823         internal::prefetch(_src + i);
824         __asm__ (
825             "vld1.8 {d4-d5}, [%[src]]                              \n\t"
826             "vmovl.s8 q3, d4                                       \n\t"
827             "vmovl.s8 q4, d5                                       \n\t"
828             "vmovl.s16 q5, d6                                      \n\t"
829             "vmovl.s16 q6, d7                                      \n\t"
830             "vmovl.s16 q7, d8                                      \n\t"
831             "vmovl.s16 q8, d9                                      \n\t"
832             "vcvt.f32.s32 q9, q5                                   \n\t"
833             "vcvt.f32.s32 q10, q6                                  \n\t"
834             "vcvt.f32.s32 q11, q7                                  \n\t"
835             "vcvt.f32.s32 q12, q8                                  \n\t"
836             "vmul.f32 q13, q9, q0                                  \n\t"
837             "vmul.f32 q14, q10, q0                                 \n\t"
838             "vmul.f32 q15, q11, q0                                 \n\t"
839             "vmul.f32 q2, q12, q0                                  \n\t"
840             "vadd.f32 q3, q13, q1                                  \n\t"
841             "vadd.f32 q4, q14, q1                                  \n\t"
842             "vadd.f32 q5, q15, q1                                  \n\t"
843             "vadd.f32 q6, q2, q1                                   \n\t"
844             "vcvt.s32.f32 q7, q3                                   \n\t"
845             "vcvt.s32.f32 q8, q4                                   \n\t"
846             "vcvt.s32.f32 q9, q5                                   \n\t"
847             "vcvt.s32.f32 q10, q6                                  \n\t"
848             "vqmovun.s32 d22, q7                                   \n\t"
849             "vqmovun.s32 d23, q8                                   \n\t"
850             "vqmovun.s32 d24, q9                                   \n\t"
851             "vqmovun.s32 d25, q10                                  \n\t"
852             "vst1.16 {d22-d23}, [%[dst1]]                          \n\t"
853             "vst1.16 {d24-d25}, [%[dst2]]                          \n\t"
854             : /*no output*/
855             : [src] "r" (_src + i),
856               [dst1] "r" (_dst + i + 0),
857               [dst2] "r" (_dst + i + 8),
858               "w"  (vscale), "w" (vshift)
859             : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
860         );
861     }
862 })
863 #else
// Conversion with scaling, s8 -> u16: dst[i] = saturate_u16(src[i] * alpha + beta),
// 16 elements per iteration. The +0.5f folded into vshift biases the truncating
// f32->s32 conversion toward nearest; vqmovun clamps negatives to 0.
CVTS_FUNC(s8, u16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Load 16 s8 and sign-extend in two steps (s8 -> s16 -> s32).
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        // Convert to f32 and apply the affine transform: v * alpha + (beta + 0.5).
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate back to s32.
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        // Saturating narrow s32 -> u16 and store 16 u16 results.
        uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
        uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
        vst1q_u16(_dst + i + 0, vRes1_u16);
        vst1q_u16(_dst + i + 8, vRes2_u16);
    }
})
900 #endif
901 
902 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
903 CVTS_FUNC(s8, s16, 16,
904     register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
905     register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
906 {
907     for (size_t i = 0; i < w; i += 16)
908     {
909         internal::prefetch(_src + i);
910         __asm__ (
911             "vld1.8 {d4-d5}, [%[src]]                              \n\t"
912             "vmovl.s8 q3, d4                                       \n\t"
913             "vmovl.s8 q4, d5                                       \n\t"
914             "vmovl.s16 q5, d6                                      \n\t"
915             "vmovl.s16 q6, d7                                      \n\t"
916             "vmovl.s16 q7, d8                                      \n\t"
917             "vmovl.s16 q8, d9                                      \n\t"
918             "vcvt.f32.s32 q9, q5                                   \n\t"
919             "vcvt.f32.s32 q10, q6                                  \n\t"
920             "vcvt.f32.s32 q11, q7                                  \n\t"
921             "vcvt.f32.s32 q12, q8                                  \n\t"
922             "vmul.f32 q13, q9, q0                                  \n\t"
923             "vmul.f32 q14, q10, q0                                 \n\t"
924             "vmul.f32 q15, q11, q0                                 \n\t"
925             "vmul.f32 q2, q12, q0                                  \n\t"
926             "vadd.f32 q3, q13, q1                                  \n\t"
927             "vadd.f32 q4, q14, q1                                  \n\t"
928             "vadd.f32 q5, q15, q1                                  \n\t"
929             "vadd.f32 q6, q2, q1                                   \n\t"
930             "vcvt.s32.f32 q7, q3                                   \n\t"
931             "vcvt.s32.f32 q8, q4                                   \n\t"
932             "vcvt.s32.f32 q9, q5                                   \n\t"
933             "vcvt.s32.f32 q10, q6                                  \n\t"
934             "vqmovn.s32 d22, q7                                    \n\t"
935             "vqmovn.s32 d23, q8                                    \n\t"
936             "vqmovn.s32 d24, q9                                    \n\t"
937             "vqmovn.s32 d25, q10                                   \n\t"
938             "vst1.16 {d22-d23}, [%[dst1]]                          \n\t"
939             "vst1.16 {d24-d25}, [%[dst2]]                          \n\t"
940             : /*no output*/
941             : [src] "r" (_src + i),
942               [dst1] "r" (_dst + i + 0),
943               [dst2] "r" (_dst + i + 8),
944               "w"  (vscale), "w" (vshift)
945             : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
946         );
947     }
948 })
949 #else
// Conversion with scaling, s8 -> s16: dst[i] = saturate_s16(src[i] * alpha + beta),
// 16 elements per iteration. The +0.5f folded into vshift biases the truncating
// f32->s32 conversion toward nearest; vqmovn clamps to [-32768, 32767].
CVTS_FUNC(s8, s16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Load 16 s8 and sign-extend in two steps (s8 -> s16 -> s32).
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        // Convert to f32 and apply the affine transform: v * alpha + (beta + 0.5).
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate back to s32.
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        // Saturating narrow s32 -> s16 and store 16 s16 results.
        int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
        int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
        vst1q_s16(_dst + i + 0, vRes1_s16);
        vst1q_s16(_dst + i + 8, vRes2_s16);
    }
})
986 #endif
987 
988 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
989 CVTS_FUNC(s8, s32, 16,
990     register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
991     register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
992 {
993     for (size_t i = 0; i < w; i += 16)
994     {
995         internal::prefetch(_src + i);
996         __asm__ (
997             "vld1.8 {d4-d5}, [%[src]]                              \n\t"
998             "vmovl.s8 q3, d4                                       \n\t"
999             "vmovl.s8 q4, d5                                       \n\t"
1000             "vmovl.s16 q5, d6                                      \n\t"
1001             "vmovl.s16 q6, d7                                      \n\t"
1002             "vmovl.s16 q7, d8                                      \n\t"
1003             "vmovl.s16 q8, d9                                      \n\t"
1004             "vcvt.f32.s32 q9, q5                                   \n\t"
1005             "vcvt.f32.s32 q10, q6                                  \n\t"
1006             "vcvt.f32.s32 q11, q7                                  \n\t"
1007             "vcvt.f32.s32 q12, q8                                  \n\t"
1008             "vmul.f32 q13, q9, q0                                  \n\t"
1009             "vmul.f32 q14, q10, q0                                 \n\t"
1010             "vmul.f32 q15, q11, q0                                 \n\t"
1011             "vmul.f32 q2, q12, q0                                  \n\t"
1012             "vadd.f32 q3, q13, q1                                  \n\t"
1013             "vadd.f32 q4, q14, q1                                  \n\t"
1014             "vadd.f32 q5, q15, q1                                  \n\t"
1015             "vadd.f32 q6, q2, q1                                   \n\t"
1016             "vcvt.s32.f32 q7, q3                                   \n\t"
1017             "vcvt.s32.f32 q8, q4                                   \n\t"
1018             "vcvt.s32.f32 q9, q5                                   \n\t"
1019             "vcvt.s32.f32 q10, q6                                  \n\t"
1020             "vst1.32 {d14-d15}, [%[dst1]]                          \n\t"
1021             "vst1.32 {d16-d17}, [%[dst2]]                          \n\t"
1022             "vst1.32 {d18-d19}, [%[dst3]]                          \n\t"
1023             "vst1.32 {d20-d21}, [%[dst4]]                          \n\t"
1024             : /*no output*/
1025             : [src] "r" (_src + i),
1026               [dst1] "r" (_dst + i + 0),
1027               [dst2] "r" (_dst + i + 4),
1028               [dst3] "r" (_dst + i + 8),
1029               [dst4] "r" (_dst + i + 12),
1030               "w"  (vscale), "w" (vshift)
1031             : "d4","d5","d6","d7","d8","d9","d10",
1032             "d11","d12","d13","d14","d15","d16","d17",
1033             "d18","d19","d20","d21","d22","d23","d24",
1034             "d25","d26","d27","d28","d29","d30","d31"
1035         );
1036     }
1037 })
1038 #else
// Widening conversion with scaling, s8 -> s32: dst[i] = src[i] * alpha + beta,
// 16 elements per iteration. The 0.5f folded into vshift biases the truncating
// (round-toward-zero) f32->s32 conversion so that non-negative results round to
// nearest; negative results truncate toward zero — matches the asm variant.
CVTS_FUNC(s8, s32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // Load 16 s8 and sign-extend in two steps (s8 -> s16 -> s32).
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        // Convert to f32 and apply the affine transform: v * alpha + (beta + 0.5).
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        // Truncate back to s32 (no narrowing needed for an s32 destination).
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        // Store all 16 s32 results.
        vst1q_s32(_dst + i + 0,  vline1_s32);
        vst1q_s32(_dst + i + 4,  vline2_s32);
        vst1q_s32(_dst + i + 8,  vline3_s32);
        vst1q_s32(_dst + i + 12, vline4_s32);
    }
})
1075 #endif
1076 
1077 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// s8 -> f32 convert-scale: dst[i] = (f32)src[i] * alpha + beta, 16 elements per step.
// Hand-written inline-asm path for GCC 4.x < 4.7 (see the #if guard above);
// vscale/vshift are pinned to q0/q1 so the asm body can reference them directly.
CVTS_FUNC(s8, f32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            // load 16 s8 lanes and widen s8 -> s16 -> s32 (four quads q5..q8)
            "vld1.8 {d4-d5}, [%[src]]                              \n\t"
            "vmovl.s8 q3, d4                                       \n\t"
            "vmovl.s8 q4, d5                                       \n\t"
            "vmovl.s16 q5, d6                                      \n\t"
            "vmovl.s16 q6, d7                                      \n\t"
            "vmovl.s16 q7, d8                                      \n\t"
            "vmovl.s16 q8, d9                                      \n\t"
            // s32 -> f32
            "vcvt.f32.s32 q9, q5                                   \n\t"
            "vcvt.f32.s32 q10, q6                                  \n\t"
            "vcvt.f32.s32 q11, q7                                  \n\t"
            "vcvt.f32.s32 q12, q8                                  \n\t"
            // * alpha (q0), then + beta (q1)
            "vmul.f32 q13, q9, q0                                  \n\t"
            "vmul.f32 q14, q10, q0                                 \n\t"
            "vmul.f32 q15, q11, q0                                 \n\t"
            "vmul.f32 q2, q12, q0                                  \n\t"
            "vadd.f32 q3, q13, q1                                  \n\t"
            "vadd.f32 q4, q14, q1                                  \n\t"
            "vadd.f32 q5, q15, q1                                  \n\t"
            "vadd.f32 q6, q2, q1                                   \n\t"
            // store four f32 quads
            "vst1.32 {d6-d7}, [%[dst1]]                            \n\t"
            "vst1.32 {d8-d9}, [%[dst2]]                            \n\t"
            "vst1.32 {d10-d11}, [%[dst3]]                          \n\t"
            "vst1.32 {d12-d13}, [%[dst4]]                          \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10",
            "d11","d12","d13","d14","d15","d16","d17",
            "d18","d19","d20","d21","d22","d23","d24",
            "d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
1123 #else
// s8 -> f32 convert-scale (intrinsic path): dst[i] = (f32)src[i] * alpha + beta,
// 16 elements per iteration.
CVTS_FUNC(s8, f32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        // widen 16 s8 lanes to four s32 quads, convert to f32
        const int8x16_t v = vld1q_s8(_src + i);
        const int16x8_t lo16 = vmovl_s8(vget_low_s8 (v));
        const int16x8_t hi16 = vmovl_s8(vget_high_s8(v));
        float32x4_t f0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (lo16)));
        float32x4_t f1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(lo16)));
        float32x4_t f2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (hi16)));
        float32x4_t f3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(hi16)));
        // scale by alpha, shift by beta, store as f32
        vst1q_f32(_dst + i + 0,  vaddq_f32(vmulq_f32(f0, vscale), vshift));
        vst1q_f32(_dst + i + 4,  vaddq_f32(vmulq_f32(f1, vscale), vshift));
        vst1q_f32(_dst + i + 8,  vaddq_f32(vmulq_f32(f2, vscale), vshift));
        vst1q_f32(_dst + i + 12, vaddq_f32(vmulq_f32(f3, vscale), vshift));
    }
})
1156 #endif
1157 
1158 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> u8 saturating convert-scale: dst = sat_u8(src * alpha + beta). beta is
// pre-biased by +0.5f so the truncating vcvt rounds to nearest for non-negative
// results. Inline-asm path for GCC 4.x < 4.7; vscale/vshift pinned to q0/q1.
CVTS_FUNC(u16, u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            // load 8 u16, widen to u32, convert to f32
            "vld1.8 {d4-d5}, [%[src1]]                             \n\t"
            "vmovl.u16 q3, d4                                      \n\t"
            "vmovl.u16 q4, d5                                      \n\t"
            "vcvt.f32.u32 q5, q3                                   \n\t"
            "vcvt.f32.u32 q6, q4                                   \n\t"
            // * alpha, + (beta + 0.5)
            "vmul.f32 q7, q5, q0                                   \n\t"
            "vmul.f32 q8, q6, q0                                   \n\t"
            "vadd.f32 q9, q7, q1                                   \n\t"
            "vadd.f32 q10, q8, q1                                  \n\t"
            // f32 -> s32 (truncate), saturating narrow s32 -> s16 -> u8
            "vcvt.s32.f32 q11, q9                                  \n\t"
            "vcvt.s32.f32 q12, q10                                 \n\t"
            "vqmovn.s32 d26, q11                                   \n\t"
            "vqmovn.s32 d27, q12                                   \n\t"
            "vqmovun.s16 d28, q13                                  \n\t"
             "vst1.8 {d28}, [%[dst]]                               \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
               "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
1190 #else
// u16 -> u8 saturating convert-scale (intrinsic path):
// dst = sat_u8(sat_s16((s32)(src * alpha + beta + 0.5))), 8 elements per iteration.
CVTS_FUNC(u16, u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        const uint16x8_t v = vld1q_u16(_src + i);
        // widen each half to u32, convert to f32, then scale and shift
        float32x4_t lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (v)));
        float32x4_t hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v)));
        lo = vaddq_f32(vmulq_f32(lo, vscale), vshift);
        hi = vaddq_f32(vmulq_f32(hi, vscale), vshift);
        // truncate to s32, saturating narrow s32 -> s16 -> u8
        const int16x4_t nlo = vqmovn_s32(vcvtq_s32_f32(lo));
        const int16x4_t nhi = vqmovn_s32(vcvtq_s32_f32(hi));
        vst1_u8(_dst + i, vqmovun_s16(vcombine_s16(nlo, nhi)));
    }
})
1215 #endif
1216 
1217 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> s8 saturating convert-scale: dst = sat_s8(src * alpha + beta). beta is
// pre-biased by +0.5f so the truncating vcvt rounds to nearest for non-negative
// results. Inline-asm path for GCC 4.x < 4.7; vscale/vshift pinned to q0/q1.
CVTS_FUNC(u16, s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            // load 8 u16, widen to u32, convert to f32
            "vld1.8 {d4-d5}, [%[src1]]                             \n\t"
            "vmovl.u16 q3, d4                                      \n\t"
            "vmovl.u16 q4, d5                                      \n\t"
            "vcvt.f32.u32 q5, q3                                   \n\t"
            "vcvt.f32.u32 q6, q4                                   \n\t"
            // * alpha, + (beta + 0.5)
            "vmul.f32 q7, q5, q0                                   \n\t"
            "vmul.f32 q8, q6, q0                                   \n\t"
            "vadd.f32 q9, q7, q1                                   \n\t"
            "vadd.f32 q10, q8, q1                                  \n\t"
            // f32 -> s32 (truncate), saturating narrow s32 -> s16 -> s8
            "vcvt.s32.f32 q11, q9                                  \n\t"
            "vcvt.s32.f32 q12, q10                                 \n\t"
            "vqmovn.s32 d26, q11                                   \n\t"
            "vqmovn.s32 d27, q12                                   \n\t"
            "vqmovn.s16 d28, q13                                   \n\t"
            "vst1.8 {d28}, [%[dst]]                                \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
               "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
1249 #else
// u16 -> s8 saturating convert-scale (intrinsic path):
// dst = sat_s8(sat_s16((s32)(src * alpha + beta + 0.5))), 8 elements per iteration.
CVTS_FUNC(u16, s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        const uint16x8_t v = vld1q_u16(_src + i);
        // widen each half to u32, convert to f32, then scale and shift
        float32x4_t lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (v)));
        float32x4_t hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v)));
        lo = vaddq_f32(vmulq_f32(lo, vscale), vshift);
        hi = vaddq_f32(vmulq_f32(hi, vscale), vshift);
        // truncate to s32, saturating narrow s32 -> s16 -> s8
        const int16x4_t nlo = vqmovn_s32(vcvtq_s32_f32(lo));
        const int16x4_t nhi = vqmovn_s32(vcvtq_s32_f32(hi));
        vst1_s8(_dst + i, vqmovn_s16(vcombine_s16(nlo, nhi)));
    }
})
1274 #endif
1275 
1276 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> u16 saturating convert-scale: dst = sat_u16(src * alpha + beta). beta is
// pre-biased by +0.5f so the truncating vcvt rounds to nearest for non-negative
// results. Inline-asm path for GCC 4.x < 4.7; vscale/vshift pinned to q0/q1.
CVTS_FUNC1(u16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            // load 8 u16, widen to u32, convert to f32
            "vld1.16 {d4-d5}, [%[src]]                              \n\t"
            "vmovl.u16 q3, d4                                       \n\t"
            "vmovl.u16 q4, d5                                       \n\t"
            "vcvt.f32.u32 q5, q3                                    \n\t"
            "vcvt.f32.u32 q6, q4                                    \n\t"
            // * alpha, + (beta + 0.5)
            "vmul.f32 q7, q5, q0                                    \n\t"
            "vmul.f32 q8, q6, q0                                    \n\t"
            "vadd.f32 q9, q7, q1                                    \n\t"
            "vadd.f32 q10, q8, q1                                   \n\t"
            // f32 -> s32 (truncate), saturating narrow s32 -> u16
            "vcvt.s32.f32 q11, q9                                   \n\t"
            "vcvt.s32.f32 q12, q10                                  \n\t"
            "vqmovun.s32 d26, q11                                   \n\t"
            "vqmovun.s32 d27, q12                                   \n\t"
            "vst1.16 {d26-d27}, [%[dst]]                            \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vshift), "w" (vscale)
            // FIX: d4/d5 (q2) are overwritten by the vld1 above but were missing
            // from the clobber list, so the compiler could keep a live value in q2
            // across the asm block.
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
1307 #else
// u16 -> u16 saturating convert-scale (intrinsic path):
// dst = sat_u16((s32)(src * alpha + beta + 0.5)), 8 elements per iteration.
CVTS_FUNC1(u16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        const uint16x8_t v = vld1q_u16(_src + i);
        // widen each half to u32, convert to f32, then scale and shift
        float32x4_t lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (v)));
        float32x4_t hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v)));
        lo = vaddq_f32(vmulq_f32(lo, vscale), vshift);
        hi = vaddq_f32(vmulq_f32(hi, vscale), vshift);
        // truncate to s32, saturating narrow s32 -> u16
        const uint16x4_t nlo = vqmovun_s32(vcvtq_s32_f32(lo));
        const uint16x4_t nhi = vqmovun_s32(vcvtq_s32_f32(hi));
        vst1q_u16(_dst + i, vcombine_u16(nlo, nhi));
    }
})
1331 #endif
1332 
1333 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> s16 saturating convert-scale: dst = sat_s16(src * alpha + beta). beta is
// pre-biased by +0.5f so the truncating vcvt rounds to nearest for non-negative
// results. Inline-asm path for GCC 4.x < 4.7; vscale/vshift pinned to q0/q1.
CVTS_FUNC(u16, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            // load 8 u16, widen to u32, convert to f32
            "vld1.16 {d4-d5}, [%[src]]                              \n\t"
            "vmovl.u16 q3, d4                                       \n\t"
            "vmovl.u16 q4, d5                                       \n\t"
            "vcvt.f32.u32 q5, q3                                    \n\t"
            "vcvt.f32.u32 q6, q4                                    \n\t"
            // * alpha, + (beta + 0.5)
            "vmul.f32 q7, q5, q0                                    \n\t"
            "vmul.f32 q8, q6, q0                                    \n\t"
            "vadd.f32 q9, q7, q1                                    \n\t"
            "vadd.f32 q10, q8, q1                                   \n\t"
            // f32 -> s32 (truncate), saturating narrow s32 -> s16
            "vcvt.s32.f32 q11, q9                                   \n\t"
            "vcvt.s32.f32 q12, q10                                  \n\t"
            "vqmovn.s32 d26, q11                                    \n\t"
            "vqmovn.s32 d27, q12                                    \n\t"
            "vst1.16 {d26-d27}, [%[dst]]                            \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vshift), "w" (vscale)
            // FIX: d4/d5 (q2) are overwritten by the vld1 above but were missing
            // from the clobber list, so the compiler could keep a live value in q2
            // across the asm block.
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
1364 #else
// u16 -> s16 saturating convert-scale (intrinsic path):
// dst = sat_s16((s32)(src * alpha + beta + 0.5)), 8 elements per iteration.
CVTS_FUNC(u16, s16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        const uint16x8_t v = vld1q_u16(_src + i);
        // widen each half to u32, convert to f32, then scale and shift
        float32x4_t lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (v)));
        float32x4_t hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v)));
        lo = vaddq_f32(vmulq_f32(lo, vscale), vshift);
        hi = vaddq_f32(vmulq_f32(hi, vscale), vshift);
        // truncate to s32, saturating narrow s32 -> s16
        const int16x4_t nlo = vqmovn_s32(vcvtq_s32_f32(lo));
        const int16x4_t nhi = vqmovn_s32(vcvtq_s32_f32(hi));
        vst1q_s16(_dst + i, vcombine_s16(nlo, nhi));
    }
})
1388 #endif
1389 
1390 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> s32 convert-scale: dst = (s32)(src * alpha + beta + 0.5) (truncating vcvt,
// so the +0.5f bias rounds to nearest for non-negative results).
// Inline-asm path for GCC 4.x < 4.7; vscale/vshift pinned to q0/q1.
CVTS_FUNC(u16, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            // load 8 u16, widen to u32, convert to f32
            "vld1.16 {d4-d5}, [%[src]]                        \n\t"
            "vmovl.u16 q3, d4                                 \n\t"
            "vmovl.u16 q4, d5                                 \n\t"
            "vcvt.f32.u32 q5, q3                              \n\t"
            "vcvt.f32.u32 q6, q4                              \n\t"
            // * alpha, + (beta + 0.5)
            "vmul.f32 q7, q5, q0                              \n\t"
            "vmul.f32 q8, q6, q0                              \n\t"
            "vadd.f32 q9, q7, q1                              \n\t"
            "vadd.f32 q10, q8, q1                             \n\t"
            // f32 -> s32 (truncate) and store both quads
            "vcvt.s32.f32 q11, q9                             \n\t"
            "vcvt.s32.f32 q12, q10                            \n\t"
            "vst1.32 {d22-d23}, [%[dst1]]                     \n\t"
            "vst1.32 {d24-d25}, [%[dst2]]                     \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vshift), "w" (vscale)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
1421 #else
// u16 -> s32 convert-scale (intrinsic path):
// dst = (s32)(src * alpha + beta + 0.5), 8 elements per iteration.
CVTS_FUNC(u16, s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        const uint16x8_t v = vld1q_u16(_src + i);
        // widen each half to u32, convert to f32, then scale and shift
        float32x4_t lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (v)));
        float32x4_t hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v)));
        lo = vaddq_f32(vmulq_f32(lo, vscale), vshift);
        hi = vaddq_f32(vmulq_f32(hi, vscale), vshift);
        // truncate to s32 and store both quads
        vst1q_s32(_dst + i + 0, vcvtq_s32_f32(lo));
        vst1q_s32(_dst + i + 4, vcvtq_s32_f32(hi));
    }
})
1444 #endif
1445 
1446 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// u16 -> f32 convert-scale: dst[i] = (f32)src[i] * alpha + beta, 8 elements per step
// (no rounding bias since the destination is floating point).
// Inline-asm path for GCC 4.x < 4.7; vscale/vshift pinned to q0/q1.
CVTS_FUNC(u16, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            // load 8 u16, widen to u32, convert to f32
            "vld1.16 {d4-d5}, [%[src]]                              \n\t"
            "vmovl.u16 q3, d4                                       \n\t"
            "vmovl.u16 q4, d5                                       \n\t"
             "vcvt.f32.u32 q5, q3                                    \n\t"
            "vcvt.f32.u32 q6, q4                                    \n\t"
            // * alpha, + beta, store both f32 quads
            "vmul.f32 q7, q5, q0                                    \n\t"
            "vmul.f32 q8, q6, q0                                    \n\t"
            "vadd.f32 q9, q7, q1                                    \n\t"
            "vadd.f32 q10, q8, q1                                   \n\t"
            "vst1.32 {d18-d19}, [%[dst1]]                           \n\t"
            "vst1.32 {d20-d21}, [%[dst2]]                           \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
        );
    }
})
1475 #else
// u16 -> f32 convert-scale (intrinsic path):
// dst[i] = (f32)src[i] * alpha + beta, 8 elements per iteration.
CVTS_FUNC(u16, f32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        const uint16x8_t v = vld1q_u16(_src + i);
        // widen each half to u32, convert to f32
        const float32x4_t lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (v)));
        const float32x4_t hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v)));
        // scale by alpha, shift by beta, store as f32
        vst1q_f32(_dst + i + 0, vaddq_f32(vmulq_f32(lo, vscale), vshift));
        vst1q_f32(_dst + i + 4, vaddq_f32(vmulq_f32(hi, vscale), vshift));
    }
})
1496 #endif
1497 
1498 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> u8 saturating convert-scale: dst = sat_u8(src * alpha + beta). beta is
// pre-biased by +0.5f so the truncating vcvt rounds to nearest for non-negative
// results. Inline-asm path for GCC 4.x < 4.7; vscale/vshift pinned to q0/q1.
CVTS_FUNC(s16, u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            // load 8 s16, widen to s32, convert to f32
            "vld1.8 {d4-d5}, [%[src1]]                             \n\t"
            "vmovl.s16 q3, d4                                      \n\t"
            "vmovl.s16 q4, d5                                      \n\t"
            "vcvt.f32.s32 q5, q3                                   \n\t"
            "vcvt.f32.s32 q6, q4                                   \n\t"
            // * alpha, + (beta + 0.5)
            "vmul.f32 q7, q5, q0                                   \n\t"
            "vmul.f32 q8, q6, q0                                   \n\t"
            "vadd.f32 q9, q7, q1                                   \n\t"
            "vadd.f32 q10, q8, q1                                  \n\t"
            // f32 -> s32 (truncate), saturating narrow s32 -> s16 -> u8
            "vcvt.s32.f32 q11, q9                                  \n\t"
            "vcvt.s32.f32 q12, q10                                 \n\t"
            "vqmovn.s32 d26, q11                                   \n\t"
            "vqmovn.s32 d27, q12                                   \n\t"
            "vqmovun.s16 d28, q13                                  \n\t"
            "vst1.8 {d28}, [%[dst]]                                \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
               "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
1530 #else
// s16 -> u8 saturating convert-scale (intrinsic path):
// dst = sat_u8(sat_s16((s32)(src * alpha + beta + 0.5))), 8 elements per iteration.
CVTS_FUNC(s16, u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        const int16x8_t v = vld1q_s16(_src + i);
        // widen each half to s32, convert to f32, then scale and shift
        float32x4_t lo = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (v)));
        float32x4_t hi = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v)));
        lo = vaddq_f32(vmulq_f32(lo, vscale), vshift);
        hi = vaddq_f32(vmulq_f32(hi, vscale), vshift);
        // truncate to s32, saturating narrow s32 -> s16 -> u8
        const int16x4_t nlo = vqmovn_s32(vcvtq_s32_f32(lo));
        const int16x4_t nhi = vqmovn_s32(vcvtq_s32_f32(hi));
        vst1_u8(_dst + i, vqmovun_s16(vcombine_s16(nlo, nhi)));
    }
})
1555 #endif
1556 
1557 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> s8 saturating convert-scale: dst = sat_s8(src * alpha + beta). beta is
// pre-biased by +0.5f so the truncating vcvt rounds to nearest for non-negative
// results. Inline-asm path for GCC 4.x < 4.7; vscale/vshift pinned to q0/q1.
CVTS_FUNC(s16, s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            // load 8 s16, widen to s32, convert to f32
            "vld1.8 {d4-d5}, [%[src1]]                             \n\t"
            "vmovl.s16 q3, d4                                      \n\t"
            "vmovl.s16 q4, d5                                      \n\t"
            "vcvt.f32.s32 q5, q3                                   \n\t"
            "vcvt.f32.s32 q6, q4                                   \n\t"
            // * alpha, + (beta + 0.5)
            "vmul.f32 q7, q5, q0                                   \n\t"
            "vmul.f32 q8, q6, q0                                   \n\t"
            "vadd.f32 q9, q7, q1                                   \n\t"
            "vadd.f32 q10, q8, q1                                  \n\t"
            // f32 -> s32 (truncate), saturating narrow s32 -> s16 -> s8
            "vcvt.s32.f32 q11, q9                                  \n\t"
            "vcvt.s32.f32 q12, q10                                 \n\t"
            "vqmovn.s32 d26, q11                                   \n\t"
            "vqmovn.s32 d27, q12                                   \n\t"
            "vqmovn.s16 d28, q13                                   \n\t"
            "vst1.8 {d28}, [%[dst]]                                \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
               "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
1589 #else
// s16 -> s8 saturating convert-scale (intrinsic path):
// dst = sat_s8(sat_s16((s32)(src * alpha + beta + 0.5))), 8 elements per iteration.
CVTS_FUNC(s16, s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        const int16x8_t v = vld1q_s16(_src + i);
        // widen each half to s32, convert to f32, then scale and shift
        float32x4_t lo = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (v)));
        float32x4_t hi = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v)));
        lo = vaddq_f32(vmulq_f32(lo, vscale), vshift);
        hi = vaddq_f32(vmulq_f32(hi, vscale), vshift);
        // truncate to s32, saturating narrow s32 -> s16 -> s8
        const int16x4_t nlo = vqmovn_s32(vcvtq_s32_f32(lo));
        const int16x4_t nhi = vqmovn_s32(vcvtq_s32_f32(hi));
        vst1_s8(_dst + i, vqmovn_s16(vcombine_s16(nlo, nhi)));
    }
})
1614 #endif
1615 
1616 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> u16 saturating convert-scale: dst = sat_u16(src * alpha + beta). beta is
// pre-biased by +0.5f so the truncating vcvt rounds to nearest for non-negative
// results. Inline-asm path for GCC 4.x < 4.7; vscale/vshift pinned to q0/q1.
CVTS_FUNC(s16, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            // load 8 s16, widen to s32, convert to f32
            "vld1.16 {d4-d5}, [%[src]]                              \n\t"
            "vmovl.s16 q3, d4                                       \n\t"
            "vmovl.s16 q4, d5                                       \n\t"
            "vcvt.f32.s32 q5, q3                                    \n\t"
            "vcvt.f32.s32 q6, q4                                    \n\t"
            // * alpha, + (beta + 0.5)
            "vmul.f32 q7, q5, q0                                    \n\t"
            "vmul.f32 q8, q6, q0                                    \n\t"
            "vadd.f32 q9, q7, q1                                    \n\t"
            "vadd.f32 q10, q8, q1                                   \n\t"
            // f32 -> s32 (truncate), saturating narrow s32 -> u16
            "vcvt.s32.f32 q11, q9                                   \n\t"
            "vcvt.s32.f32 q12, q10                                  \n\t"
            "vqmovun.s32 d26, q11                                   \n\t"
            "vqmovun.s32 d27, q12                                   \n\t"
            "vst1.16 {d26-d27}, [%[dst]]                            \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
1647 #else
// s16 -> u16 saturating convert-scale (intrinsic path):
// dst = sat_u16((s32)(src * alpha + beta + 0.5)), 8 elements per iteration.
CVTS_FUNC(s16, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        const int16x8_t v = vld1q_s16(_src + i);
        // widen each half to s32, convert to f32, then scale and shift
        float32x4_t lo = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (v)));
        float32x4_t hi = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v)));
        lo = vaddq_f32(vmulq_f32(lo, vscale), vshift);
        hi = vaddq_f32(vmulq_f32(hi, vscale), vshift);
        // truncate to s32, saturating narrow s32 -> u16
        const uint16x4_t nlo = vqmovun_s32(vcvtq_s32_f32(lo));
        const uint16x4_t nhi = vqmovun_s32(vcvtq_s32_f32(hi));
        vst1q_u16(_dst + i, vcombine_u16(nlo, nhi));
    }
})
1671 #endif
1672 
1673 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// s16 -> s16 saturating convert-scale: dst = sat_s16(src * alpha + beta). beta is
// pre-biased by +0.5f so the truncating vcvt rounds to nearest for non-negative
// results. Inline-asm path for GCC 4.x < 4.7; vscale/vshift pinned to q0/q1.
CVTS_FUNC1(s16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            // load 8 s16, widen to s32, convert to f32
            "vld1.16 {d4-d5}, [%[src]]                              \n\t"
            "vmovl.s16 q3, d4                                       \n\t"
            "vmovl.s16 q4, d5                                       \n\t"
            "vcvt.f32.s32 q5, q3                                    \n\t"
            "vcvt.f32.s32 q6, q4                                    \n\t"
            // * alpha, + (beta + 0.5)
            "vmul.f32 q7, q5, q0                                    \n\t"
            "vmul.f32 q8, q6, q0                                    \n\t"
            "vadd.f32 q9, q7, q1                                    \n\t"
            "vadd.f32 q10, q8, q1                                   \n\t"
            // f32 -> s32 (truncate), saturating narrow s32 -> s16
            "vcvt.s32.f32 q11, q9                                   \n\t"
            "vcvt.s32.f32 q12, q10                                  \n\t"
            "vqmovn.s32 d26, q11                                    \n\t"
            "vqmovn.s32 d27, q12                                    \n\t"
            "vst1.16 {d26-d27}, [%[dst]]                            \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vshift), "w" (vscale)
            // FIX: d4/d5 (q2) are overwritten by the vld1 above but were missing
            // from the clobber list, so the compiler could keep a live value in q2
            // across the asm block.
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
1704 #else
1705 CVTS_FUNC1(s16, 16,
1706     float32x4_t vscale = vdupq_n_f32((f32)alpha);
1707     float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
1708 {
1709     for (size_t i = 0; i < w; i += 8)
1710     {
1711         internal::prefetch(_src + i);
1712         int16x8_t vline = vld1q_s16(_src + i);
1713         int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
1714         int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
1715         float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
1716         float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
1717         vline1_f32 = vmulq_f32(vline1_f32, vscale);
1718         vline2_f32 = vmulq_f32(vline2_f32, vscale);
1719         vline1_f32 = vaddq_f32(vline1_f32, vshift);
1720         vline2_f32 = vaddq_f32(vline2_f32, vshift);
1721         vline1_s32 = vcvtq_s32_f32(vline1_f32);
1722         vline2_s32 = vcvtq_s32_f32(vline2_f32);
1723         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
1724         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
1725         vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
1726     }
1727 })
1728 #endif
1729 
1730 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// convertScale s16 -> s32: dst = (s32)(src * alpha + beta), +0.5f rounding bias.
// GCC 4.x (< 4.7) workaround path: hand-written NEON asm. vscale/vshift are pinned
// to q0/q1 so the "w" input operands line up with the hard-coded q0/q1 uses below.
// NOTE(review): the asm writes through [dst1]/[dst2] with no "memory" clobber
// (file-wide convention here) — confirm callers never cache *dst across this.
CVTS_FUNC(s16, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]                              \n\t"  // load 8 x s16
            "vmovl.s16 q3, d4                                       \n\t"  // widen low half
            "vmovl.s16 q4, d5                                       \n\t"  // widen high half
            "vcvt.f32.s32 q5, q3                                    \n\t"
            "vcvt.f32.s32 q6, q4                                    \n\t"
            "vmul.f32 q7, q5, q0                                    \n\t"  // * alpha
            "vmul.f32 q8, q6, q0                                    \n\t"
            "vadd.f32 q9, q7, q1                                    \n\t"  // + beta + 0.5
            "vadd.f32 q10, q8, q1                                   \n\t"
            "vcvt.s32.f32 q11, q9                                   \n\t"  // truncate to s32
            "vcvt.s32.f32 q12, q10                                  \n\t"
            "vst1.32 {d22-d23}, [%[dst1]]                           \n\t"
            "vst1.32 {d24-d25}, [%[dst2]]                           \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
1761 #else
1762 CVTS_FUNC(s16, s32, 8,
1763     float32x4_t vscale = vdupq_n_f32((f32)alpha);
1764     float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
1765 {
1766     for (size_t i = 0; i < w; i += 8)
1767     {
1768         internal::prefetch(_src + i);
1769         int16x8_t vline = vld1q_s16(_src + i);
1770         int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
1771         int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
1772         float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
1773         float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
1774         vline1_f32 = vmulq_f32(vline1_f32, vscale);
1775         vline2_f32 = vmulq_f32(vline2_f32, vscale);
1776         vline1_f32 = vaddq_f32(vline1_f32, vshift);
1777         vline2_f32 = vaddq_f32(vline2_f32, vshift);
1778         vline1_s32 = vcvtq_s32_f32(vline1_f32);
1779         vline2_s32 = vcvtq_s32_f32(vline2_f32);
1780         vst1q_s32(_dst + i + 0, vline1_s32);
1781         vst1q_s32(_dst + i + 4, vline2_s32);
1782     }
1783 })
1784 #endif
1785 
1786 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// convertScale s16 -> f32: dst = src * alpha + beta (no rounding bias — the
// result stays in floating point). GCC 4.x (< 4.7) workaround path; vscale and
// vshift are pinned to q0/q1 to match the hard-coded register uses in the asm.
// NOTE(review): no "memory" clobber despite the stores (file-wide convention).
CVTS_FUNC(s16, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]]                              \n\t"  // load 8 x s16
            "vmovl.s16 q3, d4                                       \n\t"  // widen halves
            "vmovl.s16 q4, d5                                       \n\t"
            "vcvt.f32.s32 q5, q3                                    \n\t"
            "vcvt.f32.s32 q6, q4                                    \n\t"
            "vmul.f32 q7, q5, q0                                    \n\t"  // * alpha
            "vmul.f32 q8, q6, q0                                    \n\t"
            "vadd.f32 q9, q7, q1                                     \n\t" // + beta
            "vadd.f32 q10, q8, q1                                     \n\t"
            "vst1.32 {d18-d19}, [%[dst1]]                             \n\t"
            "vst1.32 {d20-d21}, [%[dst2]]                             \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21"
        );
    }
})
1815 #else
1816 CVTS_FUNC(s16, f32, 8,
1817     float32x4_t vscale = vdupq_n_f32((f32)alpha);
1818     float32x4_t vshift = vdupq_n_f32((f32)beta);,
1819 {
1820     for (size_t i = 0; i < w; i += 8)
1821     {
1822         internal::prefetch(_src + i);
1823         int16x8_t vline = vld1q_s16(_src + i);
1824         int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
1825         int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
1826         float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
1827         float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
1828         vline1_f32 = vmulq_f32(vline1_f32, vscale);
1829         vline2_f32 = vmulq_f32(vline2_f32, vscale);
1830         vline1_f32 = vaddq_f32(vline1_f32, vshift);
1831         vline2_f32 = vaddq_f32(vline2_f32, vshift);
1832         vst1q_f32(_dst + i + 0, vline1_f32);
1833         vst1q_f32(_dst + i + 4, vline2_f32);
1834     }
1835 })
1836 #endif
1837 
1838 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// convertScale s32 -> u8: dst = sat_u8(src * alpha + beta), +0.5f rounding bias,
// double saturating narrow s32 -> u16 -> u8. GCC 4.x (< 4.7) workaround path;
// vscale/vshift pinned to q0/q1 to match the hard-coded register uses below.
// NOTE(review): no "memory" clobber despite the store (file-wide convention).
CVTS_FUNC(s32, u8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"  // low 4 x s32
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"  // high 4 x s32
            "vcvt.f32.s32 q4, q2                                     \n\t"
            "vcvt.f32.s32 q5, q3                                     \n\t"
            "vmul.f32 q6, q4, q0                                     \n\t"  // * alpha
            "vmul.f32 q7, q5, q0                                     \n\t"
            "vadd.f32 q8, q6, q1                                     \n\t"  // + beta + 0.5
            "vadd.f32 q9, q7, q1                                     \n\t"
            "vcvt.s32.f32 q10, q8                                    \n\t"
            "vcvt.s32.f32 q11, q9                                    \n\t"
            "vqmovun.s32 d24, q10                                    \n\t"  // sat narrow to u16
            "vqmovun.s32 d25, q11                                    \n\t"
            "vqmovn.u16  d26, q12                                    \n\t"  // sat narrow to u8
            "vst1.8 {d26}, [%[dst]]                                  \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
        );
    }
})
1870 #else
1871 CVTS_FUNC(s32, u8, 8,
1872     float32x4_t vscale = vdupq_n_f32((f32)alpha);
1873     float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
1874 {
1875     for (size_t i = 0; i < w; i += 8)
1876     {
1877         internal::prefetch(_src + i);
1878         int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
1879         int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
1880         float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
1881         float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
1882         vline1_f32 = vmulq_f32(vline1_f32, vscale);
1883         vline2_f32 = vmulq_f32(vline2_f32, vscale);
1884         vline1_f32 = vaddq_f32(vline1_f32, vshift);
1885         vline2_f32 = vaddq_f32(vline2_f32, vshift);
1886         vline1_s32 = vcvtq_s32_f32(vline1_f32);
1887         vline2_s32 = vcvtq_s32_f32(vline2_f32);
1888         uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
1889         uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
1890         uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
1891         vst1_u8(_dst + i, vRes);
1892     }
1893 })
1894 #endif
1895 
1896 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// convertScale s32 -> s8: dst = sat_s8(src * alpha + beta), +0.5f rounding bias,
// double saturating narrow s32 -> s16 -> s8. GCC 4.x (< 4.7) workaround path;
// vscale/vshift pinned to q0/q1 to match the hard-coded register uses below.
// NOTE(review): no "memory" clobber despite the store (file-wide convention).
CVTS_FUNC(s32, s8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"  // low 4 x s32
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"  // high 4 x s32
            "vcvt.f32.s32 q4, q2                                     \n\t"
            "vcvt.f32.s32 q5, q3                                     \n\t"
            "vmul.f32 q6, q4, q0                                     \n\t"  // * alpha
            "vmul.f32 q7, q5, q0                                     \n\t"
            "vadd.f32 q8, q6, q1                                     \n\t"  // + beta + 0.5
            "vadd.f32 q9, q7, q1                                     \n\t"
            "vcvt.s32.f32 q10, q8                                    \n\t"
            "vcvt.s32.f32 q11, q9                                    \n\t"
            "vqmovn.s32 d24, q10                                     \n\t"  // sat narrow to s16
            "vqmovn.s32 d25, q11                                     \n\t"
            "vqmovn.s16  d26, q12                                    \n\t"  // sat narrow to s8
            "vst1.8 {d26}, [%[dst]]                                  \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
        );
    }
})
1928 #else
1929 CVTS_FUNC(s32, s8, 8,
1930     float32x4_t vscale = vdupq_n_f32((f32)alpha);
1931     float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
1932 {
1933     for (size_t i = 0; i < w; i += 8)
1934     {
1935         internal::prefetch(_src + i);
1936         int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
1937         int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
1938         float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
1939         float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
1940         vline1_f32 = vmulq_f32(vline1_f32, vscale);
1941         vline2_f32 = vmulq_f32(vline2_f32, vscale);
1942         vline1_f32 = vaddq_f32(vline1_f32, vshift);
1943         vline2_f32 = vaddq_f32(vline2_f32, vshift);
1944         vline1_s32 = vcvtq_s32_f32(vline1_f32);
1945         vline2_s32 = vcvtq_s32_f32(vline2_f32);
1946         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
1947         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
1948         int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
1949         vst1_s8(_dst + i, vRes);
1950     }
1951 })
1952 #endif
1953 
1954 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// convertScale s32 -> u16: dst = sat_u16(src * alpha + beta), +0.5f rounding
// bias, saturating narrow s32 -> u16. GCC 4.x (< 4.7) workaround path;
// vscale/vshift pinned to q0/q1 to match the hard-coded register uses below.
// NOTE(review): no "memory" clobber despite the store (file-wide convention).
CVTS_FUNC(s32, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                             \n\t"  // low 4 x s32
            "vld1.32 {d6-d7}, [%[src2]]                             \n\t"  // high 4 x s32
            "vcvt.f32.s32 q4, q2                                    \n\t"
            "vcvt.f32.s32 q5, q3                                    \n\t"
            "vmul.f32 q6, q4, q0                                    \n\t"  // * alpha
            "vmul.f32 q7, q5, q0                                    \n\t"
            "vadd.f32 q8, q6, q1                                    \n\t"  // + beta + 0.5
            "vadd.f32 q9, q7, q1                                    \n\t"
            "vcvt.s32.f32 q10, q8                                   \n\t"
            "vcvt.s32.f32 q11, q9                                   \n\t"
            "vqmovun.s32 d24, q10                                   \n\t"  // sat narrow to u16
            "vqmovun.s32 d25, q11                                   \n\t"
            "vst1.16 {d24-d25}, [%[dst]]                            \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
1985 #else
1986 CVTS_FUNC(s32, u16, 8,
1987     float32x4_t vscale = vdupq_n_f32((f32)alpha);
1988     float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
1989 {
1990     for (size_t i = 0; i < w; i += 8)
1991     {
1992         internal::prefetch(_src + i);
1993         int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
1994         int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
1995         float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
1996         float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
1997         vline1_f32 = vmulq_f32(vline1_f32, vscale);
1998         vline2_f32 = vmulq_f32(vline2_f32, vscale);
1999         vline1_f32 = vaddq_f32(vline1_f32, vshift);
2000         vline2_f32 = vaddq_f32(vline2_f32, vshift);
2001         vline1_s32 = vcvtq_s32_f32(vline1_f32);
2002         vline2_s32 = vcvtq_s32_f32(vline2_f32);
2003         uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
2004         uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
2005         vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
2006     }
2007 })
2008 #endif
2009 
2010 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2011 CVTS_FUNC(s32, s16, 8,
2012     register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
2013     register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
2014 {
2015     for (size_t i = 0; i < w; i += 8)
2016     {
2017         internal::prefetch(_src + i);
2018         __asm__ (
2019             "vld1.32 {d4-d5}, [%[src1]]                             \n\t"
2020             "vld1.32 {d6-d7}, [%[src2]]                             \n\t"
2021             "vcvt.f32.s32 q4, q2                                    \n\t"
2022             "vcvt.f32.s32 q5, q3                                    \n\t"
2023             "vmul.f32 q6, q4, q0                                    \n\t"
2024             "vmul.f32 q7, q5, q0                                    \n\t"
2025             "vadd.f32 q8, q6, q1                                    \n\t"
2026             "vadd.f32 q9, q7, q1                                    \n\t"
2027             "vcvt.s32.f32 q10, q8                                   \n\t"
2028             "vcvt.s32.f32 q11, q9                                   \n\t"
2029             "vqmovn.s32 d24, q10                                    \n\t"
2030             "vqmovn.s32 d25, q11                                    \n\t"
2031             "vst1.8 {d24-d25}, [%[dst]]                             \n\t"
2032             : /*no output*/
2033             : [src1] "r" (_src + i + 0),
2034               [src2] "r" (_src + i + 4),
2035               [dst] "r" (_dst + i),
2036               "w"  (vscale), "w" (vshift)
2037             : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
2038         );
2039     }
2040 })
2041 #else
2042 CVTS_FUNC(s32, s16, 8,
2043     float32x4_t vscale = vdupq_n_f32((f32)alpha);
2044     float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
2045 {
2046     for (size_t i = 0; i < w; i += 8)
2047     {
2048         internal::prefetch(_src + i);
2049         int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
2050         int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
2051         float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
2052         float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
2053         vline1_f32 = vmulq_f32(vline1_f32, vscale);
2054         vline2_f32 = vmulq_f32(vline2_f32, vscale);
2055         vline1_f32 = vaddq_f32(vline1_f32, vshift);
2056         vline2_f32 = vaddq_f32(vline2_f32, vshift);
2057         vline1_s32 = vcvtq_s32_f32(vline1_f32);
2058         vline2_s32 = vcvtq_s32_f32(vline2_f32);
2059         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
2060         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
2061         vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
2062     }
2063 })
2064 #endif
2065 
2066 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// convertScale s32 -> s32: dst = (s32)(src * alpha + beta), +0.5f rounding
// bias. GCC 4.x (< 4.7) workaround path; vscale/vshift pinned to q0/q1 to
// match the hard-coded register uses below.
// NOTE(review): no "memory" clobber despite the stores (file-wide convention).
CVTS_FUNC1(s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                             \n\t"  // low 4 x s32
            "vld1.32 {d6-d7}, [%[src2]]                             \n\t"  // high 4 x s32
            "vcvt.f32.s32 q4, q2                                    \n\t"
            "vcvt.f32.s32 q5, q3                                    \n\t"
            "vmul.f32 q6, q4, q0                                    \n\t"  // * alpha
            "vmul.f32 q7, q5, q0                                    \n\t"
            "vadd.f32 q8, q6, q1                                    \n\t"  // + beta + 0.5
            "vadd.f32 q9, q7, q1                                    \n\t"
            "vcvt.s32.f32 q10, q8                                   \n\t"  // truncate to s32
            "vcvt.s32.f32 q11, q9                                   \n\t"
            "vst1.32 {d20-d21}, [%[dst1]]                           \n\t"
            "vst1.32 {d22-d23}, [%[dst2]]                           \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w"  (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
        );
    }
})
2097 #else
2098 CVTS_FUNC1(s32, 8,
2099     float32x4_t vscale = vdupq_n_f32((f32)alpha);
2100     float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
2101 {
2102     for (size_t i = 0; i < w; i += 8)
2103     {
2104         internal::prefetch(_src + i);
2105         int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
2106         int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
2107         float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
2108         float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
2109         vline1_f32 = vmulq_f32(vline1_f32, vscale);
2110         vline2_f32 = vmulq_f32(vline2_f32, vscale);
2111         vline1_f32 = vaddq_f32(vline1_f32, vshift);
2112         vline2_f32 = vaddq_f32(vline2_f32, vshift);
2113         vline1_s32 = vcvtq_s32_f32(vline1_f32);
2114         vline2_s32 = vcvtq_s32_f32(vline2_f32);
2115         vst1q_s32(_dst + i + 0, vline1_s32);
2116         vst1q_s32(_dst + i + 4, vline2_s32);
2117     }
2118 })
2119 #endif
2120 
2121 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// convertScale s32 -> f32: dst = src * alpha + beta (no rounding bias — the
// result stays in floating point). GCC 4.x (< 4.7) workaround path;
// vscale/vshift pinned to q0/q1 to match the hard-coded register uses below.
// NOTE(review): no "memory" clobber despite the stores (file-wide convention).
CVTS_FUNC(s32, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                             \n\t"  // low 4 x s32
            "vld1.32 {d6-d7}, [%[src2]]                             \n\t"  // high 4 x s32
            "vcvt.f32.s32 q4, q2                                    \n\t"
            "vcvt.f32.s32 q5, q3                                    \n\t"
            "vmul.f32 q6, q4, q0                                    \n\t"  // * alpha
            "vmul.f32 q7, q5, q0                                    \n\t"
            "vadd.f32 q8, q6, q1                                    \n\t"  // + beta
            "vadd.f32 q9, q7, q1                                    \n\t"
            "vst1.32 {d16-d17}, [%[dst1]]                           \n\t"
            "vst1.32 {d18-d19}, [%[dst2]]                           \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w"  (vscale), "w" (vshift)
           : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
2150 #else
2151 CVTS_FUNC(s32, f32, 8,
2152     float32x4_t vscale = vdupq_n_f32((f32)alpha);
2153     float32x4_t vshift = vdupq_n_f32((f32)beta);,
2154 {
2155     for (size_t i = 0; i < w; i += 8)
2156     {
2157         internal::prefetch(_src + i);
2158         int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
2159         int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
2160         float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
2161         float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
2162         vline1_f32 = vmulq_f32(vline1_f32, vscale);
2163         vline2_f32 = vmulq_f32(vline2_f32, vscale);
2164         vline1_f32 = vaddq_f32(vline1_f32, vshift);
2165         vline2_f32 = vaddq_f32(vline2_f32, vshift);
2166         vst1q_f32(_dst + i + 0, vline1_f32);
2167         vst1q_f32(_dst + i + 4, vline2_f32);
2168     }
2169 })
2170 #endif
2171 
2172 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// convertScale f32 -> u8: dst = sat_u8(src * alpha + beta). Works in 16.16
// fixed point: alpha/beta are pre-multiplied by 1<<16 and the final
// vqrshrn #16 does a rounding shift back down. The vbic/vshr/vqsub sequence
// with vmask (1<<16) adjusts the vcvt.u32 results via the raw float bit
// patterns — exact intent is not derivable from this file; presumably it
// compensates the truncating u32 conversion. TODO confirm against the
// intrinsics twin below, which uses the same crossed line1/line2 pairing.
// GCC 4.x (< 4.7) workaround path; vscale/vshift/vmask pinned to q0/q1/q2.
CVTS_FUNC(f32, u8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)((1 << 16)*alpha));
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)((1 << 16)*beta));
    register uint32x4_t  vmask  asm ("q2") = vdupq_n_u32(1<<16);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d6-d7}, [%[src1]]                              \n\t"  // low 4 x f32
            "vld1.32 {d8-d9}, [%[src2]]                              \n\t"  // high 4 x f32
            "vmul.f32 q5, q3, q0                                     \n\t"  // * (alpha<<16)
            "vmul.f32 q6, q4, q0                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"  // + (beta<<16)
            "vadd.f32 q8, q6, q1                                     \n\t"
            "vcvt.u32.f32 q9, q7                                     \n\t"  // truncate to u32
            "vcvt.u32.f32 q10, q8                                    \n\t"
            "vbic q11, q2, q6                                        \n\t"  // mask from raw float bits
            "vbic q12, q2, q7                                        \n\t"
            "vshr.u32 q13, q11, #16                                  \n\t"
            "vshr.u32 q14, q12, #16                                  \n\t"
            "vqsub.u32 q7, q9, q13                                   \n\t"  // saturating adjust
            "vqsub.u32 q8, q10, q14                                  \n\t"
            "vqrshrn.u32 d22, q7, #16                                \n\t"  // round-shift out 16.16
            "vqrshrn.u32 d23, q8, #16                                \n\t"
            "vqmovn.u16 d30, q11                                     \n\t"  // sat narrow to u8
            "vst1.8 {d30}, [%[dst]]                                  \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift), "w" (vmask)
            : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30"
        );
    }
})
2209 #else
// convertScale f32 -> u8: dst = sat_u8(src * alpha + beta). Works in 16.16
// fixed point: alpha/beta are pre-multiplied by 1<<16 and vqrshrn_n_u32(.,16)
// does the rounding shift back down. The vbic/vshr/vqsub step builds a
// correction from the raw float bit patterns — exact intent is not derivable
// from this file; presumably it compensates the truncating u32 conversion.
CVTS_FUNC(f32, u8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)((1 << 16)*alpha));
    float32x4_t vshift = vdupq_n_f32((f32)((1 << 16)*beta));
    uint32x4_t  vmask  = vdupq_n_u32(1<<16);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);

        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        float32x4_t vline1Shifted_f32 = vaddq_f32(vline1_f32, vshift);
        float32x4_t vline2Shifted_f32 = vaddq_f32(vline2_f32, vshift);
        // Truncating conversion to u32 (negative inputs saturate to 0).
        uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1Shifted_f32);
        uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2Shifted_f32);
        // NOTE(review): mask for line 1 is derived from vline2_f32 (unshifted)
        // and mask for line 2 from vline1Shifted_f32 — crossed and asymmetric,
        // but it matches the hand-written asm twin above, so it appears
        // intentional; verify before touching.
        uint32x4_t vline1Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline2_f32));
        uint32x4_t vline2Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline1Shifted_f32));
        vline1Mask = vshrq_n_u32(vline1Mask, 16);
        vline2Mask = vshrq_n_u32(vline2Mask, 16);
        // Saturating subtract of the correction, then rounding shift out of
        // 16.16 fixed point and final saturate-narrow to u8.
        vline1_u32 = vqsubq_u32(vline1_u32, vline1Mask);
        vline2_u32 = vqsubq_u32(vline2_u32, vline2Mask);
        uint16x4_t vRes1 = vqrshrn_n_u32(vline1_u32, 16);
        uint16x4_t vRes2 = vqrshrn_n_u32(vline2_u32, 16);
        uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));

        vst1_u8(_dst + i, vRes);
    }
})
2240 #endif
2241 
2242 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// convertScale f32 -> s8: dst = sat_s8(src * alpha + beta), +0.5f rounding
// bias, double saturating narrow s32 -> s16 -> s8. GCC 4.x (< 4.7) workaround
// path; vscale/vshift pinned to q0/q1 to match the register uses below.
// NOTE(review): no "memory" clobber despite the store (file-wide convention).
CVTS_FUNC(f32, s8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"  // low 4 x f32
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"  // high 4 x f32
            "vmul.f32 q4, q2, q0                                     \n\t"  // * alpha
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"  // + beta + 0.5
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vcvt.s32.f32 q8, q6                                     \n\t"  // truncate to s32
            "vcvt.s32.f32 q9, q7                                     \n\t"
            "vqmovn.s32 d14, q8                                      \n\t"  // sat narrow to s16
            "vqmovn.s32 d15, q9                                      \n\t"
            "vqmovn.s16 d16, q7                                      \n\t"  // sat narrow to s8
            "vst1.8 {d16}, [%[dst]]                                  \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
2272 #else
2273 CVTS_FUNC(f32, s8, 8,
2274     float32x4_t vscale = vdupq_n_f32((f32)alpha);
2275     float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
2276 {
2277     for (size_t i = 0; i < w; i += 8)
2278     {
2279         internal::prefetch(_src + i);
2280         float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
2281         float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
2282         vline1_f32 = vmulq_f32(vline1_f32, vscale);
2283         vline2_f32 = vmulq_f32(vline2_f32, vscale);
2284         vline1_f32 = vaddq_f32(vline1_f32, vshift);
2285         vline2_f32 = vaddq_f32(vline2_f32, vshift);
2286         int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
2287         int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
2288         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
2289         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
2290         int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
2291         vst1_s8(_dst + i, vRes);
2292     }
2293 })
2294 #endif
2295 
2296 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// convertScale f32 -> u16: dst = sat_u16(src * alpha + beta), +0.5f rounding
// bias, conversion to u32 then saturating narrow to u16. GCC 4.x (< 4.7)
// workaround path; vscale/vshift pinned to q0/q1 to match the uses below.
// NOTE(review): no "memory" clobber despite the store (file-wide convention).
CVTS_FUNC(f32, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"  // low 4 x f32
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"  // high 4 x f32
            "vmul.f32 q4, q2, q0                                     \n\t"  // * alpha
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"  // + beta + 0.5
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vcvt.u32.f32 q8, q6                                     \n\t"  // truncate to u32
            "vcvt.u32.f32 q9, q7                                     \n\t"
            "vqmovn.u32 d8, q8                                       \n\t"  // sat narrow to u16
            "vqmovn.u32 d9, q9                                       \n\t"
            "vst1.16 {d8-d9}, [%[dst]]                               \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
2325 #else
CVTS_FUNC(f32, u16, 8,
    float32x4_t scaleq = vdupq_n_f32((f32)alpha);
    float32x4_t shiftq = vdupq_n_f32((f32)beta + 0.5f);,
{
    // dst = saturate_cast<u16>(src * alpha + beta), 8 elements per iteration.
    // shiftq carries an extra +0.5 so the truncating conversion rounds
    // non-negative results to nearest.
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t lo = vld1q_f32(_src + i);
        float32x4_t hi = vld1q_f32(_src + i + 4);
        lo = vaddq_f32(vmulq_f32(lo, scaleq), shiftq);
        hi = vaddq_f32(vmulq_f32(hi, scaleq), shiftq);
        // f32 -> u32 (truncating), then saturating narrow u32 -> u16.
        uint16x4_t nlo = vqmovn_u32(vcvtq_u32_f32(lo));
        uint16x4_t nhi = vqmovn_u32(vcvtq_u32_f32(hi));
        vst1q_u16(_dst + i, vcombine_u16(nlo, nhi));
    }
})
2346 #endif
2347 
2348 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// f32 -> s16 with scaling: dst = saturate_cast<s16>(src * alpha + beta).
// Inline-asm path for 32-bit ARM with GCC 4.x (< 4.7); vscale/vshift are
// pinned to q0/q1 so the asm template can name them directly.
CVTS_FUNC(f32, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // Load 2x4 f32, multiply by q0 (scale), add q1 (shift), convert to s32
        // (truncating; the +0.5 in vshift biases non-negative results toward
        // nearest), saturating-narrow to s16, store 8 lanes at once.
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vcvt.s32.f32 q8, q6                                     \n\t"
            "vcvt.s32.f32 q9, q7                                     \n\t"
            "vqmovn.s32 d8, q8                                       \n\t"
            "vqmovn.s32 d9, q9                                       \n\t"
            "vst1.16 {d8-d9}, [%[dst]]                               \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" // q2-q9 written above
        );
    }
})
2377 #else
CVTS_FUNC(f32, s16, 8,
    float32x4_t scaleq = vdupq_n_f32((f32)alpha);
    float32x4_t shiftq = vdupq_n_f32((f32)beta + 0.5f);,
{
    // dst = saturate_cast<s16>(src * alpha + beta), 8 elements per iteration.
    // shiftq carries an extra +0.5 so the truncating conversion rounds
    // non-negative results to nearest.
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t lo = vld1q_f32(_src + i);
        float32x4_t hi = vld1q_f32(_src + i + 4);
        lo = vaddq_f32(vmulq_f32(lo, scaleq), shiftq);
        hi = vaddq_f32(vmulq_f32(hi, scaleq), shiftq);
        // f32 -> s32 (truncating), then saturating narrow s32 -> s16.
        int16x4_t nlo = vqmovn_s32(vcvtq_s32_f32(lo));
        int16x4_t nhi = vqmovn_s32(vcvtq_s32_f32(hi));
        vst1q_s16(_dst + i, vcombine_s16(nlo, nhi));
    }
})
2398 #endif
2399 
2400 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// f32 -> s32 with scaling: dst = (s32)(src * alpha + beta + 0.5f).
// Inline-asm path for 32-bit ARM with GCC 4.x (< 4.7); vscale/vshift are
// pinned to q0/q1 so the asm template can name them directly.
CVTS_FUNC(f32, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // Load 2x4 f32, multiply by q0 (scale), add q1 (shift), convert to s32
        // (truncating; the +0.5 in vshift biases non-negative results toward
        // nearest). No narrowing step: results are stored as two 4-lane s32
        // vectors through separate dst1/dst2 pointers.
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vcvt.s32.f32 q4, q6                                     \n\t"
            "vcvt.s32.f32 q5, q7                                     \n\t"
            "vst1.32 {d8-d9},   [%[dst1]]                            \n\t"
            "vst1.32 {d10-d11}, [%[dst2]]                            \n\t"
            : //no output
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15" // q2-q7 written above
        );
    }
})
2429 #else
CVTS_FUNC(f32, s32, 8,
    float32x4_t scaleq = vdupq_n_f32((f32)alpha);
    float32x4_t shiftq = vdupq_n_f32((f32)beta + 0.5f);,
{
    // dst = (s32)(src * alpha + beta + 0.5f), 8 elements per iteration.
    // No narrowing needed: the two converted 4-lane vectors are stored
    // directly back to back.
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t lo = vld1q_f32(_src + i);
        float32x4_t hi = vld1q_f32(_src + i + 4);
        lo = vaddq_f32(vmulq_f32(lo, scaleq), shiftq);
        hi = vaddq_f32(vmulq_f32(hi, scaleq), shiftq);
        vst1q_s32(_dst + i,     vcvtq_s32_f32(lo));
        vst1q_s32(_dst + i + 4, vcvtq_s32_f32(hi));
    }
})
2449 #endif
2450 
2451 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
// f32 -> f32 rescale: dst = src * alpha + beta.
// Inline-asm path for 32-bit ARM with GCC 4.x (< 4.7); vscale/vshift are
// pinned to q0/q1 so the asm template can name them directly. Note vshift is
// plain beta here — no +0.5 rounding bias, since no integer conversion occurs.
CVTS_FUNC1(f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        // Load 2x4 f32, multiply by q0 (scale), add q1 (shift), and store the
        // f32 results (q6/q7 = d12-d15) through separate dst1/dst2 pointers.
        // The clobber list is conservative: d16-d19 are listed but the
        // template only writes q2-q7 (d4-d15).
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]]                              \n\t"
            "vld1.32 {d6-d7}, [%[src2]]                              \n\t"
            "vmul.f32 q4, q2, q0                                     \n\t"
            "vmul.f32 q5, q3, q0                                     \n\t"
            "vadd.f32 q6, q4, q1                                     \n\t"
            "vadd.f32 q7, q5, q1                                     \n\t"
            "vst1.32 {d12-d13}, [%[dst1]]                            \n\t"
            "vst1.32 {d14-d15}, [%[dst2]]                            \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
2478 #else
CVTS_FUNC1(f32, 8,
    float32x4_t scaleq = vdupq_n_f32((f32)alpha);
    float32x4_t shiftq = vdupq_n_f32((f32)beta);,
{
    // f32 -> f32 rescale: dst = src * alpha + beta, 8 elements per iteration.
    // shiftq is plain beta — no rounding bias, since nothing is converted to
    // an integer type.
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t lo = vld1q_f32(_src + i);
        float32x4_t hi = vld1q_f32(_src + i + 4);
        vst1q_f32(_dst + i,     vaddq_f32(vmulq_f32(lo, scaleq), shiftq));
        vst1q_f32(_dst + i + 4, vaddq_f32(vmulq_f32(hi, scaleq), shiftq));
    }
})
2496 #endif
2497 
2498 } // namespace CAROTENE_NS
2499