/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "common.hpp"

namespace CAROTENE_NS {

#ifdef CAROTENE_NEON

#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \
    void convertScale(const Size2D &_size, \
                      const T1 * srcBase, ptrdiff_t srcStride, \
                      T2 * dstBase, ptrdiff_t dstStride, \
                      f64 alpha, f64 beta) \
    { \
        internal::assertSupportedConfiguration(); \
        Size2D size(_size); \
        if (srcStride == dstStride && \
            srcStride == (ptrdiff_t)(size.width)) \
        { \
            size.width *= size.height; \
            size.height = 1; \
        } \
        const ptrdiff_t sstep = srcStride / sizeof(T1); \
        const ptrdiff_t dstep = dstStride / sizeof(T2); \
        const size_t w = size.width & ~(SIMD_SIZE-1); \
        if (size.width >= SIMD_SIZE) \
        { \
            const T1* _src = srcBase; \
            T2* _dst = dstBase; \
            CVTINIT \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
                CVTROW \
        } \
        if(w < size.width) \
        { \
            const T1* _src = srcBase; \
            T2* _dst = dstBase; \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
                for(size_t i = w; i < size.width; i++ ) \
                    _dst[i] = internal::saturate_cast<T2>(_src[i]*alpha + beta); \
        } \
    }

#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \
    void convertScale(const Size2D &_size, \
                      const T1 * srcBase, ptrdiff_t srcStride, \
                      T1 * dstBase, ptrdiff_t dstStride, \
                      f64 alpha, f64 beta) \
    { \
        internal::assertSupportedConfiguration(); \
        Size2D size(_size); \
        if (srcStride == dstStride && \
            srcStride == (ptrdiff_t)(size.width)) \
        { \
            size.width *= size.height; \
            size.height = 1; \
        } \
        const ptrdiff_t sstep = srcStride / sizeof(T1); \
        const ptrdiff_t dstep = dstStride / sizeof(T1); \
        const size_t w = size.width & ~(SIMD_SIZE-1); \
        if (size.width >= SIMD_SIZE) \
        { \
            const T1* _src = srcBase; \
            T1* _dst = dstBase; \
            CVTSINIT \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
                CVTSROW \
        } \
        if(w < size.width) \
        { \
            const T1* _src = srcBase; \
            T1* _dst = dstBase; \
            for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \
                for(size_t i = w; i < size.width; i++ ) \
                    _dst[i] = internal::saturate_cast<T1>(_src[i]*alpha + beta); \
        } \
    }

#else

#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \
    void convertScale(const Size2D &, \
                      const T1 *, ptrdiff_t, \
                      T2 *, ptrdiff_t, \
                      f64, f64) \
    { \
        internal::assertSupportedConfiguration(); \
    }

#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \
    void convertScale(const Size2D &, \
                      const T1 *, ptrdiff_t, \
                      T1 *, ptrdiff_t, \
                      f64, f64) \
    { \
        internal::assertSupportedConfiguration(); \
    }

#endif

#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC1(u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.u8 q3, d4 \n\t"
            "vmovl.u8 q4, d5 \n\t"
            "vmovl.u16 q5, d6 \n\t"
            "vmovl.u16 q6, d7 \n\t"
            "vmovl.u16 q7, d8 \n\t"
            "vmovl.u16 q8, d9 \n\t"
            "vcvt.f32.u32 q9, q5 \n\t"
            "vcvt.f32.u32 q10, q6 \n\t"
            "vcvt.f32.u32 q11, q7 \n\t"
            "vcvt.f32.u32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
"vmul.f32 q2, q12, q0 \n\t" 162 "vadd.f32 q3, q13, q1 \n\t" 163 "vadd.f32 q4, q14, q1 \n\t" 164 "vadd.f32 q5, q15, q1 \n\t" 165 "vadd.f32 q6, q2, q1 \n\t" 166 "vcvt.s32.f32 q7, q3 \n\t" 167 "vcvt.s32.f32 q8, q4 \n\t" 168 "vcvt.s32.f32 q9, q5 \n\t" 169 "vcvt.s32.f32 q10, q6 \n\t" 170 "vqmovun.s32 d22, q7 \n\t" 171 "vqmovun.s32 d23, q8 \n\t" 172 "vqmovun.s32 d24, q9 \n\t" 173 "vqmovun.s32 d25, q10 \n\t" 174 "vqmovn.u16 d26, q11 \n\t" 175 "vqmovn.u16 d27, q12 \n\t" 176 "vst1.8 {d26-d27}, [%[dst1]] \n\t" 177 : /*no output*/ 178 : [src] "r" (_src + i), 179 [dst1] "r" (_dst + i + 0), 180 "w" (vscale), "w" (vshift) 181 : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" 182 ); 183 } 184 }) 185 #else 186 CVTS_FUNC1(u8, 16, 187 float32x4_t vscale = vdupq_n_f32((f32)alpha); 188 float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, 189 { 190 for (size_t i = 0; i < w; i += 16) 191 { 192 internal::prefetch(_src + i); 193 uint8x16_t vline = vld1q_u8(_src + i); 194 uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); 195 uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); 196 uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); 197 uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); 198 uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); 199 uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); 200 float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); 201 float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); 202 float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); 203 float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); 204 vline1_f32 = vmulq_f32(vline1_f32, vscale); 205 vline2_f32 = vmulq_f32(vline2_f32, vscale); 206 vline3_f32 = vmulq_f32(vline3_f32, vscale); 207 vline4_f32 = vmulq_f32(vline4_f32, vscale); 208 vline1_f32 = vaddq_f32(vline1_f32, vshift); 209 vline2_f32 = vaddq_f32(vline2_f32, vshift); 210 vline3_f32 = vaddq_f32(vline3_f32, vshift); 211 vline4_f32 = vaddq_f32(vline4_f32, vshift); 212 int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); 213 int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); 214 int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); 215 int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); 216 uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)); 217 uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)); 218 vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16))); 219 } 220 }) 221 #endif 222 223 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__) 224 CVTS_FUNC(u8, s8, 16, 225 register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); 226 register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, 227 { 228 for (size_t i = 0; i < w; i += 16) 229 { 230 internal::prefetch(_src + i); 231 __asm__ ( 232 "vld1.8 {d4-d5}, [%[src]] \n\t" 233 "vmovl.u8 q3, d4 \n\t" 234 "vmovl.u8 q4, d5 \n\t" 235 "vmovl.u16 q5, d6 \n\t" 236 "vmovl.u16 q6, d7 \n\t" 237 "vmovl.u16 q7, d8 \n\t" 238 "vmovl.u16 q8, d9 \n\t" 239 "vcvt.f32.u32 q9, q5 \n\t" 240 "vcvt.f32.u32 q10, q6 \n\t" 241 "vcvt.f32.u32 q11, q7 \n\t" 242 "vcvt.f32.u32 q12, q8 \n\t" 243 "vmul.f32 q13, q9, q0 \n\t" 244 "vmul.f32 q14, q10, q0 \n\t" 245 "vmul.f32 q15, q11, q0 \n\t" 246 "vmul.f32 q2, q12, q0 \n\t" 247 "vadd.f32 q3, q13, q1 \n\t" 248 "vadd.f32 q4, q14, q1 \n\t" 249 "vadd.f32 q5, q15, q1 \n\t" 250 "vadd.f32 q6, q2, q1 \n\t" 251 "vcvt.s32.f32 q7, q3 \n\t" 252 "vcvt.s32.f32 q8, 
q4 \n\t" 253 "vcvt.s32.f32 q9, q5 \n\t" 254 "vcvt.s32.f32 q10, q6 \n\t" 255 "vqmovn.s32 d22, q7 \n\t" 256 "vqmovn.s32 d23, q8 \n\t" 257 "vqmovn.s32 d24, q9 \n\t" 258 "vqmovn.s32 d25, q10 \n\t" 259 "vqmovn.s16 d26, q11 \n\t" 260 "vqmovn.s16 d27, q12 \n\t" 261 "vst1.8 {d26-d27}, [%[dst1]] \n\t" 262 : //no output 263 : [src] "r" (_src + i), 264 [dst1] "r" (_dst + i + 0), 265 "w" (vscale), "w" (vshift) 266 : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" 267 ); 268 } 269 }) 270 #else 271 CVTS_FUNC(u8, s8, 16, 272 float32x4_t vscale = vdupq_n_f32((f32)alpha); 273 float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, 274 { 275 for (size_t i = 0; i < w; i += 16) 276 { 277 internal::prefetch(_src + i); 278 uint8x16_t vline = vld1q_u8(_src + i); 279 uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); 280 uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); 281 uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); 282 uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); 283 uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); 284 uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); 285 float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); 286 float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); 287 float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); 288 float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); 289 vline1_f32 = vmulq_f32(vline1_f32, vscale); 290 vline2_f32 = vmulq_f32(vline2_f32, vscale); 291 vline3_f32 = vmulq_f32(vline3_f32, vscale); 292 vline4_f32 = vmulq_f32(vline4_f32, vscale); 293 vline1_f32 = vaddq_f32(vline1_f32, vshift); 294 vline2_f32 = vaddq_f32(vline2_f32, vshift); 295 vline3_f32 = vaddq_f32(vline3_f32, vshift); 296 vline4_f32 = vaddq_f32(vline4_f32, vshift); 297 int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); 298 int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); 299 int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); 300 int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); 301 int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)); 302 int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)); 303 vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16))); 304 } 305 }) 306 #endif 307 308 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__) 309 CVTS_FUNC(u8, u16, 16, 310 register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); 311 register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, 312 { 313 for (size_t i = 0; i < w; i += 16) 314 { 315 internal::prefetch(_src + i); 316 __asm__ ( 317 "vld1.8 {d4-d5}, [%[src]] \n\t" 318 "vmovl.u8 q3, d4 \n\t" 319 "vmovl.u8 q4, d5 \n\t" 320 "vmovl.u16 q5, d6 \n\t" 321 "vmovl.u16 q6, d7 \n\t" 322 "vmovl.u16 q7, d8 \n\t" 323 "vmovl.u16 q8, d9 \n\t" 324 "vcvt.f32.u32 q9, q5 \n\t" 325 "vcvt.f32.u32 q10, q6 \n\t" 326 "vcvt.f32.u32 q11, q7 \n\t" 327 "vcvt.f32.u32 q12, q8 \n\t" 328 "vmul.f32 q13, q9, q0 \n\t" 329 "vmul.f32 q14, q10, q0 \n\t" 330 "vmul.f32 q15, q11, q0 \n\t" 331 "vmul.f32 q2, q12, q0 \n\t" 332 "vadd.f32 q3, q13, q1 \n\t" 333 "vadd.f32 q4, q14, q1 \n\t" 334 "vadd.f32 q5, q15, q1 \n\t" 335 "vadd.f32 q6, q2, q1 \n\t" 336 "vcvt.s32.f32 q7, q3 \n\t" 337 "vcvt.s32.f32 q8, q4 \n\t" 338 "vcvt.s32.f32 q9, q5 \n\t" 339 "vcvt.s32.f32 q10, q6 \n\t" 340 "vqmovun.s32 d22, q7 \n\t" 341 "vqmovun.s32 d23, q8 \n\t" 342 "vqmovun.s32 d24, q9 \n\t" 343 "vqmovun.s32 d25, q10 \n\t" 344 "vst1.16 
            "vst1.16 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(u8, u16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
        vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));
        vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(u8, s16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.u8 q3, d4 \n\t"
            "vmovl.u8 q4, d5 \n\t"
            "vmovl.u16 q5, d6 \n\t"
            "vmovl.u16 q6, d7 \n\t"
            "vmovl.u16 q7, d8 \n\t"
            "vmovl.u16 q8, d9 \n\t"
            "vcvt.f32.u32 q9, q5 \n\t"
            "vcvt.f32.u32 q10, q6 \n\t"
            "vcvt.f32.u32 q11, q7 \n\t"
            "vcvt.f32.u32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vqmovn.s32 d22, q7 \n\t"
            "vqmovn.s32 d23, q8 \n\t"
            "vqmovn.s32 d24, q9 \n\t"
            "vqmovn.s32 d25, q10 \n\t"
            "vst1.16 {d22-d23}, [%[dst1]] \n\t"
            "vst1.16 {d24-d25}, [%[dst2]] \n\t"
            : //no output
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (vscale), "w" (vshift)
"d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" 436 ); 437 } 438 }) 439 #else 440 CVTS_FUNC(u8, s16, 16, 441 float32x4_t vscale = vdupq_n_f32((f32)alpha); 442 float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, 443 { 444 for (size_t i = 0; i < w; i += 16) 445 { 446 internal::prefetch(_src + i); 447 uint8x16_t vline = vld1q_u8(_src + i); 448 uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); 449 uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); 450 uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); 451 uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); 452 uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); 453 uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); 454 float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); 455 float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); 456 float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); 457 float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); 458 vline1_f32 = vmulq_f32(vline1_f32, vscale); 459 vline2_f32 = vmulq_f32(vline2_f32, vscale); 460 vline3_f32 = vmulq_f32(vline3_f32, vscale); 461 vline4_f32 = vmulq_f32(vline4_f32, vscale); 462 vline1_f32 = vaddq_f32(vline1_f32, vshift); 463 vline2_f32 = vaddq_f32(vline2_f32, vshift); 464 vline3_f32 = vaddq_f32(vline3_f32, vshift); 465 vline4_f32 = vaddq_f32(vline4_f32, vshift); 466 int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); 467 int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); 468 int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); 469 int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); 470 vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32))); 471 vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32))); 472 } 473 }) 474 #endif 475 476 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) 477 CVTS_FUNC(u8, s32, 16, 478 register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); 479 register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, 480 { 481 for (size_t i = 0; i < w; i += 16) 482 { 483 internal::prefetch(_src + i); 484 __asm__ ( 485 "vld1.8 {d4-d5}, [%[src]] \n\t" 486 "vmovl.u8 q3, d4 \n\t" 487 "vmovl.u8 q4, d5 \n\t" 488 "vmovl.u16 q5, d6 \n\t" 489 "vmovl.u16 q6, d7 \n\t" 490 "vmovl.u16 q7, d8 \n\t" 491 "vmovl.u16 q8, d9 \n\t" 492 "vcvt.f32.u32 q9, q5 \n\t" 493 "vcvt.f32.u32 q10, q6 \n\t" 494 "vcvt.f32.u32 q11, q7 \n\t" 495 "vcvt.f32.u32 q12, q8 \n\t" 496 "vmul.f32 q13, q9, q0 \n\t" 497 "vmul.f32 q14, q10, q0 \n\t" 498 "vmul.f32 q15, q11, q0 \n\t" 499 "vmul.f32 q2, q12, q0 \n\t" 500 "vadd.f32 q3, q13, q1 \n\t" 501 "vadd.f32 q4, q14, q1 \n\t" 502 "vadd.f32 q5, q15, q1 \n\t" 503 "vadd.f32 q6, q2, q1 \n\t" 504 "vcvt.s32.f32 q7, q3 \n\t" 505 "vcvt.s32.f32 q8, q4 \n\t" 506 "vcvt.s32.f32 q9, q5 \n\t" 507 "vcvt.s32.f32 q10, q6 \n\t" 508 "vst1.32 {d14-d15}, [%[dst1]] \n\t" 509 "vst1.32 {d16-d17}, [%[dst2]] \n\t" 510 "vst1.32 {d18-d19}, [%[dst3]] \n\t" 511 "vst1.32 {d20-d21}, [%[dst4]] \n\t" 512 : /*no output*/ 513 : [src] "r" (_src + i), 514 [dst1] "r" (_dst + i + 0), 515 [dst2] "r" (_dst + i + 4), 516 [dst3] "r" (_dst + i + 8), 517 [dst4] "r" (_dst + i + 12), 518 "w" (vscale), "w" (vshift) 519 : "d4","d5","d6","d7","d8","d9","d10", 520 "d11","d12","d13","d14","d15","d16","d17", 521 "d18","d19","d20","d21","d22","d23","d24", 522 "d25","d26","d27","d28","d29","d30","d31" 523 ); 524 } 525 }) 526 #else 527 
CVTS_FUNC(u8, s32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
        vst1q_s32(_dst + i + 8, vline3_s32);
        vst1q_s32(_dst + i + 12, vline4_s32);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u8, f32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.u8 q3, d4 \n\t"
            "vmovl.u8 q4, d5 \n\t"
            "vmovl.u16 q5, d6 \n\t"
            "vmovl.u16 q6, d7 \n\t"
            "vmovl.u16 q7, d8 \n\t"
            "vmovl.u16 q8, d9 \n\t"
            "vcvt.f32.u32 q9, q5 \n\t"
            "vcvt.f32.u32 q10, q6 \n\t"
            "vcvt.f32.u32 q11, q7 \n\t"
            "vcvt.f32.u32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vst1.32 {d6-d7}, [%[dst1]] \n\t"
            "vst1.32 {d8-d9}, [%[dst2]] \n\t"
            "vst1.32 {d10-d11}, [%[dst3]] \n\t"
            "vst1.32 {d12-d13}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10",
              "d11","d12","d13","d14","d15","d16","d17",
              "d18","d19","d20","d21","d22","d23","d24",
              "d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(u8, f32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        uint8x16_t vline = vld1q_u8(_src + i);
        uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline));
        uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline));
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16));
        uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16));
        uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32);
        float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
        vst1q_f32(_dst + i + 8, vline3_f32);
        vst1q_f32(_dst + i + 12, vline4_f32);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(s8, u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.s8 q3, d4 \n\t"
            "vmovl.s8 q4, d5 \n\t"
            "vmovl.s16 q5, d6 \n\t"
            "vmovl.s16 q6, d7 \n\t"
            "vmovl.s16 q7, d8 \n\t"
            "vmovl.s16 q8, d9 \n\t"
            "vcvt.f32.s32 q9, q5 \n\t"
            "vcvt.f32.s32 q10, q6 \n\t"
            "vcvt.f32.s32 q11, q7 \n\t"
            "vcvt.f32.s32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vqmovun.s32 d22, q7 \n\t"
            "vqmovun.s32 d23, q8 \n\t"
            "vqmovun.s32 d24, q9 \n\t"
            "vqmovun.s32 d25, q10 \n\t"
            "vqmovn.u16 d26, q11 \n\t"
            "vqmovn.u16 d27, q12 \n\t"
            "vst1.8 {d26-d27}, [%[dst1]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, u8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
        uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
        vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC1(s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.s8 q3, d4 \n\t"
            "vmovl.s8 q4, d5 \n\t"
            "vmovl.s16 q5, d6 \n\t"
            "vmovl.s16 q6, d7 \n\t"
            "vmovl.s16 q7, d8 \n\t"
            "vmovl.s16 q8, d9 \n\t"
            "vcvt.f32.s32 q9, q5 \n\t"
            "vcvt.f32.s32 q10, q6 \n\t"
            "vcvt.f32.s32 q11, q7 \n\t"
            "vcvt.f32.s32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vqmovn.s32 d22, q7 \n\t"
            "vqmovn.s32 d23, q8 \n\t"
            "vqmovn.s32 d24, q9 \n\t"
            "vqmovn.s32 d25, q10 \n\t"
            "vqmovn.s16 d26, q11 \n\t"
            "vqmovn.s16 d27, q12 \n\t"
            "vst1.8 {d26-d27}, [%[dst1]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC1(s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
        int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
        vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(s8, u16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.s8 q3, d4 \n\t"
            "vmovl.s8 q4, d5 \n\t"
            "vmovl.s16 q5, d6 \n\t"
            "vmovl.s16 q6, d7 \n\t"
            "vmovl.s16 q7, d8 \n\t"
            "vmovl.s16 q8, d9 \n\t"
            "vcvt.f32.s32 q9, q5 \n\t"
            "vcvt.f32.s32 q10, q6 \n\t"
            "vcvt.f32.s32 q11, q7 \n\t"
            "vcvt.f32.s32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vqmovun.s32 d22, q7 \n\t"
            "vqmovun.s32 d23, q8 \n\t"
            "vqmovun.s32 d24, q9 \n\t"
            "vqmovun.s32 d25, q10 \n\t"
            "vst1.16 {d22-d23}, [%[dst1]] \n\t"
            "vst1.16 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, u16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
        uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
        vst1q_u16(_dst + i + 0, vRes1_u16);
        vst1q_u16(_dst + i + 8, vRes2_u16);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
CVTS_FUNC(s8, s16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.s8 q3, d4 \n\t"
            "vmovl.s8 q4, d5 \n\t"
            "vmovl.s16 q5, d6 \n\t"
            "vmovl.s16 q6, d7 \n\t"
            "vmovl.s16 q7, d8 \n\t"
            "vmovl.s16 q8, d9 \n\t"
            "vcvt.f32.s32 q9, q5 \n\t"
            "vcvt.f32.s32 q10, q6 \n\t"
            "vcvt.f32.s32 q11, q7 \n\t"
            "vcvt.f32.s32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vqmovn.s32 d22, q7 \n\t"
            "vqmovn.s32 d23, q8 \n\t"
            "vqmovn.s32 d24, q9 \n\t"
            "vqmovn.s32 d25, q10 \n\t"
            "vst1.16 {d22-d23}, [%[dst1]] \n\t"
            "vst1.16 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 8),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, s16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
        int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
        vst1q_s16(_dst + i + 0, vRes1_s16);
        vst1q_s16(_dst + i + 8, vRes2_s16);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s8, s32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.s8 q3, d4 \n\t"
            "vmovl.s8 q4, d5 \n\t"
            "vmovl.s16 q5, d6 \n\t"
            "vmovl.s16 q6, d7 \n\t"
            "vmovl.s16 q7, d8 \n\t"
            "vmovl.s16 q8, d9 \n\t"
            "vcvt.f32.s32 q9, q5 \n\t"
            "vcvt.f32.s32 q10, q6 \n\t"
            "vcvt.f32.s32 q11, q7 \n\t"
            "vcvt.f32.s32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vcvt.s32.f32 q7, q3 \n\t"
            "vcvt.s32.f32 q8, q4 \n\t"
            "vcvt.s32.f32 q9, q5 \n\t"
            "vcvt.s32.f32 q10, q6 \n\t"
            "vst1.32 {d14-d15}, [%[dst1]] \n\t"
            "vst1.32 {d16-d17}, [%[dst2]] \n\t"
            "vst1.32 {d18-d19}, [%[dst3]] \n\t"
            "vst1.32 {d20-d21}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10",
              "d11","d12","d13","d14","d15","d16","d17",
              "d18","d19","d20","d21","d22","d23","d24",
              "d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, s32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vline3_s32 = vcvtq_s32_f32(vline3_f32);
        vline4_s32 = vcvtq_s32_f32(vline4_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
        vst1q_s32(_dst + i + 8, vline3_s32);
        vst1q_s32(_dst + i + 12, vline4_s32);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s8, f32, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src]] \n\t"
            "vmovl.s8 q3, d4 \n\t"
            "vmovl.s8 q4, d5 \n\t"
            "vmovl.s16 q5, d6 \n\t"
            "vmovl.s16 q6, d7 \n\t"
            "vmovl.s16 q7, d8 \n\t"
            "vmovl.s16 q8, d9 \n\t"
            "vcvt.f32.s32 q9, q5 \n\t"
            "vcvt.f32.s32 q10, q6 \n\t"
            "vcvt.f32.s32 q11, q7 \n\t"
            "vcvt.f32.s32 q12, q8 \n\t"
            "vmul.f32 q13, q9, q0 \n\t"
            "vmul.f32 q14, q10, q0 \n\t"
            "vmul.f32 q15, q11, q0 \n\t"
            "vmul.f32 q2, q12, q0 \n\t"
            "vadd.f32 q3, q13, q1 \n\t"
            "vadd.f32 q4, q14, q1 \n\t"
            "vadd.f32 q5, q15, q1 \n\t"
            "vadd.f32 q6, q2, q1 \n\t"
            "vst1.32 {d6-d7}, [%[dst1]] \n\t"
            "vst1.32 {d8-d9}, [%[dst2]] \n\t"
            "vst1.32 {d10-d11}, [%[dst3]] \n\t"
            "vst1.32 {d12-d13}, [%[dst4]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i + 0),
              [dst2] "r" (_dst + i + 4),
              [dst3] "r" (_dst + i + 8),
              [dst4] "r" (_dst + i + 12),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10",
              "d11","d12","d13","d14","d15","d16","d17",
              "d18","d19","d20","d21","d22","d23","d24",
              "d25","d26","d27","d28","d29","d30","d31"
        );
    }
})
#else
CVTS_FUNC(s8, f32, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 16)
    {
        internal::prefetch(_src + i);
        int8x16_t vline = vld1q_s8(_src + i);
        int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline));
        int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline));
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16));
        int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16));
        int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32);
        float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline3_f32 = vmulq_f32(vline3_f32, vscale);
        vline4_f32 = vmulq_f32(vline4_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
        vst1q_f32(_dst + i + 0, vline1_f32);
        vst1q_f32(_dst + i + 4, vline2_f32);
        vst1q_f32(_dst + i + 8, vline3_f32);
        vst1q_f32(_dst + i + 12, vline4_f32);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, u8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]] \n\t"
            "vmovl.u16 q3, d4 \n\t"
            "vmovl.u16 q4, d5 \n\t"
            "vcvt.f32.u32 q5, q3 \n\t"
            "vcvt.f32.u32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
q0 \n\t" 1173 "vmul.f32 q8, q6, q0 \n\t" 1174 "vadd.f32 q9, q7, q1 \n\t" 1175 "vadd.f32 q10, q8, q1 \n\t" 1176 "vcvt.s32.f32 q11, q9 \n\t" 1177 "vcvt.s32.f32 q12, q10 \n\t" 1178 "vqmovn.s32 d26, q11 \n\t" 1179 "vqmovn.s32 d27, q12 \n\t" 1180 "vqmovun.s16 d28, q13 \n\t" 1181 "vst1.8 {d28}, [%[dst]] \n\t" 1182 : /*no output*/ 1183 : [src1] "r" (_src + i), 1184 [dst] "r" (_dst + i + 0), 1185 "w" (vscale), "w" (vshift) 1186 : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" 1187 ); 1188 } 1189 }) 1190 #else 1191 CVTS_FUNC(u16, u8, 16, 1192 float32x4_t vscale = vdupq_n_f32((f32)alpha); 1193 float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, 1194 { 1195 for (size_t i = 0; i < w; i += 8) 1196 { 1197 internal::prefetch(_src + i); 1198 uint16x8_t vline = vld1q_u16(_src + i); 1199 uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); 1200 uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); 1201 float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); 1202 float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); 1203 vline1_f32 = vmulq_f32(vline1_f32, vscale); 1204 vline2_f32 = vmulq_f32(vline2_f32, vscale); 1205 vline1_f32 = vaddq_f32(vline1_f32, vshift); 1206 vline2_f32 = vaddq_f32(vline2_f32, vshift); 1207 int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); 1208 int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); 1209 int16x4_t vRes1 = vqmovn_s32(vline1_s32); 1210 int16x4_t vRes2 = vqmovn_s32(vline2_s32); 1211 uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2)); 1212 vst1_u8(_dst + i, vRes); 1213 } 1214 }) 1215 #endif 1216 1217 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) 1218 CVTS_FUNC(u16, s8, 16, 1219 register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); 1220 register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, 1221 { 1222 for (size_t i = 0; i < w; i += 8) 1223 { 1224 internal::prefetch(_src + i); 1225 __asm__ ( 1226 "vld1.8 {d4-d5}, [%[src1]] \n\t" 1227 "vmovl.u16 q3, d4 \n\t" 1228 "vmovl.u16 q4, d5 \n\t" 1229 "vcvt.f32.u32 q5, q3 \n\t" 1230 "vcvt.f32.u32 q6, q4 \n\t" 1231 "vmul.f32 q7, q5, q0 \n\t" 1232 "vmul.f32 q8, q6, q0 \n\t" 1233 "vadd.f32 q9, q7, q1 \n\t" 1234 "vadd.f32 q10, q8, q1 \n\t" 1235 "vcvt.s32.f32 q11, q9 \n\t" 1236 "vcvt.s32.f32 q12, q10 \n\t" 1237 "vqmovn.s32 d26, q11 \n\t" 1238 "vqmovn.s32 d27, q12 \n\t" 1239 "vqmovn.s16 d28, q13 \n\t" 1240 "vst1.8 {d28}, [%[dst]] \n\t" 1241 : /*no output*/ 1242 : [src1] "r" (_src + i), 1243 [dst] "r" (_dst + i + 0), 1244 "w" (vscale), "w" (vshift) 1245 : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" 1246 ); 1247 } 1248 }) 1249 #else 1250 CVTS_FUNC(u16, s8, 16, 1251 float32x4_t vscale = vdupq_n_f32((f32)alpha); 1252 float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, 1253 { 1254 for (size_t i = 0; i < w; i += 8) 1255 { 1256 internal::prefetch(_src + i); 1257 uint16x8_t vline = vld1q_u16(_src + i); 1258 uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); 1259 uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); 1260 float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); 1261 float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); 1262 vline1_f32 = vmulq_f32(vline1_f32, vscale); 1263 vline2_f32 = vmulq_f32(vline2_f32, vscale); 1264 vline1_f32 = vaddq_f32(vline1_f32, vshift); 1265 vline2_f32 = vaddq_f32(vline2_f32, vshift); 1266 int32x4_t vline1_s32 = 
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC1(u16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]] \n\t"
            "vmovl.u16 q3, d4 \n\t"
            "vmovl.u16 q4, d5 \n\t"
            "vcvt.f32.u32 q5, q3 \n\t"
            "vcvt.f32.u32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vcvt.s32.f32 q12, q10 \n\t"
            "vqmovun.s32 d26, q11 \n\t"
            "vqmovun.s32 d27, q12 \n\t"
            "vst1.16 {d26-d27}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vshift), "w" (vscale)
            : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
#else
CVTS_FUNC1(u16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]] \n\t"
            "vmovl.u16 q3, d4 \n\t"
            "vmovl.u16 q4, d5 \n\t"
            "vcvt.f32.u32 q5, q3 \n\t"
            "vcvt.f32.u32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vcvt.s32.f32 q12, q10 \n\t"
            "vqmovn.s32 d26, q11 \n\t"
            "vqmovn.s32 d27, q12 \n\t"
            "vst1.16 {d26-d27}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vshift), "w" (vscale)
            : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
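// Note: the intrinsics variant selected below (AArch64, clang, or GCC >= 4.7 builds)
// performs the same computation as the inline-asm path above, without pinning
// vscale/vshift to q0/q1 by hand.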
#else
CVTS_FUNC(u16, s16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]] \n\t"
            "vmovl.u16 q3, d4 \n\t"
            "vmovl.u16 q4, d5 \n\t"
            "vcvt.f32.u32 q5, q3 \n\t"
            "vcvt.f32.u32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vcvt.s32.f32 q12, q10 \n\t"
            "vst1.32 {d22-d23}, [%[dst1]] \n\t"
            "vst1.32 {d24-d25}, [%[dst2]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vshift), "w" (vscale)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
CVTS_FUNC(u16, s32, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        uint16x8_t vline = vld1q_u16(_src + i);
        uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline));
        uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32);
        float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        vst1q_s32(_dst + i + 0, vline1_s32);
        vst1q_s32(_dst + i + 4, vline2_s32);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(u16, f32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]] \n\t"
            "vmovl.u16 q3, d4 \n\t"
            "vmovl.u16 q4, d5 \n\t"
\n\t" 1458 "vcvt.f32.u32 q5, q3 \n\t" 1459 "vcvt.f32.u32 q6, q4 \n\t" 1460 "vmul.f32 q7, q5, q0 \n\t" 1461 "vmul.f32 q8, q6, q0 \n\t" 1462 "vadd.f32 q9, q7, q1 \n\t" 1463 "vadd.f32 q10, q8, q1 \n\t" 1464 "vst1.32 {d18-d19}, [%[dst1]] \n\t" 1465 "vst1.32 {d20-d21}, [%[dst2]] \n\t" 1466 : /*no output*/ 1467 : [src] "r" (_src + i), 1468 [dst1] "r" (_dst + i + 0), 1469 [dst2] "r" (_dst + i + 4), 1470 "w" (vscale), "w" (vshift) 1471 : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" 1472 ); 1473 } 1474 }) 1475 #else 1476 CVTS_FUNC(u16, f32, 8, 1477 float32x4_t vscale = vdupq_n_f32((f32)alpha); 1478 float32x4_t vshift = vdupq_n_f32((f32)beta);, 1479 { 1480 for (size_t i = 0; i < w; i += 8) 1481 { 1482 internal::prefetch(_src + i); 1483 uint16x8_t vline = vld1q_u16(_src + i); 1484 uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); 1485 uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); 1486 float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); 1487 float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); 1488 vline1_f32 = vmulq_f32(vline1_f32, vscale); 1489 vline2_f32 = vmulq_f32(vline2_f32, vscale); 1490 vline1_f32 = vaddq_f32(vline1_f32, vshift); 1491 vline2_f32 = vaddq_f32(vline2_f32, vshift); 1492 vst1q_f32(_dst + i + 0, vline1_f32); 1493 vst1q_f32(_dst + i + 4, vline2_f32); 1494 } 1495 }) 1496 #endif 1497 1498 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) 1499 CVTS_FUNC(s16, u8, 16, 1500 register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); 1501 register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, 1502 { 1503 for (size_t i = 0; i < w; i += 8) 1504 { 1505 internal::prefetch(_src + i); 1506 __asm__ ( 1507 "vld1.8 {d4-d5}, [%[src1]] \n\t" 1508 "vmovl.s16 q3, d4 \n\t" 1509 "vmovl.s16 q4, d5 \n\t" 1510 "vcvt.f32.s32 q5, q3 \n\t" 1511 "vcvt.f32.s32 q6, q4 \n\t" 1512 "vmul.f32 q7, q5, q0 \n\t" 1513 "vmul.f32 q8, q6, q0 \n\t" 1514 "vadd.f32 q9, q7, q1 \n\t" 1515 "vadd.f32 q10, q8, q1 \n\t" 1516 "vcvt.s32.f32 q11, q9 \n\t" 1517 "vcvt.s32.f32 q12, q10 \n\t" 1518 "vqmovn.s32 d26, q11 \n\t" 1519 "vqmovn.s32 d27, q12 \n\t" 1520 "vqmovun.s16 d28, q13 \n\t" 1521 "vst1.8 {d28}, [%[dst]] \n\t" 1522 : /*no output*/ 1523 : [src1] "r" (_src + i), 1524 [dst] "r" (_dst + i + 0), 1525 "w" (vscale), "w" (vshift) 1526 : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" 1527 ); 1528 } 1529 }) 1530 #else 1531 CVTS_FUNC(s16, u8, 16, 1532 float32x4_t vscale = vdupq_n_f32((f32)alpha); 1533 float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, 1534 { 1535 for (size_t i = 0; i < w; i += 8) 1536 { 1537 internal::prefetch(_src + i); 1538 int16x8_t vline = vld1q_s16(_src + i); 1539 int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); 1540 int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); 1541 float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); 1542 float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); 1543 vline1_f32 = vmulq_f32(vline1_f32, vscale); 1544 vline2_f32 = vmulq_f32(vline2_f32, vscale); 1545 vline1_f32 = vaddq_f32(vline1_f32, vshift); 1546 vline2_f32 = vaddq_f32(vline2_f32, vshift); 1547 vline1_s32 = vcvtq_s32_f32(vline1_f32); 1548 vline2_s32 = vcvtq_s32_f32(vline2_f32); 1549 int16x4_t vRes1 = vqmovn_s32(vline1_s32); 1550 int16x4_t vRes2 = vqmovn_s32(vline2_s32); 1551 uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2)); 1552 vst1_u8(_dst + i, vRes); 1553 } 1554 
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, s8, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.8 {d4-d5}, [%[src1]] \n\t"
            "vmovl.s16 q3, d4 \n\t"
            "vmovl.s16 q4, d5 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vcvt.f32.s32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vcvt.s32.f32 q12, q10 \n\t"
            "vqmovn.s32 d26, q11 \n\t"
            "vqmovn.s32 d27, q12 \n\t"
            "vqmovn.s16 d28, q13 \n\t"
            "vst1.8 {d28}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28"
        );
    }
})
#else
CVTS_FUNC(s16, s8, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]] \n\t"
            "vmovl.s16 q3, d4 \n\t"
            "vmovl.s16 q4, d5 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vcvt.f32.s32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vcvt.s32.f32 q12, q10 \n\t"
            "vqmovun.s32 d26, q11 \n\t"
            "vqmovun.s32 d27, q12 \n\t"
            "vst1.16 {d26-d27}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
#else
CVTS_FUNC(s16, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC1(s16, 16,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]] \n\t"
            "vmovl.s16 q3, d4 \n\t"
            "vmovl.s16 q4, d5 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vcvt.f32.s32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vcvt.s32.f32 q12, q10 \n\t"
            "vqmovn.s32 d26, q11 \n\t"
            "vqmovn.s32 d27, q12 \n\t"
            "vst1.16 {d26-d27}, [%[dst]] \n\t"
            : /*no output*/
            : [src] "r" (_src + i),
              [dst] "r" (_dst + i + 0),
              "w" (vshift), "w" (vscale)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27"
        );
    }
})
#else
CVTS_FUNC1(s16, 16,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int16x8_t vline = vld1q_s16(_src + i);
        int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline));
        int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline));
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s16, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.16 {d4-d5}, [%[src]] \n\t"
            "vmovl.s16 q3, d4 \n\t"
            "vmovl.s16 q4, d5 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vcvt.f32.s32 q6, q4 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vmul.f32 q8, q6, q0 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vadd.f32 q10, q8, q1 \n\t"
q1 \n\t" 1748 "vcvt.s32.f32 q11, q9 \n\t" 1749 "vcvt.s32.f32 q12, q10 \n\t" 1750 "vst1.32 {d22-d23}, [%[dst1]] \n\t" 1751 "vst1.32 {d24-d25}, [%[dst2]] \n\t" 1752 : /*no output*/ 1753 : [src] "r" (_src + i), 1754 [dst1] "r" (_dst + i + 0), 1755 [dst2] "r" (_dst + i + 4), 1756 "w" (vscale), "w" (vshift) 1757 : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" 1758 ); 1759 } 1760 }) 1761 #else 1762 CVTS_FUNC(s16, s32, 8, 1763 float32x4_t vscale = vdupq_n_f32((f32)alpha); 1764 float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, 1765 { 1766 for (size_t i = 0; i < w; i += 8) 1767 { 1768 internal::prefetch(_src + i); 1769 int16x8_t vline = vld1q_s16(_src + i); 1770 int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); 1771 int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); 1772 float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); 1773 float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); 1774 vline1_f32 = vmulq_f32(vline1_f32, vscale); 1775 vline2_f32 = vmulq_f32(vline2_f32, vscale); 1776 vline1_f32 = vaddq_f32(vline1_f32, vshift); 1777 vline2_f32 = vaddq_f32(vline2_f32, vshift); 1778 vline1_s32 = vcvtq_s32_f32(vline1_f32); 1779 vline2_s32 = vcvtq_s32_f32(vline2_f32); 1780 vst1q_s32(_dst + i + 0, vline1_s32); 1781 vst1q_s32(_dst + i + 4, vline2_s32); 1782 } 1783 }) 1784 #endif 1785 1786 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) 1787 CVTS_FUNC(s16, f32, 8, 1788 register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); 1789 register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, 1790 { 1791 for (size_t i = 0; i < w; i += 8) 1792 { 1793 internal::prefetch(_src + i); 1794 __asm__ ( 1795 "vld1.16 {d4-d5}, [%[src]] \n\t" 1796 "vmovl.s16 q3, d4 \n\t" 1797 "vmovl.s16 q4, d5 \n\t" 1798 "vcvt.f32.s32 q5, q3 \n\t" 1799 "vcvt.f32.s32 q6, q4 \n\t" 1800 "vmul.f32 q7, q5, q0 \n\t" 1801 "vmul.f32 q8, q6, q0 \n\t" 1802 "vadd.f32 q9, q7, q1 \n\t" 1803 "vadd.f32 q10, q8, q1 \n\t" 1804 "vst1.32 {d18-d19}, [%[dst1]] \n\t" 1805 "vst1.32 {d20-d21}, [%[dst2]] \n\t" 1806 : /*no output*/ 1807 : [src] "r" (_src + i), 1808 [dst1] "r" (_dst + i + 0), 1809 [dst2] "r" (_dst + i + 4), 1810 "w" (vscale), "w" (vshift) 1811 : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" 1812 ); 1813 } 1814 }) 1815 #else 1816 CVTS_FUNC(s16, f32, 8, 1817 float32x4_t vscale = vdupq_n_f32((f32)alpha); 1818 float32x4_t vshift = vdupq_n_f32((f32)beta);, 1819 { 1820 for (size_t i = 0; i < w; i += 8) 1821 { 1822 internal::prefetch(_src + i); 1823 int16x8_t vline = vld1q_s16(_src + i); 1824 int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); 1825 int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); 1826 float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); 1827 float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); 1828 vline1_f32 = vmulq_f32(vline1_f32, vscale); 1829 vline2_f32 = vmulq_f32(vline2_f32, vscale); 1830 vline1_f32 = vaddq_f32(vline1_f32, vshift); 1831 vline2_f32 = vaddq_f32(vline2_f32, vshift); 1832 vst1q_f32(_dst + i + 0, vline1_f32); 1833 vst1q_f32(_dst + i + 4, vline2_f32); 1834 } 1835 }) 1836 #endif 1837 1838 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) 1839 CVTS_FUNC(s32, u8, 8, 1840 register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); 1841 register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, 1842 { 1843 for (size_t i = 
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vcvt.f32.s32 q4, q2 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vmul.f32 q6, q4, q0 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vadd.f32 q8, q6, q1 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vcvt.s32.f32 q10, q8 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vqmovun.s32 d24, q10 \n\t"
            "vqmovun.s32 d25, q11 \n\t"
            "vqmovn.u16 d26, q12 \n\t"
            "vst1.8 {d26}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
        );
    }
})
#else
CVTS_FUNC(s32, u8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
        vst1_u8(_dst + i, vRes);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s32, s8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vcvt.f32.s32 q4, q2 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vmul.f32 q6, q4, q0 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vadd.f32 q8, q6, q1 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vcvt.s32.f32 q10, q8 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vqmovn.s32 d24, q10 \n\t"
            "vqmovn.s32 d25, q11 \n\t"
            "vqmovn.s16 d26, q12 \n\t"
            "vst1.8 {d26}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26"
        );
    }
})
#else
CVTS_FUNC(s32, s8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s32, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vcvt.f32.s32 q4, q2 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vmul.f32 q6, q4, q0 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vadd.f32 q8, q6, q1 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vcvt.s32.f32 q10, q8 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vqmovun.s32 d24, q10 \n\t"
            "vqmovun.s32 d25, q11 \n\t"
            "vst1.16 {d24-d25}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25"
        );
    }
})
#else
CVTS_FUNC(s32, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        int32x4_t vline1_s32 = vld1q_s32(_src + i + 0);
        int32x4_t vline2_s32 = vld1q_s32(_src + i + 4);
        float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32);
        float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline1_s32 = vcvtq_s32_f32(vline1_f32);
        vline2_s32 = vcvtq_s32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
        uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(s32, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vcvt.f32.s32 q4, q2 \n\t"
            "vcvt.f32.s32 q5, q3 \n\t"
            "vmul.f32 q6, q4, q0 \n\t"
            "vmul.f32 q7, q5, q0 \n\t"
            "vadd.f32 q8, q6, q1 \n\t"
            "vadd.f32 q9, q7, q1 \n\t"
            "vcvt.s32.f32 q10, q8 \n\t"
            "vcvt.s32.f32 q11, q9 \n\t"
            "vqmovn.s32 d24, q10 \n\t"
            "vqmovn.s32 d25, q11 \n\t"
            "vst1.8 {d24-d25}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
"d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" 2038 ); 2039 } 2040 }) 2041 #else 2042 CVTS_FUNC(s32, s16, 8, 2043 float32x4_t vscale = vdupq_n_f32((f32)alpha); 2044 float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, 2045 { 2046 for (size_t i = 0; i < w; i += 8) 2047 { 2048 internal::prefetch(_src + i); 2049 int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); 2050 int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); 2051 float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); 2052 float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); 2053 vline1_f32 = vmulq_f32(vline1_f32, vscale); 2054 vline2_f32 = vmulq_f32(vline2_f32, vscale); 2055 vline1_f32 = vaddq_f32(vline1_f32, vshift); 2056 vline2_f32 = vaddq_f32(vline2_f32, vshift); 2057 vline1_s32 = vcvtq_s32_f32(vline1_f32); 2058 vline2_s32 = vcvtq_s32_f32(vline2_f32); 2059 int16x4_t vRes1 = vqmovn_s32(vline1_s32); 2060 int16x4_t vRes2 = vqmovn_s32(vline2_s32); 2061 vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); 2062 } 2063 }) 2064 #endif 2065 2066 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) 2067 CVTS_FUNC1(s32, 8, 2068 register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); 2069 register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, 2070 { 2071 for (size_t i = 0; i < w; i += 8) 2072 { 2073 internal::prefetch(_src + i); 2074 __asm__ ( 2075 "vld1.32 {d4-d5}, [%[src1]] \n\t" 2076 "vld1.32 {d6-d7}, [%[src2]] \n\t" 2077 "vcvt.f32.s32 q4, q2 \n\t" 2078 "vcvt.f32.s32 q5, q3 \n\t" 2079 "vmul.f32 q6, q4, q0 \n\t" 2080 "vmul.f32 q7, q5, q0 \n\t" 2081 "vadd.f32 q8, q6, q1 \n\t" 2082 "vadd.f32 q9, q7, q1 \n\t" 2083 "vcvt.s32.f32 q10, q8 \n\t" 2084 "vcvt.s32.f32 q11, q9 \n\t" 2085 "vst1.32 {d20-d21}, [%[dst1]] \n\t" 2086 "vst1.32 {d22-d23}, [%[dst2]] \n\t" 2087 : /*no output*/ 2088 : [src1] "r" (_src + i + 0), 2089 [src2] "r" (_src + i + 4), 2090 [dst1] "r" (_dst + i + 0), 2091 [dst2] "r" (_dst + i + 4), 2092 "w" (vscale), "w" (vshift) 2093 : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" 2094 ); 2095 } 2096 }) 2097 #else 2098 CVTS_FUNC1(s32, 8, 2099 float32x4_t vscale = vdupq_n_f32((f32)alpha); 2100 float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, 2101 { 2102 for (size_t i = 0; i < w; i += 8) 2103 { 2104 internal::prefetch(_src + i); 2105 int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); 2106 int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); 2107 float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); 2108 float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); 2109 vline1_f32 = vmulq_f32(vline1_f32, vscale); 2110 vline2_f32 = vmulq_f32(vline2_f32, vscale); 2111 vline1_f32 = vaddq_f32(vline1_f32, vshift); 2112 vline2_f32 = vaddq_f32(vline2_f32, vshift); 2113 vline1_s32 = vcvtq_s32_f32(vline1_f32); 2114 vline2_s32 = vcvtq_s32_f32(vline2_f32); 2115 vst1q_s32(_dst + i + 0, vline1_s32); 2116 vst1q_s32(_dst + i + 4, vline2_s32); 2117 } 2118 }) 2119 #endif 2120 2121 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) 2122 CVTS_FUNC(s32, f32, 8, 2123 register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); 2124 register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, 2125 { 2126 for (size_t i = 0; i < w; i += 8) 2127 { 2128 internal::prefetch(_src + i); 2129 __asm__ ( 2130 "vld1.32 {d4-d5}, [%[src1]] \n\t" 2131 "vld1.32 {d6-d7}, [%[src2]] \n\t" 2132 "vcvt.f32.s32 q4, q2 
\n\t" 2133 "vcvt.f32.s32 q5, q3 \n\t" 2134 "vmul.f32 q6, q4, q0 \n\t" 2135 "vmul.f32 q7, q5, q0 \n\t" 2136 "vadd.f32 q8, q6, q1 \n\t" 2137 "vadd.f32 q9, q7, q1 \n\t" 2138 "vst1.32 {d16-d17}, [%[dst1]] \n\t" 2139 "vst1.32 {d18-d19}, [%[dst2]] \n\t" 2140 : /*no output*/ 2141 : [src1] "r" (_src + i), 2142 [src2] "r" (_src + i + 4), 2143 [dst1] "r" (_dst + i), 2144 [dst2] "r" (_dst + i + 4), 2145 "w" (vscale), "w" (vshift) 2146 : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" 2147 ); 2148 } 2149 }) 2150 #else 2151 CVTS_FUNC(s32, f32, 8, 2152 float32x4_t vscale = vdupq_n_f32((f32)alpha); 2153 float32x4_t vshift = vdupq_n_f32((f32)beta);, 2154 { 2155 for (size_t i = 0; i < w; i += 8) 2156 { 2157 internal::prefetch(_src + i); 2158 int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); 2159 int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); 2160 float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); 2161 float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); 2162 vline1_f32 = vmulq_f32(vline1_f32, vscale); 2163 vline2_f32 = vmulq_f32(vline2_f32, vscale); 2164 vline1_f32 = vaddq_f32(vline1_f32, vshift); 2165 vline2_f32 = vaddq_f32(vline2_f32, vshift); 2166 vst1q_f32(_dst + i + 0, vline1_f32); 2167 vst1q_f32(_dst + i + 4, vline2_f32); 2168 } 2169 }) 2170 #endif 2171 2172 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) 2173 CVTS_FUNC(f32, u8, 8, 2174 register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)((1 << 16)*alpha)); 2175 register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)((1 << 16)*beta)); 2176 register uint32x4_t vmask asm ("q2") = vdupq_n_u32(1<<16);, 2177 { 2178 for (size_t i = 0; i < w; i += 8) 2179 { 2180 internal::prefetch(_src + i); 2181 __asm__ ( 2182 "vld1.32 {d6-d7}, [%[src1]] \n\t" 2183 "vld1.32 {d8-d9}, [%[src2]] \n\t" 2184 "vmul.f32 q5, q3, q0 \n\t" 2185 "vmul.f32 q6, q4, q0 \n\t" 2186 "vadd.f32 q7, q5, q1 \n\t" 2187 "vadd.f32 q8, q6, q1 \n\t" 2188 "vcvt.u32.f32 q9, q7 \n\t" 2189 "vcvt.u32.f32 q10, q8 \n\t" 2190 "vbic q11, q2, q6 \n\t" 2191 "vbic q12, q2, q7 \n\t" 2192 "vshr.u32 q13, q11, #16 \n\t" 2193 "vshr.u32 q14, q12, #16 \n\t" 2194 "vqsub.u32 q7, q9, q13 \n\t" 2195 "vqsub.u32 q8, q10, q14 \n\t" 2196 "vqrshrn.u32 d22, q7, #16 \n\t" 2197 "vqrshrn.u32 d23, q8, #16 \n\t" 2198 "vqmovn.u16 d30, q11 \n\t" 2199 "vst1.8 {d30}, [%[dst]] \n\t" 2200 : /*no output*/ 2201 : [src1] "r" (_src + i + 0), 2202 [src2] "r" (_src + i + 4), 2203 [dst] "r" (_dst + i), 2204 "w" (vscale), "w" (vshift), "w" (vmask) 2205 : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30" 2206 ); 2207 } 2208 }) 2209 #else 2210 CVTS_FUNC(f32, u8, 8, 2211 float32x4_t vscale = vdupq_n_f32((f32)((1 << 16)*alpha)); 2212 float32x4_t vshift = vdupq_n_f32((f32)((1 << 16)*beta)); 2213 uint32x4_t vmask = vdupq_n_u32(1<<16);, 2214 { 2215 for (size_t i = 0; i < w; i += 8) 2216 { 2217 internal::prefetch(_src + i); 2218 float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); 2219 float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); 2220 2221 vline1_f32 = vmulq_f32(vline1_f32, vscale); 2222 vline2_f32 = vmulq_f32(vline2_f32, vscale); 2223 float32x4_t vline1Shifted_f32 = vaddq_f32(vline1_f32, vshift); 2224 float32x4_t vline2Shifted_f32 = vaddq_f32(vline2_f32, vshift); 2225 uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1Shifted_f32); 2226 uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2Shifted_f32); 2227 uint32x4_t vline1Mask = vbicq_u32(vmask, 
        uint32x4_t vline2Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline1Shifted_f32));
        vline1Mask = vshrq_n_u32(vline1Mask, 16);
        vline2Mask = vshrq_n_u32(vline2Mask, 16);
        vline1_u32 = vqsubq_u32(vline1_u32, vline1Mask);
        vline2_u32 = vqsubq_u32(vline2_u32, vline2Mask);
        uint16x4_t vRes1 = vqrshrn_n_u32(vline1_u32, 16);
        uint16x4_t vRes2 = vqrshrn_n_u32(vline2_u32, 16);
        uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));

        vst1_u8(_dst + i, vRes);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(f32, s8, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vmul.f32 q4, q2, q0 \n\t"
            "vmul.f32 q5, q3, q0 \n\t"
            "vadd.f32 q6, q4, q1 \n\t"
            "vadd.f32 q7, q5, q1 \n\t"
            "vcvt.s32.f32 q8, q6 \n\t"
            "vcvt.s32.f32 q9, q7 \n\t"
            "vqmovn.s32 d14, q8 \n\t"
            "vqmovn.s32 d15, q9 \n\t"
            "vqmovn.s16 d16, q7 \n\t"
            "vst1.8 {d16}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
CVTS_FUNC(f32, s8, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
        vst1_s8(_dst + i, vRes);
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(f32, u16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vmul.f32 q4, q2, q0 \n\t"
            "vmul.f32 q5, q3, q0 \n\t"
            "vadd.f32 q6, q4, q1 \n\t"
            "vadd.f32 q7, q5, q1 \n\t"
            "vcvt.u32.f32 q8, q6 \n\t"
            "vcvt.u32.f32 q9, q7 \n\t"
            "vqmovn.u32 d8, q8 \n\t"
            "vqmovn.u32 d9, q9 \n\t"
            "vst1.16 {d8-d9}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
CVTS_FUNC(f32, u16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1_f32);
        uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2_f32);
        uint16x4_t vRes1 = vqmovn_u32(vline1_u32);
        uint16x4_t vRes2 = vqmovn_u32(vline2_u32);
        vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(f32, s16, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vmul.f32 q4, q2, q0 \n\t"
            "vmul.f32 q5, q3, q0 \n\t"
            "vadd.f32 q6, q4, q1 \n\t"
            "vadd.f32 q7, q5, q1 \n\t"
            "vcvt.s32.f32 q8, q6 \n\t"
            "vcvt.s32.f32 q9, q7 \n\t"
            "vqmovn.s32 d8, q8 \n\t"
            "vqmovn.s32 d9, q9 \n\t"
            "vst1.16 {d8-d9}, [%[dst]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i + 0),
              [src2] "r" (_src + i + 4),
              [dst] "r" (_dst + i),
              "w" (vscale), "w" (vshift)
            : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
        );
    }
})
#else
CVTS_FUNC(f32, s16, 8,
    float32x4_t vscale = vdupq_n_f32((f32)alpha);
    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        float32x4_t vline1_f32 = vld1q_f32(_src + i + 0);
        float32x4_t vline2_f32 = vld1q_f32(_src + i + 4);
        vline1_f32 = vmulq_f32(vline1_f32, vscale);
        vline2_f32 = vmulq_f32(vline2_f32, vscale);
        vline1_f32 = vaddq_f32(vline1_f32, vshift);
        vline2_f32 = vaddq_f32(vline2_f32, vshift);
        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
        int16x4_t vRes1 = vqmovn_s32(vline1_s32);
        int16x4_t vRes2 = vqmovn_s32(vline2_s32);
        vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
    }
})
#endif

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
CVTS_FUNC(f32, s32, 8,
    register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha);
    register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);,
{
    for (size_t i = 0; i < w; i += 8)
    {
        internal::prefetch(_src + i);
        __asm__ (
            "vld1.32 {d4-d5}, [%[src1]] \n\t"
            "vld1.32 {d6-d7}, [%[src2]] \n\t"
            "vmul.f32 q4, q2, q0 \n\t"
            "vmul.f32 q5, q3, q0 \n\t"
            "vadd.f32 q6, q4, q1 \n\t"
            "vadd.f32 q7, q5, q1 \n\t"
            "vcvt.s32.f32 q4, q6 \n\t"
            "vcvt.s32.f32 q5, q7 \n\t"
            "vst1.32 {d8-d9}, [%[dst1]] \n\t"
            "vst1.32 {d10-d11}, [%[dst2]] \n\t"
            : /*no output*/
            : [src1] "r" (_src + i),
              [src2] "r" (_src + i + 4),
              [dst1] "r" (_dst + i),
              [dst2] "r" (_dst + i + 4),
              "w" (vscale), "w" (vshift)
(vscale), "w" (vshift) 2425 : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15" 2426 ); 2427 } 2428 }) 2429 #else 2430 CVTS_FUNC(f32, s32, 8, 2431 float32x4_t vscale = vdupq_n_f32((f32)alpha); 2432 float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, 2433 { 2434 for (size_t i = 0; i < w; i += 8) 2435 { 2436 internal::prefetch(_src + i); 2437 float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); 2438 float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); 2439 vline1_f32 = vmulq_f32(vline1_f32, vscale); 2440 vline2_f32 = vmulq_f32(vline2_f32, vscale); 2441 vline1_f32 = vaddq_f32(vline1_f32, vshift); 2442 vline2_f32 = vaddq_f32(vline2_f32, vshift); 2443 int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); 2444 int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); 2445 vst1q_s32(_dst + i + 0, vline1_s32); 2446 vst1q_s32(_dst + i + 4, vline2_s32); 2447 } 2448 }) 2449 #endif 2450 2451 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) 2452 CVTS_FUNC1(f32, 8, 2453 register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); 2454 register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, 2455 { 2456 for (size_t i = 0; i < w; i += 8) 2457 { 2458 internal::prefetch(_src + i); 2459 __asm__ ( 2460 "vld1.32 {d4-d5}, [%[src1]] \n\t" 2461 "vld1.32 {d6-d7}, [%[src2]] \n\t" 2462 "vmul.f32 q4, q2, q0 \n\t" 2463 "vmul.f32 q5, q3, q0 \n\t" 2464 "vadd.f32 q6, q4, q1 \n\t" 2465 "vadd.f32 q7, q5, q1 \n\t" 2466 "vst1.32 {d12-d13}, [%[dst1]] \n\t" 2467 "vst1.32 {d14-d15}, [%[dst2]] \n\t" 2468 : /*no output*/ 2469 : [src1] "r" (_src + i + 0), 2470 [src2] "r" (_src + i + 4), 2471 [dst1] "r" (_dst + i + 0), 2472 [dst2] "r" (_dst + i + 4), 2473 "w" (vscale), "w" (vshift) 2474 : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" 2475 ); 2476 } 2477 }) 2478 #else 2479 CVTS_FUNC1(f32, 8, 2480 float32x4_t vscale = vdupq_n_f32((f32)alpha); 2481 float32x4_t vshift = vdupq_n_f32((f32)beta);, 2482 { 2483 for (size_t i = 0; i < w; i += 8) 2484 { 2485 internal::prefetch(_src + i); 2486 float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); 2487 float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); 2488 vline1_f32 = vmulq_f32(vline1_f32, vscale); 2489 vline2_f32 = vmulq_f32(vline2_f32, vscale); 2490 vline1_f32 = vaddq_f32(vline1_f32, vshift); 2491 vline2_f32 = vaddq_f32(vline2_f32, vshift); 2492 vst1q_f32(_dst + i + 0, vline1_f32); 2493 vst1q_f32(_dst + i + 4, vline2_f32); 2494 } 2495 }) 2496 #endif 2497 2498 } // namespace CAROTENE_NS 2499