/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8to4_fp16s.h | 296 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 301 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 307 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 319 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 331 …oat16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1156 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1172 float16x4_t _tmp024c = vadd_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1173 float16x4_t _tmp135c = vsub_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack8_fp16s.h | 278 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 283 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 289 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 301 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1154 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1170 float16x8_t _tmp024c = vaddq_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1171 float16x8_t _tmp135c = vsubq_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1416 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1423 … float16x8_t _r0tm5 = vfmsq_n_f16(vfmaq_n_f16(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2217 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local [all …]
|
H A D | convolution_3x3_pack4_fp16s.h | 396 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 401 … float16x4_t _r0tm7 = vfma_n_f16(vsub_f16(_tmp07, _tmp01), vsub_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 407 float16x4_t _tmp12b = vfms_n_f16(vadd_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 419 … float16x4_t _tmp34b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 431 … float16x4_t _tmp56b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1100 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 1116 float16x4_t _tmp024c = vadd_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1117 float16x4_t _tmp135c = vsub_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_fp16sa_neon()
|
H A D | convolution_3x3_pack4_bf16s.h | 174 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 179 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 185 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 197 …oat32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1764 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1780 float32x4_t _tmp024c = vaddq_f32(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_bf16s_neon() 1781 float32x4_t _tmp135c = vsubq_f32(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_bf16s_neon() 1925 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1932 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3440 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd42_pack4_bf16s_neon() local [all …]
|
H A D | convolution_3x3_pack8to1_fp16s.h | 290 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() local 295 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() 301 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() 313 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() 325 …oat16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon()
|
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8to4_fp16s.h | 296 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 301 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 307 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 319 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 331 …oat16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1156 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1172 float16x4_t _tmp024c = vadd_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1173 float16x4_t _tmp135c = vsub_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack8_fp16s.h | 278 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 283 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 289 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 301 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1154 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1170 float16x8_t _tmp024c = vaddq_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1171 float16x8_t _tmp135c = vsubq_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1416 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1423 … float16x8_t _r0tm5 = vfmsq_n_f16(vfmaq_n_f16(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2217 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local [all …]
|
H A D | convolution_3x3_pack4_fp16s.h | 396 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 401 … float16x4_t _r0tm7 = vfma_n_f16(vsub_f16(_tmp07, _tmp01), vsub_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 407 float16x4_t _tmp12b = vfms_n_f16(vadd_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 419 … float16x4_t _tmp34b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 431 … float16x4_t _tmp56b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1100 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 1116 float16x4_t _tmp024c = vadd_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1117 float16x4_t _tmp135c = vsub_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_fp16sa_neon()
|
H A D | convolution_3x3_pack4_bf16s.h | 174 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 179 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 185 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 197 …oat32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1764 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1780 float32x4_t _tmp024c = vaddq_f32(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_bf16s_neon() 1781 float32x4_t _tmp135c = vsubq_f32(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_bf16s_neon() 1925 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1932 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3440 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd42_pack4_bf16s_neon() local [all …]
|
H A D | convolution_3x3_pack8to1_fp16s.h | 290 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() local 295 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() 301 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() 313 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() 325 …oat16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon()
|
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8to4_fp16s.h | 296 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 301 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 307 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 319 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 331 …oat16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1156 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1172 float16x4_t _tmp024c = vadd_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1173 float16x4_t _tmp135c = vsub_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack8_fp16s.h | 278 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 283 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 289 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 301 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1154 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1170 float16x8_t _tmp024c = vaddq_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1171 float16x8_t _tmp135c = vsubq_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1416 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1423 … float16x8_t _r0tm5 = vfmsq_n_f16(vfmaq_n_f16(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2217 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local [all …]
|
H A D | convolution_3x3_pack4_fp16s.h | 396 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 401 … float16x4_t _r0tm7 = vfma_n_f16(vsub_f16(_tmp07, _tmp01), vsub_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 407 float16x4_t _tmp12b = vfms_n_f16(vadd_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 419 … float16x4_t _tmp34b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 431 … float16x4_t _tmp56b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1100 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 1116 float16x4_t _tmp024c = vadd_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1117 float16x4_t _tmp135c = vsub_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_fp16sa_neon()
|
H A D | convolution_3x3_pack4_bf16s.h | 174 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 179 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 185 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 197 …oat32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1764 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1780 float32x4_t _tmp024c = vaddq_f32(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_bf16s_neon() 1781 float32x4_t _tmp135c = vsubq_f32(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_bf16s_neon() 1925 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1932 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3440 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd42_pack4_bf16s_neon() local [all …]
|
H A D | convolution_3x3_pack8to1_fp16s.h | 290 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() local 295 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() 301 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() 313 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() 325 …oat16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon()
|
/dports/misc/ncnn/ncnn-20211208/src/layer/arm/ |
H A D | convolution_3x3_pack8to4_fp16s.h | 296 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 301 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 307 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 319 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 331 …oat16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1156 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1172 float16x4_t _tmp024c = vadd_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1173 float16x4_t _tmp135c = vsub_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack8_fp16s.h | 278 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 283 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 289 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 301 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1154 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1170 float16x8_t _tmp024c = vaddq_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1171 float16x8_t _tmp135c = vsubq_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1416 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1423 … float16x8_t _r0tm5 = vfmsq_n_f16(vfmaq_n_f16(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2217 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local [all …]
|
H A D | convolution_3x3_pack4_fp16s.h | 396 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 401 … float16x4_t _r0tm7 = vfma_n_f16(vsub_f16(_tmp07, _tmp01), vsub_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 407 float16x4_t _tmp12b = vfms_n_f16(vadd_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 419 … float16x4_t _tmp34b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 431 … float16x4_t _tmp56b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1100 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 1116 float16x4_t _tmp024c = vadd_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1117 float16x4_t _tmp135c = vsub_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_fp16sa_neon()
|
H A D | convolution_3x3_pack4_bf16s.h | 174 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 179 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 185 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 197 …oat32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1764 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1780 float32x4_t _tmp024c = vaddq_f32(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_bf16s_neon() 1781 float32x4_t _tmp135c = vsubq_f32(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_bf16s_neon() 1925 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1932 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3440 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd42_pack4_bf16s_neon() local [all …]
|
H A D | convolution_3x3_pack8to1_fp16s.h | 290 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() local 295 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() 301 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() 313 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon() 325 …oat16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack8to1_fp16sa_neon()
|
/dports/graphics/realsr-ncnn-vulkan/realsr-ncnn-vulkan-20210210/src/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8to4_fp16s.h | 296 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 301 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 307 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 319 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 331 …oat16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1156 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1172 float16x4_t _tmp024c = vadd_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1173 float16x4_t _tmp135c = vsub_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack8_fp16s.h | 278 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 283 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 289 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 301 …oat16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1154 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1170 float16x8_t _tmp024c = vaddq_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1171 float16x8_t _tmp135c = vsubq_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1416 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1423 … float16x8_t _r0tm5 = vfmsq_n_f16(vfmaq_n_f16(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2217 float16x8_t _tmp05 = vld1q_f16(tmp[m][5]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local [all …]
|
H A D | convolution_3x3_pack4_fp16s.h | 396 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 401 … float16x4_t _r0tm7 = vfma_n_f16(vsub_f16(_tmp07, _tmp01), vsub_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 407 float16x4_t _tmp12b = vfms_n_f16(vadd_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 419 … float16x4_t _tmp34b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 431 … float16x4_t _tmp56b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1100 float16x4_t _tmp05 = vld1_f16(tmp[m][5]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 1116 float16x4_t _tmp024c = vadd_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1117 float16x4_t _tmp135c = vsub_f16(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_fp16sa_neon()
|
H A D | convolution_3x3_pack4_bf16s.h | 174 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 179 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 185 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 197 …oat32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1764 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1780 float32x4_t _tmp024c = vaddq_f32(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_bf16s_neon() 1781 float32x4_t _tmp135c = vsubq_f32(_tmp05, _tmp06); in conv3x3s1_winograd64_pack4_bf16s_neon() 1925 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1932 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3440 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]); in conv3x3s1_winograd42_pack4_bf16s_neon() local [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/riscv/ |
H A D | convolution_3x3_packn.h | 240 vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); in conv3x3s1_winograd64_packn_rvv() local 245 …vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, vfsub_vv_f32m1(_tmp03, _tmp05, vl), vl); in conv3x3s1_winograd64_packn_rvv() 248 … vfloat32m1_t _tmp12b = vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); in conv3x3s1_winograd64_packn_rvv() 254 …cc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, _tmp05, vl); in conv3x3s1_winograd64_packn_rvv() 707 vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); in conv3x3s1_winograd64_packn_rvv() local 717 vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_tmp05, _tmp06, vl); in conv3x3s1_winograd64_packn_rvv() 718 vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_tmp05, _tmp06, vl); in conv3x3s1_winograd64_packn_rvv() 935 vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); in conv3x3s1_winograd42_packn_rvv() local 942 … vfloat32m1_t _r0tm5 = vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); in conv3x3s1_winograd42_packn_rvv() 1365 vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); in conv3x3s1_winograd42_packn_rvv() local [all …]
|