/dports/graphics/realsr-ncnn-vulkan/realsr-ncnn-vulkan-20210210/src/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8_fp16s.h | 274 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 283 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 289 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1150 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1158 float16x8_t _tmp024a = vaddq_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1159 float16x8_t _tmp135a = vsubq_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1412 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1423 … float16x8_t _r0tm5 = vfmsq_n_f16(vfmaq_n_f16(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2213 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 2219 float16x8_t _tmp02a = vaddq_f16(_tmp01, _tmp02); in conv3x3s1_winograd42_pack8_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack4_bf16s.h | 170 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 179 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 185 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1760 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1768 float32x4_t _tmp024a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_bf16s_neon() 1769 float32x4_t _tmp135a = vsubq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_bf16s_neon() 1921 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1932 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3436 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 3442 float32x4_t _tmp02a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd42_pack4_bf16s_neon() [all …]
|
H A D | convolution_3x3_pack8to4_fp16s.h | 292 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 301 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 307 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 319 …float16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.… in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 331 …float16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5… in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1152 float16x4_t _tmp01 = vld1_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1160 float16x4_t _tmp024a = vadd_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1161 float16x4_t _tmp135a = vsub_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack4.h | 401 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_neon() local 410 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_neon() 416 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_neon() 1991 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_neon() local 1999 float32x4_t _tmp024a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_neon() 2000 float32x4_t _tmp135a = vsubq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_neon() 2381 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_neon() local 2392 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_neon() 3896 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_neon() local 3902 float32x4_t _tmp02a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd42_pack4_neon() [all …]
|
H A D | convolution_3x3_pack4_fp16s.h | 392 float16x4_t _tmp01 = vld1_f16(tmp[m][1]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 401 … float16x4_t _r0tm7 = vfma_n_f16(vsub_f16(_tmp07, _tmp01), vsub_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 407 float16x4_t _tmp12b = vfms_n_f16(vadd_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 419 … float16x4_t _tmp34b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 431 … float16x4_t _tmp56b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1096 float16x4_t _tmp01 = vld1_f16(tmp[m][1]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 1104 float16x4_t _tmp024a = vadd_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1105 float16x4_t _tmp135a = vsub_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_fp16sa_neon()
|
/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8_fp16s.h | 274 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 283 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 289 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1150 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1158 float16x8_t _tmp024a = vaddq_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1159 float16x8_t _tmp135a = vsubq_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1412 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1423 … float16x8_t _r0tm5 = vfmsq_n_f16(vfmaq_n_f16(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2213 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 2219 float16x8_t _tmp02a = vaddq_f16(_tmp01, _tmp02); in conv3x3s1_winograd42_pack8_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack4_bf16s.h | 170 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 179 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 185 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1760 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1768 float32x4_t _tmp024a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_bf16s_neon() 1769 float32x4_t _tmp135a = vsubq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_bf16s_neon() 1921 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1932 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3436 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 3442 float32x4_t _tmp02a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd42_pack4_bf16s_neon() [all …]
|
H A D | convolution_3x3_pack8to4_fp16s.h | 292 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 301 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 307 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 319 …float16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.… in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 331 …float16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5… in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1152 float16x4_t _tmp01 = vld1_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1160 float16x4_t _tmp024a = vadd_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1161 float16x4_t _tmp135a = vsub_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack4.h | 401 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_neon() local 410 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_neon() 416 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_neon() 1991 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_neon() local 1999 float32x4_t _tmp024a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_neon() 2000 float32x4_t _tmp135a = vsubq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_neon() 2381 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_neon() local 2392 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_neon() 3896 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_neon() local 3902 float32x4_t _tmp02a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd42_pack4_neon() [all …]
|
H A D | convolution_3x3_pack4_fp16s.h | 392 float16x4_t _tmp01 = vld1_f16(tmp[m][1]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 401 … float16x4_t _r0tm7 = vfma_n_f16(vsub_f16(_tmp07, _tmp01), vsub_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 407 float16x4_t _tmp12b = vfms_n_f16(vadd_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 419 … float16x4_t _tmp34b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 431 … float16x4_t _tmp56b = vfma_n_f16(vfms_n_f16(vmul_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1096 float16x4_t _tmp01 = vld1_f16(tmp[m][1]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 1104 float16x4_t _tmp024a = vadd_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1105 float16x4_t _tmp135a = vsub_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_fp16sa_neon()
|
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8_fp16s.h | 274 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 283 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 289 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1150 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1158 float16x8_t _tmp024a = vaddq_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1159 float16x8_t _tmp135a = vsubq_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1412 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1423 … float16x8_t _r0tm5 = vfmsq_n_f16(vfmaq_n_f16(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2213 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 2219 float16x8_t _tmp02a = vaddq_f16(_tmp01, _tmp02); in conv3x3s1_winograd42_pack8_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack4_bf16s.h | 170 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 179 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 185 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1760 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1768 float32x4_t _tmp024a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_bf16s_neon() 1769 float32x4_t _tmp135a = vsubq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_bf16s_neon() 1921 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1932 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3436 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 3442 float32x4_t _tmp02a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd42_pack4_bf16s_neon() [all …]
|
H A D | convolution_3x3_pack8to4_fp16s.h | 292 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 301 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 307 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 319 …float16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.… in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 331 …float16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5… in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1152 float16x4_t _tmp01 = vld1_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1160 float16x4_t _tmp024a = vadd_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1161 float16x4_t _tmp135a = vsub_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack4.h | 401 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_neon() local 410 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_neon() 416 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_neon() 1991 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_neon() local 1999 float32x4_t _tmp024a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_neon() 2000 float32x4_t _tmp135a = vsubq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_neon() 2381 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_neon() local 2392 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_neon() 3896 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_neon() local 3902 float32x4_t _tmp02a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd42_pack4_neon() [all …]
|
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8_fp16s.h | 274 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 283 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 289 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1150 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1158 float16x8_t _tmp024a = vaddq_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1159 float16x8_t _tmp135a = vsubq_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1412 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1423 … float16x8_t _r0tm5 = vfmsq_n_f16(vfmaq_n_f16(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2213 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 2219 float16x8_t _tmp02a = vaddq_f16(_tmp01, _tmp02); in conv3x3s1_winograd42_pack8_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack4_bf16s.h | 170 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 179 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 185 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1760 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1768 float32x4_t _tmp024a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_bf16s_neon() 1769 float32x4_t _tmp135a = vsubq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_bf16s_neon() 1921 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1932 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3436 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 3442 float32x4_t _tmp02a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd42_pack4_bf16s_neon() [all …]
|
H A D | convolution_3x3_pack8to4_fp16s.h | 292 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 301 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 307 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 319 …float16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.… in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 331 …float16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5… in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1152 float16x4_t _tmp01 = vld1_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1160 float16x4_t _tmp024a = vadd_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1161 float16x4_t _tmp135a = vsub_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack4.h | 401 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_neon() local 410 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_neon() 416 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_neon() 1991 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_neon() local 1999 float32x4_t _tmp024a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_neon() 2000 float32x4_t _tmp135a = vsubq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_neon() 2381 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_neon() local 2392 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_neon() 3896 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_neon() local 3902 float32x4_t _tmp02a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd42_pack4_neon() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/arm/ |
H A D | convolution_3x3_pack8_fp16s.h | 274 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 283 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 289 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1150 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1158 float16x8_t _tmp024a = vaddq_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1159 float16x8_t _tmp135a = vsubq_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1412 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1423 … float16x8_t _r0tm5 = vfmsq_n_f16(vfmaq_n_f16(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2213 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 2219 float16x8_t _tmp02a = vaddq_f16(_tmp01, _tmp02); in conv3x3s1_winograd42_pack8_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack4_bf16s.h | 170 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 179 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 185 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1760 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1768 float32x4_t _tmp024a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_bf16s_neon() 1769 float32x4_t _tmp135a = vsubq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_bf16s_neon() 1921 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1932 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3436 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 3442 float32x4_t _tmp02a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd42_pack4_bf16s_neon() [all …]
|
H A D | convolution_3x3_pack8to4_fp16s.h | 292 float16x8_t _tmp01 = vld1q_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 301 … float16x8_t _r0tm7 = vfmaq_n_f16(vsubq_f16(_tmp07, _tmp01), vsubq_f16(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 307 float16x8_t _tmp12b = vfmsq_n_f16(vaddq_f16(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 319 …float16x8_t _tmp34b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.… in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 331 …float16x8_t _tmp56b = vfmaq_n_f16(vfmsq_n_f16(vmulq_n_f16(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5… in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1152 float16x4_t _tmp01 = vld1_f16(tmp[m][1]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1160 float16x4_t _tmp024a = vadd_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1161 float16x4_t _tmp135a = vsub_f16(_tmp01, _tmp02); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack4.h | 401 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_neon() local 410 … float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f); in conv3x3s1_winograd64_pack4_neon() 416 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f); in conv3x3s1_winograd64_pack4_neon() 1991 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd64_pack4_neon() local 1999 float32x4_t _tmp024a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_neon() 2000 float32x4_t _tmp135a = vsubq_f32(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_neon() 2381 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_neon() local 2392 … float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f); in conv3x3s1_winograd42_pack4_neon() 3896 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]); in conv3x3s1_winograd42_pack4_neon() local 3902 float32x4_t _tmp02a = vaddq_f32(_tmp01, _tmp02); in conv3x3s1_winograd42_pack4_neon() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/riscv/ |
H A D | convolution_3x3_packn.h | 236 vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); in conv3x3s1_winograd64_packn_rvv() local 245 …vfloat32m1_t _r0tm7 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, vfsub_vv_f32m1(_t… in conv3x3s1_winograd64_packn_rvv() 248 … vfloat32m1_t _tmp12b = vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); in conv3x3s1_winograd64_packn_rvv() 703 vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); in conv3x3s1_winograd64_packn_rvv() local 711 vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); in conv3x3s1_winograd64_packn_rvv() 712 vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); in conv3x3s1_winograd64_packn_rvv() 931 vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); in conv3x3s1_winograd42_packn_rvv() local 942 … vfloat32m1_t _r0tm5 = vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); in conv3x3s1_winograd42_packn_rvv() 1361 vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); in conv3x3s1_winograd42_packn_rvv() local 1367 vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); in conv3x3s1_winograd42_packn_rvv() [all …]
|
H A D | convolution_3x3_packn_fp16s.h | 236 vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); in conv3x3s1_winograd64_packn_fp16sa_rvv() local 245 …vfloat16m1_t _r0tm7 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f, vfsub_vv_f16m1(_t… in conv3x3s1_winograd64_packn_fp16sa_rvv() 248 … vfloat16m1_t _tmp12b = vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); in conv3x3s1_winograd64_packn_fp16sa_rvv() 703 vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); in conv3x3s1_winograd64_packn_fp16sa_rvv() local 711 vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); in conv3x3s1_winograd64_packn_fp16sa_rvv() 712 vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); in conv3x3s1_winograd64_packn_fp16sa_rvv() 931 vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); in conv3x3s1_winograd42_packn_fp16sa_rvv() local 942 … vfloat16m1_t _r0tm5 = vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); in conv3x3s1_winograd42_packn_fp16sa_rvv() 1361 vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); in conv3x3s1_winograd42_packn_fp16sa_rvv() local 1367 vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); in conv3x3s1_winograd42_packn_fp16sa_rvv() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/mips/ |
H A D | convolution_3x3_pack4.h | 239 v4f32 _tmp01 = (v4f32)__msa_ld_w(tmp[m][1], 0); in conv3x3s1_winograd64_pack4_msa() local 248 … v4f32 _r0tm7 = __msa_fmadd_w(__msa_fsub_w(_tmp07, _tmp01), _v5_25, __msa_fsub_w(_tmp03, _tmp05)); in conv3x3s1_winograd64_pack4_msa() 251 … v4f32 _tmp12b = __msa_fmadd_w(__msa_fadd_w(_tmp01, _tmp05), _vm4_25, _tmp03); in conv3x3s1_winograd64_pack4_msa() 845 v4f32 _tmp01 = (v4f32)__msa_ld_w(tmp[m][1], 0); in conv3x3s1_winograd64_pack4_msa() local 853 v4f32 _tmp024a = __msa_fadd_w(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_msa() 854 v4f32 _tmp135a = __msa_fsub_w(_tmp01, _tmp02); in conv3x3s1_winograd64_pack4_msa() 1073 v4f32 _tmp01 = (v4f32)__msa_ld_w(tmp[m][1], 0); in conv3x3s1_winograd42_pack4_msa() local 1084 … v4f32 _r0tm5 = __msa_fmadd_w(__msa_fmadd_w(_tmp05, _v4, _tmp01), _vm5, _tmp03); in conv3x3s1_winograd42_pack4_msa() 1640 v4f32 _tmp01 = (v4f32)__msa_ld_w(tmp[m][1], 0); in conv3x3s1_winograd42_pack4_msa() local 1646 v4f32 _tmp02a = __msa_fadd_w(_tmp01, _tmp02); in conv3x3s1_winograd42_pack4_msa() [all …]
|