/dports/graphics/realsr-ncnn-vulkan/realsr-ncnn-vulkan-20210210/src/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8_fp16s.h | 277 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 288 float16x8_t _tmp12a = vfmsq_n_f16(vaddq_f16(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 312 … float16x8_t _tmp56a = vfmaq_n_f16(_tmp06, vfmsq_n_f16(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1153 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1164 float16x8_t _tmp024b = vaddq_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1165 float16x8_t _tmp135b = vsubq_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1415 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1418 … float16x8_t _r0tm0 = vfmsq_n_f16(vfmaq_n_f16(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2216 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 2222 float16x8_t _tmp02b = vaddq_f16(_tmp03, _tmp04); in conv3x3s1_winograd42_pack8_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack4_bf16s.h | 173 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 184 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 208 … float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1763 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1774 float32x4_t _tmp024b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_bf16s_neon() 1775 float32x4_t _tmp135b = vsubq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_bf16s_neon() 1924 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1927 … float32x4_t _r0tm0 = vmlsq_n_f32(vmlaq_n_f32(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3439 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 3445 float32x4_t _tmp02b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd42_pack4_bf16s_neon() [all …]
|
H A D | convolution_3x3_pack8to4_fp16s.h | 295 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 300 … float16x8_t _r0tm0 = vfmaq_n_f16(vsubq_f16(_tmp00, _tmp06), vsubq_f16(_tmp04, _tmp02), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 306 float16x8_t _tmp12a = vfmsq_n_f16(vaddq_f16(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 318 … float16x8_t _tmp34a = vfmsq_n_f16(vfmaq_n_f16(_tmp06, _tmp02, 0.25f), _tmp04, 1.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 330 … float16x8_t _tmp56a = vfmaq_n_f16(_tmp06, vfmsq_n_f16(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1155 float16x4_t _tmp04 = vld1_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1166 float16x4_t _tmp024b = vadd_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1167 float16x4_t _tmp135b = vsub_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack4.h | 404 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_neon() local 415 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack4_neon() 439 … float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack4_neon() 1994 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_neon() local 2005 float32x4_t _tmp024b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_neon() 2006 float32x4_t _tmp135b = vsubq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_neon() 2384 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_neon() local 2387 … float32x4_t _r0tm0 = vmlsq_n_f32(vmlaq_n_f32(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack4_neon() 3899 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_neon() local 3905 float32x4_t _tmp02b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd42_pack4_neon() [all …]
|
H A D | convolution_3x3_pack4_fp16s.h | 395 float16x4_t _tmp04 = vld1_f16(tmp[m][4]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 400 … float16x4_t _r0tm0 = vfma_n_f16(vsub_f16(_tmp00, _tmp06), vsub_f16(_tmp04, _tmp02), 5.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 406 float16x4_t _tmp12a = vfms_n_f16(vadd_f16(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 418 … float16x4_t _tmp34a = vfms_n_f16(vfma_n_f16(_tmp06, _tmp02, 0.25f), _tmp04, 1.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 430 … float16x4_t _tmp56a = vfma_n_f16(_tmp06, vfms_n_f16(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1099 float16x4_t _tmp04 = vld1_f16(tmp[m][4]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 1110 float16x4_t _tmp024b = vadd_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1111 float16x4_t _tmp135b = vsub_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_fp16sa_neon()
|
/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8_fp16s.h | 277 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 288 float16x8_t _tmp12a = vfmsq_n_f16(vaddq_f16(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 312 … float16x8_t _tmp56a = vfmaq_n_f16(_tmp06, vfmsq_n_f16(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1153 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1164 float16x8_t _tmp024b = vaddq_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1165 float16x8_t _tmp135b = vsubq_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1415 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1418 … float16x8_t _r0tm0 = vfmsq_n_f16(vfmaq_n_f16(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2216 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 2222 float16x8_t _tmp02b = vaddq_f16(_tmp03, _tmp04); in conv3x3s1_winograd42_pack8_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack4_bf16s.h | 173 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 184 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 208 … float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1763 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1774 float32x4_t _tmp024b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_bf16s_neon() 1775 float32x4_t _tmp135b = vsubq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_bf16s_neon() 1924 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1927 … float32x4_t _r0tm0 = vmlsq_n_f32(vmlaq_n_f32(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3439 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 3445 float32x4_t _tmp02b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd42_pack4_bf16s_neon() [all …]
|
H A D | convolution_3x3_pack8to4_fp16s.h | 295 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 300 … float16x8_t _r0tm0 = vfmaq_n_f16(vsubq_f16(_tmp00, _tmp06), vsubq_f16(_tmp04, _tmp02), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 306 float16x8_t _tmp12a = vfmsq_n_f16(vaddq_f16(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 318 … float16x8_t _tmp34a = vfmsq_n_f16(vfmaq_n_f16(_tmp06, _tmp02, 0.25f), _tmp04, 1.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 330 … float16x8_t _tmp56a = vfmaq_n_f16(_tmp06, vfmsq_n_f16(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1155 float16x4_t _tmp04 = vld1_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1166 float16x4_t _tmp024b = vadd_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1167 float16x4_t _tmp135b = vsub_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack4.h | 404 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_neon() local 415 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack4_neon() 439 … float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack4_neon() 1994 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_neon() local 2005 float32x4_t _tmp024b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_neon() 2006 float32x4_t _tmp135b = vsubq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_neon() 2384 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_neon() local 2387 … float32x4_t _r0tm0 = vmlsq_n_f32(vmlaq_n_f32(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack4_neon() 3899 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_neon() local 3905 float32x4_t _tmp02b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd42_pack4_neon() [all …]
|
H A D | convolution_3x3_pack4_fp16s.h | 395 float16x4_t _tmp04 = vld1_f16(tmp[m][4]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 400 … float16x4_t _r0tm0 = vfma_n_f16(vsub_f16(_tmp00, _tmp06), vsub_f16(_tmp04, _tmp02), 5.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 406 float16x4_t _tmp12a = vfms_n_f16(vadd_f16(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 418 … float16x4_t _tmp34a = vfms_n_f16(vfma_n_f16(_tmp06, _tmp02, 0.25f), _tmp04, 1.25f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 430 … float16x4_t _tmp56a = vfma_n_f16(_tmp06, vfms_n_f16(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1099 float16x4_t _tmp04 = vld1_f16(tmp[m][4]); in conv3x3s1_winograd64_pack4_fp16sa_neon() local 1110 float16x4_t _tmp024b = vadd_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_fp16sa_neon() 1111 float16x4_t _tmp135b = vsub_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_fp16sa_neon()
|
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8_fp16s.h | 277 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 288 float16x8_t _tmp12a = vfmsq_n_f16(vaddq_f16(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 312 … float16x8_t _tmp56a = vfmaq_n_f16(_tmp06, vfmsq_n_f16(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1153 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1164 float16x8_t _tmp024b = vaddq_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1165 float16x8_t _tmp135b = vsubq_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1415 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1418 … float16x8_t _r0tm0 = vfmsq_n_f16(vfmaq_n_f16(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2216 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 2222 float16x8_t _tmp02b = vaddq_f16(_tmp03, _tmp04); in conv3x3s1_winograd42_pack8_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack4_bf16s.h | 173 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 184 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 208 … float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1763 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1774 float32x4_t _tmp024b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_bf16s_neon() 1775 float32x4_t _tmp135b = vsubq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_bf16s_neon() 1924 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1927 … float32x4_t _r0tm0 = vmlsq_n_f32(vmlaq_n_f32(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3439 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 3445 float32x4_t _tmp02b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd42_pack4_bf16s_neon() [all …]
|
H A D | convolution_3x3_pack8to4_fp16s.h | 295 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 300 … float16x8_t _r0tm0 = vfmaq_n_f16(vsubq_f16(_tmp00, _tmp06), vsubq_f16(_tmp04, _tmp02), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 306 float16x8_t _tmp12a = vfmsq_n_f16(vaddq_f16(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 318 … float16x8_t _tmp34a = vfmsq_n_f16(vfmaq_n_f16(_tmp06, _tmp02, 0.25f), _tmp04, 1.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 330 … float16x8_t _tmp56a = vfmaq_n_f16(_tmp06, vfmsq_n_f16(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1155 float16x4_t _tmp04 = vld1_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1166 float16x4_t _tmp024b = vadd_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1167 float16x4_t _tmp135b = vsub_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack4.h | 404 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_neon() local 415 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack4_neon() 439 … float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack4_neon() 1994 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_neon() local 2005 float32x4_t _tmp024b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_neon() 2006 float32x4_t _tmp135b = vsubq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_neon() 2384 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_neon() local 2387 … float32x4_t _r0tm0 = vmlsq_n_f32(vmlaq_n_f32(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack4_neon() 3899 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_neon() local 3905 float32x4_t _tmp02b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd42_pack4_neon() [all …]
|
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8_fp16s.h | 277 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 288 float16x8_t _tmp12a = vfmsq_n_f16(vaddq_f16(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 312 … float16x8_t _tmp56a = vfmaq_n_f16(_tmp06, vfmsq_n_f16(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1153 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1164 float16x8_t _tmp024b = vaddq_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1165 float16x8_t _tmp135b = vsubq_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1415 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1418 … float16x8_t _r0tm0 = vfmsq_n_f16(vfmaq_n_f16(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2216 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 2222 float16x8_t _tmp02b = vaddq_f16(_tmp03, _tmp04); in conv3x3s1_winograd42_pack8_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack4_bf16s.h | 173 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 184 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 208 … float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1763 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1774 float32x4_t _tmp024b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_bf16s_neon() 1775 float32x4_t _tmp135b = vsubq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_bf16s_neon() 1924 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1927 … float32x4_t _r0tm0 = vmlsq_n_f32(vmlaq_n_f32(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3439 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 3445 float32x4_t _tmp02b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd42_pack4_bf16s_neon() [all …]
|
H A D | convolution_3x3_pack8to4_fp16s.h | 295 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 300 … float16x8_t _r0tm0 = vfmaq_n_f16(vsubq_f16(_tmp00, _tmp06), vsubq_f16(_tmp04, _tmp02), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 306 float16x8_t _tmp12a = vfmsq_n_f16(vaddq_f16(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 318 … float16x8_t _tmp34a = vfmsq_n_f16(vfmaq_n_f16(_tmp06, _tmp02, 0.25f), _tmp04, 1.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 330 … float16x8_t _tmp56a = vfmaq_n_f16(_tmp06, vfmsq_n_f16(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1155 float16x4_t _tmp04 = vld1_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1166 float16x4_t _tmp024b = vadd_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1167 float16x4_t _tmp135b = vsub_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack4.h | 404 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_neon() local 415 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack4_neon() 439 … float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack4_neon() 1994 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_neon() local 2005 float32x4_t _tmp024b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_neon() 2006 float32x4_t _tmp135b = vsubq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_neon() 2384 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_neon() local 2387 … float32x4_t _r0tm0 = vmlsq_n_f32(vmlaq_n_f32(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack4_neon() 3899 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_neon() local 3905 float32x4_t _tmp02b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd42_pack4_neon() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/arm/ |
H A D | convolution_3x3_pack8_fp16s.h | 277 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 288 float16x8_t _tmp12a = vfmsq_n_f16(vaddq_f16(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 312 … float16x8_t _tmp56a = vfmaq_n_f16(_tmp06, vfmsq_n_f16(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1153 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8_fp16sa_neon() local 1164 float16x8_t _tmp024b = vaddq_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1165 float16x8_t _tmp135b = vsubq_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8_fp16sa_neon() 1415 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 1418 … float16x8_t _r0tm0 = vfmsq_n_f16(vfmaq_n_f16(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack8_fp16sa_neon() 2216 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd42_pack8_fp16sa_neon() local 2222 float16x8_t _tmp02b = vaddq_f16(_tmp03, _tmp04); in conv3x3s1_winograd42_pack8_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack4_bf16s.h | 173 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 184 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack4_bf16s_neon() 208 … float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack4_bf16s_neon() 1763 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_bf16s_neon() local 1774 float32x4_t _tmp024b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_bf16s_neon() 1775 float32x4_t _tmp135b = vsubq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_bf16s_neon() 1924 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 1927 … float32x4_t _r0tm0 = vmlsq_n_f32(vmlaq_n_f32(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack4_bf16s_neon() 3439 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_bf16s_neon() local 3445 float32x4_t _tmp02b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd42_pack4_bf16s_neon() [all …]
|
H A D | convolution_3x3_pack8to4_fp16s.h | 295 float16x8_t _tmp04 = vld1q_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 300 … float16x8_t _r0tm0 = vfmaq_n_f16(vsubq_f16(_tmp00, _tmp06), vsubq_f16(_tmp04, _tmp02), 5.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 306 float16x8_t _tmp12a = vfmsq_n_f16(vaddq_f16(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 318 … float16x8_t _tmp34a = vfmsq_n_f16(vfmaq_n_f16(_tmp06, _tmp02, 0.25f), _tmp04, 1.25f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 330 … float16x8_t _tmp56a = vfmaq_n_f16(_tmp06, vfmsq_n_f16(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1155 float16x4_t _tmp04 = vld1_f16(tmp[m][4]); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() local 1166 float16x4_t _tmp024b = vadd_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8to4_fp16sa_neon() 1167 float16x4_t _tmp135b = vsub_f16(_tmp03, _tmp04); in conv3x3s1_winograd64_pack8to4_fp16sa_neon()
|
H A D | convolution_3x3_pack4.h | 404 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_neon() local 415 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f); in conv3x3s1_winograd64_pack4_neon() 439 … float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f); in conv3x3s1_winograd64_pack4_neon() 1994 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd64_pack4_neon() local 2005 float32x4_t _tmp024b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_neon() 2006 float32x4_t _tmp135b = vsubq_f32(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_neon() 2384 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_neon() local 2387 … float32x4_t _r0tm0 = vmlsq_n_f32(vmlaq_n_f32(_tmp04, _tmp00, 4.f), _tmp02, 5.f); in conv3x3s1_winograd42_pack4_neon() 3899 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]); in conv3x3s1_winograd42_pack4_neon() local 3905 float32x4_t _tmp02b = vaddq_f32(_tmp03, _tmp04); in conv3x3s1_winograd42_pack4_neon() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/riscv/ |
H A D | convolution_3x3_packn.h | 239 vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); in conv3x3s1_winograd64_packn_rvv() local 247 … vfloat32m1_t _tmp12a = vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); in conv3x3s1_winograd64_packn_rvv() 259 …loat32m1_t _tmp56a = vfmacc_vf_f32m1(_tmp06, 4.f, vfmacc_vf_f32m1(_tmp02, -1.25f, _tmp04, vl), vl); in conv3x3s1_winograd64_packn_rvv() 706 vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); in conv3x3s1_winograd64_packn_rvv() local 714 vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); in conv3x3s1_winograd64_packn_rvv() 715 vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); in conv3x3s1_winograd64_packn_rvv() 934 vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); in conv3x3s1_winograd42_packn_rvv() local 937 … vfloat32m1_t _r0tm0 = vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); in conv3x3s1_winograd42_packn_rvv() 1364 vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); in conv3x3s1_winograd42_packn_rvv() local 1370 vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); in conv3x3s1_winograd42_packn_rvv() [all …]
|
H A D | convolution_3x3_packn_fp16s.h | 239 vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); in conv3x3s1_winograd64_packn_fp16sa_rvv() local 247 … vfloat16m1_t _tmp12a = vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); in conv3x3s1_winograd64_packn_fp16sa_rvv() 259 …loat16m1_t _tmp56a = vfmacc_vf_f16m1(_tmp06, 4.f, vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl); in conv3x3s1_winograd64_packn_fp16sa_rvv() 706 vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); in conv3x3s1_winograd64_packn_fp16sa_rvv() local 714 vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); in conv3x3s1_winograd64_packn_fp16sa_rvv() 715 vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); in conv3x3s1_winograd64_packn_fp16sa_rvv() 934 vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); in conv3x3s1_winograd42_packn_fp16sa_rvv() local 937 … vfloat16m1_t _r0tm0 = vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); in conv3x3s1_winograd42_packn_fp16sa_rvv() 1364 vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); in conv3x3s1_winograd42_packn_fp16sa_rvv() local 1370 vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); in conv3x3s1_winograd42_packn_fp16sa_rvv() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/mips/ |
H A D | convolution_3x3_pack4.h | 242 v4f32 _tmp04 = (v4f32)__msa_ld_w(tmp[m][4], 0); in conv3x3s1_winograd64_pack4_msa() local 250 … v4f32 _tmp12a = __msa_fmadd_w(__msa_fadd_w(_tmp02, _tmp06), _vm4_25, _tmp04); in conv3x3s1_winograd64_pack4_msa() 262 … v4f32 _tmp56a = __msa_fmadd_w(_tmp06, _v4, __msa_fmadd_w(_tmp02, _vm1_25, _tmp04)); in conv3x3s1_winograd64_pack4_msa() 848 v4f32 _tmp04 = (v4f32)__msa_ld_w(tmp[m][4], 0); in conv3x3s1_winograd64_pack4_msa() local 856 v4f32 _tmp024b = __msa_fadd_w(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_msa() 857 v4f32 _tmp135b = __msa_fsub_w(_tmp03, _tmp04); in conv3x3s1_winograd64_pack4_msa() 1076 v4f32 _tmp04 = (v4f32)__msa_ld_w(tmp[m][4], 0); in conv3x3s1_winograd42_pack4_msa() local 1079 … v4f32 _r0tm0 = __msa_fmadd_w(__msa_fmadd_w(_tmp04, _v4, _tmp00), _vm5, _tmp02); in conv3x3s1_winograd42_pack4_msa() 1643 v4f32 _tmp04 = (v4f32)__msa_ld_w(tmp[m][4], 0); in conv3x3s1_winograd42_pack4_msa() local 1649 v4f32 _tmp02b = __msa_fadd_w(_tmp03, _tmp04); in conv3x3s1_winograd42_pack4_msa() [all …]
|