/dports/misc/ncnn/ncnn-20211208/src/layer/riscv/ |
H A D | convolution_7x7_pack1ton.h | 86 _sum5 = vfmacc_vf_f32m1(_sum5, r0[10], _k00, vl); in conv7x7s2_pack1ton_rvv() 94 _sum5 = vfmacc_vf_f32m1(_sum5, r0[11], _k01, vl); in conv7x7s2_pack1ton_rvv() 102 _sum5 = vfmacc_vf_f32m1(_sum5, r0[12], _k02, vl); in conv7x7s2_pack1ton_rvv() 110 _sum5 = vfmacc_vf_f32m1(_sum5, r0[13], _k03, vl); in conv7x7s2_pack1ton_rvv() 118 _sum5 = vfmacc_vf_f32m1(_sum5, r0[14], _k04, vl); in conv7x7s2_pack1ton_rvv() 126 _sum5 = vfmacc_vf_f32m1(_sum5, r0[15], _k05, vl); in conv7x7s2_pack1ton_rvv() 134 _sum5 = vfmacc_vf_f32m1(_sum5, r0[16], _k06, vl); in conv7x7s2_pack1ton_rvv() 153 _sum5 = vfmacc_vf_f32m1(_sum5, r1[10], _k10, vl); in conv7x7s2_pack1ton_rvv() 161 _sum5 = vfmacc_vf_f32m1(_sum5, r1[11], _k11, vl); in conv7x7s2_pack1ton_rvv() 169 _sum5 = vfmacc_vf_f32m1(_sum5, r1[12], _k12, vl); in conv7x7s2_pack1ton_rvv() [all …]
|
H A D | convolution_7x7_pack1ton_fp16s.h | 86 _sum5 = vfmacc_vf_f16m1(_sum5, r0[10], _k00, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 94 _sum5 = vfmacc_vf_f16m1(_sum5, r0[11], _k01, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 102 _sum5 = vfmacc_vf_f16m1(_sum5, r0[12], _k02, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 110 _sum5 = vfmacc_vf_f16m1(_sum5, r0[13], _k03, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 118 _sum5 = vfmacc_vf_f16m1(_sum5, r0[14], _k04, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 126 _sum5 = vfmacc_vf_f16m1(_sum5, r0[15], _k05, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 134 _sum5 = vfmacc_vf_f16m1(_sum5, r0[16], _k06, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 153 _sum5 = vfmacc_vf_f16m1(_sum5, r1[10], _k10, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 161 _sum5 = vfmacc_vf_f16m1(_sum5, r1[11], _k11, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 169 _sum5 = vfmacc_vf_f16m1(_sum5, r1[12], _k12, vl); in conv7x7s2_pack1ton_fp16sa_rvv() [all …]
|
H A D | convolution_3x3_pack1ton.h | 78 _sum5 = vfmacc_vf_f32m1(_sum5, r0[5], _k00, vl); in conv3x3s1_pack1ton_rvv() 86 _sum5 = vfmacc_vf_f32m1(_sum5, r0[6], _k01, vl); in conv3x3s1_pack1ton_rvv() 94 _sum5 = vfmacc_vf_f32m1(_sum5, r0[7], _k02, vl); in conv3x3s1_pack1ton_rvv() 103 _sum5 = vfmacc_vf_f32m1(_sum5, r1[5], _k10, vl); in conv3x3s1_pack1ton_rvv() 111 _sum5 = vfmacc_vf_f32m1(_sum5, r1[6], _k11, vl); in conv3x3s1_pack1ton_rvv() 119 _sum5 = vfmacc_vf_f32m1(_sum5, r1[7], _k12, vl); in conv3x3s1_pack1ton_rvv() 128 _sum5 = vfmacc_vf_f32m1(_sum5, r2[5], _k20, vl); in conv3x3s1_pack1ton_rvv() 136 _sum5 = vfmacc_vf_f32m1(_sum5, r2[6], _k21, vl); in conv3x3s1_pack1ton_rvv() 144 _sum5 = vfmacc_vf_f32m1(_sum5, r2[7], _k22, vl); in conv3x3s1_pack1ton_rvv() 356 _sum5 = vfmacc_vf_f32m1(_sum5, r0[10], _k00, vl); in conv3x3s2_pack1ton_rvv() [all …]
|
H A D | convolution_3x3_pack1ton_fp16s.h | 78 _sum5 = vfmacc_vf_f16m1(_sum5, r0[5], _k00, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 86 _sum5 = vfmacc_vf_f16m1(_sum5, r0[6], _k01, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 94 _sum5 = vfmacc_vf_f16m1(_sum5, r0[7], _k02, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 103 _sum5 = vfmacc_vf_f16m1(_sum5, r1[5], _k10, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 111 _sum5 = vfmacc_vf_f16m1(_sum5, r1[6], _k11, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 119 _sum5 = vfmacc_vf_f16m1(_sum5, r1[7], _k12, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 128 _sum5 = vfmacc_vf_f16m1(_sum5, r2[5], _k20, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 136 _sum5 = vfmacc_vf_f16m1(_sum5, r2[6], _k21, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 144 _sum5 = vfmacc_vf_f16m1(_sum5, r2[7], _k22, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 356 _sum5 = vfmacc_vf_f16m1(_sum5, r0[10], _k00, vl); in conv3x3s2_pack1ton_fp16sa_rvv() [all …]
|
/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/arm/ |
H A D | convolution_1x1_fp16s.h | 165 float16x8_t _sum5 = vdupq_laneq_f16(_bias0, 5); in conv1x1s1_sgemm_fp16sa_neon() local 194 _sum5 = vfmaq_laneq_f16(_sum5, _p0, _k0, 5); in conv1x1s1_sgemm_fp16sa_neon() 203 _sum5 = vfmaq_laneq_f16(_sum5, _p1, _k1, 5); in conv1x1s1_sgemm_fp16sa_neon() 212 _sum5 = vfmaq_laneq_f16(_sum5, _p2, _k2, 5); in conv1x1s1_sgemm_fp16sa_neon() 221 _sum5 = vfmaq_laneq_f16(_sum5, _p3, _k3, 5); in conv1x1s1_sgemm_fp16sa_neon() 230 _sum5 = vfmaq_laneq_f16(_sum5, _p4, _k4, 5); in conv1x1s1_sgemm_fp16sa_neon() 239 _sum5 = vfmaq_laneq_f16(_sum5, _p5, _k5, 5); in conv1x1s1_sgemm_fp16sa_neon() 248 _sum5 = vfmaq_laneq_f16(_sum5, _p6, _k6, 5); in conv1x1s1_sgemm_fp16sa_neon() 257 _sum5 = vfmaq_laneq_f16(_sum5, _p7, _k7, 5); in conv1x1s1_sgemm_fp16sa_neon() 276 _sum5 = vfmaq_laneq_f16(_sum5, _p0, _k0, 5); in conv1x1s1_sgemm_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack8to4_int8.h | 935 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w0), vget_low_s16(_val0), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 954 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w1), vget_low_s16(_val1), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 973 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w2), vget_low_s16(_val2), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 992 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w3), vget_low_s16(_val3), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1011 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w4), vget_low_s16(_val4), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1030 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w5), vget_low_s16(_val5), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1049 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w6), vget_low_s16(_val6), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1068 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w7), vget_low_s16(_val7), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1139 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w0), vget_low_s16(_val2), 0); in conv3x3s1_winograd42_pack8to4_int8_neon() 1150 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w1), vget_low_s16(_val2), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() [all …]
|
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/arm/ |
H A D | convolution_1x1_fp16s.h | 165 float16x8_t _sum5 = vdupq_laneq_f16(_bias0, 5); in conv1x1s1_sgemm_fp16sa_neon() local 194 _sum5 = vfmaq_laneq_f16(_sum5, _p0, _k0, 5); in conv1x1s1_sgemm_fp16sa_neon() 203 _sum5 = vfmaq_laneq_f16(_sum5, _p1, _k1, 5); in conv1x1s1_sgemm_fp16sa_neon() 212 _sum5 = vfmaq_laneq_f16(_sum5, _p2, _k2, 5); in conv1x1s1_sgemm_fp16sa_neon() 221 _sum5 = vfmaq_laneq_f16(_sum5, _p3, _k3, 5); in conv1x1s1_sgemm_fp16sa_neon() 230 _sum5 = vfmaq_laneq_f16(_sum5, _p4, _k4, 5); in conv1x1s1_sgemm_fp16sa_neon() 239 _sum5 = vfmaq_laneq_f16(_sum5, _p5, _k5, 5); in conv1x1s1_sgemm_fp16sa_neon() 248 _sum5 = vfmaq_laneq_f16(_sum5, _p6, _k6, 5); in conv1x1s1_sgemm_fp16sa_neon() 257 _sum5 = vfmaq_laneq_f16(_sum5, _p7, _k7, 5); in conv1x1s1_sgemm_fp16sa_neon() 276 _sum5 = vfmaq_laneq_f16(_sum5, _p0, _k0, 5); in conv1x1s1_sgemm_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack8to4_int8.h | 935 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w0), vget_low_s16(_val0), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 954 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w1), vget_low_s16(_val1), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 973 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w2), vget_low_s16(_val2), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 992 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w3), vget_low_s16(_val3), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1011 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w4), vget_low_s16(_val4), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1030 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w5), vget_low_s16(_val5), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1049 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w6), vget_low_s16(_val6), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1068 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w7), vget_low_s16(_val7), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1139 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w0), vget_low_s16(_val2), 0); in conv3x3s1_winograd42_pack8to4_int8_neon() 1150 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w1), vget_low_s16(_val2), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() [all …]
|
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/arm/ |
H A D | convolution_1x1_fp16s.h | 165 float16x8_t _sum5 = vdupq_laneq_f16(_bias0, 5); in conv1x1s1_sgemm_fp16sa_neon() local 194 _sum5 = vfmaq_laneq_f16(_sum5, _p0, _k0, 5); in conv1x1s1_sgemm_fp16sa_neon() 203 _sum5 = vfmaq_laneq_f16(_sum5, _p1, _k1, 5); in conv1x1s1_sgemm_fp16sa_neon() 212 _sum5 = vfmaq_laneq_f16(_sum5, _p2, _k2, 5); in conv1x1s1_sgemm_fp16sa_neon() 221 _sum5 = vfmaq_laneq_f16(_sum5, _p3, _k3, 5); in conv1x1s1_sgemm_fp16sa_neon() 230 _sum5 = vfmaq_laneq_f16(_sum5, _p4, _k4, 5); in conv1x1s1_sgemm_fp16sa_neon() 239 _sum5 = vfmaq_laneq_f16(_sum5, _p5, _k5, 5); in conv1x1s1_sgemm_fp16sa_neon() 248 _sum5 = vfmaq_laneq_f16(_sum5, _p6, _k6, 5); in conv1x1s1_sgemm_fp16sa_neon() 257 _sum5 = vfmaq_laneq_f16(_sum5, _p7, _k7, 5); in conv1x1s1_sgemm_fp16sa_neon() 276 _sum5 = vfmaq_laneq_f16(_sum5, _p0, _k0, 5); in conv1x1s1_sgemm_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack8to4_int8.h | 935 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w0), vget_low_s16(_val0), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 954 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w1), vget_low_s16(_val1), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 973 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w2), vget_low_s16(_val2), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 992 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w3), vget_low_s16(_val3), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1011 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w4), vget_low_s16(_val4), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1030 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w5), vget_low_s16(_val5), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1049 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w6), vget_low_s16(_val6), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1068 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w7), vget_low_s16(_val7), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1139 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w0), vget_low_s16(_val2), 0); in conv3x3s1_winograd42_pack8to4_int8_neon() 1150 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w1), vget_low_s16(_val2), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/arm/ |
H A D | convolution_1x1_fp16s.h | 165 float16x8_t _sum5 = vdupq_laneq_f16(_bias0, 5); in conv1x1s1_sgemm_fp16sa_neon() local 194 _sum5 = vfmaq_laneq_f16(_sum5, _p0, _k0, 5); in conv1x1s1_sgemm_fp16sa_neon() 203 _sum5 = vfmaq_laneq_f16(_sum5, _p1, _k1, 5); in conv1x1s1_sgemm_fp16sa_neon() 212 _sum5 = vfmaq_laneq_f16(_sum5, _p2, _k2, 5); in conv1x1s1_sgemm_fp16sa_neon() 221 _sum5 = vfmaq_laneq_f16(_sum5, _p3, _k3, 5); in conv1x1s1_sgemm_fp16sa_neon() 230 _sum5 = vfmaq_laneq_f16(_sum5, _p4, _k4, 5); in conv1x1s1_sgemm_fp16sa_neon() 239 _sum5 = vfmaq_laneq_f16(_sum5, _p5, _k5, 5); in conv1x1s1_sgemm_fp16sa_neon() 248 _sum5 = vfmaq_laneq_f16(_sum5, _p6, _k6, 5); in conv1x1s1_sgemm_fp16sa_neon() 257 _sum5 = vfmaq_laneq_f16(_sum5, _p7, _k7, 5); in conv1x1s1_sgemm_fp16sa_neon() 276 _sum5 = vfmaq_laneq_f16(_sum5, _p0, _k0, 5); in conv1x1s1_sgemm_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack8to4_int8.h | 935 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w0), vget_low_s16(_val0), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 954 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w1), vget_low_s16(_val1), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 973 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w2), vget_low_s16(_val2), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 992 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w3), vget_low_s16(_val3), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1011 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w4), vget_low_s16(_val4), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1030 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w5), vget_low_s16(_val5), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1049 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w6), vget_low_s16(_val6), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1068 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w7), vget_low_s16(_val7), 2); in conv3x3s1_winograd42_pack8to4_int8_neon() 1139 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w0), vget_low_s16(_val2), 0); in conv3x3s1_winograd42_pack8to4_int8_neon() 1150 _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_w1), vget_low_s16(_val2), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() [all …]
|
/dports/graphics/realsr-ncnn-vulkan/realsr-ncnn-vulkan-20210210/src/ncnn/src/layer/arm/ |
H A D | convolution_1x1_fp16s.h | 165 float16x8_t _sum5 = vdupq_laneq_f16(_bias0, 5); in conv1x1s1_sgemm_fp16sa_neon() local 194 _sum5 = vfmaq_laneq_f16(_sum5, _p0, _k0, 5); in conv1x1s1_sgemm_fp16sa_neon() 203 _sum5 = vfmaq_laneq_f16(_sum5, _p1, _k1, 5); in conv1x1s1_sgemm_fp16sa_neon() 212 _sum5 = vfmaq_laneq_f16(_sum5, _p2, _k2, 5); in conv1x1s1_sgemm_fp16sa_neon() 221 _sum5 = vfmaq_laneq_f16(_sum5, _p3, _k3, 5); in conv1x1s1_sgemm_fp16sa_neon() 230 _sum5 = vfmaq_laneq_f16(_sum5, _p4, _k4, 5); in conv1x1s1_sgemm_fp16sa_neon() 239 _sum5 = vfmaq_laneq_f16(_sum5, _p5, _k5, 5); in conv1x1s1_sgemm_fp16sa_neon() 248 _sum5 = vfmaq_laneq_f16(_sum5, _p6, _k6, 5); in conv1x1s1_sgemm_fp16sa_neon() 257 _sum5 = vfmaq_laneq_f16(_sum5, _p7, _k7, 5); in conv1x1s1_sgemm_fp16sa_neon() 276 _sum5 = vfmaq_laneq_f16(_sum5, _p0, _k0, 5); in conv1x1s1_sgemm_fp16sa_neon() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/mips/ |
H A D | convolution_3x3_pack1to4.h | 90 _sum5 = __msa_fmadd_w(_sum5, _r05, _k00); in conv3x3s1_pack1to4_msa() 98 _sum5 = __msa_fmadd_w(_sum5, _r06, _k01); in conv3x3s1_pack1to4_msa() 106 _sum5 = __msa_fmadd_w(_sum5, _r07, _k02); in conv3x3s1_pack1to4_msa() 130 _sum5 = __msa_fmadd_w(_sum5, _r15, _k10); in conv3x3s1_pack1to4_msa() 138 _sum5 = __msa_fmadd_w(_sum5, _r16, _k11); in conv3x3s1_pack1to4_msa() 146 _sum5 = __msa_fmadd_w(_sum5, _r17, _k12); in conv3x3s1_pack1to4_msa() 170 _sum5 = __msa_fmadd_w(_sum5, _r25, _k20); in conv3x3s1_pack1to4_msa() 178 _sum5 = __msa_fmadd_w(_sum5, _r26, _k21); in conv3x3s1_pack1to4_msa() 186 _sum5 = __msa_fmadd_w(_sum5, _r27, _k22); in conv3x3s1_pack1to4_msa() 481 _sum5 = __msa_fmadd_w(_sum5, _r0a, _k00); in conv3x3s2_pack1to4_msa() [all …]
|
/dports/graphics/realsr-ncnn-vulkan/realsr-ncnn-vulkan-20210210/src/ncnn/src/layer/x86/ |
H A D | convolution_1x1_pack8.h | 485 _sum5 = _mm256_fmadd_ps(_w0, _val50, _sum5); in conv1x1s1_sgemm_pack8_avx() 486 _sum5 = _mm256_fmadd_ps(_w1, _val51, _sum5); in conv1x1s1_sgemm_pack8_avx() 487 _sum5 = _mm256_fmadd_ps(_w2, _val52, _sum5); in conv1x1s1_sgemm_pack8_avx() 488 _sum5 = _mm256_fmadd_ps(_w3, _val53, _sum5); in conv1x1s1_sgemm_pack8_avx() 489 _sum5 = _mm256_fmadd_ps(_w4, _val54, _sum5); in conv1x1s1_sgemm_pack8_avx() 490 _sum5 = _mm256_fmadd_ps(_w5, _val55, _sum5); in conv1x1s1_sgemm_pack8_avx() 491 _sum5 = _mm256_fmadd_ps(_w6, _val56, _sum5); in conv1x1s1_sgemm_pack8_avx() 492 _sum5 = _mm256_fmadd_ps(_w7, _val57, _sum5); in conv1x1s1_sgemm_pack8_avx() 733 _sum5 = _mm256_fmadd_ps(_w0, _val50, _sum5); in conv1x1s1_sgemm_pack8_avx() 734 _sum5 = _mm256_fmadd_ps(_w1, _val51, _sum5); in conv1x1s1_sgemm_pack8_avx() [all …]
|
H A D | convolution_1x1_pack8_fp16.h | 485 _sum5 = _mm256_fmadd_ps(_w0, _val50, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 486 _sum5 = _mm256_fmadd_ps(_w1, _val51, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 487 _sum5 = _mm256_fmadd_ps(_w2, _val52, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 488 _sum5 = _mm256_fmadd_ps(_w3, _val53, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 489 _sum5 = _mm256_fmadd_ps(_w4, _val54, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 490 _sum5 = _mm256_fmadd_ps(_w5, _val55, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 491 _sum5 = _mm256_fmadd_ps(_w6, _val56, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 492 _sum5 = _mm256_fmadd_ps(_w7, _val57, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 733 _sum5 = _mm256_fmadd_ps(_w0, _val50, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 734 _sum5 = _mm256_fmadd_ps(_w1, _val51, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() [all …]
|
/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/x86/ |
H A D | convolution_1x1_pack8_fp16.h | 485 _sum5 = _mm256_fmadd_ps(_w0, _val50, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 486 _sum5 = _mm256_fmadd_ps(_w1, _val51, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 487 _sum5 = _mm256_fmadd_ps(_w2, _val52, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 488 _sum5 = _mm256_fmadd_ps(_w3, _val53, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 489 _sum5 = _mm256_fmadd_ps(_w4, _val54, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 490 _sum5 = _mm256_fmadd_ps(_w5, _val55, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 491 _sum5 = _mm256_fmadd_ps(_w6, _val56, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 492 _sum5 = _mm256_fmadd_ps(_w7, _val57, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 733 _sum5 = _mm256_fmadd_ps(_w0, _val50, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 734 _sum5 = _mm256_fmadd_ps(_w1, _val51, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8.h | 144 __m256 _sum5 = _bias0; in convdw3x3s1_pack8_avx() local 150 _sum5 = _mm256_fmadd_ps(_k00, _r05, _sum5); in convdw3x3s1_pack8_avx() 151 _sum5 = _mm256_fmadd_ps(_k01, _r06, _sum5); in convdw3x3s1_pack8_avx() 152 _sum5 = _mm256_fmadd_ps(_k02, _r07, _sum5); in convdw3x3s1_pack8_avx() 153 _sum5 = _mm256_fmadd_ps(_k10, _r15, _sum5); in convdw3x3s1_pack8_avx() 154 _sum5 = _mm256_fmadd_ps(_k11, _r16, _sum5); in convdw3x3s1_pack8_avx() 155 _sum5 = _mm256_fmadd_ps(_k12, _r17, _sum5); in convdw3x3s1_pack8_avx() 156 _sum5 = _mm256_fmadd_ps(_k20, _r25, _sum5); in convdw3x3s1_pack8_avx() 157 _sum5 = _mm256_fmadd_ps(_k21, _r26, _sum5); in convdw3x3s1_pack8_avx() 158 _sum5 = _mm256_fmadd_ps(_k22, _r27, _sum5); in convdw3x3s1_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 144 __m256 _sum5 = _bias0; in convdw3x3s1_fp16_pack8_avx() local 150 _sum5 = _mm256_fmadd_ps(_k00, _r05, _sum5); in convdw3x3s1_fp16_pack8_avx() 151 _sum5 = _mm256_fmadd_ps(_k01, _r06, _sum5); in convdw3x3s1_fp16_pack8_avx() 152 _sum5 = _mm256_fmadd_ps(_k02, _r07, _sum5); in convdw3x3s1_fp16_pack8_avx() 153 _sum5 = _mm256_fmadd_ps(_k10, _r15, _sum5); in convdw3x3s1_fp16_pack8_avx() 154 _sum5 = _mm256_fmadd_ps(_k11, _r16, _sum5); in convdw3x3s1_fp16_pack8_avx() 155 _sum5 = _mm256_fmadd_ps(_k12, _r17, _sum5); in convdw3x3s1_fp16_pack8_avx() 156 _sum5 = _mm256_fmadd_ps(_k20, _r25, _sum5); in convdw3x3s1_fp16_pack8_avx() 157 _sum5 = _mm256_fmadd_ps(_k21, _r26, _sum5); in convdw3x3s1_fp16_pack8_avx() 158 _sum5 = _mm256_fmadd_ps(_k22, _r27, _sum5); in convdw3x3s1_fp16_pack8_avx() [all …]
|
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/x86/ |
H A D | convolution_1x1_pack8_fp16.h | 485 _sum5 = _mm256_fmadd_ps(_w0, _val50, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 486 _sum5 = _mm256_fmadd_ps(_w1, _val51, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 487 _sum5 = _mm256_fmadd_ps(_w2, _val52, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 488 _sum5 = _mm256_fmadd_ps(_w3, _val53, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 489 _sum5 = _mm256_fmadd_ps(_w4, _val54, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 490 _sum5 = _mm256_fmadd_ps(_w5, _val55, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 491 _sum5 = _mm256_fmadd_ps(_w6, _val56, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 492 _sum5 = _mm256_fmadd_ps(_w7, _val57, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 733 _sum5 = _mm256_fmadd_ps(_w0, _val50, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 734 _sum5 = _mm256_fmadd_ps(_w1, _val51, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 144 __m256 _sum5 = _bias0; in convdw3x3s1_fp16_pack8_avx() local 150 _sum5 = _mm256_fmadd_ps(_k00, _r05, _sum5); in convdw3x3s1_fp16_pack8_avx() 151 _sum5 = _mm256_fmadd_ps(_k01, _r06, _sum5); in convdw3x3s1_fp16_pack8_avx() 152 _sum5 = _mm256_fmadd_ps(_k02, _r07, _sum5); in convdw3x3s1_fp16_pack8_avx() 153 _sum5 = _mm256_fmadd_ps(_k10, _r15, _sum5); in convdw3x3s1_fp16_pack8_avx() 154 _sum5 = _mm256_fmadd_ps(_k11, _r16, _sum5); in convdw3x3s1_fp16_pack8_avx() 155 _sum5 = _mm256_fmadd_ps(_k12, _r17, _sum5); in convdw3x3s1_fp16_pack8_avx() 156 _sum5 = _mm256_fmadd_ps(_k20, _r25, _sum5); in convdw3x3s1_fp16_pack8_avx() 157 _sum5 = _mm256_fmadd_ps(_k21, _r26, _sum5); in convdw3x3s1_fp16_pack8_avx() 158 _sum5 = _mm256_fmadd_ps(_k22, _r27, _sum5); in convdw3x3s1_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8.h | 144 __m256 _sum5 = _bias0; in convdw3x3s1_pack8_avx() local 150 _sum5 = _mm256_fmadd_ps(_k00, _r05, _sum5); in convdw3x3s1_pack8_avx() 151 _sum5 = _mm256_fmadd_ps(_k01, _r06, _sum5); in convdw3x3s1_pack8_avx() 152 _sum5 = _mm256_fmadd_ps(_k02, _r07, _sum5); in convdw3x3s1_pack8_avx() 153 _sum5 = _mm256_fmadd_ps(_k10, _r15, _sum5); in convdw3x3s1_pack8_avx() 154 _sum5 = _mm256_fmadd_ps(_k11, _r16, _sum5); in convdw3x3s1_pack8_avx() 155 _sum5 = _mm256_fmadd_ps(_k12, _r17, _sum5); in convdw3x3s1_pack8_avx() 156 _sum5 = _mm256_fmadd_ps(_k20, _r25, _sum5); in convdw3x3s1_pack8_avx() 157 _sum5 = _mm256_fmadd_ps(_k21, _r26, _sum5); in convdw3x3s1_pack8_avx() 158 _sum5 = _mm256_fmadd_ps(_k22, _r27, _sum5); in convdw3x3s1_pack8_avx() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/x86/ |
H A D | convolution_1x1_pack8_fp16.h | 485 _sum5 = _mm256_comp_fmadd_ps(_w0, _val50, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 486 _sum5 = _mm256_comp_fmadd_ps(_w1, _val51, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 487 _sum5 = _mm256_comp_fmadd_ps(_w2, _val52, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 488 _sum5 = _mm256_comp_fmadd_ps(_w3, _val53, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 489 _sum5 = _mm256_comp_fmadd_ps(_w4, _val54, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 490 _sum5 = _mm256_comp_fmadd_ps(_w5, _val55, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 491 _sum5 = _mm256_comp_fmadd_ps(_w6, _val56, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 492 _sum5 = _mm256_comp_fmadd_ps(_w7, _val57, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 733 _sum5 = _mm256_comp_fmadd_ps(_w0, _val50, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 734 _sum5 = _mm256_comp_fmadd_ps(_w1, _val51, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack4.h | 144 __m128 _sum5 = _bias0; in convdw3x3s1_pack4_sse() local 150 _sum5 = _mm_comp_fmadd_ps(_k00, _r05, _sum5); in convdw3x3s1_pack4_sse() 151 _sum5 = _mm_comp_fmadd_ps(_k01, _r06, _sum5); in convdw3x3s1_pack4_sse() 152 _sum5 = _mm_comp_fmadd_ps(_k02, _r07, _sum5); in convdw3x3s1_pack4_sse() 153 _sum5 = _mm_comp_fmadd_ps(_k10, _r15, _sum5); in convdw3x3s1_pack4_sse() 154 _sum5 = _mm_comp_fmadd_ps(_k11, _r16, _sum5); in convdw3x3s1_pack4_sse() 155 _sum5 = _mm_comp_fmadd_ps(_k12, _r17, _sum5); in convdw3x3s1_pack4_sse() 156 _sum5 = _mm_comp_fmadd_ps(_k20, _r25, _sum5); in convdw3x3s1_pack4_sse() 157 _sum5 = _mm_comp_fmadd_ps(_k21, _r26, _sum5); in convdw3x3s1_pack4_sse() 158 _sum5 = _mm_comp_fmadd_ps(_k22, _r27, _sum5); in convdw3x3s1_pack4_sse() [all …]
|
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/x86/ |
H A D | convolution_1x1_pack8_fp16.h | 485 _sum5 = _mm256_fmadd_ps(_w0, _val50, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 486 _sum5 = _mm256_fmadd_ps(_w1, _val51, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 487 _sum5 = _mm256_fmadd_ps(_w2, _val52, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 488 _sum5 = _mm256_fmadd_ps(_w3, _val53, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 489 _sum5 = _mm256_fmadd_ps(_w4, _val54, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 490 _sum5 = _mm256_fmadd_ps(_w5, _val55, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 491 _sum5 = _mm256_fmadd_ps(_w6, _val56, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 492 _sum5 = _mm256_fmadd_ps(_w7, _val57, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 733 _sum5 = _mm256_fmadd_ps(_w0, _val50, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() 734 _sum5 = _mm256_fmadd_ps(_w1, _val51, _sum5); in conv1x1s1_sgemm_fp16_pack8_avx() [all …]
|