/dports/misc/ncnn/ncnn-20211208/src/layer/riscv/ |
H A D | convolution_7x7_pack1ton.h | 87 _sum6 = vfmacc_vf_f32m1(_sum6, r0[12], _k00, vl); in conv7x7s2_pack1ton_rvv() 95 _sum6 = vfmacc_vf_f32m1(_sum6, r0[13], _k01, vl); in conv7x7s2_pack1ton_rvv() 103 _sum6 = vfmacc_vf_f32m1(_sum6, r0[14], _k02, vl); in conv7x7s2_pack1ton_rvv() 111 _sum6 = vfmacc_vf_f32m1(_sum6, r0[15], _k03, vl); in conv7x7s2_pack1ton_rvv() 119 _sum6 = vfmacc_vf_f32m1(_sum6, r0[16], _k04, vl); in conv7x7s2_pack1ton_rvv() 127 _sum6 = vfmacc_vf_f32m1(_sum6, r0[17], _k05, vl); in conv7x7s2_pack1ton_rvv() 135 _sum6 = vfmacc_vf_f32m1(_sum6, r0[18], _k06, vl); in conv7x7s2_pack1ton_rvv() 154 _sum6 = vfmacc_vf_f32m1(_sum6, r1[12], _k10, vl); in conv7x7s2_pack1ton_rvv() 162 _sum6 = vfmacc_vf_f32m1(_sum6, r1[13], _k11, vl); in conv7x7s2_pack1ton_rvv() 170 _sum6 = vfmacc_vf_f32m1(_sum6, r1[14], _k12, vl); in conv7x7s2_pack1ton_rvv() [all …]
|
H A D | convolution_7x7_pack1ton_fp16s.h | 87 _sum6 = vfmacc_vf_f16m1(_sum6, r0[12], _k00, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 95 _sum6 = vfmacc_vf_f16m1(_sum6, r0[13], _k01, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 103 _sum6 = vfmacc_vf_f16m1(_sum6, r0[14], _k02, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 111 _sum6 = vfmacc_vf_f16m1(_sum6, r0[15], _k03, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 119 _sum6 = vfmacc_vf_f16m1(_sum6, r0[16], _k04, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 127 _sum6 = vfmacc_vf_f16m1(_sum6, r0[17], _k05, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 135 _sum6 = vfmacc_vf_f16m1(_sum6, r0[18], _k06, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 154 _sum6 = vfmacc_vf_f16m1(_sum6, r1[12], _k10, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 162 _sum6 = vfmacc_vf_f16m1(_sum6, r1[13], _k11, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 170 _sum6 = vfmacc_vf_f16m1(_sum6, r1[14], _k12, vl); in conv7x7s2_pack1ton_fp16sa_rvv() [all …]
|
H A D | convolution_3x3_pack1ton.h | 79 _sum6 = vfmacc_vf_f32m1(_sum6, r0[6], _k00, vl); in conv3x3s1_pack1ton_rvv() 87 _sum6 = vfmacc_vf_f32m1(_sum6, r0[7], _k01, vl); in conv3x3s1_pack1ton_rvv() 95 _sum6 = vfmacc_vf_f32m1(_sum6, r0[8], _k02, vl); in conv3x3s1_pack1ton_rvv() 104 _sum6 = vfmacc_vf_f32m1(_sum6, r1[6], _k10, vl); in conv3x3s1_pack1ton_rvv() 112 _sum6 = vfmacc_vf_f32m1(_sum6, r1[7], _k11, vl); in conv3x3s1_pack1ton_rvv() 120 _sum6 = vfmacc_vf_f32m1(_sum6, r1[8], _k12, vl); in conv3x3s1_pack1ton_rvv() 129 _sum6 = vfmacc_vf_f32m1(_sum6, r2[6], _k20, vl); in conv3x3s1_pack1ton_rvv() 137 _sum6 = vfmacc_vf_f32m1(_sum6, r2[7], _k21, vl); in conv3x3s1_pack1ton_rvv() 145 _sum6 = vfmacc_vf_f32m1(_sum6, r2[8], _k22, vl); in conv3x3s1_pack1ton_rvv() 357 _sum6 = vfmacc_vf_f32m1(_sum6, r0[12], _k00, vl); in conv3x3s2_pack1ton_rvv() [all …]
|
H A D | convolution_3x3_pack1ton_fp16s.h | 79 _sum6 = vfmacc_vf_f16m1(_sum6, r0[6], _k00, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 87 _sum6 = vfmacc_vf_f16m1(_sum6, r0[7], _k01, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 95 _sum6 = vfmacc_vf_f16m1(_sum6, r0[8], _k02, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 104 _sum6 = vfmacc_vf_f16m1(_sum6, r1[6], _k10, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 112 _sum6 = vfmacc_vf_f16m1(_sum6, r1[7], _k11, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 120 _sum6 = vfmacc_vf_f16m1(_sum6, r1[8], _k12, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 129 _sum6 = vfmacc_vf_f16m1(_sum6, r2[6], _k20, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 137 _sum6 = vfmacc_vf_f16m1(_sum6, r2[7], _k21, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 145 _sum6 = vfmacc_vf_f16m1(_sum6, r2[8], _k22, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 357 _sum6 = vfmacc_vf_f16m1(_sum6, r0[12], _k00, vl); in conv3x3s2_pack1ton_fp16sa_rvv() [all …]
|
/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/arm/ |
H A D | convolution_1x1_fp16s.h | 166 float16x8_t _sum6 = vdupq_laneq_f16(_bias0, 6); in conv1x1s1_sgemm_fp16sa_neon() local 195 _sum6 = vfmaq_laneq_f16(_sum6, _p0, _k0, 6); in conv1x1s1_sgemm_fp16sa_neon() 204 _sum6 = vfmaq_laneq_f16(_sum6, _p1, _k1, 6); in conv1x1s1_sgemm_fp16sa_neon() 213 _sum6 = vfmaq_laneq_f16(_sum6, _p2, _k2, 6); in conv1x1s1_sgemm_fp16sa_neon() 222 _sum6 = vfmaq_laneq_f16(_sum6, _p3, _k3, 6); in conv1x1s1_sgemm_fp16sa_neon() 231 _sum6 = vfmaq_laneq_f16(_sum6, _p4, _k4, 6); in conv1x1s1_sgemm_fp16sa_neon() 240 _sum6 = vfmaq_laneq_f16(_sum6, _p5, _k5, 6); in conv1x1s1_sgemm_fp16sa_neon() 249 _sum6 = vfmaq_laneq_f16(_sum6, _p6, _k6, 6); in conv1x1s1_sgemm_fp16sa_neon() 258 _sum6 = vfmaq_laneq_f16(_sum6, _p7, _k7, 6); in conv1x1s1_sgemm_fp16sa_neon() 277 _sum6 = vfmaq_laneq_f16(_sum6, _p0, _k0, 6); in conv1x1s1_sgemm_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack8to4_int8.h | 936 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w0), vget_low_s16(_val0), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 955 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w1), vget_low_s16(_val1), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 974 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w2), vget_low_s16(_val2), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 993 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w3), vget_low_s16(_val3), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1012 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w4), vget_low_s16(_val4), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1031 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w5), vget_low_s16(_val5), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1050 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w6), vget_low_s16(_val6), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1069 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w7), vget_low_s16(_val7), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1140 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w0), vget_low_s16(_val3), 0); in conv3x3s1_winograd42_pack8to4_int8_neon() 1151 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w1), vget_low_s16(_val3), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() [all …]
|
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/arm/ |
H A D | convolution_1x1_fp16s.h | 166 float16x8_t _sum6 = vdupq_laneq_f16(_bias0, 6); in conv1x1s1_sgemm_fp16sa_neon() local 195 _sum6 = vfmaq_laneq_f16(_sum6, _p0, _k0, 6); in conv1x1s1_sgemm_fp16sa_neon() 204 _sum6 = vfmaq_laneq_f16(_sum6, _p1, _k1, 6); in conv1x1s1_sgemm_fp16sa_neon() 213 _sum6 = vfmaq_laneq_f16(_sum6, _p2, _k2, 6); in conv1x1s1_sgemm_fp16sa_neon() 222 _sum6 = vfmaq_laneq_f16(_sum6, _p3, _k3, 6); in conv1x1s1_sgemm_fp16sa_neon() 231 _sum6 = vfmaq_laneq_f16(_sum6, _p4, _k4, 6); in conv1x1s1_sgemm_fp16sa_neon() 240 _sum6 = vfmaq_laneq_f16(_sum6, _p5, _k5, 6); in conv1x1s1_sgemm_fp16sa_neon() 249 _sum6 = vfmaq_laneq_f16(_sum6, _p6, _k6, 6); in conv1x1s1_sgemm_fp16sa_neon() 258 _sum6 = vfmaq_laneq_f16(_sum6, _p7, _k7, 6); in conv1x1s1_sgemm_fp16sa_neon() 277 _sum6 = vfmaq_laneq_f16(_sum6, _p0, _k0, 6); in conv1x1s1_sgemm_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack8to4_int8.h | 936 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w0), vget_low_s16(_val0), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 955 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w1), vget_low_s16(_val1), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 974 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w2), vget_low_s16(_val2), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 993 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w3), vget_low_s16(_val3), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1012 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w4), vget_low_s16(_val4), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1031 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w5), vget_low_s16(_val5), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1050 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w6), vget_low_s16(_val6), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1069 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w7), vget_low_s16(_val7), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1140 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w0), vget_low_s16(_val3), 0); in conv3x3s1_winograd42_pack8to4_int8_neon() 1151 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w1), vget_low_s16(_val3), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() [all …]
|
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/arm/ |
H A D | convolution_1x1_fp16s.h | 166 float16x8_t _sum6 = vdupq_laneq_f16(_bias0, 6); in conv1x1s1_sgemm_fp16sa_neon() local 195 _sum6 = vfmaq_laneq_f16(_sum6, _p0, _k0, 6); in conv1x1s1_sgemm_fp16sa_neon() 204 _sum6 = vfmaq_laneq_f16(_sum6, _p1, _k1, 6); in conv1x1s1_sgemm_fp16sa_neon() 213 _sum6 = vfmaq_laneq_f16(_sum6, _p2, _k2, 6); in conv1x1s1_sgemm_fp16sa_neon() 222 _sum6 = vfmaq_laneq_f16(_sum6, _p3, _k3, 6); in conv1x1s1_sgemm_fp16sa_neon() 231 _sum6 = vfmaq_laneq_f16(_sum6, _p4, _k4, 6); in conv1x1s1_sgemm_fp16sa_neon() 240 _sum6 = vfmaq_laneq_f16(_sum6, _p5, _k5, 6); in conv1x1s1_sgemm_fp16sa_neon() 249 _sum6 = vfmaq_laneq_f16(_sum6, _p6, _k6, 6); in conv1x1s1_sgemm_fp16sa_neon() 258 _sum6 = vfmaq_laneq_f16(_sum6, _p7, _k7, 6); in conv1x1s1_sgemm_fp16sa_neon() 277 _sum6 = vfmaq_laneq_f16(_sum6, _p0, _k0, 6); in conv1x1s1_sgemm_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack8to4_int8.h | 936 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w0), vget_low_s16(_val0), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 955 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w1), vget_low_s16(_val1), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 974 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w2), vget_low_s16(_val2), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 993 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w3), vget_low_s16(_val3), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1012 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w4), vget_low_s16(_val4), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1031 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w5), vget_low_s16(_val5), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1050 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w6), vget_low_s16(_val6), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1069 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w7), vget_low_s16(_val7), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1140 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w0), vget_low_s16(_val3), 0); in conv3x3s1_winograd42_pack8to4_int8_neon() 1151 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w1), vget_low_s16(_val3), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/arm/ |
H A D | convolution_1x1_fp16s.h | 166 float16x8_t _sum6 = vdupq_laneq_f16(_bias0, 6); in conv1x1s1_sgemm_fp16sa_neon() local 195 _sum6 = vfmaq_laneq_f16(_sum6, _p0, _k0, 6); in conv1x1s1_sgemm_fp16sa_neon() 204 _sum6 = vfmaq_laneq_f16(_sum6, _p1, _k1, 6); in conv1x1s1_sgemm_fp16sa_neon() 213 _sum6 = vfmaq_laneq_f16(_sum6, _p2, _k2, 6); in conv1x1s1_sgemm_fp16sa_neon() 222 _sum6 = vfmaq_laneq_f16(_sum6, _p3, _k3, 6); in conv1x1s1_sgemm_fp16sa_neon() 231 _sum6 = vfmaq_laneq_f16(_sum6, _p4, _k4, 6); in conv1x1s1_sgemm_fp16sa_neon() 240 _sum6 = vfmaq_laneq_f16(_sum6, _p5, _k5, 6); in conv1x1s1_sgemm_fp16sa_neon() 249 _sum6 = vfmaq_laneq_f16(_sum6, _p6, _k6, 6); in conv1x1s1_sgemm_fp16sa_neon() 258 _sum6 = vfmaq_laneq_f16(_sum6, _p7, _k7, 6); in conv1x1s1_sgemm_fp16sa_neon() 277 _sum6 = vfmaq_laneq_f16(_sum6, _p0, _k0, 6); in conv1x1s1_sgemm_fp16sa_neon() [all …]
|
H A D | convolution_3x3_pack8to4_int8.h | 936 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w0), vget_low_s16(_val0), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 955 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w1), vget_low_s16(_val1), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 974 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w2), vget_low_s16(_val2), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 993 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w3), vget_low_s16(_val3), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1012 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w4), vget_low_s16(_val4), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1031 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w5), vget_low_s16(_val5), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1050 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w6), vget_low_s16(_val6), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1069 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w7), vget_low_s16(_val7), 3); in conv3x3s1_winograd42_pack8to4_int8_neon() 1140 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w0), vget_low_s16(_val3), 0); in conv3x3s1_winograd42_pack8to4_int8_neon() 1151 _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_w1), vget_low_s16(_val3), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() [all …]
|
/dports/graphics/realsr-ncnn-vulkan/realsr-ncnn-vulkan-20210210/src/ncnn/src/layer/arm/ |
H A D | convolution_1x1_fp16s.h | 166 float16x8_t _sum6 = vdupq_laneq_f16(_bias0, 6); in conv1x1s1_sgemm_fp16sa_neon() local 195 _sum6 = vfmaq_laneq_f16(_sum6, _p0, _k0, 6); in conv1x1s1_sgemm_fp16sa_neon() 204 _sum6 = vfmaq_laneq_f16(_sum6, _p1, _k1, 6); in conv1x1s1_sgemm_fp16sa_neon() 213 _sum6 = vfmaq_laneq_f16(_sum6, _p2, _k2, 6); in conv1x1s1_sgemm_fp16sa_neon() 222 _sum6 = vfmaq_laneq_f16(_sum6, _p3, _k3, 6); in conv1x1s1_sgemm_fp16sa_neon() 231 _sum6 = vfmaq_laneq_f16(_sum6, _p4, _k4, 6); in conv1x1s1_sgemm_fp16sa_neon() 240 _sum6 = vfmaq_laneq_f16(_sum6, _p5, _k5, 6); in conv1x1s1_sgemm_fp16sa_neon() 249 _sum6 = vfmaq_laneq_f16(_sum6, _p6, _k6, 6); in conv1x1s1_sgemm_fp16sa_neon() 258 _sum6 = vfmaq_laneq_f16(_sum6, _p7, _k7, 6); in conv1x1s1_sgemm_fp16sa_neon() 277 _sum6 = vfmaq_laneq_f16(_sum6, _p0, _k0, 6); in conv1x1s1_sgemm_fp16sa_neon() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/mips/ |
H A D | convolution_3x3_pack1to4.h | 91 _sum6 = __msa_fmadd_w(_sum6, _r06, _k00); in conv3x3s1_pack1to4_msa() 99 _sum6 = __msa_fmadd_w(_sum6, _r07, _k01); in conv3x3s1_pack1to4_msa() 107 _sum6 = __msa_fmadd_w(_sum6, _r08, _k02); in conv3x3s1_pack1to4_msa() 131 _sum6 = __msa_fmadd_w(_sum6, _r16, _k10); in conv3x3s1_pack1to4_msa() 139 _sum6 = __msa_fmadd_w(_sum6, _r17, _k11); in conv3x3s1_pack1to4_msa() 147 _sum6 = __msa_fmadd_w(_sum6, _r18, _k12); in conv3x3s1_pack1to4_msa() 171 _sum6 = __msa_fmadd_w(_sum6, _r26, _k20); in conv3x3s1_pack1to4_msa() 179 _sum6 = __msa_fmadd_w(_sum6, _r27, _k21); in conv3x3s1_pack1to4_msa() 187 _sum6 = __msa_fmadd_w(_sum6, _r28, _k22); in conv3x3s1_pack1to4_msa() 482 _sum6 = __msa_fmadd_w(_sum6, _r0c, _k00); in conv3x3s2_pack1to4_msa() [all …]
|
/dports/graphics/realsr-ncnn-vulkan/realsr-ncnn-vulkan-20210210/src/ncnn/src/layer/x86/ |
H A D | convolution_1x1_pack8.h | 511 _sum6 = _mm256_fmadd_ps(_w0, _val60, _sum6); in conv1x1s1_sgemm_pack8_avx() 512 _sum6 = _mm256_fmadd_ps(_w1, _val61, _sum6); in conv1x1s1_sgemm_pack8_avx() 513 _sum6 = _mm256_fmadd_ps(_w2, _val62, _sum6); in conv1x1s1_sgemm_pack8_avx() 514 _sum6 = _mm256_fmadd_ps(_w3, _val63, _sum6); in conv1x1s1_sgemm_pack8_avx() 515 _sum6 = _mm256_fmadd_ps(_w4, _val64, _sum6); in conv1x1s1_sgemm_pack8_avx() 516 _sum6 = _mm256_fmadd_ps(_w5, _val65, _sum6); in conv1x1s1_sgemm_pack8_avx() 517 _sum6 = _mm256_fmadd_ps(_w6, _val66, _sum6); in conv1x1s1_sgemm_pack8_avx() 518 _sum6 = _mm256_fmadd_ps(_w7, _val67, _sum6); in conv1x1s1_sgemm_pack8_avx() 759 _sum6 = _mm256_fmadd_ps(_w0, _val60, _sum6); in conv1x1s1_sgemm_pack8_avx() 760 _sum6 = _mm256_fmadd_ps(_w1, _val61, _sum6); in conv1x1s1_sgemm_pack8_avx() [all …]
|
H A D | convolution_1x1_pack8_fp16.h | 511 _sum6 = _mm256_fmadd_ps(_w0, _val60, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 512 _sum6 = _mm256_fmadd_ps(_w1, _val61, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 513 _sum6 = _mm256_fmadd_ps(_w2, _val62, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 514 _sum6 = _mm256_fmadd_ps(_w3, _val63, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 515 _sum6 = _mm256_fmadd_ps(_w4, _val64, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 516 _sum6 = _mm256_fmadd_ps(_w5, _val65, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 517 _sum6 = _mm256_fmadd_ps(_w6, _val66, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 518 _sum6 = _mm256_fmadd_ps(_w7, _val67, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 759 _sum6 = _mm256_fmadd_ps(_w0, _val60, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 760 _sum6 = _mm256_fmadd_ps(_w1, _val61, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() [all …]
|
/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/x86/ |
H A D | convolution_1x1_pack8_fp16.h | 511 _sum6 = _mm256_fmadd_ps(_w0, _val60, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 512 _sum6 = _mm256_fmadd_ps(_w1, _val61, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 513 _sum6 = _mm256_fmadd_ps(_w2, _val62, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 514 _sum6 = _mm256_fmadd_ps(_w3, _val63, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 515 _sum6 = _mm256_fmadd_ps(_w4, _val64, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 516 _sum6 = _mm256_fmadd_ps(_w5, _val65, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 517 _sum6 = _mm256_fmadd_ps(_w6, _val66, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 518 _sum6 = _mm256_fmadd_ps(_w7, _val67, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 759 _sum6 = _mm256_fmadd_ps(_w0, _val60, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 760 _sum6 = _mm256_fmadd_ps(_w1, _val61, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8.h | 160 __m256 _sum6 = _bias0; in convdw3x3s1_pack8_avx() local 166 _sum6 = _mm256_fmadd_ps(_k00, _r06, _sum6); in convdw3x3s1_pack8_avx() 167 _sum6 = _mm256_fmadd_ps(_k01, _r07, _sum6); in convdw3x3s1_pack8_avx() 168 _sum6 = _mm256_fmadd_ps(_k02, _r08, _sum6); in convdw3x3s1_pack8_avx() 169 _sum6 = _mm256_fmadd_ps(_k10, _r16, _sum6); in convdw3x3s1_pack8_avx() 170 _sum6 = _mm256_fmadd_ps(_k11, _r17, _sum6); in convdw3x3s1_pack8_avx() 171 _sum6 = _mm256_fmadd_ps(_k12, _r18, _sum6); in convdw3x3s1_pack8_avx() 172 _sum6 = _mm256_fmadd_ps(_k20, _r26, _sum6); in convdw3x3s1_pack8_avx() 173 _sum6 = _mm256_fmadd_ps(_k21, _r27, _sum6); in convdw3x3s1_pack8_avx() 174 _sum6 = _mm256_fmadd_ps(_k22, _r28, _sum6); in convdw3x3s1_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 160 __m256 _sum6 = _bias0; in convdw3x3s1_fp16_pack8_avx() local 166 _sum6 = _mm256_fmadd_ps(_k00, _r06, _sum6); in convdw3x3s1_fp16_pack8_avx() 167 _sum6 = _mm256_fmadd_ps(_k01, _r07, _sum6); in convdw3x3s1_fp16_pack8_avx() 168 _sum6 = _mm256_fmadd_ps(_k02, _r08, _sum6); in convdw3x3s1_fp16_pack8_avx() 169 _sum6 = _mm256_fmadd_ps(_k10, _r16, _sum6); in convdw3x3s1_fp16_pack8_avx() 170 _sum6 = _mm256_fmadd_ps(_k11, _r17, _sum6); in convdw3x3s1_fp16_pack8_avx() 171 _sum6 = _mm256_fmadd_ps(_k12, _r18, _sum6); in convdw3x3s1_fp16_pack8_avx() 172 _sum6 = _mm256_fmadd_ps(_k20, _r26, _sum6); in convdw3x3s1_fp16_pack8_avx() 173 _sum6 = _mm256_fmadd_ps(_k21, _r27, _sum6); in convdw3x3s1_fp16_pack8_avx() 174 _sum6 = _mm256_fmadd_ps(_k22, _r28, _sum6); in convdw3x3s1_fp16_pack8_avx() [all …]
|
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/x86/ |
H A D | convolution_1x1_pack8_fp16.h | 511 _sum6 = _mm256_fmadd_ps(_w0, _val60, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 512 _sum6 = _mm256_fmadd_ps(_w1, _val61, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 513 _sum6 = _mm256_fmadd_ps(_w2, _val62, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 514 _sum6 = _mm256_fmadd_ps(_w3, _val63, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 515 _sum6 = _mm256_fmadd_ps(_w4, _val64, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 516 _sum6 = _mm256_fmadd_ps(_w5, _val65, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 517 _sum6 = _mm256_fmadd_ps(_w6, _val66, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 518 _sum6 = _mm256_fmadd_ps(_w7, _val67, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 759 _sum6 = _mm256_fmadd_ps(_w0, _val60, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 760 _sum6 = _mm256_fmadd_ps(_w1, _val61, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 160 __m256 _sum6 = _bias0; in convdw3x3s1_fp16_pack8_avx() local 166 _sum6 = _mm256_fmadd_ps(_k00, _r06, _sum6); in convdw3x3s1_fp16_pack8_avx() 167 _sum6 = _mm256_fmadd_ps(_k01, _r07, _sum6); in convdw3x3s1_fp16_pack8_avx() 168 _sum6 = _mm256_fmadd_ps(_k02, _r08, _sum6); in convdw3x3s1_fp16_pack8_avx() 169 _sum6 = _mm256_fmadd_ps(_k10, _r16, _sum6); in convdw3x3s1_fp16_pack8_avx() 170 _sum6 = _mm256_fmadd_ps(_k11, _r17, _sum6); in convdw3x3s1_fp16_pack8_avx() 171 _sum6 = _mm256_fmadd_ps(_k12, _r18, _sum6); in convdw3x3s1_fp16_pack8_avx() 172 _sum6 = _mm256_fmadd_ps(_k20, _r26, _sum6); in convdw3x3s1_fp16_pack8_avx() 173 _sum6 = _mm256_fmadd_ps(_k21, _r27, _sum6); in convdw3x3s1_fp16_pack8_avx() 174 _sum6 = _mm256_fmadd_ps(_k22, _r28, _sum6); in convdw3x3s1_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8.h | 160 __m256 _sum6 = _bias0; in convdw3x3s1_pack8_avx() local 166 _sum6 = _mm256_fmadd_ps(_k00, _r06, _sum6); in convdw3x3s1_pack8_avx() 167 _sum6 = _mm256_fmadd_ps(_k01, _r07, _sum6); in convdw3x3s1_pack8_avx() 168 _sum6 = _mm256_fmadd_ps(_k02, _r08, _sum6); in convdw3x3s1_pack8_avx() 169 _sum6 = _mm256_fmadd_ps(_k10, _r16, _sum6); in convdw3x3s1_pack8_avx() 170 _sum6 = _mm256_fmadd_ps(_k11, _r17, _sum6); in convdw3x3s1_pack8_avx() 171 _sum6 = _mm256_fmadd_ps(_k12, _r18, _sum6); in convdw3x3s1_pack8_avx() 172 _sum6 = _mm256_fmadd_ps(_k20, _r26, _sum6); in convdw3x3s1_pack8_avx() 173 _sum6 = _mm256_fmadd_ps(_k21, _r27, _sum6); in convdw3x3s1_pack8_avx() 174 _sum6 = _mm256_fmadd_ps(_k22, _r28, _sum6); in convdw3x3s1_pack8_avx() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/x86/ |
H A D | convolution_1x1_pack8_fp16.h | 511 _sum6 = _mm256_comp_fmadd_ps(_w0, _val60, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 512 _sum6 = _mm256_comp_fmadd_ps(_w1, _val61, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 513 _sum6 = _mm256_comp_fmadd_ps(_w2, _val62, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 514 _sum6 = _mm256_comp_fmadd_ps(_w3, _val63, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 515 _sum6 = _mm256_comp_fmadd_ps(_w4, _val64, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 516 _sum6 = _mm256_comp_fmadd_ps(_w5, _val65, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 517 _sum6 = _mm256_comp_fmadd_ps(_w6, _val66, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 518 _sum6 = _mm256_comp_fmadd_ps(_w7, _val67, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 759 _sum6 = _mm256_comp_fmadd_ps(_w0, _val60, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 760 _sum6 = _mm256_comp_fmadd_ps(_w1, _val61, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack4.h | 160 __m128 _sum6 = _bias0; in convdw3x3s1_pack4_sse() local 166 _sum6 = _mm_comp_fmadd_ps(_k00, _r06, _sum6); in convdw3x3s1_pack4_sse() 167 _sum6 = _mm_comp_fmadd_ps(_k01, _r07, _sum6); in convdw3x3s1_pack4_sse() 168 _sum6 = _mm_comp_fmadd_ps(_k02, _r08, _sum6); in convdw3x3s1_pack4_sse() 169 _sum6 = _mm_comp_fmadd_ps(_k10, _r16, _sum6); in convdw3x3s1_pack4_sse() 170 _sum6 = _mm_comp_fmadd_ps(_k11, _r17, _sum6); in convdw3x3s1_pack4_sse() 171 _sum6 = _mm_comp_fmadd_ps(_k12, _r18, _sum6); in convdw3x3s1_pack4_sse() 172 _sum6 = _mm_comp_fmadd_ps(_k20, _r26, _sum6); in convdw3x3s1_pack4_sse() 173 _sum6 = _mm_comp_fmadd_ps(_k21, _r27, _sum6); in convdw3x3s1_pack4_sse() 174 _sum6 = _mm_comp_fmadd_ps(_k22, _r28, _sum6); in convdw3x3s1_pack4_sse() [all …]
|
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/x86/ |
H A D | convolution_1x1_pack8_fp16.h | 511 _sum6 = _mm256_fmadd_ps(_w0, _val60, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 512 _sum6 = _mm256_fmadd_ps(_w1, _val61, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 513 _sum6 = _mm256_fmadd_ps(_w2, _val62, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 514 _sum6 = _mm256_fmadd_ps(_w3, _val63, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 515 _sum6 = _mm256_fmadd_ps(_w4, _val64, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 516 _sum6 = _mm256_fmadd_ps(_w5, _val65, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 517 _sum6 = _mm256_fmadd_ps(_w6, _val66, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 518 _sum6 = _mm256_fmadd_ps(_w7, _val67, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 759 _sum6 = _mm256_fmadd_ps(_w0, _val60, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() 760 _sum6 = _mm256_fmadd_ps(_w1, _val61, _sum6); in conv1x1s1_sgemm_fp16_pack8_avx() [all …]
|